vk: Compute kernel fixups

- Adhere to the workgroup count limits exposed by the GPU vendor. The compute kernels already execute properly even when those limits are exceeded, but respecting them removes validation noise.
- Fix the invocation count for the deswizzle kernel. The count was incorrect whenever the block size was not 4 bytes, causing a lot of useless work to be done.
2025-03-14 01:27:00 +00:00 · 2019-11-05 17:03:25 +03:00 · 2019-11-05 17:03:25 +03:00 · 508ffcb775
commit 508ffcb775
parent 99d71fdc2a
1 changed files with 24 additions and 7 deletions
--- a/rpcs3/Emu/RSX/VK/VKCompute.h
+++ b/rpcs3/Emu/RSX/VK/VKCompute.h
@ -26,6 +26,7 @@ namespace vk
 		u32 push_constants_size = 0;
 		u32 optimal_group_size = 1;
 		u32 optimal_kernel_size = 1;
+		u32 max_invocations_x = 65535;

 		virtual std::vector<std::pair<VkDescriptorType, u8>> get_descriptor_layout()
 		{
@ -116,6 +117,9 @@ namespace vk
 					break;
 				}

+				const auto& gpu = vk::get_current_renderer()->gpu();
+				max_invocations_x = gpu.get_limits().maxComputeWorkGroupCount[0];
+
 				initialized = true;
 			}
 		}
@ -203,7 +207,23 @@ namespace vk

 		virtual void run(VkCommandBuffer cmd, u32 num_invocations)
 		{
-			run(cmd, num_invocations, 1, 1);
+			u32 invocations_x, invocations_y;
+			if (num_invocations > max_invocations_x)
+			{
+				// AMD hw reports an annoyingly small maximum number of invocations in the X dimension
+				// Split the 1D job into 2 dimensions to accommodate this
+				invocations_x = (u32)floor(std::sqrt(num_invocations));
+				invocations_y = invocations_x;
+
+				if (num_invocations % invocations_x) invocations_y++;
+			}
+			else
+			{
+				invocations_x = num_invocations;
+				invocations_y = 1;
+			}
+
+			run(cmd, invocations_x, invocations_y, 1);
 		}
 	};

@ -215,14 +235,10 @@ namespace vk
 		u32 kernel_size = 1;

 		std::string variables, work_kernel, loop_advance, suffix;
-		std::string index_declaration;
 		std::string method_declarations;

 		cs_shuffle_base()
 		{
-			index_declaration =
-				"gl_GlobalInvocationID.x * KERNEL_SIZE";
-
 			work_kernel =
 				"		value = data[index];\n"
 				"		data[index] = %f(value);\n";
@ -263,7 +279,8 @@ namespace vk
 				"\n"
 				"void main()\n"
 				"{\n"
-				"	uint index = gl_GlobalInvocationID.x;\n"
+				"	uint invocations_x = (gl_NumWorkGroups.x * gl_WorkGroupSize.x);"
+				"	uint index = (gl_GlobalInvocationID.y * invocations_x) + gl_GlobalInvocationID.x;\n"
 				"	uint value;\n"
 				"	%vars"
 				"\n";
@ -761,7 +778,7 @@ namespace vk
 			params.logd = rsx::ceil_log2(depth);
 			set_parameters(cmd);

-			const u32 num_bytes_per_invocation = (4 * optimal_group_size);
+			const u32 num_bytes_per_invocation = (sizeof(_BlockType) * optimal_group_size);
 			const u32 linear_invocations = rsx::aligned_div(data_length, num_bytes_per_invocation);
 			compute_task::run(cmd, linear_invocations);
 		}