vk: Compute kernel fixups

- Adhere to workgroup count limits as exposed by the GPU vendor.
  They already execute properly even when going beyond the limits but this removes validation noise.
- Fix invocation counts for deswizzle kernel. The count was incorrect if blocksize was not 4, causing a bunch of useless work to be done.
This commit is contained in:
kd-11 2019-11-05 17:03:25 +03:00 committed by kd-11
parent 99d71fdc2a
commit 508ffcb775

View File

@ -26,6 +26,7 @@ namespace vk
u32 push_constants_size = 0; u32 push_constants_size = 0;
u32 optimal_group_size = 1; u32 optimal_group_size = 1;
u32 optimal_kernel_size = 1; u32 optimal_kernel_size = 1;
u32 max_invocations_x = 65535;
virtual std::vector<std::pair<VkDescriptorType, u8>> get_descriptor_layout() virtual std::vector<std::pair<VkDescriptorType, u8>> get_descriptor_layout()
{ {
@ -116,6 +117,9 @@ namespace vk
break; break;
} }
const auto& gpu = vk::get_current_renderer()->gpu();
max_invocations_x = gpu.get_limits().maxComputeWorkGroupCount[0];
initialized = true; initialized = true;
} }
} }
@ -203,7 +207,23 @@ namespace vk
virtual void run(VkCommandBuffer cmd, u32 num_invocations) virtual void run(VkCommandBuffer cmd, u32 num_invocations)
{ {
run(cmd, num_invocations, 1, 1); u32 invocations_x, invocations_y;
if (num_invocations > max_invocations_x)
{
// AMD hw reports an annoyingly small maximum number of invocations in the X dimension
// Split the 1D job into 2 dimensions to accomodate this
invocations_x = (u32)floor(std::sqrt(num_invocations));
invocations_y = invocations_x;
if (num_invocations % invocations_x) invocations_y++;
}
else
{
invocations_x = num_invocations;
invocations_y = 1;
}
run(cmd, invocations_x, invocations_y, 1);
} }
}; };
@ -215,14 +235,10 @@ namespace vk
u32 kernel_size = 1; u32 kernel_size = 1;
std::string variables, work_kernel, loop_advance, suffix; std::string variables, work_kernel, loop_advance, suffix;
std::string index_declaration;
std::string method_declarations; std::string method_declarations;
cs_shuffle_base() cs_shuffle_base()
{ {
index_declaration =
"gl_GlobalInvocationID.x * KERNEL_SIZE";
work_kernel = work_kernel =
" value = data[index];\n" " value = data[index];\n"
" data[index] = %f(value);\n"; " data[index] = %f(value);\n";
@ -263,7 +279,8 @@ namespace vk
"\n" "\n"
"void main()\n" "void main()\n"
"{\n" "{\n"
" uint index = gl_GlobalInvocationID.x;\n" " uint invocations_x = (gl_NumWorkGroups.x * gl_WorkGroupSize.x);"
" uint index = (gl_GlobalInvocationID.y * invocations_x) + gl_GlobalInvocationID.x;\n"
" uint value;\n" " uint value;\n"
" %vars" " %vars"
"\n"; "\n";
@ -761,7 +778,7 @@ namespace vk
params.logd = rsx::ceil_log2(depth); params.logd = rsx::ceil_log2(depth);
set_parameters(cmd); set_parameters(cmd);
const u32 num_bytes_per_invocation = (4 * optimal_group_size); const u32 num_bytes_per_invocation = (sizeof(_BlockType) * optimal_group_size);
const u32 linear_invocations = rsx::aligned_div(data_length, num_bytes_per_invocation); const u32 linear_invocations = rsx::aligned_div(data_length, num_bytes_per_invocation);
compute_task::run(cmd, linear_invocations); compute_task::run(cmd, linear_invocations);
} }