mirror of
https://github.com/RPCS3/rpcs3.git
synced 2025-04-10 15:44:44 +00:00
vk: Tuning [WIP]
- Unroll main compute queue loop - Do NOT run GPU cores on mappable memory! This has a dreadful impact on performance for obvious reasons - Enable dynamic SSBO indexing (affects AMD) - Make loop unrolling and loop length variable depending on hardware and find optimum
This commit is contained in:
parent
d484253136
commit
bda65f93a6
@ -16,7 +16,9 @@ namespace vk
|
|||||||
u32 m_used_descriptors = 0;
|
u32 m_used_descriptors = 0;
|
||||||
|
|
||||||
bool initialized = false;
|
bool initialized = false;
|
||||||
u32 optimal_group_size = 64;
|
bool unroll_loops = true;
|
||||||
|
u32 optimal_group_size = 1;
|
||||||
|
u32 optimal_kernel_size = 1;
|
||||||
|
|
||||||
void init_descriptors()
|
void init_descriptors()
|
||||||
{
|
{
|
||||||
@ -62,7 +64,15 @@ namespace vk
|
|||||||
case vk::driver_vendor::unknown:
|
case vk::driver_vendor::unknown:
|
||||||
// Probably intel
|
// Probably intel
|
||||||
case vk::driver_vendor::NVIDIA:
|
case vk::driver_vendor::NVIDIA:
|
||||||
|
unroll_loops = true;
|
||||||
optimal_group_size = 32;
|
optimal_group_size = 32;
|
||||||
|
optimal_kernel_size = 16;
|
||||||
|
break;
|
||||||
|
case vk::driver_vendor::AMD:
|
||||||
|
case vk::driver_vendor::RADV:
|
||||||
|
unroll_loops = false;
|
||||||
|
optimal_kernel_size = 1;
|
||||||
|
optimal_group_size = 64;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -155,9 +165,12 @@ namespace vk
|
|||||||
u32 m_data_length = 0;
|
u32 m_data_length = 0;
|
||||||
u32 kernel_size = 1;
|
u32 kernel_size = 1;
|
||||||
|
|
||||||
void build(const char* function_name, u32 _kernel_size)
|
void build(const char* function_name, u32 _kernel_size = 0)
|
||||||
{
|
{
|
||||||
kernel_size = _kernel_size;
|
// Initialize to allow detecting optimal settings
|
||||||
|
create();
|
||||||
|
|
||||||
|
kernel_size = _kernel_size? _kernel_size : optimal_kernel_size;
|
||||||
|
|
||||||
m_src =
|
m_src =
|
||||||
{
|
{
|
||||||
@ -180,12 +193,23 @@ namespace vk
|
|||||||
"void main()\n"
|
"void main()\n"
|
||||||
"{\n"
|
"{\n"
|
||||||
" uint index = gl_GlobalInvocationID.x * KERNEL_SIZE;\n"
|
" uint index = gl_GlobalInvocationID.x * KERNEL_SIZE;\n"
|
||||||
" for (uint loop = 0; loop < KERNEL_SIZE; ++loop)\n"
|
" uint value;\n"
|
||||||
" {\n"
|
"\n"
|
||||||
" uint value = data[index];\n"
|
};
|
||||||
|
|
||||||
|
std::string work_kernel =
|
||||||
|
{
|
||||||
|
" value = data[index];\n"
|
||||||
" data[index] = %f(value);\n"
|
" data[index] = %f(value);\n"
|
||||||
|
};
|
||||||
|
|
||||||
|
std::string loop_advance =
|
||||||
|
{
|
||||||
" index++;\n"
|
" index++;\n"
|
||||||
" }\n"
|
};
|
||||||
|
|
||||||
|
const std::string suffix =
|
||||||
|
{
|
||||||
"}\n"
|
"}\n"
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -197,6 +221,40 @@ namespace vk
|
|||||||
};
|
};
|
||||||
|
|
||||||
m_src = fmt::replace_all(m_src, syntax_replace);
|
m_src = fmt::replace_all(m_src, syntax_replace);
|
||||||
|
work_kernel = fmt::replace_all(work_kernel, syntax_replace);
|
||||||
|
|
||||||
|
if (kernel_size <= 1)
|
||||||
|
{
|
||||||
|
m_src += " {\n" + work_kernel + " }\n";
|
||||||
|
}
|
||||||
|
else if (unroll_loops)
|
||||||
|
{
|
||||||
|
work_kernel += loop_advance + "\n";
|
||||||
|
|
||||||
|
m_src += std::string
|
||||||
|
(
|
||||||
|
" //Unrolled loop\n"
|
||||||
|
" {\n"
|
||||||
|
);
|
||||||
|
|
||||||
|
// Assemble body with manual loop unroll to try lowering GPR usage
|
||||||
|
for (u32 n = 0; n < kernel_size; ++n)
|
||||||
|
{
|
||||||
|
m_src += work_kernel;
|
||||||
|
}
|
||||||
|
|
||||||
|
m_src += " }\n";
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
m_src += " for (int loop = 0; loop < KERNEL_SIZE; ++loop)\n";
|
||||||
|
m_src += " {\n";
|
||||||
|
m_src += work_kernel;
|
||||||
|
m_src += loop_advance;
|
||||||
|
m_src += " }\n";
|
||||||
|
}
|
||||||
|
|
||||||
|
m_src += suffix;
|
||||||
}
|
}
|
||||||
|
|
||||||
void bind_resources() override
|
void bind_resources() override
|
||||||
@ -221,7 +279,7 @@ namespace vk
|
|||||||
// byteswap ushort
|
// byteswap ushort
|
||||||
cs_shuffle_16()
|
cs_shuffle_16()
|
||||||
{
|
{
|
||||||
cs_shuffle_base::build("bswap_u16", 32);
|
cs_shuffle_base::build("bswap_u16");
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -230,7 +288,7 @@ namespace vk
|
|||||||
// byteswap_ulong
|
// byteswap_ulong
|
||||||
cs_shuffle_32()
|
cs_shuffle_32()
|
||||||
{
|
{
|
||||||
cs_shuffle_base::build("bswap_u32", 32);
|
cs_shuffle_base::build("bswap_u32");
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -239,7 +297,7 @@ namespace vk
|
|||||||
// byteswap_ulong + byteswap_ushort
|
// byteswap_ulong + byteswap_ushort
|
||||||
cs_shuffle_32_16()
|
cs_shuffle_32_16()
|
||||||
{
|
{
|
||||||
cs_shuffle_base::build("bswap_u16_u32", 32);
|
cs_shuffle_base::build("bswap_u16_u32");
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -248,7 +306,7 @@ namespace vk
|
|||||||
// convert d24x8 to f32
|
// convert d24x8 to f32
|
||||||
cs_shuffle_d24x8_f32()
|
cs_shuffle_d24x8_f32()
|
||||||
{
|
{
|
||||||
cs_shuffle_base::build("d24x8_to_f32", 32);
|
cs_shuffle_base::build("d24x8_to_f32");
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -257,7 +315,7 @@ namespace vk
|
|||||||
// convert f32 to d24x8 and swap endianness
|
// convert f32 to d24x8 and swap endianness
|
||||||
cs_shuffle_se_f32_d24x8()
|
cs_shuffle_se_f32_d24x8()
|
||||||
{
|
{
|
||||||
cs_shuffle_base::build("f32_to_d24x8_swapped", 32);
|
cs_shuffle_base::build("f32_to_d24x8_swapped");
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -266,7 +324,7 @@ namespace vk
|
|||||||
// swap endianness of d24x8
|
// swap endianness of d24x8
|
||||||
cs_shuffle_se_d24x8()
|
cs_shuffle_se_d24x8()
|
||||||
{
|
{
|
||||||
cs_shuffle_base::build("d24x8_to_d24x8_swapped", 32);
|
cs_shuffle_base::build("d24x8_to_d24x8_swapped");
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -472,11 +472,13 @@ namespace vk
|
|||||||
//Currently we require:
|
//Currently we require:
|
||||||
//1. Anisotropic sampling
|
//1. Anisotropic sampling
|
||||||
//2. DXT support
|
//2. DXT support
|
||||||
|
//3. Indexable storage buffers
|
||||||
VkPhysicalDeviceFeatures available_features;
|
VkPhysicalDeviceFeatures available_features;
|
||||||
vkGetPhysicalDeviceFeatures(*pgpu, &available_features);
|
vkGetPhysicalDeviceFeatures(*pgpu, &available_features);
|
||||||
|
|
||||||
available_features.samplerAnisotropy = VK_TRUE;
|
available_features.samplerAnisotropy = VK_TRUE;
|
||||||
available_features.textureCompressionBC = VK_TRUE;
|
available_features.textureCompressionBC = VK_TRUE;
|
||||||
|
available_features.shaderStorageBufferArrayDynamicIndexing = VK_TRUE;
|
||||||
|
|
||||||
VkDeviceCreateInfo device = {};
|
VkDeviceCreateInfo device = {};
|
||||||
device.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO;
|
device.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO;
|
||||||
|
@ -445,6 +445,7 @@ namespace vk
|
|||||||
size_t offset_in_buffer = upload_heap.alloc<512>(image_linear_size + 8);
|
size_t offset_in_buffer = upload_heap.alloc<512>(image_linear_size + 8);
|
||||||
void *mapped_buffer = upload_heap.map(offset_in_buffer, image_linear_size + 8);
|
void *mapped_buffer = upload_heap.map(offset_in_buffer, image_linear_size + 8);
|
||||||
void *dst = mapped_buffer;
|
void *dst = mapped_buffer;
|
||||||
|
VkBuffer buffer_handle = upload_heap.heap->value;
|
||||||
|
|
||||||
if (dst_image->info.format == VK_FORMAT_D24_UNORM_S8_UINT)
|
if (dst_image->info.format == VK_FORMAT_D24_UNORM_S8_UINT)
|
||||||
{
|
{
|
||||||
@ -466,10 +467,26 @@ namespace vk
|
|||||||
// NOTE: On commandbuffer submission, the HOST_WRITE to ALL_COMMANDS barrier is implicitly inserted according to spec
|
// NOTE: On commandbuffer submission, the HOST_WRITE to ALL_COMMANDS barrier is implicitly inserted according to spec
|
||||||
// No need to add another explicit barrier unless a driver bug is found
|
// No need to add another explicit barrier unless a driver bug is found
|
||||||
|
|
||||||
|
// Executing GPU tasks on host_visible RAM is awful, copy to device-local buffer instead
|
||||||
|
auto scratch_buf = vk::get_scratch_buffer();
|
||||||
|
|
||||||
|
VkBufferCopy copy = {};
|
||||||
|
copy.srcOffset = offset_in_buffer;
|
||||||
|
copy.dstOffset = 0;
|
||||||
|
copy.size = image_linear_size;
|
||||||
|
|
||||||
|
vkCmdCopyBuffer(cmd, buffer_handle, scratch_buf->value, 1, &copy);
|
||||||
|
|
||||||
|
insert_buffer_memory_barrier(cmd, scratch_buf->value, 0, image_linear_size, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
|
||||||
|
VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT);
|
||||||
|
|
||||||
vk::get_compute_task<vk::cs_shuffle_d24x8_f32>()->run(cmd, upload_heap.heap.get(), image_linear_size, offset_in_buffer);
|
vk::get_compute_task<vk::cs_shuffle_d24x8_f32>()->run(cmd, upload_heap.heap.get(), image_linear_size, offset_in_buffer);
|
||||||
|
|
||||||
insert_buffer_memory_barrier(cmd, upload_heap.heap->value, offset_in_buffer, image_linear_size, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
|
insert_buffer_memory_barrier(cmd, scratch_buf->value, 0, image_linear_size, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
|
||||||
VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_TRANSFER_READ_BIT);
|
VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_TRANSFER_READ_BIT);
|
||||||
|
|
||||||
|
buffer_handle = scratch_buf->value;
|
||||||
|
offset_in_buffer = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
VkBufferImageCopy copy_info = {};
|
VkBufferImageCopy copy_info = {};
|
||||||
@ -483,7 +500,7 @@ namespace vk
|
|||||||
copy_info.imageSubresource.mipLevel = mipmap_level % mipmap_count;
|
copy_info.imageSubresource.mipLevel = mipmap_level % mipmap_count;
|
||||||
copy_info.bufferRowLength = block_in_pixel * row_pitch / block_size_in_bytes;
|
copy_info.bufferRowLength = block_in_pixel * row_pitch / block_size_in_bytes;
|
||||||
|
|
||||||
vkCmdCopyBufferToImage(cmd, upload_heap.heap->value, dst_image->value, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1, &copy_info);
|
vkCmdCopyBufferToImage(cmd, buffer_handle, dst_image->value, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1, &copy_info);
|
||||||
mipmap_level++;
|
mipmap_level++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -199,6 +199,8 @@ namespace vk
|
|||||||
{
|
{
|
||||||
// TODO: Synchronize access to typeless textures
|
// TODO: Synchronize access to typeless textures
|
||||||
target = vk::get_typeless_helper(vram_texture->info.format);
|
target = vk::get_typeless_helper(vram_texture->info.format);
|
||||||
|
change_image_layout(cmd, target, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, subresource_range);
|
||||||
|
|
||||||
vk::copy_scaled_image(cmd, vram_texture->value, target->value, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, target->current_layout,
|
vk::copy_scaled_image(cmd, vram_texture->value, target->value, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, target->current_layout,
|
||||||
0, 0, vram_texture->width(), vram_texture->height(), 0, 0, transfer_width, transfer_height, 1, aspect_flag, true, VK_FILTER_NEAREST,
|
0, 0, vram_texture->width(), vram_texture->height(), 0, 0, transfer_width, transfer_height, 1, aspect_flag, true, VK_FILTER_NEAREST,
|
||||||
vram_texture->info.format, target->info.format);
|
vram_texture->info.format, target->info.format);
|
||||||
@ -212,15 +214,6 @@ namespace vk
|
|||||||
change_image_layout(cmd, target, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, subresource_range);
|
change_image_layout(cmd, target, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, subresource_range);
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO: Read back stencil values (is this really necessary?)
|
|
||||||
VkBufferImageCopy region = {};
|
|
||||||
region.imageSubresource = {aspect_flag & ~(VK_IMAGE_ASPECT_STENCIL_BIT), 0, 0, 1};
|
|
||||||
region.imageExtent = {transfer_width, transfer_height, 1};
|
|
||||||
vkCmdCopyImageToBuffer(cmd, target->value, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, dma_buffer->value, 1, &region);
|
|
||||||
|
|
||||||
change_image_layout(cmd, vram_texture, old_layout, subresource_range);
|
|
||||||
real_pitch = vk::get_format_texel_width(vram_texture->info.format) * transfer_width;
|
|
||||||
|
|
||||||
// Handle any format conversions using compute tasks
|
// Handle any format conversions using compute tasks
|
||||||
vk::cs_shuffle_base *shuffle_kernel = nullptr;
|
vk::cs_shuffle_base *shuffle_kernel = nullptr;
|
||||||
|
|
||||||
@ -247,13 +240,35 @@ namespace vk
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Do not run the compute task on host visible memory
|
||||||
|
vk::buffer* mem_target = shuffle_kernel ? vk::get_scratch_buffer() : dma_buffer.get();
|
||||||
|
|
||||||
|
// TODO: Read back stencil values (is this really necessary?)
|
||||||
|
VkBufferImageCopy region = {};
|
||||||
|
region.imageSubresource = {aspect_flag & ~(VK_IMAGE_ASPECT_STENCIL_BIT), 0, 0, 1};
|
||||||
|
region.imageExtent = {transfer_width, transfer_height, 1};
|
||||||
|
vkCmdCopyImageToBuffer(cmd, target->value, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, mem_target->value, 1, &region);
|
||||||
|
|
||||||
|
change_image_layout(cmd, vram_texture, old_layout, subresource_range);
|
||||||
|
real_pitch = vk::get_format_texel_width(vram_texture->info.format) * transfer_width;
|
||||||
|
|
||||||
if (shuffle_kernel)
|
if (shuffle_kernel)
|
||||||
{
|
{
|
||||||
vk::insert_buffer_memory_barrier(cmd, dma_buffer->value, 0, cpu_address_range,
|
verify (HERE), mem_target->value != dma_buffer->value;
|
||||||
|
|
||||||
|
vk::insert_buffer_memory_barrier(cmd, mem_target->value, 0, cpu_address_range,
|
||||||
VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
|
VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
|
||||||
VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT);
|
VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT);
|
||||||
|
|
||||||
shuffle_kernel->run(cmd, dma_buffer.get(), cpu_address_range);
|
shuffle_kernel->run(cmd, mem_target, cpu_address_range);
|
||||||
|
|
||||||
|
vk::insert_buffer_memory_barrier(cmd, mem_target->value, 0, cpu_address_range,
|
||||||
|
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
|
||||||
|
VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_TRANSFER_READ_BIT);
|
||||||
|
|
||||||
|
VkBufferCopy copy = {};
|
||||||
|
copy.size = cpu_address_range;
|
||||||
|
vkCmdCopyBuffer(cmd, mem_target->value, dma_buffer->value, 1, &copy);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (manage_cb_lifetime)
|
if (manage_cb_lifetime)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user