mirror of
https://github.com/RPCS3/rpcs3.git
synced 2025-03-28 08:37:06 +00:00
vk: Tuning [WIP]
- Unroll main compute queue loop
- Do NOT run GPU cores on mappable memory! This has a dreadful impact on performance for obvious reasons
- Enable dynamic SSBO indexing (affects AMD)
- Make loop unrolling and loop length variable depending on hardware and find optimum
This commit is contained in:
parent
d484253136
commit
bda65f93a6
@ -16,7 +16,9 @@ namespace vk
|
||||
u32 m_used_descriptors = 0;
|
||||
|
||||
bool initialized = false;
|
||||
u32 optimal_group_size = 64;
|
||||
bool unroll_loops = true;
|
||||
u32 optimal_group_size = 1;
|
||||
u32 optimal_kernel_size = 1;
|
||||
|
||||
void init_descriptors()
|
||||
{
|
||||
@ -62,7 +64,15 @@ namespace vk
|
||||
case vk::driver_vendor::unknown:
|
||||
// Probably intel
|
||||
case vk::driver_vendor::NVIDIA:
|
||||
unroll_loops = true;
|
||||
optimal_group_size = 32;
|
||||
optimal_kernel_size = 16;
|
||||
break;
|
||||
case vk::driver_vendor::AMD:
|
||||
case vk::driver_vendor::RADV:
|
||||
unroll_loops = false;
|
||||
optimal_kernel_size = 1;
|
||||
optimal_group_size = 64;
|
||||
break;
|
||||
}
|
||||
|
||||
@ -155,9 +165,12 @@ namespace vk
|
||||
u32 m_data_length = 0;
|
||||
u32 kernel_size = 1;
|
||||
|
||||
void build(const char* function_name, u32 _kernel_size)
|
||||
void build(const char* function_name, u32 _kernel_size = 0)
|
||||
{
|
||||
kernel_size = _kernel_size;
|
||||
// Initialize to allow detecting optimal settings
|
||||
create();
|
||||
|
||||
kernel_size = _kernel_size? _kernel_size : optimal_kernel_size;
|
||||
|
||||
m_src =
|
||||
{
|
||||
@ -180,12 +193,23 @@ namespace vk
|
||||
"void main()\n"
|
||||
"{\n"
|
||||
" uint index = gl_GlobalInvocationID.x * KERNEL_SIZE;\n"
|
||||
" for (uint loop = 0; loop < KERNEL_SIZE; ++loop)\n"
|
||||
" {\n"
|
||||
" uint value = data[index];\n"
|
||||
" uint value;\n"
|
||||
"\n"
|
||||
};
|
||||
|
||||
std::string work_kernel =
|
||||
{
|
||||
" value = data[index];\n"
|
||||
" data[index] = %f(value);\n"
|
||||
};
|
||||
|
||||
std::string loop_advance =
|
||||
{
|
||||
" index++;\n"
|
||||
" }\n"
|
||||
};
|
||||
|
||||
const std::string suffix =
|
||||
{
|
||||
"}\n"
|
||||
};
|
||||
|
||||
@ -197,6 +221,40 @@ namespace vk
|
||||
};
|
||||
|
||||
m_src = fmt::replace_all(m_src, syntax_replace);
|
||||
work_kernel = fmt::replace_all(work_kernel, syntax_replace);
|
||||
|
||||
if (kernel_size <= 1)
|
||||
{
|
||||
m_src += " {\n" + work_kernel + " }\n";
|
||||
}
|
||||
else if (unroll_loops)
|
||||
{
|
||||
work_kernel += loop_advance + "\n";
|
||||
|
||||
m_src += std::string
|
||||
(
|
||||
" //Unrolled loop\n"
|
||||
" {\n"
|
||||
);
|
||||
|
||||
// Assemble body with manual loop unroll to try lowering GPR usage
|
||||
for (u32 n = 0; n < kernel_size; ++n)
|
||||
{
|
||||
m_src += work_kernel;
|
||||
}
|
||||
|
||||
m_src += " }\n";
|
||||
}
|
||||
else
|
||||
{
|
||||
m_src += " for (int loop = 0; loop < KERNEL_SIZE; ++loop)\n";
|
||||
m_src += " {\n";
|
||||
m_src += work_kernel;
|
||||
m_src += loop_advance;
|
||||
m_src += " }\n";
|
||||
}
|
||||
|
||||
m_src += suffix;
|
||||
}
|
||||
|
||||
void bind_resources() override
|
||||
@ -221,7 +279,7 @@ namespace vk
|
||||
// byteswap ushort
|
||||
cs_shuffle_16()
|
||||
{
|
||||
cs_shuffle_base::build("bswap_u16", 32);
|
||||
cs_shuffle_base::build("bswap_u16");
|
||||
}
|
||||
};
|
||||
|
||||
@ -230,7 +288,7 @@ namespace vk
|
||||
// byteswap_ulong
|
||||
cs_shuffle_32()
|
||||
{
|
||||
cs_shuffle_base::build("bswap_u32", 32);
|
||||
cs_shuffle_base::build("bswap_u32");
|
||||
}
|
||||
};
|
||||
|
||||
@ -239,7 +297,7 @@ namespace vk
|
||||
// byteswap_ulong + byteswap_ushort
|
||||
cs_shuffle_32_16()
|
||||
{
|
||||
cs_shuffle_base::build("bswap_u16_u32", 32);
|
||||
cs_shuffle_base::build("bswap_u16_u32");
|
||||
}
|
||||
};
|
||||
|
||||
@ -248,7 +306,7 @@ namespace vk
|
||||
// convert d24x8 to f32
|
||||
cs_shuffle_d24x8_f32()
|
||||
{
|
||||
cs_shuffle_base::build("d24x8_to_f32", 32);
|
||||
cs_shuffle_base::build("d24x8_to_f32");
|
||||
}
|
||||
};
|
||||
|
||||
@ -257,7 +315,7 @@ namespace vk
|
||||
// convert f32 to d24x8 and swap endianness
|
||||
cs_shuffle_se_f32_d24x8()
|
||||
{
|
||||
cs_shuffle_base::build("f32_to_d24x8_swapped", 32);
|
||||
cs_shuffle_base::build("f32_to_d24x8_swapped");
|
||||
}
|
||||
};
|
||||
|
||||
@ -266,7 +324,7 @@ namespace vk
|
||||
// swap endianness of d24x8
|
||||
cs_shuffle_se_d24x8()
|
||||
{
|
||||
cs_shuffle_base::build("d24x8_to_d24x8_swapped", 32);
|
||||
cs_shuffle_base::build("d24x8_to_d24x8_swapped");
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -472,11 +472,13 @@ namespace vk
|
||||
//Currently we require:
|
||||
//1. Anisotropic sampling
|
||||
//2. DXT support
|
||||
//3. Indexable storage buffers
|
||||
VkPhysicalDeviceFeatures available_features;
|
||||
vkGetPhysicalDeviceFeatures(*pgpu, &available_features);
|
||||
|
||||
available_features.samplerAnisotropy = VK_TRUE;
|
||||
available_features.textureCompressionBC = VK_TRUE;
|
||||
available_features.shaderStorageBufferArrayDynamicIndexing = VK_TRUE;
|
||||
|
||||
VkDeviceCreateInfo device = {};
|
||||
device.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO;
|
||||
|
@ -445,6 +445,7 @@ namespace vk
|
||||
size_t offset_in_buffer = upload_heap.alloc<512>(image_linear_size + 8);
|
||||
void *mapped_buffer = upload_heap.map(offset_in_buffer, image_linear_size + 8);
|
||||
void *dst = mapped_buffer;
|
||||
VkBuffer buffer_handle = upload_heap.heap->value;
|
||||
|
||||
if (dst_image->info.format == VK_FORMAT_D24_UNORM_S8_UINT)
|
||||
{
|
||||
@ -466,10 +467,26 @@ namespace vk
|
||||
// NOTE: On commandbuffer submission, the HOST_WRITE to ALL_COMMANDS barrier is implicitly inserted according to spec
|
||||
// No need to add another explicit barrier unless a driver bug is found
|
||||
|
||||
// Executing GPU tasks on host_visible RAM is awful, copy to device-local buffer instead
|
||||
auto scratch_buf = vk::get_scratch_buffer();
|
||||
|
||||
VkBufferCopy copy = {};
|
||||
copy.srcOffset = offset_in_buffer;
|
||||
copy.dstOffset = 0;
|
||||
copy.size = image_linear_size;
|
||||
|
||||
vkCmdCopyBuffer(cmd, buffer_handle, scratch_buf->value, 1, &copy);
|
||||
|
||||
insert_buffer_memory_barrier(cmd, scratch_buf->value, 0, image_linear_size, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
|
||||
VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT);
|
||||
|
||||
vk::get_compute_task<vk::cs_shuffle_d24x8_f32>()->run(cmd, upload_heap.heap.get(), image_linear_size, offset_in_buffer);
|
||||
|
||||
insert_buffer_memory_barrier(cmd, upload_heap.heap->value, offset_in_buffer, image_linear_size, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
|
||||
insert_buffer_memory_barrier(cmd, scratch_buf->value, 0, image_linear_size, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
|
||||
VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_TRANSFER_READ_BIT);
|
||||
|
||||
buffer_handle = scratch_buf->value;
|
||||
offset_in_buffer = 0;
|
||||
}
|
||||
|
||||
VkBufferImageCopy copy_info = {};
|
||||
@ -483,7 +500,7 @@ namespace vk
|
||||
copy_info.imageSubresource.mipLevel = mipmap_level % mipmap_count;
|
||||
copy_info.bufferRowLength = block_in_pixel * row_pitch / block_size_in_bytes;
|
||||
|
||||
vkCmdCopyBufferToImage(cmd, upload_heap.heap->value, dst_image->value, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1, &copy_info);
|
||||
vkCmdCopyBufferToImage(cmd, buffer_handle, dst_image->value, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1, &copy_info);
|
||||
mipmap_level++;
|
||||
}
|
||||
}
|
||||
|
@ -199,6 +199,8 @@ namespace vk
|
||||
{
|
||||
// TODO: Synchronize access to typeless textures
|
||||
target = vk::get_typeless_helper(vram_texture->info.format);
|
||||
change_image_layout(cmd, target, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, subresource_range);
|
||||
|
||||
vk::copy_scaled_image(cmd, vram_texture->value, target->value, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, target->current_layout,
|
||||
0, 0, vram_texture->width(), vram_texture->height(), 0, 0, transfer_width, transfer_height, 1, aspect_flag, true, VK_FILTER_NEAREST,
|
||||
vram_texture->info.format, target->info.format);
|
||||
@ -212,15 +214,6 @@ namespace vk
|
||||
change_image_layout(cmd, target, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, subresource_range);
|
||||
}
|
||||
|
||||
// TODO: Read back stencil values (is this really necessary?)
|
||||
VkBufferImageCopy region = {};
|
||||
region.imageSubresource = {aspect_flag & ~(VK_IMAGE_ASPECT_STENCIL_BIT), 0, 0, 1};
|
||||
region.imageExtent = {transfer_width, transfer_height, 1};
|
||||
vkCmdCopyImageToBuffer(cmd, target->value, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, dma_buffer->value, 1, &region);
|
||||
|
||||
change_image_layout(cmd, vram_texture, old_layout, subresource_range);
|
||||
real_pitch = vk::get_format_texel_width(vram_texture->info.format) * transfer_width;
|
||||
|
||||
// Handle any format conversions using compute tasks
|
||||
vk::cs_shuffle_base *shuffle_kernel = nullptr;
|
||||
|
||||
@ -247,13 +240,35 @@ namespace vk
|
||||
}
|
||||
}
|
||||
|
||||
// Do not run the compute task on host visible memory
|
||||
vk::buffer* mem_target = shuffle_kernel ? vk::get_scratch_buffer() : dma_buffer.get();
|
||||
|
||||
// TODO: Read back stencil values (is this really necessary?)
|
||||
VkBufferImageCopy region = {};
|
||||
region.imageSubresource = {aspect_flag & ~(VK_IMAGE_ASPECT_STENCIL_BIT), 0, 0, 1};
|
||||
region.imageExtent = {transfer_width, transfer_height, 1};
|
||||
vkCmdCopyImageToBuffer(cmd, target->value, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, mem_target->value, 1, &region);
|
||||
|
||||
change_image_layout(cmd, vram_texture, old_layout, subresource_range);
|
||||
real_pitch = vk::get_format_texel_width(vram_texture->info.format) * transfer_width;
|
||||
|
||||
if (shuffle_kernel)
|
||||
{
|
||||
vk::insert_buffer_memory_barrier(cmd, dma_buffer->value, 0, cpu_address_range,
|
||||
verify (HERE), mem_target->value != dma_buffer->value;
|
||||
|
||||
vk::insert_buffer_memory_barrier(cmd, mem_target->value, 0, cpu_address_range,
|
||||
VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
|
||||
VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT);
|
||||
|
||||
shuffle_kernel->run(cmd, dma_buffer.get(), cpu_address_range);
|
||||
shuffle_kernel->run(cmd, mem_target, cpu_address_range);
|
||||
|
||||
vk::insert_buffer_memory_barrier(cmd, mem_target->value, 0, cpu_address_range,
|
||||
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
|
||||
VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_TRANSFER_READ_BIT);
|
||||
|
||||
VkBufferCopy copy = {};
|
||||
copy.size = cpu_address_range;
|
||||
vkCmdCopyBuffer(cmd, mem_target->value, dma_buffer->value, 1, &copy);
|
||||
}
|
||||
|
||||
if (manage_cb_lifetime)
|
||||
|
Loading…
x
Reference in New Issue
Block a user