From 5fb4009a0781f9c4965ba492c03106562a1f829b Mon Sep 17 00:00:00 2001 From: kd-11 Date: Fri, 22 Jun 2018 22:09:20 +0300 Subject: [PATCH] vk; Add more compute routines to handle texture format conversions - Implement le D24x8 to le D32 upload routine - Implement endianness swapping and depth format conversions routines (readback) --- rpcs3/Emu/RSX/VK/VKCompute.h | 50 +++++++-- rpcs3/Emu/RSX/VK/VKHelpers.cpp | 6 +- rpcs3/Emu/RSX/VK/VKHelpers.h | 3 +- rpcs3/Emu/RSX/VK/VKTexture.cpp | 64 ++++------- rpcs3/Emu/RSX/VK/VKTextureCache.h | 169 +++++++----------------------- 5 files changed, 109 insertions(+), 183 deletions(-) diff --git a/rpcs3/Emu/RSX/VK/VKCompute.h b/rpcs3/Emu/RSX/VK/VKCompute.h index f2678620f3..bceba61a28 100644 --- a/rpcs3/Emu/RSX/VK/VKCompute.h +++ b/rpcs3/Emu/RSX/VK/VKCompute.h @@ -97,7 +97,7 @@ namespace vk virtual void bind_resources() {} - void load_program(const vk::command_buffer& cmd) + void load_program(VkCommandBuffer cmd) { if (!m_program) { @@ -141,7 +141,7 @@ namespace vk vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, m_pipeline_layout, 0, 1, &m_descriptor_set, 0, nullptr); } - virtual void run(const vk::command_buffer& cmd, u32 num_invocations) + virtual void run(VkCommandBuffer cmd, u32 num_invocations) { load_program(cmd); vkCmdDispatch(cmd, num_invocations, 1, 1); @@ -151,6 +151,8 @@ namespace vk struct cs_shuffle_base : compute_task { vk::buffer* m_data; + u32 m_data_offset = 0; + u32 m_data_length = 0; u32 kernel_size = 1; void build(const char* function_name, u32 _kernel_size) @@ -164,10 +166,17 @@ namespace vk "layout(std430, set=0, binding=0) buffer ssbo{ uint data[]; };\n\n" "\n" "#define KERNEL_SIZE %ks\n" + "\n" + "// Generic swap routines\n" "#define bswap_u16(bits) (bits & 0xFF) << 8 | (bits & 0xFF00) >> 8 | (bits & 0xFF0000) << 8 | (bits & 0xFF000000) >> 8\n" "#define bswap_u32(bits) (bits & 0xFF) << 24 | (bits & 0xFF00) << 8 | (bits & 0xFF0000) >> 8 | (bits & 0xFF000000) >> 24\n" "#define bswap_u16_u32(bits) 
(bits & 0xFFFF) << 16 | (bits & 0xFFFF0000) >> 16\n" "\n" + "// Depth format conversions\n" + "#define d24x8_to_f32(bits) floatBitsToUint(float(bits >> 8) / 16777214.f)\n" + "#define d24x8_to_d24x8_swapped(bits) (bits & 0xFF00FF00) | (bits & 0xFF0000) >> 16 | (bits & 0xFF) << 16\n" + "#define f32_to_d24x8_swapped(bits) d24x8_to_d24x8_swapped(uint(uintBitsToFloat(bits) * 16777214.f))\n" + "\n" "void main()\n" "{\n" " uint index = gl_GlobalInvocationID.x * KERNEL_SIZE;\n" @@ -192,23 +201,23 @@ namespace vk void bind_resources() override { - m_program->bind_buffer({ m_data->value, 0, VK_WHOLE_SIZE }, 0, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_descriptor_set); + m_program->bind_buffer({ m_data->value, m_data_offset, m_data_length }, 0, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_descriptor_set); } - void run(const vk::command_buffer& cmd, vk::buffer* data, u32 mem_size) + void run(VkCommandBuffer cmd, vk::buffer* data, u32 data_length, u32 data_offset = 0) { m_data = data; + m_data_offset = data_offset; + m_data_length = data_length; const auto num_bytes_per_invocation = optimal_group_size * kernel_size * 4; - const auto num_invocations = align(mem_size, 256) / num_bytes_per_invocation; + const auto num_invocations = align(data_length, 256) / num_bytes_per_invocation; compute_task::run(cmd, num_invocations); } }; struct cs_shuffle_16 : cs_shuffle_base { - vk::buffer* m_data; - // byteswap ushort cs_shuffle_16() { @@ -234,6 +243,33 @@ namespace vk } }; + struct cs_shuffle_d24x8_f32 : cs_shuffle_base + { + // convert d24x8 to f32 + cs_shuffle_d24x8_f32() + { + cs_shuffle_base::build("d24x8_to_f32", 32); + } + }; + + struct cs_shuffle_se_f32_d24x8 : cs_shuffle_base + { + // convert f32 to d24x8 and swap endianness + cs_shuffle_se_f32_d24x8() + { + cs_shuffle_base::build("f32_to_d24x8_swapped", 32); + } + }; + + struct cs_shuffle_se_d24x8 : cs_shuffle_base + { + // swap endianness of d24x8 + cs_shuffle_se_d24x8() + { + cs_shuffle_base::build("d24x8_to_d24x8_swapped", 32); + } + }; + 
// TODO: Replace with a proper manager extern std::unordered_map<u32, std::unique_ptr<vk::compute_task>> g_compute_tasks; diff --git a/rpcs3/Emu/RSX/VK/VKHelpers.cpp b/rpcs3/Emu/RSX/VK/VKHelpers.cpp index e9dbc0885e..47c0a568e1 100644 --- a/rpcs3/Emu/RSX/VK/VKHelpers.cpp +++ b/rpcs3/Emu/RSX/VK/VKHelpers.cpp @@ -347,13 +347,13 @@ namespace vk return g_drv_disable_fence_reset; } - void insert_buffer_memory_barrier(VkCommandBuffer cmd, VkBuffer buffer, VkPipelineStageFlags src_stage, VkPipelineStageFlags dst_stage, VkAccessFlags src_mask, VkAccessFlags dst_mask) + void insert_buffer_memory_barrier(VkCommandBuffer cmd, VkBuffer buffer, VkDeviceSize offset, VkDeviceSize length, VkPipelineStageFlags src_stage, VkPipelineStageFlags dst_stage, VkAccessFlags src_mask, VkAccessFlags dst_mask) { VkBufferMemoryBarrier barrier = {}; barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; barrier.buffer = buffer; - barrier.offset = 0; - barrier.size = VK_WHOLE_SIZE; + barrier.offset = offset; + barrier.size = length; barrier.srcAccessMask = src_mask; barrier.dstAccessMask = dst_mask; barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; diff --git a/rpcs3/Emu/RSX/VK/VKHelpers.h b/rpcs3/Emu/RSX/VK/VKHelpers.h index a9233ae1d3..40fba6c272 100644 --- a/rpcs3/Emu/RSX/VK/VKHelpers.h +++ b/rpcs3/Emu/RSX/VK/VKHelpers.h @@ -149,7 +149,8 @@ namespace vk void insert_texture_barrier(VkCommandBuffer cmd, VkImage image, VkImageLayout layout, VkImageSubresourceRange range); void insert_texture_barrier(VkCommandBuffer cmd, vk::image *image); - void insert_buffer_memory_barrier(VkCommandBuffer cmd, VkBuffer buffer, VkPipelineStageFlags src_stage, VkPipelineStageFlags dst_stage, VkAccessFlags src_mask, VkAccessFlags dst_mask); + void insert_buffer_memory_barrier(VkCommandBuffer cmd, VkBuffer buffer, VkDeviceSize offset, VkDeviceSize length, + VkPipelineStageFlags src_stage, VkPipelineStageFlags dst_stage, VkAccessFlags src_mask, VkAccessFlags dst_mask); //Manage 'uininterruptible' state where secondary operations (e.g violation
handlers) will have to wait void enter_uninterruptible(); diff --git a/rpcs3/Emu/RSX/VK/VKTexture.cpp b/rpcs3/Emu/RSX/VK/VKTexture.cpp index 8b70592a34..fa534e007e 100644 --- a/rpcs3/Emu/RSX/VK/VKTexture.cpp +++ b/rpcs3/Emu/RSX/VK/VKTexture.cpp @@ -152,7 +152,10 @@ namespace vk } else { - insert_buffer_memory_barrier(cmd, scratch_buf->value, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, + const auto elem_size = vk::get_format_texel_width(src->info.format); + const auto length = elem_size * src_copy.imageExtent.width * src_copy.imageExtent.height; + + insert_buffer_memory_barrier(cmd, scratch_buf->value, 0, length, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT); vk::cs_shuffle_base *shuffle_kernel = nullptr; @@ -177,12 +180,9 @@ namespace vk } } - const auto elem_size = vk::get_format_texel_width(src->info.format); - const auto length = elem_size * src_copy.imageExtent.width * src_copy.imageExtent.height; - shuffle_kernel->run(cmd, scratch_buf, length); - insert_buffer_memory_barrier(cmd, scratch_buf->value, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, + insert_buffer_memory_barrier(cmd, scratch_buf->value, 0, length, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_TRANSFER_READ_BIT); } } @@ -338,7 +338,7 @@ namespace vk info.imageSubresource = { aspect & transfer_flags, 0, 0, 1 }; vkCmdCopyImageToBuffer(cmd, src, preferred_src_format, scratch_buf->value, 1, &info); - insert_buffer_memory_barrier(cmd, scratch_buf->value, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_TRANSFER_READ_BIT); + insert_buffer_memory_barrier(cmd, scratch_buf->value, 0, VK_WHOLE_SIZE, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_TRANSFER_READ_BIT); info.imageSubresource = { 
VK_IMAGE_ASPECT_COLOR_BIT, 0, 0, 1 }; vkCmdCopyBufferToImage(cmd, scratch_buf->value, typeless, VK_IMAGE_LAYOUT_GENERAL, 1, &info); @@ -352,7 +352,7 @@ namespace vk info.imageOffset = { 0, (s32)src_h, 0 }; vkCmdCopyImageToBuffer(cmd, typeless, VK_IMAGE_LAYOUT_GENERAL, scratch_buf->value, 1, &info); - insert_buffer_memory_barrier(cmd, scratch_buf->value, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_TRANSFER_READ_BIT); + insert_buffer_memory_barrier(cmd, scratch_buf->value, 0, VK_WHOLE_SIZE, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_TRANSFER_READ_BIT); info.imageOffset = { dst_rect.x1, dst_rect.y1, 0 }; info.imageSubresource = { aspect & transfer_flags, 0, 0, 1 }; @@ -432,7 +432,6 @@ namespace vk u32 mipmap_level = 0; u32 block_in_pixel = get_format_block_size_in_texel(format); u8 block_size_in_bytes = get_format_block_size_in_bytes(format); - std::vector staging_buffer; //TODO: Depth and stencil transfer together flags &= ~(VK_IMAGE_ASPECT_STENCIL_BIT); @@ -447,49 +446,32 @@ namespace vk void *mapped_buffer = upload_heap.map(offset_in_buffer, image_linear_size + 8); void *dst = mapped_buffer; - bool use_staging = false; - if (dst_image->info.format == VK_FORMAT_D24_UNORM_S8_UINT || - dst_image->info.format == VK_FORMAT_D32_SFLOAT_S8_UINT) + if (dst_image->info.format == VK_FORMAT_D24_UNORM_S8_UINT) { //Misalign intentionally to skip the first stencil byte in D24S8 data //Ensures the real depth data is dword aligned - if (dst_image->info.format == VK_FORMAT_D32_SFLOAT_S8_UINT) - { - //Emulate D24x8 passthrough to D32 format - //Reads from GPU managed memory are slow at best and at worst unreliable - use_staging = true; - staging_buffer.resize(image_linear_size + 8); - dst = staging_buffer.data() + 4 - 1; - } - else - { - //Skip leading dword when writing to texture - offset_in_buffer += 4; - dst = (char*)(mapped_buffer) + 4 - 1; - } + //Skip 
leading dword when writing to texture + offset_in_buffer += 4; + dst = (char*)(mapped_buffer) + 4 - 1; } gsl::span<gsl::byte> mapped{ (gsl::byte*)dst, ::narrow<int>(image_linear_size) }; upload_texture_subresource(mapped, layout, format, is_swizzled, false, 256); - - if (use_staging) - { - if (dst_image->info.format == VK_FORMAT_D32_SFLOAT_S8_UINT) - { - //Map depth component from D24x8 to a f32 depth value - //NOTE: One byte (contains first S8 value) is skipped - rsx::convert_le_d24x8_to_le_f32(mapped_buffer, (char*)dst + 1, image_linear_size >> 2, 1); - } - else //unused - { - //Copy emulated data back to the target buffer - memcpy(mapped_buffer, dst, image_linear_size); - } - } - upload_heap.unmap(); + if (dst_image->info.format == VK_FORMAT_D32_SFLOAT_S8_UINT) + { + // Run GPU compute task to convert the D24x8 to FP32 + // NOTE: On commandbuffer submission, the HOST_WRITE to ALL_COMMANDS barrier is implicitly inserted according to spec + // No need to add another explicit barrier unless a driver bug is found + + vk::get_compute_task<vk::cs_shuffle_d24x8_f32>()->run(cmd, upload_heap.heap.get(), image_linear_size, offset_in_buffer); + + insert_buffer_memory_barrier(cmd, upload_heap.heap->value, offset_in_buffer, image_linear_size, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, + VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_TRANSFER_READ_BIT); + } + VkBufferImageCopy copy_info = {}; copy_info.bufferOffset = offset_in_buffer; copy_info.imageExtent.height = layout.height_in_block * block_in_pixel; diff --git a/rpcs3/Emu/RSX/VK/VKTextureCache.h b/rpcs3/Emu/RSX/VK/VKTextureCache.h index 0bfaf0d7ff..69c67ea87b 100644 --- a/rpcs3/Emu/RSX/VK/VKTextureCache.h +++ b/rpcs3/Emu/RSX/VK/VKTextureCache.h @@ -2,6 +2,7 @@ #include "stdafx.h" #include "VKRenderTargets.h" #include "VKGSRender.h" +#include "VKCompute.h" #include "Emu/System.h" #include "../Common/TextureUtils.h" #include "../rsx_utils.h" @@ -220,6 +221,29 @@ namespace vk change_image_layout(cmd, vram_texture, old_layout, subresource_range);
real_pitch = vk::get_format_texel_width(vram_texture->info.format) * transfer_width; + if (vram_texture->info.format == VK_FORMAT_D24_UNORM_S8_UINT) + { + vk::get_compute_task<vk::cs_shuffle_se_d24x8>()->run(cmd, dma_buffer.get(), cpu_address_range); + } + else if (vram_texture->info.format == VK_FORMAT_D32_SFLOAT_S8_UINT) + { + vk::get_compute_task<vk::cs_shuffle_se_f32_d24x8>()->run(cmd, dma_buffer.get(), cpu_address_range); + } + else if (pack_unpack_swap_bytes) + { + const auto texel_layout = vk::get_format_element_size(vram_texture->info.format); + const auto elem_size = texel_layout.first; + + if (elem_size == 2) + { + vk::get_compute_task<vk::cs_shuffle_16>()->run(cmd, dma_buffer.get(), cpu_address_range); + } + else if (elem_size == 4) + { + vk::get_compute_task<vk::cs_shuffle_32>()->run(cmd, dma_buffer.get(), cpu_address_range); + } + } + if (manage_cb_lifetime) { cmd.end(); @@ -238,56 +262,6 @@ namespace vk sync_timestamp = get_system_time(); } - template<typename T, bool swapped> - void do_memory_transfer_packed(void *pixels_dst, const void *pixels_src, u32 max_length) - { - if (sizeof(T) == 1 || !swapped) - { - memcpy(pixels_dst, pixels_src, max_length); - } - else - { - const u32 block_size = max_length / sizeof(T); - auto typed_dst = (be_t<T> *)pixels_dst; - auto typed_src = (T *)pixels_src; - - for (u32 px = 0; px < block_size; ++px) - typed_dst[px] = typed_src[px]; - } - } - - template<typename T, bool swapped> - void do_memory_transfer_padded(void *pixels_dst, const void *pixels_src, u32 src_pitch, u32 dst_pitch, u32 num_rows) - { - auto src = (char*)pixels_src; - auto dst = (char*)pixels_dst; - - if (sizeof(T) == 1 || !swapped) - { - for (u32 y = 0; y < num_rows; ++y) - { - memcpy(dst, src, src_pitch); - src += src_pitch; - dst += dst_pitch; - } - } - else - { - const u32 block_size = src_pitch / sizeof(T); - for (u32 y = 0; y < num_rows; ++y) - { - auto typed_dst = (be_t<T> *)dst; - auto typed_src = (T *)src; - - for (u32 px = 0; px < block_size; ++px) - typed_dst[px] = typed_src[px]; - - src += src_pitch; - dst += dst_pitch; - } - } - } - bool flush(vk::command_buffer& cmd, VkQueue
submit_queue) { if (flushed) return true; @@ -314,93 +288,26 @@ namespace vk void* pixels_src = dma_buffer->map(valid_range.first, valid_range.second); void* pixels_dst = get_raw_ptr(valid_range.first, true); - const auto texel_layout = vk::get_format_element_size(vram_texture->info.format); - const auto elem_size = texel_layout.first; - - auto memory_transfer_packed = [=]() - { - switch (elem_size) - { - default: - LOG_ERROR(RSX, "Invalid element width %d", elem_size); - case 1: - do_memory_transfer_packed(pixels_dst, pixels_src, valid_range.second); - break; - case 2: - if (pack_unpack_swap_bytes) - do_memory_transfer_packed(pixels_dst, pixels_src, valid_range.second); - else - do_memory_transfer_packed(pixels_dst, pixels_src, valid_range.second); - break; - case 4: - if (pack_unpack_swap_bytes) - do_memory_transfer_packed(pixels_dst, pixels_src, valid_range.second); - else - do_memory_transfer_packed(pixels_dst, pixels_src, valid_range.second); - break; - } - }; - - auto memory_transfer_padded = [=]() - { - const u32 num_rows = valid_range.second / rsx_pitch; - switch (elem_size) - { - default: - LOG_ERROR(RSX, "Invalid element width %d", elem_size); - case 1: - do_memory_transfer_padded(pixels_dst, pixels_src, real_pitch, rsx_pitch, num_rows); - break; - case 2: - if (pack_unpack_swap_bytes) - do_memory_transfer_padded(pixels_dst, pixels_src, real_pitch, rsx_pitch, num_rows); - else - do_memory_transfer_padded(pixels_dst, pixels_src, real_pitch, rsx_pitch, num_rows); - break; - case 4: - if (pack_unpack_swap_bytes) - do_memory_transfer_padded(pixels_dst, pixels_src, real_pitch, rsx_pitch, num_rows); - else - do_memory_transfer_padded(pixels_dst, pixels_src, real_pitch, rsx_pitch, num_rows); - break; - } - }; - - // NOTE: We have to do our own byte swapping since the driver doesnt do it for us - // TODO: Replace the cpu-side transformations with trivial compute pipelines if (real_pitch >= rsx_pitch || valid_range.second <= rsx_pitch) { - switch 
(vram_texture->info.format) - { - case VK_FORMAT_D32_SFLOAT_S8_UINT: - { - rsx::convert_le_f32_to_be_d24(pixels_dst, pixels_src, valid_range.second >> 2, 1); - break; - } - case VK_FORMAT_D24_UNORM_S8_UINT: - { - rsx::convert_le_d24x8_to_be_d24x8(pixels_dst, pixels_src, valid_range.second >> 2, 1); - break; - } - default: - { - memory_transfer_packed(); - break; - } - } + memcpy(pixels_dst, pixels_src, valid_range.second); } else { - memory_transfer_padded(); - - switch (vram_texture->info.format) + if (valid_range.second % rsx_pitch) { - case VK_FORMAT_D32_SFLOAT_S8_UINT: - rsx::convert_le_f32_to_be_d24(pixels_dst, pixels_dst, valid_range.second >> 2, 1); - break; - case VK_FORMAT_D24_UNORM_S8_UINT: - rsx::convert_le_d24x8_to_be_d24x8(pixels_dst, pixels_dst, valid_range.second >> 2, 1); - break; + fmt::throw_exception("Unreachable" HERE); + } + + const u32 num_rows = valid_range.second / rsx_pitch; + auto _src = (u8*)pixels_src; + auto _dst = (u8*)pixels_dst; + + for (u32 y = 0; y < num_rows; ++y) + { + memcpy(_dst, _src, real_pitch); + _src += real_pitch; + _dst += real_pitch; } }