From deb590cb05333ec7ec30f117f0fea5f3328ab056 Mon Sep 17 00:00:00 2001 From: kd-11 Date: Sun, 27 Aug 2017 16:22:59 +0300 Subject: [PATCH] rsx/vk: Bug fixes - Make each frame context own its own memory - Fix GPU blit - Fix image layout transitions in flip vk: Improve frame-local memory usage tracking to prevent overwrites - Also slightly bumps VRAM requirements for stream buffers to help with running out of storage - Fixes flickering and missing graphics in some cases. Flickering is still there and needs more work vk: Up vertex attribute heap size and increase the guard size on it vulkan: Reorganize memory management vulkan: blit cleanup vulkan: blit engine improvements - Override existing image mapping when conflicts detected - Allow blitting of depth/stencil surfaces --- rpcs3/Emu/RSX/D3D12/D3D12RenderTargetSets.h | 8 +- rpcs3/Emu/RSX/GL/GLTextureCache.h | 2 +- rpcs3/Emu/RSX/VK/VKGSRender.cpp | 89 ++++++---- rpcs3/Emu/RSX/VK/VKGSRender.h | 46 +++-- rpcs3/Emu/RSX/VK/VKHelpers.cpp | 4 +- rpcs3/Emu/RSX/VK/VKHelpers.h | 6 + rpcs3/Emu/RSX/VK/VKTexture.cpp | 46 +++-- rpcs3/Emu/RSX/VK/VKTextureCache.h | 180 ++++++++++++++++---- rpcs3/Emu/RSX/VK/VKVertexBuffers.cpp | 13 +- 9 files changed, 297 insertions(+), 97 deletions(-) diff --git a/rpcs3/Emu/RSX/D3D12/D3D12RenderTargetSets.h b/rpcs3/Emu/RSX/D3D12/D3D12RenderTargetSets.h index 3c8a9ba157..6b1da86704 100644 --- a/rpcs3/Emu/RSX/D3D12/D3D12RenderTargetSets.h +++ b/rpcs3/Emu/RSX/D3D12/D3D12RenderTargetSets.h @@ -59,10 +59,10 @@ struct render_target_traits { //TODO auto desc = surface->GetDesc(); - info->rsx_pitch = desc.Width; - info->native_pitch = desc.Width; - info->surface_width = desc.Width; - info->surface_height = desc.Height; + info->rsx_pitch = static_cast(desc.Width); + info->native_pitch = static_cast(desc.Width); + info->surface_width = static_cast(desc.Width); + info->surface_height = static_cast(desc.Height); info->bpp = 1; } diff --git a/rpcs3/Emu/RSX/GL/GLTextureCache.h b/rpcs3/Emu/RSX/GL/GLTextureCache.h 
index cb57102199..bd8aee4e00 100644 --- a/rpcs3/Emu/RSX/GL/GLTextureCache.h +++ b/rpcs3/Emu/RSX/GL/GLTextureCache.h @@ -1290,7 +1290,7 @@ namespace gl //These textures are completely GPU resident so we dont watch for CPU access //There's no data to be fetched from the CPU //Its is possible for a title to attempt to read from the region, but the CPU path should be used in such cases - cached.protect(utils::protection::rw); + cached.protect(utils::protection::ro); cached.set_dirty(false); return true; diff --git a/rpcs3/Emu/RSX/VK/VKGSRender.cpp b/rpcs3/Emu/RSX/VK/VKGSRender.cpp index 51b7685fdd..63d73023f7 100644 --- a/rpcs3/Emu/RSX/VK/VKGSRender.cpp +++ b/rpcs3/Emu/RSX/VK/VKGSRender.cpp @@ -568,21 +568,8 @@ VKGSRender::VKGSRender() : GSRender() m_secondary_command_buffer_pool.create((*m_device)); m_secondary_command_buffer.create(m_secondary_command_buffer_pool); - //VRAM allocation - m_attrib_ring_info.init(VK_ATTRIB_RING_BUFFER_SIZE_M * 0x100000); - m_attrib_ring_info.heap.reset(new vk::buffer(*m_device, VK_ATTRIB_RING_BUFFER_SIZE_M * 0x100000, m_memory_type_mapping.host_visible_coherent, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT, 0)); - m_uniform_buffer_ring_info.init(VK_UBO_RING_BUFFER_SIZE_M * 0x100000); - m_uniform_buffer_ring_info.heap.reset(new vk::buffer(*m_device, VK_UBO_RING_BUFFER_SIZE_M * 0x100000, m_memory_type_mapping.host_visible_coherent, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT, 0)); - m_index_buffer_ring_info.init(VK_INDEX_RING_BUFFER_SIZE_M * 0x100000); - m_index_buffer_ring_info.heap.reset(new vk::buffer(*m_device, VK_INDEX_RING_BUFFER_SIZE_M * 0x100000, m_memory_type_mapping.host_visible_coherent, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, VK_BUFFER_USAGE_INDEX_BUFFER_BIT, 0)); - m_texture_upload_buffer_ring_info.init(VK_TEXTURE_UPLOAD_RING_BUFFER_SIZE_M * 0x100000); - 
m_texture_upload_buffer_ring_info.heap.reset(new vk::buffer(*m_device, VK_TEXTURE_UPLOAD_RING_BUFFER_SIZE_M * 0x100000, m_memory_type_mapping.host_visible_coherent, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, VK_BUFFER_USAGE_TRANSFER_SRC_BIT, 0)); - - //Empty view to bind to buffer locations without data - m_null_buffer_view.reset(new vk::buffer_view(*m_device, m_attrib_ring_info.heap->value, VK_FORMAT_R8_UINT, 0, 0)); - + //Precalculated stuff m_render_passes = get_precomputed_render_passes(*m_device, m_optimal_tiling_supported_formats); - std::tie(pipeline_layout, descriptor_layouts) = get_shared_pipeline_layout(*m_device); //Generate frame contexts @@ -595,15 +582,24 @@ VKGSRender::VKGSRender() : GSRender() VkSemaphoreCreateInfo semaphore_info = {}; semaphore_info.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO; + //VRAM allocation + m_attrib_ring_info.init(VK_ATTRIB_RING_BUFFER_SIZE_M * 0x100000, 0x400000); + m_attrib_ring_info.heap.reset(new vk::buffer(*m_device, VK_ATTRIB_RING_BUFFER_SIZE_M * 0x100000, m_memory_type_mapping.host_visible_coherent, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT, 0)); + m_uniform_buffer_ring_info.init(VK_UBO_RING_BUFFER_SIZE_M * 0x100000); + m_uniform_buffer_ring_info.heap.reset(new vk::buffer(*m_device, VK_UBO_RING_BUFFER_SIZE_M * 0x100000, m_memory_type_mapping.host_visible_coherent, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT, 0)); + m_index_buffer_ring_info.init(VK_INDEX_RING_BUFFER_SIZE_M * 0x100000); + m_index_buffer_ring_info.heap.reset(new vk::buffer(*m_device, VK_INDEX_RING_BUFFER_SIZE_M * 0x100000, m_memory_type_mapping.host_visible_coherent, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, VK_BUFFER_USAGE_INDEX_BUFFER_BIT, 0)); + m_texture_upload_buffer_ring_info.init(VK_TEXTURE_UPLOAD_RING_BUFFER_SIZE_M * 0x100000); + 
m_texture_upload_buffer_ring_info.heap.reset(new vk::buffer(*m_device, VK_TEXTURE_UPLOAD_RING_BUFFER_SIZE_M * 0x100000, m_memory_type_mapping.host_visible_coherent, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, VK_BUFFER_USAGE_TRANSFER_SRC_BIT, 0)); + for (auto &ctx : frame_context_storage) { - ctx = {}; vkCreateSemaphore((*m_device), &semaphore_info, nullptr, &ctx.present_semaphore); ctx.descriptor_pool.create(*m_device, sizes.data(), static_cast(sizes.size())); } null_buffer = std::make_unique(*m_device, 32, m_memory_type_mapping.host_visible_coherent, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT, 0); - null_buffer_view = std::make_unique(*m_device, null_buffer->value, VK_FORMAT_R32_SFLOAT, 0, 32); + null_buffer_view = std::make_unique(*m_device, null_buffer->value, VK_FORMAT_R8_UINT, 0, 32); vk::initialize_compiler_context(); @@ -664,8 +660,7 @@ VKGSRender::~VKGSRender() //Global resources vk::destroy_global_resources(); - //Data heaps/buffers - m_null_buffer_view.reset(); + //Heaps m_index_buffer_ring_info.heap.reset(); m_uniform_buffer_ring_info.heap.reset(); m_attrib_ring_info.heap.reset(); @@ -677,6 +672,15 @@ VKGSRender::~VKGSRender() //Frame context m_framebuffers_to_clean.clear(); + + if (m_current_frame == &m_aux_frame_context) + { + //Return resources back to the owner + m_current_frame = &frame_context_storage[m_current_queue_index]; + m_current_frame->swap_storage(m_aux_frame_context); + m_current_frame->grab_resources(m_aux_frame_context); + } + m_aux_frame_context.buffer_views_to_clean.clear(); m_aux_frame_context.samplers_to_clean.clear(); @@ -858,11 +862,6 @@ void VKGSRender::begin() CHECK_RESULT(vkResetDescriptorPool(*m_device, m_current_frame->descriptor_pool, 0)); m_current_frame->used_descriptors = 0; - - m_uniform_buffer_ring_info.reset_allocation_stats(); - m_index_buffer_ring_info.reset_allocation_stats(); - 
m_attrib_ring_info.reset_allocation_stats(); - m_texture_upload_buffer_ring_info.reset_allocation_stats(); } if (m_attrib_ring_info.is_critical() || @@ -875,6 +874,12 @@ void VKGSRender::begin() flush_command_queue(true); m_vertex_cache->purge(); + m_index_buffer_ring_info.reset_allocation_stats(); + m_uniform_buffer_ring_info.reset_allocation_stats(); + m_attrib_ring_info.reset_allocation_stats(); + m_texture_upload_buffer_ring_info.reset_allocation_stats(); + m_current_frame->reset_heap_ptrs(); + std::chrono::time_point submit_end = steady_clock::now(); m_flip_time += std::chrono::duration_cast(submit_end - submit_start).count(); } @@ -1031,7 +1036,8 @@ void VKGSRender::end() continue; } - vk::image_view *texture0 = m_texture_cache.upload_texture(*m_current_command_buffer, rsx::method_registers.fragment_textures[i], m_rtts, m_memory_type_mapping, m_texture_upload_buffer_ring_info, m_texture_upload_buffer_ring_info.heap.get()); + vk::image_view *texture0 = m_texture_cache.upload_texture(*m_current_command_buffer, rsx::method_registers.fragment_textures[i], m_rtts, m_memory_type_mapping, + m_texture_upload_buffer_ring_info, m_texture_upload_buffer_ring_info.heap.get()); if (!texture0) { @@ -1085,7 +1091,8 @@ void VKGSRender::end() continue; } - vk::image_view *texture0 = m_texture_cache.upload_texture(*m_current_command_buffer, rsx::method_registers.vertex_textures[i], m_rtts, m_memory_type_mapping, m_texture_upload_buffer_ring_info, m_texture_upload_buffer_ring_info.heap.get()); + vk::image_view *texture0 = m_texture_cache.upload_texture(*m_current_command_buffer, rsx::method_registers.vertex_textures[i], m_rtts, m_memory_type_mapping, + m_texture_upload_buffer_ring_info, m_texture_upload_buffer_ring_info.heap.get()); if (!texture0) { @@ -1502,6 +1509,10 @@ void VKGSRender::advance_queued_frames() }); m_vertex_cache->purge(); + m_current_frame->tag_frame_end(m_attrib_ring_info.get_current_put_pos_minus_one(), + 
m_uniform_buffer_ring_info.get_current_put_pos_minus_one(), + m_index_buffer_ring_info.get_current_put_pos_minus_one(), + m_texture_upload_buffer_ring_info.get_current_put_pos_minus_one()); m_current_queue_index = (m_current_queue_index + 1) % VK_MAX_ASYNC_FRAMES; m_current_frame = &frame_context_storage[m_current_queue_index]; @@ -1574,6 +1585,17 @@ void VKGSRender::process_swap_request(frame_context_t *ctx, bool free_resources) ctx->buffer_views_to_clean.clear(); ctx->samplers_to_clean.clear(); + + if (ctx->last_frame_sync_time > m_last_heap_sync_time) + { + m_last_heap_sync_time = ctx->last_frame_sync_time; + + //Heap cleanup; deallocates memory consumed by the frame if it is still held + m_attrib_ring_info.m_get_pos = ctx->attrib_heap_ptr; + m_uniform_buffer_ring_info.m_get_pos = ctx->ubo_heap_ptr; + m_index_buffer_ring_info.m_get_pos = ctx->index_heap_ptr; + m_texture_upload_buffer_ring_info.m_get_pos = ctx->texture_upload_heap_ptr; + } } ctx->swap_command_buffer = nullptr; @@ -1629,11 +1651,17 @@ bool VKGSRender::check_program_status() auto rtt_lookup_func = [this](u32 texaddr, rsx::fragment_texture&, bool is_depth) -> std::tuple { vk::render_target *surface = nullptr; + if (!is_depth) surface = m_rtts.get_texture_from_render_target_if_applicable(texaddr); else + { surface = m_rtts.get_texture_from_depth_stencil_if_applicable(texaddr); + if (!surface && m_texture_cache.is_depth_texture(texaddr, m_rtts)) + return std::make_tuple(true, 0); + } + if (!surface) return std::make_tuple(false, 0); return std::make_tuple(true, surface->native_pitch); @@ -2366,7 +2394,7 @@ void VKGSRender::flip(int buffer) barrier.oldLayout = VK_IMAGE_LAYOUT_PRESENT_SRC_KHR; barrier.image = target_image; barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; - barrier.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; + barrier.dstAccessMask = VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT; barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; 
barrier.subresourceRange = subres; @@ -2496,11 +2524,6 @@ void VKGSRender::flip(int buffer) std::chrono::time_point flip_end = steady_clock::now(); m_flip_time = std::chrono::duration_cast(flip_end - flip_start).count(); - m_uniform_buffer_ring_info.reset_allocation_stats(); - m_index_buffer_ring_info.reset_allocation_stats(); - m_attrib_ring_info.reset_allocation_stats(); - m_texture_upload_buffer_ring_info.reset_allocation_stats(); - //NOTE:Resource destruction is handled within the real swap handler m_frame->flip(m_context); @@ -2526,6 +2549,8 @@ void VKGSRender::flip(int buffer) bool VKGSRender::scaled_image_from_memory(rsx::blit_src_info& src, rsx::blit_dst_info& dst, bool interpolate) { + close_render_pass(); + return m_texture_cache.upload_scaled_image(src, dst, interpolate, (*m_device), *m_current_command_buffer, m_memory_type_mapping, m_swap_chain->get_present_queue(), m_rtts, m_texture_upload_buffer_ring_info, m_texture_upload_buffer_ring_info.heap.get()); -} +} \ No newline at end of file diff --git a/rpcs3/Emu/RSX/VK/VKGSRender.h b/rpcs3/Emu/RSX/VK/VKGSRender.h index 6feb21270a..ce62b503e8 100644 --- a/rpcs3/Emu/RSX/VK/VKGSRender.h +++ b/rpcs3/Emu/RSX/VK/VKGSRender.h @@ -25,11 +25,11 @@ namespace vk using shader_cache = rsx::shaders_cache; } -//Heap allocation sizes in MB +//Heap allocation sizes in MB - each 'frame' owns a private heap, one of each kind #define VK_ATTRIB_RING_BUFFER_SIZE_M 256 #define VK_UBO_RING_BUFFER_SIZE_M 64 #define VK_INDEX_RING_BUFFER_SIZE_M 64 -#define VK_TEXTURE_UPLOAD_RING_BUFFER_SIZE_M 128 +#define VK_TEXTURE_UPLOAD_RING_BUFFER_SIZE_M 64 #define VK_MAX_ASYNC_CB_COUNT 64 #define VK_MAX_ASYNC_FRAMES 2 @@ -117,8 +117,6 @@ private: vk::glsl::program *m_program; vk::context m_thread_context; - vk::vk_data_heap m_attrib_ring_info; - vk::texture_cache m_texture_cache; rsx::vk_render_targets m_rtts; @@ -141,12 +139,6 @@ private: vk::render_device *m_device; vk::swap_chain* m_swap_chain; - //buffer - vk::vk_data_heap 
m_uniform_buffer_ring_info; - vk::vk_data_heap m_index_buffer_ring_info; - vk::vk_data_heap m_texture_upload_buffer_ring_info; - std::unique_ptr m_null_buffer_view; - //Vulkan internals vk::command_pool m_command_buffer_pool; @@ -165,6 +157,12 @@ private: std::unique_ptr m_draw_fbo; + u64 m_last_heap_sync_time = 0; + vk::vk_data_heap m_attrib_ring_info; + vk::vk_data_heap m_uniform_buffer_ring_info; + vk::vk_data_heap m_index_buffer_ring_info; + vk::vk_data_heap m_texture_upload_buffer_ring_info; + struct frame_context_t { VkSemaphore present_semaphore = VK_NULL_HANDLE; @@ -178,6 +176,14 @@ private: u32 present_image = UINT32_MAX; command_buffer_chunk* swap_command_buffer = nullptr; + //Heap pointers + s64 attrib_heap_ptr = 0; + s64 ubo_heap_ptr = 0; + s64 index_heap_ptr = 0; + s64 texture_upload_heap_ptr = 0; + + u64 last_frame_sync_time = 0; + //Copy shareable information void grab_resources(frame_context_t &other) { @@ -185,6 +191,11 @@ private: descriptor_set = other.descriptor_set; descriptor_pool = other.descriptor_pool; used_descriptors = other.used_descriptors; + + attrib_heap_ptr = other.attrib_heap_ptr; + ubo_heap_ptr = other.ubo_heap_ptr; + index_heap_ptr = other.index_heap_ptr; + texture_upload_heap_ptr = other.texture_upload_heap_ptr; } //Exchange storage (non-copyable) @@ -193,6 +204,21 @@ private: std::swap(buffer_views_to_clean, other.buffer_views_to_clean); std::swap(samplers_to_clean, other.samplers_to_clean); } + + void tag_frame_end(s64 attrib_loc, s64 ubo_loc, s64 index_loc, s64 texture_loc) + { + attrib_heap_ptr = attrib_loc; + ubo_heap_ptr = ubo_loc; + index_heap_ptr = index_loc; + texture_upload_heap_ptr = texture_loc; + + last_frame_sync_time = get_system_time(); + } + + void reset_heap_ptrs() + { + last_frame_sync_time = 0; + } }; std::array frame_context_storage; diff --git a/rpcs3/Emu/RSX/VK/VKHelpers.cpp b/rpcs3/Emu/RSX/VK/VKHelpers.cpp index a42e2fffd6..302d16c756 100644 --- a/rpcs3/Emu/RSX/VK/VKHelpers.cpp +++ 
b/rpcs3/Emu/RSX/VK/VKHelpers.cpp @@ -233,7 +233,7 @@ namespace vk break; case VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL: case VK_IMAGE_LAYOUT_PRESENT_SRC_KHR: - barrier.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_MEMORY_READ_BIT; + barrier.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT; dst_stage = VK_PIPELINE_STAGE_TRANSFER_BIT; break; case VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL: @@ -258,7 +258,7 @@ namespace vk break; case VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL: case VK_IMAGE_LAYOUT_PRESENT_SRC_KHR: - barrier.srcAccessMask = VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_MEMORY_READ_BIT; + barrier.srcAccessMask = VK_ACCESS_TRANSFER_READ_BIT; src_stage = VK_PIPELINE_STAGE_TRANSFER_BIT; break; case VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL: diff --git a/rpcs3/Emu/RSX/VK/VKHelpers.h b/rpcs3/Emu/RSX/VK/VKHelpers.h index b1f79294a5..d100412fb3 100644 --- a/rpcs3/Emu/RSX/VK/VKHelpers.h +++ b/rpcs3/Emu/RSX/VK/VKHelpers.h @@ -458,6 +458,12 @@ namespace vk CHECK_RESULT(vkCreateImageView(m_device, &info, nullptr, &value)); } + image_view(VkDevice dev, VkImageViewCreateInfo create_info) + : m_device(dev), info(create_info) + { + CHECK_RESULT(vkCreateImageView(m_device, &info, nullptr, &value)); + } + ~image_view() { vkDestroyImageView(m_device, value, nullptr); diff --git a/rpcs3/Emu/RSX/VK/VKTexture.cpp b/rpcs3/Emu/RSX/VK/VKTexture.cpp index fcc92ef756..1ba4960bc3 100644 --- a/rpcs3/Emu/RSX/VK/VKTexture.cpp +++ b/rpcs3/Emu/RSX/VK/VKTexture.cpp @@ -97,27 +97,47 @@ namespace vk a_dst = a_src; //TODO: Use an array of offsets/dimensions for mipmapped blits (mipmap count > 1) since subimages will have different dimensions - - VkImageBlit rgn = {}; - rgn.srcOffsets[0] = { (int32_t)src_x_offset, (int32_t)src_y_offset, 0 }; - rgn.srcOffsets[1] = { (int32_t)src_width, (int32_t)src_height, 1 }; - rgn.dstOffsets[0] = { (int32_t)dst_x_offset, (int32_t)dst_y_offset, 0 }; - rgn.dstOffsets[1] = { (int32_t)(dst_width + dst_x_offset), (int32_t)(dst_height + dst_y_offset), 1 }; - 
rgn.dstSubresource = a_dst; - rgn.srcSubresource = a_src; - if (srcLayout != VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL) change_image_layout(cmd, src, srcLayout, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, vk::get_image_subresource_range(0, 0, 1, 1, aspect)); if (dstLayout != VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL) change_image_layout(cmd, dst, dstLayout, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, vk::get_image_subresource_range(0, 0, 1, 1, aspect)); - for (u32 mip_level = 0; mip_level < mipmaps; ++mip_level) + if (src_width != dst_width || src_height != dst_height || mipmaps > 1) { - vkCmdBlitImage(cmd, src, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, dst, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1, &rgn, VK_FILTER_LINEAR); + if ((aspect & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) != 0) + { + //Most depth/stencil formats cannot be scaled using hw blit + LOG_ERROR(RSX, "Cannot perform scaled blit for depth/stencil images"); + return; + } - rgn.srcSubresource.mipLevel++; - rgn.dstSubresource.mipLevel++; + VkImageBlit rgn = {}; + rgn.srcOffsets[0] = { (int32_t)src_x_offset, (int32_t)src_y_offset, 0 }; + rgn.srcOffsets[1] = { (int32_t)(src_width + src_x_offset), (int32_t)(src_height + src_y_offset), 1 }; + rgn.dstOffsets[0] = { (int32_t)dst_x_offset, (int32_t)dst_y_offset, 0 }; + rgn.dstOffsets[1] = { (int32_t)(dst_width + dst_x_offset), (int32_t)(dst_height + dst_y_offset), 1 }; + rgn.dstSubresource = a_dst; + rgn.srcSubresource = a_src; + + for (u32 mip_level = 0; mip_level < mipmaps; ++mip_level) + { + vkCmdBlitImage(cmd, src, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, dst, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1, &rgn, VK_FILTER_LINEAR); + + rgn.srcSubresource.mipLevel++; + rgn.dstSubresource.mipLevel++; + } + } + else + { + VkImageCopy copy_rgn; + copy_rgn.srcOffset = { (int32_t)src_x_offset, (int32_t)src_y_offset, 0 }; + copy_rgn.dstOffset = { (int32_t)dst_x_offset, (int32_t)dst_y_offset, 0 }; + copy_rgn.dstSubresource = { (VkImageAspectFlags)aspect, 0, 0, 1 }; + 
copy_rgn.srcSubresource = { (VkImageAspectFlags)aspect, 0, 0, 1 }; + copy_rgn.extent = { src_width, src_height, 1 }; + + vkCmdCopyImage(cmd, src, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, dst, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1, ©_rgn); } if (srcLayout != VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL) diff --git a/rpcs3/Emu/RSX/VK/VKTextureCache.h b/rpcs3/Emu/RSX/VK/VKTextureCache.h index d58e10a5ed..f05b9a2565 100644 --- a/rpcs3/Emu/RSX/VK/VKTextureCache.h +++ b/rpcs3/Emu/RSX/VK/VKTextureCache.h @@ -607,6 +607,40 @@ namespace vk purge_cache(); } + bool is_depth_texture(const u32 texaddr, rsx::vk_render_targets &m_rtts) + { + if (m_rtts.get_texture_from_depth_stencil_if_applicable(texaddr)) + return true; + + reader_lock lock(m_cache_mutex); + + auto found = m_cache.find(texaddr); + if (found == m_cache.end()) + return false; + + if (found->second.valid_count == 0) + return false; + + for (auto& tex : found->second.data) + { + if (tex.is_dirty()) + continue; + + switch (tex.get_format()) + { + case VK_FORMAT_D16_UNORM: + case VK_FORMAT_D32_SFLOAT_S8_UINT: + case VK_FORMAT_D24_UNORM_S8_UINT: + return true; + default: + return false; + } + } + + //Unreachable; silence compiler warning anyway + return false; + } + template vk::image_view* upload_texture(command_buffer &cmd, RsxTextureType &tex, rsx::vk_render_targets &m_rtts, const vk::memory_type_mapping &memory_type_mapping, vk_data_heap& upload_heap, vk::buffer* upload_buffer) { @@ -652,12 +686,6 @@ namespace vk return rtt_texture->get_view(); } - u32 raw_format = tex.format(); - u32 format = raw_format & ~(CELL_GCM_TEXTURE_LN | CELL_GCM_TEXTURE_UN); - - VkComponentMapping mapping = get_component_map(tex, format); - VkFormat vk_format = get_compatible_sampler_format(format); - VkImageType image_type; VkImageViewType image_view_type; u16 height = 0; @@ -696,7 +724,8 @@ namespace vk break; } - cached_texture_section& region = find_cached_texture(texaddr, range, true, tex.width(), height, tex.get_exact_mipmap_count()); + 
//Ignoring the mipmaps count is intentional - its common for games to pass in incorrect values as mipmap count + cached_texture_section& region = find_cached_texture(texaddr, range, true, tex.width(), height, 0); if (region.exists() && !region.is_dirty()) { return region.get_view().get(); @@ -712,6 +741,12 @@ namespace vk return nullptr; } + u32 raw_format = tex.format(); + u32 format = raw_format & ~(CELL_GCM_TEXTURE_LN | CELL_GCM_TEXTURE_UN); + + VkComponentMapping mapping = get_component_map(tex, format); + VkFormat vk_format = get_compatible_sampler_format(format); + vk::image *image = new vk::image(*vk::get_current_renderer(), memory_type_mapping.device_local, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, image_type, vk_format, @@ -1104,16 +1139,20 @@ namespace vk bool dst_is_argb8 = (dst.format == rsx::blit_engine::transfer_destination_format::a8r8g8b8); bool src_is_argb8 = (src.format == rsx::blit_engine::transfer_source_format::a8r8g8b8); - VkFormat src_vk_format = src_is_argb8 ? VK_FORMAT_B8G8R8A8_UNORM : VK_FORMAT_R5G6B5_UNORM_PACK16; + const VkComponentMapping rgba_map = { VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_G, VK_COMPONENT_SWIZZLE_B, VK_COMPONENT_SWIZZLE_A }; + const VkComponentMapping bgra_map = { VK_COMPONENT_SWIZZLE_B, VK_COMPONENT_SWIZZLE_G, VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_A }; + + auto dest_mapping = (!dst_is_argb8 || dst.swizzled) ? 
bgra_map : rgba_map; vk::image* vram_texture = nullptr; vk::image* dest_texture = nullptr; + cached_texture_section* cached_dest = nullptr; const u32 src_address = (u32)((u64)src.pixels - (u64)vm::base(0)); const u32 dst_address = (u32)((u64)dst.pixels - (u64)vm::base(0)); //Check if src/dst are parts of render targets - auto dst_subres = m_rtts.get_surface_subresource_if_applicable(dst_address, dst.width, dst.clip_height, dst.pitch, true, true, true); + auto dst_subres = m_rtts.get_surface_subresource_if_applicable(dst_address, dst.width, dst.clip_height, dst.pitch, true, true, false); dst_is_render_target = dst_subres.surface != nullptr; u16 max_dst_width = dst.width; @@ -1165,7 +1204,7 @@ namespace vk { //First check if this surface exists in VRAM with exact dimensions //Since scaled GPU resources are not invalidated by the CPU, we need to reuse older surfaces if possible - auto cached_dest = find_texture_from_dimensions(dst.rsx_address, dst.pitch * dst.clip_height, dst_dimensions.width, dst_dimensions.height); + cached_dest = find_texture_from_dimensions(dst.rsx_address, dst.pitch * dst.clip_height, dst_dimensions.width, dst_dimensions.height); //Check for any available region that will fit this one if (!cached_dest) cached_dest = find_texture_from_range(dst.rsx_address, dst.pitch * dst.clip_height); @@ -1187,6 +1226,21 @@ namespace vk max_dst_width = cached_dest->get_width(); max_dst_height = cached_dest->get_height(); + + //If dest has a component swizzle (usually caused by ARGB->BGRA compatibility when uploading from cpu) remove it + auto& image_view = cached_dest->get_view(); + + if (image_view->info.components.a != dest_mapping.a || + image_view->info.components.r != dest_mapping.r || + image_view->info.components.g != dest_mapping.g || + image_view->info.components.b != dest_mapping.b) + { + auto create_info = image_view->info; + create_info.components = dest_mapping; + + m_temporary_image_view.push_back(std::move(image_view)); + image_view.reset(new 
vk::image_view(dev, create_info)); + } } else if (is_memcpy) { @@ -1220,7 +1274,7 @@ namespace vk } //TODO: Handle cases where src or dst can be a depth texture while the other is a color texture - requires a render pass to emulate - auto src_subres = m_rtts.get_surface_subresource_if_applicable(src_address, src.width, src.height, src.pitch, true, true, true); + auto src_subres = m_rtts.get_surface_subresource_if_applicable(src_address, src.width, src.height, src.pitch, true, true, false); src_is_render_target = src_subres.surface != nullptr; //Create source texture if does not exist @@ -1235,7 +1289,9 @@ namespace vk else { flush_address(src_address, dev, cmd, memory_types, submit_queue); - writer_lock lock(m_cache_mutex); + + const VkFormat src_vk_format = src_is_argb8 ? VK_FORMAT_R8G8B8A8_UNORM : VK_FORMAT_R5G6B5_UNORM_PACK16; + const VkComponentMapping component_mapping = (!src_is_argb8 || dst.swizzled) ? bgra_map : rgba_map; //Upload texture from CPU vk::image *image = new vk::image(*vk::get_current_renderer(), memory_types.device_local, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, @@ -1245,12 +1301,15 @@ namespace vk VK_IMAGE_TILING_OPTIMAL, VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_SAMPLED_BIT, 0); vk::image_view *view = new vk::image_view(*vk::get_current_renderer(), image->value, VK_IMAGE_VIEW_TYPE_2D, src_vk_format, - { VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_G, VK_COMPONENT_SWIZZLE_B, VK_COMPONENT_SWIZZLE_A }, - { VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1 }); + component_mapping, { VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1 }); - cached_texture_section& region = find_cached_texture(dst.rsx_address, src.pitch * src.slice_h, true, src.width, src.slice_h, 1); - region.reset(src.rsx_address, src.pitch * src.slice_h); - region.create(src.width, src.slice_h, 1, 1, view, dest_texture); + change_image_layout(cmd, image, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, { VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1 }); + + cached_texture_section& region = 
find_cached_texture(src_address, src.pitch * src.slice_h, true, src.width, src.slice_h, 1); + + writer_lock lock(m_cache_mutex); + region.reset(src_address, src.pitch * src.slice_h); + region.create(src.width, src.slice_h, 1, 1, view, image); region.protect(utils::protection::ro); region.set_dirty(false); @@ -1262,14 +1321,16 @@ namespace vk auto &subres = layout.back(); subres.width_in_block = src.width; subres.height_in_block = src.slice_h; - subres.pitch_in_bytes = src.pitch; + subres.pitch_in_bytes = src.width; //Seems to be a typo - should be pitch_in_block subres.depth = 1; - subres.data = {(const gsl::byte*)src.pixels, src.pitch * src.slice_h}; + subres.data = {(const gsl::byte*)src.pixels, align(src.pitch, 256) * src.slice_h}; - copy_mipmaped_image_using_buffer(cmd, image->value, layout, src_vk_format, false, 1, - upload_heap, upload_buffer); + copy_mipmaped_image_using_buffer(cmd, image->value, layout, src_is_argb8? CELL_GCM_TEXTURE_A8R8G8B8: CELL_GCM_TEXTURE_R5G6B5, + false, 1, upload_heap, upload_buffer); vk::leave_uninterruptible(); + + vram_texture = image; } } else @@ -1294,6 +1355,54 @@ namespace vk vram_texture = src_subres.surface; } + VkImageAspectFlags aspect_to_copy = VK_IMAGE_ASPECT_COLOR_BIT; + bool dest_exists = dest_texture != nullptr; + VkFormat dst_vk_format = dst_is_argb8 ? VK_FORMAT_B8G8R8A8_UNORM : VK_FORMAT_R5G6B5_UNORM_PACK16; + const u8 bpp = dst_is_argb8 ? 
4 : 2; + const u32 real_width = dst.pitch / bpp; + + //If src is depth, dest has to be depth as well + if (src_subres.is_depth_surface) + { + if (dest_exists) + { + if (dst_is_render_target && !dst_subres.is_depth_surface) + { + LOG_ERROR(RSX, "Depth->RGBA blit requested but not supported"); + return true; + } + + if (!dst_is_render_target) + { + if (dest_texture->info.format != src_subres.surface->info.format) + { + cached_dest->unprotect(); + cached_dest->set_dirty(true); + + dest_exists = false; + cached_dest = nullptr; + } + } + else + { + if (dst_subres.surface->info.format != src_subres.surface->info.format) + { + LOG_ERROR(RSX, "Depth blit requested, but formats do not match (0x%X vs 0x%X)", + (u32)dst_subres.surface->info.format, (u32)src_subres.surface->info.format); + return true; + } + } + } + + dst_vk_format = src_subres.surface->info.format; + dest_mapping = { VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_R }; + + if (dst_vk_format == VK_FORMAT_D16_UNORM) + aspect_to_copy = VK_IMAGE_ASPECT_DEPTH_BIT; + else + aspect_to_copy = VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT; + } + //Validate clip offsets (Persona 4 Arena at 720p) //Check if can fit //NOTE: It is possible that the check is simpler (if (clip_x >= clip_width)) @@ -1313,11 +1422,6 @@ namespace vk src_area.y2 += scaled_clip_offset_y; } - bool dest_exists = dest_texture != nullptr; - const VkFormat dst_vk_format = dst_is_argb8 ? VK_FORMAT_R8G8B8A8_UNORM : VK_FORMAT_R5G6B5_UNORM_PACK16; - const u8 bpp = dst_is_argb8 ? 
4 : 2; - const u32 real_width = dst.pitch / bpp; - if (!dest_exists) { dest_texture = new vk::image(*vk::get_current_renderer(), memory_types.device_local, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, @@ -1325,15 +1429,32 @@ namespace vk dst_vk_format, real_width, dst.clip_height, 1, 1, 1, VK_SAMPLE_COUNT_1_BIT, VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_TILING_OPTIMAL, VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_SAMPLED_BIT, 0); + + change_image_layout(cmd, dest_texture, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, { aspect_to_copy, 0, 1, 0, 1}); } //Copy data + u32 src_width = src_area.x2 - src_area.x1; + u32 src_height = src_area.y2 - src_area.y1; + u32 dst_width = dst_area.x2 - dst_area.x1; + u32 dst_height = dst_area.y2 - dst_area.y1; + + if (dst.clip_width != dst_width || + dst.clip_height != dst_height) + { + //clip reproject + src_width = (src_width * dst.clip_width) / dst_width; + src_height = (src_height * dst.clip_height) / dst_height; + } + copy_scaled_image(cmd, vram_texture->value, dest_texture->value, vram_texture->current_layout, dest_texture->current_layout, - src_area.x1, src_area.y1, src_w, src_h, dst_area.x1, dst_area.y1, dst.clip_width, dst.clip_height, 1, VK_IMAGE_ASPECT_COLOR_BIT); + src_area.x1, src_area.y1, src_width, src_height, dst_offset.x, dst_offset.y, dst.clip_width, dst.clip_height, 1, (VkImageAspectFlagBits)aspect_to_copy); if (dest_exists) return true; + change_image_layout(cmd, dest_texture, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, { aspect_to_copy, 0, 1, 0, 1 }); + //TODO: Verify if any titles ever scale into CPU memory. 
It defeats the purpose of uploading data to the GPU, but it could happen //If so, add this texture to the no_access queue not the read_only queue cached_texture_section& region = find_cached_texture(dst.rsx_address, dst.pitch * dst.clip_height, true, real_width, dst.clip_height, 1); @@ -1344,12 +1465,11 @@ namespace vk //Its is possible for a title to attempt to read from the region, but the CPU path should be used in such cases vk::image_view *view = new vk::image_view(*vk::get_current_renderer(), dest_texture->value, VK_IMAGE_VIEW_TYPE_2D, dst_vk_format, - { VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_G, VK_COMPONENT_SWIZZLE_B, VK_COMPONENT_SWIZZLE_A }, - { VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1 }); + dest_mapping, { aspect_to_copy & ~(VK_IMAGE_ASPECT_STENCIL_BIT), 0, 1, 0, 1 }); region.reset(dst.rsx_address, dst.pitch * dst.clip_height); region.create(real_width, dst.clip_height, 1, 1, view, dest_texture); - region.protect(utils::protection::rw); + region.protect(utils::protection::ro); region.set_dirty(false); read_only_range = region.get_min_max(read_only_range); diff --git a/rpcs3/Emu/RSX/VK/VKVertexBuffers.cpp b/rpcs3/Emu/RSX/VK/VKVertexBuffers.cpp index 5a340112fd..385bbe515f 100644 --- a/rpcs3/Emu/RSX/VK/VKVertexBuffers.cpp +++ b/rpcs3/Emu/RSX/VK/VKVertexBuffers.cpp @@ -230,7 +230,8 @@ VKGSRender::upload_vertex_data() if (auto cached = m_vertex_cache->find_vertex_range(storage_address, VK_FORMAT_R8_UINT, required.first)) { in_cache = true; - m_current_frame->buffer_views_to_clean.push_back(std::make_unique(*m_device, m_attrib_ring_info.heap->value, VK_FORMAT_R8_UINT, cached->offset_in_heap, required.first)); + m_current_frame->buffer_views_to_clean.push_back(std::make_unique(*m_device, + m_attrib_ring_info.heap->value, VK_FORMAT_R8_UINT, cached->offset_in_heap, required.first)); } else { @@ -241,7 +242,8 @@ VKGSRender::upload_vertex_data() if (!in_cache) { persistent_offset = (u32)m_attrib_ring_info.alloc<256>(required.first); - 
m_current_frame->buffer_views_to_clean.push_back(std::make_unique(*m_device, m_attrib_ring_info.heap->value, VK_FORMAT_R8_UINT, persistent_offset, required.first)); + m_current_frame->buffer_views_to_clean.push_back(std::make_unique(*m_device, + m_attrib_ring_info.heap->value, VK_FORMAT_R8_UINT, persistent_offset, required.first)); if (to_store) { @@ -254,19 +256,20 @@ VKGSRender::upload_vertex_data() } else { - persistent_view = m_null_buffer_view->value; + persistent_view = null_buffer_view->value; } if (required.second > 0) { volatile_offset = (u32)m_attrib_ring_info.alloc<256>(required.second); - m_current_frame->buffer_views_to_clean.push_back(std::make_unique(*m_device, m_attrib_ring_info.heap->value, VK_FORMAT_R8_UINT, volatile_offset, required.second)); + m_current_frame->buffer_views_to_clean.push_back(std::make_unique(*m_device, + m_attrib_ring_info.heap->value, VK_FORMAT_R8_UINT, volatile_offset, required.second)); volatile_view = m_current_frame->buffer_views_to_clean.back()->value; } else { - volatile_view = m_null_buffer_view->value; + volatile_view = null_buffer_view->value; } m_program->bind_uniform(persistent_view, "persistent_input_stream", m_current_frame->descriptor_set);