From 67949bb5b784ad72094507ca685299aad0a1a387 Mon Sep 17 00:00:00 2001
From: kd-11
Date: Tue, 19 Jan 2021 00:40:56 +0300
Subject: [PATCH] vk/dma: Allow interoperability between pass-through and
 write-back DMA caching types

---
 rpcs3/Emu/RSX/VK/VKDMA.cpp     | 221 +++++++++++++++------------------
 rpcs3/Emu/RSX/VK/VKDMA.h       |  17 +--
 rpcs3/Emu/RSX/VK/VKHelpers.cpp |   6 +-
 rpcs3/Emu/RSX/VK/VKTexture.cpp |  23 +++-
 4 files changed, 123 insertions(+), 144 deletions(-)

diff --git a/rpcs3/Emu/RSX/VK/VKDMA.cpp b/rpcs3/Emu/RSX/VK/VKDMA.cpp
index b03fb10264..b93edbf12d 100644
--- a/rpcs3/Emu/RSX/VK/VKDMA.cpp
+++ b/rpcs3/Emu/RSX/VK/VKDMA.cpp
@@ -12,16 +12,15 @@ namespace vk
 {
 	static constexpr usz s_dma_block_length = 0x00010000;
 	static constexpr u32 s_dma_block_mask = 0xFFFF0000;
-	//static constexpr u32 s_dma_offset_mask = 0x0000FFFF;
-
-	static constexpr u32 s_page_size = 65536;
-	static constexpr u32 s_page_align = s_page_size - 1;
-	static constexpr u32 s_pages_per_entry = 32;
-	static constexpr u32 s_bits_per_page = 2;
-	static constexpr u32 s_bytes_per_entry = (s_page_size * s_pages_per_entry);
 
 	std::unordered_map<u32, std::unique_ptr<dma_block>> g_dma_pool;
 
+	dma_block::~dma_block()
+	{
+		// Use safe free (uses gc to clean up)
+		free();
+	}
+
 	void* dma_block::map_range(const utils::address_range& range)
 	{
 		if (inheritance_info.parent)
@@ -49,19 +48,24 @@ namespace vk
 
 	void dma_block::allocate(const render_device& dev, usz size)
 	{
-		if (allocated_memory)
-		{
-			// Acquired blocks are always to be assumed dirty. It is not possible to synchronize host access and inline
-			// buffer copies without causing weird issues. Overlapped incomplete data ends up overwriting host-uploaded data.
-			auto gc = vk::get_resource_manager();
-			gc->dispose(allocated_memory);
-		}
+		// Acquired blocks are always to be assumed dirty. It is not possible to synchronize host access and inline
+		// buffer copies without causing weird issues. Overlapped incomplete data ends up overwriting host-uploaded data.
+		free();
 
 		allocated_memory = std::make_unique<buffer>(dev, size,
 			dev.get_memory_mapping().host_visible_coherent, VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
 			VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, 0);
 	}
 
+	void dma_block::free()
+	{
+		if (allocated_memory)
+		{
+			auto gc = vk::get_resource_manager();
+			gc->dispose(allocated_memory);
+		}
+	}
+
 	void dma_block::init(const render_device& dev, u32 addr, usz size)
 	{
 		ensure(size);
@@ -69,7 +73,6 @@ namespace vk
 		base_address = addr;
 
 		allocate(dev, size);
-		page_info.resize(size / s_bytes_per_entry, ~0ull);
 	}
 
 	void dma_block::init(dma_block* parent, u32 addr, usz size)
@@ -79,67 +82,6 @@ namespace vk
 		inheritance_info.block_offset = (addr - parent->base_address);
 	}
 
-	void dma_block::set_page_bit(u32 offset, u64 bits)
-	{
-		const auto entry = (offset / s_bytes_per_entry);
-		const auto word = entry / s_pages_per_entry;
-		const auto shift = (entry % s_pages_per_entry) * s_bits_per_page;
-
-		page_info[word] &= ~(3 << shift);
-		page_info[word] |= (bits << shift);
-	}
-
-	bool dma_block::test_page_bit(u32 offset, u64 bits)
-	{
-		const auto entry = (offset / s_bytes_per_entry);
-		const auto word = entry / s_pages_per_entry;
-		const auto shift = (entry % s_pages_per_entry) * s_bits_per_page;
-
-		return !!(page_info[word] & (bits << shift));
-	}
-
-	void dma_block::mark_dirty(const utils::address_range& range)
-	{
-		if (!inheritance_info.parent)
-		{
-			const u32 start = utils::align(range.start, s_page_size);
-			const u32 end = ((range.end + 1) & s_page_align);
-
-			for (u32 page = start; page < end; page += s_page_size)
-			{
-				set_page_bit(page - base_address, page_bits::dirty);
-			}
-
-			if (start > range.start) [[unlikely]]
-			{
-				set_page_bit(start - s_page_size, page_bits::nocache);
-			}
-
-			if (end < range.end) [[unlikely]]
-			{
-				set_page_bit(end + s_page_size, page_bits::nocache);
-			}
-		}
-		else
-		{
-			inheritance_info.parent->mark_dirty(range);
-		}
-	}
-
-	void dma_block::set_page_info(u32 page_offset, const std::vector<u64>& bits)
-	{
-		if (!inheritance_info.parent)
-		{
-			auto bit_offset = page_offset / s_bytes_per_entry;
-			ensure(bit_offset + bits.size() <= page_info.size());
-			std::memcpy(page_info.data() + bit_offset, bits.data(), bits.size());
-		}
-		else
-		{
-			inheritance_info.parent->set_page_info(page_offset + inheritance_info.block_offset, bits);
-		}
-	}
-
 	void dma_block::flush(const utils::address_range& range)
 	{
 		auto src = map_range(range);
@@ -206,11 +148,10 @@ namespace vk
 		{
 			// Acquired blocks are always to be assumed dirty. It is not possible to synchronize host access and inline
 			// buffer copies without causing weird issues. Overlapped incomplete data ends up overwriting host-uploaded data.
-			auto gc = vk::get_resource_manager();
-			gc->dispose(allocated_memory);
+			free();
 
-			parent->set_page_info(inheritance_info.block_offset, page_info);
-			page_info.clear();
+			//parent->set_page_info(inheritance_info.block_offset, page_info);
+			//page_info.clear();
 		}
 	}
 
@@ -222,8 +163,8 @@ namespace vk
 
 		allocate(dev, new_size);
 
-		const auto required_entries = new_size / s_bytes_per_entry;
-		page_info.resize(required_entries, ~0ull);
+		//const auto required_entries = new_size / s_bytes_per_entry;
+		//page_info.resize(required_entries, ~0ull);
 	}
 
 	u32 dma_block::start() const
@@ -244,13 +185,9 @@ namespace vk
 
 	void dma_block_EXT::allocate(const render_device& dev, usz size)
 	{
-		if (allocated_memory)
-		{
-			// Acquired blocks are always to be assumed dirty. It is not possible to synchronize host access and inline
-			// buffer copies without causing weird issues. Overlapped incomplete data ends up overwriting host-uploaded data.
-			auto gc = vk::get_resource_manager();
-			gc->dispose(allocated_memory);
-		}
+		// Acquired blocks are always to be assumed dirty. It is not possible to synchronize host access and inline
+		// buffer copies without causing weird issues. Overlapped incomplete data ends up overwriting host-uploaded data.
+		free();
 
 		allocated_memory = std::make_unique<buffer>(dev,
 			VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT,
@@ -278,16 +215,53 @@ namespace vk
 		// NOP
 	}
 
-	void create_dma_block(std::unique_ptr<dma_block>& block)
+	bool test_host_pointer(u32 base_address, usz length)
 	{
+#if 0 // Unusable due to vm locks
+		auto block = vm::get(vm::any, base_address);
+		ensure(block);
+
+		if ((block->addr + block->size) < (base_address + length))
+		{
+			return false;
+		}
+
+		if (block->flags & 0x120)
+		{
+			return true;
+		}
+
+		auto range_info = block->peek(base_address, u32(length));
+		return !!range_info.second;
+#endif
+
 #ifdef _WIN32
-		const bool allow_host_buffers = true;
+		MEMORY_BASIC_INFORMATION mem_info;
+		if (!::VirtualQuery(vm::get_super_ptr(base_address), &mem_info, sizeof(mem_info)))
+		{
+			rsx_log.error("VirtualQuery failed! LastError=0x%x", GetLastError());
+			return false;
+		}
+
+		return (mem_info.RegionSize >= length);
 #else
-		// Anything running on AMDGPU kernel driver will not work due to the check for fd-backed memory allocations
+		return true; // *nix behavior is unknown with NVIDIA drivers
+#endif
+	}
+
+	void create_dma_block(std::unique_ptr<dma_block>& block, u32 base_address, u32 expected_length)
+	{
 		const auto vendor = g_render_device->gpu().get_driver_vendor();
+
+#ifdef _WIN32
+		const bool allow_host_buffers = (vendor == driver_vendor::NVIDIA) ?
+			test_host_pointer(base_address, expected_length) :
+			true;
+#else
+		// Anything running on AMDGPU kernel driver will not work due to the check for fd-backed memory allocations
 		const bool allow_host_buffers = (vendor != driver_vendor::AMD && vendor != driver_vendor::RADV);
 #endif
-		if (g_render_device->get_external_memory_host_support() && allow_host_buffers)
+		if (allow_host_buffers && g_render_device->get_external_memory_host_support())
 		{
 			block.reset(new dma_block_EXT());
 		}
@@ -295,6 +269,8 @@ namespace vk
 		{
 			block.reset(new dma_block());
 		}
+
+		block->init(*g_render_device, base_address, expected_length);
 	}
 
 	std::pair<u32, buffer*> map_dma(const command_buffer& cmd, u32 local_address, u32 length)
@@ -315,25 +291,28 @@ namespace vk
 		if (first_block == last_block) [[likely]]
 		{
 			auto &block_info = g_dma_pool[first_block];
-			if (!block_info) create_dma_block(block_info);
-
-			block_info->init(*g_render_device, first_block, s_dma_block_length);
+			ensure(!block_info);
+			create_dma_block(block_info, first_block, s_dma_block_length);
 			return block_info->get(map_range);
 		}
 
 		dma_block* block_head = nullptr;
 		auto block_end = utils::align(limit, s_dma_block_length);
 
-		// Reverse scan to try and find the minimum required length in case of other chaining
-		for (auto block = last_block; block != first_block; block -= s_dma_block_length)
+		if (g_render_device->gpu().get_driver_vendor() != driver_vendor::NVIDIA ||
+			rsx::get_location(local_address) == CELL_GCM_LOCATION_LOCAL)
 		{
-			if (auto found = g_dma_pool.find(block); found != g_dma_pool.end())
+			// Reverse scan to try and find the minimum required length in case of other chaining
+			for (auto block = last_block; block != first_block; block -= s_dma_block_length)
 			{
-				const auto end = found->second->end();
-				last_block = std::max(last_block, end & s_dma_block_mask);
-				block_end = std::max(block_end, end + 1);
+				if (auto found = g_dma_pool.find(block); found != g_dma_pool.end())
+				{
+					const auto end = found->second->end();
+					last_block = std::max(last_block, end & s_dma_block_mask);
+					block_end = std::max(block_end, end + 1);
 
-				break;
+					break;
+				}
 			}
 		}
 
@@ -342,37 +321,31 @@ namespace vk
 		{
 			auto found = g_dma_pool.find(block);
 			auto &entry = g_dma_pool[block];
 
-			const bool exists = !!entry;
-			if (!exists) create_dma_block(entry);
-
 			if (block == first_block)
 			{
-				block_head = entry->head();
-
-				if (exists)
+				if (entry && entry->end() < limit)
 				{
-					if (entry->end() < limit)
-					{
-						auto new_length = block_end - block_head->start();
-						block_head->extend(cmd, *g_render_device, new_length);
-					}
+					// Then the references to this object do not go to the end of the list as will be done with this new allocation.
+					// A dumb release is therefore safe...
+					entry.reset();
 				}
-				else
+
+				if (!entry)
 				{
 					auto required_size = (block_end - block);
-					block_head->init(*g_render_device, block, required_size);
+					create_dma_block(entry, block, required_size);
 				}
+
+				block_head = entry->head();
+			}
+			else if (entry)
+			{
+				entry->set_parent(cmd, block_head);
 			}
 			else
 			{
-				if (exists)
-				{
-					entry->set_parent(cmd, block_head);
-				}
-				else
-				{
-					entry->init(block_head, block, s_dma_block_length);
-				}
+				entry.reset(new dma_block());
+				entry->init(block_head, block, s_dma_block_length);
 			}
 		}
diff --git a/rpcs3/Emu/RSX/VK/VKDMA.h b/rpcs3/Emu/RSX/VK/VKDMA.h
index cfdeb146b1..c0f0789f9a 100644
--- a/rpcs3/Emu/RSX/VK/VKDMA.h
+++ b/rpcs3/Emu/RSX/VK/VKDMA.h
@@ -13,13 +13,6 @@ namespace vk
 	class dma_block
 	{
 	protected:
-		enum page_bits
-		{
-			synchronized = 0,
-			dirty = 1,
-			nocache = 3
-		};
-
 		struct
 		{
 			dma_block* parent = nullptr;
@@ -29,19 +22,17 @@ namespace vk
 		u32 base_address = 0;
 		std::unique_ptr<buffer> allocated_memory;
-		std::vector<u64> page_info;
 
 		virtual void allocate(const render_device& dev, usz size);
+		virtual void free();
 		virtual void* map_range(const utils::address_range& range);
 		virtual void unmap();
 
-		void set_page_bit(u32 page, u64 bits);
-		bool test_page_bit(u32 page, u64 bits);
-		void mark_dirty(const utils::address_range& range);
-		void set_page_info(u32 page_offset, const std::vector<u64>& bits);
-
 	public:
+		dma_block() = default;
+		virtual ~dma_block();
+
 		virtual void init(const render_device& dev, u32 addr, usz size);
 		virtual void init(dma_block* parent, u32 addr, usz size);
 		virtual void flush(const utils::address_range& range);
diff --git a/rpcs3/Emu/RSX/VK/VKHelpers.cpp b/rpcs3/Emu/RSX/VK/VKHelpers.cpp
index 8b4941b9c9..ae0cdc8ce9 100644
--- a/rpcs3/Emu/RSX/VK/VKHelpers.cpp
+++ b/rpcs3/Emu/RSX/VK/VKHelpers.cpp
@@ -70,7 +70,6 @@ namespace vk
 		vk::clear_resolve_helpers();
 		vk::clear_dma_resources();
 		vk::vmm_reset();
-		vk::get_resource_manager()->destroy();
 
 		vk::clear_scratch_resources();
 		vk::get_upload_heap()->destroy();
@@ -86,6 +85,9 @@ namespace vk
 			p.second->destroy();
 		}
 		g_overlay_passes.clear();
+
+		// This must be the last item destroyed
+		vk::get_resource_manager()->destroy();
 	}
 
 	const vk::render_device *get_current_renderer()
@@ -263,8 +265,6 @@ namespace vk
 		return (g_num_processed_frames > 0)? g_num_processed_frames - 1: 0;
 	}
 
-
-
 	void do_query_cleanup(vk::command_buffer& cmd)
 	{
 		auto renderer = dynamic_cast<VKGSRender*>(rsx::get_current_renderer());
diff --git a/rpcs3/Emu/RSX/VK/VKTexture.cpp b/rpcs3/Emu/RSX/VK/VKTexture.cpp
index d748b39bec..694edc504c 100644
--- a/rpcs3/Emu/RSX/VK/VKTexture.cpp
+++ b/rpcs3/Emu/RSX/VK/VKTexture.cpp
@@ -905,6 +905,8 @@ namespace vk
 			}
 
 			auto dma_mapping = vk::map_dma(cmd, static_cast<u32>(src_address), static_cast<u32>(data_length));
+
+			ensure(dma_mapping.second->size() >= (dma_mapping.first + data_length));
 			vk::load_dma(::narrow<u32>(src_address), data_length);
 
 			upload_buffer = dma_mapping.second;
@@ -927,7 +929,7 @@ namespace vk
 			}
 
 			// Copy from upload heap to scratch mem
-			if (!opt.deferred_cmds.empty())
+			if (opt.require_upload)
 			{
 				for (const auto& copy_cmd : opt.deferred_cmds)
 				{
@@ -953,7 +955,8 @@ namespace vk
 				scratch_offset += image_linear_size;
 				ensure((scratch_offset + image_linear_size) <= scratch_buf->size()); // "Out of scratch memory"
 			}
-			else if (opt.require_upload)
+
+			if (opt.require_upload)
 			{
 				if (upload_commands.empty() || upload_buffer->value != upload_commands.back().first)
 				{
@@ -974,7 +977,19 @@ namespace vk
 		{
 			ensure(scratch_buf);
 
-			vkCmdCopyBuffer(cmd, upload_buffer->value, scratch_buf->value, static_cast<u32>(buffer_copies.size()), buffer_copies.data());
+			if (upload_commands.size() > 1)
+			{
+				auto range_ptr = buffer_copies.data();
+				for (const auto& op : upload_commands)
+				{
+					vkCmdCopyBuffer(cmd, op.first, scratch_buf->value, op.second, range_ptr);
+					range_ptr += op.second;
+				}
+			}
+			else
+			{
+				vkCmdCopyBuffer(cmd, upload_buffer->value, scratch_buf->value, static_cast<u32>(buffer_copies.size()), buffer_copies.data());
+			}
 
 			insert_buffer_memory_barrier(cmd, scratch_buf->value, 0, scratch_offset,
 				VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
 				VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT);
@@ -1020,7 +1035,7 @@ namespace vk
 			vkCmdCopyBufferToImage(cmd, scratch_buf->value, dst_image->value, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, static_cast<u32>(copy_regions.size()), copy_regions.data());
 		}
-		else if (opt.require_upload)
+		else if (upload_commands.size() > 1)
 		{
 			auto region_ptr = copy_regions.data();
 			for (const auto& op : upload_commands)
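
Aside (not part of kd-11's patch): the Windows-only probe added by test_host_pointer() boils down to asking VirtualQuery whether the super-pointer for the guest range lives in a single committed VA region; only then does create_dma_block() pick the pass-through dma_block_EXT path on NVIDIA, otherwise map_dma falls back to the write-back dma_block. The standalone sketch below is illustrative only: the helper name host_range_is_one_region is hypothetical, and it also subtracts the pointer's offset inside the region and checks MEM_COMMIT, which the patch's simpler RegionSize >= length heuristic does not do.

#include <cstddef>
#ifdef _WIN32
#include <windows.h>
#endif

// Sketch only: mirrors the shape of the VirtualQuery check used by this patch.
// Returns true when [ptr, ptr + length) appears to sit inside one committed
// region and can therefore be considered for host-pointer import.
static bool host_range_is_one_region(const void* ptr, std::size_t length)
{
#ifdef _WIN32
	MEMORY_BASIC_INFORMATION info{};
	if (!::VirtualQuery(ptr, &info, sizeof(info)))
	{
		return false; // query failed; be conservative and refuse the import
	}

	// RegionSize is measured from the base of the page containing ptr, so
	// account for the offset of ptr within that region before comparing.
	const auto offset = static_cast<const char*>(ptr) - static_cast<const char*>(info.BaseAddress);
	return info.State == MEM_COMMIT && info.RegionSize >= static_cast<std::size_t>(offset) + length;
#else
	(void)ptr;
	(void)length;
	return true; // as in the patch, non-Windows targets assume the range is usable
#endif
}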