vk/dma: Allow interoperability between pass-through and write-back DMA caching types

Authored by kd-11 on 2021-01-19 00:40:56 +03:00; committed by kd-11
parent e56da4eb46
commit 67949bb5b7
4 changed files with 123 additions and 144 deletions

View File

@@ -12,16 +12,15 @@ namespace vk
{
static constexpr usz s_dma_block_length = 0x00010000;
static constexpr u32 s_dma_block_mask = 0xFFFF0000;
//static constexpr u32 s_dma_offset_mask = 0x0000FFFF;
static constexpr u32 s_page_size = 65536;
static constexpr u32 s_page_align = s_page_size - 1;
static constexpr u32 s_pages_per_entry = 32;
static constexpr u32 s_bits_per_page = 2;
static constexpr u32 s_bytes_per_entry = (s_page_size * s_pages_per_entry);
std::unordered_map<u32, std::unique_ptr<dma_block>> g_dma_pool;
dma_block::~dma_block()
{
// Use safe free (uses gc to clean up)
free();
}
void* dma_block::map_range(const utils::address_range& range)
{
if (inheritance_info.parent)
@@ -49,19 +48,24 @@ namespace vk
void dma_block::allocate(const render_device& dev, usz size)
{
if (allocated_memory)
{
// Acquired blocks are always to be assumed dirty. It is not possible to synchronize host access and inline
// buffer copies without causing weird issues. Overlapped incomplete data ends up overwriting host-uploaded data.
auto gc = vk::get_resource_manager();
gc->dispose(allocated_memory);
}
// Acquired blocks are always to be assumed dirty. It is not possible to synchronize host access and inline
// buffer copies without causing weird issues. Overlapped incomplete data ends up overwriting host-uploaded data.
free();
allocated_memory = std::make_unique<vk::buffer>(dev, size,
dev.get_memory_mapping().host_visible_coherent, VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, 0);
}
void dma_block::free()
{
if (allocated_memory)
{
auto gc = vk::get_resource_manager();
gc->dispose(allocated_memory);
}
}
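For context, free() above relies on the resource manager's deferred disposal: dispose() takes ownership of the buffer and only destroys it once any GPU work that may still reference it has finished. A minimal generic sketch of that pattern, under that assumption (the class and method names below are illustrative; only dispose() itself appears in the actual code):

#include <memory>
#include <vector>

// Illustrative sketch of a deferred-disposal queue; names are hypothetical.
template <typename T>
class deferred_disposer
{
	std::vector<std::unique_ptr<T>> m_graveyard;

public:
	void dispose(std::unique_ptr<T>& obj)
	{
		// Take ownership immediately; the caller's pointer becomes empty.
		m_graveyard.emplace_back(std::move(obj));
	}

	void collect()
	{
		// Call once the GPU work referencing these objects is known to be finished.
		m_graveyard.clear();
	}
};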
void dma_block::init(const render_device& dev, u32 addr, usz size)
{
ensure(size);
@@ -69,7 +73,6 @@ namespace vk
base_address = addr;
allocate(dev, size);
page_info.resize(size / s_bytes_per_entry, ~0ull);
}
void dma_block::init(dma_block* parent, u32 addr, usz size)
@@ -79,67 +82,6 @@ namespace vk
inheritance_info.block_offset = (addr - parent->base_address);
}
void dma_block::set_page_bit(u32 offset, u64 bits)
{
const auto entry = (offset / s_bytes_per_entry);
const auto word = entry / s_pages_per_entry;
const auto shift = (entry % s_pages_per_entry) * s_bits_per_page;
page_info[word] &= ~(3 << shift);
page_info[word] |= (bits << shift);
}
bool dma_block::test_page_bit(u32 offset, u64 bits)
{
const auto entry = (offset / s_bytes_per_entry);
const auto word = entry / s_pages_per_entry;
const auto shift = (entry % s_pages_per_entry) * s_bits_per_page;
return !!(page_info[word] & (bits << shift));
}
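As a worked example of the packing these helpers manage (this hunk appears to remove them), using the constants declared at the top of the file: pages are 64 KiB, each page gets a 2-bit state, and 32 such states fit in one u64, so each page_info entry covers 2 MiB of the block. A self-contained sketch of that scheme, not a line-for-line copy of the functions above:

#include <cstdint>
#include <vector>

// Sketch of a 2-bit-per-page state table: 64 KiB pages, 32 states per 64-bit word.
void set_page_state(std::vector<std::uint64_t>& words, std::uint32_t byte_offset, std::uint64_t state)
{
	const auto page  = byte_offset / 0x10000u; // 64 KiB page index
	const auto word  = page / 32;              // 32 two-bit fields per u64
	const auto shift = (page % 32) * 2;        // bit position inside the word

	words[word] &= ~(3ull << shift);           // clear the old 2-bit state
	words[word] |= (state << shift);           // write the new state
}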
void dma_block::mark_dirty(const utils::address_range& range)
{
if (!inheritance_info.parent)
{
const u32 start = utils::align(range.start, s_page_size);
const u32 end = ((range.end + 1) & s_page_align);
for (u32 page = start; page < end; page += s_page_size)
{
set_page_bit(page - base_address, page_bits::dirty);
}
if (start > range.start) [[unlikely]]
{
set_page_bit(start - s_page_size, page_bits::nocache);
}
if (end < range.end) [[unlikely]]
{
set_page_bit(end + s_page_size, page_bits::nocache);
}
}
else
{
inheritance_info.parent->mark_dirty(range);
}
}
void dma_block::set_page_info(u32 page_offset, const std::vector<u64>& bits)
{
if (!inheritance_info.parent)
{
auto bit_offset = page_offset / s_bytes_per_entry;
ensure(bit_offset + bits.size() <= page_info.size());
std::memcpy(page_info.data() + bit_offset, bits.data(), bits.size());
}
else
{
inheritance_info.parent->set_page_info(page_offset + inheritance_info.block_offset, bits);
}
}
void dma_block::flush(const utils::address_range& range)
{
auto src = map_range(range);
@@ -206,11 +148,10 @@ namespace vk
{
// Acquired blocks are always to be assumed dirty. It is not possible to synchronize host access and inline
// buffer copies without causing weird issues. Overlapped incomplete data ends up overwriting host-uploaded data.
auto gc = vk::get_resource_manager();
gc->dispose(allocated_memory);
free();
parent->set_page_info(inheritance_info.block_offset, page_info);
page_info.clear();
//parent->set_page_info(inheritance_info.block_offset, page_info);
//page_info.clear();
}
}
@@ -222,8 +163,8 @@ namespace vk
allocate(dev, new_size);
const auto required_entries = new_size / s_bytes_per_entry;
page_info.resize(required_entries, ~0ull);
//const auto required_entries = new_size / s_bytes_per_entry;
//page_info.resize(required_entries, ~0ull);
}
u32 dma_block::start() const
@@ -244,13 +185,9 @@ namespace vk
void dma_block_EXT::allocate(const render_device& dev, usz size)
{
if (allocated_memory)
{
// Acquired blocks are always to be assumed dirty. It is not possible to synchronize host access and inline
// buffer copies without causing weird issues. Overlapped incomplete data ends up overwriting host-uploaded data.
auto gc = vk::get_resource_manager();
gc->dispose(allocated_memory);
}
// Acquired blocks are always to be assumed dirty. It is not possible to synchronize host access and inline
// buffer copies without causing weird issues. Overlapped incomplete data ends up overwriting host-uploaded data.
free();
allocated_memory = std::make_unique<vk::buffer>(dev,
VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT,
@@ -278,16 +215,53 @@ namespace vk
// NOP
}
void create_dma_block(std::unique_ptr<dma_block>& block)
bool test_host_pointer(u32 base_address, usz length)
{
#if 0 // Unusable due to vm locks
auto block = vm::get(vm::any, base_address);
ensure(block);
if ((block->addr + block->size) < (base_address + length))
{
return false;
}
if (block->flags & 0x120)
{
return true;
}
auto range_info = block->peek(base_address, u32(length));
return !!range_info.second;
#endif
#ifdef _WIN32
const bool allow_host_buffers = true;
MEMORY_BASIC_INFORMATION mem_info;
if (!::VirtualQuery(vm::get_super_ptr<const void>(base_address), &mem_info, sizeof(mem_info)))
{
rsx_log.error("VirtualQuery failed! LastError=0x%x", GetLastError());
return false;
}
return (mem_info.RegionSize >= length);
#else
// Anything running on AMDGPU kernel driver will not work due to the check for fd-backed memory allocations
return true; // *nix behavior is unknown with NVIDIA drivers
#endif
}
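The probe above matters because dma_block_EXT presumably maps guest memory into Vulkan through VK_EXT_external_memory_host, which requires the whole range to be one importable host allocation; VirtualQuery's RegionSize covers everything from the queried address to the end of the region with identical attributes, so RegionSize >= length means the range does not cross a mapping boundary. A minimal sketch of such an import (standard Vulkan API, not RPCS3 code; the device, pointer and memory-type selection are assumed to come from elsewhere):

#include <vulkan/vulkan.h>

// Sketch: import an existing host allocation as VkDeviceMemory via VK_EXT_external_memory_host.
VkDeviceMemory import_host_range(VkDevice device, void* host_ptr, VkDeviceSize size, uint32_t memory_type_index)
{
	VkImportMemoryHostPointerInfoEXT import_info{};
	import_info.sType        = VK_STRUCTURE_TYPE_IMPORT_MEMORY_HOST_POINTER_INFO_EXT;
	import_info.handleType   = VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT;
	import_info.pHostPointer = host_ptr; // must honor minImportedHostPointerAlignment

	VkMemoryAllocateInfo alloc_info{};
	alloc_info.sType           = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO;
	alloc_info.pNext           = &import_info;
	alloc_info.allocationSize  = size;   // also subject to the alignment requirement
	alloc_info.memoryTypeIndex = memory_type_index;

	VkDeviceMemory memory = VK_NULL_HANDLE;
	return (vkAllocateMemory(device, &alloc_info, nullptr, &memory) == VK_SUCCESS) ? memory : VK_NULL_HANDLE;
}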
void create_dma_block(std::unique_ptr<dma_block>& block, u32 base_address, u32 expected_length)
{
const auto vendor = g_render_device->gpu().get_driver_vendor();
#ifdef _WIN32
const bool allow_host_buffers = (vendor == driver_vendor::NVIDIA) ?
test_host_pointer(base_address, expected_length) :
true;
#else
// Anything running on AMDGPU kernel driver will not work due to the check for fd-backed memory allocations
const bool allow_host_buffers = (vendor != driver_vendor::AMD && vendor != driver_vendor::RADV);
#endif
if (g_render_device->get_external_memory_host_support() && allow_host_buffers)
if (allow_host_buffers && g_render_device->get_external_memory_host_support())
{
block.reset(new dma_block_EXT());
}
@@ -295,6 +269,8 @@ namespace vk
{
block.reset(new dma_block());
}
block->init(*g_render_device, base_address, expected_length);
}
std::pair<u32, vk::buffer*> map_dma(const command_buffer& cmd, u32 local_address, u32 length)
@@ -315,25 +291,28 @@ namespace vk
if (first_block == last_block) [[likely]]
{
auto &block_info = g_dma_pool[first_block];
if (!block_info) create_dma_block(block_info);
block_info->init(*g_render_device, first_block, s_dma_block_length);
ensure(!block_info);
create_dma_block(block_info, first_block, s_dma_block_length);
return block_info->get(map_range);
}
dma_block* block_head = nullptr;
auto block_end = utils::align(limit, s_dma_block_length);
// Reverse scan to try and find the minimum required length in case of other chaining
for (auto block = last_block; block != first_block; block -= s_dma_block_length)
if (g_render_device->gpu().get_driver_vendor() != driver_vendor::NVIDIA ||
rsx::get_location(local_address) == CELL_GCM_LOCATION_LOCAL)
{
if (auto found = g_dma_pool.find(block); found != g_dma_pool.end())
// Reverse scan to try and find the minimum required length in case of other chaining
for (auto block = last_block; block != first_block; block -= s_dma_block_length)
{
const auto end = found->second->end();
last_block = std::max(last_block, end & s_dma_block_mask);
block_end = std::max(block_end, end + 1);
if (auto found = g_dma_pool.find(block); found != g_dma_pool.end())
{
const auto end = found->second->end();
last_block = std::max(last_block, end & s_dma_block_mask);
block_end = std::max(block_end, end + 1);
break;
break;
}
}
}
@@ -342,37 +321,31 @@ namespace vk
auto found = g_dma_pool.find(block);
auto &entry = g_dma_pool[block];
const bool exists = !!entry;
if (!exists) create_dma_block(entry);
if (block == first_block)
{
block_head = entry->head();
if (exists)
if (entry && entry->end() < limit)
{
if (entry->end() < limit)
{
auto new_length = block_end - block_head->start();
block_head->extend(cmd, *g_render_device, new_length);
}
// Then the references to this object do not go to the end of the list as will be done with this new allocation.
// A dumb release is therefore safe...
entry.reset();
}
else
if (!entry)
{
auto required_size = (block_end - block);
block_head->init(*g_render_device, block, required_size);
create_dma_block(entry, block, required_size);
}
block_head = entry->head();
}
else if (entry)
{
entry->set_parent(cmd, block_head);
}
else
{
if (exists)
{
entry->set_parent(cmd, block_head);
}
else
{
entry->init(block_head, block, s_dma_block_length);
}
entry.reset(new dma_block());
entry->init(block_head, block, s_dma_block_length);
}
}
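To make the block arithmetic above concrete, assuming first_block and last_block identify the first and last 64 KiB blocks touched by the request, as the s_dma_block_mask constant suggests, a request straddling a block boundary works out as follows (numbers purely illustrative):

// Illustrative numbers only:
//   local_address = 0x1F8000, length = 0x10000  -> bytes 0x1F8000..0x207FFF
//   first_block   = 0x1F8000 & 0xFFFF0000       = 0x1F0000
//   last_block    = 0x207FFF & 0xFFFF0000       = 0x200000
// The request touches two 64 KiB blocks, so 0x1F0000 becomes (or is extended into)
// the head allocation, and the 0x200000 entry is either re-parented onto that head
// or created fresh as a child block of it.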

View File

@@ -13,13 +13,6 @@ namespace vk
class dma_block
{
protected:
enum page_bits
{
synchronized = 0,
dirty = 1,
nocache = 3
};
struct
{
dma_block* parent = nullptr;
@@ -29,19 +22,17 @@ namespace vk
u32 base_address = 0;
std::unique_ptr<buffer> allocated_memory;
std::vector<u64> page_info;
virtual void allocate(const render_device& dev, usz size);
virtual void free();
virtual void* map_range(const utils::address_range& range);
virtual void unmap();
void set_page_bit(u32 page, u64 bits);
bool test_page_bit(u32 page, u64 bits);
void mark_dirty(const utils::address_range& range);
void set_page_info(u32 page_offset, const std::vector<u64>& bits);
public:
dma_block() = default;
virtual ~dma_block();
virtual void init(const render_device& dev, u32 addr, usz size);
virtual void init(dma_block* parent, u32 addr, usz size);
virtual void flush(const utils::address_range& range);

View File

@@ -70,7 +70,6 @@ namespace vk
vk::clear_resolve_helpers();
vk::clear_dma_resources();
vk::vmm_reset();
vk::get_resource_manager()->destroy();
vk::clear_scratch_resources();
vk::get_upload_heap()->destroy();
@@ -86,6 +85,9 @@ namespace vk
p.second->destroy();
}
g_overlay_passes.clear();
// This must be the last item destroyed
vk::get_resource_manager()->destroy();
}
const vk::render_device *get_current_renderer()
@@ -263,8 +265,6 @@ namespace vk
return (g_num_processed_frames > 0)? g_num_processed_frames - 1: 0;
}
void do_query_cleanup(vk::command_buffer& cmd)
{
auto renderer = dynamic_cast<VKGSRender*>(rsx::get_current_renderer());

View File

@@ -905,6 +905,8 @@ namespace vk
}
auto dma_mapping = vk::map_dma(cmd, static_cast<u32>(src_address), static_cast<u32>(data_length));
ensure(dma_mapping.second->size() >= (dma_mapping.first + data_length));
vk::load_dma(::narrow<u32>(src_address), data_length);
upload_buffer = dma_mapping.second;
@@ -927,7 +929,7 @@ namespace vk
}
// Copy from upload heap to scratch mem
if (!opt.deferred_cmds.empty())
if (opt.require_upload)
{
for (const auto& copy_cmd : opt.deferred_cmds)
{
@@ -953,7 +955,8 @@ namespace vk
scratch_offset += image_linear_size;
ensure((scratch_offset + image_linear_size) <= scratch_buf->size()); // "Out of scratch memory"
}
else if (opt.require_upload)
if (opt.require_upload)
{
if (upload_commands.empty() || upload_buffer->value != upload_commands.back().first)
{
@@ -974,7 +977,19 @@ namespace vk
{
ensure(scratch_buf);
vkCmdCopyBuffer(cmd, upload_buffer->value, scratch_buf->value, static_cast<u32>(buffer_copies.size()), buffer_copies.data());
if (upload_commands.size() > 1)
{
auto range_ptr = buffer_copies.data();
for (const auto& op : upload_commands)
{
vkCmdCopyBuffer(cmd, op.first, scratch_buf->value, op.second, range_ptr);
range_ptr += op.second;
}
}
else
{
vkCmdCopyBuffer(cmd, upload_buffer->value, scratch_buf->value, static_cast<u32>(buffer_copies.size()), buffer_copies.data());
}
insert_buffer_memory_barrier(cmd, scratch_buf->value, 0, scratch_offset, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT);
@@ -1020,7 +1035,7 @@ namespace vk
vkCmdCopyBufferToImage(cmd, scratch_buf->value, dst_image->value, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, static_cast<u32>(copy_regions.size()), copy_regions.data());
}
else if (opt.require_upload)
else if (upload_commands.size() > 1)
{
auto region_ptr = copy_regions.data();
for (const auto& op : upload_commands)