rsx/vk: Implement flushing surface cache blocks to linear mem

Authored by kd-11 on 2022-08-23 16:34:24 +03:00; committed by kd-11
parent a71bdc761e
commit 1f9e04f72d
8 changed files with 339 additions and 125 deletions

View File

@@ -0,0 +1,118 @@
#pragma once
#include <util/types.hpp>
#include "Utilities/address_range.h"
namespace rsx
{
template <typename Traits, int BlockSize>
class surface_cache_dma
{
protected:
static inline u32 block_for(u32 address)
{
return address / BlockSize;
}
static inline u32 block_address(u32 block_id)
{
return block_id * BlockSize;
}
using buffer_object_storage_type = typename Traits::buffer_object_storage_type;
using buffer_object_type = typename Traits::buffer_object_type;
struct memory_buffer_entry_t
{
u32 id;
buffer_object_storage_type bo;
u64 memory_tag = 0;
u32 base_address = 0;
inline buffer_object_type get() { return Traits::get(bo); }
inline operator bool () const { return base_address != 0; }
inline void release() { bo.release(); }
inline void acquire(buffer_object_type b) { bo = b; }
};
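// One entry per BlockSize-aligned block of the 4 GiB address space.
// The head block of a merged range owns the buffer object; the blocks
// it covers point back to it through base_address.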
using buffer_block_array = typename std::array<memory_buffer_entry_t, 0x100000000ull / BlockSize>;
buffer_block_array m_buffer_list;
public:
surface_cache_dma()
{
for (usz i = 0; i < m_buffer_list.size(); ++i)
{
m_buffer_list[i].id = i;
}
}
surface_cache_dma& with_range(Traits::command_list_type cmd, const utils::address_range& range)
{
// Prepare underlying memory so that the range specified is provisioned and contiguous
// 1. Check if we have a pre-existing bo layer
const auto& this_entry = m_buffer_list[block_for(range.start)];
if (this_entry)
{
const auto bo = this_entry.get();
const auto buffer_range = utils::address_range::start_length(this_entry.base_address, ::size32(*bo));
if (range.inside(buffer_range))
{
// All is well
return *this;
}
}
// Data does not exist or is not contiguous. Merge the layer
std::vector<buffer_object_type> bo_list;
const auto start_address = this_entry ? this_entry.base_address : block_address(this_entry.id);
for (u32 address = start_address; address <= range.end;)
{
auto& bo_storage = m_buffer_list[block_for(address)];
bo_storage.base_address = start_address;
if (auto bo = bo_storage.get())
{
bo_list.push_back(bo);
bo_storage.release();
address += ::size32(*bo);
continue;
}
bo_list.push_back(nullptr);
address += BlockSize;
}
auto unified = Traits::template merge_bo_list<BlockSize>(cmd, bo_list);
ensure(unified);
m_buffer_list[block_for(start_address)].acquire(unified);
return *this;
}
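// Expand a byte range to the enclosing BlockSize-aligned block range (inclusive bounds).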
utils::address_range to_block_range(const utils::address_range& range)
{
u32 start = block_address(block_for(range.start));
u32 end = block_address(block_for(range.end)) + BlockSize;
return utils::address_range::start_end(start, end - 1);
}
std::tuple<buffer_object_type, u32, u64> get(u32 address)
{
const auto& block = m_buffer_list[block_for(address)];
return { block.get(), address - block.base_address, block.memory_tag };
}
void touch(const utils::address_range& range)
{
const u64 stamp = rsx::get_shared_tag();
for (usz i = block_for(range.start); i <= block_for(range.end); i++)
{
m_buffer_list[i].memory_tag = stamp;
}
}
};
}
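A minimal usage sketch of the new helper, assuming a hypothetical dma_traits policy and the 4 MB block size the surface cache instantiates below: with_range() merges any partial buffer objects so the range is backed by one contiguous BO, get() returns that BO plus the offset of an address inside it, and touch() stamps the affected blocks. The names dma_traits and flush_blocks are illustrative, not part of this commit.

// Illustrative sketch only -- 'dma_traits' and 'flush_blocks' are hypothetical.
using dma_map = rsx::surface_cache_dma<dma_traits, 0x400000>;

void flush_blocks(dma_traits::command_list_type cmd, dma_map& dma,
                  const utils::address_range& dirty)
{
    // Round the dirty range out to whole 4 MB blocks
    const auto block_range = dma.to_block_range(dirty);

    // Guarantee one contiguous buffer object over the range, then fetch it
    // along with the offset of the range start inside that buffer
    auto [bo, offset, timestamp] = dma
        .with_range(cmd, block_range)
        .get(block_range.start);

    // ... record GPU copies of the overlapping surfaces into 'bo' here ...

    // Stamp the blocks so later flushes can skip surfaces that are older
    dma.touch(block_range);
}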

View File

@@ -1,117 +0,0 @@
#pragma once
#include "ranged_map.hpp"
namespace rsx
{
template <typename Traits, int BlockSize>
class surface_cache_data_map : public ranged_map<typename Traits::surface_storage_type, BlockSize>
{
#ifdef _MSC_VER
using super = ranged_map<typename Traits::surface_storage_type, BlockSize>;
#else
using super = class ranged_map<typename Traits::surface_storage_type, BlockSize>;
#endif
using metadata_t = typename super::block_metadata_t;
const metadata_t& find_head_block(u32 address)
{
auto& meta = super::m_metadata[address];
if (meta.head_block != umax)
{
return find_head_block(meta.head_block * BlockSize);
}
return meta;
}
public:
using buffer_object_storage_type = typename Traits::buffer_object_storage_type;
using buffer_object_type = typename Traits::buffer_object_type;
struct buffer_object_t
{
buffer_object_storage_type bo;
u64 memory_tag = 0;
inline buffer_object_type get()
{
return Traits::get(bo);
}
inline void release()
{
bo.release();
}
inline void acquire(buffer_object_type obj)
{
ensure(!get());
bo = obj;
}
};
protected:
using buffer_block_array = typename std::array<buffer_object_t, 0x100000000ull / BlockSize>;
buffer_block_array m_buffer_list;
public:
surface_cache_data_map()
: super::ranged_map()
{}
surface_cache_data_map& with_range(const utils::address_range& range)
{
// Prepare underlying memory so that the range specified is provisioned and contiguous
const auto& head_block = find_head_block(range.start);
const auto start_address = block_address(head_block.id);
const auto& current = m_buffer_list[head_block.id];
if (auto bo = current.get())
{
if (::size32(*bo) >= (range.end - start_address))
{
return *this;
}
}
// Data does not exist or is not contiguous. Merge the layer
std::vector<buffer_object_type> bo_list;
for (u32 address = start_address; address <= range.end;)
{
auto& bo_storage = m_buffer_list[super::block_for(address)];
if (auto bo = bo_storage.get())
{
bo_list.push_back(bo);
bo_storage.release();
address += ::size32(*bo);
continue;
}
bo_list.push_back(nullptr);
address += BlockSize;
}
auto unified = Traits::merge_bo_list<BlockSize>(bo_list);
ensure(unified);
current.acquire(unified);
return *this;
}
void spill(const utils::address_range& range)
{
// Move VRAM to system RAM
const auto& meta = with_range(range).find_head_block(range.start);
auto& storage = m_buffer_list[meta.id];
Traits::spill_buffer(storage.bo);
}
void unspill(const utils::address_range& range)
{
// Move system RAM to VRAM
const auto& meta = with_range(range).find_head_block(range.start);
auto& storage = m_buffer_list[meta.id];
Traits::unspill_buffer(storage.bo);
}
};
}

View File

@@ -2,7 +2,8 @@
#include "surface_utils.h"
#include "simple_array.hpp"
#include "surface_cache_storage.hpp"
#include "ranged_map.hpp"
#include "surface_cache_dma.hpp"
#include "../gcm_enums.h"
#include "../rsx_utils.h"
#include <list>
@@ -45,7 +46,8 @@ namespace rsx
using surface_type = typename Traits::surface_type;
using command_list_type = typename Traits::command_list_type;
using surface_overlap_info = surface_overlap_info_t<surface_type>;
using surface_ranged_map = surface_cache_data_map<Traits, 0x400000>;
using surface_ranged_map = ranged_map<surface_storage_type, 0x400000>;
using surface_cache_dma_map = surface_cache_dma<Traits, 0x400000>;
protected:
surface_ranged_map m_render_targets_storage = {};
@@ -54,6 +56,8 @@ namespace rsx
rsx::address_range m_render_targets_memory_range;
rsx::address_range m_depth_stencil_memory_range;
surface_cache_dma_map m_dma_block;
bool m_invalidate_on_write = false;
rsx::surface_raster_type m_active_raster_type = rsx::surface_raster_type::linear;
@@ -856,6 +860,94 @@ namespace rsx
std::forward<Args>(extra_params)...);
}
std::tuple<std::vector<surface_type>, std::vector<surface_type>>
find_overlapping_set(const utils::address_range& range) const
{
std::vector<surface_type> color_result, depth_result;
if (m_render_targets_memory_range.valid() &&
range.overlaps(m_render_targets_memory_range))
{
for (auto it = m_render_targets_storage.begin_range(range); it != m_render_targets_storage.end(); ++it)
{
auto surface = Traits::get(it->second);
const auto surface_range = surface->get_memory_range();
if (!range.overlaps(surface_range))
continue;
color_result.push_back(surface);
}
}
if (m_depth_stencil_memory_range.valid() &&
range.overlaps(m_depth_stencil_memory_range))
{
for (auto it = m_depth_stencil_storage.begin_range(range); it != m_depth_stencil_storage.end(); ++it)
{
auto surface = Traits::get(it->second);
const auto surface_range = surface->get_memory_range();
if (!range.overlaps(surface_range))
continue;
depth_result.push_back(surface);
}
}
return { color_result, depth_result };
}
void write_to_dma_buffers(
command_list_type command_list,
const utils::address_range& range)
{
auto block_range = m_dma_block.to_block_range(range);
auto [color_data, depth_stencil_data] = find_overlapping_set(block_range);
auto [bo, offset, bo_timestamp] = m_dma_block
.with_range(command_list, block_range)
.get(block_range.start);
u64 src_offset, dst_offset, write_length;
auto block_length = block_range.length();
auto all_data = std::move(color_data);
all_data.insert(all_data.end(), depth_stencil_data.begin(), depth_stencil_data.end());
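// Process surfaces oldest-first so that where they overlap in memory,
// the most recently rendered data is written last and wins.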
if (all_data.size() > 1)
{
std::sort(all_data.begin(), all_data.end(), [](const auto& a, const auto& b)
{
return a->last_use_tag < b->last_use_tag;
});
}
for (const auto& surface : all_data)
{
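// Surfaces not written since the buffer was last stamped are already
// resident in the BO; skip them.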
if (surface->last_use_tag <= bo_timestamp)
{
continue;
}
const auto this_range = surface->get_memory_range();
const auto max_length = this_range.length();
if (this_range.start < block_range.start)
{
src_offset = block_range.start - this_range.start;
dst_offset = 0;
}
else
{
src_offset = 0;
dst_offset = this_range.start - block_range.start;
}
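// Worked example: with the block range starting at 0x00C00000, a surface
// spanning 0x00BF0000..0x00C0FFFF starts before the block, so src_offset =
// 0x10000 and dst_offset = 0; a surface starting at 0x00C40000 inside the
// range copies from src_offset = 0 into dst_offset = 0x40000. The length is
// then clamped to what remains of both the surface and the block range: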
write_length = std::min(max_length - src_offset, block_length - dst_offset);
Traits::write_render_target_to_memory(command_list, bo, surface, dst_offset, src_offset, write_length);
}
m_dma_block.touch(block_range);
}
public:
/**
* Update bound color and depth surface.

View File

@@ -359,7 +359,18 @@ struct gl_render_target_traits
}
static
gl::buffer* merge_bo_list(const std::vector<gl::buffer*>& /*list*/)
void write_render_target_to_memory(
gl::command_context&,
gl::buffer*,
gl::render_target*,
u64, u64, u64)
{
// TODO
}
template <int BlockSize>
static
gl::buffer* merge_bo_list(gl::command_context&, const std::vector<gl::buffer*>& /*list*/)
{
// TODO
return nullptr;

View File

@@ -3,6 +3,15 @@
namespace vk
{
namespace surface_cache_utils
{
void dispose(vk::buffer* buf)
{
auto obj = vk::disposable_t::make(buf);
vk::get_resource_manager()->dispose(obj);
}
}
void surface_cache::destroy()
{
invalidate_all();
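Design note: rather than deleting the buffer inline, dispose() wraps it as a disposable and queues it on the resource manager, deferring destruction until the GPU can no longer be referencing it; the VK merge_bo_list below uses this to retire each sub-block it folds into the unified buffer.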

View File

@@ -16,6 +16,11 @@
namespace vk
{
namespace surface_cache_utils
{
void dispose(vk::buffer* buf);
}
void resolve_image(vk::command_buffer& cmd, vk::viewable_image* dst, vk::viewable_image* src);
void unresolve_image(vk::command_buffer& cmd, vk::viewable_image* dst, vk::viewable_image* src);
@@ -463,10 +468,106 @@ namespace vk
// TODO
}
static vk::buffer* merge_bo_list(const std::vector<vk::buffer*>& /*list*/)
static void write_render_target_to_memory(
vk::command_buffer& cmd,
vk::buffer* bo,
vk::render_target* surface,
u64 dst_offset_in_buffer,
u64 src_offset_in_buffer,
u64 max_copy_length)
{
// TODO
return nullptr;
surface->read_barrier(cmd);
vk::image* source = surface->get_surface(rsx::surface_access::transfer_read);
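// Surfaces rendered at a scaled internal resolution are first rescaled
// to guest dimensions through a typeless scratch image.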
const bool is_scaled = surface->width() != surface->surface_width;
if (is_scaled)
{
const areai src_rect = { 0, 0, source->width(), source->height() };
const areai dst_rect = { 0, 0, surface->get_surface_width<rsx::surface_metrics::samples>(), surface->get_surface_height<rsx::surface_metrics::samples>() };
auto scratch = vk::get_typeless_helper(source->format(), source->format_class(), dst_rect.x2, dst_rect.y2);
vk::copy_scaled_image(cmd, source, scratch, src_rect, dst_rect, 1, true, VK_FILTER_NEAREST);
source = scratch;
}
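// Only a simple, full-length color copy can be written directly into the
// target BO. Clipped copies, copies starting mid-buffer, and depth/stencil
// data are staged through a scratch buffer and copied into place below.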
auto dest = bo;
const auto transfer_size = surface->get_memory_range().length();
if (transfer_size > max_copy_length || src_offset_in_buffer || surface->is_depth_surface())
{
auto scratch = vk::get_scratch_buffer(cmd, transfer_size * 4);
dest = scratch;
}
VkBufferImageCopy region =
{
.bufferOffset = (dest == bo) ? dst_offset_in_buffer : 0,
.bufferRowLength = surface->rsx_pitch / surface->get_bpp(),
.bufferImageHeight = 0,
.imageSubresource = { source->aspect(), 0, 0, 1 },
.imageOffset = {},
.imageExtent = {
.width = source->width(),
.height = source->height(),
.depth = 1
}
};
vk::copy_image_to_buffer(cmd, source, dest, region);
vk::insert_buffer_memory_barrier(cmd,
dest->value, src_offset_in_buffer, max_copy_length,
VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_TRANSFER_READ_BIT);
if (dest != bo)
{
VkBufferCopy copy = { src_offset_in_buffer, dst_offset_in_buffer, max_copy_length };
vkCmdCopyBuffer(cmd, dest->value, bo->value, 1, &copy);
vk::insert_buffer_memory_barrier(cmd,
bo->value, dst_offset_in_buffer, max_copy_length,
VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_TRANSFER_READ_BIT);
}
}
template <int BlockSize>
static vk::buffer* merge_bo_list(vk::command_buffer& cmd, std::vector<vk::buffer*>& list)
{
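// Total size of the unified allocation. Blocks with no backing BO still
// reserve a full BlockSize so the merged buffer stays linearly addressable.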
u32 required_bo_size = 0;
for (auto& bo : list)
{
required_bo_size += (bo ? bo->size() : BlockSize);
}
// Create dst
auto pdev = cmd.get_command_pool().owner;
auto dst = new vk::buffer(*pdev,
required_bo_size,
pdev->get_memory_mapping().device_local, 0,
VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT,
0, VMM_ALLOCATION_POOL_SURFACE_CACHE);
// TODO: Initialize the buffer with system RAM contents
// Copy all the data over from the sub-blocks
u32 offset = 0;
for (auto& bo : list)
{
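// Null entries are gaps: they keep their BlockSize slot in the layout,
// but their contents are undefined until the TODO above fills them in
// from system RAM.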
if (!bo)
{
offset += BlockSize;
continue;
}
VkBufferCopy copy = { 0, offset, ::size32(*bo) };
offset += ::size32(*bo);
vkCmdCopyBuffer(cmd, bo->value, dst->value, 1, &copy);
// Cleanup
vk::surface_cache_utils::dispose(bo);
}
return dst;
}
template <typename T>

View File

@@ -516,7 +516,7 @@
<ClInclude Include="Emu\RSX\Common\profiling_timer.hpp" />
<ClInclude Include="Emu\RSX\Common\ranged_map.hpp" />
<ClInclude Include="Emu\RSX\Common\simple_array.hpp" />
<ClInclude Include="Emu\RSX\Common\surface_cache_storage.hpp" />
<ClInclude Include="Emu\RSX\Common\surface_cache_dma.hpp" />
<ClInclude Include="Emu\RSX\Common\time.hpp" />
<ClInclude Include="Emu\RSX\Overlays\overlay_cursor.h" />
<ClInclude Include="Emu\RSX\Overlays\overlay_edit_text.hpp" />

View File

@@ -2143,7 +2143,7 @@
<ClInclude Include="Emu\RSX\Common\ranged_map.hpp">
<Filter>Emu\GPU\RSX\Common</Filter>
</ClInclude>
<ClInclude Include="Emu\RSX\Common\surface_cache_storage.hpp">
<ClInclude Include="Emu\RSX\Common\surface_cache_dma.hpp">
<Filter>Emu\GPU\RSX\Common</Filter>
</ClInclude>
<ClInclude Include="Emu\CPU\sse2neon.h">