gl: Finalize host labels implementation

kd-11 2024-10-22 03:41:36 +03:00 committed by kd-11
parent 0db06964dc
commit 681debd8f6
14 changed files with 166 additions and 65 deletions

View File

@@ -300,7 +300,7 @@ namespace gl
m_src = fmt::replace_all(m_src, syntax_replace);
-param_buffer.create(gl::buffer::target::uniform, 32, nullptr, gl::buffer::memory_type::local, GL_DYNAMIC_COPY);
+param_buffer.create(gl::buffer::target::uniform, 32, nullptr, gl::buffer::memory_type::local, gl::buffer::usage::dynamic_update);
}
~cs_deswizzle_3d()

View File

@@ -19,8 +19,11 @@ namespace gl
void* userptr = vm::get_super_ptr(base_address);
m_data = std::make_unique<gl::buffer>();
-m_data->create(buffer::target::userptr, block_size, userptr);
+m_data->create(buffer::target::array, block_size, userptr, buffer::memory_type::userptr, 0);
m_base_address = base_address;
+// Some drivers may reject userptr input for whatever reason. Check that the state is still valid.
+gl::check_state();
}
void* dma_block::map(const utils::address_range& range) const
@@ -69,8 +72,8 @@ namespace gl
utils::address_range to_dma_block_range(u32 start, u32 length)
{
-const auto start_block_address = start & ~s_dma_block_size;
-const auto end_block_address = (start + length - 1) & ~s_dma_block_size;
+const auto start_block_address = start & -s_dma_block_size;
+const auto end_block_address = (start + length + s_dma_block_size - 1) & -s_dma_block_size;
return utils::address_range::start_end(start_block_address, end_block_address);
}
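Note on the fix above: with a power-of-two s_dma_block_size, `addr & -s_dma_block_size` clears all the low bits and rounds down to a block boundary, while the old `addr & ~s_dma_block_size` cleared only the single block-size bit and left most addresses unaligned. A minimal standalone sketch, using a hypothetical block size for illustration:

```cpp
#include <cassert>
#include <cstdint>

// Hypothetical power-of-two block size; the real s_dma_block_size lives in GLDMA.cpp.
constexpr uint32_t block_size = 0x1000000;

int main()
{
    const uint32_t start = 0x12345678, length = 0x100;

    // -block_size == ~(block_size - 1): clears the low 24 bits, rounding down.
    const uint32_t lo = start & -block_size;
    // Round the end of the range up to the next block boundary.
    const uint32_t hi = (start + length + block_size - 1) & -block_size;
    assert(lo == 0x12000000 && hi == 0x13000000);

    // The old mask only cleared bit 24 (which is already 0 here), so the
    // "aligned" address came back completely unaligned.
    assert((start & ~block_size) == start);
}
```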
@@ -81,7 +84,7 @@ namespace gl
if (!block)
{
block = std::make_unique<dma_block>();
-block->allocate(block_range.start, length);
+block->allocate(block_range.start, block_range.length());
return *block;
}
@@ -96,6 +99,7 @@ namespace gl
const auto search_end = (block_range.end + 1);
// 1. Resize to new length
+ensure((new_length & -s_dma_block_size) == new_length);
auto new_owner = std::make_unique<dma_block>();
new_owner->allocate(owner->base_addr(), new_length);

View File

@@ -24,7 +24,7 @@ namespace gl
void* map(const utils::address_range& range) const;
void set_parent(const dma_block* other);
-const dma_block* head() const { return m_parent; }
+const dma_block* head() const { return m_parent ? m_parent : this; }
bool can_map(const utils::address_range& range) const;
u32 base_addr() const { return m_base_address; }

View File

@@ -181,18 +181,18 @@ void GLGSRender::on_init_thread()
backend_config.supports_normalized_barycentrics = false;
}
-if (gl_caps.AMD_pinned_memory)
+if (gl_caps.AMD_pinned_memory && g_cfg.video.host_label_synchronization)
{
backend_config.supports_host_gpu_labels = true;
-if (g_cfg.video.host_label_synchronization)
-{
-m_host_gpu_context_data = std::make_unique<gl::buffer>();
-m_host_gpu_context_data->create(gl::buffer::target::array, 4096);
+m_host_gpu_context_data = std::make_unique<gl::buffer>();
+m_host_gpu_context_data->create(gl::buffer::target::array, 4096, nullptr, gl::buffer::memory_type::host_visible,
+gl::buffer::usage::host_read | gl::buffer::usage::host_write | gl::buffer::usage::persistent_map);
-auto host_context_ptr = reinterpret_cast<rsx::host_gpu_context_t*>(m_host_gpu_context_data->map(0, 4096, gl::buffer::access::read));
-m_host_dma_ctrl = std::make_unique<rsx::RSXDMAWriter>(host_context_ptr);
-}
+auto host_context_ptr = reinterpret_cast<rsx::host_gpu_context_t*>(m_host_gpu_context_data->map(0, 4096, gl::buffer::access::persistent_rw));
+m_host_dma_ctrl = std::make_unique<rsx::RSXDMAWriter>(host_context_ptr);
+m_enqueued_host_write_buffer = std::make_unique<gl::scratch_ring_buffer>();
+m_enqueued_host_write_buffer->create(gl::buffer::target::array, 64 * 0x100000, gl::buffer::usage::dynamic_update);
}
// Use industry standard resource alignment values as defaults
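The host GPU context buffer is now created as persistently mapped storage so the CPU can poll labels while the GPU writes them. A sketch of the raw GL calls the new memory_type/usage/access combination corresponds to (this is the standard ARB_buffer_storage pattern, not the backend's literal code path):

```cpp
// Sketch (GL 4.5 DSA): what host_visible + host_read/host_write/persistent_map
// expands to under ARB_buffer_storage. GL_MAP_COHERENT_BIT is not requested,
// so visibility still relies on explicit synchronization.
GLuint id;
glCreateBuffers(1, &id);
glNamedBufferStorage(id, 4096, nullptr,
    GL_MAP_READ_BIT | GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT);

// access::persistent_rw: the returned pointer stays valid while the buffer lives.
void* host_ptr = glMapNamedBufferRange(id, 0, 4096,
    GL_MAP_READ_BIT | GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT);
```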
@@ -425,6 +425,7 @@ void GLGSRender::on_exit()
m_host_dma_ctrl.reset();
m_host_gpu_context_data.reset();
+m_enqueued_host_write_buffer.reset();
for (auto &fbo : m_framebuffer_cache)
{
@@ -1222,6 +1223,66 @@ void GLGSRender::notify_tile_unbound(u32 tile)
}
}
+bool GLGSRender::release_GCM_label(u32 address, u32 args)
+{
+if (!backend_config.supports_host_gpu_labels)
+{
+return false;
+}
+auto host_ctx = ensure(m_host_dma_ctrl->host_ctx());
+if (host_ctx->texture_loads_completed())
+{
+// We're about to poll waiting for GPU state, ensure the context is still valid.
+gl::check_state();
+// All texture loads already seen by the host GPU
+// Wait for all previously submitted labels to be flushed
+m_host_dma_ctrl->drain_label_queue();
+return false;
+}
+const auto mapping = gl::map_dma(address, 4);
+const auto write_data = std::bit_cast<u32, be_t<u32>>(args);
+const auto release_event_id = host_ctx->on_label_acquire();
+// We don't have async texture loads yet, so just release both the label and the commands complete
+u64 write_buf[2] = { write_data, release_event_id };
+const auto host_read_offset = m_enqueued_host_write_buffer->alloc(16, 16);
+m_enqueued_host_write_buffer->get().sub_data(host_read_offset, 16, write_buf);
+// Now write to DMA and then to host context
+m_enqueued_host_write_buffer->get().copy_to(mapping.second, host_read_offset, mapping.first, 4);
+m_enqueued_host_write_buffer->get().copy_to(m_host_gpu_context_data.get(), host_read_offset + 8, ::offset32(&rsx::host_gpu_context_t::commands_complete_event), 8);
+m_enqueued_host_write_buffer->push_barrier(host_read_offset, 16);
+host_ctx->on_label_release();
+return true;
+}
+void GLGSRender::enqueue_host_context_write(u32 offset, u32 size, const void* data)
+{
+ensure(size <= 8);
+const u32 host_read_offset = m_enqueued_host_write_buffer->alloc(8, 16);
+m_enqueued_host_write_buffer->get().sub_data(host_read_offset, size, data);
+m_enqueued_host_write_buffer->get().copy_to(m_host_gpu_context_data.get(), host_read_offset, offset, size);
+m_enqueued_host_write_buffer->push_barrier(host_read_offset, 16);
+}
+void GLGSRender::on_guest_texture_read()
+{
+if (!backend_config.supports_host_gpu_labels)
+{
+return;
+}
+// Tag the read as being in progress
+u64 event_id = m_host_dma_ctrl->host_ctx()->inc_counter();
+m_host_dma_ctrl->host_ctx()->texture_load_request_event = event_id;
+enqueue_host_context_write(::offset32(&rsx::host_gpu_context_t::texture_load_complete_event), 8, &event_id);
+}
void GLGSRender::begin_occlusion_query(rsx::reports::occlusion_query_info* query)
{
query->result = 0;
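The release path above never writes the label from the CPU: the value and a fresh event id are staged in the scratch ring buffer and copied out by the GPU in submission order, so the label only becomes visible after all previously submitted commands have executed. A hedged sketch of the consumer side (field names mirror rsx::host_gpu_context_t; the actual waiter is RSXDMAWriter::drain_label_queue(), called above):

```cpp
// Sketch: how a waiter decides that a label released via release_GCM_label()
// is final. Event ids grow monotonically; the GPU copies release_event_id
// into commands_complete_event only after every prior command has executed.
bool label_is_final(const volatile rsx::host_gpu_context_t* ctx, u64 release_event_id)
{
    return ctx->commands_complete_event >= release_event_id;
}
```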

View File

@@ -152,6 +152,7 @@ class GLGSRender : public GSRender, public ::rsx::reports::ZCULL_control
// Host context for GPU-driven work
std::unique_ptr<gl::buffer> m_host_gpu_context_data;
+std::unique_ptr<gl::scratch_ring_buffer> m_enqueued_host_write_buffer;
public:
u64 get_cycles() final;
@@ -196,6 +197,11 @@ public:
void get_occlusion_query_result(rsx::reports::occlusion_query_info* query) override;
void discard_occlusion_query(rsx::reports::occlusion_query_info* query) override;
+// DMA
+bool release_GCM_label(u32 address, u32 data) override;
+void enqueue_host_context_write(u32 offset, u32 size, const void* data);
+void on_guest_texture_read();
// GRAPH backend
void patch_transform_constants(rsx::context* ctx, u32 index, u32 count) override;

View File

@@ -3,6 +3,7 @@
#include "GLCompute.h"
#include "GLRenderTargets.h"
#include "GLOverlays.h"
+#include "GLGSRender.h"
#include "glutils/blitter.h"
#include "glutils/ring_buffer.h"
@@ -285,7 +286,7 @@ namespace gl
if (!(*dst) || max_mem > static_cast<u64>(dst->size()))
{
if (*dst) dst->remove();
-dst->create(buffer::target::ssbo, max_mem, nullptr, buffer::memory_type::local, GL_STATIC_COPY);
+dst->create(buffer::target::ssbo, max_mem, nullptr, buffer::memory_type::local, 0);
}
if (auto as_vi = dynamic_cast<const gl::viewable_image*>(src);
@@ -400,7 +401,7 @@ namespace gl
return;
}
-scratch_mem.create(buffer::target::pixel_pack, max_mem, nullptr, buffer::memory_type::local, GL_STATIC_COPY);
+scratch_mem.create(buffer::target::pixel_pack, max_mem, nullptr, buffer::memory_type::local, 0);
glMemoryBarrier(GL_BUFFER_UPDATE_BARRIER_BIT);
src->copy_to(&scratch_mem, in_offset, 0, mem_info->image_size_in_bytes);
@@ -835,6 +836,10 @@ namespace gl
const GLenum gl_format = std::get<0>(format_type);
const GLenum gl_type = std::get<1>(format_type);
fill_texture(cmd, dst, gcm_format, subresources_layout, is_swizzled, gl_format, gl_type, data_upload_buf);
+// Notify the renderer of the upload
+auto renderer = static_cast<GLGSRender*>(rsx::get_current_renderer());
+renderer->on_guest_texture_read();
}
u32 get_format_texel_width(GLenum format)
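Texture uploads now join the same event handshake: the CPU stamps a request id immediately, and a matching completion id is enqueued for the GPU to write once the upload commands have executed. A condensed sketch of the invariant this maintains (not the emulator's actual struct definition):

```cpp
using u64 = unsigned long long;

// Sketch of the request/complete pairing used by on_guest_texture_read().
struct host_gpu_context_sketch
{
    volatile u64 texture_load_request_event = 0;  // stamped by the CPU per upload
    volatile u64 texture_load_complete_event = 0; // written by the GPU, in order

    bool texture_loads_completed() const volatile
    {
        // Every upload the CPU has issued has also been observed by the GPU.
        return texture_load_complete_event >= texture_load_request_event;
    }
};
```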

View File

@@ -59,7 +59,7 @@ namespace gl
pbo.remove();
}
-pbo.create(buffer::target::pixel_pack, buffer_size, nullptr, buffer::memory_type::host_visible, GL_STREAM_READ);
+pbo.create(buffer::target::pixel_pack, buffer_size, nullptr, buffer::memory_type::host_visible, buffer::usage::host_read);
glBindBuffer(GL_PIXEL_PACK_BUFFER, GL_NONE);
}

View File

@@ -3,38 +3,35 @@
namespace gl
{
-void buffer::allocate(GLsizeiptr size, const void* data_, memory_type type, GLenum usage)
+void buffer::allocate(GLsizeiptr size, const void* data_, memory_type type, GLuint usage_flags)
{
+m_memory_type = type;
if (const auto& caps = get_driver_caps();
-m_target != target::userptr && caps.ARB_buffer_storage_supported)
+type != memory_type::userptr && caps.ARB_buffer_storage_supported)
{
GLenum flags = 0;
-if (type == memory_type::host_visible)
+if (usage_flags & usage::host_write)
{
-switch (usage)
-{
-case GL_STREAM_DRAW:
-case GL_STATIC_DRAW:
-case GL_DYNAMIC_DRAW:
-flags |= GL_MAP_WRITE_BIT;
-break;
-case GL_STREAM_READ:
-case GL_STATIC_READ:
-case GL_DYNAMIC_READ:
-flags |= GL_MAP_READ_BIT;
-break;
-default:
-fmt::throw_exception("Unsupported buffer usage 0x%x", usage);
-}
+flags |= GL_MAP_WRITE_BIT;
}
-else
+if (usage_flags & usage::host_read)
{
-// Local memory hints
-if (usage == GL_DYNAMIC_COPY)
-{
-flags |= GL_DYNAMIC_STORAGE_BIT;
-}
+flags |= GL_MAP_READ_BIT;
}
+if (usage_flags & usage::persistent_map)
+{
+flags |= GL_MAP_PERSISTENT_BIT;
+}
+if (usage_flags & usage::dynamic_update)
+{
+flags |= GL_DYNAMIC_STORAGE_BIT;
+}
+ensure((flags & (GL_MAP_PERSISTENT_BIT | GL_DYNAMIC_STORAGE_BIT)) != (GL_MAP_PERSISTENT_BIT | GL_DYNAMIC_STORAGE_BIT),
+"Mutually exclusive usage flags set!");
+ensure(type == memory_type::local || flags != 0, "Host-visible memory must have usage flags set!");
if ((flags & GL_MAP_READ_BIT) && !caps.vendor_AMD)
{
@@ -51,10 +48,8 @@ namespace gl
}
else
{
-data(size, data_, usage);
+data(size, data_, GL_STREAM_COPY);
}
-m_memory_type = type;
}
buffer::~buffer()
@@ -89,18 +84,18 @@ namespace gl
save_binding_state save(current_target(), *this);
}
-void buffer::create(GLsizeiptr size, const void* data_, memory_type type, GLenum usage)
+void buffer::create(GLsizeiptr size, const void* data_, memory_type type, GLuint usage_bits)
{
create();
-allocate(size, data_, type, usage);
+allocate(size, data_, type, usage_bits);
}
-void buffer::create(target target_, GLsizeiptr size, const void* data_, memory_type type, GLenum usage)
+void buffer::create(target target_, GLsizeiptr size, const void* data_, memory_type type, GLuint usage_bits)
{
m_target = target_;
create();
-allocate(size, data_, type, usage);
+allocate(size, data_, type, usage_bits);
}
void buffer::remove()
@@ -117,11 +112,19 @@ namespace gl
{
ensure(m_memory_type != memory_type::local);
-DSA_CALL2(NamedBufferData, m_id, size, data_, usage);
m_size = size;
+if (m_memory_type == memory_type::userptr)
+{
+glBindBuffer(GL_EXTERNAL_VIRTUAL_MEMORY_BUFFER_AMD, m_id);
+glBufferData(GL_EXTERNAL_VIRTUAL_MEMORY_BUFFER_AMD, size, data_, usage);
+return;
+}
+DSA_CALL2(NamedBufferData, m_id, size, data_, usage);
}
-void buffer::sub_data(GLsizeiptr offset, GLsizeiptr length, GLvoid* data)
+void buffer::sub_data(GLsizeiptr offset, GLsizeiptr length, const GLvoid* data)
{
ensure(m_memory_type == memory_type::local);
DSA_CALL2(NamedBufferSubData, m_id, offset, length, data);
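Moving userptr from buffer::target to memory_type matches how AMD_pinned_memory actually works: GL_EXTERNAL_VIRTUAL_MEMORY_BUFFER_AMD is only meaningful as the binding point of the data upload, after which the buffer behaves like any other buffer object. A sketch of the raw extension usage (the client memory must be page-aligned and must outlive the buffer):

```cpp
// Sketch: raw AMD_pinned_memory usage. 'backing' is a hypothetical
// page-aligned allocation that the driver adopts as the data store.
void* backing = /* page-aligned allocation of block_size bytes */ nullptr;
GLsizeiptr block_size = 0x10000;

GLuint id;
glGenBuffers(1, &id);
glBindBuffer(GL_EXTERNAL_VIRTUAL_MEMORY_BUFFER_AMD, id);
glBufferData(GL_EXTERNAL_VIRTUAL_MEMORY_BUFFER_AMD, block_size, backing, GL_STREAM_COPY);
glBindBuffer(GL_EXTERNAL_VIRTUAL_MEMORY_BUFFER_AMD, 0);
// The buffer can now be bound to ordinary targets (copy source, array, ...).
```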

View File

@@ -15,28 +15,37 @@ namespace gl
element_array = GL_ELEMENT_ARRAY_BUFFER,
uniform = GL_UNIFORM_BUFFER,
texture = GL_TEXTURE_BUFFER,
-ssbo = GL_SHADER_STORAGE_BUFFER,
-userptr = GL_EXTERNAL_VIRTUAL_MEMORY_BUFFER_AMD
+ssbo = GL_SHADER_STORAGE_BUFFER
};
enum class access
{
read = GL_MAP_READ_BIT,
write = GL_MAP_WRITE_BIT,
-read_write = GL_MAP_READ_BIT | GL_MAP_WRITE_BIT
+rw = GL_MAP_READ_BIT | GL_MAP_WRITE_BIT,
+persistent_rw = GL_MAP_READ_BIT | GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT
};
enum class memory_type
{
undefined = 0,
local = 1,
-host_visible = 2
+host_visible = 2,
+userptr = 4
};
+enum usage
+{
+host_write = (1 << 0),
+host_read = (1 << 1),
+persistent_map = (1 << 2),
+dynamic_update = (1 << 3),
+};
class save_binding_state
{
-GLint m_last_binding;
-GLenum m_target;
+GLint m_last_binding = GL_ZERO;
+GLenum m_target = GL_NONE;
public:
save_binding_state(target target_, const buffer& new_state) : save_binding_state(target_)
@@ -65,6 +74,11 @@ namespace gl
~save_binding_state()
{
+if (!m_target)
+{
+return;
+}
glBindBuffer(m_target, m_last_binding);
}
};
@@ -78,7 +92,7 @@ namespace gl
// Metadata
mutable std::pair<u32, u32> m_bound_range{};
-void allocate(GLsizeiptr size, const void* data_, memory_type type, GLenum usage);
+void allocate(GLsizeiptr size, const void* data_, memory_type type, GLuint usage_bits);
public:
buffer() = default;
@@ -89,8 +103,8 @@ namespace gl
void recreate(GLsizeiptr size, const void* data = nullptr);
void create();
-void create(GLsizeiptr size, const void* data_ = nullptr, memory_type type = memory_type::local, GLenum usage = GL_STREAM_DRAW);
-void create(target target_, GLsizeiptr size, const void* data_ = nullptr, memory_type type = memory_type::local, GLenum usage = GL_STREAM_DRAW);
+void create(GLsizeiptr size, const void* data_ = nullptr, memory_type type = memory_type::local, GLuint usage_bits = 0);
+void create(target target_, GLsizeiptr size, const void* data_ = nullptr, memory_type type = memory_type::local, GLuint usage_bits = 0);
void remove();
@@ -98,7 +112,7 @@ namespace gl
void bind() const { bind(current_target()); }
void data(GLsizeiptr size, const void* data_ = nullptr, GLenum usage = GL_STREAM_DRAW);
-void sub_data(GLsizeiptr offset, GLsizeiptr length, GLvoid* data);
+void sub_data(GLsizeiptr offset, GLsizeiptr length, const GLvoid* data);
GLubyte* map(GLsizeiptr offset, GLsizeiptr length, access access_);
void unmap();
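With these enums, call sites state intent instead of passing raw GL usage hints. Two representative uses of the new create() overloads, mirroring call sites elsewhere in this commit:

```cpp
// Device-local buffer that the CPU may still update in place via sub_data():
gl::buffer ubo;
ubo.create(gl::buffer::target::uniform, 4096, nullptr,
    gl::buffer::memory_type::local, gl::buffer::usage::dynamic_update);

// Host-visible buffer kept persistently mapped for CPU polling:
gl::buffer labels;
labels.create(gl::buffer::target::array, 4096, nullptr,
    gl::buffer::memory_type::host_visible,
    gl::buffer::usage::host_read | gl::buffer::usage::host_write | gl::buffer::usage::persistent_map);
auto* ptr = labels.map(0, 4096, gl::buffer::access::persistent_rw);
```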

View File

@@ -79,4 +79,12 @@ namespace gl
{
glInsertEventMarkerEXT(static_cast<GLsizei>(strlen(label)), label);
}
+// Checks if GL state is still valid
+void check_state()
+{
+// GL_OUT_OF_MEMORY invalidates the OpenGL context and is actually the GL version of DEVICE_LOST.
+// This spec workaround allows it to be abused by ISVs to indicate a broken GL context.
+ensure(glGetError() != GL_OUT_OF_MEMORY);
+}
}
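Because the only error of interest here is the context-loss signal, check_state() is cheap enough to drop into spin-wait paths. A sketch of the intended pattern (the label pointer is hypothetical):

```cpp
#include <thread>
using u64 = unsigned long long;

// Sketch: guard a CPU-side spin on a persistently mapped label against a
// dead context. 'label' is a hypothetical pointer into the mapped range.
void wait_for_label(const volatile u64* label, u64 expected)
{
    while (*label < expected)
    {
        gl::check_state(); // raises via ensure() if the driver reported GL_OUT_OF_MEMORY
        std::this_thread::yield();
    }
}
```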

View File

@@ -242,14 +242,14 @@ namespace gl
}
}
-void scratch_ring_buffer::create(buffer::target target_, u64 size)
+void scratch_ring_buffer::create(buffer::target target_, u64 size, u32 usage_flags)
{
if (m_storage)
{
remove();
}
-m_storage.create(target_, size, nullptr, gl::buffer::memory_type::local, GL_STATIC_COPY);
+m_storage.create(target_, size, nullptr, gl::buffer::memory_type::local, usage_flags);
}
void scratch_ring_buffer::remove()

View File

@@ -103,7 +103,7 @@ namespace gl
scratch_ring_buffer(const scratch_ring_buffer&) = delete;
~scratch_ring_buffer();
-void create(buffer::target _target, u64 size);
+void create(buffer::target _target, u64 size, u32 usage_flags = 0);
void remove();
u32 alloc(u32 size, u32 alignment);

View File

@@ -80,7 +80,7 @@ namespace gl
if (!m_ubo)
{
ensure(compiled);
-m_ubo.create(gl::buffer::target::uniform, push_buffer_size, nullptr, gl::buffer::memory_type::local, GL_DYNAMIC_COPY);
+m_ubo.create(gl::buffer::target::uniform, push_buffer_size, nullptr, gl::buffer::memory_type::local, gl::buffer::usage::dynamic_update);
// Statically bind the image sources
m_program.uniforms["InputTexture"] = GL_TEMP_IMAGE_SLOT(0);

View File

@@ -27,7 +27,7 @@ namespace rsx
inline bool in_flight_commands_completed() const volatile
{
-return last_label_release2_event == commands_complete_event;
+return last_label_release2_event <= commands_complete_event;
}
inline bool texture_loads_completed() const volatile
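The relaxation from == to <= matters because event ids appear to come from a counter shared with other host events (see inc_counter() in the texture-read path above), so the completion value observed can already be newer than the last recorded label release; strict equality would then never be satisfied. A tiny worked example with illustrative values:

```cpp
#include <cassert>
using u64 = unsigned long long;

int main()
{
    // Illustrative: the GPU has already executed a newer event (id 8) than
    // the last label release this thread recorded (id 5).
    const u64 last_label_release2_event = 5;
    const u64 commands_complete_event = 8;

    assert(last_label_release2_event != commands_complete_event); // '==' would spin forever
    assert(last_label_release2_event <= commands_complete_event); // '<=' correctly reports done
}
```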