vulkan: Optimize vertex data upload

- Reuse buffer views as much as possible; vkCreateBufferView is slow on NV. Implemented as a large sliding window, reusable until it is filled.
2025-02-06 00:40:11 +00:00 · 2018-02-21 20:50:27 +03:00 · 2018-02-21 20:50:27 +03:00 · 8ccaabb502
commit 8ccaabb502
parent 01349b8cee
8 changed files with 105 additions and 57 deletions
--- a/rpcs3/Emu/RSX/Common/ring_buffer_helper.h
+++ b/rpcs3/Emu/RSX/Common/ring_buffer_helper.h
@ -131,4 +131,9 @@ public:
 		else
 			fmt::throw_exception("m_put_pos == m_get_pos!" HERE);
 	}
+
+	size_t size() const // Read-only accessor; does not modify the ring buffer
+	{
+		return m_size; // presumably the heap's total capacity in bytes - confirm against the class definition
+	}
 };
--- a/rpcs3/Emu/RSX/GL/GLGSRender.cpp
+++ b/rpcs3/Emu/RSX/GL/GLGSRender.cpp
@ -1034,7 +1034,7 @@ bool GLGSRender::check_program_state()
 	return (rsx::method_registers.shader_program_address() != 0);
 }

-void GLGSRender::load_program(const vertex_upload_info& upload_info)
+void GLGSRender::load_program(const gl::vertex_upload_info& upload_info)
 {
 	get_current_fragment_program(fs_sampler_state);
 	verify(HERE), current_fragment_program.valid;
--- a/rpcs3/Emu/RSX/GL/GLGSRender.h
+++ b/rpcs3/Emu/RSX/GL/GLGSRender.h
@ -21,6 +21,16 @@ namespace gl
 	using null_vertex_cache = vertex_cache;

 	using shader_cache = rsx::shaders_cache<void*, GLProgramBuffer>;
+
+	struct vertex_upload_info // Result of GLGSRender::set_vertex_buffer(), passed on to GLGSRender::load_program()
+	{
+		u32 vertex_draw_count; // copied from the upload result's vertex_draw_count
+		u32 allocated_vertex_count; // copied from the upload result's allocated_vertex_count
+		u32 vertex_index_base; // copied from the upload result's vertex_index_base
+		u32 persistent_mapping_offset; // initialised to 0u by set_vertex_buffer(); presumably the offset of the persistent attribute mapping - confirm
+		u32 volatile_mapping_offset; // initialised to 0u by set_vertex_buffer(); presumably the offset of the volatile attribute mapping - confirm
+		std::optional<std::tuple<GLenum, u32> > index_info; // copied from the upload result; presumably (index type, offset in index buffer) for indexed draws - confirm
+	};
 }

 struct work_item
@ -255,16 +265,6 @@ struct driver_state
 	}
 };

-struct vertex_upload_info
-{
-	u32 vertex_draw_count;
-	u32 allocated_vertex_count;
-	u32 vertex_index_base;
-	u32 persistent_mapping_offset;
-	u32 volatile_mapping_offset;
-	std::optional<std::tuple<GLenum, u32> > index_info;
-};
-
 class GLGSRender : public GSRender
 {
 private:
@ -340,14 +340,14 @@ private:
 	driver_state gl_state;

 	// Return element to draw and in case of indexed draw index type and offset in index buffer
-	vertex_upload_info set_vertex_buffer();
+	gl::vertex_upload_info set_vertex_buffer();
 	rsx::vertex_input_layout m_vertex_layout = {};

 	void clear_surface(u32 arg);
 	void init_buffers(rsx::framebuffer_creation_context context, bool skip_reading = false);

 	bool check_program_state();
-	void load_program(const vertex_upload_info& upload_info);
+	void load_program(const gl::vertex_upload_info& upload_info);

 	void update_draw_state();

--- a/rpcs3/Emu/RSX/GL/GLVertexBuffers.cpp
+++ b/rpcs3/Emu/RSX/GL/GLVertexBuffers.cpp
@ -180,7 +180,7 @@ namespace
 	};
 }

-vertex_upload_info GLGSRender::set_vertex_buffer()
+gl::vertex_upload_info GLGSRender::set_vertex_buffer()
 {
 	std::chrono::time_point<steady_clock> then = steady_clock::now();

@ -196,7 +196,7 @@ vertex_upload_info GLGSRender::set_vertex_buffer()
 	auto required = calculate_memory_requirements(m_vertex_layout, vertex_count);

 	std::pair<void*, u32> persistent_mapping = {}, volatile_mapping = {};
-	vertex_upload_info upload_info = { result.vertex_draw_count, result.allocated_vertex_count, result.vertex_index_base, 0u, 0u, result.index_info };
+	gl::vertex_upload_info upload_info = { result.vertex_draw_count, result.allocated_vertex_count, result.vertex_index_base, 0u, 0u, result.index_info };

 	if (required.first > 0)
 	{
--- a/rpcs3/Emu/RSX/VK/VKGSRender.cpp
+++ b/rpcs3/Emu/RSX/VK/VKGSRender.cpp
@ -666,6 +666,9 @@ VKGSRender::~VKGSRender()
 	vk::finalize_compiler_context();
 	m_prog_buffer->clear();

+	m_persistent_attribute_storage.reset();
+	m_volatile_attribute_storage.reset();
+
 	//Global resources
 	vk::destroy_global_resources();

@ -1209,10 +1212,12 @@ void VKGSRender::end()

 	//Load program
 	std::chrono::time_point<steady_clock> program_start = textures_end;
-	load_program(std::get<2>(upload_info), std::get<3>(upload_info));
+	load_program(upload_info);

-	m_program->bind_uniform(m_persistent_attribute_storage, "persistent_input_stream", m_current_frame->descriptor_set);
-	m_program->bind_uniform(m_volatile_attribute_storage, "volatile_input_stream", m_current_frame->descriptor_set);
+	VkBufferView persistent_buffer = m_persistent_attribute_storage ? m_persistent_attribute_storage->value : null_buffer_view->value;
+	VkBufferView volatile_buffer = m_volatile_attribute_storage ? m_volatile_attribute_storage->value : null_buffer_view->value;
+	m_program->bind_uniform(persistent_buffer, "persistent_input_stream", m_current_frame->descriptor_set);
+	m_program->bind_uniform(volatile_buffer, "volatile_input_stream", m_current_frame->descriptor_set);

 	std::chrono::time_point<steady_clock> program_stop = steady_clock::now();
 	m_setup_time += std::chrono::duration_cast<std::chrono::microseconds>(program_stop - program_start).count();
@ -1445,8 +1450,6 @@ void VKGSRender::end()
 		vkCmdClearAttachments(*m_current_command_buffer, static_cast<u32>(buffers_to_clear.size()), buffers_to_clear.data(), 1, &clear_rect);
 	}

-	std::optional<std::tuple<VkDeviceSize, VkIndexType> > index_info = std::get<4>(upload_info);
-
 	bool primitive_emulated = false;
 	vk::get_appropriate_topology(rsx::method_registers.current_draw_clause.primitive, primitive_emulated);

@ -1461,12 +1464,11 @@ void VKGSRender::end()
 		m_occlusion_map[m_active_query_info->driver_handle].command_buffer_to_wait = m_current_command_buffer;
 	}

-	if (!index_info)
+	if (!upload_info.index_info)
 	{
 		if (single_draw)
 		{
-			const auto vertex_count = std::get<1>(upload_info);
-			vkCmdDraw(*m_current_command_buffer, vertex_count, 1, 0, 0);
+			vkCmdDraw(*m_current_command_buffer, upload_info.vertex_draw_count, 1, 0, 0);
 		}
 		else
 		{
@ -1480,10 +1482,10 @@ void VKGSRender::end()
 	else
 	{
 		VkIndexType index_type;
-		u32 index_count = std::get<1>(upload_info);
+		const u32 index_count = upload_info.vertex_draw_count;
 		VkDeviceSize offset;

-		std::tie(offset, index_type) = index_info.value();
+		std::tie(offset, index_type) = upload_info.index_info.value();
 		vkCmdBindIndexBuffer(*m_current_command_buffer, m_index_buffer_ring_info.heap->value, offset, index_type);

 		if (single_draw)
@ -2160,7 +2162,7 @@ bool VKGSRender::check_program_status()
 	return (rsx::method_registers.shader_program_address() != 0);
 }

-void VKGSRender::load_program(u32 vertex_count, u32 vertex_base)
+void VKGSRender::load_program(const vk::vertex_upload_info& vertex_info)
 {
 	get_current_fragment_program(fs_sampler_state);
 	verify(HERE), current_fragment_program.valid;
@ -2343,11 +2345,13 @@ void VKGSRender::load_program(u32 vertex_count, u32 vertex_base)
 	fill_scale_offset_data(buf, false);
 	fill_user_clip_data(buf + 64);
 	*(reinterpret_cast<u32*>(buf + 128)) = rsx::method_registers.transform_branch_bits();
-	*(reinterpret_cast<u32*>(buf + 132)) = vertex_base;
+	*(reinterpret_cast<u32*>(buf + 132)) = vertex_info.vertex_index_base;
 	*(reinterpret_cast<f32*>(buf + 136)) = rsx::method_registers.point_size();
 	*(reinterpret_cast<f32*>(buf + 140)) = rsx::method_registers.clip_min();
 	*(reinterpret_cast<f32*>(buf + 144)) = rsx::method_registers.clip_max();
-	fill_vertex_layout_state(m_vertex_layout, vertex_count, reinterpret_cast<s32*>(buf + 160));
+
+	fill_vertex_layout_state(m_vertex_layout, vertex_info.allocated_vertex_count, reinterpret_cast<s32*>(buf + 160),
+			vertex_info.persistent_window_offset, vertex_info.volatile_window_offset);

 	//Vertex constants
 	buf = buf + 512;
--- a/rpcs3/Emu/RSX/VK/VKGSRender.h
+++ b/rpcs3/Emu/RSX/VK/VKGSRender.h
@ -23,6 +23,17 @@ namespace vk
 	using null_vertex_cache = vertex_cache;

 	using shader_cache = rsx::shaders_cache<vk::pipeline_props, VKProgramBuffer>;
+
+	struct vertex_upload_info // Result of VKGSRender::upload_vertex_data(); replaces the tuple that function used to return
+	{
+		VkPrimitiveTopology primitive; // filled from the upload result's native_primitive_type
+		u32 vertex_draw_count; // vertex count for vkCmdDraw in VKGSRender::end(); doubles as the index count when index_info is set
+		u32 allocated_vertex_count; // forwarded to fill_vertex_layout_state() by load_program()
+		u32 vertex_index_base; // written into the uniform scratch buffer at byte 132 by load_program()
+		u32 persistent_window_offset; // position of the persistent data relative to the start of the current buffer view (0 for a fresh view); UINT32_MAX when no persistent data was uploaded
+		u32 volatile_window_offset; // same as above for the volatile data
+		std::optional<std::tuple<VkDeviceSize, VkIndexType>> index_info; // (offset, index type) handed to vkCmdBindIndexBuffer; empty selects the non-indexed draw path
+	};
 }

 //Heap allocation sizes in MB
@ -262,8 +273,8 @@ private:
 	std::array<std::unique_ptr<vk::sampler>, rsx::limits::fragment_textures_count> fs_sampler_handles;
 	std::array<std::unique_ptr<vk::sampler>, rsx::limits::vertex_textures_count> vs_sampler_handles;

-	VkBufferView m_persistent_attribute_storage;
-	VkBufferView m_volatile_attribute_storage;
+	std::unique_ptr<vk::buffer_view> m_persistent_attribute_storage;
+	std::unique_ptr<vk::buffer_view> m_volatile_attribute_storage;

 public:
 	//vk::fbo draw_fbo;
@ -379,11 +390,11 @@ private:

 	void check_heap_status();

-	/// returns primitive topology, index_count, allocated_verts, vertex_base_index, (offset in index buffer, index type)
-	std::tuple<VkPrimitiveTopology, u32, u32, u32, std::optional<std::tuple<VkDeviceSize, VkIndexType> > > upload_vertex_data();
+	vk::vertex_upload_info upload_vertex_data();
+
 public:
 	bool check_program_status();
-	void load_program(u32 vertex_count, u32 vertex_base);
+	void load_program(const vk::vertex_upload_info& vertex_info);
 	void init_buffers(rsx::framebuffer_creation_context context, bool skip_reading = false);
 	void read_buffers();
 	void write_buffers();
--- a/rpcs3/Emu/RSX/VK/VKHelpers.h
+++ b/rpcs3/Emu/RSX/VK/VKHelpers.h
@ -616,6 +616,25 @@ namespace vk
 		buffer_view(const buffer_view&) = delete;
 		buffer_view(buffer_view&&) = delete;

+		bool in_range(u32 address, u32 size, u32& offset) const // Tests whether [address, address + size) fits inside the window described by info.offset / info.range
+		{ // Returns true and stores the window-relative position in 'offset' on success; returns false and leaves 'offset' untouched otherwise
+			if (address < info.offset) // requested start lies before the window begins
+				return false;
+
+			const u32 _offset = address - (u32)info.offset; // distance from the window start; info.offset is narrowed to 32 bits here
+			if (info.range < _offset) // requested start lies past the end of the window
+				return false;
+
+			const auto remaining = info.range - _offset; // bytes left between the requested start and the window end
+			if (size <= remaining)
+			{
+				offset = _offset; // written only on success, so callers may pass the same variable as 'address'
+				return true;
+			}
+
+			return false; // start is inside the window but the block would run past its end
+		}
+
 	private:
 		VkDevice m_device;
 	};
--- a/rpcs3/Emu/RSX/VK/VKVertexBuffers.cpp
+++ b/rpcs3/Emu/RSX/VK/VKVertexBuffers.cpp
@ -253,8 +253,7 @@ namespace
 	};
 }

-std::tuple<VkPrimitiveTopology, u32, u32, u32, std::optional<std::tuple<VkDeviceSize, VkIndexType> > >
-VKGSRender::upload_vertex_data()
+vk::vertex_upload_info VKGSRender::upload_vertex_data()
 {
 	m_vertex_layout = analyse_inputs_interleaved();

@ -266,11 +265,9 @@ VKGSRender::upload_vertex_data()

 	//Do actual vertex upload
 	auto required = calculate_memory_requirements(m_vertex_layout, vertex_count);
+	u32 persistent_range_base = UINT32_MAX, volatile_range_base = UINT32_MAX;
 	size_t persistent_offset = UINT64_MAX, volatile_offset = UINT64_MAX;

-	m_persistent_attribute_storage = VK_NULL_HANDLE;
-	m_volatile_attribute_storage = VK_NULL_HANDLE;
-
 	if (required.first > 0)
 	{
 		//Check if cacheable
@ -287,8 +284,7 @@ VKGSRender::upload_vertex_data()
 			if (auto cached = m_vertex_cache->find_vertex_range(storage_address, VK_FORMAT_R8_UINT, required.first))
 			{
 				in_cache = true;
-				m_current_frame->buffer_views_to_clean.push_back(std::make_unique<vk::buffer_view>(*m_device,
-					m_attrib_ring_info.heap->value, VK_FORMAT_R8_UINT, cached->offset_in_heap, required.first));
+				persistent_range_base = cached->offset_in_heap;
 			}
 			else
 			{
@ -299,8 +295,7 @@ VKGSRender::upload_vertex_data()
 		if (!in_cache)
 		{
 			persistent_offset = (u32)m_attrib_ring_info.alloc<256>(required.first);
-			m_current_frame->buffer_views_to_clean.push_back(std::make_unique<vk::buffer_view>(*m_device,
-				m_attrib_ring_info.heap->value, VK_FORMAT_R8_UINT, persistent_offset, required.first));
+			persistent_range_base = (u32)persistent_offset;

 			if (to_store)
 			{
@ -308,25 +303,12 @@ VKGSRender::upload_vertex_data()
 				m_vertex_cache->store_range(storage_address, VK_FORMAT_R8_UINT, required.first, (u32)persistent_offset);
 			}
 		}
-
-		m_persistent_attribute_storage = m_current_frame->buffer_views_to_clean.back()->value;
-	}
-	else
-	{
-		m_persistent_attribute_storage = null_buffer_view->value;
 	}

 	if (required.second > 0)
 	{
 		volatile_offset = (u32)m_attrib_ring_info.alloc<256>(required.second);
-		m_current_frame->buffer_views_to_clean.push_back(std::make_unique<vk::buffer_view>(*m_device,
-			m_attrib_ring_info.heap->value, VK_FORMAT_R8_UINT, volatile_offset, required.second));
-
-		m_volatile_attribute_storage = m_current_frame->buffer_views_to_clean.back()->value;
-	}
-	else
-	{
-		m_volatile_attribute_storage = null_buffer_view->value;
+		volatile_range_base = (u32)volatile_offset;
 	}

 	//Write all the data once if possible
@ -358,5 +340,32 @@ VKGSRender::upload_vertex_data()
 		}
 	}

-	return std::make_tuple(result.native_primitive_type, result.vertex_draw_count, result.allocated_vertex_count, result.vertex_index_base, result.index_info);
+	if (persistent_range_base != UINT32_MAX)
+	{
+		if (!m_persistent_attribute_storage || !m_persistent_attribute_storage->in_range(persistent_range_base, required.first, persistent_range_base))
+		{
+			if (m_persistent_attribute_storage)
+				m_current_frame->buffer_views_to_clean.push_back(std::move(m_persistent_attribute_storage));
+
+			//View 64M blocks at a time (different drivers will only allow a fixed viewable heap size, 64M should be safe)
+			const size_t view_size = (persistent_range_base + 0x4000000) > m_attrib_ring_info.size() ? m_attrib_ring_info.size() - persistent_range_base : 0x4000000;
+			m_persistent_attribute_storage = std::make_unique<vk::buffer_view>(*m_device, m_attrib_ring_info.heap->value, VK_FORMAT_R8_UINT, persistent_range_base, view_size);
+			persistent_range_base = 0;
+		}
+	}
+
+	if (volatile_range_base != UINT32_MAX)
+	{
+		if (!m_volatile_attribute_storage || !m_volatile_attribute_storage->in_range(volatile_range_base, required.second, volatile_range_base))
+		{
+			if (m_volatile_attribute_storage)
+				m_current_frame->buffer_views_to_clean.push_back(std::move(m_volatile_attribute_storage));
+
+			const size_t view_size = (volatile_range_base + 0x4000000) > m_attrib_ring_info.size() ? m_attrib_ring_info.size() - volatile_range_base : 0x4000000;
+			m_volatile_attribute_storage = std::make_unique<vk::buffer_view>(*m_device, m_attrib_ring_info.heap->value, VK_FORMAT_R8_UINT, volatile_range_base, view_size);
+			volatile_range_base = 0;
+		}
+	}
+
+	return{ result.native_primitive_type, result.vertex_draw_count, result.allocated_vertex_count, result.vertex_index_base, persistent_range_base, volatile_range_base, result.index_info };
 }