From 00b0311c867e4efc97e3947904b1f99296bfbc64 Mon Sep 17 00:00:00 2001
From: kd-11 <karokidii@gmail.com>
Date: Sat, 5 Aug 2017 00:11:14 +0300
Subject: [PATCH] rsx/gl/vulkan: Refactoring and partial vulkan rewrite -
 Updates vulkan to use GPU vertex processing - Rewrites vulkan to buffer
 entire frames and present when first available to avoid stalls - Move more
 state into dynamic descriptors to reduce program cache misses; Fix render pass
 conflicts before texture access - Discards incomplete cb at destruction to
 avoid refs to destroyed objects - Move set_viewport to the uninterruptible
 block before drawing in case cb is switched before we're ready - Manage frame
 contexts separately for easier async frame management - Avoid wasteful
 create-destroy cycles when sampling rtts

---
 rpcs3/Emu/RSX/Common/GLSLCommon.h    |  16 +-
 rpcs3/Emu/RSX/GL/GLVertexBuffers.cpp |  10 +-
 rpcs3/Emu/RSX/GL/GLVertexProgram.cpp |   7 +-
 rpcs3/Emu/RSX/VK/VKGSRender.cpp      | 515 +++++++++++++----------
 rpcs3/Emu/RSX/VK/VKGSRender.h        |  63 +--
 rpcs3/Emu/RSX/VK/VKProgramBuffer.h   |  26 +-
 rpcs3/Emu/RSX/VK/VKRenderTargets.h   |  10 +
 rpcs3/Emu/RSX/VK/VKTextOut.h         |   3 +
 rpcs3/Emu/RSX/VK/VKTextureCache.h    |  27 +-
 rpcs3/Emu/RSX/VK/VKVertexBuffers.cpp | 590 +++++++--------------------
 rpcs3/Emu/RSX/VK/VKVertexProgram.cpp | 127 ++----
 11 files changed, 592 insertions(+), 802 deletions(-)

diff --git a/rpcs3/Emu/RSX/Common/GLSLCommon.h b/rpcs3/Emu/RSX/Common/GLSLCommon.h
index 0737decd6c..6b6942494d 100644
--- a/rpcs3/Emu/RSX/Common/GLSLCommon.h
+++ b/rpcs3/Emu/RSX/Common/GLSLCommon.h
@@ -11,6 +11,12 @@ namespace glsl
 		glsl_fragment_program = 1
 	};
 
+	enum glsl_rules
+	{
+		glsl_rules_opengl4,
+		glsl_rules_rpirv
+	};
+
 	static std::string getFloatTypeNameImpl(size_t elementCount)
 	{
 		switch (elementCount)
@@ -48,8 +54,10 @@ namespace glsl
 		fmt::throw_exception("Unknown compare function" HERE);
 	}
 
-	static void insert_vertex_input_fetch(std::stringstream& OS)
+	static void insert_vertex_input_fetch(std::stringstream& OS, glsl_rules rules)
 	{
+		std::string vertex_id_name = (rules == glsl_rules_opengl4) ? "gl_VertexID" : "gl_VertexIndex";
+
 		//Actually decode a vertex attribute from a raw byte stream
 		OS << "struct attribute_desc\n";
 		OS << "{\n";
@@ -194,16 +202,16 @@ namespace glsl
 		OS << "{\n";
 		OS << "	attribute_desc desc = fetch_desc(location);\n";
 		OS << "\n";
-		OS << "	int vertex_id = gl_VertexID - int(vertex_base_index);\n";
+		OS << "	int vertex_id = " << vertex_id_name << " - int(vertex_base_index);\n";
 		OS << "	if (desc.frequency == 0)\n";
 		OS << "		vertex_id = 0;\n";
 		OS << "	else if (desc.frequency > 1)\n";
 		OS << "	{\n";
 		OS << "		//if a vertex modifier is active; vertex_base must be 0 and is ignored\n";
 		OS << "		if (desc.modulo != 0)\n";
-		OS << "			vertex_id = gl_VertexID % desc.divisor;\n";
+		OS << "			vertex_id = " << vertex_id_name << " % desc.divisor;\n";
 		OS << "		else\n";
-		OS << "			vertex_id = gl_VertexID / desc.divisor;\n";
+		OS << "			vertex_id = " << vertex_id_name << " / desc.divisor;\n";
 		OS << "	}\n";
 		OS << "\n";
 		OS << "	if (desc.is_volatile != 0)\n";
diff --git a/rpcs3/Emu/RSX/GL/GLVertexBuffers.cpp b/rpcs3/Emu/RSX/GL/GLVertexBuffers.cpp
index 353b9941af..164ae7934d 100644
--- a/rpcs3/Emu/RSX/GL/GLVertexBuffers.cpp
+++ b/rpcs3/Emu/RSX/GL/GLVertexBuffers.cpp
@@ -92,9 +92,8 @@ namespace
 
 		vertex_input_state operator()(const rsx::draw_array_command& command)
 		{
-			u32 vertex_count = rsx::method_registers.current_draw_clause.get_elements_count();
-			u32 min_index    = rsx::method_registers.current_draw_clause.first_count_commands.front().first;
-			u32 max_index    = vertex_count - 1 + min_index;
+			const u32 vertex_count = rsx::method_registers.current_draw_clause.get_elements_count();
+			const u32 min_index    = rsx::method_registers.current_draw_clause.first_count_commands.front().first;
 
 			if (!gl::is_primitive_native(rsx::method_registers.current_draw_clause.primitive))
 			{
@@ -153,7 +152,7 @@ namespace
 
 		vertex_input_state operator()(const rsx::draw_inlined_array& command)
 		{
-			u32 vertex_count = (u32)command.inline_vertex_array.size() * sizeof(u32) / m_vertex_layout.interleaved_blocks[0].attribute_stride;
+			const u32 vertex_count = (u32)command.inline_vertex_array.size() * sizeof(u32) / m_vertex_layout.interleaved_blocks[0].attribute_stride;
 
 			if (!gl::is_primitive_native(rsx::method_registers.current_draw_clause.primitive))
 			{
@@ -188,8 +187,7 @@ std::tuple<u32, u32, u32, std::optional<std::tuple<GLenum, u32>>> GLGSRender::se
 	auto &vertex_base = result.vertex_data_base;
 
 	//Do actual vertex upload
-	auto &required = calculate_memory_requirements(m_vertex_layout, vertex_count);
-
+	auto required = calculate_memory_requirements(m_vertex_layout, vertex_count);
 
 	std::pair<void*, u32> persistent_mapping = {}, volatile_mapping = {};
 
diff --git a/rpcs3/Emu/RSX/GL/GLVertexProgram.cpp b/rpcs3/Emu/RSX/GL/GLVertexProgram.cpp
index 44e64ddd91..3035234287 100644
--- a/rpcs3/Emu/RSX/GL/GLVertexProgram.cpp
+++ b/rpcs3/Emu/RSX/GL/GLVertexProgram.cpp
@@ -149,15 +149,10 @@ void GLVertexDecompilerThread::insertOutputs(std::stringstream & OS, const std::
 		OS << "out vec4 front_spec_color;\n";
 }
 
-namespace
-{
-
-}
-
 void GLVertexDecompilerThread::insertMainStart(std::stringstream & OS)
 {
 	insert_glsl_legacy_function(OS, glsl::glsl_vertex_program);
-	glsl::insert_vertex_input_fetch(OS);
+	glsl::insert_vertex_input_fetch(OS, glsl::glsl_rules_opengl4);
 
 	std::string parameters = "";
 	for (int i = 0; i < 16; ++i)
diff --git a/rpcs3/Emu/RSX/VK/VKGSRender.cpp b/rpcs3/Emu/RSX/VK/VKGSRender.cpp
index 6e69a30e24..abb10b96b8 100644
--- a/rpcs3/Emu/RSX/VK/VKGSRender.cpp
+++ b/rpcs3/Emu/RSX/VK/VKGSRender.cpp
@@ -564,27 +564,11 @@ VKGSRender::VKGSRender() : GSRender()
 
 	m_current_command_buffer = &m_primary_cb_list[0];
 	
-	//Create secondar command_buffer for parallel operations
+	//Create secondary command_buffer for parallel operations
 	m_secondary_command_buffer_pool.create((*m_device));
 	m_secondary_command_buffer.create(m_secondary_command_buffer_pool);
 	
-	open_command_buffer();
-
-	for (u32 i = 0; i < m_swap_chain->get_swap_image_count(); ++i)
-	{
-		vk::change_image_layout(*m_current_command_buffer, m_swap_chain->get_swap_chain_image(i),
-								VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_GENERAL,
-								vk::get_image_subresource_range(0, 0, 1, 1, VK_IMAGE_ASPECT_COLOR_BIT));
-
-		VkClearColorValue clear_color{};
-		auto range = vk::get_image_subresource_range(0, 0, 1, 1, VK_IMAGE_ASPECT_COLOR_BIT);
-		vkCmdClearColorImage(*m_current_command_buffer, m_swap_chain->get_swap_chain_image(i), VK_IMAGE_LAYOUT_GENERAL, &clear_color, 1, &range);
-		vk::change_image_layout(*m_current_command_buffer, m_swap_chain->get_swap_chain_image(i),
-			VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_LAYOUT_PRESENT_SRC_KHR,
-			vk::get_image_subresource_range(0, 0, 1, 1, VK_IMAGE_ASPECT_COLOR_BIT));
-
-	}
-
+	//VRAM allocation
 	m_attrib_ring_info.init(VK_ATTRIB_RING_BUFFER_SIZE_M * 0x100000);
 	m_attrib_ring_info.heap.reset(new vk::buffer(*m_device, VK_ATTRIB_RING_BUFFER_SIZE_M * 0x100000, m_memory_type_mapping.host_visible_coherent, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT, 0));
 	m_uniform_buffer_ring_info.init(VK_UBO_RING_BUFFER_SIZE_M * 0x100000);
@@ -598,25 +582,25 @@ VKGSRender::VKGSRender() : GSRender()
 
 	std::tie(pipeline_layout, descriptor_layouts) = get_shared_pipeline_layout(*m_device);
 
+	//Generate frame contexts
 	VkDescriptorPoolSize uniform_buffer_pool = { VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER , 3 * DESCRIPTOR_MAX_DRAW_CALLS };
 	VkDescriptorPoolSize uniform_texel_pool = { VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER , 16 * DESCRIPTOR_MAX_DRAW_CALLS };
 	VkDescriptorPoolSize texture_pool = { VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER , 20 * DESCRIPTOR_MAX_DRAW_CALLS };
 
 	std::vector<VkDescriptorPoolSize> sizes{ uniform_buffer_pool, uniform_texel_pool, texture_pool };
 
-	descriptor_pool.create(*m_device, sizes.data(), static_cast<uint32_t>(sizes.size()));
-
-
-	null_buffer = std::make_unique<vk::buffer>(*m_device, 32, m_memory_type_mapping.host_visible_coherent, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT, 0);
-	null_buffer_view = std::make_unique<vk::buffer_view>(*m_device, null_buffer->value, VK_FORMAT_R32_SFLOAT, 0, 32);
-
-	VkFenceCreateInfo fence_info = {};
-	fence_info.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO;
-
 	VkSemaphoreCreateInfo semaphore_info = {};
 	semaphore_info.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO;
 
-	vkCreateSemaphore((*m_device), &semaphore_info, nullptr, &m_present_semaphore);
+	for (auto &ctx : frame_context)
+	{
+		ctx = {};
+		vkCreateSemaphore((*m_device), &semaphore_info, nullptr, &ctx.present_semaphore);
+		ctx.descriptor_pool.create(*m_device, sizes.data(), static_cast<uint32_t>(sizes.size()));
+	}
+
+	null_buffer = std::make_unique<vk::buffer>(*m_device, 32, m_memory_type_mapping.host_visible_coherent, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT, 0);
+	null_buffer_view = std::make_unique<vk::buffer_view>(*m_device, null_buffer->value, VK_FORMAT_R32_SFLOAT, 0, 32);
 
 	vk::initialize_compiler_context();
 
@@ -631,6 +615,25 @@ VKGSRender::VKGSRender() : GSRender()
 		m_vertex_cache.reset(new vk::null_vertex_cache());
 	else
 		m_vertex_cache.reset(new vk::weak_vertex_cache());
+
+	open_command_buffer();
+
+	for (u32 i = 0; i < m_swap_chain->get_swap_image_count(); ++i)
+	{
+		vk::change_image_layout(*m_current_command_buffer, m_swap_chain->get_swap_chain_image(i),
+			VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_GENERAL,
+			vk::get_image_subresource_range(0, 0, 1, 1, VK_IMAGE_ASPECT_COLOR_BIT));
+
+		VkClearColorValue clear_color{};
+		auto range = vk::get_image_subresource_range(0, 0, 1, 1, VK_IMAGE_ASPECT_COLOR_BIT);
+		vkCmdClearColorImage(*m_current_command_buffer, m_swap_chain->get_swap_chain_image(i), VK_IMAGE_LAYOUT_GENERAL, &clear_color, 1, &range);
+		vk::change_image_layout(*m_current_command_buffer, m_swap_chain->get_swap_chain_image(i),
+			VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_LAYOUT_PRESENT_SRC_KHR,
+			vk::get_image_subresource_range(0, 0, 1, 1, VK_IMAGE_ASPECT_COLOR_BIT));
+
+	}
+
+	m_current_frame = &frame_context[0];
 }
 
 VKGSRender::~VKGSRender()
@@ -641,23 +644,9 @@ VKGSRender::~VKGSRender()
 		return;
 	}
 
-	//Close recording and wait for all to finish
-	close_render_pass();
-	CHECK_RESULT(vkEndCommandBuffer(*m_current_command_buffer));
-
-	for (auto &cb : m_primary_cb_list)
-		if (cb.pending) cb.wait();
-
 	//Wait for device to finish up with resources
 	vkDeviceWaitIdle(*m_device);
 
-	//Sync objects
-	if (m_present_semaphore)
-	{
-		vkDestroySemaphore((*m_device), m_present_semaphore, nullptr);
-		m_present_semaphore = nullptr;
-	}
-
 	//Texture cache
 	m_texture_cache.destroy();
 
@@ -678,10 +667,17 @@ VKGSRender::~VKGSRender()
 	null_buffer.reset();
 	null_buffer_view.reset();
 
-	//Temporary objects
-	m_buffer_view_to_clean.clear();
-	m_sampler_to_clean.clear();
-	m_framebuffer_to_clean.clear();
+	//Frame context
+	for (auto &ctx : frame_context)
+	{
+		vkDestroySemaphore((*m_device), ctx.present_semaphore, nullptr);
+		ctx.descriptor_pool.destroy();
+
+		ctx.buffer_views_to_clean.clear();
+		ctx.samplers_to_clean.clear();
+		ctx.framebuffers_to_clean.clear();
+	}
+
 	m_draw_fbo.reset();
 
 	//Render passes
@@ -699,8 +695,6 @@ VKGSRender::~VKGSRender()
 	vkDestroyPipelineLayout(*m_device, pipeline_layout, nullptr);
 	vkDestroyDescriptorSetLayout(*m_device, descriptor_layouts, nullptr);
 
-	descriptor_pool.destroy();
-
 	//Command buffer
 	for (auto &cb : m_primary_cb_list)
 		cb.destroy();
@@ -736,8 +730,6 @@ bool VKGSRender::on_access_violation(u32 address, bool is_writing)
 			if (!flushable)
 				return false;
 
-			close_render_pass();
-
 			if (synchronized)
 			{
 				if (m_last_flushable_cb >= 0)
@@ -807,8 +799,21 @@ void VKGSRender::begin()
 		return;
 
 	//Ease resource pressure if the number of draw calls becomes too high or we are running low on memory resources
-	if (m_used_descriptors >= DESCRIPTOR_MAX_DRAW_CALLS ||
-		m_attrib_ring_info.is_critical() ||
+	if (m_current_frame->used_descriptors >= DESCRIPTOR_MAX_DRAW_CALLS)
+	{
+		//No need to stall if we have more than one frame queue anyway
+		flush_command_queue();
+		
+		CHECK_RESULT(vkResetDescriptorPool(*m_device, m_current_frame->descriptor_pool, 0));
+		m_current_frame->used_descriptors = 0;
+
+		m_uniform_buffer_ring_info.reset_allocation_stats();
+		m_index_buffer_ring_info.reset_allocation_stats();
+		m_attrib_ring_info.reset_allocation_stats();
+		m_texture_upload_buffer_ring_info.reset_allocation_stats();
+	}
+
+	if (m_attrib_ring_info.is_critical() ||
 		m_texture_upload_buffer_ring_info.is_critical() ||
 		m_uniform_buffer_ring_info.is_critical() ||
 		m_index_buffer_ring_info.is_critical())
@@ -818,20 +823,17 @@ void VKGSRender::begin()
 		flush_command_queue(true);
 		m_vertex_cache->purge();
 
-		CHECK_RESULT(vkResetDescriptorPool(*m_device, descriptor_pool, 0));
-		m_used_descriptors = 0;
-
-		m_uniform_buffer_ring_info.reset_allocation_stats();
-		m_index_buffer_ring_info.reset_allocation_stats();
-		m_attrib_ring_info.reset_allocation_stats();
-		m_texture_upload_buffer_ring_info.reset_allocation_stats();
-
 		std::chrono::time_point<steady_clock> submit_end = steady_clock::now();
 		m_flip_time += std::chrono::duration_cast<std::chrono::microseconds>(submit_end - submit_start).count();
 	}
 
+	init_buffers();
+
+	if (!framebuffer_status_valid)
+		return;
+
 	VkDescriptorSetAllocateInfo alloc_info = {};
-	alloc_info.descriptorPool = descriptor_pool;
+	alloc_info.descriptorPool = m_current_frame->descriptor_pool;
 	alloc_info.descriptorSetCount = 1;
 	alloc_info.pSetLayouts = &descriptor_layouts;
 	alloc_info.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO;
@@ -839,16 +841,11 @@ void VKGSRender::begin()
 	VkDescriptorSet new_descriptor_set;
 	CHECK_RESULT(vkAllocateDescriptorSets(*m_device, &alloc_info, &new_descriptor_set));
 
-	descriptor_sets = new_descriptor_set;
-	m_used_descriptors++;
+	m_current_frame->descriptor_set = new_descriptor_set;
+	m_current_frame->used_descriptors++;
 
 	std::chrono::time_point<steady_clock> start = steady_clock::now();
 
-	init_buffers();
-
-	if (!framebuffer_status_valid)
-		return;
-
 	float actual_line_width = rsx::method_registers.line_width();
 
 	vkCmdSetLineWidth(*m_current_command_buffer, actual_line_width);
@@ -901,20 +898,28 @@ void VKGSRender::end()
 		return;
 	}
 
-	std::chrono::time_point<steady_clock> program_start = steady_clock::now();
-
 	//Load program here since it is dependent on vertex state
-	if (!load_program())
+	if (!check_program_status())
 	{
 		LOG_ERROR(RSX, "No valid program bound to pipeline. Skipping draw");
 		rsx::thread::end();
 		return;
 	}
 
-	std::chrono::time_point<steady_clock> program_stop = steady_clock::now();
-	//m_setup_time += std::chrono::duration_cast<std::chrono::microseconds>(program_stop - program_start).count();
+	//Programs data is dependent on vertex state
+	std::chrono::time_point<steady_clock> vertex_start = steady_clock::now();
+	auto upload_info = upload_vertex_data();
+	std::chrono::time_point<steady_clock> vertex_end = steady_clock::now();
+	m_vertex_upload_time += std::chrono::duration_cast<std::chrono::microseconds>(vertex_end - vertex_start).count();
 
-	close_render_pass();	//Texture upload stuff conflicts active RPs
+	//Load program
+	std::chrono::time_point<steady_clock> program_start = steady_clock::now();
+	load_program(std::get<2>(upload_info), std::get<3>(upload_info));
+	std::chrono::time_point<steady_clock> program_stop = steady_clock::now();
+	m_setup_time += std::chrono::duration_cast<std::chrono::microseconds>(program_stop - program_start).count();
+
+	//Close current pass to avoid conflict with texture functions
+	close_render_pass();
 
 	if (g_cfg.video.strict_rendering_mode)
 	{
@@ -967,11 +972,6 @@ void VKGSRender::end()
 		}
 	}
 
-	std::chrono::time_point<steady_clock> vertex_start0 = steady_clock::now();
-	auto upload_info = upload_vertex_data();
-	std::chrono::time_point<steady_clock> vertex_end0 = steady_clock::now();
-	m_vertex_upload_time += std::chrono::duration_cast<std::chrono::microseconds>(vertex_end0 - vertex_start0).count();
-
 	std::chrono::time_point<steady_clock> textures_start = steady_clock::now();
 
 	for (int i = 0; i < rsx::limits::fragment_textures_count; ++i)
@@ -980,7 +980,7 @@ void VKGSRender::end()
 		{
 			if (!rsx::method_registers.fragment_textures[i].enabled())
 			{
-				m_program->bind_uniform({ vk::null_sampler(), vk::null_image_view(), VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL }, "tex" + std::to_string(i), descriptor_sets);
+				m_program->bind_uniform({ vk::null_sampler(), vk::null_image_view(), VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL }, "tex" + std::to_string(i), m_current_frame->descriptor_set);
 				continue;
 			}
 
@@ -989,7 +989,7 @@ void VKGSRender::end()
 			if (!texture0)
 			{
 				LOG_ERROR(RSX, "Texture upload failed to texture index %d. Binding null sampler.", i);
-				m_program->bind_uniform({ vk::null_sampler(), vk::null_image_view(), VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL }, "tex" + std::to_string(i), descriptor_sets);
+				m_program->bind_uniform({ vk::null_sampler(), vk::null_image_view(), VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL }, "tex" + std::to_string(i), m_current_frame->descriptor_set);
 				continue;
 			}
 
@@ -1016,7 +1016,7 @@ void VKGSRender::end()
 				mip_mode = VK_SAMPLER_MIPMAP_MODE_NEAREST;
 			}
 			
-			m_sampler_to_clean.push_back(std::make_unique<vk::sampler>(
+			m_current_frame->samplers_to_clean.push_back(std::make_unique<vk::sampler>(
 				*m_device,
 				vk::vk_wrap_mode(rsx::method_registers.fragment_textures[i].wrap_s()), vk::vk_wrap_mode(rsx::method_registers.fragment_textures[i].wrap_t()), vk::vk_wrap_mode(rsx::method_registers.fragment_textures[i].wrap_r()),
 				!!(rsx::method_registers.fragment_textures[i].format() & CELL_GCM_TEXTURE_UN),
@@ -1024,7 +1024,7 @@ void VKGSRender::end()
 				min_filter, vk::get_mag_filter(rsx::method_registers.fragment_textures[i].mag_filter()), mip_mode, vk::get_border_color(rsx::method_registers.fragment_textures[i].border_color()),
 				is_depth_texture, depth_compare));
 
-			m_program->bind_uniform({ m_sampler_to_clean.back()->value, texture0->value, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL }, "tex" + std::to_string(i), descriptor_sets);
+			m_program->bind_uniform({ m_current_frame->samplers_to_clean.back()->value, texture0->value, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL }, "tex" + std::to_string(i), m_current_frame->descriptor_set);
 		}
 	}
 	
@@ -1034,7 +1034,7 @@ void VKGSRender::end()
 		{
 			if (!rsx::method_registers.vertex_textures[i].enabled())
 			{
-				m_program->bind_uniform({ vk::null_sampler(), vk::null_image_view(), VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL }, "vtex" + std::to_string(i), descriptor_sets);
+				m_program->bind_uniform({ vk::null_sampler(), vk::null_image_view(), VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL }, "vtex" + std::to_string(i), m_current_frame->descriptor_set);
 				continue;
 			}
 
@@ -1043,11 +1043,11 @@ void VKGSRender::end()
 			if (!texture0)
 			{
 				LOG_ERROR(RSX, "Texture upload failed to vtexture index %d. Binding null sampler.", i);
-				m_program->bind_uniform({ vk::null_sampler(), vk::null_image_view(), VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL }, "vtex" + std::to_string(i), descriptor_sets);
+				m_program->bind_uniform({ vk::null_sampler(), vk::null_image_view(), VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL }, "vtex" + std::to_string(i), m_current_frame->descriptor_set);
 				continue;
 			}
 
-			m_sampler_to_clean.push_back(std::make_unique<vk::sampler>(
+			m_current_frame->samplers_to_clean.push_back(std::make_unique<vk::sampler>(
 				*m_device,
 				VK_SAMPLER_ADDRESS_MODE_REPEAT, VK_SAMPLER_ADDRESS_MODE_REPEAT, VK_SAMPLER_ADDRESS_MODE_REPEAT,
 				!!(rsx::method_registers.vertex_textures[i].format() & CELL_GCM_TEXTURE_UN),
@@ -1055,7 +1055,7 @@ void VKGSRender::end()
 				VK_FILTER_NEAREST, VK_FILTER_NEAREST, VK_SAMPLER_MIPMAP_MODE_NEAREST, vk::get_border_color(rsx::method_registers.vertex_textures[i].border_color())
 				));
 
-			m_program->bind_uniform({ m_sampler_to_clean.back()->value, texture0->value, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL }, "vtex" + std::to_string(i), descriptor_sets);
+			m_program->bind_uniform({ m_current_frame->samplers_to_clean.back()->value, texture0->value, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL }, "vtex" + std::to_string(i), m_current_frame->descriptor_set);
 		}
 	}
 
@@ -1066,9 +1066,12 @@ void VKGSRender::end()
 	//Only textures are synchronized tightly with the GPU and they have been read back above
 	vk::enter_uninterruptible();
 
+	set_viewport();
+
 	begin_render_pass();
+
 	vkCmdBindPipeline(*m_current_command_buffer, VK_PIPELINE_BIND_POINT_GRAPHICS, m_program->pipeline);
-	vkCmdBindDescriptorSets(*m_current_command_buffer, VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline_layout, 0, 1, &descriptor_sets, 0, nullptr);
+	vkCmdBindDescriptorSets(*m_current_command_buffer, VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline_layout, 0, 1, &m_current_frame->descriptor_set, 0, nullptr);
 
 	//Clear any 'dirty' surfaces - possible is a recycled cache surface is used
 	std::vector<VkClearAttachment> buffers_to_clear;
@@ -1108,11 +1111,7 @@ void VKGSRender::end()
 		vkCmdClearAttachments(*m_current_command_buffer, static_cast<u32>(buffers_to_clear.size()), buffers_to_clear.data(), 1, &clear_rect);
 	}
 
-	std::optional<std::tuple<VkDeviceSize, VkIndexType> > index_info = std::get<2>(upload_info);
-
-	std::chrono::time_point<steady_clock> vertex_end = steady_clock::now();
-	m_vertex_upload_time += std::chrono::duration_cast<std::chrono::microseconds>(vertex_end - textures_end).count();
-
+	std::optional<std::tuple<VkDeviceSize, VkIndexType> > index_info = std::get<4>(upload_info);
 	if (!index_info)
 	{
 		const auto vertex_count = std::get<1>(upload_info);
@@ -1133,7 +1132,7 @@ void VKGSRender::end()
 	vk::leave_uninterruptible();
 
 	std::chrono::time_point<steady_clock> draw_end = steady_clock::now();
-	m_draw_time += std::chrono::duration_cast<std::chrono::microseconds>(draw_end - vertex_end).count();
+	m_draw_time += std::chrono::duration_cast<std::chrono::microseconds>(draw_end - textures_end).count();
 
 	copy_render_targets_to_dma_location();
 	m_draw_calls++;
@@ -1219,7 +1218,7 @@ void VKGSRender::clear_surface(u32 mask)
 	if (rsx::method_registers.surface_color_target() == rsx::surface_target::none) return;
 
 	if (!(mask & 0xF3)) return;
-	if (m_current_present_image == 0xFFFF) return;
+	if (m_current_frame->present_image == UINT32_MAX) return;
 
 	init_buffers();
 
@@ -1339,6 +1338,8 @@ void VKGSRender::copy_render_targets_to_dma_location()
 
 	if (g_cfg.video.write_color_buffers)
 	{
+		close_render_pass();
+
 		for (u8 index = 0; index < rsx::limits::color_buffers_count; index++)
 		{
 			if (!m_surface_info[index].pitch)
@@ -1351,6 +1352,8 @@ void VKGSRender::copy_render_targets_to_dma_location()
 
 	if (g_cfg.video.write_depth_buffer)
 	{
+		close_render_pass();
+
 		if (m_depth_surface_info.pitch)
 		{
 			m_texture_cache.flush_memory_to_cache(m_depth_surface_info.address, m_depth_surface_info.pitch * m_depth_surface_info.height,
@@ -1374,7 +1377,7 @@ void VKGSRender::flush_command_queue(bool hard_sync)
 	if (hard_sync)
 	{
 		//swap handler checks the pending flag, so call it here
-		process_swap_request();
+		process_swap_request(m_current_frame);
 
 		//wait for the latest intruction to execute
 		m_current_command_buffer->pending = true;
@@ -1395,83 +1398,128 @@ void VKGSRender::flush_command_queue(bool hard_sync)
 		//Grab next cb in line and make it usable
 		m_current_cb_index = (m_current_cb_index + 1) % VK_MAX_ASYNC_CB_COUNT;
 		m_current_command_buffer = &m_primary_cb_list[m_current_cb_index];
+
+		//Soft sync if a present has not yet occurred before consuming the wait event
+		for (auto &ctx : frame_context)
+		{
+			if (ctx.swap_command_buffer == m_current_command_buffer)
+				process_swap_request(&ctx, true);
+		}
+
 		m_current_command_buffer->reset();
 	}
 
 	open_command_buffer();
 }
 
+void VKGSRender::advance_queued_frames()
+{
+	//Check all other frames for completion and clear resources
+	for (auto &ctx : frame_context)
+	{
+		if (&ctx == m_current_frame)
+			continue;
+
+		if (ctx.swap_command_buffer)
+		{
+			ctx.swap_command_buffer->poke();
+			if (ctx.swap_command_buffer->pending)
+				continue;
+
+			//Present the bound image
+			process_swap_request(&ctx, true);
+		}
+	}
+
+	//Only marks surfaces as dirty without actually deleting them so it's safe to use
+	if (g_cfg.video.invalidate_surface_cache_every_frame)
+		m_rtts.invalidate_surface_cache_data(&*m_current_command_buffer);
+
+	//m_rtts storage is double buffered and should be safe to tag on frame boundary
+	m_rtts.free_invalidated();
+
+	//texture cache is also double buffered to prevent use-after-free
+	m_texture_cache.flush();
+
+	m_vertex_cache->purge();
+
+	m_current_queue_index = (m_current_queue_index + 1) % VK_MAX_ASYNC_FRAMES;
+	m_current_frame = &frame_context[m_current_queue_index];
+}
+
+void VKGSRender::present(frame_context_t *ctx)
+{
+	VkSwapchainKHR swap_chain = (VkSwapchainKHR)(*m_swap_chain);
+
+	VkPresentInfoKHR present = {};
+	present.sType = VK_STRUCTURE_TYPE_PRESENT_INFO_KHR;
+	present.pNext = nullptr;
+	present.swapchainCount = 1;
+	present.pSwapchains = &swap_chain;
+	present.pImageIndices = &ctx->present_image;
+	CHECK_RESULT(m_swap_chain->queuePresentKHR(m_swap_chain->get_present_queue(), &present));
+}
+
 void VKGSRender::queue_swap_request()
 {
 	//buffer the swap request and return
-	if (m_swap_command_buffer && m_swap_command_buffer->pending)
+	if (m_current_frame->swap_command_buffer &&
+		m_current_frame->swap_command_buffer->pending)
 	{
 		//Its probable that no actual drawing took place
-		process_swap_request();
+		process_swap_request(m_current_frame);
 	}
 
-	m_swap_command_buffer = m_current_command_buffer;
-	close_and_submit_command_buffer({ m_present_semaphore }, m_current_command_buffer->submit_fence);
+	m_current_frame->swap_command_buffer = m_current_command_buffer;
+	close_and_submit_command_buffer({ m_current_frame->present_semaphore }, m_current_command_buffer->submit_fence);
+	m_current_frame->swap_command_buffer->pending = true;
 
 	//Grab next cb in line and make it usable
 	m_current_cb_index = (m_current_cb_index + 1) % VK_MAX_ASYNC_CB_COUNT;
 	m_current_command_buffer = &m_primary_cb_list[m_current_cb_index];
 	m_current_command_buffer->reset();
 
-	m_swap_command_buffer->pending = true;
+	//Set up new pointers for the next frame
+	advance_queued_frames();
 	open_command_buffer();
 }
 
-void VKGSRender::process_swap_request()
+void VKGSRender::process_swap_request(frame_context_t *ctx, bool free_resources)
 {
-	if (!m_swap_command_buffer)
+	if (!ctx->swap_command_buffer)
 		return;
 
-	if (m_swap_command_buffer->pending)
+	if (ctx->swap_command_buffer->pending)
 	{
 		//Perform hard swap here
-		m_swap_command_buffer->wait();
-
-		VkSwapchainKHR swap_chain = (VkSwapchainKHR)(*m_swap_chain);
-
-		VkPresentInfoKHR present = {};
-		present.sType = VK_STRUCTURE_TYPE_PRESENT_INFO_KHR;
-		present.pNext = nullptr;
-		present.swapchainCount = 1;
-		present.pSwapchains = &swap_chain;
-		present.pImageIndices = &m_current_present_image;
-		CHECK_RESULT(m_swap_chain->queuePresentKHR(m_swap_chain->get_present_queue(), &present));
+		ctx->swap_command_buffer->wait();
+		free_resources = true;
 	}
 
-	//Clean up all the resources from the last frame
+	//Always present
+	present(ctx);
 
-	//Feed back damaged resources to the main texture cache for management...
-	//m_texture_cache.merge_dirty_textures(m_rtts.invalidated_resources);
-	
-	m_rtts.free_invalidated();
-	m_texture_cache.flush();
-
-	if (g_cfg.video.invalidate_surface_cache_every_frame)
-		m_rtts.invalidate_surface_cache_data(&*m_current_command_buffer);
-
-	m_buffer_view_to_clean.clear();
-	m_sampler_to_clean.clear();
-
-	m_framebuffer_to_clean.remove_if([](std::unique_ptr<vk::framebuffer_holder>& fbo)
+	if (free_resources)
 	{
-		if (fbo->deref_count >= 2) return true;
-		fbo->deref_count++;
-		return false;
-	});
+		//Cleanup of reference sensitive resources
+		//TODO: These should be double buffered as well to prevent destruction of anything in use
+		if (g_cfg.video.overlay)
+		{
+			m_text_writer->reset_descriptors();
+		}
 
-	if (g_cfg.video.overlay)
-	{
-		m_text_writer->reset_descriptors();
+		ctx->buffer_views_to_clean.clear();
+		ctx->samplers_to_clean.clear();
+
+		ctx->framebuffers_to_clean.remove_if([](std::unique_ptr<vk::framebuffer_holder>& fbo)
+		{
+			if (fbo->deref_count >= 2) return true;
+			fbo->deref_count++;
+			return false;
+		});
 	}
 
-	m_vertex_cache->purge();
-
-	m_swap_command_buffer = nullptr;
+	ctx->swap_command_buffer = nullptr;
 }
 
 void VKGSRender::do_local_task()
@@ -1482,6 +1530,7 @@ void VKGSRender::do_local_task()
 
 		//TODO: Determine if a hard sync is necessary
 		//Pipeline barriers later may do a better job synchronizing than wholly stalling the pipeline
+		close_render_pass();
 		flush_command_queue();
 
 		m_flush_commands = false;
@@ -1509,11 +1558,8 @@ bool VKGSRender::do_method(u32 cmd, u32 arg)
 	}
 }
 
-bool VKGSRender::load_program(bool)
+bool VKGSRender::check_program_status()
 {
-	auto &vertex_program = current_vertex_program;
-	auto &fragment_program = current_fragment_program;
-
 	auto rtt_lookup_func = [this](u32 texaddr, rsx::fragment_texture&, bool is_depth) -> std::tuple<bool, u16>
 	{
 		vk::render_target *surface = nullptr;
@@ -1528,24 +1574,29 @@ bool VKGSRender::load_program(bool)
 	};
 
 	get_current_fragment_program(rtt_lookup_func);
-	if (!fragment_program.valid) return false;
+	if (!current_fragment_program.valid) return false;
 
 	get_current_vertex_program();
 
+	auto &vertex_program = current_vertex_program;
+	auto &fragment_program = current_fragment_program;
+
 	vk::pipeline_props properties = {};
 
-	properties.ia.sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO;
 	bool unused;
+	bool update_blend_constants = false;
+	bool update_stencil_info_back = false;
+	bool update_stencil_info_front = false;
+	bool update_depth_bounds = false;
+
+	properties.ia.sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO;
 	properties.ia.topology = vk::get_appropriate_topology(rsx::method_registers.current_draw_clause.primitive, unused);
 
 	if (rsx::method_registers.restart_index_enabled())
-	{
 		properties.ia.primitiveRestartEnable = VK_TRUE;
-	}
 	else
 		properties.ia.primitiveRestartEnable = VK_FALSE;
 
-
 	for (int i = 0; i < 4; ++i)
 	{
 		properties.att_state[i].colorWriteMask = 0xf;
@@ -1588,11 +1639,8 @@ bool VKGSRender::load_program(bool)
 			properties.att_state[render_targets[idx]].alphaBlendOp = equation_a;
 		}
 
-		auto blend_colors = rsx::get_constant_blend_colors();
-		properties.cs.blendConstants[0] = blend_colors[0];
-		properties.cs.blendConstants[1] = blend_colors[1];
-		properties.cs.blendConstants[2] = blend_colors[2];
-		properties.cs.blendConstants[3] = blend_colors[3];
+		//Blend constants are dynamic
+		update_blend_constants = true;
 	}
 	else
 	{
@@ -1618,8 +1666,7 @@ bool VKGSRender::load_program(bool)
 	if (rsx::method_registers.depth_bounds_test_enabled())
 	{
 		properties.ds.depthBoundsTestEnable = VK_TRUE;
-		properties.ds.minDepthBounds = rsx::method_registers.depth_bounds_min();
-		properties.ds.maxDepthBounds = rsx::method_registers.depth_bounds_max();
+		update_depth_bounds = true;
 	}
 	else
 		properties.ds.depthBoundsTestEnable = VK_FALSE;
@@ -1627,9 +1674,6 @@ bool VKGSRender::load_program(bool)
 	if (rsx::method_registers.stencil_test_enabled())
 	{
 		properties.ds.stencilTestEnable = VK_TRUE;
-		properties.ds.front.writeMask = rsx::method_registers.stencil_mask();
-		properties.ds.front.compareMask = rsx::method_registers.stencil_func_mask();
-		properties.ds.front.reference = rsx::method_registers.stencil_func_ref();
 		properties.ds.front.failOp = vk::get_stencil_op(rsx::method_registers.stencil_op_fail());
 		properties.ds.front.passOp = vk::get_stencil_op(rsx::method_registers.stencil_op_zpass());
 		properties.ds.front.depthFailOp = vk::get_stencil_op(rsx::method_registers.stencil_op_zfail());
@@ -1637,16 +1681,16 @@ bool VKGSRender::load_program(bool)
 
 		if (rsx::method_registers.two_sided_stencil_test_enabled())
 		{
-			properties.ds.back.writeMask = rsx::method_registers.back_stencil_mask();
-			properties.ds.back.compareMask = rsx::method_registers.back_stencil_func_mask();
-			properties.ds.back.reference = rsx::method_registers.back_stencil_func_ref();
 			properties.ds.back.failOp = vk::get_stencil_op(rsx::method_registers.back_stencil_op_fail());
 			properties.ds.back.passOp = vk::get_stencil_op(rsx::method_registers.back_stencil_op_zpass());
 			properties.ds.back.depthFailOp = vk::get_stencil_op(rsx::method_registers.back_stencil_op_zfail());
 			properties.ds.back.compareOp = vk::get_compare_func(rsx::method_registers.back_stencil_func());
+			update_stencil_info_back = true;
 		}
 		else
 			properties.ds.back = properties.ds.front;
+
+		update_stencil_info_front = true;
 	}
 	else
 		properties.ds.stencilTestEnable = VK_FALSE;
@@ -1684,55 +1728,84 @@ bool VKGSRender::load_program(bool)
 	vk::enter_uninterruptible();
 
 	//Load current program from buffer
+	vertex_program.skip_vertex_input_check = true;
 	m_program = m_prog_buffer.getGraphicPipelineState(vertex_program, fragment_program, properties, *m_device, pipeline_layout).get();
 
-	//TODO: Update constant buffers..
-	//1. Update scale-offset matrix
-	//2. Update vertex constants
-	//3. Update fragment constants
-	const size_t scale_offset_offset = m_uniform_buffer_ring_info.alloc<256>(256);
+	vk::leave_uninterruptible();
 
-	u8 *buf = (u8*)m_uniform_buffer_ring_info.map(scale_offset_offset, 256);
-
-	/**
-	* NOTE: While VK's coord system resembles GLs, the clip volume is no longer symetrical in z
-	* Its like D3D without the flip in y (depending on how you build the spir-v)
-	*/
-	fill_scale_offset_data(buf, false);
-	fill_user_clip_data(buf + 64);
-
-	m_uniform_buffer_ring_info.unmap();
-
-	m_program->bind_uniform({ m_uniform_buffer_ring_info.heap->value, scale_offset_offset, 256 }, SCALE_OFFSET_BIND_SLOT, descriptor_sets);
-
-	if (true)//m_transform_constants_dirty)
+	//Update dynamic state
+	if (update_blend_constants)
 	{
-		const size_t vertex_constants_offset = m_uniform_buffer_ring_info.alloc<256>(512 * 4 * sizeof(float));
-		buf = (u8*)m_uniform_buffer_ring_info.map(vertex_constants_offset, 512 * 4 * sizeof(float));
-		fill_vertex_program_constants_data(buf);
-		*(reinterpret_cast<u32*>(buf + (468 * 4 * sizeof(float)))) = rsx::method_registers.transform_branch_bits();
-		m_uniform_buffer_ring_info.unmap();
-
-		m_program->bind_uniform({ m_uniform_buffer_ring_info.heap->value, vertex_constants_offset, 512 * 4 * sizeof(float) }, VERTEX_CONSTANT_BUFFERS_BIND_SLOT, descriptor_sets);
-		m_transform_constants_dirty = false;
+		//Update blend constants
+		auto blend_colors = rsx::get_constant_blend_colors();
+		vkCmdSetBlendConstants(*m_current_command_buffer, blend_colors.data());
 	}
 
+	if (update_stencil_info_front)
+	{
+		VkStencilFaceFlags face_flag = (update_stencil_info_back)? VK_STENCIL_FACE_FRONT_BIT: VK_STENCIL_FRONT_AND_BACK;
+
+		vkCmdSetStencilWriteMask(*m_current_command_buffer, face_flag, rsx::method_registers.stencil_mask());
+		vkCmdSetStencilCompareMask(*m_current_command_buffer, face_flag, rsx::method_registers.stencil_func_mask());
+		vkCmdSetStencilReference(*m_current_command_buffer, face_flag, rsx::method_registers.stencil_func_ref());
+
+		if (update_stencil_info_back)
+		{
+			vkCmdSetStencilWriteMask(*m_current_command_buffer, VK_STENCIL_FACE_BACK_BIT, rsx::method_registers.back_stencil_mask());
+			vkCmdSetStencilCompareMask(*m_current_command_buffer, VK_STENCIL_FACE_BACK_BIT, rsx::method_registers.back_stencil_func_mask());
+			vkCmdSetStencilReference(*m_current_command_buffer, VK_STENCIL_FACE_BACK_BIT, rsx::method_registers.back_stencil_func_ref());
+		}
+	}
+
+	if (update_depth_bounds)
+	{
+		//Update depth bounds min/max
+		vkCmdSetDepthBounds(*m_current_command_buffer, rsx::method_registers.depth_bounds_min(), rsx::method_registers.depth_bounds_max());
+	}
+
+	return true;
+}
+
+void VKGSRender::load_program(u32 vertex_count, u32 vertex_base) //Fills one uniform ring allocation with vertex state, vertex constants and fragment data, then binds the three ranges
+{
+	auto &vertex_program = current_vertex_program;
+	auto &fragment_program = current_fragment_program;
+
 	const size_t fragment_constants_sz = m_prog_buffer.get_fragment_constants_buffer_size(fragment_program);
 	const size_t fragment_buffer_sz = fragment_constants_sz + (17 * 4 * sizeof(float));
-	const size_t fragment_constants_offset = m_uniform_buffer_ring_info.alloc<256>(fragment_buffer_sz);
+	const size_t required_mem = 512 + 8192 + fragment_buffer_sz; //vertex state + vertex constants + fragment constants and state
 
-	buf = (u8*)m_uniform_buffer_ring_info.map(fragment_constants_offset, fragment_buffer_sz);
+	const size_t vertex_state_offset = m_uniform_buffer_ring_info.alloc<256>(required_mem);
+	const size_t vertex_constants_offset = vertex_state_offset + 512;
+	const size_t fragment_constants_offset = vertex_constants_offset + 8192;
+
+	//Map the whole allocation once; each region below is written through the same pointer
+	u8 *buf = (u8*)m_uniform_buffer_ring_info.map(vertex_state_offset, required_mem);
+
+	//Vertex state: scale/offset at +0, user clip data at +64, branch bits at +128, vertex base at +132, vertex layout at +144
+	fill_scale_offset_data(buf, false);
+	fill_user_clip_data(buf + 64);
+	*(reinterpret_cast<u32*>(buf + 128)) = rsx::method_registers.transform_branch_bits();
+	*(reinterpret_cast<u32*>(buf + 132)) = vertex_base;
+	fill_vertex_layout_state(m_vertex_layout, vertex_count, reinterpret_cast<s32*>(buf + 144));
+
+	//Vertex constants (region starts 512 bytes into the allocation)
+	buf = buf + 512;
+	fill_vertex_program_constants_data(buf);
+	m_transform_constants_dirty = false;
+	
+	//Fragment constants (region starts after the 8192-byte vertex constants), followed by the fragment state
+	buf = buf + 8192;
 	if (fragment_constants_sz)
 		m_prog_buffer.fill_fragment_constants_buffer({ reinterpret_cast<float*>(buf), ::narrow<int>(fragment_constants_sz) }, fragment_program);
 
 	fill_fragment_state_buffer(buf + fragment_constants_sz, fragment_program);
+	
 	m_uniform_buffer_ring_info.unmap();
 
-	m_program->bind_uniform({ m_uniform_buffer_ring_info.heap->value, fragment_constants_offset, fragment_buffer_sz }, FRAGMENT_CONSTANT_BUFFERS_BIND_SLOT, descriptor_sets);
-
-	vk::leave_uninterruptible();
-
-	return true;
+	m_program->bind_uniform({ m_uniform_buffer_ring_info.heap->value, vertex_state_offset, 512 }, SCALE_OFFSET_BIND_SLOT, m_current_frame->descriptor_set);
+	m_program->bind_uniform({ m_uniform_buffer_ring_info.heap->value, vertex_constants_offset, 8192 }, VERTEX_CONSTANT_BUFFERS_BIND_SLOT, m_current_frame->descriptor_set);
+	m_program->bind_uniform({ m_uniform_buffer_ring_info.heap->value, fragment_constants_offset, fragment_buffer_sz }, FRAGMENT_CONSTANT_BUFFERS_BIND_SLOT, m_current_frame->descriptor_set);
 }
 
 static const u32 mr_color_offset[rsx::limits::color_buffers_count] =
@@ -1762,7 +1835,20 @@ static const u32 mr_color_pitch[rsx::limits::color_buffers_count] =
 void VKGSRender::init_buffers(bool skip_reading)
 {
 	//Clear any pending swap requests
-	process_swap_request();
+	for (auto &ctx : frame_context)
+	{
+		if (ctx.swap_command_buffer)
+		{
+			if (ctx.swap_command_buffer->pending)
+				ctx.swap_command_buffer->poke();
+
+			if (!ctx.swap_command_buffer->pending)
+			{
+				//process swap without advancing the frame base
+				process_swap_request(&ctx, true);
+			}
+		}
+	}
 
 	prepare_rtts();
 
@@ -1770,8 +1856,6 @@ void VKGSRender::init_buffers(bool skip_reading)
 	{
 		read_buffers();
 	}
-
-	set_viewport();
 }
 
 void VKGSRender::read_buffers()
@@ -1965,14 +2049,13 @@ void VKGSRender::prepare_rtts()
 		}
 	}
 
-	for (auto &fbo : m_framebuffer_to_clean)
+	for (auto &fbo : m_current_frame->framebuffers_to_clean)
 	{
 		if (fbo->matches(bound_images, clip_width, clip_height))
 		{
 			m_draw_fbo.swap(fbo);
 			m_draw_fbo->reset_refs();
 			framebuffer_found = true;
-			//LOG_ERROR(RSX, "Matching framebuffer exists, using that instead");
 			break;
 		}
 	}
@@ -2014,7 +2097,7 @@ void VKGSRender::prepare_rtts()
 		VkRenderPass current_render_pass = m_render_passes[idx];
 
 		if (m_draw_fbo)
-			m_framebuffer_to_clean.push_back(std::move(m_draw_fbo));
+			m_current_frame->framebuffers_to_clean.push_back(std::move(m_draw_fbo));
 
 		m_draw_fbo.reset(new vk::framebuffer_holder(*m_device, current_render_pass, clip_width, clip_height, std::move(fbo_images)));
 	}
@@ -2060,7 +2143,7 @@ void VKGSRender::flip(int buffer)
 	std::chrono::time_point<steady_clock> flip_start = steady_clock::now();
 
 	close_render_pass();
-	process_swap_request();
+	process_swap_request(m_current_frame, true);
 
 	if (!resize_screen)
 	{
@@ -2095,8 +2178,8 @@ void VKGSRender::flip(int buffer)
 
 		aspect_ratio.size = new_size;
 
-		//Prepare surface for new frame
-		CHECK_RESULT(vkAcquireNextImageKHR((*m_device), (*m_swap_chain), 0, m_present_semaphore, VK_NULL_HANDLE, &m_current_present_image));
+		//Prepare surface for new frame. Pass UINT64_MAX as the timeout so that the call waits for the next image if need be
+		CHECK_RESULT(vkAcquireNextImageKHR((*m_device), (*m_swap_chain), UINT64_MAX, m_current_frame->present_semaphore, VK_NULL_HANDLE, &m_current_frame->present_image));
 
 		//Blit contents to screen..
 		vk::image* image_to_flip = nullptr;
@@ -2106,7 +2189,7 @@ void VKGSRender::flip(int buffer)
 		else if (std::get<1>(m_rtts.m_bound_render_targets[1]) != nullptr)
 			image_to_flip = std::get<1>(m_rtts.m_bound_render_targets[1]);
 
-		VkImage target_image = m_swap_chain->get_swap_chain_image(m_current_present_image);
+		VkImage target_image = m_swap_chain->get_swap_chain_image(m_current_frame->present_image);
 		if (image_to_flip)
 		{
 			vk::copy_scaled_image(*m_current_command_buffer, image_to_flip->value, target_image, VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, VK_IMAGE_LAYOUT_PRESENT_SRC_KHR,
@@ -2117,9 +2200,9 @@ void VKGSRender::flip(int buffer)
 			//No draw call was issued!
 			VkImageSubresourceRange range = vk::get_image_subresource_range(0, 0, 1, 1, VK_IMAGE_ASPECT_COLOR_BIT);
 			VkClearColorValue clear_black = { 0 };
-			vk::change_image_layout(*m_current_command_buffer, m_swap_chain->get_swap_chain_image(m_current_present_image), VK_IMAGE_LAYOUT_PRESENT_SRC_KHR, VK_IMAGE_LAYOUT_GENERAL, range);
-			vkCmdClearColorImage(*m_current_command_buffer, m_swap_chain->get_swap_chain_image(m_current_present_image), VK_IMAGE_LAYOUT_GENERAL, &clear_black, 1, &range);
-			vk::change_image_layout(*m_current_command_buffer, m_swap_chain->get_swap_chain_image(m_current_present_image), VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_LAYOUT_PRESENT_SRC_KHR, range);
+			vk::change_image_layout(*m_current_command_buffer, m_swap_chain->get_swap_chain_image(m_current_frame->present_image), VK_IMAGE_LAYOUT_PRESENT_SRC_KHR, VK_IMAGE_LAYOUT_GENERAL, range);
+			vkCmdClearColorImage(*m_current_command_buffer, m_swap_chain->get_swap_chain_image(m_current_frame->present_image), VK_IMAGE_LAYOUT_GENERAL, &clear_black, 1, &range);
+			vk::change_image_layout(*m_current_command_buffer, m_swap_chain->get_swap_chain_image(m_current_frame->present_image), VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_LAYOUT_PRESENT_SRC_KHR, range);
 		}
 
 		std::unique_ptr<vk::framebuffer_holder> direct_fbo;
@@ -2144,14 +2227,14 @@ void VKGSRender::flip(int buffer)
 			size_t idx = vk::get_render_pass_location(m_swap_chain->get_surface_format(), VK_FORMAT_UNDEFINED, 1);
 			VkRenderPass single_target_pass = m_render_passes[idx];
 
-			for (auto It = m_framebuffer_to_clean.begin(); It != m_framebuffer_to_clean.end(); It++)
+			for (auto It = m_current_frame->framebuffers_to_clean.begin(); It != m_current_frame->framebuffers_to_clean.end(); It++)
 			{
 				auto &fbo = *It;
 				if (fbo->attachments[0]->info.image == target_image)
 				{
 					direct_fbo.swap(fbo);
 					direct_fbo->reset_refs();
-					m_framebuffer_to_clean.erase(It);
+					m_current_frame->framebuffers_to_clean.erase(It);
 					break;
 				}
 			}
@@ -2189,7 +2272,7 @@ void VKGSRender::flip(int buffer)
 			m_text_writer->print_text(*m_current_command_buffer, *direct_fbo, 0, 108, direct_fbo->width(), direct_fbo->height(), message);
 
 			vk::change_image_layout(*m_current_command_buffer, target_image, VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, VK_IMAGE_LAYOUT_PRESENT_SRC_KHR, subres);
-			m_framebuffer_to_clean.push_back(std::move(direct_fbo));
+			m_current_frame->framebuffers_to_clean.push_back(std::move(direct_fbo));
 		}
 
 		queue_swap_request();
@@ -2262,9 +2345,13 @@ void VKGSRender::flip(int buffer)
 		m_current_command_buffer->reset();
 		open_command_buffer();
 
-		//Do cleanup
-		m_swap_command_buffer = m_current_command_buffer;
-		process_swap_request();
+		//Do cleanup; also present the previous frame for this frame if available
+		//Don't bother scheduling a swap event if the frame context is still uninitialized (no previous frame)
+		if (m_current_frame->present_image != UINT32_MAX)
+		{
+			m_current_frame->swap_command_buffer = m_current_command_buffer;
+			process_swap_request(m_current_frame);
+		}
 	}
 
 	std::chrono::time_point<steady_clock> flip_end = steady_clock::now();
diff --git a/rpcs3/Emu/RSX/VK/VKGSRender.h b/rpcs3/Emu/RSX/VK/VKGSRender.h
index 0c1024c467..53865c7880 100644
--- a/rpcs3/Emu/RSX/VK/VKGSRender.h
+++ b/rpcs3/Emu/RSX/VK/VKGSRender.h
@@ -25,11 +25,12 @@ namespace vk
 
 //Heap allocation sizes in MB
 #define VK_ATTRIB_RING_BUFFER_SIZE_M 256
-#define VK_UBO_RING_BUFFER_SIZE_M 32
+#define VK_UBO_RING_BUFFER_SIZE_M 64
 #define VK_INDEX_RING_BUFFER_SIZE_M 64
 #define VK_TEXTURE_UPLOAD_RING_BUFFER_SIZE_M 128
 
 #define VK_MAX_ASYNC_CB_COUNT 64
+#define VK_MAX_ASYNC_FRAMES 2
 
 struct command_buffer_chunk: public vk::command_buffer
 {
@@ -135,32 +136,43 @@ private:
 	vk::vk_data_heap m_texture_upload_buffer_ring_info;
 
 	//Vulkan internals
-	u32 m_current_present_image = 0xFFFF;
-	VkSemaphore m_present_semaphore = nullptr;
-
 	vk::command_pool m_command_buffer_pool;
-	std::array<command_buffer_chunk, VK_MAX_ASYNC_CB_COUNT> m_primary_cb_list;
-
-	command_buffer_chunk* m_current_command_buffer = nullptr;
-	command_buffer_chunk* m_swap_command_buffer = nullptr;
-
-	u32 m_current_cb_index = 0;
 
 	std::mutex m_secondary_cb_guard;
 	vk::command_pool m_secondary_command_buffer_pool;
 	vk::command_buffer m_secondary_command_buffer;
 
-	std::array<VkRenderPass, 120> m_render_passes;
-	VkDescriptorSetLayout descriptor_layouts;
-	VkDescriptorSet descriptor_sets;
-	VkPipelineLayout pipeline_layout;
-	vk::descriptor_pool descriptor_pool;
+	u32 m_current_cb_index = 0;
+	std::array<command_buffer_chunk, VK_MAX_ASYNC_CB_COUNT> m_primary_cb_list;
+	command_buffer_chunk* m_current_command_buffer = nullptr;
+
+	std::array<VkRenderPass, 120> m_render_passes;
+
+	VkDescriptorSetLayout descriptor_layouts;
+	VkPipelineLayout pipeline_layout;
 
-	std::vector<std::unique_ptr<vk::buffer_view> > m_buffer_view_to_clean;
-	std::vector<std::unique_ptr<vk::sampler> > m_sampler_to_clean;
-	std::list<std::unique_ptr<vk::framebuffer_holder> > m_framebuffer_to_clean;
 	std::unique_ptr<vk::framebuffer_holder> m_draw_fbo;
 
+	struct frame_context_t //Per-frame state; VK_MAX_ASYNC_FRAMES of these are kept in frame_context
+	{
+		VkSemaphore present_semaphore = VK_NULL_HANDLE; //Passed to vkAcquireNextImageKHR when present_image is acquired in flip
+		VkDescriptorSet descriptor_set = VK_NULL_HANDLE; //Target of the uniform buffer bindings made in load_program
+		vk::descriptor_pool descriptor_pool;
+		u32 used_descriptors = 0;
+
+		std::vector<std::unique_ptr<vk::buffer_view>> buffer_views_to_clean;
+		std::vector<std::unique_ptr<vk::sampler>> samplers_to_clean;
+		std::list<std::unique_ptr<vk::framebuffer_holder>> framebuffers_to_clean; //Searched for a reusable match in prepare_rtts and flip
+
+		u32 present_image = UINT32_MAX; //Swapchain image index; flip treats UINT32_MAX as "no previous frame"
+		command_buffer_chunk* swap_command_buffer = nullptr; //Set in flip; init_buffers processes the swap once this is no longer pending
+	};
+
+	std::array<frame_context_t, VK_MAX_ASYNC_FRAMES> frame_context;
+
+	u32 m_current_queue_index = 0;
+	frame_context_t* m_current_frame = nullptr;
+
 	u32 m_client_width = 0;
 	u32 m_client_height = 0;
 
@@ -183,7 +195,6 @@ private:
 	s64 m_draw_time = 0;
 	s64 m_flip_time = 0;
 
-	u32 m_used_descriptors = 0;
 	u8 m_draw_buffers_count = 0;
 
 	bool framebuffer_status_valid = false;
@@ -201,6 +212,9 @@ private:
 	std::thread::id rsx_thread;
 
 	bool render_pass_open = false;
+
+	//Vertex layout
+	rsx::vertex_input_layout m_vertex_layout;
 	
 #ifdef __linux__
 	Display *m_display_handle = nullptr;
@@ -220,15 +234,18 @@ private:
 
 	void flush_command_queue(bool hard_sync = false);
 	void queue_swap_request();
-	void process_swap_request();
+	void process_swap_request(frame_context_t *ctx, bool free_resources = false);
+	void advance_queued_frames();
+	void present(frame_context_t *ctx);
 
 	void begin_render_pass();
 	void close_render_pass();
 
-	/// returns primitive topology, is_indexed, index_count, offset in index buffer, index type
-	std::tuple<VkPrimitiveTopology, u32, std::optional<std::tuple<VkDeviceSize, VkIndexType> > > upload_vertex_data();
+	/// returns primitive topology, index_count, allocated_verts, vertex_base_index, optional (offset in index buffer, index type)
+	std::tuple<VkPrimitiveTopology, u32, u32, u32, std::optional<std::tuple<VkDeviceSize, VkIndexType> > > upload_vertex_data();
 public:
-	bool load_program(bool fast_update = false);
+	bool check_program_status();
+	void load_program(u32 vertex_count, u32 vertex_base);
 	void init_buffers(bool skip_reading = false);
 	void read_buffers();
 	void write_buffers();
diff --git a/rpcs3/Emu/RSX/VK/VKProgramBuffer.h b/rpcs3/Emu/RSX/VK/VKProgramBuffer.h
index b09e636200..56ad392a56 100644
--- a/rpcs3/Emu/RSX/VK/VKProgramBuffer.h
+++ b/rpcs3/Emu/RSX/VK/VKProgramBuffer.h
@@ -19,17 +19,22 @@ namespace vk
 
 		bool operator==(const pipeline_props& other) const
 		{
-			if (memcmp(&ia, &other.ia, sizeof(VkPipelineInputAssemblyStateCreateInfo)))
-				return false;
-			if (memcmp(&ds, &other.ds, sizeof(VkPipelineDepthStencilStateCreateInfo)))
-				return false;
 			if (memcmp(&att_state[0], &other.att_state[0], sizeof(VkPipelineColorBlendAttachmentState)))
 				return false;
-			if (memcmp(&cs, &other.cs, sizeof(VkPipelineColorBlendStateCreateInfo)))
+
+			if (render_pass != other.render_pass)
 				return false;
+
 			if (memcmp(&rs, &other.rs, sizeof(VkPipelineRasterizationStateCreateInfo)))
 				return false;
-			if (render_pass != other.render_pass)
+
+			if (memcmp(&cs, &other.cs, sizeof(VkPipelineColorBlendStateCreateInfo)))
+				return false;
+
+			if (memcmp(&ia, &other.ia, sizeof(VkPipelineInputAssemblyStateCreateInfo)))
+				return false;
+
+			if (memcmp(&ds, &other.ds, sizeof(VkPipelineDepthStencilStateCreateInfo)))
 				return false;
 
 			return num_targets == other.num_targets;
@@ -90,9 +95,6 @@ struct VKTraits
 	static
 	pipeline_storage_type build_pipeline(const vertex_program_type &vertexProgramData, const fragment_program_type &fragmentProgramData, const vk::pipeline_props &pipelineProperties, VkDevice dev, VkPipelineLayout common_pipeline_layout)
 	{
-//		pstate.dynamic_state.pDynamicStates = pstate.dynamic_state_descriptors;
-//		pstate.cb.pAttachments = pstate.att_state;
-//		pstate.cb.attachmentCount = pstate.num_targets;
 
 		VkPipelineShaderStageCreateInfo shader_stages[2] = {};
 		shader_stages[0].sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO;
@@ -111,6 +113,11 @@ struct VKTraits
 		dynamic_state_descriptors[dynamic_state_info.dynamicStateCount++] = VK_DYNAMIC_STATE_VIEWPORT;
 		dynamic_state_descriptors[dynamic_state_info.dynamicStateCount++] = VK_DYNAMIC_STATE_SCISSOR;
 		dynamic_state_descriptors[dynamic_state_info.dynamicStateCount++] = VK_DYNAMIC_STATE_LINE_WIDTH;
+		dynamic_state_descriptors[dynamic_state_info.dynamicStateCount++] = VK_DYNAMIC_STATE_DEPTH_BOUNDS;
+		dynamic_state_descriptors[dynamic_state_info.dynamicStateCount++] = VK_DYNAMIC_STATE_BLEND_CONSTANTS;
+		dynamic_state_descriptors[dynamic_state_info.dynamicStateCount++] = VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK;
+		dynamic_state_descriptors[dynamic_state_info.dynamicStateCount++] = VK_DYNAMIC_STATE_STENCIL_WRITE_MASK;
+		dynamic_state_descriptors[dynamic_state_info.dynamicStateCount++] = VK_DYNAMIC_STATE_STENCIL_REFERENCE;
 		dynamic_state_info.pDynamicStates = dynamic_state_descriptors;
 
 		VkPipelineVertexInputStateCreateInfo vi = { VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO };
@@ -144,7 +151,6 @@ struct VKTraits
 		info.renderPass = pipelineProperties.render_pass;
 
 		CHECK_RESULT(vkCreateGraphicsPipelines(dev, nullptr, 1, &info, NULL, &pipeline));
-
 		pipeline_storage_type result = std::make_unique<vk::glsl::program>(dev, pipeline, vertexProgramData.uniforms, fragmentProgramData.uniforms);
 
 		return result;
diff --git a/rpcs3/Emu/RSX/VK/VKRenderTargets.h b/rpcs3/Emu/RSX/VK/VKRenderTargets.h
index 7642197b6a..3a99503872 100644
--- a/rpcs3/Emu/RSX/VK/VKRenderTargets.h
+++ b/rpcs3/Emu/RSX/VK/VKRenderTargets.h
@@ -21,6 +21,7 @@ namespace vk
 		bool dirty = false;
 		u16 native_pitch = 0;
 		VkImageAspectFlags attachment_aspect_flag = VK_IMAGE_ASPECT_COLOR_BIT;
+		std::unique_ptr<vk::image_view> view;
 
 		render_target *old_contents = nullptr; //Data occupying the memory location that this surface is replacing
 
@@ -40,6 +41,15 @@ namespace vk
 			:image(dev, memory_type_index, access_flags, image_type, format, width, height, depth,
 					mipmaps, layers, samples, initial_layout, tiling, usage, image_flags)
 		{}
+
+		vk::image_view* get_view() //Returns a 2D view of this target; the view remains owned by the render target
+		{
+			if (!view) //Build once and reuse; the stencil bit is masked out of the aspect used for the subresource range
+				view = std::make_unique<vk::image_view>(*vk::get_current_renderer(), value, VK_IMAGE_VIEW_TYPE_2D, info.format,
+						native_component_map, vk::get_image_subresource_range(0, 0, 1, 1, attachment_aspect_flag & ~(VK_IMAGE_ASPECT_STENCIL_BIT)));
+
+			return view.get();
+		}
 	};
 
 	struct framebuffer_holder: public vk::framebuffer, public ref_counted
diff --git a/rpcs3/Emu/RSX/VK/VKTextOut.h b/rpcs3/Emu/RSX/VK/VKTextOut.h
index 014c4d1974..aba9dea317 100644
--- a/rpcs3/Emu/RSX/VK/VKTextOut.h
+++ b/rpcs3/Emu/RSX/VK/VKTextOut.h
@@ -362,6 +362,9 @@ namespace vk
 
 		void reset_descriptors()
 		{
+			if (m_used_descriptors == 0) //Counter is already zero: skip the pool reset
+				return;
+
 			vkResetDescriptorPool(device, m_descriptor_pool, 0);
 			m_used_descriptors = 0;
 		}
diff --git a/rpcs3/Emu/RSX/VK/VKTextureCache.h b/rpcs3/Emu/RSX/VK/VKTextureCache.h
index 9836036e89..a1387930fa 100644
--- a/rpcs3/Emu/RSX/VK/VKTextureCache.h
+++ b/rpcs3/Emu/RSX/VK/VKTextureCache.h
@@ -322,9 +322,14 @@ namespace vk
 		std::pair<u32, u32> read_only_range = std::make_pair(0xFFFFFFFF, 0);
 		std::pair<u32, u32> no_access_range = std::make_pair(0xFFFFFFFF, 0);
 		
+		//Stuff that is no longer referenced goes into these; flush() moves it to the purge lists below
 		std::vector<std::unique_ptr<vk::image_view> > m_temporary_image_view;
 		std::vector<std::unique_ptr<vk::image>> m_dirty_textures;
 
+		//Stuff handed over by the previous flush() is kept here. flush() destroys the contents before moving new ones in
+		std::vector<std::unique_ptr<vk::image_view>> m_image_views_to_purge;
+		std::vector<std::unique_ptr<vk::image>> m_images_to_purge;
+
 		// Keep track of cache misses to pre-emptively flush some addresses
 		struct framebuffer_memory_characteristics
 		{
@@ -431,6 +436,9 @@ namespace vk
 
 			m_temporary_image_view.clear();
 			m_dirty_textures.clear();
+
+			m_image_views_to_purge.clear();
+			m_images_to_purge.clear();
 		}
 
 		//Helpers
@@ -544,7 +552,7 @@ namespace vk
 			}
 
 			//First check if it exists as an rtt...
-			vk::image *rtt_texture = nullptr;
+			vk::render_target *rtt_texture = nullptr;
 			if (rtt_texture = m_rtts.get_texture_from_render_target_if_applicable(texaddr))
 			{
 				if (g_cfg.video.strict_rendering_mode)
@@ -559,10 +567,7 @@ namespace vk
 					}
 				}
 
-				m_temporary_image_view.push_back(std::make_unique<vk::image_view>(*vk::get_current_renderer(), rtt_texture->value, VK_IMAGE_VIEW_TYPE_2D, rtt_texture->info.format,
-					rtt_texture->native_component_map,
-					vk::get_image_subresource_range(0, 0, 1, 1, VK_IMAGE_ASPECT_COLOR_BIT)));
-				return m_temporary_image_view.back().get();
+				return rtt_texture->get_view();
 			}
 
 			if (rtt_texture = m_rtts.get_texture_from_depth_stencil_if_applicable(texaddr))
@@ -576,10 +581,7 @@ namespace vk
 					}
 				}
 
-				m_temporary_image_view.push_back(std::make_unique<vk::image_view>(*vk::get_current_renderer(), rtt_texture->value, VK_IMAGE_VIEW_TYPE_2D, rtt_texture->info.format,
-					rtt_texture->native_component_map,
-					vk::get_image_subresource_range(0, 0, 1, 1, VK_IMAGE_ASPECT_DEPTH_BIT)));
-				return m_temporary_image_view.back().get();
+				return rtt_texture->get_view();
 			}
 
 			u32 raw_format = tex.format();
@@ -912,8 +914,11 @@ namespace vk
 
 		void flush()
 		{
-			m_dirty_textures.clear();
-			m_temporary_image_view.clear();
+			m_image_views_to_purge.clear(); //Destroy what the previous flush retired
+			m_images_to_purge.clear();
+
+			m_image_views_to_purge = std::move(m_temporary_image_view); //Retire this round's objects; they are destroyed on the next flush
+			m_images_to_purge = std::move(m_dirty_textures);
 		}
 
 		void record_cache_miss(cached_texture_section &tex)
diff --git a/rpcs3/Emu/RSX/VK/VKVertexBuffers.cpp b/rpcs3/Emu/RSX/VK/VKVertexBuffers.cpp
index dbad6d15db..6895c2db8b 100644
--- a/rpcs3/Emu/RSX/VK/VKVertexBuffers.cpp
+++ b/rpcs3/Emu/RSX/VK/VKVertexBuffers.cpp
@@ -7,53 +7,6 @@
 
 namespace vk
 {
-	bool requires_component_expansion(rsx::vertex_base_type type, u32 size)
-	{
-		if (size == 3)
-		{
-			switch (type)
-			{
-			case rsx::vertex_base_type::f:
-				return true;
-			}
-		}
-
-		return false;
-	}
-
-	u32 get_suitable_vk_size(rsx::vertex_base_type type, u32 size)
-	{
-		if (size == 3)
-		{
-			switch (type)
-			{
-			case rsx::vertex_base_type::f:
-				return 16;
-			}
-		}
-
-		return rsx::get_vertex_type_size_on_host(type, size);
-	}
-
-	VkFormat get_suitable_vk_format(rsx::vertex_base_type type, u8 size)
-	{
-		/**
-		* Set up buffer fetches to only work on 4-component access. This is hardware dependant so we use 4-component access to avoid branching based on IHV implementation
-		* AMD GCN 1.0 for example does not support RGB32 formats for texel buffers
-		*/
-		const VkFormat vec1_types[] = { VK_FORMAT_R16_SNORM, VK_FORMAT_R32_SFLOAT, VK_FORMAT_R16_SFLOAT, VK_FORMAT_R8_UNORM, VK_FORMAT_R16_SINT, VK_FORMAT_R16G16B16A16_SNORM, VK_FORMAT_R8_UINT };
-		const VkFormat vec2_types[] = { VK_FORMAT_R16G16_SNORM, VK_FORMAT_R32G32_SFLOAT, VK_FORMAT_R16G16_SFLOAT, VK_FORMAT_R8G8_UNORM, VK_FORMAT_R16G16_SINT, VK_FORMAT_R16G16B16A16_SNORM, VK_FORMAT_R8G8_UINT };
-		const VkFormat vec3_types[] = { VK_FORMAT_R16G16B16A16_SNORM, VK_FORMAT_R32G32B32A32_SFLOAT, VK_FORMAT_R16G16B16A16_SFLOAT, VK_FORMAT_R8G8B8A8_UNORM, VK_FORMAT_R16G16B16A16_SINT, VK_FORMAT_R16G16B16A16_SNORM, VK_FORMAT_R8G8B8A8_UINT };	//VEC3 COMPONENTS NOT SUPPORTED!
-		const VkFormat vec4_types[] = { VK_FORMAT_R16G16B16A16_SNORM, VK_FORMAT_R32G32B32A32_SFLOAT, VK_FORMAT_R16G16B16A16_SFLOAT, VK_FORMAT_R8G8B8A8_UNORM, VK_FORMAT_R16G16B16A16_SINT, VK_FORMAT_R16G16B16A16_SNORM, VK_FORMAT_R8G8B8A8_UINT };
-
-		const VkFormat* vec_selectors[] = { 0, vec1_types, vec2_types, vec3_types, vec4_types };
-
-		if (type > rsx::vertex_base_type::ub256)
-			fmt::throw_exception("VKGS error: unknown vertex base type 0x%x" HERE, (u32)type);
-
-		return vec_selectors[size][(int)type];
-	}
-
 	VkPrimitiveTopology get_appropriate_topology(rsx::primitive_type& mode, bool &requires_modification)
 	{
 		requires_modification = false;
@@ -92,114 +45,6 @@ namespace vk
 		return !result;
 	}
 
-	template <typename T, u32 padding>
-	void copy_inlined_data_to_buffer(void *src_data, void *dst_data, u32 vertex_count, rsx::vertex_base_type type, u8 src_channels, u8 dst_channels, u16 element_size, u16 stride)
-	{
-		u8 *src = static_cast<u8*>(src_data);
-		u8 *dst = static_cast<u8*>(dst_data);
-
-		for (u32 i = 0; i < vertex_count; ++i)
-		{
-			T* src_ptr = reinterpret_cast<T*>(src);
-			T* dst_ptr = reinterpret_cast<T*>(dst);
-
-			switch (type)
-			{
-			case rsx::vertex_base_type::ub:
-			{
-				if (src_channels == 4)
-				{
-					dst[0] = src[3];
-					dst[1] = src[2];
-					dst[2] = src[1];
-					dst[3] = src[0];
-
-					break;
-				}
-			}
-			default:
-			{
-				for (u8 ch = 0; ch < dst_channels; ++ch)
-				{
-					if (ch < src_channels)
-					{
-						*dst_ptr = *src_ptr;
-						src_ptr++;
-					}
-					else
-						*dst_ptr = (T)(padding);
-
-					dst_ptr++;
-				}
-			}
-			}
-
-			src += stride;
-			dst += element_size;
-		}
-	}
-
-	void prepare_buffer_for_writing(void *data, rsx::vertex_base_type type, u8 vertex_size, u32 vertex_count)
-	{
-		switch (type)
-		{
-		case rsx::vertex_base_type::f:
-		{
-			if (vertex_size == 3)
-			{
-				float *dst = reinterpret_cast<float*>(data);
-				for (u32 i = 0, idx = 3; i < vertex_count; ++i, idx += 4)
-					dst[idx] = 1.f;
-			}
-
-			break;
-		}
-		case rsx::vertex_base_type::sf:
-		{
-			if (vertex_size == 3)
-			{
-				/**
-				* Pad the 4th component for half-float arrays to 1, since texelfetch does not mask components
-				*/
-				u16 *dst = reinterpret_cast<u16*>(data);
-				for (u32 i = 0, idx = 3; i < vertex_count; ++i, idx += 4)
-					dst[idx] = 0x3c00;
-			}
-
-			break;
-		}
-		}
-	}
-
-	/**
-	* Template: Expand any N-compoent vector to a larger X-component vector and pad unused slots with 1
-	*/
-	template<typename T, u8 src_components, u8 dst_components, u32 padding>
-	void expand_array_components(const T* src_data, void *dst_ptr, u32 vertex_count)
-	{
-		T* src = const_cast<T*>(src_data);
-		T* dst = static_cast<T*>(dst_ptr);
-
-		for (u32 index = 0; index < vertex_count; ++index)
-		{
-			for (u8 channel = 0; channel < dst_components; channel++)
-			{
-				if (channel < src_components)
-				{
-					*dst = *src;
-
-					dst++;
-					src++;
-				}
-				else
-				{
-					*dst = (T)(padding);
-					dst++;
-				}
-			}
-		}
-	}
-
 	VkIndexType get_index_type(rsx::index_array_type type)
 	{
 		switch (type)
@@ -215,20 +60,7 @@ namespace vk
 
 namespace
 {
-	static constexpr std::array<const char*, 16> s_reg_table =
-	{
-		"in_pos_buffer", "in_weight_buffer", "in_normal_buffer",
-		"in_diff_color_buffer", "in_spec_color_buffer",
-		"in_fog_buffer",
-		"in_point_size_buffer", "in_7_buffer",
-		"in_tc0_buffer", "in_tc1_buffer", "in_tc2_buffer", "in_tc3_buffer",
-		"in_tc4_buffer", "in_tc5_buffer", "in_tc6_buffer", "in_tc7_buffer"
-	};
 
-	/**
-	* Creates and fills an index buffer emulating unsupported primitive type.
-	* Returns index_count and (offset_in_index_buffer, index_type)
-	*/
 	std::tuple<u32, std::tuple<VkDeviceSize, VkIndexType>> generate_emulating_index_buffer(
 		const rsx::draw_clause& clause, u32 vertex_count,
 		vk::vk_data_heap& m_index_buffer_ring_info)
@@ -247,161 +79,49 @@ namespace
 			index_count, std::make_tuple(offset_in_index_buffer, VK_INDEX_TYPE_UINT16));
 	}
 
-	struct vertex_buffer_visitor
+	struct vertex_input_state
 	{
-		vertex_buffer_visitor(u32 vtx_cnt, VkDevice dev, vk::vk_data_heap& heap,
-			vk::glsl::program* prog, VkDescriptorSet desc_set,
-			std::vector<std::unique_ptr<vk::buffer_view>>& buffer_view_to_clean,
-			vk::vertex_cache* vertex_cache)
-			: vertex_count(vtx_cnt), m_attrib_ring_info(heap), device(dev), m_program(prog),
-			  descriptor_sets(desc_set), m_buffer_view_to_clean(buffer_view_to_clean),
-			  vertex_cache(vertex_cache)
-		{
-		}
-
-		void operator()(const rsx::vertex_array_buffer& vertex_array)
-		{
-			if (!m_program->has_uniform(s_reg_table[vertex_array.index]))
-				return;
-
-			// Fill vertex_array
-			const u32 element_size = rsx::get_vertex_type_size_on_host(vertex_array.type, vertex_array.attribute_size);
-			const u32 real_element_size = vk::get_suitable_vk_size(vertex_array.type, vertex_array.attribute_size);
-			const u32 upload_size = real_element_size * vertex_count;
-			const VkFormat format = vk::get_suitable_vk_format(vertex_array.type, vertex_array.attribute_size);
-			const uintptr_t local_addr = (uintptr_t)vertex_array.data.data();
-
-			if (auto found = vertex_cache->find_vertex_range(local_addr, format, upload_size))
-			{
-				m_buffer_view_to_clean.push_back(std::make_unique<vk::buffer_view>(device, m_attrib_ring_info.heap->value, format, found->offset_in_heap, upload_size));
-				m_program->bind_uniform(m_buffer_view_to_clean.back()->value, s_reg_table[vertex_array.index], descriptor_sets);
-				return;
-			}
-
-			VkDeviceSize offset_in_attrib_buffer = m_attrib_ring_info.alloc<256>(upload_size);
-			void *dst = m_attrib_ring_info.map(offset_in_attrib_buffer, upload_size);
-			
-			gsl::span<gsl::byte> dest_span(static_cast<gsl::byte*>(dst), upload_size);
-			write_vertex_array_data_to_buffer(dest_span, vertex_array.data, vertex_count, vertex_array.type, vertex_array.attribute_size, vertex_array.stride, real_element_size);
-
-			//Padding the vertex buffer should be done after the writes have been done
-			//write_vertex_data function may 'dirty' unused sections of the buffer as optimization
-			vk::prepare_buffer_for_writing(dst, vertex_array.type, vertex_array.attribute_size, vertex_count);
-
-			m_attrib_ring_info.unmap();
-
-			vertex_cache->store_range(local_addr, format, upload_size, (u32)offset_in_attrib_buffer);
-
-			m_buffer_view_to_clean.push_back(std::make_unique<vk::buffer_view>(device, m_attrib_ring_info.heap->value, format, offset_in_attrib_buffer, upload_size));
-			m_program->bind_uniform(m_buffer_view_to_clean.back()->value, s_reg_table[vertex_array.index], descriptor_sets);
-		}
-
-		void operator()(const rsx::vertex_array_register& vertex_register)
-		{
-			if (!m_program->has_uniform(s_reg_table[vertex_register.index]))
-				return;
-
-			size_t data_size = rsx::get_vertex_type_size_on_host(vertex_register.type, vertex_register.attribute_size);
-			const VkFormat format = vk::get_suitable_vk_format(vertex_register.type, vertex_register.attribute_size);
-
-			size_t offset_in_attrib_buffer = 0;
-
-			if (vk::requires_component_expansion(vertex_register.type, vertex_register.attribute_size))
-			{
-				const u32 num_stored_verts = static_cast<u32>(
-					data_size / (sizeof(float) * vertex_register.attribute_size));
-				const u32 real_element_size = vk::get_suitable_vk_size(vertex_register.type, vertex_register.attribute_size);
-
-				data_size = real_element_size * num_stored_verts;
-				offset_in_attrib_buffer = m_attrib_ring_info.alloc<256>(data_size);
-				void *dst = m_attrib_ring_info.map(offset_in_attrib_buffer, data_size);
-
-				vk::expand_array_components<float, 3, 4, 1>(reinterpret_cast<const float*>(vertex_register.data.data()), dst, num_stored_verts);
-				m_attrib_ring_info.unmap();
-			}
-			else
-			{
-				offset_in_attrib_buffer = m_attrib_ring_info.alloc<256>(data_size);
-				void *dst = m_attrib_ring_info.map(offset_in_attrib_buffer, data_size);
-				memcpy(dst, vertex_register.data.data(), data_size);
-				m_attrib_ring_info.unmap();
-			}
-
-			m_buffer_view_to_clean.push_back(std::make_unique<vk::buffer_view>(device, m_attrib_ring_info.heap->value, format, offset_in_attrib_buffer, data_size));
-			m_program->bind_uniform(m_buffer_view_to_clean.back()->value, s_reg_table[vertex_register.index], descriptor_sets);
-		}
-
-		void operator()(const rsx::empty_vertex_array& vbo)
-		{
-			if (!m_program->has_uniform(s_reg_table[vbo.index]))
-				return;
-
-			m_buffer_view_to_clean.push_back(std::make_unique<vk::buffer_view>(device, m_attrib_ring_info.heap->value, VK_FORMAT_R8G8B8A8_UNORM, 0, 0));
-			m_program->bind_uniform(m_buffer_view_to_clean.back()->value, s_reg_table[vbo.index], descriptor_sets);
-		}
-
-	protected:
-		VkDevice device;
-		u32 vertex_count;
-		vk::vk_data_heap& m_attrib_ring_info;
-		vk::glsl::program* m_program;
-		VkDescriptorSet descriptor_sets;
-		std::vector<std::unique_ptr<vk::buffer_view>>& m_buffer_view_to_clean;
-		vk::vertex_cache* vertex_cache;
+		VkPrimitiveTopology native_primitive_type;
+		u32 vertex_draw_count;
+		u32 allocated_vertex_count;
+		u32 vertex_data_base;
+		u32 vertex_index_base;
+		std::optional<std::tuple<VkDeviceSize, VkIndexType>> index_info;
 	};
 
-	using attribute_storage = std::vector<std::variant<rsx::vertex_array_buffer,
-		rsx::vertex_array_register, rsx::empty_vertex_array>>;
-
 	struct draw_command_visitor
 	{
-		using result_type = std::tuple<VkPrimitiveTopology, u32,
-			std::optional<std::tuple<VkDeviceSize, VkIndexType>>>;
-
-		draw_command_visitor(VkDevice device, vk::vk_data_heap& index_buffer_ring_info,
-			vk::vk_data_heap& attrib_ring_info, vk::glsl::program* program,
-			VkDescriptorSet descriptor_sets,
-			std::vector<std::unique_ptr<vk::buffer_view>>& buffer_view_to_clean,
-			std::function<attribute_storage(
-				const rsx::rsx_state&, const std::vector<std::pair<u32, u32>>&)>
-				get_vertex_buffers_f,
-			VKGSRender *thread)
-			: m_device(device), m_index_buffer_ring_info(index_buffer_ring_info),
-			  m_attrib_ring_info(attrib_ring_info), m_program(program),
-			  m_descriptor_sets(descriptor_sets), m_buffer_view_to_clean(buffer_view_to_clean),
-			  get_vertex_buffers(get_vertex_buffers_f),
-			  rsxthr(thread)
+		draw_command_visitor(vk::vk_data_heap& index_buffer_ring_info, rsx::vertex_input_layout& layout)
+			: m_index_buffer_ring_info(index_buffer_ring_info)
+			, m_vertex_layout(layout)
 		{
 		}
 
-		result_type operator()(const rsx::draw_array_command& command)
+		vertex_input_state operator()(const rsx::draw_array_command& command)
 		{
 			bool primitives_emulated = false;
 			VkPrimitiveTopology prims = vk::get_appropriate_topology(
 				rsx::method_registers.current_draw_clause.primitive, primitives_emulated);
-			u32 index_count = 0;
-			std::optional<std::tuple<VkDeviceSize, VkIndexType>> index_info;
 
-			u32 min_index =
-				rsx::method_registers.current_draw_clause.first_count_commands.front().first;
-			u32 max_index =
-				rsx::method_registers.current_draw_clause.get_elements_count() + min_index - 1;
+			const u32 vertex_count = rsx::method_registers.current_draw_clause.get_elements_count();
+			const u32 min_index = rsx::method_registers.current_draw_clause.first_count_commands.front().first;
+
+			if (primitives_emulated)
+			{
+				u32 index_count;
+				std::optional<std::tuple<VkDeviceSize, VkIndexType>> index_info;
 
-			if (primitives_emulated) {
 				std::tie(index_count, index_info) =
 					generate_emulating_index_buffer(rsx::method_registers.current_draw_clause,
-						max_index - min_index + 1, m_index_buffer_ring_info);
-			}
-			else
-			{
-				index_count = rsx::method_registers.current_draw_clause.get_elements_count();
+						vertex_count, m_index_buffer_ring_info);
+
+				return{ prims, index_count, vertex_count, min_index, 0, index_info };
 			}
 
-			upload_vertex_buffers(min_index, max_index);
-			return std::make_tuple(prims, index_count, index_info);
+			return{ prims, vertex_count, vertex_count, min_index, 0, {} };
 		}
 
-		result_type operator()(const rsx::draw_indexed_array_command& command)
+		vertex_input_state operator()(const rsx::draw_indexed_array_command& command)
 		{
 			bool primitives_emulated = false;
 			VkPrimitiveTopology prims = vk::get_appropriate_topology(
@@ -438,146 +158,150 @@ namespace
 			std::optional<std::tuple<VkDeviceSize, VkIndexType>> index_info =
 				std::make_tuple(offset_in_index_buffer, vk::get_index_type(index_type));
 
-			upload_vertex_buffers(0, max_index);
-			return std::make_tuple(prims, index_count, index_info);
-		}
-
-		result_type operator()(const rsx::draw_inlined_array& command)
-		{
-			bool primitives_emulated = false;
-			VkPrimitiveTopology prims = vk::get_appropriate_topology(
-				rsx::method_registers.current_draw_clause.primitive, primitives_emulated);
-			u32 index_count = upload_inlined_array();
-
-			if (!primitives_emulated) {
-				return std::make_tuple(prims, index_count, std::nullopt);
+			//check for vertex arrays with frequency modifiers
+			for (auto &block : m_vertex_layout.interleaved_blocks)
+			{
+				if (block.min_divisor > 1)
+				{
+					//Ignore base offsets and return real results
+					//The upload function will optimize the uploaded range anyway
+					return{ prims, index_count, max_index, 0, 0, index_info };
+				}
 			}
 
+			return {prims, index_count, (max_index - min_index + 1), min_index, min_index, index_info};
+		}
+
+		vertex_input_state operator()(const rsx::draw_inlined_array& command)
+		{
+			bool primitives_emulated = false;
+			auto &draw_clause = rsx::method_registers.current_draw_clause;
+			VkPrimitiveTopology prims = vk::get_appropriate_topology(draw_clause.primitive, primitives_emulated);
+			
+			const u32 vertex_count = (u32)command.inline_vertex_array.size() * sizeof(u32) / m_vertex_layout.interleaved_blocks[0].attribute_stride;
+
+			if (!primitives_emulated)
+			{
+				return{ prims, vertex_count, vertex_count, 0, 0, {} };
+			}
+
+			u32 index_count;
 			std::optional<std::tuple<VkDeviceSize, VkIndexType>> index_info;
-			std::tie(index_count, index_info) = generate_emulating_index_buffer(
-				rsx::method_registers.current_draw_clause, index_count, m_index_buffer_ring_info);
-			return std::make_tuple(prims, index_count, index_info);
+			std::tie(index_count, index_info) = generate_emulating_index_buffer(draw_clause, vertex_count, m_index_buffer_ring_info);
+			return{ prims, index_count, vertex_count, 0, 0, index_info };
 		}
 
 	private:
 		vk::vk_data_heap& m_index_buffer_ring_info;
-		VkDevice m_device;
-		vk::vk_data_heap& m_attrib_ring_info;
-		vk::glsl::program* m_program;
-		VkDescriptorSet m_descriptor_sets;
-		std::vector<std::unique_ptr<vk::buffer_view>>& m_buffer_view_to_clean;
-		std::function<attribute_storage(
-			const rsx::rsx_state&, const std::vector<std::pair<u32, u32>>&)>
-			get_vertex_buffers;
-		VKGSRender* rsxthr;
-
-		void upload_vertex_buffers(u32 min_index, u32 vertex_max_index)
-		{
-			const u32 vertex_count = vertex_max_index - min_index + 1;
-
-			vertex_buffer_visitor visitor(vertex_count, m_device,
-				m_attrib_ring_info, m_program, m_descriptor_sets, m_buffer_view_to_clean, rsxthr->m_vertex_cache.get());
-
-			const auto& vertex_buffers = get_vertex_buffers(
-				rsx::method_registers, {{min_index, vertex_max_index - min_index + 1}});
-
-			for (auto &vbo: vertex_buffers)
-				std::apply_visitor(visitor, vbo);
-		}
-
-		u32 upload_inlined_array()
-		{
-			u32 stride = 0;
-			u32 offsets[rsx::limits::vertex_count] = {0};
-
-			for (u32 i = 0; i < rsx::limits::vertex_count; ++i) {
-				const auto& info = rsx::method_registers.vertex_arrays_info[i];
-				if (!info.size()) continue;
-
-				offsets[i] = stride;
-				stride += rsx::get_vertex_type_size_on_host(info.type(), info.size());
-			}
-
-			u32 vertex_draw_count =
-				(u32)(rsx::method_registers.current_draw_clause.inline_vertex_array.size() *
-					  sizeof(u32)) /
-				stride;
-
-			for (int index = 0; index < rsx::limits::vertex_count; ++index)
-			{
-				auto& vertex_info = rsx::method_registers.vertex_arrays_info[index];
-
-				if (!m_program->has_uniform(s_reg_table[index])) continue;
-
-				if (!vertex_info.size()) // disabled
-				{
-					m_buffer_view_to_clean.push_back(std::make_unique<vk::buffer_view>(m_device, m_attrib_ring_info.heap->value, VK_FORMAT_R8G8B8A8_UNORM, 0, 0));
-					m_program->bind_uniform(m_buffer_view_to_clean.back()->value, s_reg_table[index], m_descriptor_sets);
-					continue;
-				}
-
-				const u32 element_size =
-					vk::get_suitable_vk_size(vertex_info.type(), vertex_info.size());
-				const u32 data_size = element_size * vertex_draw_count;
-				const VkFormat format =
-					vk::get_suitable_vk_format(vertex_info.type(), vertex_info.size());
-
-				size_t offset_in_attrib_buffer = m_attrib_ring_info.alloc<256>(data_size);
-				u8* src = reinterpret_cast<u8*>(
-					rsx::method_registers.current_draw_clause.inline_vertex_array.data());
-				u8* dst =
-					static_cast<u8*>(m_attrib_ring_info.map(offset_in_attrib_buffer, data_size));
-
-				src += offsets[index];
-				u8 opt_size = vertex_info.size();
-
-				if (vertex_info.size() == 3) opt_size = 4;
-
-				// TODO: properly handle cmp type
-				if (vertex_info.type() == rsx::vertex_base_type::cmp)
-					LOG_ERROR(RSX, "Compressed vertex attributes not supported for inlined arrays yet");
-
-				switch (vertex_info.type())
-				{
-				case rsx::vertex_base_type::f:
-					vk::copy_inlined_data_to_buffer<float, 1>(src, dst, vertex_draw_count,
-						vertex_info.type(), vertex_info.size(), opt_size, element_size, stride);
-					break;
-				case rsx::vertex_base_type::sf:
-					vk::copy_inlined_data_to_buffer<u16, 0x3c00>(src, dst, vertex_draw_count,
-						vertex_info.type(), vertex_info.size(), opt_size, element_size, stride);
-					break;
-				case rsx::vertex_base_type::s1:
-				case rsx::vertex_base_type::ub:
-				case rsx::vertex_base_type::ub256:
-					vk::copy_inlined_data_to_buffer<u8, 1>(src, dst, vertex_draw_count,
-						vertex_info.type(), vertex_info.size(), opt_size, element_size, stride);
-					break;
-				case rsx::vertex_base_type::s32k:
-				case rsx::vertex_base_type::cmp:
-					vk::copy_inlined_data_to_buffer<u16, 1>(src, dst, vertex_draw_count,
-						vertex_info.type(), vertex_info.size(), opt_size, element_size, stride);
-					break;
-				default: fmt::throw_exception("Unknown base type %d" HERE, (u32)vertex_info.type());
-				}
-
-				m_attrib_ring_info.unmap();
-				m_buffer_view_to_clean.push_back(std::make_unique<vk::buffer_view>(m_device,
-					m_attrib_ring_info.heap->value, format, offset_in_attrib_buffer, data_size));
-				m_program->bind_uniform(
-					m_buffer_view_to_clean.back()->value, s_reg_table[index], m_descriptor_sets);
-			}
-
-			return vertex_draw_count;
-		}
+		rsx::vertex_input_layout& m_vertex_layout;
 	};
 }
 
-std::tuple<VkPrimitiveTopology, u32, std::optional<std::tuple<VkDeviceSize, VkIndexType>>>
+std::tuple<VkPrimitiveTopology, u32, u32, u32, std::optional<std::tuple<VkDeviceSize, VkIndexType> > >
 VKGSRender::upload_vertex_data()
 {
-	draw_command_visitor visitor(*m_device, m_index_buffer_ring_info, m_attrib_ring_info, m_program,
-		descriptor_sets, m_buffer_view_to_clean,
-		[this](const auto& state, const auto& range) { return this->get_vertex_buffers(state, range, m_program->get_vertex_input_attributes_mask());}, this);
-	return std::apply_visitor(visitor, get_draw_command(rsx::method_registers));
+	m_vertex_layout = analyse_inputs_interleaved();
+
+	draw_command_visitor visitor(m_index_buffer_ring_info, m_vertex_layout);
+	auto result = std::apply_visitor(visitor, get_draw_command(rsx::method_registers));
+
+	auto &vertex_count = result.allocated_vertex_count;
+	auto &vertex_base = result.vertex_data_base;
+
+	//Do actual vertex upload
+	auto required = calculate_memory_requirements(m_vertex_layout, vertex_count);
+	size_t persistent_offset = UINT64_MAX, volatile_offset = UINT64_MAX;
+
+	VkBufferView persistent_view = VK_NULL_HANDLE, volatile_view = VK_NULL_HANDLE;
+
+	if (required.first > 0)
+	{
+		//Check if cacheable
+		//Only data in the 'persistent' block may be cached
+		//TODO: make vertex cache keep local data beyond frame boundaries and hook notify command
+		bool in_cache = false;
+		bool to_store = false;
+		u32  storage_address = UINT32_MAX;
+
+		if (m_vertex_layout.interleaved_blocks.size() == 1 &&
+			rsx::method_registers.current_draw_clause.command != rsx::draw_command::inlined_array)
+		{
+			storage_address = m_vertex_layout.interleaved_blocks[0].real_offset_address + vertex_base;
+			if (auto cached = m_vertex_cache->find_vertex_range(storage_address, VK_FORMAT_R8_UINT, required.first))
+			{
+				in_cache = true;
+				m_current_frame->buffer_views_to_clean.push_back(std::make_unique<vk::buffer_view>(*m_device, m_attrib_ring_info.heap->value, VK_FORMAT_R8_UINT, cached->offset_in_heap, required.first));
+			}
+			else
+			{
+				to_store = true;
+			}
+		}
+
+		if (!in_cache)
+		{
+			persistent_offset = (u32)m_attrib_ring_info.alloc<256>(required.first);
+			m_current_frame->buffer_views_to_clean.push_back(std::make_unique<vk::buffer_view>(*m_device, m_attrib_ring_info.heap->value, VK_FORMAT_R8_UINT, persistent_offset, required.first));
+
+			if (to_store)
+			{
+				//store ref in vertex cache
+				m_vertex_cache->store_range(storage_address, VK_FORMAT_R8_UINT, required.first, (u32)persistent_offset);
+			}
+		}
+
+		persistent_view = m_current_frame->buffer_views_to_clean.back()->value;
+	}
+	else
+	{
+		m_current_frame->buffer_views_to_clean.push_back(std::make_unique<vk::buffer_view>(*m_device, m_attrib_ring_info.heap->value, VK_FORMAT_R8_UINT, 0, 0));
+		persistent_view = m_current_frame->buffer_views_to_clean.back()->value;
+	}
+
+	if (required.second > 0)
+	{
+		volatile_offset = (u32)m_attrib_ring_info.alloc<256>(required.second);
+		m_current_frame->buffer_views_to_clean.push_back(std::make_unique<vk::buffer_view>(*m_device, m_attrib_ring_info.heap->value, VK_FORMAT_R8_UINT, volatile_offset, required.second));
+
+		volatile_view = m_current_frame->buffer_views_to_clean.back()->value;
+	}
+	else
+	{
+		m_current_frame->buffer_views_to_clean.push_back(std::make_unique<vk::buffer_view>(*m_device, m_attrib_ring_info.heap->value, VK_FORMAT_R8_UINT, 0, 0));
+		volatile_view = m_current_frame->buffer_views_to_clean.back()->value;
+	}
+
+	m_program->bind_uniform(persistent_view, "persistent_input_stream", m_current_frame->descriptor_set);
+	m_program->bind_uniform(volatile_view, "volatile_input_stream", m_current_frame->descriptor_set);
+
+	//Write all the data once if possible
+	if (required.first && required.second && volatile_offset > persistent_offset)
+	{
+		//Do this once for both to save time on map/unmap cycles
+		const size_t block_end = (volatile_offset + required.second);
+		const size_t block_size = block_end - persistent_offset;
+		const size_t volatile_offset_in_block = volatile_offset - persistent_offset;
+
+		void *block_mapping = m_attrib_ring_info.map(persistent_offset, block_size);
+		write_vertex_data_to_memory(m_vertex_layout, vertex_base, vertex_count, block_mapping, (char*)block_mapping + volatile_offset_in_block);
+		m_attrib_ring_info.unmap();
+	}
+	else
+	{
+		if (required.first > 0 && persistent_offset != UINT64_MAX)
+		{
+			void *persistent_mapping = m_attrib_ring_info.map(persistent_offset, required.first);
+			write_vertex_data_to_memory(m_vertex_layout, vertex_base, vertex_count, persistent_mapping, nullptr);
+			m_attrib_ring_info.unmap();
+		}
+
+		if (required.second > 0)
+		{
+			void *volatile_mapping = m_attrib_ring_info.map(volatile_offset, required.second);
+			write_vertex_data_to_memory(m_vertex_layout, vertex_base, vertex_count, nullptr, volatile_mapping);
+			m_attrib_ring_info.unmap();
+		}
+	}
+
+	return std::make_tuple(result.native_primitive_type, result.vertex_draw_count, result.allocated_vertex_count, result.vertex_index_base, result.index_info);
 }
diff --git a/rpcs3/Emu/RSX/VK/VKVertexProgram.cpp b/rpcs3/Emu/RSX/VK/VKVertexProgram.cpp
index 3e5351e472..633b4cf2b0 100644
--- a/rpcs3/Emu/RSX/VK/VKVertexProgram.cpp
+++ b/rpcs3/Emu/RSX/VK/VKVertexProgram.cpp
@@ -30,17 +30,20 @@ void VKVertexDecompilerThread::insertHeader(std::stringstream &OS)
 {
 	OS << "#version 450\n\n";
 	OS << "#extension GL_ARB_separate_shader_objects : enable\n";
-	OS << "layout(std140, set = 0, binding = 0) uniform ScaleOffsetBuffer\n";
+	OS << "layout(std140, set = 0, binding = 0) uniform VertexContextBuffer\n";
 	OS << "{\n";
-	OS << "	mat4 scaleOffsetMat;\n";
-	OS << "	ivec4 userClipEnabled[2];\n";
-	OS << "	vec4 userClipFactor[2];\n";
+	OS << "	mat4 scale_offset_mat;\n";
+	OS << "	ivec4 user_clip_enabled[2];\n";
+	OS << "	vec4 user_clip_factor[2];\n";
+	OS << "	uint transform_branch_bits;\n";
+	OS << "	uint vertex_base_index;\n";
+	OS << "	ivec4 input_attributes[16];\n";
 	OS << "};\n";
 
 	vk::glsl::program_input in;
 	in.location = SCALE_OFFSET_BIND_SLOT;
 	in.domain = glsl::glsl_vertex_program;
-	in.name = "ScaleOffsetBuffer";
+	in.name = "VertexContextBuffer";
 	in.type = vk::glsl::input_type_uniform_buffer;
 
 	inputs.push_back(in);
@@ -48,54 +51,21 @@ void VKVertexDecompilerThread::insertHeader(std::stringstream &OS)
 
 void VKVertexDecompilerThread::insertInputs(std::stringstream & OS, const std::vector<ParamType>& inputs)
 {
-	std::vector<std::tuple<size_t, std::string>> input_data;
-	for (const ParamType &PT : inputs)
-	{
-		for (const ParamItem &PI : PT.items)
-		{
-			input_data.push_back(std::make_tuple(PI.location, PI.name));
-		}
-	}
+	OS << "layout(set=0, binding=3) uniform usamplerBuffer persistent_input_stream;\n";    //Data stream with persistent vertex data (cacheable)
+	OS << "layout(set=0, binding=4) uniform usamplerBuffer volatile_input_stream;\n";      //Data stream with per-draw data (registers and immediate draw data)
 
-	/**
-	 * Its is important that the locations are in the order that vertex attributes are expected.
-	 * If order is not adhered to, channels may be swapped leading to corruption
-	*/
+	vk::glsl::program_input in;
+	in.location = VERTEX_BUFFERS_FIRST_BIND_SLOT;
+	in.domain = glsl::glsl_vertex_program;
+	in.name = "persistent_input_stream";
+	in.type = vk::glsl::input_type_texel_buffer;
+	this->inputs.push_back(in);
 
-	std::sort(input_data.begin(), input_data.end());
-
-	for (const std::tuple<size_t, std::string> item : input_data)
-	{
-		for (const ParamType &PT : inputs)
-		{
-			for (const ParamItem &PI : PT.items)
-			{
-				if (PI.name == std::get<1>(item))
-				{
-					vk::glsl::program_input in;
-					in.location = (int)std::get<0>(item) + VERTEX_BUFFERS_FIRST_BIND_SLOT;
-					in.domain = glsl::glsl_vertex_program;
-					in.name = PI.name + "_buffer";
-					in.type = vk::glsl::input_type_texel_buffer;
-
-					this->inputs.push_back(in);
-					
-					bool is_int = false;
-					for (auto &attrib : rsx_vertex_program.rsx_vertex_inputs)
-					{
-						if (attrib.location == std::get<0>(item))
-						{
-							if (attrib.int_type) is_int = true;
-							break;
-						}
-					}
-
-					std::string samplerType = is_int ? "isamplerBuffer" : "samplerBuffer";
-					OS << "layout(set = 0, binding=" << in.location << ")" << "	uniform " << samplerType << " " << PI.name << "_buffer;\n";
-				}
-			}
-		}
-	}
+	in.location = VERTEX_BUFFERS_FIRST_BIND_SLOT + 1;
+	in.domain = glsl::glsl_vertex_program;
+	in.name = "volatile_input_stream";
+	in.type = vk::glsl::input_type_texel_buffer;
+	this->inputs.push_back(in);
 }
 
 void VKVertexDecompilerThread::insertConstants(std::stringstream & OS, const std::vector<ParamType> & constants)
@@ -103,7 +73,6 @@ void VKVertexDecompilerThread::insertConstants(std::stringstream & OS, const std
 	OS << "layout(std140, set=0, binding = 1) uniform VertexConstantsBuffer\n";
 	OS << "{\n";
 	OS << "	vec4 vc[468];\n";
-	OS << "	uint transform_branch_bits;\n";
 	OS << "};\n\n";
 
 	vk::glsl::program_input in;
@@ -150,13 +119,13 @@ static const vertex_reg_info reg_table[] =
 	{ "front_spec_color", true, "dst_reg4", "", false },
 	{ "fog_c", true, "dst_reg5", ".xxxx", true, "", "", "", true, CELL_GCM_ATTRIB_OUTPUT_MASK_FOG },
 	//Warning: With spir-v if you declare clip distance var, you must assign a value even when its disabled! Runtime does not assign a default value
-	{ "gl_ClipDistance[0]", false, "dst_reg5", ".y * userClipFactor[0].x", false, "userClipEnabled[0].x > 0", "0.5", "", true, CELL_GCM_ATTRIB_OUTPUT_MASK_UC0 },
-	{ "gl_ClipDistance[1]", false, "dst_reg5", ".z * userClipFactor[0].y", false, "userClipEnabled[0].y > 0", "0.5", "", true, CELL_GCM_ATTRIB_OUTPUT_MASK_UC1 },
-	{ "gl_ClipDistance[2]", false, "dst_reg5", ".w * userClipFactor[0].z", false, "userClipEnabled[0].z > 0", "0.5", "", true, CELL_GCM_ATTRIB_OUTPUT_MASK_UC2 },
+	{ "gl_ClipDistance[0]", false, "dst_reg5", ".y * user_clip_factor[0].x", false, "user_clip_enabled[0].x > 0", "0.5", "", true, CELL_GCM_ATTRIB_OUTPUT_MASK_UC0 },
+	{ "gl_ClipDistance[1]", false, "dst_reg5", ".z * user_clip_factor[0].y", false, "user_clip_enabled[0].y > 0", "0.5", "", true, CELL_GCM_ATTRIB_OUTPUT_MASK_UC1 },
+	{ "gl_ClipDistance[2]", false, "dst_reg5", ".w * user_clip_factor[0].z", false, "user_clip_enabled[0].z > 0", "0.5", "", true, CELL_GCM_ATTRIB_OUTPUT_MASK_UC2 },
 	{ "gl_PointSize", false, "dst_reg6", ".x", false },
-	{ "gl_ClipDistance[3]", false, "dst_reg6", ".y * userClipFactor[0].w", false, "userClipEnabled[0].w > 0", "0.5", "", true, CELL_GCM_ATTRIB_OUTPUT_MASK_UC3 },
-	{ "gl_ClipDistance[4]", false, "dst_reg6", ".z * userClipFactor[1].x", false, "userClipEnabled[1].x > 0", "0.5", "", true, CELL_GCM_ATTRIB_OUTPUT_MASK_UC4 },
-	{ "gl_ClipDistance[5]", false, "dst_reg6", ".w * userClipFactor[1].y", false, "userClipEnabled[1].y > 0", "0.5", "", true, CELL_GCM_ATTRIB_OUTPUT_MASK_UC5 },
+	{ "gl_ClipDistance[3]", false, "dst_reg6", ".y * user_clip_factor[0].w", false, "user_clip_enabled[0].w > 0", "0.5", "", true, CELL_GCM_ATTRIB_OUTPUT_MASK_UC3 },
+	{ "gl_ClipDistance[4]", false, "dst_reg6", ".z * user_clip_factor[1].x", false, "user_clip_enabled[1].x > 0", "0.5", "", true, CELL_GCM_ATTRIB_OUTPUT_MASK_UC4 },
+	{ "gl_ClipDistance[5]", false, "dst_reg6", ".w * user_clip_factor[1].y", false, "user_clip_enabled[1].y > 0", "0.5", "", true, CELL_GCM_ATTRIB_OUTPUT_MASK_UC5 },
 	{ "tc0", true, "dst_reg7", "", false, "", "", "", false, CELL_GCM_ATTRIB_OUTPUT_MASK_TEX0 },
 	{ "tc1", true, "dst_reg8", "", false, "", "", "", false, CELL_GCM_ATTRIB_OUTPUT_MASK_TEX1 },
 	{ "tc2", true, "dst_reg9", "", false, "", "", "", false, CELL_GCM_ATTRIB_OUTPUT_MASK_TEX2 },
@@ -212,44 +181,10 @@ void VKVertexDecompilerThread::insertOutputs(std::stringstream & OS, const std::
 		OS << "layout(location=" << vk::get_varying_register("front_spec_color").reg_location << ") out vec4 front_spec_color;\n";
 }
 
-namespace vk
-{
-	void add_input(std::stringstream & OS, const ParamItem &PI, const std::vector<rsx_vertex_input> &inputs)
-	{
-		for (const auto &real_input : inputs)
-		{
-			if (real_input.location != PI.location)
-				continue;
-
-			if (!real_input.is_array)
-			{
-				OS << "	vec4 " << PI.name << " = vec4(texelFetch(" << PI.name << "_buffer, 0));\n";
-				return;
-			}
-
-			if (real_input.frequency > 1)
-			{
-				if (real_input.is_modulo)
-				{
-					OS << "	vec4 " << PI.name << "= vec4(texelFetch(" << PI.name << "_buffer, gl_VertexIndex %" << real_input.frequency << "));\n";
-					return;
-				}
-
-				OS << "	vec4 " << PI.name << "= vec4(texelFetch(" << PI.name << "_buffer, gl_VertexIndex /" << real_input.frequency << "));\n";
-				return;
-			}
-
-			OS << "	vec4 " << PI.name << "= vec4(texelFetch(" << PI.name << "_buffer, gl_VertexIndex).rgba);\n";
-			return;
-		}
-
-		OS << "	vec4 " << PI.name << "= vec4(texelFetch(" << PI.name << "_buffer, gl_VertexIndex).rgba);\n";
-	}
-}
-
 void VKVertexDecompilerThread::insertMainStart(std::stringstream & OS)
 {
 	glsl::insert_glsl_legacy_function(OS, glsl::glsl_vertex_program);
+	glsl::insert_vertex_input_fetch(OS, glsl::glsl_rules_rpirv);
 
 	std::string parameters = "";
 	for (int i = 0; i < 16; ++i)
@@ -286,7 +221,9 @@ void VKVertexDecompilerThread::insertMainStart(std::stringstream & OS)
 	for (const ParamType &PT : m_parr.params[PF_PARAM_IN])
 	{
 		for (const ParamItem &PI : PT.items)
-			vk::add_input(OS, PI, rsx_vertex_program.rsx_vertex_inputs);
+		{
+			OS << "	vec4 " << PI.name << "= read_location(" << std::to_string(PI.location) << ");\n";
+		}
 	}
 }
 
@@ -373,7 +310,7 @@ void VKVertexDecompilerThread::insertMainEnd(std::stringstream & OS)
 		if (m_parr.HasParam(PF_PARAM_NONE, "vec4", "dst_reg2"))
 			OS << "	front_spec_color = dst_reg2;\n";
 
-	OS << "	gl_Position = gl_Position * scaleOffsetMat;\n";
+	OS << "	gl_Position = gl_Position * scale_offset_mat;\n";
 	OS << "}\n";
 }