From f43824762a96d31c84b33f447fb0d82c39a41115 Mon Sep 17 00:00:00 2001 From: kd-11 Date: Thu, 8 Sep 2022 00:02:52 +0300 Subject: [PATCH] rsx: Get rid of an allocation in analyse_vertex_data that adds about 5% overhead. This method is called many thousands of times per frame and that single allocation introduces a small perf hit. Just get rid of it, it doesn't improve anything to have it there. --- rpcs3/Emu/RSX/GL/GLDraw.cpp | 2 +- rpcs3/Emu/RSX/GL/GLVertexBuffers.cpp | 6 ++-- rpcs3/Emu/RSX/RSXThread.cpp | 49 ++++++++++++++-------------- rpcs3/Emu/RSX/RSXThread.h | 32 ++++++++++++++---- rpcs3/Emu/RSX/VK/VKDraw.cpp | 2 +- rpcs3/Emu/RSX/VK/VKVertexBuffers.cpp | 6 ++-- 6 files changed, 57 insertions(+), 40 deletions(-) diff --git a/rpcs3/Emu/RSX/GL/GLDraw.cpp b/rpcs3/Emu/RSX/GL/GLDraw.cpp index 17c45647fb..f43625a91c 100644 --- a/rpcs3/Emu/RSX/GL/GLDraw.cpp +++ b/rpcs3/Emu/RSX/GL/GLDraw.cpp @@ -467,7 +467,7 @@ void GLGSRender::emit_geometry(u32 sub_index) for (auto& info : m_vertex_layout.interleaved_blocks) { const auto vertex_base_offset = rsx::method_registers.vertex_data_base_offset(); - info.real_offset_address = rsx::get_address(rsx::get_vertex_offset_from_base(vertex_base_offset, info.base_offset), info.memory_location); + info->real_offset_address = rsx::get_address(rsx::get_vertex_offset_from_base(vertex_base_offset, info->base_offset), info->memory_location); } } diff --git a/rpcs3/Emu/RSX/GL/GLVertexBuffers.cpp b/rpcs3/Emu/RSX/GL/GLVertexBuffers.cpp index 72a1984c9f..b214b4cbfd 100644 --- a/rpcs3/Emu/RSX/GL/GLVertexBuffers.cpp +++ b/rpcs3/Emu/RSX/GL/GLVertexBuffers.cpp @@ -127,7 +127,7 @@ namespace vertex_input_state operator()(const rsx::draw_inlined_array& /*command*/) { const auto stream_length = rsx::method_registers.current_draw_clause.inline_vertex_array.size(); - const u32 vertex_count = u32(stream_length * sizeof(u32)) / m_vertex_layout.interleaved_blocks[0].attribute_stride; + const u32 vertex_count = u32(stream_length * sizeof(u32)) / m_vertex_layout.interleaved_blocks[0]->attribute_stride; if (!gl::is_primitive_native(rsx::method_registers.current_draw_clause.primitive)) { @@ -192,8 +192,8 @@ gl::vertex_upload_info GLGSRender::set_vertex_buffer() if (m_vertex_layout.interleaved_blocks.size() == 1 && rsx::method_registers.current_draw_clause.command != rsx::draw_command::inlined_array) { - const auto data_offset = (vertex_base * m_vertex_layout.interleaved_blocks[0].attribute_stride); - storage_address = m_vertex_layout.interleaved_blocks[0].real_offset_address + data_offset; + const auto data_offset = (vertex_base * m_vertex_layout.interleaved_blocks[0]->attribute_stride); + storage_address = m_vertex_layout.interleaved_blocks[0]->real_offset_address + data_offset; if (auto cached = m_vertex_cache->find_vertex_range(storage_address, GL_R8UI, required.first)) { diff --git a/rpcs3/Emu/RSX/RSXThread.cpp b/rpcs3/Emu/RSX/RSXThread.cpp index a13f714477..68808fd12e 100644 --- a/rpcs3/Emu/RSX/RSXThread.cpp +++ b/rpcs3/Emu/RSX/RSXThread.cpp @@ -1933,9 +1933,8 @@ namespace rsx if (state.current_draw_clause.command == rsx::draw_command::inlined_array) { - interleaved_range_info info = {}; + interleaved_range_info& info = *result.alloc_interleaved_block(); info.interleaved = true; - info.locations.reserve(8); for (u8 index = 0; index < rsx::limits::vertex_count; ++index) { @@ -1963,7 +1962,7 @@ namespace rsx if (info.attribute_stride) { // At least one array feed must be enabled for vertex input - result.interleaved_blocks.emplace_back(std::move(info)); + result.interleaved_blocks.push_back(&info); } return; @@ -2030,21 +2029,21 @@ namespace rsx for (auto &block : result.interleaved_blocks) { - if (block.single_vertex) + if (block->single_vertex) { //Single vertex definition, continue continue; } - if (block.attribute_stride != info.stride()) + if (block->attribute_stride != info.stride()) { //Stride does not match, continue continue; } - if (base_address > block.base_offset) + if (base_address > block->base_offset) { - const u32 diff = base_address - block.base_offset; + const u32 diff = base_address - block->base_offset; if (diff > info.stride()) { //Not interleaved, continue @@ -2053,7 +2052,7 @@ namespace rsx } else { - const u32 diff = block.base_offset - base_address; + const u32 diff = block->base_offset - base_address; if (diff > info.stride()) { //Not interleaved, continue @@ -2061,18 +2060,18 @@ namespace rsx } //Matches, and this address is lower than existing - block.base_offset = base_address; + block->base_offset = base_address; } alloc_new_block = false; - block.locations.push_back({ index, modulo, info.frequency() }); - block.interleaved = true; + block->locations.push_back({ index, modulo, info.frequency() }); + block->interleaved = true; break; } if (alloc_new_block) { - interleaved_range_info block = {}; + interleaved_range_info& block = *result.alloc_interleaved_block(); block.base_offset = base_address; block.attribute_stride = info.stride(); block.memory_location = info.offset() >> 31; @@ -2085,7 +2084,7 @@ namespace rsx block.attribute_stride = rsx::get_vertex_type_size_on_host(info.type(), info.size()); } - result.interleaved_blocks.emplace_back(std::move(block)); + result.interleaved_blocks.push_back(&block); } } } @@ -2093,7 +2092,7 @@ namespace rsx for (auto &info : result.interleaved_blocks) { //Calculate real data address to be used during upload - info.real_offset_address = rsx::get_address(rsx::get_vertex_offset_from_base(state.vertex_data_base_offset(), info.base_offset), info.memory_location); + info->real_offset_address = rsx::get_address(rsx::get_vertex_offset_from_base(state.vertex_data_base_offset(), info->base_offset), info->memory_location); } } @@ -2353,7 +2352,7 @@ namespace rsx { for (const auto &block : layout.interleaved_blocks) { - volatile_memory_size += block.attribute_stride * vertex_count; + volatile_memory_size += block->attribute_stride * vertex_count; } } else @@ -2400,7 +2399,7 @@ namespace rsx { const auto &block = layout.interleaved_blocks[0]; u32 inline_data_offset = volatile_offset; - for (const auto& attrib : block.locations) + for (const auto& attrib : block->locations) { auto &info = rsx::method_registers.vertex_arrays_info[attrib.index]; @@ -2412,14 +2411,14 @@ namespace rsx { for (const auto &block : layout.interleaved_blocks) { - for (const auto& attrib : block.locations) + for (const auto& attrib : block->locations) { const u32 local_address = (rsx::method_registers.vertex_arrays_info[attrib.index].offset() & 0x7fffffff); - offset_in_block[attrib.index] = persistent_offset + (local_address - block.base_offset); + offset_in_block[attrib.index] = persistent_offset + (local_address - block->base_offset); } - const auto range = block.calculate_required_range(first_vertex, vertex_count); - persistent_offset += block.attribute_stride * range.second; + const auto range = block->calculate_required_range(first_vertex, vertex_count); + persistent_offset += block->attribute_stride * range.second; } } @@ -2484,7 +2483,7 @@ namespace rsx type = info.type(); size = info.size(); - attrib0 = layout.interleaved_blocks[0].attribute_stride | default_frequency_mask; + attrib0 = layout.interleaved_blocks[0]->attribute_stride | default_frequency_mask; } } else @@ -2624,12 +2623,12 @@ namespace rsx { for (const auto &block : layout.interleaved_blocks) { - auto range = block.calculate_required_range(first_vertex, vertex_count); + auto range = block->calculate_required_range(first_vertex, vertex_count); - const u32 data_size = range.second * block.attribute_stride; - const u32 vertex_base = range.first * block.attribute_stride; + const u32 data_size = range.second * block->attribute_stride; + const u32 vertex_base = range.first * block->attribute_stride; - g_fxo->get().copy(persistent, vm::_ptr(block.real_offset_address) + vertex_base, data_size); + g_fxo->get().copy(persistent, vm::_ptr(block->real_offset_address) + vertex_base, data_size); persistent += data_size; } } diff --git a/rpcs3/Emu/RSX/RSXThread.h b/rpcs3/Emu/RSX/RSXThread.h index e1f194fda9..e008b98b01 100644 --- a/rpcs3/Emu/RSX/RSXThread.h +++ b/rpcs3/Emu/RSX/RSXThread.h @@ -287,18 +287,36 @@ namespace rsx transient = 2 }; - struct vertex_input_layout + class vertex_input_layout { - std::vector interleaved_blocks{}; // Interleaved blocks to be uploaded as-is - std::vector> volatile_blocks{}; // Volatile data blocks (immediate draw vertex data for example) - rsx::simple_array referenced_registers{}; // Volatile register data + int m_num_used_blocks = 0; + std::array m_blocks_data{}; + + public: + rsx::simple_array interleaved_blocks{}; // Interleaved blocks to be uploaded as-is + std::vector> volatile_blocks{}; // Volatile data blocks (immediate draw vertex data for example) + rsx::simple_array referenced_registers{}; // Volatile register data std::array attribute_placement = fill_array(attribute_buffer_placement::none); vertex_input_layout() = default; + interleaved_range_info* alloc_interleaved_block() + { + auto result = &m_blocks_data[m_num_used_blocks++]; + result->attribute_stride = 0; + result->base_offset = 0; + result->memory_location = 0; + result->real_offset_address = 0; + result->single_vertex = false; + result->locations.clear(); + result->interleaved = true; + return result; + } + void clear() { + m_num_used_blocks = 0; interleaved_blocks.clear(); volatile_blocks.clear(); referenced_registers.clear(); @@ -309,7 +327,7 @@ namespace rsx // Criteria: At least one array stream has to be defined to feed vertex positions // This stream cannot be a const register as the vertices cannot create a zero-area primitive - if (!interleaved_blocks.empty() && interleaved_blocks.front().attribute_stride != 0) + if (!interleaved_blocks.empty() && interleaved_blocks[0]->attribute_stride != 0) return true; if (!volatile_blocks.empty()) @@ -351,8 +369,8 @@ namespace rsx u32 mem = 0; for (auto &block : interleaved_blocks) { - const auto range = block.calculate_required_range(first_vertex, vertex_count); - mem += range.second * block.attribute_stride; + const auto range = block->calculate_required_range(first_vertex, vertex_count); + mem += range.second * block->attribute_stride; } return mem; diff --git a/rpcs3/Emu/RSX/VK/VKDraw.cpp b/rpcs3/Emu/RSX/VK/VKDraw.cpp index a0cd0318d2..9bf54c6c82 100644 --- a/rpcs3/Emu/RSX/VK/VKDraw.cpp +++ b/rpcs3/Emu/RSX/VK/VKDraw.cpp @@ -677,7 +677,7 @@ void VKGSRender::emit_geometry(u32 sub_index) for (auto& info : m_vertex_layout.interleaved_blocks) { const auto vertex_base_offset = rsx::method_registers.vertex_data_base_offset(); - info.real_offset_address = rsx::get_address(rsx::get_vertex_offset_from_base(vertex_base_offset, info.base_offset), info.memory_location); + info->real_offset_address = rsx::get_address(rsx::get_vertex_offset_from_base(vertex_base_offset, info->base_offset), info->memory_location); } } diff --git a/rpcs3/Emu/RSX/VK/VKVertexBuffers.cpp b/rpcs3/Emu/RSX/VK/VKVertexBuffers.cpp index a06b97583e..0dcc343a4b 100644 --- a/rpcs3/Emu/RSX/VK/VKVertexBuffers.cpp +++ b/rpcs3/Emu/RSX/VK/VKVertexBuffers.cpp @@ -206,7 +206,7 @@ namespace VkPrimitiveTopology prims = vk::get_appropriate_topology(draw_clause.primitive, primitives_emulated); const auto stream_length = rsx::method_registers.current_draw_clause.inline_vertex_array.size(); - const u32 vertex_count = u32(stream_length * sizeof(u32)) / m_vertex_layout.interleaved_blocks[0].attribute_stride; + const u32 vertex_count = u32(stream_length * sizeof(u32)) / m_vertex_layout.interleaved_blocks[0]->attribute_stride; if (!primitives_emulated) { @@ -257,8 +257,8 @@ vk::vertex_upload_info VKGSRender::upload_vertex_data() if (m_vertex_layout.interleaved_blocks.size() == 1 && rsx::method_registers.current_draw_clause.command != rsx::draw_command::inlined_array) { - const auto data_offset = (vertex_base * m_vertex_layout.interleaved_blocks[0].attribute_stride); - storage_address = m_vertex_layout.interleaved_blocks[0].real_offset_address + data_offset; + const auto data_offset = (vertex_base * m_vertex_layout.interleaved_blocks[0]->attribute_stride); + storage_address = m_vertex_layout.interleaved_blocks[0]->real_offset_address + data_offset; if (auto cached = m_vertex_cache->find_vertex_range(storage_address, VK_FORMAT_R8_UINT, required.first)) {