rsx/vk: Improvements and minor optimizations

- Improve dirty state tracking affecting program state
- vk: Move the transform constants upload into a separate channel so it can be skipped when possible;
  transform data uploads are quite expensive
This commit is contained in:
kd-11 2018-04-20 23:44:34 +03:00 committed by kd-11
parent 440a31ef18
commit b7979d3f57
8 changed files with 130 additions and 75 deletions

View File

@ -141,18 +141,22 @@ fragment_program_utils::fragment_program_metadata fragment_program_utils::analys
if (program_offset < 0)
program_offset = instIndex * 16;
if (opcode == RSX_FP_OPCODE_TEX ||
opcode == RSX_FP_OPCODE_TEXBEM ||
opcode == RSX_FP_OPCODE_TXP ||
opcode == RSX_FP_OPCODE_TXPBEM ||
opcode == RSX_FP_OPCODE_TXD ||
opcode == RSX_FP_OPCODE_TXB ||
opcode == RSX_FP_OPCODE_TXL)
switch(opcode)
{
case RSX_FP_OPCODE_TEX:
case RSX_FP_OPCODE_TEXBEM:
case RSX_FP_OPCODE_TXP:
case RSX_FP_OPCODE_TXPBEM:
case RSX_FP_OPCODE_TXD:
case RSX_FP_OPCODE_TXB:
case RSX_FP_OPCODE_TXL:
{
//Bits 17-20 of word 1, swapped within u16 sections
//Bits 16-23 are swapped into the upper 8 bits (24-31)
const u32 tex_num = (inst.word[0] >> 25) & 15;
textures_mask |= (1 << tex_num);
break;
}
}
if (is_constant(inst.word[1]) || is_constant(inst.word[2]) || is_constant(inst.word[3]))

View File

@ -389,17 +389,17 @@ void D3D12GSRender::end()
.Offset((INT)currentDescriptorIndex + vertex_buffer_count, m_descriptor_stride_srv_cbv_uav)
);
if (m_transform_constants_dirty && !g_cfg.video.debug_output)
if (!g_cfg.video.debug_output && (m_graphics_state & rsx::pipeline_state::transform_constants_dirty))
{
m_current_transform_constants_buffer_descriptor_id = (u32)currentDescriptorIndex + 1 + vertex_buffer_count;
upload_and_bind_vertex_shader_constants(currentDescriptorIndex + 1 + vertex_buffer_count);
m_transform_constants_dirty = false;
get_current_resource_storage().command_list->SetGraphicsRootDescriptorTable(VERTEX_CONSTANT_BUFFERS_SLOT,
CD3DX12_GPU_DESCRIPTOR_HANDLE(get_current_resource_storage().descriptors_heap->GetGPUDescriptorHandleForHeapStart())
.Offset(m_current_transform_constants_buffer_descriptor_id, m_descriptor_stride_srv_cbv_uav)
);
}
m_graphics_state = 0;
std::chrono::time_point<steady_clock> constants_duration_end = steady_clock::now();
m_timers.constants_duration += std::chrono::duration_cast<std::chrono::microseconds>(constants_duration_end - constants_duration_start).count();

View File

@ -1049,7 +1049,7 @@ bool GLGSRender::check_program_state()
void GLGSRender::load_program(const gl::vertex_upload_info& upload_info)
{
if (m_fragment_program_dirty || m_vertex_program_dirty)
if (m_graphics_state & rsx::pipeline_state::invalidate_pipeline_bits)
{
get_current_fragment_program(fs_sampler_state);
verify(HERE), current_fragment_program.valid;
@ -1091,12 +1091,13 @@ void GLGSRender::load_program(const gl::vertex_upload_info& upload_info)
const u32 fragment_constants_size = (const u32)m_prog_buffer.get_fragment_constants_buffer_size(current_fragment_program);
const u32 fragment_buffer_size = fragment_constants_size + (18 * 4 * sizeof(float));
const bool update_transform_constants = !!(m_graphics_state & rsx::pipeline_state::transform_constants_dirty);
if (manually_flush_ring_buffers)
{
m_vertex_state_buffer->reserve_storage_on_heap(512);
m_fragment_constants_buffer->reserve_storage_on_heap(align(fragment_buffer_size, 256));
if (m_transform_constants_dirty) m_transform_constants_buffer->reserve_storage_on_heap(8192);
if (update_transform_constants) m_transform_constants_buffer->reserve_storage_on_heap(8192);
}
// Vertex state
@ -1112,7 +1113,7 @@ void GLGSRender::load_program(const gl::vertex_upload_info& upload_info)
*(reinterpret_cast<f32*>(buf + 144)) = rsx::method_registers.clip_max();
fill_vertex_layout_state(m_vertex_layout, upload_info.allocated_vertex_count, reinterpret_cast<s32*>(buf + 160), upload_info.persistent_mapping_offset, upload_info.volatile_mapping_offset);
if (m_transform_constants_dirty)
if (update_transform_constants)
{
// Vertex constants
mapping = m_transform_constants_buffer->alloc_from_heap(8192, m_uniform_buffer_offset_align);
@ -1137,17 +1138,17 @@ void GLGSRender::load_program(const gl::vertex_upload_info& upload_info)
m_vertex_state_buffer->bind_range(0, vertex_state_offset, 512);
m_fragment_constants_buffer->bind_range(2, fragment_constants_offset, fragment_buffer_size);
if (m_transform_constants_dirty) m_transform_constants_buffer->bind_range(1, vertex_constants_offset, 8192);
if (update_transform_constants) m_transform_constants_buffer->bind_range(1, vertex_constants_offset, 8192);
if (manually_flush_ring_buffers)
{
m_vertex_state_buffer->unmap();
m_fragment_constants_buffer->unmap();
if (m_transform_constants_dirty) m_transform_constants_buffer->unmap();
if (update_transform_constants) m_transform_constants_buffer->unmap();
}
m_transform_constants_dirty = false;
m_graphics_state = 0;
}
void GLGSRender::update_draw_state()

View File

@ -246,7 +246,8 @@ namespace rsx
m_rtts_dirty = true;
memset(m_textures_dirty, -1, sizeof(m_textures_dirty));
memset(m_vertex_textures_dirty, -1, sizeof(m_vertex_textures_dirty));
m_transform_constants_dirty = true;
m_graphics_state = pipeline_state::all_dirty;
}
thread::~thread()
@ -1329,10 +1330,10 @@ namespace rsx
void thread::get_current_vertex_program()
{
if (!m_vertex_program_dirty)
if (!(m_graphics_state & rsx::pipeline_state::vertex_program_dirty))
return;
m_vertex_program_dirty = false;
m_graphics_state &= ~(rsx::pipeline_state::vertex_program_dirty);
const u32 transform_program_start = rsx::method_registers.transform_program_start();
current_vertex_program.output_mask = rsx::method_registers.vertex_attrib_output_mask();
current_vertex_program.skip_vertex_input_check = false;
@ -1544,10 +1545,10 @@ namespace rsx
void thread::get_current_fragment_program(const std::array<std::unique_ptr<rsx::sampled_image_descriptor_base>, rsx::limits::fragment_textures_count>& sampler_descriptors)
{
if (!m_fragment_program_dirty)
if (!(m_graphics_state & rsx::pipeline_state::fragment_program_dirty))
return;
m_fragment_program_dirty = false;
m_graphics_state &= ~(rsx::pipeline_state::fragment_program_dirty);
auto &result = current_fragment_program = {};
const u32 shader_program = rsx::method_registers.shader_program_address();

View File

@ -69,6 +69,18 @@ namespace rsx
context_clear_all = context_clear_color | context_clear_depth
};
// Bit flags naming the pieces of graphics pipeline state that can be stale.
// Individual flags are combined with bitwise OR.
enum pipeline_state : u8
{
	fragment_program_dirty    = 1 << 0,
	vertex_program_dirty      = 1 << 1,
	fragment_state_dirty      = 1 << 2,
	vertex_state_dirty        = 1 << 3,
	transform_constants_dirty = 1 << 4,

	// Union of the two shader-program flags
	invalidate_pipeline_bits = vertex_program_dirty | fragment_program_dirty,

	// Every bit of the underlying byte set
	all_dirty = 0xFF
};
u32 get_vertex_type_size_on_host(vertex_base_type type, u32 size);
u32 get_address(u32 offset, u32 location);
@ -327,12 +339,10 @@ namespace rsx
u32 local_mem_addr, main_mem_addr;
bool m_rtts_dirty;
bool m_transform_constants_dirty;
bool m_textures_dirty[16];
bool m_vertex_textures_dirty[4];
bool m_framebuffer_state_contested = false;
bool m_fragment_program_dirty = false;
bool m_vertex_program_dirty = false;
u32 m_graphics_state = 0;
protected:
std::array<u32, 4> get_color_surface_addresses() const;

View File

@ -593,6 +593,8 @@ VKGSRender::VKGSRender() : GSRender()
m_attrib_ring_info.heap.reset(new vk::buffer(*m_device, VK_ATTRIB_RING_BUFFER_SIZE_M * 0x100000, memory_map.host_visible_coherent, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT, 0));
m_uniform_buffer_ring_info.init(VK_UBO_RING_BUFFER_SIZE_M * 0x100000, "uniform buffer");
m_uniform_buffer_ring_info.heap.reset(new vk::buffer(*m_device, VK_UBO_RING_BUFFER_SIZE_M * 0x100000, memory_map.host_visible_coherent, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT, 0));
m_transform_constants_ring_info.init(VK_TRANSFORM_CONSTANTS_BUFFER_SIZE_M * 0x100000, "transform constants buffer");
m_transform_constants_ring_info.heap.reset(new vk::buffer(*m_device, VK_TRANSFORM_CONSTANTS_BUFFER_SIZE_M * 0x100000, memory_map.host_visible_coherent, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT, 0));
m_index_buffer_ring_info.init(VK_INDEX_RING_BUFFER_SIZE_M * 0x100000, "index buffer");
m_index_buffer_ring_info.heap.reset(new vk::buffer(*m_device, VK_INDEX_RING_BUFFER_SIZE_M * 0x100000, memory_map.host_visible_coherent, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, VK_BUFFER_USAGE_INDEX_BUFFER_BIT, 0));
m_texture_upload_buffer_ring_info.init(VK_TEXTURE_UPLOAD_RING_BUFFER_SIZE_M * 0x100000, "texture upload buffer", 32 * 0x100000);
@ -688,6 +690,7 @@ VKGSRender::~VKGSRender()
//Heaps
m_index_buffer_ring_info.heap.reset();
m_uniform_buffer_ring_info.heap.reset();
m_transform_constants_ring_info.heap.reset();
m_attrib_ring_info.heap.reset();
m_texture_upload_buffer_ring_info.heap.reset();
@ -893,6 +896,7 @@ void VKGSRender::check_heap_status()
if (m_attrib_ring_info.is_critical() ||
m_texture_upload_buffer_ring_info.is_critical() ||
m_uniform_buffer_ring_info.is_critical() ||
m_transform_constants_ring_info.is_critical() ||
m_index_buffer_ring_info.is_critical())
{
std::chrono::time_point<steady_clock> submit_start = steady_clock::now();
@ -917,6 +921,7 @@ void VKGSRender::check_heap_status()
m_index_buffer_ring_info.reset_allocation_stats();
m_uniform_buffer_ring_info.reset_allocation_stats();
m_transform_constants_ring_info.reset_allocation_stats();
m_attrib_ring_info.reset_allocation_stats();
m_texture_upload_buffer_ring_info.reset_allocation_stats();
m_current_frame->reset_heap_ptrs();
@ -1938,6 +1943,7 @@ void VKGSRender::advance_queued_frames()
m_vertex_cache->purge();
m_current_frame->tag_frame_end(m_attrib_ring_info.get_current_put_pos_minus_one(),
m_uniform_buffer_ring_info.get_current_put_pos_minus_one(),
m_transform_constants_ring_info.get_current_put_pos_minus_one(),
m_index_buffer_ring_info.get_current_put_pos_minus_one(),
m_texture_upload_buffer_ring_info.get_current_put_pos_minus_one());
@ -2045,11 +2051,13 @@ void VKGSRender::process_swap_request(frame_context_t *ctx, bool free_resources)
//Heap cleanup; deallocates memory consumed by the frame if it is still held
m_attrib_ring_info.m_get_pos = ctx->attrib_heap_ptr;
m_uniform_buffer_ring_info.m_get_pos = ctx->ubo_heap_ptr;
m_transform_constants_ring_info.m_get_pos = ctx->vtxconst_heap_ptr;
m_index_buffer_ring_info.m_get_pos = ctx->index_heap_ptr;
m_texture_upload_buffer_ring_info.m_get_pos = ctx->texture_upload_heap_ptr;
m_attrib_ring_info.notify();
m_uniform_buffer_ring_info.notify();
m_transform_constants_ring_info.notify();
m_index_buffer_ring_info.notify();
m_texture_upload_buffer_ring_info.notify();
}
@ -2209,7 +2217,7 @@ bool VKGSRender::check_program_status()
void VKGSRender::load_program(const vk::vertex_upload_info& vertex_info)
{
if (m_fragment_program_dirty || m_vertex_program_dirty)
if (m_graphics_state & rsx::pipeline_state::invalidate_pipeline_bits)
{
get_current_fragment_program(fs_sampler_state);
verify(HERE), current_fragment_program.valid;
@ -2219,6 +2227,7 @@ void VKGSRender::load_program(const vk::vertex_upload_info& vertex_info)
auto &vertex_program = current_vertex_program;
auto &fragment_program = current_fragment_program;
auto old_program = m_program;
vk::pipeline_props properties = {};
@ -2372,13 +2381,14 @@ void VKGSRender::load_program(const vk::vertex_upload_info& vertex_info)
vk::leave_uninterruptible();
if (1)//m_graphics_state & (rsx::pipeline_state::fragment_state_dirty | rsx::pipeline_state::vertex_state_dirty))
{
const size_t fragment_constants_sz = m_prog_buffer->get_fragment_constants_buffer_size(fragment_program);
const size_t fragment_buffer_sz = fragment_constants_sz + (18 * 4 * sizeof(float));
const size_t required_mem = 512 + 8192 + fragment_buffer_sz;
const size_t required_mem = 512 + fragment_buffer_sz;
const size_t vertex_state_offset = m_uniform_buffer_ring_info.alloc<256>(required_mem);
const size_t vertex_constants_offset = vertex_state_offset + 512;
const size_t fragment_constants_offset = vertex_constants_offset + 8192;
const size_t fragment_constants_offset = vertex_state_offset + 512;
//We do this in one go
u8 *buf = (u8*)m_uniform_buffer_ring_info.map(vertex_state_offset, required_mem);
@ -2395,13 +2405,8 @@ void VKGSRender::load_program(const vk::vertex_upload_info& vertex_info)
fill_vertex_layout_state(m_vertex_layout, vertex_info.allocated_vertex_count, reinterpret_cast<s32*>(buf + 160),
vertex_info.persistent_window_offset, vertex_info.volatile_window_offset);
//Vertex constants
buf = buf + 512;
fill_vertex_program_constants_data(buf);
m_transform_constants_dirty = false;
//Fragment constants
buf = buf + 8192;
buf = buf + 512;
if (fragment_constants_sz)
{
m_prog_buffer->fill_fragment_constants_buffer({ reinterpret_cast<float*>(buf), ::narrow<int>(fragment_constants_sz) },
@ -2412,9 +2417,30 @@ void VKGSRender::load_program(const vk::vertex_upload_info& vertex_info)
m_uniform_buffer_ring_info.unmap();
m_program->bind_uniform({ m_uniform_buffer_ring_info.heap->value, vertex_state_offset, 512 }, SCALE_OFFSET_BIND_SLOT, m_current_frame->descriptor_set);
m_program->bind_uniform({ m_uniform_buffer_ring_info.heap->value, vertex_constants_offset, 8192 }, VERTEX_CONSTANT_BUFFERS_BIND_SLOT, m_current_frame->descriptor_set);
m_program->bind_uniform({ m_uniform_buffer_ring_info.heap->value, fragment_constants_offset, fragment_buffer_sz }, FRAGMENT_CONSTANT_BUFFERS_BIND_SLOT, m_current_frame->descriptor_set);
m_vertex_state_buffer_info = { m_uniform_buffer_ring_info.heap->value, vertex_state_offset, 512 };
m_fragment_state_buffer_info = { m_uniform_buffer_ring_info.heap->value, fragment_constants_offset, fragment_buffer_sz };
}
if (m_graphics_state & rsx::pipeline_state::transform_constants_dirty)
{
//Vertex constants
const size_t vertex_constants_offset = m_transform_constants_ring_info.alloc<256>(8192);
auto buf = m_transform_constants_ring_info.map(vertex_constants_offset, 8192);
fill_vertex_program_constants_data(buf);
m_transform_constants_ring_info.unmap();
m_vertex_constants_buffer_info = { m_transform_constants_ring_info.heap->value, vertex_constants_offset, 8192 };
}
if (1)//m_graphics_state || old_program != m_program)
{
m_program->bind_uniform(m_vertex_state_buffer_info, SCALE_OFFSET_BIND_SLOT, m_current_frame->descriptor_set);
m_program->bind_uniform(m_vertex_constants_buffer_info, VERTEX_CONSTANT_BUFFERS_BIND_SLOT, m_current_frame->descriptor_set);
m_program->bind_uniform(m_fragment_state_buffer_info, FRAGMENT_CONSTANT_BUFFERS_BIND_SLOT, m_current_frame->descriptor_set);
}
//Clear flags
m_graphics_state = 0;
}
static const u32 mr_color_offset[rsx::limits::color_buffers_count] =

View File

@ -40,7 +40,8 @@ namespace vk
//NOTE: Texture uploads can be huge, up to 16MB for a single texture (4096x4096px)
#define VK_ATTRIB_RING_BUFFER_SIZE_M 384
#define VK_TEXTURE_UPLOAD_RING_BUFFER_SIZE_M 256
#define VK_UBO_RING_BUFFER_SIZE_M 128
#define VK_UBO_RING_BUFFER_SIZE_M 64
#define VK_TRANSFORM_CONSTANTS_BUFFER_SIZE_M 64
#define VK_INDEX_RING_BUFFER_SIZE_M 64
#define VK_MAX_ASYNC_CB_COUNT 64
@ -152,6 +153,7 @@ struct frame_context_t
//Heap pointers
s64 attrib_heap_ptr = 0;
s64 ubo_heap_ptr = 0;
s64 vtxconst_heap_ptr = 0;
s64 index_heap_ptr = 0;
s64 texture_upload_heap_ptr = 0;
@ -167,6 +169,7 @@ struct frame_context_t
attrib_heap_ptr = other.attrib_heap_ptr;
ubo_heap_ptr = other.attrib_heap_ptr;
vtxconst_heap_ptr = other.vtxconst_heap_ptr;
index_heap_ptr = other.attrib_heap_ptr;
texture_upload_heap_ptr = other.texture_upload_heap_ptr;
}
@ -178,10 +181,11 @@ struct frame_context_t
std::swap(samplers_to_clean, other.samplers_to_clean);
}
void tag_frame_end(s64 attrib_loc, s64 ubo_loc, s64 index_loc, s64 texture_loc)
void tag_frame_end(s64 attrib_loc, s64 ubo_loc, s64 vtxconst_loc, s64 index_loc, s64 texture_loc)
{
attrib_heap_ptr = attrib_loc;
ubo_heap_ptr = ubo_loc;
vtxconst_heap_ptr = vtxconst_loc;
index_heap_ptr = index_loc;
texture_upload_heap_ptr = texture_loc;
@ -314,9 +318,14 @@ private:
u64 m_last_heap_sync_time = 0;
vk::vk_data_heap m_attrib_ring_info;
vk::vk_data_heap m_uniform_buffer_ring_info;
vk::vk_data_heap m_transform_constants_ring_info;
vk::vk_data_heap m_index_buffer_ring_info;
vk::vk_data_heap m_texture_upload_buffer_ring_info;
VkDescriptorBufferInfo m_vertex_state_buffer_info;
VkDescriptorBufferInfo m_vertex_constants_buffer_info;
VkDescriptorBufferInfo m_fragment_state_buffer_info;
std::array<frame_context_t, VK_MAX_ASYNC_FRAMES> frame_context_storage;
//Temp frame context to use if the real frame queue is overburdened. Only used for storage
frame_context_t m_aux_frame_context;

View File

@ -342,12 +342,17 @@ namespace rsx
u32 load = rsx::method_registers.transform_constant_load();
if ((load + index) >= 512)
{
LOG_ERROR(RSX, "Invalid register index (load=%d, index=%d)", load, index);
LOG_ERROR(RSX, "Invalid transform register index (load=%d, index=%d)", load, index);
return;
}
rsx::method_registers.transform_constants[load + reg][subreg] = arg;
rsxthr->m_transform_constants_dirty = true;
auto &value = rsx::method_registers.transform_constants[load + reg][subreg];
if (value != arg)
{
//Transform constants invalidation is expensive (~8k bytes per update)
value = arg;
rsxthr->m_graphics_state |= rsx::pipeline_state::transform_constants_dirty;
}
}
};
@ -357,19 +362,18 @@ namespace rsx
// Calls commit_4_transform_program_instructions(index) on the method registers,
// then flags the vertex program as dirty on the given rsx thread.
// _reg and arg are not used by this handler.
// NOTE(review): `index` is not declared in this block; presumably a template
// parameter of the enclosing struct - confirm.
static void impl(thread* rsx, u32 _reg, u32 arg)
{
method_registers.commit_4_transform_program_instructions(index);
// NOTE(review): both the boolean flag and the m_graphics_state bit are written
// here; this looks like the pre- and post-change lines of a diff shown together.
// Confirm which one is live.
rsx->m_vertex_program_dirty = true;
rsx->m_graphics_state |= rsx::pipeline_state::vertex_program_dirty;
}
};
// Flags the vertex program as dirty on the given rsx thread.
// The two unnamed u32 parameters are ignored.
void set_transform_program_start(thread* rsx, u32, u32)
{
// NOTE(review): both the boolean flag and the m_graphics_state bit are written
// here; this looks like the pre- and post-change lines of a diff shown together.
// Confirm which one is live.
rsx->m_vertex_program_dirty = true;
rsx->m_graphics_state |= rsx::pipeline_state::vertex_program_dirty;
}
// Flags both the vertex program and the fragment program as dirty on the given
// rsx thread. The two unnamed u32 parameters are ignored.
void set_vertex_attribute_output_mask(thread* rsx, u32, u32)
{
// NOTE(review): the two boolean flags and the combined m_graphics_state bits are
// all written here; this looks like the pre- and post-change lines of a diff
// shown together. Confirm which ones are live.
rsx->m_vertex_program_dirty = true;
rsx->m_fragment_program_dirty = true;
rsx->m_graphics_state |= rsx::pipeline_state::vertex_program_dirty | rsx::pipeline_state::fragment_program_dirty;
}
void set_begin_end(thread* rsxthr, u32 _reg, u32 arg)
@ -535,7 +539,7 @@ namespace rsx
// Flags the fragment program as dirty on the given rsx thread.
// The two unnamed u32 parameters are ignored.
void invalidate_L2(thread* rsx, u32, u32)
{
// NOTE(review): both the boolean flag and the m_graphics_state bit are written
// here; this looks like the pre- and post-change lines of a diff shown together.
// Confirm which one is live.
rsx->m_fragment_program_dirty = true;
rsx->m_graphics_state |= rsx::pipeline_state::fragment_program_dirty;
}
void set_surface_dirty_bit(thread* rsx, u32, u32)
@ -556,7 +560,7 @@ namespace rsx
// Marks texture slot `index` as dirty and flags the fragment program as dirty
// on the given rsx thread. _reg and arg are not used by this handler.
// NOTE(review): `index` is not declared in this block; presumably a template
// parameter of the enclosing struct - confirm.
static void impl(thread* rsx, u32 _reg, u32 arg)
{
rsx->m_textures_dirty[index] = true;
// NOTE(review): both the boolean flag and the m_graphics_state bit are written
// here; this looks like the pre- and post-change lines of a diff shown together.
// Confirm which one is live.
rsx->m_fragment_program_dirty = true;
rsx->m_graphics_state |= rsx::pipeline_state::fragment_program_dirty;
}
};
@ -584,7 +588,7 @@ namespace rsx
u32 address = get_address(method_registers.blit_engine_output_offset_nv3062() + pixel_offset + index * 4, method_registers.blit_engine_output_location_nv3062());
vm::write32(address, arg);
rsx->m_fragment_program_dirty = true;
rsx->m_graphics_state |= rsx::pipeline_state::fragment_program_dirty;
}
};
}