Revert "rsx/vk: Implement hardware instancing (#16466)"

This reverts commit 62701154f1a6ebd71499cabba2341cc240ff50c4.
This commit is contained in:
kd-11 2024-12-29 18:42:41 +03:00
parent 62701154f1
commit 43e04f3fc7
32 changed files with 906 additions and 1411 deletions

View File

@ -476,7 +476,6 @@ target_sources(rpcs3_emu PRIVATE
RSX/Common/TextureUtils.cpp
RSX/Common/texture_cache.cpp
RSX/Core/RSXContext.cpp
RSX/Core/RSXDrawCommands.cpp
RSX/gcm_enums.cpp
RSX/gcm_printing.cpp
RSX/GL/GLCommonDecompiler.cpp

View File

@ -1,843 +0,0 @@
#include "stdafx.h"
#include "RSXDrawCommands.h"
#include "Emu/RSX/Common/BufferUtils.h"
#include "Emu/RSX/Common/buffer_stream.hpp"
#include "Emu/RSX/Common/io_buffer.h"
#include "Emu/RSX/Common/simple_array.hpp"
#include "Emu/RSX/NV47/HW/context_accessors.define.h"
#include "Emu/RSX/Program/GLSLCommon.h"
#include "Emu/RSX/rsx_methods.h"
#include "Emu/RSX/RSXThread.h"
#include "Emu/Memory/vm.h"
namespace rsx
{
// Analyzes the current vertex attribute configuration and groups attributes
// that share a memory region into interleaved blocks. Attributes sourced from
// vertex arrays in guest memory become "persistent"; attributes sourced from
// push buffers or fixed-function registers become "transient".
// Only attributes referenced by the active vertex program are considered.
void draw_command_processor::analyse_inputs_interleaved(vertex_input_layout& result, const vertex_program_metadata_t& vp_metadata)
{
	const rsx_state& state = *REGS(m_ctx);
	// Attribute must be both enabled in hardware and referenced by the program
	const u32 input_mask = state.vertex_attrib_input_mask() & vp_metadata.referenced_inputs_mask;

	result.clear();
	result.attribute_mask = static_cast<u16>(input_mask);

	if (state.current_draw_clause.command == rsx::draw_command::inlined_array)
	{
		// Inlined arrays are modeled as one tightly packed interleaved block
		interleaved_range_info& info = *result.alloc_interleaved_block();
		info.interleaved = true;

		for (u8 index = 0; index < rsx::limits::vertex_count; ++index)
		{
			auto& vinfo = state.vertex_arrays_info[index];
			result.attribute_placement[index] = attribute_buffer_placement::none;

			if (vinfo.size() > 0)
			{
				// Stride must be updated even if the stream is disabled
				info.attribute_stride += rsx::get_vertex_type_size_on_host(vinfo.type(), vinfo.size());
				info.locations.push_back({ index, false, 1 });

				if (input_mask & (1u << index))
				{
					result.attribute_placement[index] = attribute_buffer_placement::transient;
				}
			}
			else if (state.register_vertex_info[index].size > 0 && input_mask & (1u << index))
			{
				// Reads from register
				result.referenced_registers.push_back(index);
				result.attribute_placement[index] = attribute_buffer_placement::transient;
			}
		}

		if (info.attribute_stride)
		{
			// At least one array feed must be enabled for vertex input
			result.interleaved_blocks.push_back(&info);
		}

		return;
	}

	const u32 frequency_divider_mask = REGS(m_ctx)->frequency_divider_operation_mask();
	result.interleaved_blocks.reserve(16);
	result.referenced_registers.reserve(16);

	// Walk each referenced attribute, consuming the mask one bit at a time
	for (auto [ref_mask, index] = std::tuple{ input_mask, u8(0) }; ref_mask; ++index, ref_mask >>= 1)
	{
		ensure(index < rsx::limits::vertex_count);

		if (!(ref_mask & 1u))
		{
			// Nothing to do, uninitialized
			continue;
		}

		// Always reset attribute placement by default
		result.attribute_placement[index] = attribute_buffer_placement::none;

		// Check for interleaving
		if (REGS(m_ctx)->current_draw_clause.is_immediate_draw &&
			REGS(m_ctx)->current_draw_clause.command != rsx::draw_command::indexed)
		{
			// NOTE: In immediate rendering mode, all vertex setup is ignored
			// Observed with GT5, immediate render bypasses array pointers completely, even falling back to fixed-function register defaults
			if (m_vertex_push_buffers[index].vertex_count > 1)
			{
				// Ensure consistent number of vertices per attribute.
				m_vertex_push_buffers[index].pad_to(m_vertex_push_buffers[0].vertex_count, false);

				// Read temp buffer (register array)
				std::pair<u8, u32> volatile_range_info = std::make_pair(index, static_cast<u32>(m_vertex_push_buffers[index].data.size() * sizeof(u32)));
				result.volatile_blocks.push_back(volatile_range_info);
				result.attribute_placement[index] = attribute_buffer_placement::transient;
			}
			else if (state.register_vertex_info[index].size > 0)
			{
				// Reads from register
				result.referenced_registers.push_back(index);
				result.attribute_placement[index] = attribute_buffer_placement::transient;
			}

			// Fall back to the default register value if no source is specified via register
			continue;
		}

		const auto& info = state.vertex_arrays_info[index];
		if (!info.size())
		{
			if (state.register_vertex_info[index].size > 0)
			{
				// Reads from register
				result.referenced_registers.push_back(index);
				result.attribute_placement[index] = attribute_buffer_placement::transient;
				continue;
			}
		}
		else
		{
			result.attribute_placement[index] = attribute_buffer_placement::persistent;
			const u32 base_address = info.offset() & 0x7fffffff;
			bool alloc_new_block = true;
			bool modulo = !!(frequency_divider_mask & (1 << index));

			// Try to merge this attribute into an existing block: same stride and
			// a base address within one stride of the block qualifies as interleaved
			for (auto& block : result.interleaved_blocks)
			{
				if (block->single_vertex)
				{
					// Single vertex definition, continue
					continue;
				}

				if (block->attribute_stride != info.stride())
				{
					// Stride does not match, continue
					continue;
				}

				if (base_address > block->base_offset)
				{
					const u32 diff = base_address - block->base_offset;
					if (diff > info.stride())
					{
						// Not interleaved, continue
						continue;
					}
				}
				else
				{
					const u32 diff = block->base_offset - base_address;
					if (diff > info.stride())
					{
						// Not interleaved, continue
						continue;
					}

					// Matches, and this address is lower than existing
					block->base_offset = base_address;
				}

				alloc_new_block = false;
				block->locations.push_back({ index, modulo, info.frequency() });
				block->interleaved = true;
				break;
			}

			if (alloc_new_block)
			{
				interleaved_range_info& block = *result.alloc_interleaved_block();
				block.base_offset = base_address;
				block.attribute_stride = info.stride();
				block.memory_location = info.offset() >> 31;
				block.locations.reserve(16);
				block.locations.push_back({ index, modulo, info.frequency() });

				if (block.attribute_stride == 0)
				{
					// Zero stride means a single element, not an array; size the
					// block by the host-side size of one element instead
					block.single_vertex = true;
					block.attribute_stride = rsx::get_vertex_type_size_on_host(info.type(), info.size());
				}

				result.interleaved_blocks.push_back(&block);
			}
		}
	}

	for (auto& info : result.interleaved_blocks)
	{
		// Calculate real data address to be used during upload
		info->real_offset_address = rsx::get_address(rsx::get_vertex_offset_from_base(state.vertex_data_base_offset(), info->base_offset), info->memory_location);
	}
}
// Returns an untyped, read-only view over the index data for the given draw
// clause. Immediate-mode indices (element push buffer) take precedence over
// the index array configured through the RSX registers.
std::span<const std::byte> draw_command_processor::get_raw_index_array(const draw_clause& draw_indexed_clause) const
{
	if (!m_element_push_buffer.empty()) [[ unlikely ]]
	{
		// Indices were supplied through immediate mode rendering
		const auto* bytes = reinterpret_cast<const std::byte*>(m_element_push_buffer.data());
		const u32 length = ::narrow<u32>(m_element_push_buffer.size() * sizeof(u32));
		return { bytes, length };
	}

	const rsx::index_array_type index_type = REGS(m_ctx)->index_type();
	const u32 elem_size = get_index_type_size(index_type);

	// Align the address down to an element-size boundary, matching real hardware
	const u32 base_address = (0 - elem_size) & get_address(REGS(m_ctx)->index_array_address(), REGS(m_ctx)->index_array_location());
	const u32 start_index = draw_indexed_clause.min_index();
	const u32 index_count = draw_indexed_clause.get_elements_count();

	const auto source = vm::_ptr<const std::byte>(base_address);
	return { source + start_index * elem_size, index_count * elem_size };
}
// Translates the current draw clause into a backend-consumable draw command
// variant. Throws on an unrecognized draw command type.
std::variant<draw_array_command, draw_indexed_array_command, draw_inlined_array>
draw_command_processor::get_draw_command(const rsx::rsx_state& state) const
{
	switch (REGS(m_ctx)->current_draw_clause.command)
	{
	case rsx::draw_command::indexed:
		// Most common path: indexed geometry carries its raw index payload
		return draw_indexed_array_command
		{
			get_raw_index_array(state.current_draw_clause)
		};
	case rsx::draw_command::array:
		return draw_array_command{};
	case rsx::draw_command::inlined_array:
		return draw_inlined_array{};
	default:
		fmt::throw_exception("ill-formed draw command");
	}
}
// Records one immediate-mode vertex attribute write into the matching push
// buffer and flags the push-buffer state as dirty. Writes to attributes that
// are masked out of the vertex input are silently discarded.
void draw_command_processor::append_to_push_buffer(u32 attribute, u32 size, u32 subreg_index, vertex_base_type type, u32 value)
{
	const auto enabled_inputs = REGS(m_ctx)->vertex_attrib_input_mask();
	if (!(enabled_inputs & (1 << attribute)))
	{
		// Attribute not enabled for input; nothing to record
		return;
	}

	// Enforce ATTR0 as the provoking attribute for push buffers; it defines
	// the vertex id shared by all attribute streams.
	const auto vertex_id = m_vertex_push_buffers[0].get_vertex_id();
	m_vertex_push_buffers[attribute].set_vertex_data(attribute, vertex_id, subreg_index, type, size, value);
	RSX(m_ctx)->m_graphics_state |= rsx::pipeline_state::push_buffer_arrays_dirty;
}
// Number of vertices accumulated by immediate-mode rendering. ATTR0 is the
// provoking attribute, so its count is authoritative for the whole draw.
u32 draw_command_processor::get_push_buffer_vertex_count() const
{
	const auto& provoking_attribute = m_vertex_push_buffers[0];
	return provoking_attribute.vertex_count;
}
void draw_command_processor::append_array_element(u32 index)
{
// Endianness is swapped because common upload code expects input in BE
// TODO: Implement fast upload path for LE inputs and do away with this
m_element_push_buffer.push_back(std::bit_cast<u32, be_t<u32>>(index));
}
// Number of indices accumulated through immediate-mode array elements.
u32 draw_command_processor::get_push_buffer_index_count() const
{
	return ::size32(m_element_push_buffer);
}
// Resets all immediate-mode state: the per-attribute vertex push buffers
// (only when they were actually written to) and the element push buffer.
void draw_command_processor::clear_push_buffers()
{
	auto& graphics_state = RSX(m_ctx)->m_graphics_state;

	// Only sweep the vertex push buffers if something was written to them
	if (graphics_state & rsx::pipeline_state::push_buffer_arrays_dirty)
	{
		for (auto& stream : m_vertex_push_buffers)
		{
			// Register size reset intentionally disabled,
			// see https://github.com/RPCS3/rpcs3/issues/1932
			stream.clear();
		}

		graphics_state.clear(rsx::pipeline_state::push_buffer_arrays_dirty);
	}

	m_element_push_buffer.clear();
}
// Writes per-attribute vertex layout descriptors into 'buffer' (two s32 words
// per attribute, 16 attributes max). The descriptors tell the vertex fetch
// shader where each attribute lives (persistent vs volatile heap), its stride,
// type, size, frequency divider and byte-swap behavior.
void draw_command_processor::fill_vertex_layout_state(
	const vertex_input_layout& layout,
	const vertex_program_metadata_t& vp_metadata,
	u32 first_vertex,
	u32 vertex_count,
	s32* buffer,
	u32 persistent_offset_base,
	u32 volatile_offset_base) const
{
	std::array<s32, 16> offset_in_block = {};
	u32 volatile_offset = volatile_offset_base;
	u32 persistent_offset = persistent_offset_base;

	// NOTE: Order is important! Transient layout is always push_buffers followed by register data
	if (REGS(m_ctx)->current_draw_clause.is_immediate_draw)
	{
		for (const auto& info : layout.volatile_blocks)
		{
			offset_in_block[info.first] = volatile_offset;
			volatile_offset += info.second;
		}
	}

	// Each referenced register occupies one 16-byte vec4 slot in volatile memory
	for (u8 index : layout.referenced_registers)
	{
		offset_in_block[index] = volatile_offset;
		volatile_offset += 16;
	}

	if (REGS(m_ctx)->current_draw_clause.command == rsx::draw_command::inlined_array)
	{
		// Inlined arrays pack all attributes into one interleaved volatile stream
		const auto& block = layout.interleaved_blocks[0];
		u32 inline_data_offset = volatile_offset;
		for (const auto& attrib : block->locations)
		{
			auto& info = REGS(m_ctx)->vertex_arrays_info[attrib.index];
			offset_in_block[attrib.index] = inline_data_offset;
			inline_data_offset += rsx::get_vertex_type_size_on_host(info.type(), info.size());
		}
	}
	else
	{
		for (const auto& block : layout.interleaved_blocks)
		{
			for (const auto& attrib : block->locations)
			{
				const u32 local_address = (REGS(m_ctx)->vertex_arrays_info[attrib.index].offset() & 0x7fffffff);
				offset_in_block[attrib.index] = persistent_offset + (local_address - block->base_offset);
			}

			const auto range = block->calculate_required_range(first_vertex, vertex_count);
			persistent_offset += block->attribute_stride * range.second;
		}
	}

	// Fill the data
	// Each descriptor field is 64 bits wide
	// [0-8] attribute stride
	// [8-24] attribute divisor
	// [24-27] attribute type
	// [27-30] attribute size
	// [30-31] reserved
	// [31-60] starting offset
	// [60-61] swap bytes flag
	// [61-62] volatile flag
	// [62-63] modulo enable flag
	// (bit positions within the second word are defined by the masks below)
	const s32 default_frequency_mask = (1 << 8);
	const s32 swap_storage_mask = (1 << 29);
	const s32 volatile_storage_mask = (1 << 30);
	const s32 modulo_op_frequency_mask = smin;

	const u32 modulo_mask = REGS(m_ctx)->frequency_divider_operation_mask();
	const auto max_index = (first_vertex + vertex_count) - 1;

	for (u16 ref_mask = vp_metadata.referenced_inputs_mask, index = 0; ref_mask; ++index, ref_mask >>= 1)
	{
		if (!(ref_mask & 1u))
		{
			// Unused input, ignore this
			continue;
		}

		if (layout.attribute_placement[index] == attribute_buffer_placement::none)
		{
			// Attribute has no data source; zero out its descriptor
			static constexpr u64 zero = 0;
			std::memcpy(buffer + index * 2, &zero, sizeof(zero));
			continue;
		}

		rsx::vertex_base_type type = {};
		s32 size = 0;
		s32 attrib0 = 0;
		s32 attrib1 = 0;

		if (layout.attribute_placement[index] == attribute_buffer_placement::transient)
		{
			if (REGS(m_ctx)->current_draw_clause.command == rsx::draw_command::inlined_array)
			{
				const auto& info = REGS(m_ctx)->vertex_arrays_info[index];

				if (!info.size())
				{
					// Register
					const auto& reginfo = REGS(m_ctx)->register_vertex_info[index];
					type = reginfo.type;
					size = reginfo.size;

					attrib0 = rsx::get_vertex_type_size_on_host(type, size);
				}
				else
				{
					// Array
					type = info.type();
					size = info.size();

					attrib0 = layout.interleaved_blocks[0]->attribute_stride | default_frequency_mask;
				}
			}
			else
			{
				// Data is either from an immediate render or register input
				// Immediate data overrides register input
				if (REGS(m_ctx)->current_draw_clause.is_immediate_draw &&
					m_vertex_push_buffers[index].vertex_count > 1)
				{
					// Push buffer
					const auto& info = m_vertex_push_buffers[index];
					type = info.type;
					size = info.size;

					attrib0 = rsx::get_vertex_type_size_on_host(type, size) | default_frequency_mask;
				}
				else
				{
					// Register
					const auto& info = REGS(m_ctx)->register_vertex_info[index];
					type = info.type;
					size = info.size;

					attrib0 = rsx::get_vertex_type_size_on_host(type, size);
				}
			}

			attrib1 |= volatile_storage_mask;
		}
		else
		{
			auto& info = REGS(m_ctx)->vertex_arrays_info[index];
			type = info.type();
			size = info.size();

			auto stride = info.stride();
			attrib0 = stride;

			if (stride > 0) // when stride is 0, input is not an array but a single element
			{
				const u32 frequency = info.frequency();
				switch (frequency)
				{
				case 0:
				case 1:
				{
					attrib0 |= default_frequency_mask;
					break;
				}
				default:
				{
					if (modulo_mask & (1 << index))
					{
						if (max_index >= frequency)
						{
							// Only set modulo mask if a modulo op is actually necessary!
							// This requires that the uploaded range for this attr = [0, freq-1]
							// Ignoring modulo op if the rendered range does not wrap allows for range optimization
							attrib0 |= (frequency << 8);
							attrib1 |= modulo_op_frequency_mask;
						}
						else
						{
							attrib0 |= default_frequency_mask;
						}
					}
					else
					{
						// Division
						attrib0 |= (frequency << 8);
					}
					break;
				}
				}
			}
		} // end attribute placement check

		// Special compressed 4 components into one 4-byte value. Decoded as one value.
		if (type == rsx::vertex_base_type::cmp)
		{
			size = 1;
		}

		// All data is passed in in PS3-native order (BE) so swap flag should be set
		attrib1 |= swap_storage_mask;

		attrib0 |= (static_cast<s32>(type) << 24);
		attrib0 |= (size << 27);
		attrib1 |= offset_in_block[index];

		buffer[index * 2 + 0] = attrib0;
		buffer[index * 2 + 1] = attrib1;
	}
}
// Streams the vertex data described by 'layout' into host-visible buffers.
// volatile_data receives transient sources (push buffers, register values,
// inline arrays); persistent_data receives interleaved vertex arrays copied
// from guest memory through the DMA manager. Either pointer may be null if
// the corresponding class of data is absent for this draw.
void draw_command_processor::write_vertex_data_to_memory(
	const vertex_input_layout& layout,
	u32 first_vertex,
	u32 vertex_count,
	void* persistent_data,
	void* volatile_data) const
{
	auto transient = static_cast<char*>(volatile_data);
	auto persistent = static_cast<char*>(persistent_data);

	auto& draw_call = REGS(m_ctx)->current_draw_clause;

	if (transient != nullptr)
	{
		if (draw_call.command == rsx::draw_command::inlined_array)
		{
			// Register values first, then the raw inline array payload
			for (const u8 index : layout.referenced_registers)
			{
				memcpy(transient, REGS(m_ctx)->register_vertex_info[index].data.data(), 16);
				transient += 16;
			}

			memcpy(transient, draw_call.inline_vertex_array.data(), draw_call.inline_vertex_array.size() * sizeof(u32));
			// Is it possible to reference data outside of the inlined array?
			return;
		}

		// NOTE: Order is important! Transient layout is always push_buffers followed by register data
		if (draw_call.is_immediate_draw)
		{
			// NOTE: It is possible for immediate draw to only contain index data, so vertex data can be in persistent memory
			for (const auto& info : layout.volatile_blocks)
			{
				memcpy(transient, m_vertex_push_buffers[info.first].data.data(), info.second);
				transient += info.second;
			}
		}

		for (const u8 index : layout.referenced_registers)
		{
			memcpy(transient, REGS(m_ctx)->register_vertex_info[index].data.data(), 16);
			transient += 16;
		}
	}

	if (persistent != nullptr)
	{
		for (interleaved_range_info* block : layout.interleaved_blocks)
		{
			// Only upload the vertex range this draw actually touches
			auto range = block->calculate_required_range(first_vertex, vertex_count);

			const u32 data_size = range.second * block->attribute_stride;
			const u32 vertex_base = range.first * block->attribute_stride;

			// Copy from guest memory through the DMA manager
			g_fxo->get<rsx::dma_manager>().copy(persistent, vm::_ptr<char>(block->real_offset_address) + vertex_base, data_size);
			persistent += data_size;
		}
	}
}
// Writes the 4x4 viewport scale-offset matrix (column vectors streamed as four
// 16-byte rows). The vertex shader multiplies positions by this matrix.
// When flip_y is set, the Y axis is inverted to match the D3D convention.
void draw_command_processor::fill_scale_offset_data(void* buffer, bool flip_y) const
{
	const int clip_w = REGS(m_ctx)->surface_clip_width();
	const int clip_h = REGS(m_ctx)->surface_clip_height();

	const float scale_x = REGS(m_ctx)->viewport_scale_x() / (clip_w / 2.f);
	float offset_x = REGS(m_ctx)->viewport_offset_x() - (clip_w / 2.f);
	offset_x /= clip_w / 2.f;

	float scale_y = REGS(m_ctx)->viewport_scale_y() / (clip_h / 2.f);
	float offset_y = (REGS(m_ctx)->viewport_offset_y() - (clip_h / 2.f));
	offset_y /= clip_h / 2.f;

	if (flip_y)
	{
		// D3D convention: invert the Y axis for both scale and offset
		scale_y = -scale_y;
		offset_y = -offset_y;
	}

	const float scale_z = REGS(m_ctx)->viewport_scale_z();
	const float offset_z = REGS(m_ctx)->viewport_offset_z();
	const float one = 1.f;

	auto* out = static_cast<char*>(buffer);
	utils::stream_vector(out, std::bit_cast<u32>(scale_x), 0, 0, std::bit_cast<u32>(offset_x));
	utils::stream_vector(out + 16, 0, std::bit_cast<u32>(scale_y), 0, std::bit_cast<u32>(offset_y));
	utils::stream_vector(out + 32, 0, 0, std::bit_cast<u32>(scale_z), std::bit_cast<u32>(offset_z));
	utils::stream_vector(out + 48, 0, 0, 0, std::bit_cast<u32>(one));
}
void draw_command_processor::fill_user_clip_data(void* buffer) const
{
const rsx::user_clip_plane_op clip_plane_control[6] =
{
REGS(m_ctx)->clip_plane_0_enabled(),
REGS(m_ctx)->clip_plane_1_enabled(),
REGS(m_ctx)->clip_plane_2_enabled(),
REGS(m_ctx)->clip_plane_3_enabled(),
REGS(m_ctx)->clip_plane_4_enabled(),
REGS(m_ctx)->clip_plane_5_enabled(),
};
u8 data_block[64];
s32* clip_enabled_flags = reinterpret_cast<s32*>(data_block);
f32* clip_distance_factors = reinterpret_cast<f32*>(data_block + 32);
for (int index = 0; index < 6; ++index)
{
switch (clip_plane_control[index])
{
default:
rsx_log.error("bad clip plane control (0x%x)", static_cast<u8>(clip_plane_control[index]));
[[fallthrough]];
case rsx::user_clip_plane_op::disable:
clip_enabled_flags[index] = 0;
clip_distance_factors[index] = 0.f;
break;
case rsx::user_clip_plane_op::greater_or_equal:
clip_enabled_flags[index] = 1;
clip_distance_factors[index] = 1.f;
break;
case rsx::user_clip_plane_op::less_than:
clip_enabled_flags[index] = 1;
clip_distance_factors[index] = -1.f;
break;
}
}
memcpy(buffer, data_block, 2 * 8 * sizeof(u32));
}
/**
* Fill buffer with vertex program constants.
* Buffer must be at least 512 float4 wide.
*/
void draw_command_processor::fill_vertex_program_constants_data(void* buffer, const std::span<const u16>& reloc_table) const
{
if (!reloc_table.empty()) [[ likely ]]
{
char* dst = reinterpret_cast<char*>(buffer);
for (const auto& index : reloc_table)
{
utils::stream_vector_from_memory(dst, &REGS(m_ctx)->transform_constants[index]);
dst += 16;
}
}
else
{
memcpy(buffer, REGS(m_ctx)->transform_constants.data(), 468 * 4 * sizeof(float));
}
}
// Writes fragment rasterization state into 'buffer' (two 16-byte vectors):
// fog parameters, packed ROP control word, alpha reference, fog mode and the
// window-position (wpos) scale/bias coefficients.
void draw_command_processor::fill_fragment_state_buffer(void* buffer, const RSXFragmentProgram& /*fragment_program*/) const
{
	ROP_control_t rop_control{};

	if (REGS(m_ctx)->alpha_test_enabled())
	{
		const u32 alpha_func = static_cast<u32>(REGS(m_ctx)->alpha_func());
		rop_control.set_alpha_test_func(alpha_func);
		rop_control.enable_alpha_test();
	}

	if (REGS(m_ctx)->polygon_stipple_enabled())
	{
		rop_control.enable_polygon_stipple();
	}

	if (REGS(m_ctx)->msaa_alpha_to_coverage_enabled() && !RSX(m_ctx)->get_backend_config().supports_hw_a2c)
	{
		// TODO: Properly support alpha-to-coverage and alpha-to-one behavior in shaders
		// Alpha values generate a coverage mask for order independent blending
		// Requires hardware AA to work properly (or just fragment sample stage in fragment shaders)
		// Simulated using combined alpha blend and alpha test
		rop_control.enable_alpha_to_coverage();

		if (REGS(m_ctx)->msaa_sample_mask())
		{
			rop_control.enable_MSAA_writes();
		}

		// Sample configuration bits
		switch (REGS(m_ctx)->surface_antialias())
		{
		case rsx::surface_antialiasing::center_1_sample:
			break;
		case rsx::surface_antialiasing::diagonal_centered_2_samples:
			rop_control.set_msaa_control(1u);
			break;
		default:
			rop_control.set_msaa_control(3u);
			break;
		}
	}

	const f32 fog0 = REGS(m_ctx)->fog_params_0();
	const f32 fog1 = REGS(m_ctx)->fog_params_1();
	const u32 fog_mode = static_cast<u32>(REGS(m_ctx)->fog_equation());

	// Check if framebuffer is actually an XRGB format and not a WZYX format
	switch (REGS(m_ctx)->surface_color())
	{
	case rsx::surface_color_format::w16z16y16x16:
	case rsx::surface_color_format::w32z32y32x32:
	case rsx::surface_color_format::x32:
		// These behave very differently from "normal" formats.
		break;
	default:
		// Integer framebuffer formats.
		rop_control.enable_framebuffer_INT();

		// Check if we want sRGB conversion.
		if (REGS(m_ctx)->framebuffer_srgb_enabled())
		{
			rop_control.enable_framebuffer_sRGB();
		}
		break;
	}

	// Generate wpos coefficients
	// wpos equation is now as follows:
	// wpos.y = (frag_coord / resolution_scale) * ((window_origin!=top)?-1.: 1.) + ((window_origin!=top)? window_height : 0)
	// wpos.x = (frag_coord / resolution_scale)
	// wpos.zw = frag_coord.zw
	const auto window_origin = REGS(m_ctx)->shader_window_origin();
	const u32 window_height = REGS(m_ctx)->shader_window_height();
	// Small surfaces below the scaling threshold are rendered at native scale
	const f32 resolution_scale = (window_height <= static_cast<u32>(g_cfg.video.min_scalable_dimension)) ? 1.f : rsx::get_resolution_scale();
	const f32 wpos_scale = (window_origin == rsx::window_origin::top) ? (1.f / resolution_scale) : (-1.f / resolution_scale);
	const f32 wpos_bias = (window_origin == rsx::window_origin::top) ? 0.f : window_height;
	const f32 alpha_ref = REGS(m_ctx)->alpha_ref();

	u32* dst = static_cast<u32*>(buffer);
	utils::stream_vector(dst, std::bit_cast<u32>(fog0), std::bit_cast<u32>(fog1), rop_control.value, std::bit_cast<u32>(alpha_ref));
	utils::stream_vector(dst + 4, 0u, fog_mode, std::bit_cast<u32>(wpos_scale), std::bit_cast<u32>(wpos_bias));
}
// Builds the per-instance transform-constants data for a trivially instanced
// draw. Replays the draw clause, tracking constant updates between passes:
// each pass gets one indirection table (constant slot -> row in the packed
// constants array) written to 'indirection_table_buf', while unique constant
// values are appended to 'constants_data_array_buffer'.
void draw_command_processor::fill_constants_instancing_buffer(rsx::io_buffer& indirection_table_buf, rsx::io_buffer& constants_data_array_buffer, const VertexProgramBase& prog) const
{
	auto& draw_call = REGS(m_ctx)->current_draw_clause;

	// Only call this for instanced draws!
	ensure(draw_call.is_trivial_instanced_draw);

	// Temp indirection table. Used to track "running" updates.
	rsx::simple_array<u32> instancing_indirection_table;

	// indirection table size
	// Indexed-constant programs address the full 468-register file; otherwise
	// only the program's referenced constants need slots.
	const auto reloc_table = prog.has_indexed_constants ? decltype(prog.constant_ids){} : prog.constant_ids;
	const auto redirection_table_size = prog.has_indexed_constants ? 468u : ::size32(prog.constant_ids);
	instancing_indirection_table.resize(redirection_table_size);

	// Temp constants data
	rsx::simple_array<u128> constants_data;
	constants_data.reserve(redirection_table_size * draw_call.pass_count());

	// Allocate indirection buffer on GPU stream
	indirection_table_buf.reserve(instancing_indirection_table.size_bytes() * draw_call.pass_count());
	auto indirection_out = indirection_table_buf.data<u32>();

	rsx::instanced_draw_config_t instance_config;
	u32 indirection_table_offset = 0;

	// We now replay the draw call here to pack the data.
	draw_call.begin();

	// Write initial draw data: identity mapping plus the current constants.
	std::iota(instancing_indirection_table.begin(), instancing_indirection_table.end(), 0);

	constants_data.resize(redirection_table_size);
	fill_vertex_program_constants_data(constants_data.data(), reloc_table);

	// Next draw. We're guaranteed more than one draw call by the caller.
	draw_call.next();

	do
	{
		// Write previous state
		std::memcpy(indirection_out + indirection_table_offset, instancing_indirection_table.data(), instancing_indirection_table.size_bytes());
		indirection_table_offset += redirection_table_size;

		// Decode next draw state
		instance_config = {};
		draw_call.execute_pipeline_dependencies(m_ctx, &instance_config);

		if (!instance_config.transform_constants_data_changed)
		{
			continue;
		}

		// A non-negative translated offset means the touched range maps to a
		// contiguous run of slots and can be patched in one block.
		const int translated_offset = prog.has_indexed_constants
			? instance_config.patch_load_offset
			: prog.TranslateConstantsRange(instance_config.patch_load_offset, instance_config.patch_load_count);

		if (translated_offset >= 0)
		{
			// Trivially patchable in bulk
			const u32 redirection_loc = ::size32(constants_data);
			constants_data.resize(::size32(constants_data) + instance_config.patch_load_count);
			std::memcpy(constants_data.data() + redirection_loc, &REGS(m_ctx)->transform_constants[instance_config.patch_load_offset], instance_config.patch_load_count * sizeof(u128));

			// Update indirection table
			for (auto i = translated_offset, count = 0;
				static_cast<u32>(count) < instance_config.patch_load_count;
				++i, ++count)
			{
				instancing_indirection_table[i] = redirection_loc + count;
			}

			continue;
		}

		ensure(!prog.has_indexed_constants);

		// Sparse update. Update records individually instead of bulk
		// FIXME: Range batching optimization
		const auto load_end = instance_config.patch_load_offset + instance_config.patch_load_count;
		for (u32 i = 0; i < redirection_table_size; ++i)
		{
			const auto read_index = prog.constant_ids[i];
			if (read_index < instance_config.patch_load_offset || read_index >= load_end)
			{
				// Reading outside "hot" range.
				continue;
			}

			const u32 redirection_loc = ::size32(constants_data);
			constants_data.resize(::size32(constants_data) + 1);
			std::memcpy(constants_data.data() + redirection_loc, &REGS(m_ctx)->transform_constants[read_index], sizeof(u128));

			instancing_indirection_table[i] = redirection_loc;
		}

	} while (draw_call.next());

	// Tail: flush the indirection table for the final pass
	ensure(indirection_table_offset < (instancing_indirection_table.size() * draw_call.pass_count()));
	std::memcpy(indirection_out + indirection_table_offset, instancing_indirection_table.data(), instancing_indirection_table.size_bytes());

	// Now write the constants to the GPU buffer
	constants_data_array_buffer.reserve(constants_data.size_bytes());
	std::memcpy(constants_data_array_buffer.data(), constants_data.data(), constants_data.size_bytes());
}
}

View File

@ -1,110 +0,0 @@
#pragma once
#include <util/types.hpp>
#include "Emu/RSX/Core/RSXVertexTypes.h"
#include "Emu/RSX/NV47/FW/draw_call.hpp"
#include "Emu/RSX/Program/ProgramStateCache.h"
#include "Emu/RSX/rsx_vertex_data.h"
#include <span>
#include <variant>
namespace rsx
{
	struct rsx_state;
	struct context;
	class io_buffer;

	// Prepares draw-call data for the backend renderers: vertex input
	// analysis, immediate-mode (begin/end) push-buffer management, and
	// filling of the uniform/state buffers consumed during rendering.
	class draw_command_processor
	{
		using vertex_program_metadata_t = program_hash_util::vertex_program_utils::vertex_program_metadata;

		// Borrowed pointer to the owning RSX context; set once via init()
		context* m_ctx = nullptr;

	protected:
		friend class thread;

		// Per-attribute vertex data accumulated by immediate-mode rendering
		std::array<push_buffer_vertex_info, 16> m_vertex_push_buffers;
		// Index data accumulated by immediate-mode rendering (stored big-endian)
		rsx::simple_array<u32> m_element_push_buffer;

	public:
		draw_command_processor() = default;

		// Binds this processor to its owning context. Must be called before use.
		void init(context* ctx)
		{
			m_ctx = ctx;
		}

		// Analyze vertex inputs and group all interleaved blocks
		void analyse_inputs_interleaved(vertex_input_layout& layout, const vertex_program_metadata_t& vp_metadata);

		// Retrieve raw bytes for the index array (untyped)
		std::span<const std::byte> get_raw_index_array(const draw_clause& draw_indexed_clause) const;

		// Get compiled draw command for backend rendering
		std::variant<draw_array_command, draw_indexed_array_command, draw_inlined_array>
		get_draw_command(const rsx::rsx_state& state) const;

		// Push-buffers for immediate rendering (begin-end scopes)
		void append_to_push_buffer(u32 attribute, u32 size, u32 subreg_index, vertex_base_type type, u32 value);

		u32 get_push_buffer_vertex_count() const;

		void append_array_element(u32 index);

		u32 get_push_buffer_index_count() const;

		void clear_push_buffers();

		// Read-only view of the accumulated immediate-mode index data
		const std::span<const u32> element_push_buffer() const
		{
			return m_element_push_buffer;
		}

		// Host driver helpers

		// Write per-attribute vertex layout descriptors into 'buffer'
		void fill_vertex_layout_state(
			const vertex_input_layout& layout,
			const vertex_program_metadata_t& vp_metadata,
			u32 first_vertex,
			u32 vertex_count,
			s32* buffer,
			u32 persistent_offset_base,
			u32 volatile_offset_base) const;

		// Stream vertex data into the persistent/volatile host buffers
		void write_vertex_data_to_memory(
			const vertex_input_layout& layout,
			u32 first_vertex,
			u32 vertex_count,
			void* persistent_data,
			void* volatile_data) const;

		/**
		 * Fill buffer with 4x4 scale offset matrix.
		 * Vertex shader's position is to be multiplied by this matrix.
		 * if flip_y is set, the matrix is modified to use d3d convention.
		 */
		void fill_scale_offset_data(void* buffer, bool flip_y) const;

		/**
		 * Fill buffer with user clip information
		 */
		void fill_user_clip_data(void* buffer) const;

		/**
		 * Fill buffer with vertex program constants.
		 * Relocation table allows to do a partial fill with only selected registers.
		 */
		void fill_vertex_program_constants_data(void* buffer, const std::span<const u16>& reloc_table) const;

		/**
		 * Fill buffer with fragment rasterization state.
		 * Fills current fog values, alpha test parameters and texture scaling parameters
		 */
		void fill_fragment_state_buffer(void* buffer, const RSXFragmentProgram& fragment_program) const;

		// Fill instancing buffers. A single iobuf is used for both. 256byte alignment enforced to allow global bind
		// Returns offsets to the index redirection lookup table and constants field array
		void fill_constants_instancing_buffer(rsx::io_buffer& indirection_table_buf, rsx::io_buffer& constants_data_array_buffer, const VertexProgramBase& prog) const;
	};
}

View File

@ -1,54 +0,0 @@
#pragma once
#include <util/types.hpp>
namespace rsx
{
	// Dirty-state bitflags describing which parts of the RSX pipeline must be
	// re-evaluated before the next draw. Combined masks near the bottom group
	// related bits for bulk tests/invalidations.
	enum pipeline_state : u32
	{
		fragment_program_ucode_dirty = (1 << 0),   // Fragment program ucode changed
		vertex_program_ucode_dirty = (1 << 1),     // Vertex program ucode changed
		fragment_program_state_dirty = (1 << 2),   // Fragment program state changed
		vertex_program_state_dirty = (1 << 3),     // Vertex program state changed
		fragment_state_dirty = (1 << 4),           // Fragment state changed (alpha test, etc)
		vertex_state_dirty = (1 << 5),             // Vertex state changed (scale_offset, clip planes, etc)
		transform_constants_dirty = (1 << 6),      // Transform constants changed
		fragment_constants_dirty = (1 << 7),       // Fragment constants changed
		framebuffer_reads_dirty = (1 << 8),        // Framebuffer contents changed
		fragment_texture_state_dirty = (1 << 9),   // Fragment texture parameters changed
		vertex_texture_state_dirty = (1 << 10),    // Vertex texture parameters changed
		scissor_config_state_dirty = (1 << 11),    // Scissor region changed
		zclip_config_state_dirty = (1 << 12),      // Viewport Z clip changed

		scissor_setup_invalid = (1 << 13),         // Scissor configuration is broken
		scissor_setup_clipped = (1 << 14),         // Scissor region is cropped by viewport constraint

		polygon_stipple_pattern_dirty = (1 << 15), // Rasterizer stippling pattern changed
		line_stipple_pattern_dirty = (1 << 16),    // Line stippling pattern changed

		push_buffer_arrays_dirty = (1 << 17),      // Push buffers have data written to them (immediate mode vertex buffers)

		polygon_offset_state_dirty = (1 << 18),    // Polygon offset config was changed
		depth_bounds_state_dirty = (1 << 19),      // Depth bounds configuration changed

		pipeline_config_dirty = (1 << 20),         // Generic pipeline configuration changes. Shader peek hint.

		rtt_config_dirty = (1 << 21),              // Render target configuration changed
		rtt_config_contested = (1 << 22),          // Render target configuration is indeterminate
		rtt_config_valid = (1 << 23),              // Render target configuration is valid
		rtt_cache_state_dirty = (1 << 24),         // Texture cache state is indeterminate

		xform_instancing_state_dirty = (1 << 25),  // Transform instancing state has changed

		// Aggregate masks
		fragment_program_dirty = fragment_program_ucode_dirty | fragment_program_state_dirty,
		vertex_program_dirty = vertex_program_ucode_dirty | vertex_program_state_dirty,
		invalidate_pipeline_bits = fragment_program_dirty | vertex_program_dirty | xform_instancing_state_dirty,
		invalidate_zclip_bits = vertex_state_dirty | zclip_config_state_dirty,
		memory_barrier_bits = framebuffer_reads_dirty,

		// Vulkan-specific signals
		invalidate_vk_dynamic_state = zclip_config_state_dirty | scissor_config_state_dirty | polygon_offset_state_dirty | depth_bounds_state_dirty,

		all_dirty = ~0u
	};
}

View File

@ -513,7 +513,7 @@ void GLGSRender::emit_geometry(u32 sub_index)
if (vertex_state & rsx::vertex_arrays_changed)
{
m_draw_processor.analyse_inputs_interleaved(m_vertex_layout, current_vp_metadata);
analyse_inputs_interleaved(m_vertex_layout);
}
else if (vertex_state & rsx::vertex_base_changed)
{

View File

@ -840,8 +840,8 @@ void GLGSRender::load_program_env()
// Vertex state
auto mapping = m_vertex_env_buffer->alloc_from_heap(144, m_uniform_buffer_offset_align);
auto buf = static_cast<u8*>(mapping.first);
m_draw_processor.fill_scale_offset_data(buf, false);
m_draw_processor.fill_user_clip_data(buf + 64);
fill_scale_offset_data(buf, false);
fill_user_clip_data(buf + 64);
*(reinterpret_cast<u32*>(buf + 128)) = rsx::method_registers.transform_branch_bits();
*(reinterpret_cast<f32*>(buf + 132)) = rsx::method_registers.point_size() * rsx::get_resolution_scale();
*(reinterpret_cast<f32*>(buf + 136)) = rsx::method_registers.clip_min();
@ -887,7 +887,7 @@ void GLGSRender::load_program_env()
// Fragment state
auto mapping = m_fragment_env_buffer->alloc_from_heap(32, m_uniform_buffer_offset_align);
auto buf = static_cast<u8*>(mapping.first);
m_draw_processor.fill_fragment_state_buffer(buf, current_fragment_program);
fill_fragment_state_buffer(buf, current_fragment_program);
m_fragment_env_buffer->bind_range(GL_FRAGMENT_STATE_BIND_SLOT, mapping.second, 32);
}
@ -988,7 +988,7 @@ void GLGSRender::upload_transform_constants(const rsx::io_buffer& buffer)
: std::span<const u16>(m_vertex_prog->constant_ids);
buffer.reserve(transform_constants_size);
m_draw_processor.fill_vertex_program_constants_data(buffer.data(), constant_ids);
fill_vertex_program_constants_data(buffer.data(), constant_ids);
}
}
@ -1007,14 +1007,7 @@ void GLGSRender::update_vertex_env(const gl::vertex_upload_info& upload_info)
buf[1] = upload_info.vertex_index_offset;
buf += 4;
m_draw_processor.fill_vertex_layout_state(
m_vertex_layout,
current_vp_metadata,
upload_info.first_vertex,
upload_info.allocated_vertex_count,
reinterpret_cast<s32*>(buf),
upload_info.persistent_mapping_offset,
upload_info.volatile_mapping_offset);
fill_vertex_layout_state(m_vertex_layout, upload_info.first_vertex, upload_info.allocated_vertex_count, reinterpret_cast<s32*>(buf), upload_info.persistent_mapping_offset, upload_info.volatile_mapping_offset);
m_vertex_layout_buffer->bind_range(GL_VERTEX_LAYOUT_BIND_SLOT, mapping.second, 128 + 16);

View File

@ -153,7 +153,7 @@ gl::vertex_upload_info GLGSRender::set_vertex_buffer()
m_profiler.start();
//Write index buffers and count verts
auto result = std::visit(draw_command_visitor(*m_index_ring_buffer, m_vertex_layout), m_draw_processor.get_draw_command(rsx::method_registers));
auto result = std::visit(draw_command_visitor(*m_index_ring_buffer, m_vertex_layout), get_draw_command(rsx::method_registers));
const u32 vertex_count = (result.max_index - result.min_index) + 1;
u32 vertex_base = result.min_index;
@ -250,7 +250,7 @@ gl::vertex_upload_info GLGSRender::set_vertex_buffer()
}
//Write all the data
m_draw_processor.write_vertex_data_to_memory(m_vertex_layout, vertex_base, vertex_count, persistent_mapping.first, volatile_mapping.first);
write_vertex_data_to_memory(m_vertex_layout, vertex_base, vertex_count, persistent_mapping.first, volatile_mapping.first);
m_frame_stats.vertex_upload_time += m_profiler.duration();
return upload_info;

View File

@ -89,52 +89,6 @@ namespace rsx
}
}
bool draw_clause::check_trivially_instanced() const
{
	// A single pass (or none) can never be collapsed into an instanced draw.
	if (pass_count() <= 1)
	{
		return false;
	}

	// Host instancing requires every sub-draw to cover exactly the same vertex range.
	const auto& first_range = draw_command_ranges.front();
	for (const auto& candidate : draw_command_ranges)
	{
		const bool identical = (candidate.first == first_range.first && candidate.count == first_range.count);
		if (!identical)
		{
			return false;
		}
	}

	if (draw_command_barriers.empty())
	{
		// Identical repeated draws with no barriers in between is unexpected; log for investigation.
		rsx_log.error("Instanced draw detected, but no command barriers found!");
		return false;
	}

	// Barriers must exist, but may only involve transform constant updates (for now).
	for (const auto& barrier : draw_command_barriers)
	{
		if (barrier.type == rsx::transform_constant_load_modifier_barrier ||
			barrier.type == rsx::transform_constant_update_barrier)
		{
			continue;
		}

		ensure(barrier.draw_id < ::size32(draw_command_ranges));
		if (draw_command_ranges[barrier.draw_id].count == 0)
		{
			// Dangling command barriers are ignored. We're also at the end of the command, so abort.
			break;
		}

		// Fail. Only transform constant instancing is supported at the moment.
		return false;
	}

	return true;
}
void draw_clause::reset(primitive_type type)
{
current_range_index = ~0u;
@ -143,7 +97,6 @@ namespace rsx
command = draw_command::none;
primitive = type;
primitive_barrier_enable = false;
is_trivial_instanced_draw = false;
draw_command_ranges.clear();
draw_command_barriers.clear();
@ -152,7 +105,7 @@ namespace rsx
is_disjoint_primitive = is_primitive_disjointed(primitive);
}
u32 draw_clause::execute_pipeline_dependencies(context* ctx, instanced_draw_config_t* instance_config) const
u32 draw_clause::execute_pipeline_dependencies(context* ctx) const
{
u32 result = 0u;
for (;
@ -198,20 +151,7 @@ namespace rsx
// Update transform constants
auto ptr = RSX(ctx)->fifo_ctrl->translate_address(barrier.arg0);
auto buffer = std::span<const u32>(static_cast<const u32*>(vm::base(ptr)), barrier.arg1);
auto notify = [&](rsx::context*, u32 load, u32 count)
{
if (!instance_config)
{
return false;
}
instance_config->transform_constants_data_changed = true;
instance_config->patch_load_offset = load;
instance_config->patch_load_count = count;
return true;
};
nv4097::set_transform_constant::batch_decode(ctx, NV4097_SET_TRANSFORM_CONSTANT + barrier.index, buffer, notify);
nv4097::set_transform_constant::batch_decode(ctx, NV4097_SET_TRANSFORM_CONSTANT + barrier.index, buffer);
result |= transform_constants_changed;
break;
}

View File

@ -7,14 +7,6 @@
namespace rsx
{
struct instanced_draw_config_t
{
bool transform_constants_data_changed;
u32 patch_load_offset;
u32 patch_load_count;
};
class draw_clause
{
// Stores the first and count argument from draw/draw indexed parameters between begin/end clauses.
@ -59,8 +51,6 @@ namespace rsx
}
}
bool check_trivially_instanced() const;
public:
primitive_type primitive{};
draw_command command{};
@ -69,7 +59,6 @@ namespace rsx
bool is_disjoint_primitive{}; // Set if primitive type does not rely on adjacency information
bool primitive_barrier_enable{}; // Set once to signal that a primitive restart barrier can be inserted
bool is_rendering{}; // Set while we're actually pushing the draw calls to host GPU
bool is_trivial_instanced_draw{}; // Set if the draw call can be executed on the host GPU as a single instanced draw.
simple_array<u32> inline_vertex_array{};
@ -84,8 +73,8 @@ namespace rsx
{
// End draw call append mode
current_range_index = ~0u;
// Check if we can instance on host
is_trivial_instanced_draw = check_trivially_instanced();
// TODO
}
/**
@ -280,7 +269,7 @@ namespace rsx
/**
* Executes commands reqiured to make the current draw state valid
*/
u32 execute_pipeline_dependencies(struct context* ctx, instanced_draw_config_t* instance_config = nullptr) const;
u32 execute_pipeline_dependencies(struct context* ctx) const;
const draw_range_t& get_range() const
{

View File

@ -18,7 +18,7 @@ namespace rsx
// NOTE: Push buffers still behave like register writes.
// You do not need to specify each attribute for each vertex, the register is referenced instead.
// This is classic OpenGL 1.x behavior as I remember.
RSX(ctx)->GRAPH_frontend().append_to_push_buffer(attrib_index, count, channel_select, vtype, value);
RSX(ctx)->append_to_push_buffer(attrib_index, count, channel_select, vtype, value);
}
auto& info = REGS(ctx)->register_vertex_info[attrib_index];

View File

@ -30,7 +30,7 @@ namespace rsx
REGS(ctx)->transform_constants[load + constant_id][subreg] = arg;
}
void set_transform_constant::batch_decode(context* ctx, u32 reg, const std::span<const u32>& args, const std::function<bool(context*, u32, u32)>& notify)
void set_transform_constant::batch_decode(context* ctx, u32 reg, const std::span<const u32>& args)
{
const u32 index = reg - NV4097_SET_TRANSFORM_CONSTANT;
const u32 constant_id = index / 4;
@ -40,15 +40,8 @@ namespace rsx
auto dst = &REGS(ctx)->transform_constants[load + constant_id][subreg];
copy_data_swap_u32(dst, args.data(), ::size32(args));
// Notify
const u32 last_constant_id = ((reg + ::size32(args) + 3) - NV4097_SET_TRANSFORM_CONSTANT) / 4; // Aligned div
const u32 load_index = load + constant_id;
const u32 load_count = last_constant_id - constant_id;
if (!notify || !notify(ctx, load_index, load_count))
{
RSX(ctx)->patch_transform_constants(ctx, load_index, load_count);
}
RSX(ctx)->patch_transform_constants(ctx, load + constant_id, last_constant_id - constant_id);
}
void set_transform_constant::impl(context* ctx, u32 reg, [[maybe_unused]] u32 arg)
@ -263,15 +256,15 @@ namespace rsx
{
if (RSX(ctx)->in_begin_end)
{
RSX(ctx)->GRAPH_frontend().append_array_element(arg & 0xFFFF);
RSX(ctx)->GRAPH_frontend().append_array_element(arg >> 16);
RSX(ctx)->append_array_element(arg & 0xFFFF);
RSX(ctx)->append_array_element(arg >> 16);
}
}
void set_array_element32(context* ctx, u32, u32 arg)
{
if (RSX(ctx)->in_begin_end)
RSX(ctx)->GRAPH_frontend().append_array_element(arg);
RSX(ctx)->append_array_element(arg);
}
void draw_arrays(context* /*rsx*/, u32 /*reg*/, u32 arg)
@ -360,8 +353,8 @@ namespace rsx
// Check if we have immediate mode vertex data in a driver-local buffer
if (REGS(ctx)->current_draw_clause.command == rsx::draw_command::none)
{
const u32 push_buffer_vertices_count = RSX(ctx)->GRAPH_frontend().get_push_buffer_vertex_count();
const u32 push_buffer_index_count = RSX(ctx)->GRAPH_frontend().get_push_buffer_index_count();
const u32 push_buffer_vertices_count = RSX(ctx)->get_push_buffer_vertex_count();
const u32 push_buffer_index_count = RSX(ctx)->get_push_buffer_index_count();
// Need to set this flag since it overrides some register contents
REGS(ctx)->current_draw_clause.is_immediate_draw = true;
@ -393,12 +386,6 @@ namespace rsx
return;
}
// Notify the backend if the drawing style changes (instanced vs non-instanced)
if (REGS(ctx)->current_draw_clause.is_trivial_instanced_draw != RSX(ctx)->is_current_vertex_program_instanced())
{
RSX(ctx)->m_graphics_state |= rsx::pipeline_state::xform_instancing_state_dirty;
}
RSX(ctx)->end();
}
else

View File

@ -204,7 +204,7 @@ namespace rsx
static void decode_one(context* ctx, u32 reg, u32 arg);
static void batch_decode(context* ctx, u32 reg, const std::span<const u32>& args, const std::function<bool(context*, u32, u32)>& notify = {});
static void batch_decode(context* ctx, u32 reg, const std::span<const u32>& args);
};
struct set_transform_program

View File

@ -269,7 +269,6 @@ public:
struct
{
// Configuration properties (in)
u16 in_register_mask = 0;
u16 common_access_sampler_mask = 0;
@ -277,7 +276,6 @@ public:
u16 redirected_sampler_mask = 0;
u16 multisampled_sampler_mask = 0;
// Decoded properties (out)
bool has_lit_op = false;
bool has_gather_op = false;
bool has_no_output = false;

View File

@ -262,11 +262,6 @@ namespace glsl
}
}
if (props.require_instanced_render)
{
enabled_options.push_back("_ENABLE_INSTANCED_CONSTANTS");
}
// Import vertex header
program_common::define_glsl_switches(OS, enabled_options);

View File

@ -55,23 +55,4 @@ vec4 apply_zclip_xform(
}
#endif
#if defined(_ENABLE_INSTANCED_CONSTANTS)
// Workaround for GL vs VK builtin variable naming
#ifdef VULKAN
#define _gl_InstanceID gl_InstanceIndex
#else
#define _gl_InstanceID gl_InstanceID
#endif
vec4 _fetch_constant(const in int base_offset)
{
// Get virtual draw/instance id. Normally will be 1:1 based on instance index
const int indirection_offset = (_gl_InstanceID * CONSTANTS_ARRAY_LENGTH) + base_offset;
const int corrected_offset = constants_addressing_lookup[indirection_offset];
return instanced_constants_array[corrected_offset];
}
#else
#define _fetch_constant(x) vc[x]
#endif
)"

View File

@ -22,7 +22,6 @@ namespace glsl
// Applicable in vertex stage
bool require_lit_emulation : 1;
bool require_explicit_invariance : 1;
bool require_instanced_render : 1;
bool emulate_zclip_transform : 1;
bool emulate_depth_clip_only : 1;

View File

@ -341,7 +341,6 @@ vertex_program_utils::vertex_program_metadata vertex_program_utils::analyse_vert
usz vertex_program_storage_hash::operator()(const RSXVertexProgram &program) const
{
usz hash = vertex_program_utils::get_vertex_program_ucode_hash(program);
hash ^= program.ctrl;
hash ^= program.output_mask;
hash ^= program.texture_state.texture_dimensions;
hash ^= program.texture_state.multisampled_textures;
@ -352,8 +351,6 @@ bool vertex_program_compare::operator()(const RSXVertexProgram &binary1, const R
{
if (binary1.output_mask != binary2.output_mask)
return false;
if (binary1.ctrl != binary2.ctrl)
return false;
if (binary1.texture_state != binary2.texture_state)
return false;
if (binary1.data.size() != binary2.data.size())

View File

@ -131,7 +131,7 @@ std::string VertexProgramDecompiler::GetSRC(const u32 n)
m_parr.AddParam(PF_PARAM_UNIFORM, float4, std::string("vc[468]"));
properties.has_indexed_constants |= !!d3.index_const;
m_constant_ids.insert(static_cast<u16>(d1.const_src));
fmt::append(ret, "_fetch_constant(%u%s)", d1.const_src, (d3.index_const ? " + " + AddAddrReg() : ""));
ret += std::string("vc[") + std::to_string(d1.const_src) + (d3.index_const ? " + " + AddAddrReg() : "") + "]";
break;
default:
@ -362,13 +362,14 @@ std::string VertexProgramDecompiler::NotZeroPositive(const std::string& code)
std::string VertexProgramDecompiler::BuildCode()
{
std::string main_body;
for (int i = 0, lvl = 1; i < static_cast<int>(m_instr_count); i++)
for (uint i = 0, lvl = 1; i < m_instr_count; i++)
{
lvl = std::max<int>(lvl - m_instructions[i].close_scopes, 0);
lvl -= m_instructions[i].close_scopes;
if (lvl < 1) lvl = 1;
for (int j = 0; j < m_instructions[i].put_close_scopes; ++j)
{
if (lvl > 1) --lvl;
--lvl;
if (lvl < 1) lvl = 1;
main_body.append(lvl, '\t') += "}\n";
}
@ -379,8 +380,6 @@ std::string VertexProgramDecompiler::BuildCode()
lvl++;
}
ensure(lvl >= 0); // Underflow of indent level will cause crashes!!
for (const auto& instruction_body : m_instructions[i].body)
{
main_body.append(lvl, '\t') += instruction_body + "\n";
@ -410,7 +409,7 @@ std::string VertexProgramDecompiler::BuildCode()
{
const auto i = offset++;
if (i == index) continue; // Replace with self
reloc_table.emplace_back(fmt::format("_fetch_constant(%d)", index), fmt::format("_fetch_constant(%d)", i));
reloc_table.emplace_back(fmt::format("vc[%d]", index), fmt::format("vc[%d]", i));
}
// One-time patch

View File

@ -132,10 +132,6 @@ protected:
public:
struct
{
// Configuration properties (in)
// None
// Decoded properties (out)
bool has_lit_op = false;
bool has_indexed_constants = false;
}

View File

@ -409,13 +409,12 @@ namespace rsx
}
};
const auto element_push_buffer = render->draw_processor()->element_push_buffer();
if (index_size == 4)
{
if (!element_push_buffer.empty()) [[unlikely]]
if (!render->element_push_buffer.empty()) [[unlikely]]
{
// Indices provided via immediate mode
re_evaluate(reinterpret_cast<const std::byte*>(element_push_buffer.data()), u32{});
re_evaluate(reinterpret_cast<const std::byte*>(render->element_push_buffer.data()), u32{});
}
else
{
@ -425,10 +424,10 @@ namespace rsx
}
else
{
if (!element_push_buffer.empty()) [[unlikely]]
if (!render->element_push_buffer.empty()) [[unlikely]]
{
// Indices provided via immediate mode
re_evaluate(reinterpret_cast<const std::byte*>(element_push_buffer.data()), u16{});
re_evaluate(reinterpret_cast<const std::byte*>(render->element_push_buffer.data()), u16{});
}
else
{
@ -620,12 +619,12 @@ namespace rsx
ar(rsx::method_registers);
for (auto& v : m_draw_processor.m_vertex_push_buffers)
for (auto& v : vertex_push_buffers)
{
ar(v.attr, v.size, v.type, v.vertex_count, v.dword_count, v.data);
}
ar(m_draw_processor.m_element_push_buffer, fifo_ret_addr, saved_fifo_ret, zcull_surface_active, m_surface_info, m_depth_surface_info, m_framebuffer_layout);
ar(element_push_buffer, fifo_ret_addr, saved_fifo_ret, zcull_surface_active, m_surface_info, m_depth_surface_info, m_framebuffer_layout);
ar(dma_address, iomap_table, restore_point, tiles, zculls, display_buffers, display_buffers_count, current_display_buffer);
ar(enable_second_vhandler, requested_vsync);
ar(device_addr, label_addr, main_mem_size, local_mem_size, rsx_event_port, driver_info);
@ -697,8 +696,6 @@ namespace rsx
s_ctx.rsxthr = this;
m_ctx = &s_ctx;
m_draw_processor.init(m_ctx);
if (g_cfg.misc.use_native_interface && (g_cfg.video.renderer == video_renderer::opengl || g_cfg.video.renderer == video_renderer::vulkan))
{
m_overlay_manager = g_fxo->init<rsx::overlays::display_manager>(0);
@ -804,6 +801,39 @@ namespace rsx
in_begin_end = true;
}
void thread::append_to_push_buffer(u32 attribute, u32 size, u32 subreg_index, vertex_base_type type, u32 value)
{
	// Writes targeting attributes masked out of the vertex input are discarded.
	const u32 input_mask = rsx::method_registers.vertex_attrib_input_mask();
	if ((input_mask & (1 << attribute)) == 0)
	{
		return;
	}

	// Enforce ATTR0 as the provoking vertex attribute for push buffers; the
	// vertex id is always derived from stream 0 so all attributes stay in lockstep.
	const auto provoking_vertex_id = vertex_push_buffers[0].get_vertex_id();
	vertex_push_buffers[attribute].set_vertex_data(attribute, provoking_vertex_id, subreg_index, type, size, value);

	// Flag push buffers as holding live data for this draw.
	m_graphics_state |= rsx::pipeline_state::push_buffer_arrays_dirty;
}
u32 thread::get_push_buffer_vertex_count() const
{
	// ATTR0 acts as the provoking attribute for push buffers, so its stream
	// holds the authoritative vertex count for the immediate-mode draw.
	const auto& provoking_stream = vertex_push_buffers[0];
	return provoking_stream.vertex_count;
}
void thread::append_array_element(u32 index)
{
	// Common upload code expects input in big-endian, so byteswap on append.
	// TODO: Implement fast upload path for LE inputs and do away with this
	const u32 be_index = std::bit_cast<u32, be_t<u32>>(index);
	element_push_buffer.push_back(be_index);
}
// Returns the number of indices queued via immediate-mode array element writes.
u32 thread::get_push_buffer_index_count() const
{
	return ::size32(element_push_buffer);
}
void thread::end()
{
if (capture_current_frame)
@ -820,7 +850,20 @@ namespace rsx
m_eng_interrupt_mask |= rsx::backend_interrupt;
ROP_sync_timestamp = rsx::get_shared_tag();
m_draw_processor.clear_push_buffers();
if (m_graphics_state & rsx::pipeline_state::push_buffer_arrays_dirty)
{
for (auto& push_buf : vertex_push_buffers)
{
//Disabled, see https://github.com/RPCS3/rpcs3/issues/1932
//rsx::method_registers.register_vertex_info[index].size = 0;
push_buf.clear();
}
m_graphics_state.clear(rsx::pipeline_state::push_buffer_arrays_dirty);
}
element_push_buffer.clear();
zcull_ctrl->on_draw();
@ -1154,6 +1197,180 @@ namespace rsx
state += cpu_flag::exit;
}
void thread::fill_scale_offset_data(void *buffer, bool flip_y) const
{
	// Streams a 4x4 row-major viewport scale/offset matrix into 'buffer' (64 bytes).
	// X/Y are normalized against half the surface clip dimensions.
	const int clip_w = rsx::method_registers.surface_clip_width();
	const int clip_h = rsx::method_registers.surface_clip_height();
	const float half_w = clip_w / 2.f;
	const float half_h = clip_h / 2.f;

	const float scale_x = rsx::method_registers.viewport_scale_x() / half_w;
	const float offset_x = (rsx::method_registers.viewport_offset_x() - half_w) / half_w;

	float scale_y = rsx::method_registers.viewport_scale_y() / half_h;
	float offset_y = (rsx::method_registers.viewport_offset_y() - half_h) / half_h;
	if (flip_y)
	{
		// Caller requested an inverted Y axis (e.g. differing window origin conventions).
		scale_y = -scale_y;
		offset_y = -offset_y;
	}

	const float scale_z = rsx::method_registers.viewport_scale_z();
	const float offset_z = rsx::method_registers.viewport_offset_z();
	const float one = 1.f;

	utils::stream_vector(buffer, std::bit_cast<u32>(scale_x), 0, 0, std::bit_cast<u32>(offset_x));
	utils::stream_vector(static_cast<char*>(buffer) + 16, 0, std::bit_cast<u32>(scale_y), 0, std::bit_cast<u32>(offset_y));
	utils::stream_vector(static_cast<char*>(buffer) + 32, 0, 0, std::bit_cast<u32>(scale_z), std::bit_cast<u32>(offset_z));
	utils::stream_vector(static_cast<char*>(buffer) + 48, 0, 0, 0, std::bit_cast<u32>(one));
}
void thread::fill_user_clip_data(void *buffer) const
{
const rsx::user_clip_plane_op clip_plane_control[6] =
{
rsx::method_registers.clip_plane_0_enabled(),
rsx::method_registers.clip_plane_1_enabled(),
rsx::method_registers.clip_plane_2_enabled(),
rsx::method_registers.clip_plane_3_enabled(),
rsx::method_registers.clip_plane_4_enabled(),
rsx::method_registers.clip_plane_5_enabled(),
};
u8 data_block[64];
s32* clip_enabled_flags = reinterpret_cast<s32*>(data_block);
f32* clip_distance_factors = reinterpret_cast<f32*>(data_block + 32);
for (int index = 0; index < 6; ++index)
{
switch (clip_plane_control[index])
{
default:
rsx_log.error("bad clip plane control (0x%x)", static_cast<u8>(clip_plane_control[index]));
[[fallthrough]];
case rsx::user_clip_plane_op::disable:
clip_enabled_flags[index] = 0;
clip_distance_factors[index] = 0.f;
break;
case rsx::user_clip_plane_op::greater_or_equal:
clip_enabled_flags[index] = 1;
clip_distance_factors[index] = 1.f;
break;
case rsx::user_clip_plane_op::less_than:
clip_enabled_flags[index] = 1;
clip_distance_factors[index] = -1.f;
break;
}
}
memcpy(buffer, data_block, 2 * 8 * sizeof(u32));
}
/**
 * Fill buffer with vertex program constants.
 * Buffer must be at least 512 float4 wide.
 */
void thread::fill_vertex_program_constants_data(void* buffer, const std::span<const u16>& reloc_table)
{
	if (reloc_table.empty()) [[ unlikely ]]
	{
		// No relocation info available; upload the whole constant register file.
		memcpy(buffer, rsx::method_registers.transform_constants.data(), 468 * 4 * sizeof(float));
		return;
	}

	// Gather only the constants referenced by the program, in relocation-table order.
	char* write_ptr = reinterpret_cast<char*>(buffer);
	for (const auto& constant_id : reloc_table)
	{
		utils::stream_vector_from_memory(write_ptr, &rsx::method_registers.transform_constants[constant_id]);
		write_ptr += 16;
	}
}
// Packs fragment-stage state into 'buffer' (32 bytes / two float4 registers):
//   [0]  fog param 0        [1] fog param 1
//   [2]  ROP control bits   [3] alpha test reference
//   [4]  reserved (0)       [5] fog equation mode
//   [6]  wpos scale         [7] wpos bias
void thread::fill_fragment_state_buffer(void* buffer, const RSXFragmentProgram& /*fragment_program*/)
{
	ROP_control_t rop_control{};

	// Alpha test: pass along the comparison function and enable the test bit.
	if (rsx::method_registers.alpha_test_enabled())
	{
		const u32 alpha_func = static_cast<u32>(rsx::method_registers.alpha_func());
		rop_control.set_alpha_test_func(alpha_func);
		rop_control.enable_alpha_test();
	}

	if (rsx::method_registers.polygon_stipple_enabled())
	{
		rop_control.enable_polygon_stipple();
	}

	// Only emulate A2C in the shader when the backend lacks hardware support.
	if (rsx::method_registers.msaa_alpha_to_coverage_enabled() && !backend_config.supports_hw_a2c)
	{
		// TODO: Properly support alpha-to-coverage and alpha-to-one behavior in shaders
		// Alpha values generate a coverage mask for order independent blending
		// Requires hardware AA to work properly (or just fragment sample stage in fragment shaders)
		// Simulated using combined alpha blend and alpha test
		rop_control.enable_alpha_to_coverage();

		if (rsx::method_registers.msaa_sample_mask())
		{
			rop_control.enable_MSAA_writes();
		}

		// Sample configuration bits
		switch (rsx::method_registers.surface_antialias())
		{
		case rsx::surface_antialiasing::center_1_sample:
			break;
		case rsx::surface_antialiasing::diagonal_centered_2_samples:
			rop_control.set_msaa_control(1u);
			break;
		default:
			rop_control.set_msaa_control(3u);
			break;
		}
	}

	const f32 fog0 = rsx::method_registers.fog_params_0();
	const f32 fog1 = rsx::method_registers.fog_params_1();
	const u32 fog_mode = static_cast<u32>(rsx::method_registers.fog_equation());

	// Check if framebuffer is actually an XRGB format and not a WZYX format
	switch (rsx::method_registers.surface_color())
	{
	case rsx::surface_color_format::w16z16y16x16:
	case rsx::surface_color_format::w32z32y32x32:
	case rsx::surface_color_format::x32:
		// These behave very differently from "normal" formats.
		break;
	default:
		// Integer framebuffer formats.
		rop_control.enable_framebuffer_INT();

		// Check if we want sRGB conversion.
		if (rsx::method_registers.framebuffer_srgb_enabled())
		{
			rop_control.enable_framebuffer_sRGB();
		}
		break;
	}

	// Generate wpos coefficients
	// wpos equation is now as follows:
	// wpos.y = (frag_coord / resolution_scale) * ((window_origin!=top)?-1.: 1.) + ((window_origin!=top)? window_height : 0)
	// wpos.x = (frag_coord / resolution_scale)
	// wpos.zw = frag_coord.zw

	const auto window_origin = rsx::method_registers.shader_window_origin();
	const u32 window_height = rsx::method_registers.shader_window_height();
	// Small surfaces below the configured threshold are rendered at native scale.
	const f32 resolution_scale = (window_height <= static_cast<u32>(g_cfg.video.min_scalable_dimension)) ? 1.f : rsx::get_resolution_scale();
	const f32 wpos_scale = (window_origin == rsx::window_origin::top) ? (1.f / resolution_scale) : (-1.f / resolution_scale);
	const f32 wpos_bias = (window_origin == rsx::window_origin::top) ? 0.f : window_height;
	const f32 alpha_ref = rsx::method_registers.alpha_ref();

	u32 *dst = static_cast<u32*>(buffer);
	utils::stream_vector(dst, std::bit_cast<u32>(fog0), std::bit_cast<u32>(fog1), rop_control.value, std::bit_cast<u32>(alpha_ref));
	utils::stream_vector(dst + 4, 0u, fog_mode, std::bit_cast<u32>(wpos_scale), std::bit_cast<u32>(wpos_bias));
}
u64 thread::timestamp()
{
const u64 freq = sys_time_get_timebase_frequency();
@ -1192,6 +1409,51 @@ namespace rsx
return t + timestamp_subvalue;
}
std::span<const std::byte> thread::get_raw_index_array(const draw_clause& draw_indexed_clause) const
{
	// Immediate-mode indices take precedence over the index array registers.
	if (!element_push_buffer.empty()) [[ unlikely ]]
	{
		return {reinterpret_cast<const std::byte*>(element_push_buffer.data()), ::narrow<u32>(element_push_buffer.size() * sizeof(u32))};
	}

	const rsx::index_array_type type = rsx::method_registers.index_type();
	const u32 type_size = get_index_type_size(type);

	// Force aligned indices as realhw
	const u32 base_address = get_address(rsx::method_registers.index_array_address(), rsx::method_registers.index_array_location());
	const u32 aligned_address = (0 - type_size) & base_address;

	// Window the view to the indices actually consumed by this clause.
	const u32 first = draw_indexed_clause.min_index();
	const u32 count = draw_indexed_clause.get_elements_count();

	const auto ptr = vm::_ptr<const std::byte>(aligned_address);
	return { ptr + first * type_size, count * type_size };
}
// Translates the active draw clause into a typed draw command variant for the backend.
std::variant<draw_array_command, draw_indexed_array_command, draw_inlined_array>
thread::get_draw_command(const rsx::rsx_state& state) const
{
	switch (rsx::method_registers.current_draw_clause.command)
	{
	case rsx::draw_command::indexed:
		// Indexed draws carry a raw view of the index data alongside the command.
		return draw_indexed_array_command
		{
			get_raw_index_array(state.current_draw_clause)
		};
	case rsx::draw_command::array:
		return draw_array_command{};
	case rsx::draw_command::inlined_array:
		return draw_inlined_array{};
	default:
		fmt::throw_exception("ill-formed draw command");
	}
}
void thread::do_local_task(FIFO::state state)
{
m_eng_interrupt_mask.clear(rsx::backend_interrupt);
@ -1987,17 +2249,6 @@ namespace rsx
void thread::get_current_vertex_program(const std::array<std::unique_ptr<rsx::sampled_image_descriptor_base>, rsx::limits::vertex_textures_count>& sampler_descriptors)
{
if (m_graphics_state.test(rsx::pipeline_state::xform_instancing_state_dirty))
{
current_vertex_program.ctrl = 0;
if (rsx::method_registers.current_draw_clause.is_trivial_instanced_draw)
{
current_vertex_program.ctrl |= RSX_SHADER_CONTROL_INSTANCED_CONSTANTS;
}
m_graphics_state.clear(rsx::pipeline_state::xform_instancing_state_dirty);
}
if (!m_graphics_state.test(rsx::pipeline_state::vertex_program_dirty))
{
return;
@ -2005,6 +2256,7 @@ namespace rsx
ensure(!m_graphics_state.test(rsx::pipeline_state::vertex_program_ucode_dirty));
current_vertex_program.output_mask = rsx::method_registers.vertex_attrib_output_mask();
current_vertex_program.ctrl = 0; // Reserved
for (u32 textures_ref = current_vp_metadata.referenced_textures_mask, i = 0; textures_ref; textures_ref >>= 1, ++i)
{
@ -2027,6 +2279,183 @@ namespace rsx
current_vertex_program.texture_state.import(current_vp_texture_state, current_vp_metadata.referenced_textures_mask);
}
// Scans the active vertex attribute state and partitions every referenced input
// into one of three placements recorded in 'result':
//   - persistent: backed by a vertex array in guest memory (grouped into interleaved blocks)
//   - transient:  supplied per-draw (push buffers, register values, inline arrays)
//   - none:       unused by the current vertex program
// Interleaved blocks group attributes that share one memory stream (same stride,
// overlapping base addresses) so they can be uploaded together.
void thread::analyse_inputs_interleaved(vertex_input_layout& result)
{
	const rsx_state& state = rsx::method_registers;
	// Only consider attributes both enabled in the input mask and referenced by the program.
	const u32 input_mask = state.vertex_attrib_input_mask() & current_vp_metadata.referenced_inputs_mask;

	result.clear();
	result.attribute_mask = static_cast<u16>(input_mask);

	if (state.current_draw_clause.command == rsx::draw_command::inlined_array)
	{
		// Inline arrays are always a single tightly-packed interleaved stream.
		interleaved_range_info& info = *result.alloc_interleaved_block();
		info.interleaved = true;

		for (u8 index = 0; index < rsx::limits::vertex_count; ++index)
		{
			auto &vinfo = state.vertex_arrays_info[index];
			result.attribute_placement[index] = attribute_buffer_placement::none;

			if (vinfo.size() > 0)
			{
				// Stride must be updated even if the stream is disabled
				info.attribute_stride += rsx::get_vertex_type_size_on_host(vinfo.type(), vinfo.size());
				info.locations.push_back({ index, false, 1 });

				if (input_mask & (1u << index))
				{
					result.attribute_placement[index] = attribute_buffer_placement::transient;
				}
			}
			else if (state.register_vertex_info[index].size > 0 && input_mask & (1u << index))
			{
				// Reads from register
				result.referenced_registers.push_back(index);
				result.attribute_placement[index] = attribute_buffer_placement::transient;
			}
		}

		if (info.attribute_stride)
		{
			// At least one array feed must be enabled for vertex input
			result.interleaved_blocks.push_back(&info);
		}

		return;
	}

	const u32 frequency_divider_mask = rsx::method_registers.frequency_divider_operation_mask();
	result.interleaved_blocks.reserve(16);
	result.referenced_registers.reserve(16);

	// Walk the input mask bit by bit; 'index' tracks the attribute slot.
	for (auto [ref_mask, index] = std::tuple{ input_mask, u8(0) }; ref_mask; ++index, ref_mask >>= 1)
	{
		ensure(index < rsx::limits::vertex_count);

		if (!(ref_mask & 1u))
		{
			// Nothing to do, uninitialized
			continue;
		}

		// Always reset attribute placement by default
		result.attribute_placement[index] = attribute_buffer_placement::none;

		// Check for interleaving
		if (rsx::method_registers.current_draw_clause.is_immediate_draw &&
			rsx::method_registers.current_draw_clause.command != rsx::draw_command::indexed)
		{
			// NOTE: In immediate rendering mode, all vertex setup is ignored
			// Observed with GT5, immediate render bypasses array pointers completely, even falling back to fixed-function register defaults
			if (vertex_push_buffers[index].vertex_count > 1)
			{
				// Ensure consistent number of vertices per attribute.
				vertex_push_buffers[index].pad_to(vertex_push_buffers[0].vertex_count, false);

				// Read temp buffer (register array)
				std::pair<u8, u32> volatile_range_info = std::make_pair(index, static_cast<u32>(vertex_push_buffers[index].data.size() * sizeof(u32)));
				result.volatile_blocks.push_back(volatile_range_info);
				result.attribute_placement[index] = attribute_buffer_placement::transient;
			}
			else if (state.register_vertex_info[index].size > 0)
			{
				// Reads from register
				result.referenced_registers.push_back(index);
				result.attribute_placement[index] = attribute_buffer_placement::transient;
			}

			// Fall back to the default register value if no source is specified via register
			continue;
		}

		const auto& info = state.vertex_arrays_info[index];
		if (!info.size())
		{
			if (state.register_vertex_info[index].size > 0)
			{
				//Reads from register
				result.referenced_registers.push_back(index);
				result.attribute_placement[index] = attribute_buffer_placement::transient;
				continue;
			}
		}
		else
		{
			result.attribute_placement[index] = attribute_buffer_placement::persistent;
			const u32 base_address = info.offset() & 0x7fffffff;
			bool alloc_new_block = true;
			bool modulo = !!(frequency_divider_mask & (1 << index));

			// Try to merge this attribute into an existing interleaved block.
			for (auto &block : result.interleaved_blocks)
			{
				if (block->single_vertex)
				{
					//Single vertex definition, continue
					continue;
				}

				if (block->attribute_stride != info.stride())
				{
					//Stride does not match, continue
					continue;
				}

				if (base_address > block->base_offset)
				{
					const u32 diff = base_address - block->base_offset;
					if (diff > info.stride())
					{
						//Not interleaved, continue
						continue;
					}
				}
				else
				{
					const u32 diff = block->base_offset - base_address;
					if (diff > info.stride())
					{
						//Not interleaved, continue
						continue;
					}

					//Matches, and this address is lower than existing
					block->base_offset = base_address;
				}

				alloc_new_block = false;
				block->locations.push_back({ index, modulo, info.frequency() });
				block->interleaved = true;
				break;
			}

			if (alloc_new_block)
			{
				// No compatible block found; start a new stream for this attribute.
				interleaved_range_info& block = *result.alloc_interleaved_block();
				block.base_offset = base_address;
				block.attribute_stride = info.stride();
				block.memory_location = info.offset() >> 31;
				block.locations.reserve(16);
				block.locations.push_back({ index, modulo, info.frequency() });

				if (block.attribute_stride == 0)
				{
					// Zero stride means a single repeated vertex; derive the host stride from the type.
					block.single_vertex = true;
					block.attribute_stride = rsx::get_vertex_type_size_on_host(info.type(), info.size());
				}

				result.interleaved_blocks.push_back(&block);
			}
		}
	}

	for (auto &info : result.interleaved_blocks)
	{
		//Calculate real data address to be used during upload
		info->real_offset_address = rsx::get_address(rsx::get_vertex_offset_from_base(state.vertex_data_base_offset(), info->base_offset), info->memory_location);
	}
}
void thread::get_current_fragment_program(const std::array<std::unique_ptr<rsx::sampled_image_descriptor_base>, rsx::limits::fragment_textures_count>& sampler_descriptors)
{
if (!m_graphics_state.test(rsx::pipeline_state::fragment_program_dirty))
@ -2347,6 +2776,267 @@ namespace rsx
return std::make_pair(persistent_memory_size, volatile_memory_size);
}
// Builds the per-attribute descriptor table consumed by the vertex fetch logic in the shaders.
// Writes one 64-bit descriptor (two consecutive s32 words) per attribute slot into 'buffer'
// for every input referenced by the current vertex program.
// 'persistent_offset_base' / 'volatile_offset_base' are the starting offsets of the persistent
// (local-memory arrays) and volatile (push buffer / register) upload windows respectively.
void thread::fill_vertex_layout_state(const vertex_input_layout& layout, u32 first_vertex, u32 vertex_count, s32* buffer, u32 persistent_offset_base, u32 volatile_offset_base)
{
	std::array<s32, 16> offset_in_block = {};
	u32 volatile_offset = volatile_offset_base;
	u32 persistent_offset = persistent_offset_base;

	//NOTE: Order is important! Transient layout is always push_buffers followed by register data
	if (rsx::method_registers.current_draw_clause.is_immediate_draw)
	{
		// Immediate-mode push buffers land first in the volatile window
		for (const auto &info : layout.volatile_blocks)
		{
			offset_in_block[info.first] = volatile_offset;
			volatile_offset += info.second;
		}
	}

	// Register inputs follow; each register is a fixed 16 bytes (vec4)
	for (u8 index : layout.referenced_registers)
	{
		offset_in_block[index] = volatile_offset;
		volatile_offset += 16;
	}

	if (rsx::method_registers.current_draw_clause.command == rsx::draw_command::inlined_array)
	{
		// Inlined arrays are fully interleaved in a single block; compute each
		// attribute's offset within one interleaved vertex record
		const auto &block = layout.interleaved_blocks[0];
		u32 inline_data_offset = volatile_offset;
		for (const auto& attrib : block->locations)
		{
			auto &info = rsx::method_registers.vertex_arrays_info[attrib.index];
			offset_in_block[attrib.index] = inline_data_offset;
			inline_data_offset += rsx::get_vertex_type_size_on_host(info.type(), info.size());
		}
	}
	else
	{
		// Persistent (local memory) attributes: offset = block base + delta of the
		// attribute's start address from the block's lowest address
		for (const auto &block : layout.interleaved_blocks)
		{
			for (const auto& attrib : block->locations)
			{
				const u32 local_address = (rsx::method_registers.vertex_arrays_info[attrib.index].offset() & 0x7fffffff);
				offset_in_block[attrib.index] = persistent_offset + (local_address - block->base_offset);
			}

			const auto range = block->calculate_required_range(first_vertex, vertex_count);
			persistent_offset += block->attribute_stride * range.second;
		}
	}

	// Fill the data
	// Each descriptor field is 64 bits wide
	// [0-8] attribute stride
	// [8-24] attribute divisor
	// [24-27] attribute type
	// [27-30] attribute size
	// [30-31] reserved
	// [31-60] starting offset
	// [60-61] swap bytes flag
	// [61-62] volatile flag
	// [62-63] modulo enable flag

	// Masks below apply to the second word (attrib1), hence the 29/30/31 bit positions
	const s32 default_frequency_mask = (1 << 8);
	const s32 swap_storage_mask = (1 << 29);
	const s32 volatile_storage_mask = (1 << 30);
	const s32 modulo_op_frequency_mask = smin;

	const u32 modulo_mask = rsx::method_registers.frequency_divider_operation_mask();
	const auto max_index = (first_vertex + vertex_count) - 1;

	for (u16 ref_mask = current_vp_metadata.referenced_inputs_mask, index = 0; ref_mask; ++index, ref_mask >>= 1)
	{
		if (!(ref_mask & 1u))
		{
			// Unused input, ignore this
			continue;
		}

		if (layout.attribute_placement[index] == attribute_buffer_placement::none)
		{
			// Referenced by the program but no data source bound; zero the descriptor
			static constexpr u64 zero = 0;
			std::memcpy(buffer + index * 2, &zero, sizeof(zero));
			continue;
		}

		rsx::vertex_base_type type = {};
		s32 size = 0;
		s32 attrib0 = 0;
		s32 attrib1 = 0;

		if (layout.attribute_placement[index] == attribute_buffer_placement::transient)
		{
			if (rsx::method_registers.current_draw_clause.command == rsx::draw_command::inlined_array)
			{
				const auto &info = rsx::method_registers.vertex_arrays_info[index];

				if (!info.size())
				{
					// Register
					const auto& reginfo = rsx::method_registers.register_vertex_info[index];
					type = reginfo.type;
					size = reginfo.size;

					attrib0 = rsx::get_vertex_type_size_on_host(type, size);
				}
				else
				{
					// Array
					type = info.type();
					size = info.size();

					// Stride of the single interleaved inline block applies to every member
					attrib0 = layout.interleaved_blocks[0]->attribute_stride | default_frequency_mask;
				}
			}
			else
			{
				// Data is either from an immediate render or register input
				// Immediate data overrides register input

				if (rsx::method_registers.current_draw_clause.is_immediate_draw &&
					vertex_push_buffers[index].vertex_count > 1)
				{
					// Push buffer
					const auto &info = vertex_push_buffers[index];
					type = info.type;
					size = info.size;

					attrib0 = rsx::get_vertex_type_size_on_host(type, size) | default_frequency_mask;
				}
				else
				{
					// Register
					const auto& info = rsx::method_registers.register_vertex_info[index];
					type = info.type;
					size = info.size;

					attrib0 = rsx::get_vertex_type_size_on_host(type, size);
				}
			}

			attrib1 |= volatile_storage_mask;
		}
		else
		{
			// Persistent array attribute sourced from local memory
			auto &info = rsx::method_registers.vertex_arrays_info[index];
			type = info.type();
			size = info.size();

			auto stride = info.stride();
			attrib0 = stride;

			if (stride > 0) //when stride is 0, input is not an array but a single element
			{
				const u32 frequency = info.frequency();
				switch (frequency)
				{
				case 0:
				case 1:
				{
					attrib0 |= default_frequency_mask;
					break;
				}
				default:
				{
					if (modulo_mask & (1 << index))
					{
						if (max_index >= frequency)
						{
							// Only set modulo mask if a modulo op is actually necessary!
							// This requires that the uploaded range for this attr = [0, freq-1]
							// Ignoring modulo op if the rendered range does not wrap allows for range optimization
							attrib0 |= (frequency << 8);
							attrib1 |= modulo_op_frequency_mask;
						}
						else
						{
							attrib0 |= default_frequency_mask;
						}
					}
					else
					{
						// Division
						attrib0 |= (frequency << 8);
					}
					break;
				}
				}
			}
		} //end attribute placement check

		// Special compressed 4 components into one 4-byte value. Decoded as one value.
		if (type == rsx::vertex_base_type::cmp)
		{
			size = 1;
		}

		// All data is passed in in PS3-native order (BE) so swap flag should be set
		attrib1 |= swap_storage_mask;
		attrib0 |= (static_cast<s32>(type) << 24);
		attrib0 |= (size << 27);
		attrib1 |= offset_in_block[index];

		buffer[index * 2 + 0] = attrib0;
		buffer[index * 2 + 1] = attrib1;
	}
}
// Copies the vertex data described by 'layout' into the caller-provided upload windows.
// 'persistent_data' receives interleaved array data pulled from guest local memory (via the
// DMA manager); 'volatile_data' receives transient data (inline arrays, push buffers and
// register values). Either destination may be null to skip that category.
// The write order here must match the offsets computed in fill_vertex_layout_state.
void thread::write_vertex_data_to_memory(const vertex_input_layout& layout, u32 first_vertex, u32 vertex_count, void *persistent_data, void *volatile_data)
{
	auto transient = static_cast<char*>(volatile_data);
	auto persistent = static_cast<char*>(persistent_data);

	auto &draw_call = rsx::method_registers.current_draw_clause;

	if (transient != nullptr)
	{
		if (draw_call.command == rsx::draw_command::inlined_array)
		{
			// Register values first (16 bytes each), then the raw inline array payload
			for (const u8 index : layout.referenced_registers)
			{
				memcpy(transient, rsx::method_registers.register_vertex_info[index].data.data(), 16);
				transient += 16;
			}

			memcpy(transient, draw_call.inline_vertex_array.data(), draw_call.inline_vertex_array.size() * sizeof(u32));
			//Is it possible to reference data outside of the inlined array?
			return;
		}

		//NOTE: Order is important! Transient layout is always push_buffers followed by register data
		if (draw_call.is_immediate_draw)
		{
			//NOTE: It is possible for immediate draw to only contain index data, so vertex data can be in persistent memory
			for (const auto &info : layout.volatile_blocks)
			{
				memcpy(transient, vertex_push_buffers[info.first].data.data(), info.second);
				transient += info.second;
			}
		}

		for (const u8 index : layout.referenced_registers)
		{
			memcpy(transient, rsx::method_registers.register_vertex_info[index].data.data(), 16);
			transient += 16;
		}
	}

	if (persistent != nullptr)
	{
		// Stream each interleaved block's required vertex range sequentially into the
		// persistent window; copies go through the async DMA manager
		for (interleaved_range_info* block : layout.interleaved_blocks)
		{
			auto range = block->calculate_required_range(first_vertex, vertex_count);

			const u32 data_size = range.second * block->attribute_stride;
			const u32 vertex_base = range.first * block->attribute_stride;

			g_fxo->get<rsx::dma_manager>().copy(persistent, vm::_ptr<char>(block->real_offset_address) + vertex_base, data_size);
			persistent += data_size;
		}
	}
}
void thread::flip(const display_flip_info_t& info)
{
m_eng_interrupt_mask.clear(rsx::display_interrupt);
@ -3006,7 +3696,7 @@ namespace rsx
u32 thread::get_load()
{
// Average load over around 30 frames
//Average load over around 30 frames
if (!performance_counters.last_update_timestamp || performance_counters.sampled_frames > 30)
{
const auto timestamp = get_system_time();

View File

@ -28,8 +28,6 @@
#include "Emu/IdManager.h"
#include "Core/RSXDisplay.h"
#include "Core/RSXDrawCommands.h"
#include "Core/RSXDriverState.h"
#include "Core/RSXFrameBuffer.h"
#include "Core/RSXContext.h"
#include "Core/RSXIOMap.hpp"
@ -61,6 +59,52 @@ namespace rsx
context_clear_all = context_clear_color | context_clear_depth
};
// Dirty-state bitfield tracking which parts of the graphics pipeline state must be
// re-evaluated or re-uploaded before the next draw. Combined masks at the bottom
// group related bits for bulk tests/clears.
enum pipeline_state : u32
{
	fragment_program_ucode_dirty = (1 << 0),   // Fragment program ucode changed
	vertex_program_ucode_dirty = (1 << 1),     // Vertex program ucode changed
	fragment_program_state_dirty = (1 << 2),   // Fragment program state changed
	vertex_program_state_dirty = (1 << 3),     // Vertex program state changed
	fragment_state_dirty = (1 << 4),           // Fragment state changed (alpha test, etc)
	vertex_state_dirty = (1 << 5),             // Vertex state changed (scale_offset, clip planes, etc)
	transform_constants_dirty = (1 << 6),      // Transform constants changed
	fragment_constants_dirty = (1 << 7),       // Fragment constants changed
	framebuffer_reads_dirty = (1 << 8),        // Framebuffer contents changed
	fragment_texture_state_dirty = (1 << 9),   // Fragment texture parameters changed
	vertex_texture_state_dirty = (1 << 10),    // Vertex texture parameters changed
	scissor_config_state_dirty = (1 << 11),    // Scissor region changed
	zclip_config_state_dirty = (1 << 12),      // Viewport Z clip changed

	scissor_setup_invalid = (1 << 13),         // Scissor configuration is broken
	scissor_setup_clipped = (1 << 14),         // Scissor region is cropped by viewport constraint

	polygon_stipple_pattern_dirty = (1 << 15), // Rasterizer stippling pattern changed
	line_stipple_pattern_dirty = (1 << 16),    // Line stippling pattern changed

	push_buffer_arrays_dirty = (1 << 17),      // Push buffers have data written to them (immediate mode vertex buffers)

	polygon_offset_state_dirty = (1 << 18),    // Polygon offset config was changed
	depth_bounds_state_dirty = (1 << 19),      // Depth bounds configuration changed

	pipeline_config_dirty = (1 << 20),         // Generic pipeline configuration changes. Shader peek hint.

	rtt_config_dirty = (1 << 21),              // Render target configuration changed
	rtt_config_contested = (1 << 22),          // Render target configuration is indeterminate
	rtt_config_valid = (1 << 23),              // Render target configuration is valid
	rtt_cache_state_dirty = (1 << 24),         // Texture cache state is indeterminate

	fragment_program_dirty = fragment_program_ucode_dirty | fragment_program_state_dirty,
	vertex_program_dirty = vertex_program_ucode_dirty | vertex_program_state_dirty,
	invalidate_pipeline_bits = fragment_program_dirty | vertex_program_dirty,
	invalidate_zclip_bits = vertex_state_dirty | zclip_config_state_dirty,
	memory_barrier_bits = framebuffer_reads_dirty,

	// Vulkan-specific signals
	invalidate_vk_dynamic_state = zclip_config_state_dirty | scissor_config_state_dirty | polygon_offset_state_dirty | depth_bounds_state_dirty,

	all_dirty = ~0u
};
enum eng_interrupt_reason : u32
{
backend_interrupt = 0x0001, // Backend-related interrupt
@ -117,6 +161,8 @@ namespace rsx
void cpu_task() override;
protected:
std::array<push_buffer_vertex_info, 16> vertex_push_buffers;
s32 m_skip_frame_ctr = 0;
bool skip_current_frame = false;
@ -171,9 +217,6 @@ namespace rsx
// Host DMA
std::unique_ptr<RSXDMAWriter> m_host_dma_ctrl;
// Draw call management
draw_command_processor m_draw_processor;
public:
atomic_t<u64> new_get_put = u64{umax};
u32 restore_point = 0;
@ -182,7 +225,7 @@ namespace rsx
atomic_t<u32> external_interrupt_lock{ 0 };
atomic_t<bool> external_interrupt_ack{ false };
atomic_t<u32> is_initialized{0};
rsx::simple_array<u32> element_push_buffer;
bool is_fifo_idle() const;
void flush_fifo();
@ -225,8 +268,6 @@ namespace rsx
void capture_frame(const std::string& name);
const backend_configuration& get_backend_config() const { return backend_config; }
const draw_command_processor* draw_processor() const { return &m_draw_processor; }
public:
shared_ptr<named_thread<ppu_thread>> intr_thread;
@ -260,6 +301,11 @@ namespace rsx
void get_framebuffer_layout(rsx::framebuffer_creation_context context, framebuffer_layout &layout);
bool get_scissor(areau& region, bool clip_viewport);
/**
* Analyze vertex inputs and group all interleaved blocks
*/
void analyse_inputs_interleaved(vertex_input_layout&);
RSXVertexProgram current_vertex_program = {};
RSXFragmentProgram current_fragment_program = {};
@ -378,6 +424,21 @@ namespace rsx
virtual void sync_hint(FIFO::interrupt_hint hint, reports::sync_hint_payload_t payload);
virtual bool release_GCM_label(u32 /*address*/, u32 /*value*/) { return false; }
std::span<const std::byte> get_raw_index_array(const draw_clause& draw_indexed_clause) const;
std::variant<draw_array_command, draw_indexed_array_command, draw_inlined_array>
get_draw_command(const rsx::rsx_state& state) const;
/**
* Immediate mode rendering requires a temp push buffer to hold attrib values
* Appends a value to the push buffer (currently only supports 32-wide types)
*/
void append_to_push_buffer(u32 attribute, u32 size, u32 subreg_index, vertex_base_type type, u32 value);
u32 get_push_buffer_vertex_count() const;
void append_array_element(u32 index);
u32 get_push_buffer_index_count() const;
protected:
/**
@ -387,6 +448,17 @@ namespace rsx
*/
std::pair<u32, u32> calculate_memory_requirements(const vertex_input_layout& layout, u32 first_vertex, u32 vertex_count);
/**
* Generates vertex input descriptors as an array of 16x4 s32s
*/
void fill_vertex_layout_state(const vertex_input_layout& layout, u32 first_vertex, u32 vertex_count, s32* buffer, u32 persistent_offset = 0, u32 volatile_offset = 0);
/**
* Uploads vertex data described in the layout descriptor
* Copies from local memory to the write-only output buffers provided in a sequential manner
*/
void write_vertex_data_to_memory(const vertex_input_layout& layout, u32 first_vertex, u32 vertex_count, void *persistent_data, void *volatile_data);
void evaluate_cpu_usage_reduction_limits();
private:
@ -396,8 +468,29 @@ namespace rsx
void handle_invalidated_memory_range();
public:
/**
* Fill buffer with 4x4 scale offset matrix.
* Vertex shader's position is to be multiplied by this matrix.
* if flip_y is set, the matrix is modified to use d3d convention.
*/
void fill_scale_offset_data(void *buffer, bool flip_y) const;
draw_command_processor& GRAPH_frontend() { return m_draw_processor; }
/**
* Fill buffer with user clip information
*/
void fill_user_clip_data(void *buffer) const;
/**
* Fill buffer with vertex program constants.
* Relocation table allows to do a partial fill with only selected registers.
*/
void fill_vertex_program_constants_data(void* buffer, const std::span<const u16>& reloc_table);
/**
* Fill buffer with fragment rasterization state.
* Fills current fog values, alpha test parameters and texture scaling parameters
*/
void fill_fragment_state_buffer(void* buffer, const RSXFragmentProgram& fragment_program);
/**
* Notify that a section of memory has been mapped
@ -424,17 +517,9 @@ namespace rsx
*/
virtual void on_semaphore_acquire_wait() {}
/**
* Load an image from memory with optional scaling and rotation.
* Returns false to tell the HW decoder to perform the operation on the CPU as a fallback when the operation cannot be safely accelerated.
*/
virtual bool scaled_image_from_memory(const blit_src_info& /*src_info*/, const blit_dst_info& /*dst_info*/, bool /*interpolate*/) { return false; }
// Program public "get" handlers
virtual std::pair<std::string, std::string> get_programs() const { return std::make_pair("", ""); }
bool is_current_vertex_program_instanced() const { return !!(current_vertex_program.ctrl & RSX_SHADER_CONTROL_INSTANCED_CONSTANTS); }
virtual bool scaled_image_from_memory(const blit_src_info& /*src_info*/, const blit_dst_info& /*dst_info*/, bool /*interpolate*/) { return false; }
public:
void reset();

View File

@ -730,7 +730,7 @@ void VKGSRender::emit_geometry(u32 sub_index)
if (state_flags & rsx::vertex_arrays_changed)
{
m_draw_processor.analyse_inputs_interleaved(m_vertex_layout, current_vp_metadata);
analyse_inputs_interleaved(m_vertex_layout);
}
else if (state_flags & rsx::vertex_base_changed)
{
@ -929,11 +929,7 @@ void VKGSRender::emit_geometry(u32 sub_index)
if (!upload_info.index_info)
{
if (draw_call.is_trivial_instanced_draw)
{
vkCmdDraw(*m_current_command_buffer, upload_info.vertex_draw_count, draw_call.pass_count(), 0, 0);
}
else if (draw_call.is_single_draw())
if (draw_call.is_single_draw())
{
vkCmdDraw(*m_current_command_buffer, upload_info.vertex_draw_count, 1, 0, 0);
}
@ -955,13 +951,10 @@ void VKGSRender::emit_geometry(u32 sub_index)
vkCmdBindIndexBuffer(*m_current_command_buffer, m_index_buffer_ring_info.heap->value, offset, index_type);
if (draw_call.is_trivial_instanced_draw)
if (rsx::method_registers.current_draw_clause.is_single_draw())
{
vkCmdDrawIndexed(*m_current_command_buffer, upload_info.vertex_draw_count, draw_call.pass_count(), 0, 0, 0);
}
else if (rsx::method_registers.current_draw_clause.is_single_draw())
{
vkCmdDrawIndexed(*m_current_command_buffer, upload_info.vertex_draw_count, 1, 0, 0, 0);
const u32 index_count = upload_info.vertex_draw_count;
vkCmdDrawIndexed(*m_current_command_buffer, index_count, 1, 0, 0, 0);
}
else
{
@ -1059,10 +1052,7 @@ void VKGSRender::end()
m_frame_stats.setup_time += m_profiler.duration();
// Apply write memory barriers
if (auto ds = std::get<1>(m_rtts.m_bound_depth_stencil))
{
ds->write_barrier(*m_current_command_buffer);
}
if (auto ds = std::get<1>(m_rtts.m_bound_depth_stencil)) ds->write_barrier(*m_current_command_buffer);
for (auto &rtt : m_rtts.m_bound_render_targets)
{
@ -1121,19 +1111,12 @@ void VKGSRender::end()
m_current_command_buffer->flags |= vk::command_buffer::cb_reload_dynamic_state;
}
auto& draw_call = rsx::method_registers.current_draw_clause;
draw_call.begin();
rsx::method_registers.current_draw_clause.begin();
do
{
emit_geometry(sub_index++);
if (draw_call.is_trivial_instanced_draw)
{
// We already completed. End the draw.
draw_call.end();
}
}
while (draw_call.next());
while (rsx::method_registers.current_draw_clause.next());
if (m_current_command_buffer->flags & vk::command_buffer::cb_has_conditional_render)
{

View File

@ -477,22 +477,6 @@ namespace
idx++;
bindings[idx].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
bindings[idx].descriptorCount = 1;
bindings[idx].stageFlags = VK_SHADER_STAGE_VERTEX_BIT;
bindings[idx].binding = binding_table.instancing_lookup_table_bind_slot;
bindings[idx].pImmutableSamplers = nullptr;
idx++;
bindings[idx].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
bindings[idx].descriptorCount = 1;
bindings[idx].stageFlags = VK_SHADER_STAGE_VERTEX_BIT;
bindings[idx].binding = binding_table.instancing_constants_buffer_slot;
bindings[idx].pImmutableSamplers = nullptr;
idx++;
for (auto binding = binding_table.textures_first_bind_slot;
binding < binding_table.vertex_textures_first_bind_slot;
binding++)
@ -659,7 +643,7 @@ VKGSRender::VKGSRender(utils::serial* ar) noexcept : GSRender(ar)
{ VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER , (num_fs_samplers + 4) },
// Conditional rendering predicate slot; refactor to allow skipping this when not needed
{ VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 3 }
{ VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1 }
};
m_descriptor_pool.create(*m_device, descriptor_type_sizes, max_draw_calls);
@ -677,7 +661,6 @@ VKGSRender::VKGSRender(utils::serial* ar) noexcept : GSRender(ar)
m_index_buffer_ring_info.create(VK_BUFFER_USAGE_INDEX_BUFFER_BIT, VK_INDEX_RING_BUFFER_SIZE_M * 0x100000, "index buffer");
m_texture_upload_buffer_ring_info.create(VK_BUFFER_USAGE_TRANSFER_SRC_BIT, VK_TEXTURE_UPLOAD_RING_BUFFER_SIZE_M * 0x100000, "texture upload buffer", 32 * 0x100000);
m_raster_env_ring_info.create(VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT, VK_UBO_RING_BUFFER_SIZE_M * 0x100000, "raster env buffer");
m_instancing_buffer_ring_info.create(VK_BUFFER_USAGE_STORAGE_BUFFER_BIT, VK_TRANSFORM_CONSTANTS_BUFFER_SIZE_M * 0x100000, "instancing data buffer");
const auto shadermode = g_cfg.video.shadermode.get();
@ -966,7 +949,6 @@ VKGSRender::~VKGSRender()
m_vertex_instructions_buffer.destroy();
m_fragment_instructions_buffer.destroy();
m_raster_env_ring_info.destroy();
m_instancing_buffer_ring_info.destroy();
// Fallback bindables
null_buffer.reset();
@ -1304,8 +1286,7 @@ void VKGSRender::check_heap_status(u32 flags)
m_fragment_constants_ring_info.is_critical() ||
m_transform_constants_ring_info.is_critical() ||
m_index_buffer_ring_info.is_critical() ||
m_raster_env_ring_info.is_critical() ||
m_instancing_buffer_ring_info.is_critical();
m_raster_env_ring_info.is_critical();
}
else
{
@ -1337,9 +1318,7 @@ void VKGSRender::check_heap_status(u32 flags)
heap_critical = m_vertex_layout_ring_info.is_critical();
break;
case VK_HEAP_CHECK_TRANSFORM_CONSTANTS_STORAGE:
heap_critical = (current_vertex_program.ctrl & RSX_SHADER_CONTROL_INSTANCED_CONSTANTS)
? m_instancing_buffer_ring_info.is_critical()
: m_transform_constants_ring_info.is_critical();
heap_critical = m_transform_constants_ring_info.is_critical();
break;
case VK_HEAP_CHECK_FRAGMENT_CONSTANTS_STORAGE:
heap_critical = m_fragment_constants_ring_info.is_critical();
@ -1382,7 +1361,6 @@ void VKGSRender::check_heap_status(u32 flags)
m_attrib_ring_info.reset_allocation_stats();
m_texture_upload_buffer_ring_info.reset_allocation_stats();
m_raster_env_ring_info.reset_allocation_stats();
m_instancing_buffer_ring_info.reset_allocation_stats();
m_current_frame->reset_heap_ptrs();
m_last_heap_sync_time = rsx::get_shared_tag();
}
@ -2152,7 +2130,6 @@ void VKGSRender::load_program_env()
const bool update_fragment_texture_env = !!(m_graphics_state & rsx::pipeline_state::fragment_texture_state_dirty);
const bool update_instruction_buffers = (!!m_interpreter_state && m_shader_interpreter.is_interpreter(m_program));
const bool update_raster_env = (rsx::method_registers.polygon_stipple_enabled() && !!(m_graphics_state & rsx::pipeline_state::polygon_stipple_pattern_dirty));
const bool update_instancing_data = rsx::method_registers.current_draw_clause.is_trivial_instanced_draw;
if (update_vertex_env)
{
@ -2162,8 +2139,8 @@ void VKGSRender::load_program_env()
const auto mem = m_vertex_env_ring_info.alloc<256>(256);
auto buf = static_cast<u8*>(m_vertex_env_ring_info.map(mem, 148));
m_draw_processor.fill_scale_offset_data(buf, false);
m_draw_processor.fill_user_clip_data(buf + 64);
fill_scale_offset_data(buf, false);
fill_user_clip_data(buf + 64);
*(reinterpret_cast<u32*>(buf + 128)) = rsx::method_registers.transform_branch_bits();
*(reinterpret_cast<f32*>(buf + 132)) = rsx::method_registers.point_size() * rsx::get_resolution_scale();
*(reinterpret_cast<f32*>(buf + 136)) = rsx::method_registers.clip_min();
@ -2173,32 +2150,7 @@ void VKGSRender::load_program_env()
m_vertex_env_buffer_info = { m_vertex_env_ring_info.heap->value, mem, 144 };
}
if (update_instancing_data)
{
// Combines transform load + instancing lookup table
const auto alignment = m_device->gpu().get_limits().minStorageBufferOffsetAlignment;
usz indirection_table_offset = 0;
usz constants_data_table_offset = 0;
rsx::io_buffer indirection_table_buf([&](usz size) -> std::pair<void*, usz>
{
indirection_table_offset = m_instancing_buffer_ring_info.alloc<1>(utils::align(size, alignment));
return std::make_pair(m_instancing_buffer_ring_info.map(indirection_table_offset, size), size);
});
rsx::io_buffer constants_array_buf([&](usz size) -> std::pair<void*, usz>
{
constants_data_table_offset = m_instancing_buffer_ring_info.alloc<1>(utils::align(size, alignment));
return std::make_pair(m_instancing_buffer_ring_info.map(constants_data_table_offset, size), size);
});
m_draw_processor.fill_constants_instancing_buffer(indirection_table_buf, constants_array_buf, *m_vertex_prog);
m_instancing_buffer_ring_info.unmap();
m_instancing_indirection_buffer_info = { m_instancing_buffer_ring_info.heap->value, indirection_table_offset, indirection_table_buf.size() };
m_instancing_constants_array_buffer_info = { m_instancing_buffer_ring_info.heap->value, constants_data_table_offset, constants_array_buf.size() };
}
else if (update_transform_constants)
if (update_transform_constants)
{
// Transform constants
usz mem_offset = 0;
@ -2248,7 +2200,7 @@ void VKGSRender::load_program_env()
auto mem = m_fragment_env_ring_info.alloc<256>(256);
auto buf = m_fragment_env_ring_info.map(mem, 32);
m_draw_processor.fill_fragment_state_buffer(buf, current_fragment_program);
fill_fragment_state_buffer(buf, current_fragment_program);
m_fragment_env_ring_info.unmap();
m_fragment_env_buffer_info = { m_fragment_env_ring_info.heap->value, mem, 32 };
}
@ -2343,24 +2295,13 @@ void VKGSRender::load_program_env()
m_program->bind_buffer({ predicate, 0, 4 }, binding_table.conditional_render_predicate_slot, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_current_frame->descriptor_set);
}
if (current_vertex_program.ctrl & RSX_SHADER_CONTROL_INSTANCED_CONSTANTS)
{
m_program->bind_buffer(m_instancing_indirection_buffer_info, binding_table.instancing_lookup_table_bind_slot, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_current_frame->descriptor_set);
m_program->bind_buffer(m_instancing_constants_array_buffer_info, binding_table.instancing_constants_buffer_slot, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_current_frame->descriptor_set);
}
// Clear flags
u32 handled_flags = rsx::pipeline_state::fragment_state_dirty |
m_graphics_state.clear(
rsx::pipeline_state::fragment_state_dirty |
rsx::pipeline_state::vertex_state_dirty |
rsx::pipeline_state::transform_constants_dirty |
rsx::pipeline_state::fragment_constants_dirty |
rsx::pipeline_state::fragment_texture_state_dirty;
if (!update_instancing_data)
{
handled_flags |= rsx::pipeline_state::transform_constants_dirty;
}
m_graphics_state.clear(handled_flags);
rsx::pipeline_state::fragment_texture_state_dirty);
}
void VKGSRender::upload_transform_constants(const rsx::io_buffer& buffer)
@ -2376,7 +2317,7 @@ void VKGSRender::upload_transform_constants(const rsx::io_buffer& buffer)
const auto constant_ids = (transform_constants_size == 8192)
? std::span<const u16>{}
: std::span<const u16>(m_vertex_prog->constant_ids);
m_draw_processor.fill_vertex_program_constants_data(buf, constant_ids);
fill_vertex_program_constants_data(buf, constant_ids);
}
}
@ -2419,14 +2360,8 @@ void VKGSRender::update_vertex_env(u32 id, const vk::vertex_upload_info& vertex_
const usz data_offset = (id * 128) + m_vertex_layout_stream_info.offset;
auto dst = m_vertex_layout_ring_info.map(data_offset, 128);
m_draw_processor.fill_vertex_layout_state(
m_vertex_layout,
current_vp_metadata,
vertex_info.first_vertex,
vertex_info.allocated_vertex_count,
static_cast<s32*>(dst),
vertex_info.persistent_window_offset,
vertex_info.volatile_window_offset);
fill_vertex_layout_state(m_vertex_layout, vertex_info.first_vertex, vertex_info.allocated_vertex_count, static_cast<s32*>(dst),
vertex_info.persistent_window_offset, vertex_info.volatile_window_offset);
m_vertex_layout_ring_info.unmap();
}
@ -2547,8 +2482,7 @@ void VKGSRender::close_and_submit_command_buffer(vk::fence* pFence, VkSemaphore
m_index_buffer_ring_info.is_dirty() ||
m_transform_constants_ring_info.is_dirty() ||
m_texture_upload_buffer_ring_info.is_dirty() ||
m_raster_env_ring_info.is_dirty() ||
m_instancing_buffer_ring_info.is_dirty())
m_raster_env_ring_info.is_dirty())
{
auto secondary_command_buffer = m_secondary_cb_list.next();
secondary_command_buffer->begin();
@ -2563,7 +2497,6 @@ void VKGSRender::close_and_submit_command_buffer(vk::fence* pFence, VkSemaphore
m_transform_constants_ring_info.sync(*secondary_command_buffer);
m_texture_upload_buffer_ring_info.sync(*secondary_command_buffer);
m_raster_env_ring_info.sync(*secondary_command_buffer);
m_instancing_buffer_ring_info.sync(*secondary_command_buffer);
secondary_command_buffer->end();

View File

@ -149,7 +149,6 @@ private:
vk::data_heap m_index_buffer_ring_info; // Index data
vk::data_heap m_texture_upload_buffer_ring_info; // Texture upload heap
vk::data_heap m_raster_env_ring_info; // Raster control such as polygon and line stipple
vk::data_heap m_instancing_buffer_ring_info; // Instanced rendering data (constants indirection table + instanced constants)
vk::data_heap m_fragment_instructions_buffer;
vk::data_heap m_vertex_instructions_buffer;
@ -161,8 +160,6 @@ private:
VkDescriptorBufferInfo m_fragment_constants_buffer_info {};
VkDescriptorBufferInfo m_fragment_texture_params_buffer_info {};
VkDescriptorBufferInfo m_raster_env_buffer_info {};
VkDescriptorBufferInfo m_instancing_indirection_buffer_info {};
VkDescriptorBufferInfo m_instancing_constants_array_buffer_info{};
VkDescriptorBufferInfo m_vertex_instructions_buffer_info {};
VkDescriptorBufferInfo m_fragment_instructions_buffer_info {};

View File

@ -197,7 +197,6 @@ namespace vk
s64 index_heap_ptr = 0;
s64 texture_upload_heap_ptr = 0;
s64 rasterizer_env_heap_ptr = 0;
s64 instancing_heap_ptr = 0;
u64 last_frame_sync_time = 0;
@ -219,7 +218,6 @@ namespace vk
index_heap_ptr = other.index_heap_ptr;
texture_upload_heap_ptr = other.texture_upload_heap_ptr;
rasterizer_env_heap_ptr = other.rasterizer_env_heap_ptr;
instancing_heap_ptr = other.instancing_heap_ptr;
}
// Exchange storage (non-copyable)
@ -231,7 +229,7 @@ namespace vk
void tag_frame_end(
s64 attrib_loc, s64 vtxenv_loc, s64 fragenv_loc, s64 vtxlayout_loc,
s64 fragtex_loc, s64 fragconst_loc, s64 vtxconst_loc, s64 index_loc,
s64 texture_loc, s64 rasterizer_loc, s64 instancing_loc)
s64 texture_loc, s64 rasterizer_loc)
{
attrib_heap_ptr = attrib_loc;
vtx_env_heap_ptr = vtxenv_loc;
@ -243,7 +241,6 @@ namespace vk
index_heap_ptr = index_loc;
texture_upload_heap_ptr = texture_loc;
rasterizer_env_heap_ptr = rasterizer_loc;
instancing_heap_ptr = instancing_loc;
last_frame_sync_time = rsx::get_shared_tag();
}

View File

@ -163,8 +163,7 @@ void VKGSRender::advance_queued_frames()
m_transform_constants_ring_info.get_current_put_pos_minus_one(),
m_index_buffer_ring_info.get_current_put_pos_minus_one(),
m_texture_upload_buffer_ring_info.get_current_put_pos_minus_one(),
m_raster_env_ring_info.get_current_put_pos_minus_one(),
m_instancing_buffer_ring_info.get_current_put_pos_minus_one());
m_raster_env_ring_info.get_current_put_pos_minus_one());
m_queued_frames.push_back(m_current_frame);
ensure(m_queued_frames.size() <= VK_MAX_ASYNC_FRAMES);
@ -267,8 +266,6 @@ void VKGSRender::frame_context_cleanup(vk::frame_context_t *ctx)
m_fragment_texture_params_ring_info.m_get_pos = ctx->frag_texparam_heap_ptr;
m_index_buffer_ring_info.m_get_pos = ctx->index_heap_ptr;
m_texture_upload_buffer_ring_info.m_get_pos = ctx->texture_upload_heap_ptr;
m_raster_env_ring_info.m_get_pos = ctx->rasterizer_env_heap_ptr;
m_instancing_buffer_ring_info.m_get_pos = ctx->instancing_heap_ptr;
m_attrib_ring_info.notify();
m_vertex_env_ring_info.notify();
@ -279,8 +276,6 @@ void VKGSRender::frame_context_cleanup(vk::frame_context_t *ctx)
m_fragment_texture_params_ring_info.notify();
m_index_buffer_ring_info.notify();
m_texture_upload_buffer_ring_info.notify();
m_raster_env_ring_info.notify();
m_instancing_buffer_ring_info.notify();
}
}

View File

@ -217,7 +217,7 @@ namespace
vk::vertex_upload_info VKGSRender::upload_vertex_data()
{
draw_command_visitor visitor(m_index_buffer_ring_info, m_vertex_layout);
auto result = std::visit(visitor, m_draw_processor.get_draw_command(rsx::method_registers));
auto result = std::visit(visitor, get_draw_command(rsx::method_registers));
const u32 vertex_count = (result.max_index - result.min_index) + 1;
u32 vertex_base = result.min_index;
@ -294,7 +294,7 @@ vk::vertex_upload_info VKGSRender::upload_vertex_data()
const usz volatile_offset_in_block = volatile_offset - persistent_offset;
void *block_mapping = m_attrib_ring_info.map(persistent_offset, block_size);
m_draw_processor.write_vertex_data_to_memory(m_vertex_layout, vertex_base, vertex_count, block_mapping, static_cast<char*>(block_mapping) + volatile_offset_in_block);
write_vertex_data_to_memory(m_vertex_layout, vertex_base, vertex_count, block_mapping, static_cast<char*>(block_mapping) + volatile_offset_in_block);
m_attrib_ring_info.unmap();
}
else
@ -302,14 +302,14 @@ vk::vertex_upload_info VKGSRender::upload_vertex_data()
if (required.first > 0 && persistent_offset != umax)
{
void *persistent_mapping = m_attrib_ring_info.map(persistent_offset, required.first);
m_draw_processor.write_vertex_data_to_memory(m_vertex_layout, vertex_base, vertex_count, persistent_mapping, nullptr);
write_vertex_data_to_memory(m_vertex_layout, vertex_base, vertex_count, persistent_mapping, nullptr);
m_attrib_ring_info.unmap();
}
if (required.second > 0)
{
void *volatile_mapping = m_attrib_ring_info.map(volatile_offset, required.second);
m_draw_processor.write_vertex_data_to_memory(m_vertex_layout, vertex_base, vertex_count, nullptr, volatile_mapping);
write_vertex_data_to_memory(m_vertex_layout, vertex_base, vertex_count, nullptr, volatile_mapping);
m_attrib_ring_info.unmap();
}
}

View File

@ -32,34 +32,31 @@ void VKVertexDecompilerThread::insertHeader(std::stringstream &OS)
OS << "#version 450\n\n";
OS << "#extension GL_ARB_separate_shader_objects : enable\n\n";
OS <<
"layout(std140, set = 0, binding = 0) uniform VertexContextBuffer\n"
"{\n"
" mat4 scale_offset_mat;\n"
" ivec4 user_clip_enabled[2];\n"
" vec4 user_clip_factor[2];\n"
" uint transform_branch_bits;\n"
" float point_size;\n"
" float z_near;\n"
" float z_far;\n"
"};\n\n";
OS << "layout(std140, set = 0, binding = 0) uniform VertexContextBuffer\n";
OS << "{\n";
OS << " mat4 scale_offset_mat;\n";
OS << " ivec4 user_clip_enabled[2];\n";
OS << " vec4 user_clip_factor[2];\n";
OS << " uint transform_branch_bits;\n";
OS << " float point_size;\n";
OS << " float z_near;\n";
OS << " float z_far;\n";
OS << "};\n\n";
if (m_device_props.emulate_conditional_rendering)
{
OS <<
"layout(std430, set = 0, binding = 8) readonly buffer EXT_Conditional_Rendering\n"
"{\n"
" uint conditional_rendering_predicate;\n"
"};\n\n";
OS << "layout(std430, set = 0, binding = 8) readonly buffer EXT_Conditional_Rendering\n";
OS << "{\n";
OS << " uint conditional_rendering_predicate;\n";
OS << "};\n\n";
}
OS <<
"layout(push_constant) uniform VertexLayoutBuffer\n"
"{\n"
" uint vertex_base_index;\n"
" uint vertex_index_offset;\n"
" uint draw_id;\n"
" uint layout_ptr_offset;\n";
OS << "layout(push_constant) uniform VertexLayoutBuffer\n";
OS << "{\n";
OS << " uint vertex_base_index;\n";
OS << " uint vertex_index_offset;\n";
OS << " uint draw_id;\n";
OS << " uint layout_ptr_offset;\n";
if (m_device_props.emulate_conditional_rendering)
{
@ -113,50 +110,18 @@ void VKVertexDecompilerThread::insertConstants(std::stringstream & OS, const std
{
if (PI.name.starts_with("vc["))
{
if (!(m_prog.ctrl & RSX_SHADER_CONTROL_INSTANCED_CONSTANTS))
{
OS << "layout(std140, set=0, binding=" << static_cast<int>(m_binding_table.vertex_constant_buffers_bind_slot) << ") uniform VertexConstantsBuffer\n";
OS << "{\n";
OS << " vec4 " << PI.name << ";\n";
OS << "};\n\n";
OS << "layout(std140, set=0, binding = " << static_cast<int>(m_binding_table.vertex_constant_buffers_bind_slot) << ") uniform VertexConstantsBuffer\n";
OS << "{\n";
OS << " vec4 " << PI.name << ";\n";
OS << "};\n\n";
in.location = m_binding_table.vertex_constant_buffers_bind_slot;
in.domain = glsl::glsl_vertex_program;
in.name = "VertexConstantsBuffer";
in.type = vk::glsl::input_type_uniform_buffer;
in.location = m_binding_table.vertex_constant_buffers_bind_slot;
in.domain = glsl::glsl_vertex_program;
in.name = "VertexConstantsBuffer";
in.type = vk::glsl::input_type_uniform_buffer;
inputs.push_back(in);
continue;
}
else
{
// 1. Bind indirection lookup buffer
OS << "layout(std430, set=0, binding=" << static_cast<int>(m_binding_table.instancing_lookup_table_bind_slot) << ") readonly buffer InstancingData\n";
OS << "{\n";
OS << " int constants_addressing_lookup[];\n";
OS << "};\n\n";
in.location = m_binding_table.instancing_lookup_table_bind_slot;
in.domain = glsl::glsl_vertex_program;
in.name = "InstancingData";
in.type = vk::glsl::input_type_storage_buffer;
inputs.push_back(in);
// 2. Bind actual constants buffer
OS << "layout(std430, set=0, binding=" << static_cast<int>(m_binding_table.instancing_constants_buffer_slot) << ") readonly buffer VertexConstantsBuffer\n";
OS << "{\n";
OS << " vec4 instanced_constants_array[];\n";
OS << "};\n\n";
OS << "#define CONSTANTS_ARRAY_LENGTH " << (properties.has_indexed_constants ? 468 : ::size32(m_constant_ids)) << "\n\n";
in.location = m_binding_table.instancing_constants_buffer_slot;
in.domain = glsl::glsl_vertex_program;
in.name = "VertexConstantsBuffer";
in.type = vk::glsl::input_type_storage_buffer;
inputs.push_back(in);
continue;
}
inputs.push_back(in);
continue;
}
if (PT.type == "sampler2D" ||
@ -244,7 +209,6 @@ void VKVertexDecompilerThread::insertMainStart(std::stringstream & OS)
properties2.emulate_depth_clip_only = vk::g_render_device->get_shader_types_support().allow_float64;
properties2.low_precision_tests = vk::is_NVIDIA(vk::get_driver_vendor());
properties2.require_explicit_invariance = (vk::is_NVIDIA(vk::get_driver_vendor()) && g_cfg.video.shader_precision != gpu_preset_level::low);
properties2.require_instanced_render = !!(m_prog.ctrl & RSX_SHADER_CONTROL_INSTANCED_CONSTANTS);
glsl::insert_glsl_legacy_function(OS, properties2);
glsl::insert_vertex_input_fetch(OS, glsl::glsl_rules_vulkan);

View File

@ -14,10 +14,8 @@ namespace vk
u8 vertex_buffers_first_bind_slot = 5;
u8 conditional_render_predicate_slot = 8;
u8 rasterizer_env_bind_slot = 9;
u8 instancing_lookup_table_bind_slot = 10;
u8 instancing_constants_buffer_slot = 11;
u8 textures_first_bind_slot = 12;
u8 vertex_textures_first_bind_slot = 12; // Invalid, has to be initialized properly
u8 textures_first_bind_slot = 10;
u8 vertex_textures_first_bind_slot = 10; // Invalid, has to be initialized properly
u8 total_descriptor_bindings = vertex_textures_first_bind_slot; // Invalid, has to be initialized properly
};
}

View File

@ -455,8 +455,7 @@ namespace gcm
RSX_SHADER_CONTROL_UNKNOWN1 = 0x8000, // seemingly set when srgb packer is used??
// Custom
RSX_SHADER_CONTROL_ATTRIBUTE_INTERPOLATION = 0x10000, // Rasterizing triangles and not lines or points
RSX_SHADER_CONTROL_INSTANCED_CONSTANTS = 0x20000, // Support instance ID offsets when loading constants
RSX_SHADER_CONTROL_ATTRIBUTE_INTERPOLATION = 0x10000 // Rasterizing triangles and not lines or points
};
// GCM Reports

View File

@ -104,7 +104,6 @@
<ClCompile Include="Emu\perf_monitor.cpp" />
<ClCompile Include="Emu\RSX\Common\texture_cache.cpp" />
<ClCompile Include="Emu\RSX\Core\RSXContext.cpp" />
<ClCompile Include="Emu\RSX\Core\RSXDrawCommands.cpp" />
<ClCompile Include="Emu\RSX\Host\MM.cpp" />
<ClCompile Include="Emu\RSX\Host\RSXDMAWriter.cpp" />
<ClCompile Include="Emu\RSX\NV47\FW\draw_call.cpp" />
@ -620,8 +619,6 @@
<ClInclude Include="Emu\RSX\Common\time.hpp" />
<ClInclude Include="Emu\RSX\Common\unordered_map.hpp" />
<ClInclude Include="Emu\RSX\Core\RSXContext.h" />
<ClInclude Include="Emu\RSX\Core\RSXDrawCommands.h" />
<ClInclude Include="Emu\RSX\Core\RSXDriverState.h" />
<ClInclude Include="Emu\RSX\Core\RSXEngLock.hpp" />
<ClInclude Include="Emu\RSX\Core\RSXFrameBuffer.h" />
<ClInclude Include="Emu\RSX\Core\RSXIOMap.hpp" />

View File

@ -1315,9 +1315,6 @@
<ClCompile Include="Emu\RSX\Host\MM.cpp">
<Filter>Emu\GPU\RSX\Host Mini-Driver</Filter>
</ClCompile>
<ClCompile Include="Emu\RSX\Core\RSXDrawCommands.cpp">
<Filter>Emu\GPU\RSX\Core</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="Crypto\aes.h">
@ -2653,12 +2650,6 @@
<ClInclude Include="Emu\RSX\Host\MM.h">
<Filter>Emu\GPU\RSX\Host Mini-Driver</Filter>
</ClInclude>
<ClInclude Include="Emu\RSX\Core\RSXDrawCommands.h">
<Filter>Emu\GPU\RSX\Core</Filter>
</ClInclude>
<ClInclude Include="Emu\RSX\Core\RSXDriverState.h">
<Filter>Emu\GPU\RSX\Core</Filter>
</ClInclude>
<ClInclude Include="Emu\NP\fb_helpers.h">
<Filter>Emu\NP</Filter>
</ClInclude>