rsx/fp: Re-design register write tracking

- Always collapse register writes when exporting FS outputs
This commit is contained in:
kd-11 2025-01-04 01:57:03 +03:00 committed by kd-11
parent 88e13d8326
commit dce0abc8b9
9 changed files with 400 additions and 130 deletions

View File

@ -558,6 +558,7 @@ target_sources(rpcs3_emu PRIVATE
RSX/Program/CgBinaryFragmentProgram.cpp
RSX/Program/CgBinaryVertexProgram.cpp
RSX/Program/FragmentProgramDecompiler.cpp
RSX/Program/FragmentProgramRegister.cpp
RSX/Program/GLSLCommon.cpp
RSX/Program/ProgramStateCache.cpp
RSX/Program/program_util.cpp

View File

@ -22,6 +22,15 @@ namespace rsx
using namespace rsx::fragment_program;
// Component slots of a 4-wide vector register, listed in xyzw order.
// The numeric value of each enumerator is the channel index it names.
enum VectorLane : u8
{
	X = 0,
	Y,
	Z,
	W,
};
FragmentProgramDecompiler::FragmentProgramDecompiler(const RSXFragmentProgram &prog, u32& size)
: m_size(size)
, m_prog(prog)
@ -141,8 +150,7 @@ void FragmentProgramDecompiler::SetDst(std::string code, u32 flags)
AddCode(m_parr.AddParam(PF_PARAM_NONE, getFloatTypeName(4), "cc" + std::to_string(src0.cond_mod_reg_index)) + "$m = " + dest + ";");
}
u32 reg_index = dst.fp16 ? dst.dest_reg >> 1 : dst.dest_reg;
const u32 reg_index = dst.fp16 ? (dst.dest_reg >> 1) : dst.dest_reg;
ensure(reg_index < temp_registers.size());
if (dst.opcode == RSX_FP_OPCODE_MOV &&
@ -754,14 +762,26 @@ std::string FragmentProgramDecompiler::BuildCode()
const std::string init_value = float4_type + "(0.)";
std::array<std::string, 4> output_register_names;
std::array<u32, 4> ouput_register_indices = { 0, 2, 3, 4 };
bool shader_is_valid = false;
// Holder for any "cleanup" before exiting main
std::stringstream main_epilogue;
// Check depth export
if (m_ctrl & CELL_GCM_SHADER_CONTROL_DEPTH_EXPORT)
{
// Hw tests show that the depth export register is default-initialized to 0 and not wpos.z!!
m_parr.AddParam(PF_PARAM_NONE, getFloatTypeName(4), "r1", init_value);
shader_is_valid = (!!temp_registers[1].h1_writes);
auto& r1 = temp_registers[1];
if (r1.requires_gather(VectorLane::Z))
{
// At least part of r1.z was last written through its fp16 alias (h3.xy), not as fp32
properties.has_gather_op = true;
main_epilogue << " r1.z = " << float4_type << r1.gather_r() << ".z;\n";
// Emit debug warning. Useful to diagnose regressions, but should be removed in future.
rsx_log.warning("ROP reads from shader depth without writing to it. Final value will be gathered.");
}
}
// Add the color output registers. They are statically written to and have guaranteed initialization (except r1.z which == wpos.z)
@ -777,26 +797,41 @@ std::string FragmentProgramDecompiler::BuildCode()
for (int n = 0; n < 4; ++n)
{
if (!m_parr.HasParam(PF_PARAM_NONE, float4_type, output_register_names[n]))
const auto& reg_name = output_register_names[n];
if (!m_parr.HasParam(PF_PARAM_NONE, float4_type, reg_name))
{
m_parr.AddParam(PF_PARAM_NONE, float4_type, output_register_names[n], init_value);
continue;
m_parr.AddParam(PF_PARAM_NONE, float4_type, reg_name, init_value);
}
const auto block_index = ouput_register_indices[n];
shader_is_valid |= (!!temp_registers[block_index].h0_writes);
}
auto& r = temp_registers[block_index];
if (!shader_is_valid)
{
properties.has_no_output = true;
if (!properties.has_discard_op)
if (fp16_out)
{
// NOTE: Discard operation overrides output
rsx_log.warning("Shader does not write to any output register and will be NOPed");
main = "/*" + main + "*/";
// Check if we need a split/extract op
if (r.requires_split(0))
{
main_epilogue << " " << reg_name << " = " << float4_type << r.split_h0() << ";\n";
// Emit debug warning. Useful to diagnose regressions, but should be removed in future.
rsx_log.warning("ROP reads from %s without writing to it. Final value will be extracted from the 32-bit register.", reg_name);
}
continue;
}
if (!r.requires_gather128())
{
// Nothing to do
continue;
}
// We need to gather the data from existing registers
main_epilogue << " " << reg_name << " = " << float4_type << r.gather_r() << ";\n";
properties.has_gather_op = true;
// Emit debug warning. Useful to diagnose regressions, but should be removed in future.
rsx_log.warning("ROP reads from %s without writing to it. Final value will be gathered.", reg_name);
}
if (properties.has_dynamic_register_load)
@ -822,6 +857,9 @@ std::string FragmentProgramDecompiler::BuildCode()
OS << "#endif\n";
OS << " discard;\n";
OS << "}\n";
// Don't consume any args
m_parr.Clear();
return OS.str();
}
@ -1019,6 +1057,12 @@ std::string FragmentProgramDecompiler::BuildCode()
insertMainStart(OS);
OS << main << std::endl;
if (const auto epilogue = main_epilogue.str(); !epilogue.empty())
{
OS << " // Epilogue\n";
OS << epilogue << std::endl;
}
insertMainEnd(OS);
return OS.str();
@ -1360,12 +1404,12 @@ std::string FragmentProgramDecompiler::Decompile()
switch (opcode)
{
case RSX_FP_OPCODE_NOP: break;
case RSX_FP_OPCODE_NOP:
break;
case RSX_FP_OPCODE_KIL:
properties.has_discard_op = true;
AddFlowOp("_kill()");
break;
default:
int prev_force_unit = forced_unit;

View File

@ -1,116 +1,10 @@
#pragma once
#include "ShaderParam.h"
#include "FragmentProgramRegister.h"
#include "RSXFragmentProgram.h"
#include <sstream>
// Helper for GPR occupancy tracking
struct temp_register
{
bool aliased_r0 = false;
bool aliased_h0 = false;
bool aliased_h1 = false;
bool last_write_half[4] = { false, false, false, false };
u32 real_index = -1;
u32 h0_writes = 0u; // Number of writes to the first 64-bits of the register
u32 h1_writes = 0u; // Number of writes to the last 64-bits of the register
void tag(u32 index, bool half_register, bool x, bool y, bool z, bool w)
{
if (half_register)
{
if (index & 1)
{
if (x) last_write_half[2] = true;
if (y) last_write_half[2] = true;
if (z) last_write_half[3] = true;
if (w) last_write_half[3] = true;
aliased_h1 = true;
h1_writes++;
}
else
{
if (x) last_write_half[0] = true;
if (y) last_write_half[0] = true;
if (z) last_write_half[1] = true;
if (w) last_write_half[1] = true;
aliased_h0 = true;
h0_writes++;
}
}
else
{
if (x) last_write_half[0] = false;
if (y) last_write_half[1] = false;
if (z) last_write_half[2] = false;
if (w) last_write_half[3] = false;
aliased_r0 = true;
h0_writes++;
h1_writes++;
}
if (real_index == umax)
{
if (half_register)
real_index = index >> 1;
else
real_index = index;
}
}
bool requires_gather(u8 channel) const
{
//Data fetched from the single precision register requires merging of the two half registers
ensure(channel < 4);
if (aliased_h0 && channel < 2)
{
return last_write_half[channel];
}
if (aliased_h1 && channel > 1)
{
return last_write_half[channel];
}
return false;
}
bool requires_split(u32 /*index*/) const
{
//Data fetched from any of the two half registers requires sync with the full register
if (!(last_write_half[0] || last_write_half[1]) && aliased_r0)
{
//r0 has been written to
//TODO: Check for specific elements in real32 register
return true;
}
return false;
}
std::string gather_r() const
{
std::string h0 = "h" + std::to_string(real_index << 1);
std::string h1 = "h" + std::to_string(real_index << 1 | 1);
std::string reg = "r" + std::to_string(real_index);
std::string ret = "//Invalid gather";
if (aliased_h0 && aliased_h1)
ret = "(gather(" + h0 + ", " + h1 + "))";
else if (aliased_h0)
ret = "(gather(" + h0 + "), " + reg + ".zw)";
else if (aliased_h1)
ret = "(" + reg + ".xy, gather(" + h1 + "))";
return ret;
}
};
/**
* This class is used to translate RSX Fragment program to GLSL/HLSL code
* Backend with text based shader can subclass this class and implement :
@ -157,7 +51,7 @@ class FragmentProgramDecompiler
bool m_is_valid_ucode = true;
std::array<temp_register, 64> temp_registers;
std::array<rsx::MixedPrecisionRegister, 64> temp_registers;
std::string GetMask() const;

View File

@ -0,0 +1,196 @@
#include "stdafx.h"
#include "FragmentProgramRegister.h"
namespace rsx
{
// Starts with every half-word marked as never written.
MixedPrecisionRegister::MixedPrecisionRegister()
{
	content_mask.fill(data_type_bits::undefined);
}
void MixedPrecisionRegister::tag_h0(bool x, bool y, bool z, bool w)
{
if (x) content_mask[0] = data_type_bits::f16;
if (y) content_mask[1] = data_type_bits::f16;
if (z) content_mask[2] = data_type_bits::f16;
if (w) content_mask[3] = data_type_bits::f16;
}
void MixedPrecisionRegister::tag_h1(bool x, bool y, bool z, bool w)
{
if (x) content_mask[4] = data_type_bits::f16;
if (y) content_mask[5] = data_type_bits::f16;
if (z) content_mask[6] = data_type_bits::f16;
if (w) content_mask[7] = data_type_bits::f16;
}
void MixedPrecisionRegister::tag_r(bool x, bool y, bool z, bool w)
{
if (x) content_mask[0] = content_mask[1] = data_type_bits::f32;
if (y) content_mask[2] = content_mask[3] = data_type_bits::f32;
if (z) content_mask[4] = content_mask[5] = data_type_bits::f32;
if (w) content_mask[6] = content_mask[7] = data_type_bits::f32;
}
// Records a write to this register.
// index      - register index as encoded by the instruction; for fp16 writes this is the
//              half-register index, whose low bit selects the second half
// is_fp16    - true when the destination is a half register
// x, y, z, w - destination write mask
void MixedPrecisionRegister::tag(u32 index, bool is_fp16, bool x, bool y, bool z, bool w)
{
	// Index of the full register this write lands in
	const u32 target_index = is_fp16 ? (index >> 1) : index;

	if (file_index == umax)
	{
		// First-time use. Remember which register we are tracking
		file_index = target_index;
	}

	if (!is_fp16)
	{
		tag_r(x, y, z, w);
		return;
	}

	// A half write must belong to the register captured on first use
	ensure(target_index == file_index);

	if (index & 1)
	{
		tag_h1(x, y, z, w);
	}
	else
	{
		tag_h0(x, y, z, w);
	}
}
std::string MixedPrecisionRegister::gather_r() const
{
const auto half_index = file_index << 1;
const std::string reg = "r" + std::to_string(file_index);
const std::string gather_half_regs[] = {
"gather(h" + std::to_string(half_index) + ")",
"gather(h" + std::to_string(half_index + 1) + ")"
};
std::string outputs[4];
for (int ch = 0; ch < 4; ++ch)
{
// FIXME: This approach ignores mixed register bits. Not ideal!!!!
const auto channel0 = content_mask[ch * 2];
const auto is_fp16_ch = channel0 == content_mask[ch * 2 + 1] && channel0 == data_type_bits::f16;
outputs[ch] = is_fp16_ch ? gather_half_regs[ch / 2] : reg;
}
// Grouping. Only replace relevant bits...
if (outputs[0] == outputs[1]) outputs[0] = "";
if (outputs[2] == outputs[3]) outputs[2] = "";
// Assemble
bool group = false;
std::string result = "";
constexpr std::string_view swz_mask = "xyzw";
for (int ch = 0; ch < 4; ++ch)
{
if (outputs[ch].empty())
{
group = true;
continue;
}
if (!result.empty())
{
result += ", ";
}
if (group)
{
ensure(ch > 0);
group = false;
if (outputs[ch] == reg)
{
result += reg + "." + swz_mask[ch - 1] + swz_mask[ch];
continue;
}
result += outputs[ch];
continue;
}
const int subch = outputs[ch] == reg ? ch : (ch % 2); // Avoid .xyxy.z and other such ugly swizzles
result += outputs[ch] + "." + swz_mask[subch];
}
// Optimize dual-gather (128-bit gather) to use special function
const std::string double_gather = gather_half_regs[0] + ", " + gather_half_regs[1];
if (result == double_gather)
{
result = "gather(h" + std::to_string(half_index) + ", h" + std::to_string(half_index + 1) + ")";
}
return "(" + result + ")";
}
// Builds the parenthesised argument list that reconstructs one half register
// (H16x4) of this register, pulling any component that was last written as
// fp32 out of the full (R32x4) register.
//
// word_index - 0 selects the first half register, 1 the second. Any other
//              value fails the ensure() below.
//
// Components last written as fp32 are read via unpackHalf2x16 on the bits of
// the matching 32-bit channel; all other components are read from the half
// register itself.
std::string MixedPrecisionRegister::fetch_halfreg(u32 word_index) const
{
	// Validate first: word_index is used to index the 4-character swizzle table
	// right below, so an invalid value must be rejected before that happens
	ensure(word_index <= 1);

	constexpr std::string_view swz_mask = "xyzw";
	const std::string reg = "r" + std::to_string(file_index);
	const std::string hreg = "h" + std::to_string(file_index * 2 + word_index);

	// Each half register overlaps two 32-bit channels: xy for word 0, zw for word 1
	const std::string word0_bits = "floatBitsToUint(" + reg + "." + swz_mask[word_index * 2] + ")";
	const std::string word1_bits = "floatBitsToUint(" + reg + "." + swz_mask[word_index * 2 + 1] + ")";
	const std::string words[] = {
		"unpackHalf2x16(" + word0_bits + ")",
		"unpackHalf2x16(" + word1_bits + ")"
	};

	// Pick a source for each of the four half components
	std::string outputs[4];
	const int word_offset = word_index * 4;
	for (int ch = 0; ch < 4; ++ch)
	{
		outputs[ch] = content_mask[ch + word_offset] == data_type_bits::f32
			? words[ch / 2]
			: hreg;
	}

	// Grouping. Only replace relevant bits...
	// An emptied first slot marks a pair that is emitted as one two-component term
	if (outputs[0] == outputs[1]) outputs[0] = "";
	if (outputs[2] == outputs[3]) outputs[2] = "";

	// Assemble
	bool group = false;
	std::string result = "";
	for (int ch = 0; ch < 4; ++ch)
	{
		if (outputs[ch].empty())
		{
			group = true;
			continue;
		}

		if (!result.empty())
		{
			result += ", ";
		}

		if (group)
		{
			ensure(ch > 0);
			group = false;
			result += outputs[ch];

			if (outputs[ch] == hreg)
			{
				// Select the two matching components of the half register
				result += std::string(".") + swz_mask[ch - 1] + swz_mask[ch];
			}
			continue;
		}

		const int subch = outputs[ch] == hreg ? ch : (ch % 2); // Avoid .xyxy.z and other such ugly swizzles
		result += outputs[ch] + "." + swz_mask[subch];
	}

	return "(" + result + ")";
}
}

View File

@ -0,0 +1,111 @@
#pragma once
#include <util/types.hpp>
namespace rsx
{
// Tracks how a 128-bit register was last written, at half-word granularity.
// The register can be addressed as one fp32x4 register or as two fp16x4 half
// registers. The tracked state tells the decompiler when one view has to be
// rebuilt from the other.
class MixedPrecisionRegister
{
	// Kind of data last written to a half-word
	enum data_type_bits
	{
		undefined = 0, // Never written
		f16 = 1,       // Written through a half register
		f32 = 2        // Written through the full register
	};

	// Content details for each half-word.
	// Entries 0-3 belong to the first half register, 4-7 to the second.
	// Each fp32 channel covers two consecutive entries.
	std::array<data_type_bits, 8> content_mask;

	// Index of the full register being tracked, umax until the first tag()
	u32 file_index = umax;

	void tag_h0(bool x, bool y, bool z, bool w);

	void tag_h1(bool x, bool y, bool z, bool w);

	void tag_r(bool x, bool y, bool z, bool w);

	std::string fetch_halfreg(u32 word_index) const;

public:
	MixedPrecisionRegister();

	// Records a write with the given destination index, precision and write mask
	void tag(u32 index, bool is_fp16, bool x, bool y, bool z, bool w);

	// Returns the expression that rebuilds the fp32 register from its current contents
	std::string gather_r() const;

	// Returns the expression that rebuilds the first half register
	std::string split_h0() const
	{
		return fetch_halfreg(0);
	}

	// Returns the expression that rebuilds the second half register
	std::string split_h1() const
	{
		return fetch_halfreg(1);
	}

	// Getters

	// Return true if the register has never been tagged (no register index assigned yet)
	bool floating() const
	{
		return file_index == umax;
	}

	// Return true if the first half register is all undefined
	bool floating_h0() const
	{
		return content_mask[0] == content_mask[1] &&
			content_mask[1] == content_mask[2] &&
			content_mask[2] == content_mask[3] &&
			content_mask[3] == data_type_bits::undefined;
	}

	// Return true if the second half register is all undefined
	bool floating_h1() const
	{
		return content_mask[4] == content_mask[5] &&
			content_mask[5] == content_mask[6] &&
			content_mask[6] == content_mask[7] &&
			content_mask[7] == data_type_bits::undefined;
	}

	// Return true if either half-word of the given fp32 channel (0-3) is 16-bit
	bool requires_gather(u8 channel) const
	{
		// Data fetched from the single precision register requires merging of the two half registers
		const auto channel_offset = channel * 2;
		ensure(channel_offset <= 6);

		return (content_mask[channel_offset] == data_type_bits::f16 || content_mask[channel_offset + 1] == data_type_bits::f16);
	}

	// Return true if ANY half-word of the 128-bit register is 16-bit,
	// i.e. at least part of the fp32 view has to be gathered from the half registers
	bool requires_gather128() const
	{
		// Full 128-bit check
		for (const auto& ch : content_mask)
		{
			if (ch == data_type_bits::f16)
			{
				return true;
			}
		}

		return false;
	}

	// Return true if the selected half-register (0 or 1) is polluted with fp32 data
	bool requires_split(u32 word_index) const
	{
		// Only two half registers exist; reject anything that would index past content_mask
		ensure(word_index <= 1);

		const u32 content_offset = word_index * 4;
		for (u32 i = 0; i < 4; ++i)
		{
			if (content_mask[content_offset + i] == data_type_bits::f32)
			{
				return true;
			}
		}

		return false;
	}
};
}

View File

@ -555,10 +555,14 @@ usz fragment_program_storage_hash::operator()(const RSXFragmentProgram& program)
bool fragment_program_compare::operator()(const RSXFragmentProgram& binary1, const RSXFragmentProgram& binary2) const
{
if (binary1.ctrl != binary2.ctrl || binary1.texture_state != binary2.texture_state ||
if (binary1.ucode_length != binary2.ucode_length ||
binary1.ctrl != binary2.ctrl ||
binary1.texture_state != binary2.texture_state ||
binary1.texcoord_control_mask != binary2.texcoord_control_mask ||
binary1.two_sided_lighting != binary2.two_sided_lighting)
{
return false;
}
const void* instBuffer1 = binary1.get_data();
const void* instBuffer2 = binary2.get_data();
@ -569,7 +573,9 @@ bool fragment_program_compare::operator()(const RSXFragmentProgram& binary1, con
const auto inst2 = v128::loadu(instBuffer2, instIndex);
if (inst1._u ^ inst2._u)
{
return false;
}
instIndex++;
// Skip constants
@ -578,9 +584,11 @@ bool fragment_program_compare::operator()(const RSXFragmentProgram& binary1, con
fragment_program_utils::is_constant(inst1._u32[3]))
instIndex++;
bool end = ((inst1._u32[0] >> 8) & 0x1) && ((inst2._u32[0] >> 8) & 0x1);
const bool end = ((inst1._u32[0] >> 8) & 0x1) && ((inst2._u32[0] >> 8) & 0x1);
if (end)
{
return true;
}
}
}

View File

@ -227,6 +227,14 @@ struct ParamArray
return name;
}
void Clear()
{
for (auto& param : params)
{
param.clear();
}
}
};
class ShaderVariable

View File

@ -140,6 +140,7 @@
<ClCompile Include="Emu\RSX\Overlays\Shaders\shader_loading_dialog.cpp" />
<ClCompile Include="Emu\RSX\Overlays\Shaders\shader_loading_dialog_native.cpp" />
<ClCompile Include="Emu\RSX\Overlays\Trophies\overlay_trophy_list_dialog.cpp" />
<ClCompile Include="Emu\RSX\Program\FragmentProgramRegister.cpp" />
<ClCompile Include="Emu\RSX\Program\ProgramStateCache.cpp" />
<ClCompile Include="Emu\RSX\Program\program_util.cpp" />
<ClCompile Include="Emu\RSX\Program\SPIRVCommon.cpp" />
@ -669,6 +670,7 @@
<ClInclude Include="Emu\RSX\Overlays\overlay_media_list_dialog.h" />
<ClInclude Include="Emu\RSX\Overlays\overlay_progress_bar.hpp" />
<ClInclude Include="Emu\RSX\Overlays\Trophies\overlay_trophy_list_dialog.h" />
<ClInclude Include="Emu\RSX\Program\FragmentProgramRegister.h" />
<ClInclude Include="Emu\RSX\Program\GLSLTypes.h" />
<ClInclude Include="Emu\RSX\Program\ProgramStateCache.h" />
<ClInclude Include="Emu\RSX\Program\program_util.h" />

View File

@ -1330,6 +1330,9 @@
<ClCompile Include="Emu\Cell\ErrorCodes.cpp">
<Filter>Emu\Cell</Filter>
</ClCompile>
<ClCompile Include="Emu\RSX\Program\FragmentProgramRegister.cpp">
<Filter>Emu\GPU\RSX\Program</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="Crypto\aes.h">
@ -2686,6 +2689,9 @@
<ClInclude Include="Emu\Audio\audio_utils.h">
<Filter>Emu\Audio</Filter>
</ClInclude>
<ClInclude Include="Emu\RSX\Program\FragmentProgramRegister.h">
<Filter>Emu\GPU\RSX\Program</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<None Include="Emu\RSX\Program\GLSLSnippets\GPUDeswizzle.glsl">