diff --git a/rpcs3/Emu/CMakeLists.txt b/rpcs3/Emu/CMakeLists.txt index 04ed28e10f..97016dbb04 100644 --- a/rpcs3/Emu/CMakeLists.txt +++ b/rpcs3/Emu/CMakeLists.txt @@ -558,6 +558,7 @@ target_sources(rpcs3_emu PRIVATE RSX/Program/CgBinaryFragmentProgram.cpp RSX/Program/CgBinaryVertexProgram.cpp RSX/Program/FragmentProgramDecompiler.cpp + RSX/Program/FragmentProgramRegister.cpp RSX/Program/GLSLCommon.cpp RSX/Program/ProgramStateCache.cpp RSX/Program/program_util.cpp diff --git a/rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.cpp b/rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.cpp index b3f9b88df2..4de2100ff2 100644 --- a/rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.cpp +++ b/rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.cpp @@ -22,6 +22,15 @@ namespace rsx using namespace rsx::fragment_program; +// SIMD vector lanes +enum VectorLane : u8 +{ + X = 0, + Y = 1, + Z = 2, + W = 3, +}; + FragmentProgramDecompiler::FragmentProgramDecompiler(const RSXFragmentProgram &prog, u32& size) : m_size(size) , m_prog(prog) @@ -141,8 +150,7 @@ void FragmentProgramDecompiler::SetDst(std::string code, u32 flags) AddCode(m_parr.AddParam(PF_PARAM_NONE, getFloatTypeName(4), "cc" + std::to_string(src0.cond_mod_reg_index)) + "$m = " + dest + ";"); } - u32 reg_index = dst.fp16 ? dst.dest_reg >> 1 : dst.dest_reg; - + const u32 reg_index = dst.fp16 ? (dst.dest_reg >> 1) : dst.dest_reg; ensure(reg_index < temp_registers.size()); if (dst.opcode == RSX_FP_OPCODE_MOV && @@ -754,14 +762,26 @@ std::string FragmentProgramDecompiler::BuildCode() const std::string init_value = float4_type + "(0.)"; std::array output_register_names; std::array ouput_register_indices = { 0, 2, 3, 4 }; - bool shader_is_valid = false; + + // Holder for any "cleanup" before exiting main + std::stringstream main_epilogue; // Check depth export if (m_ctrl & CELL_GCM_SHADER_CONTROL_DEPTH_EXPORT) { // Hw tests show that the depth export register is default-initialized to 0 and not wpos.z!! m_parr.AddParam(PF_PARAM_NONE, getFloatTypeName(4), "r1", init_value); - shader_is_valid = (!!temp_registers[1].h1_writes); + + auto& r1 = temp_registers[1]; + if (r1.requires_gather(VectorLane::Z)) + { + // r1.zw was not written to + properties.has_gather_op = true; + main_epilogue << " r1.z = " << float4_type << r1.gather_r() << ".z;\n"; + + // Emit debug warning. Useful to diagnose regressions, but should be removed in future. + rsx_log.warning("ROP reads from shader depth without writing to it. Final value will be gathered."); + } } // Add the color output registers. They are statically written to and have guaranteed initialization (except r1.z which == wpos.z) @@ -777,26 +797,41 @@ std::string FragmentProgramDecompiler::BuildCode() for (int n = 0; n < 4; ++n) { - if (!m_parr.HasParam(PF_PARAM_NONE, float4_type, output_register_names[n])) + const auto& reg_name = output_register_names[n]; + if (!m_parr.HasParam(PF_PARAM_NONE, float4_type, reg_name)) { - m_parr.AddParam(PF_PARAM_NONE, float4_type, output_register_names[n], init_value); - continue; + m_parr.AddParam(PF_PARAM_NONE, float4_type, reg_name, init_value); } const auto block_index = ouput_register_indices[n]; - shader_is_valid |= (!!temp_registers[block_index].h0_writes); - } + auto& r = temp_registers[block_index]; - if (!shader_is_valid) - { - properties.has_no_output = true; - - if (!properties.has_discard_op) + if (fp16_out) { - // NOTE: Discard operation overrides output - rsx_log.warning("Shader does not write to any output register and will be NOPed"); - main = "/*" + main + "*/"; + // Check if we need a split/extract op + if (r.requires_split(0)) + { + main_epilogue << " " << reg_name << " = " << float4_type << r.split_h0() << ";\n"; + + // Emit debug warning. Useful to diagnose regressions, but should be removed in future. + rsx_log.warning("ROP reads from %s without writing to it. Final value will be extracted from the 32-bit register.", reg_name); + } + + continue; } + + if (!r.requires_gather128()) + { + // Nothing to do + continue; + } + + // We need to gather the data from existing registers + main_epilogue << " " << reg_name << " = " << float4_type << r.gather_r() << ";\n"; + properties.has_gather_op = true; + + // Emit debug warning. Useful to diagnose regressions, but should be removed in future. + rsx_log.warning("ROP reads from %s without writing to it. Final value will be gathered.", reg_name); } if (properties.has_dynamic_register_load) @@ -822,6 +857,9 @@ std::string FragmentProgramDecompiler::BuildCode() OS << "#endif\n"; OS << " discard;\n"; OS << "}\n"; + + // Don't consume any args + m_parr.Clear(); return OS.str(); } @@ -1019,6 +1057,12 @@ std::string FragmentProgramDecompiler::BuildCode() insertMainStart(OS); OS << main << std::endl; + + if (const auto epilogue = main_epilogue.str(); !epilogue.empty()) + { + OS << " // Epilogue\n"; + OS << epilogue << std::endl; + } insertMainEnd(OS); return OS.str(); @@ -1360,12 +1404,12 @@ std::string FragmentProgramDecompiler::Decompile() switch (opcode) { - case RSX_FP_OPCODE_NOP: break; + case RSX_FP_OPCODE_NOP: + break; case RSX_FP_OPCODE_KIL: properties.has_discard_op = true; AddFlowOp("_kill()"); break; - default: int prev_force_unit = forced_unit; diff --git a/rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.h b/rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.h index df3fbe5e16..dab539f9da 100644 --- a/rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.h +++ b/rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.h @@ -1,116 +1,10 @@ #pragma once #include "ShaderParam.h" +#include "FragmentProgramRegister.h" #include "RSXFragmentProgram.h" + #include -// Helper for GPR occupancy tracking -struct temp_register -{ - bool aliased_r0 = false; - bool aliased_h0 = false; - bool aliased_h1 = false; - bool last_write_half[4] = { false, false, false, false }; - - u32 real_index = -1; - - u32 h0_writes = 0u; // Number of writes to the first 64-bits of the register - u32 h1_writes = 0u; // Number of writes to the last 64-bits of the register - - void tag(u32 index, bool half_register, bool x, bool y, bool z, bool w) - { - if (half_register) - { - if (index & 1) - { - if (x) last_write_half[2] = true; - if (y) last_write_half[2] = true; - if (z) last_write_half[3] = true; - if (w) last_write_half[3] = true; - - aliased_h1 = true; - h1_writes++; - } - else - { - if (x) last_write_half[0] = true; - if (y) last_write_half[0] = true; - if (z) last_write_half[1] = true; - if (w) last_write_half[1] = true; - - aliased_h0 = true; - h0_writes++; - } - } - else - { - if (x) last_write_half[0] = false; - if (y) last_write_half[1] = false; - if (z) last_write_half[2] = false; - if (w) last_write_half[3] = false; - - aliased_r0 = true; - - h0_writes++; - h1_writes++; - } - - if (real_index == umax) - { - if (half_register) - real_index = index >> 1; - else - real_index = index; - } - } - - bool requires_gather(u8 channel) const - { - //Data fetched from the single precision register requires merging of the two half registers - ensure(channel < 4); - if (aliased_h0 && channel < 2) - { - return last_write_half[channel]; - } - - if (aliased_h1 && channel > 1) - { - return last_write_half[channel]; - } - - return false; - } - - bool requires_split(u32 /*index*/) const - { - //Data fetched from any of the two half registers requires sync with the full register - if (!(last_write_half[0] || last_write_half[1]) && aliased_r0) - { - //r0 has been written to - //TODO: Check for specific elements in real32 register - return true; - } - - return false; - } - - std::string gather_r() const - { - std::string h0 = "h" + std::to_string(real_index << 1); - std::string h1 = "h" + std::to_string(real_index << 1 | 1); - std::string reg = "r" + std::to_string(real_index); - std::string ret = "//Invalid gather"; - - if (aliased_h0 && aliased_h1) - ret = "(gather(" + h0 + ", " + h1 + "))"; - else if (aliased_h0) - ret = "(gather(" + h0 + "), " + reg + ".zw)"; - else if (aliased_h1) - ret = "(" + reg + ".xy, gather(" + h1 + "))"; - - return ret; - } -}; - /** * This class is used to translate RSX Fragment program to GLSL/HLSL code * Backend with text based shader can subclass this class and implement : @@ -157,7 +51,7 @@ class FragmentProgramDecompiler bool m_is_valid_ucode = true; - std::array temp_registers; + std::array temp_registers; std::string GetMask() const; diff --git a/rpcs3/Emu/RSX/Program/FragmentProgramRegister.cpp b/rpcs3/Emu/RSX/Program/FragmentProgramRegister.cpp new file mode 100644 index 0000000000..a14b142df6 --- /dev/null +++ b/rpcs3/Emu/RSX/Program/FragmentProgramRegister.cpp @@ -0,0 +1,196 @@ +#include "stdafx.h" +#include "FragmentProgramRegister.h" + +namespace rsx +{ + MixedPrecisionRegister::MixedPrecisionRegister() + { + std::fill(content_mask.begin(), content_mask.end(), data_type_bits::undefined); + } + + void MixedPrecisionRegister::tag_h0(bool x, bool y, bool z, bool w) + { + if (x) content_mask[0] = data_type_bits::f16; + if (y) content_mask[1] = data_type_bits::f16; + if (z) content_mask[2] = data_type_bits::f16; + if (w) content_mask[3] = data_type_bits::f16; + } + + void MixedPrecisionRegister::tag_h1(bool x, bool y, bool z, bool w) + { + if (x) content_mask[4] = data_type_bits::f16; + if (y) content_mask[5] = data_type_bits::f16; + if (z) content_mask[6] = data_type_bits::f16; + if (w) content_mask[7] = data_type_bits::f16; + } + + void MixedPrecisionRegister::tag_r(bool x, bool y, bool z, bool w) + { + if (x) content_mask[0] = content_mask[1] = data_type_bits::f32; + if (y) content_mask[2] = content_mask[3] = data_type_bits::f32; + if (z) content_mask[4] = content_mask[5] = data_type_bits::f32; + if (w) content_mask[6] = content_mask[7] = data_type_bits::f32; + } + + void MixedPrecisionRegister::tag(u32 index, bool is_fp16, bool x, bool y, bool z, bool w) + { + if (file_index == umax) + { + // First-time use. Initialize... + const u32 real_index = is_fp16 ? (index >> 1) : index; + file_index = real_index; + } + + if (is_fp16) + { + ensure((index / 2) == file_index); + + if (index & 1) + { + tag_h1(x, y, z, w); + return; + } + + tag_h0(x, y, z, w); + return; + } + + tag_r(x, y, z, w); + } + + std::string MixedPrecisionRegister::gather_r() const + { + const auto half_index = file_index << 1; + const std::string reg = "r" + std::to_string(file_index); + const std::string gather_half_regs[] = { + "gather(h" + std::to_string(half_index) + ")", + "gather(h" + std::to_string(half_index + 1) + ")" + }; + + std::string outputs[4]; + for (int ch = 0; ch < 4; ++ch) + { + // FIXME: This approach ignores mixed register bits. Not ideal!!!! + const auto channel0 = content_mask[ch * 2]; + const auto is_fp16_ch = channel0 == content_mask[ch * 2 + 1] && channel0 == data_type_bits::f16; + outputs[ch] = is_fp16_ch ? gather_half_regs[ch / 2] : reg; + } + + // Grouping. Only replace relevant bits... + if (outputs[0] == outputs[1]) outputs[0] = ""; + if (outputs[2] == outputs[3]) outputs[2] = ""; + + // Assemble + bool group = false; + std::string result = ""; + constexpr std::string_view swz_mask = "xyzw"; + + for (int ch = 0; ch < 4; ++ch) + { + if (outputs[ch].empty()) + { + group = true; + continue; + } + + if (!result.empty()) + { + result += ", "; + } + + if (group) + { + ensure(ch > 0); + group = false; + + if (outputs[ch] == reg) + { + result += reg + "." + swz_mask[ch - 1] + swz_mask[ch]; + continue; + } + + result += outputs[ch]; + continue; + } + + const int subch = outputs[ch] == reg ? ch : (ch % 2); // Avoid .xyxy.z and other such ugly swizzles + result += outputs[ch] + "." + swz_mask[subch]; + } + + // Optimize dual-gather (128-bit gather) to use special function + const std::string double_gather = gather_half_regs[0] + ", " + gather_half_regs[1]; + if (result == double_gather) + { + result = "gather(h" + std::to_string(half_index) + ", h" + std::to_string(half_index + 1) + ")"; + } + + return "(" + result + ")"; + } + + std::string MixedPrecisionRegister::fetch_halfreg(u32 word_index) const + { + // Reads half-word 0 (H16x4) from a full real (R32x4) register + constexpr std::string_view swz_mask = "xyzw"; + const std::string reg = "r" + std::to_string(file_index); + const std::string hreg = "h" + std::to_string(file_index * 2 + word_index); + + const std::string word0_bits = "floatBitsToUint(" + reg + "." + swz_mask[word_index * 2] + ")"; + const std::string word1_bits = "floatBitsToUint(" + reg + "." + swz_mask[word_index * 2 + 1] + ")"; + const std::string words[] = { + "unpackHalf2x16(" + word0_bits + ")", + "unpackHalf2x16(" + word1_bits + ")" + }; + + // Assemble + std::string outputs[4]; + + ensure(word_index <= 1); + const int word_offset = word_index * 4; + for (int ch = 0; ch < 4; ++ch) + { + outputs[ch] = content_mask[ch + word_offset] == data_type_bits::f32 + ? words[ch / 2] + : hreg; + } + + // Grouping. Only replace relevant bits... + if (outputs[0] == outputs[1]) outputs[0] = ""; + if (outputs[2] == outputs[3]) outputs[2] = ""; + + // Assemble + bool group = false; + std::string result = ""; + + for (int ch = 0; ch < 4; ++ch) + { + if (outputs[ch].empty()) + { + group = true; + continue; + } + + if (!result.empty()) + { + result += ", "; + } + + if (group) + { + ensure(ch > 0); + group = false; + result += outputs[ch]; + + if (outputs[ch] == hreg) + { + result += std::string(".") + swz_mask[ch - 1] + swz_mask[ch]; + } + continue; + } + + const int subch = outputs[ch] == hreg ? ch : (ch % 2); // Avoid .xyxy.z and other such ugly swizzles + result += outputs[ch] + "." + swz_mask[subch]; + } + + return "(" + result + ")"; + } +} diff --git a/rpcs3/Emu/RSX/Program/FragmentProgramRegister.h b/rpcs3/Emu/RSX/Program/FragmentProgramRegister.h new file mode 100644 index 0000000000..6cfc8e76c3 --- /dev/null +++ b/rpcs3/Emu/RSX/Program/FragmentProgramRegister.h @@ -0,0 +1,111 @@ +#pragma once + +#include + +namespace rsx +{ + class MixedPrecisionRegister + { + enum data_type_bits + { + undefined = 0, + f16 = 1, + f32 = 2 + }; + + std::array content_mask; // Content details for each half-word + u32 file_index = umax; + + void tag_h0(bool x, bool y, bool z, bool w); + + void tag_h1(bool x, bool y, bool z, bool w); + + void tag_r(bool x, bool y, bool z, bool w); + + std::string fetch_halfreg(u32 word_index) const; + + public: + MixedPrecisionRegister(); + + void tag(u32 index, bool is_fp16, bool x, bool y, bool z, bool w); + + std::string gather_r() const; + + std::string split_h0() const + { + return fetch_halfreg(0); + } + + std::string split_h1() const + { + return fetch_halfreg(1); + } + + // Getters + + // Return true if all values are unwritten to (undefined) + bool floating() const + { + return file_index == umax; + } + + // Return true if the first half register is all undefined + bool floating_h0() const + { + return content_mask[0] == content_mask[1] && + content_mask[1] == content_mask[2] && + content_mask[2] == content_mask[3] && + content_mask[3] == data_type_bits::undefined; + } + + // Return true if the second half register is all undefined + bool floating_h1() const + { + return content_mask[4] == content_mask[5] && + content_mask[5] == content_mask[6] && + content_mask[6] == content_mask[7] && + content_mask[7] == data_type_bits::undefined; + } + + // Return true if any of the half-words are 16-bit + bool requires_gather(u8 channel) const + { + // Data fetched from the single precision register requires merging of the two half registers + const auto channel_offset = channel * 2; + ensure(channel_offset <= 6); + + return (content_mask[channel_offset] == data_type_bits::f16 || content_mask[channel_offset + 1] == data_type_bits::f16); + } + + // Return true if the entire 128-bit register is filled with 2xfp16x4 data words + bool requires_gather128() const + { + // Full 128-bit check + for (const auto& ch : content_mask) + { + if (ch == data_type_bits::f16) + { + return true; + } + } + + return false; + } + + // Return true if the half-register is polluted with fp32 data + bool requires_split(u32 word_index) const + { + const u32 content_offset = word_index * 4; + for (u32 i = 0; i < 4; ++i) + { + if (content_mask[content_offset + i] == data_type_bits::f32) + { + return true; + } + } + + return false; + } + }; +} + diff --git a/rpcs3/Emu/RSX/Program/ProgramStateCache.cpp b/rpcs3/Emu/RSX/Program/ProgramStateCache.cpp index 2f7330f8fa..8bc851ec28 100644 --- a/rpcs3/Emu/RSX/Program/ProgramStateCache.cpp +++ b/rpcs3/Emu/RSX/Program/ProgramStateCache.cpp @@ -555,10 +555,14 @@ usz fragment_program_storage_hash::operator()(const RSXFragmentProgram& program) bool fragment_program_compare::operator()(const RSXFragmentProgram& binary1, const RSXFragmentProgram& binary2) const { - if (binary1.ctrl != binary2.ctrl || binary1.texture_state != binary2.texture_state || + if (binary1.ucode_length != binary2.ucode_length || + binary1.ctrl != binary2.ctrl || + binary1.texture_state != binary2.texture_state || binary1.texcoord_control_mask != binary2.texcoord_control_mask || binary1.two_sided_lighting != binary2.two_sided_lighting) + { return false; + } const void* instBuffer1 = binary1.get_data(); const void* instBuffer2 = binary2.get_data(); @@ -569,7 +573,9 @@ bool fragment_program_compare::operator()(const RSXFragmentProgram& binary1, con const auto inst2 = v128::loadu(instBuffer2, instIndex); if (inst1._u ^ inst2._u) + { return false; + } instIndex++; // Skip constants @@ -578,9 +584,11 @@ bool fragment_program_compare::operator()(const RSXFragmentProgram& binary1, con fragment_program_utils::is_constant(inst1._u32[3])) instIndex++; - bool end = ((inst1._u32[0] >> 8) & 0x1) && ((inst2._u32[0] >> 8) & 0x1); + const bool end = ((inst1._u32[0] >> 8) & 0x1) && ((inst2._u32[0] >> 8) & 0x1); if (end) + { return true; + } } } diff --git a/rpcs3/Emu/RSX/Program/ShaderParam.h b/rpcs3/Emu/RSX/Program/ShaderParam.h index 3308ba6d9f..01e4931869 100644 --- a/rpcs3/Emu/RSX/Program/ShaderParam.h +++ b/rpcs3/Emu/RSX/Program/ShaderParam.h @@ -227,6 +227,14 @@ struct ParamArray return name; } + + void Clear() + { + for (auto& param : params) + { + param.clear(); + } + } }; class ShaderVariable diff --git a/rpcs3/emucore.vcxproj b/rpcs3/emucore.vcxproj index 6834ed5530..bbb69c30b2 100644 --- a/rpcs3/emucore.vcxproj +++ b/rpcs3/emucore.vcxproj @@ -140,6 +140,7 @@ + @@ -669,6 +670,7 @@ + diff --git a/rpcs3/emucore.vcxproj.filters b/rpcs3/emucore.vcxproj.filters index f36b57bb03..23c7c34fb6 100644 --- a/rpcs3/emucore.vcxproj.filters +++ b/rpcs3/emucore.vcxproj.filters @@ -1330,6 +1330,9 @@ Emu\Cell + + Emu\GPU\RSX\Program + @@ -2686,6 +2689,9 @@ Emu\Audio + + Emu\GPU\RSX\Program +