mirror of
https://github.com/RPCS3/rpcs3.git
synced 2025-02-23 15:40:34 +00:00
rsx: Use native half float types if available
- Emulating f16 with f32 is not ideal and requires a lot of value clamping
- Using the native data type can significantly improve performance and accuracy
- With OpenGL, check for the compatible extensions NV_gpu_shader5 and AMD_gpu_shader_half_float
- With Vulkan, enable this functionality in the deviceFeatures if applicable (VK_KHR_shader_float16_int8 extension)
- Temporarily disable hw fp16 for Vulkan
This commit is contained in:
parent
ee319f7c13
commit
a668560c68
@ -16,7 +16,6 @@ FragmentProgramDecompiler::FragmentProgramDecompiler(const RSXFragmentProgram &p
|
|||||||
m_size = 0;
|
m_size = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void FragmentProgramDecompiler::SetDst(std::string code, bool append_mask)
|
void FragmentProgramDecompiler::SetDst(std::string code, bool append_mask)
|
||||||
{
|
{
|
||||||
if (!src0.exec_if_eq && !src0.exec_if_gr && !src0.exec_if_lt) return;
|
if (!src0.exec_if_eq && !src0.exec_if_gr && !src0.exec_if_lt) return;
|
||||||
@ -45,6 +44,20 @@ void FragmentProgramDecompiler::SetDst(std::string code, bool append_mask)
|
|||||||
code = "((" + code + "- 0.5) * 2.)";
|
code = "((" + code + "- 0.5) * 2.)";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (dst.fp16 && device_props.has_native_half_support)
|
||||||
|
{
|
||||||
|
// Cast to native data type
|
||||||
|
if (dst.opcode == RSX_FP_OPCODE_NRM)
|
||||||
|
{
|
||||||
|
// Returns a 3-component vector as the result
|
||||||
|
code = ClampValue(code + ".xyzz", 1);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
code = ClampValue(code, 1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (dst.saturate)
|
if (dst.saturate)
|
||||||
{
|
{
|
||||||
code = saturate(code);
|
code = saturate(code);
|
||||||
@ -67,16 +80,20 @@ void FragmentProgramDecompiler::SetDst(std::string code, bool append_mask)
|
|||||||
case RSX_FP_OPCODE_LG2:
|
case RSX_FP_OPCODE_LG2:
|
||||||
break;
|
break;
|
||||||
case RSX_FP_OPCODE_MOV:
|
case RSX_FP_OPCODE_MOV:
|
||||||
//NOTE: Sometimes varying inputs from VS are out of range so do not exempt any input types, unless fp16 (Naruto UNS)
|
// NOTE: Sometimes varying inputs from VS are out of range so do not exempt any input types, unless fp16 (Naruto UNS)
|
||||||
if (dst.fp16 && src0.fp16 && src0.reg_type == RSX_FP_REGISTER_TYPE_TEMP)
|
if (dst.fp16 && src0.fp16 && src0.reg_type == RSX_FP_REGISTER_TYPE_TEMP)
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
{
|
{
|
||||||
//fp16 precsion flag on f32 register; ignore
|
// fp16 precsion flag on f32 register; ignore
|
||||||
if (dst.prec == 1 && !dst.fp16)
|
if (dst.prec == 1 && !dst.fp16)
|
||||||
break;
|
break;
|
||||||
|
|
||||||
//clamp value to allowed range
|
// Native type already has fp16 clamped (input must have been cast)
|
||||||
|
if (dst.prec == 1 && dst.fp16 && device_props.has_native_half_support)
|
||||||
|
break;
|
||||||
|
|
||||||
|
// clamp value to allowed range
|
||||||
code = ClampValue(code, dst.prec);
|
code = ClampValue(code, dst.prec);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@ -100,7 +117,7 @@ void FragmentProgramDecompiler::SetDst(std::string code, bool append_mask)
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string dest = AddReg(dst.dest_reg, dst.fp16) + "$m";
|
std::string dest = AddReg(dst.dest_reg, !!dst.fp16) + "$m";
|
||||||
|
|
||||||
AddCodeCond(Format(dest), code);
|
AddCodeCond(Format(dest), code);
|
||||||
//AddCode("$ifcond " + dest + code + (append_mask ? "$m;" : ";"));
|
//AddCode("$ifcond " + dest + code + (append_mask ? "$m;" : ";"));
|
||||||
@ -159,15 +176,20 @@ std::string FragmentProgramDecompiler::GetMask()
|
|||||||
return ret.empty() || strncmp(ret.c_str(), dst_mask, 4) == 0 ? "" : ("." + ret);
|
return ret.empty() || strncmp(ret.c_str(), dst_mask, 4) == 0 ? "" : ("." + ret);
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string FragmentProgramDecompiler::AddReg(u32 index, int fp16)
|
std::string FragmentProgramDecompiler::AddReg(u32 index, bool fp16)
|
||||||
{
|
{
|
||||||
return m_parr.AddParam(PF_PARAM_NONE, getFloatTypeName(4), std::string(fp16 ? "h" : "r") + std::to_string(index), getFloatTypeName(4) + "(0., 0., 0., 0.)");
|
const std::string type_name = (fp16 && device_props.has_native_half_support)? getHalfTypeName(4) : getFloatTypeName(4);
|
||||||
|
const std::string reg_name = std::string(fp16 ? "h" : "r") + std::to_string(index);
|
||||||
|
|
||||||
|
return m_parr.AddParam(PF_PARAM_NONE, type_name, reg_name, type_name + "(0., 0., 0., 0.)");
|
||||||
}
|
}
|
||||||
|
|
||||||
bool FragmentProgramDecompiler::HasReg(u32 index, int fp16)
|
bool FragmentProgramDecompiler::HasReg(u32 index, bool fp16)
|
||||||
{
|
{
|
||||||
return m_parr.HasParam(PF_PARAM_NONE, getFloatTypeName(4),
|
const std::string type_name = (fp16 && device_props.has_native_half_support)? getHalfTypeName(4) : getFloatTypeName(4);
|
||||||
std::string(fp16 ? "h" : "r") + std::to_string(index));
|
const std::string reg_name = std::string(fp16 ? "h" : "r") + std::to_string(index);
|
||||||
|
|
||||||
|
return m_parr.HasParam(PF_PARAM_NONE, type_name, reg_name);
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string FragmentProgramDecompiler::AddCond()
|
std::string FragmentProgramDecompiler::AddCond()
|
||||||
@ -177,22 +199,23 @@ std::string FragmentProgramDecompiler::AddCond()
|
|||||||
|
|
||||||
std::string FragmentProgramDecompiler::AddConst()
|
std::string FragmentProgramDecompiler::AddConst()
|
||||||
{
|
{
|
||||||
std::string name = std::string("fc") + std::to_string(m_size + 4 * 4);
|
const std::string name = std::string("fc") + std::to_string(m_size + 4 * 4);
|
||||||
if (m_parr.HasParam(PF_PARAM_UNIFORM, getFloatTypeName(4), name))
|
const std::string type = getFloatTypeName(4);
|
||||||
|
|
||||||
|
if (m_parr.HasParam(PF_PARAM_UNIFORM, type, name))
|
||||||
{
|
{
|
||||||
return name;
|
return name;
|
||||||
}
|
}
|
||||||
|
|
||||||
auto data = (be_t<u32>*) ((char*)m_prog.addr + m_size + 4 * u32{sizeof(u32)});
|
auto data = (be_t<u32>*) ((char*)m_prog.addr + m_size + 4 * u32{sizeof(u32)});
|
||||||
|
|
||||||
m_offset = 2 * 4 * sizeof(u32);
|
m_offset = 2 * 4 * sizeof(u32);
|
||||||
u32 x = GetData(data[0]);
|
u32 x = GetData(data[0]);
|
||||||
u32 y = GetData(data[1]);
|
u32 y = GetData(data[1]);
|
||||||
u32 z = GetData(data[2]);
|
u32 z = GetData(data[2]);
|
||||||
u32 w = GetData(data[3]);
|
u32 w = GetData(data[3]);
|
||||||
return m_parr.AddParam(PF_PARAM_UNIFORM, getFloatTypeName(4), name,
|
|
||||||
std::string(getFloatTypeName(4) + "(") + std::to_string((float&)x) + ", " + std::to_string((float&)y)
|
const auto var = fmt::format("%s(%f, %f, %f, %f)", type, (f32&)x, (f32&)y, (f32&)z, (f32&)w);
|
||||||
+ ", " + std::to_string((float&)z) + ", " + std::to_string((float&)w) + ")");
|
return m_parr.AddParam(PF_PARAM_UNIFORM, type, name, var);
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string FragmentProgramDecompiler::AddTex()
|
std::string FragmentProgramDecompiler::AddTex()
|
||||||
@ -240,9 +263,9 @@ std::string FragmentProgramDecompiler::NotZeroPositive(const std::string& code)
|
|||||||
|
|
||||||
std::string FragmentProgramDecompiler::ClampValue(const std::string& code, u32 precision)
|
std::string FragmentProgramDecompiler::ClampValue(const std::string& code, u32 precision)
|
||||||
{
|
{
|
||||||
//FP16 is expected to overflow a lot easier at 0+-65504
|
// FP16 is expected to overflow a lot easier at 0+-65504
|
||||||
//FP32 can still work up to 0+-3.4E38
|
// FP32 can still work up to 0+-3.4E38
|
||||||
//See http://http.download.nvidia.com/developer/Papers/2005/FP_Specials/FP_Specials.pdf
|
// See http://http.download.nvidia.com/developer/Papers/2005/FP_Specials/FP_Specials.pdf
|
||||||
|
|
||||||
switch (precision)
|
switch (precision)
|
||||||
{
|
{
|
||||||
@ -405,7 +428,7 @@ template<typename T> std::string FragmentProgramDecompiler::GetSRC(T src)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
ret += AddReg(src.tmp_reg_index, src.fp16);
|
ret += AddReg(src.tmp_reg_index, !!src.fp16);
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case RSX_FP_REGISTER_TYPE_INPUT:
|
case RSX_FP_REGISTER_TYPE_INPUT:
|
||||||
@ -472,7 +495,7 @@ template<typename T> std::string FragmentProgramDecompiler::GetSRC(T src)
|
|||||||
|
|
||||||
if (strncmp(swizzle.c_str(), f, 4) != 0) ret += "." + swizzle;
|
if (strncmp(swizzle.c_str(), f, 4) != 0) ret += "." + swizzle;
|
||||||
|
|
||||||
//Warning: Modifier order matters. e.g neg should be applied after precision clamping (tested with Naruto UNS)
|
// Warning: Modifier order matters. e.g neg should be applied after precision clamping (tested with Naruto UNS)
|
||||||
if (src.abs) ret = "abs(" + ret + ")";
|
if (src.abs) ret = "abs(" + ret + ")";
|
||||||
if (apply_precision_modifier) ret = ClampValue(ret, src1.input_prec_mod);
|
if (apply_precision_modifier) ret = ClampValue(ret, src1.input_prec_mod);
|
||||||
if (src.neg) ret = "-" + ret;
|
if (src.neg) ret = "-" + ret;
|
||||||
@ -485,7 +508,8 @@ std::string FragmentProgramDecompiler::BuildCode()
|
|||||||
// Shader validation
|
// Shader validation
|
||||||
// Shader must at least write to one output for the body to be considered valid
|
// Shader must at least write to one output for the body to be considered valid
|
||||||
|
|
||||||
const std::string vec4_type = getFloatTypeName(4);
|
const bool fp16_out = !(m_ctrl & CELL_GCM_SHADER_CONTROL_32_BITS_EXPORTS);
|
||||||
|
const std::string vec4_type = (fp16_out && device_props.has_native_half_support)? getHalfTypeName(4) : getFloatTypeName(4);
|
||||||
const std::string init_value = vec4_type + "(0., 0., 0., 0.)";
|
const std::string init_value = vec4_type + "(0., 0., 0., 0.)";
|
||||||
std::array<std::string, 4> output_register_names;
|
std::array<std::string, 4> output_register_names;
|
||||||
std::array<u32, 4> ouput_register_indices = { 0, 2, 3, 4 };
|
std::array<u32, 4> ouput_register_indices = { 0, 2, 3, 4 };
|
||||||
@ -502,7 +526,7 @@ std::string FragmentProgramDecompiler::BuildCode()
|
|||||||
|
|
||||||
// Add the color output registers. They are statically written to and have guaranteed initialization (except r1.z which == wpos.z)
|
// Add the color output registers. They are statically written to and have guaranteed initialization (except r1.z which == wpos.z)
|
||||||
// This can be used instead of an explicit clear pass in some games (Motorstorm)
|
// This can be used instead of an explicit clear pass in some games (Motorstorm)
|
||||||
if (m_ctrl & CELL_GCM_SHADER_CONTROL_32_BITS_EXPORTS)
|
if (!fp16_out)
|
||||||
{
|
{
|
||||||
output_register_names = { "r0", "r2", "r3", "r4" };
|
output_register_names = { "r0", "r2", "r3", "r4" };
|
||||||
}
|
}
|
||||||
@ -548,6 +572,8 @@ std::string FragmentProgramDecompiler::BuildCode()
|
|||||||
// Insert global function definitions
|
// Insert global function definitions
|
||||||
insertGlobalFunctions(OS);
|
insertGlobalFunctions(OS);
|
||||||
|
|
||||||
|
if (!device_props.has_native_half_support)
|
||||||
|
{
|
||||||
// Accurate float to half clamping (preserves IEEE-754 NaN)
|
// Accurate float to half clamping (preserves IEEE-754 NaN)
|
||||||
OS <<
|
OS <<
|
||||||
"vec4 clamp16(vec4 x)\n"
|
"vec4 clamp16(vec4 x)\n"
|
||||||
@ -563,7 +589,15 @@ std::string FragmentProgramDecompiler::BuildCode()
|
|||||||
|
|
||||||
"vec3 clamp16(vec3 x){ return clamp16(x.xyzz).xyz; }\n"
|
"vec3 clamp16(vec3 x){ return clamp16(x.xyzz).xyz; }\n"
|
||||||
"vec2 clamp16(vec2 x){ return clamp16(x.xyxy).xy; }\n"
|
"vec2 clamp16(vec2 x){ return clamp16(x.xyxy).xy; }\n"
|
||||||
"float clamp16(float x){ return isnan(x)? x : clamp(x, -65504., +65504.); }\n";
|
"float clamp16(float x){ return isnan(x)? x : clamp(x, -65504., +65504.); }\n\n";
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// Define raw casts from f32->f16
|
||||||
|
const std::string half4 = getHalfTypeName(4);
|
||||||
|
OS << "#define clamp16(x) " << half4 << "(x)\n\n";
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
// Declare register gather/merge if needed
|
// Declare register gather/merge if needed
|
||||||
if (properties.has_gather_op)
|
if (properties.has_gather_op)
|
||||||
@ -595,7 +629,7 @@ std::string FragmentProgramDecompiler::BuildCode()
|
|||||||
return OS.str();
|
return OS.str();
|
||||||
}
|
}
|
||||||
|
|
||||||
bool FragmentProgramDecompiler::handle_sct(u32 opcode)
|
bool FragmentProgramDecompiler::handle_sct_scb(u32 opcode)
|
||||||
{
|
{
|
||||||
switch (opcode)
|
switch (opcode)
|
||||||
{
|
{
|
||||||
@ -626,24 +660,9 @@ bool FragmentProgramDecompiler::handle_sct(u32 opcode)
|
|||||||
case RSX_FP_OPCODE_SLT: SetDst(getFloatTypeName(4) + "(" + compareFunction(COMPARE::FUNCTION_SLT, "$0", "$1") + ")"); return true;
|
case RSX_FP_OPCODE_SLT: SetDst(getFloatTypeName(4) + "(" + compareFunction(COMPARE::FUNCTION_SLT, "$0", "$1") + ")"); return true;
|
||||||
case RSX_FP_OPCODE_SNE: SetDst(getFloatTypeName(4) + "(" + compareFunction(COMPARE::FUNCTION_SNE, "$0", "$1") + ")"); return true;
|
case RSX_FP_OPCODE_SNE: SetDst(getFloatTypeName(4) + "(" + compareFunction(COMPARE::FUNCTION_SNE, "$0", "$1") + ")"); return true;
|
||||||
case RSX_FP_OPCODE_STR: SetDst(getFunction(FUNCTION::FUNCTION_STR)); return true;
|
case RSX_FP_OPCODE_STR: SetDst(getFunction(FUNCTION::FUNCTION_STR)); return true;
|
||||||
}
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool FragmentProgramDecompiler::handle_scb(u32 opcode)
|
// SCB-only ops
|
||||||
{
|
|
||||||
switch (opcode)
|
|
||||||
{
|
|
||||||
case RSX_FP_OPCODE_ADD: SetDst("($0 + $1)"); return true;
|
|
||||||
case RSX_FP_OPCODE_COS: SetDst("cos($0.xxxx)"); return true;
|
case RSX_FP_OPCODE_COS: SetDst("cos($0.xxxx)"); return true;
|
||||||
case RSX_FP_OPCODE_DIV: SetDst("($0 / " + NotZero("$1.x") + ")"); return true;
|
|
||||||
// Note: DIVSQ is not IEEE compliant. sqrt(0, 0) is 0 (Super Puzzle Fighter II Turbo HD Remix).
|
|
||||||
// sqrt(x, 0) might be equal to some big value (in absolute) whose sign is sign(x) but it has to be proven.
|
|
||||||
case RSX_FP_OPCODE_DIVSQ: SetDst("($0 / sqrt(" + NotZeroPositive("$1.x") + "))"); return true;
|
|
||||||
case RSX_FP_OPCODE_DP2: SetDst(getFunction(FUNCTION::FUNCTION_DP2)); return true;
|
|
||||||
case RSX_FP_OPCODE_DP3: SetDst(getFunction(FUNCTION::FUNCTION_DP3)); return true;
|
|
||||||
case RSX_FP_OPCODE_DP4: SetDst(getFunction(FUNCTION::FUNCTION_DP4)); return true;
|
|
||||||
case RSX_FP_OPCODE_DP2A: SetDst(getFunction(FUNCTION::FUNCTION_DP2A)); return true;
|
|
||||||
case RSX_FP_OPCODE_DST: SetDst("vec4(distance($0, $1))"); return true;
|
case RSX_FP_OPCODE_DST: SetDst("vec4(distance($0, $1))"); return true;
|
||||||
case RSX_FP_OPCODE_REFL: SetDst(getFunction(FUNCTION::FUNCTION_REFL)); return true;
|
case RSX_FP_OPCODE_REFL: SetDst(getFunction(FUNCTION::FUNCTION_REFL)); return true;
|
||||||
case RSX_FP_OPCODE_EX2: SetDst("exp2($0.xxxx)"); return true;
|
case RSX_FP_OPCODE_EX2: SetDst("exp2($0.xxxx)"); return true;
|
||||||
@ -656,11 +675,6 @@ bool FragmentProgramDecompiler::handle_scb(u32 opcode)
|
|||||||
case RSX_FP_OPCODE_LIF: SetDst(getFloatTypeName(4) + "(1.0, $0.y, ($0.y > 0 ? pow(2.0, $0.w) : 0.0), 1.0)"); return true;
|
case RSX_FP_OPCODE_LIF: SetDst(getFloatTypeName(4) + "(1.0, $0.y, ($0.y > 0 ? pow(2.0, $0.w) : 0.0), 1.0)"); return true;
|
||||||
case RSX_FP_OPCODE_LRP: SetDst(getFloatTypeName(4) + "($2 * (1 - $0) + $1 * $0)"); return true;
|
case RSX_FP_OPCODE_LRP: SetDst(getFloatTypeName(4) + "($2 * (1 - $0) + $1 * $0)"); return true;
|
||||||
case RSX_FP_OPCODE_LG2: SetDst("log2(" + NotZeroPositive("$0.x") + ").xxxx"); return true;
|
case RSX_FP_OPCODE_LG2: SetDst("log2(" + NotZeroPositive("$0.x") + ").xxxx"); return true;
|
||||||
case RSX_FP_OPCODE_MAD: SetDst("($0 * $1 + $2)"); return true;
|
|
||||||
case RSX_FP_OPCODE_MAX: SetDst("max($0, $1)"); return true;
|
|
||||||
case RSX_FP_OPCODE_MIN: SetDst("min($0, $1)"); return true;
|
|
||||||
case RSX_FP_OPCODE_MOV: SetDst("$0"); return true;
|
|
||||||
case RSX_FP_OPCODE_MUL: SetDst("($0 * $1)"); return true;
|
|
||||||
//Pack operations. See https://www.khronos.org/registry/OpenGL/extensions/NV/NV_fragment_program.txt
|
//Pack operations. See https://www.khronos.org/registry/OpenGL/extensions/NV/NV_fragment_program.txt
|
||||||
case RSX_FP_OPCODE_PK2: SetDst(getFloatTypeName(4) + "(uintBitsToFloat(packHalf2x16($0.xy)))"); return true;
|
case RSX_FP_OPCODE_PK2: SetDst(getFloatTypeName(4) + "(uintBitsToFloat(packHalf2x16($0.xy)))"); return true;
|
||||||
case RSX_FP_OPCODE_PK4: SetDst(getFloatTypeName(4) + "(uintBitsToFloat(packSnorm4x8($0)))"); return true;
|
case RSX_FP_OPCODE_PK4: SetDst(getFloatTypeName(4) + "(uintBitsToFloat(packSnorm4x8($0)))"); return true;
|
||||||
@ -668,15 +682,7 @@ bool FragmentProgramDecompiler::handle_scb(u32 opcode)
|
|||||||
case RSX_FP_OPCODE_PKG:
|
case RSX_FP_OPCODE_PKG:
|
||||||
//Should be similar to PKB but with gamma correction, see description of PK4UBG in khronos page
|
//Should be similar to PKB but with gamma correction, see description of PK4UBG in khronos page
|
||||||
case RSX_FP_OPCODE_PKB: SetDst(getFloatTypeName(4) + "(uintBitsToFloat(packUnorm4x8($0)))"); return true;
|
case RSX_FP_OPCODE_PKB: SetDst(getFloatTypeName(4) + "(uintBitsToFloat(packUnorm4x8($0)))"); return true;
|
||||||
case RSX_FP_OPCODE_SEQ: SetDst(getFloatTypeName(4) + "(" + compareFunction(COMPARE::FUNCTION_SEQ, "$0", "$1") + ")"); return true;
|
|
||||||
case RSX_FP_OPCODE_SFL: SetDst(getFunction(FUNCTION::FUNCTION_SFL)); return true;
|
|
||||||
case RSX_FP_OPCODE_SGE: SetDst(getFloatTypeName(4) + "(" + compareFunction(COMPARE::FUNCTION_SGE, "$0", "$1") + ")"); return true;
|
|
||||||
case RSX_FP_OPCODE_SGT: SetDst(getFloatTypeName(4) + "(" + compareFunction(COMPARE::FUNCTION_SGT, "$0", "$1") + ")"); return true;
|
|
||||||
case RSX_FP_OPCODE_SIN: SetDst("sin($0.xxxx)"); return true;
|
case RSX_FP_OPCODE_SIN: SetDst("sin($0.xxxx)"); return true;
|
||||||
case RSX_FP_OPCODE_SLE: SetDst(getFloatTypeName(4) + "(" + compareFunction(COMPARE::FUNCTION_SLE, "$0", "$1") + ")"); return true;
|
|
||||||
case RSX_FP_OPCODE_SLT: SetDst(getFloatTypeName(4) + "(" + compareFunction(COMPARE::FUNCTION_SLT, "$0", "$1") + ")"); return true;
|
|
||||||
case RSX_FP_OPCODE_SNE: SetDst(getFloatTypeName(4) + "(" + compareFunction(COMPARE::FUNCTION_SNE, "$0", "$1") + ")"); return true;
|
|
||||||
case RSX_FP_OPCODE_STR: SetDst(getFunction(FUNCTION::FUNCTION_STR)); return true;
|
|
||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
@ -949,16 +955,14 @@ std::string FragmentProgramDecompiler::Decompile()
|
|||||||
default:
|
default:
|
||||||
int prev_force_unit = forced_unit;
|
int prev_force_unit = forced_unit;
|
||||||
|
|
||||||
//Some instructions do not respect forced unit
|
// Some instructions do not respect forced unit
|
||||||
//Tested with Tales of Vesperia
|
// Tested with Tales of Vesperia
|
||||||
if (SIP()) break;
|
if (SIP()) break;
|
||||||
if (handle_tex_srb(opcode)) break;
|
if (handle_tex_srb(opcode)) break;
|
||||||
|
|
||||||
//FENCT/FENCB do not actually reject instructions if they dont match the forced unit
|
// FENCT/FENCB do not actually reject instructions if they dont match the forced unit
|
||||||
//Tested with Dark Souls II where the respecting FENCX instruction will result in empty luminance averaging shaders
|
// Looks like they are optimization hints and not hard-coded forced paths
|
||||||
//TODO: More research is needed to determine what real HW does
|
if (handle_sct_scb(opcode)) break;
|
||||||
if (handle_sct(opcode)) break;
|
|
||||||
if (handle_scb(opcode)) break;
|
|
||||||
forced_unit = FORCE_NONE;
|
forced_unit = FORCE_NONE;
|
||||||
|
|
||||||
LOG_ERROR(RSX, "Unknown/illegal instruction: 0x%x (forced unit %d)", opcode, prev_force_unit);
|
LOG_ERROR(RSX, "Unknown/illegal instruction: 0x%x (forced unit %d)", opcode, prev_force_unit);
|
||||||
|
@ -3,24 +3,9 @@
|
|||||||
#include "Emu/RSX/RSXFragmentProgram.h"
|
#include "Emu/RSX/RSXFragmentProgram.h"
|
||||||
#include <sstream>
|
#include <sstream>
|
||||||
|
|
||||||
/**
|
// Helper for GPR occupancy tracking
|
||||||
* This class is used to translate RSX Fragment program to GLSL/HLSL code
|
struct temp_register
|
||||||
* Backend with text based shader can subclass this class and implement :
|
|
||||||
* - virtual std::string getFloatTypeName(size_t elementCount) = 0;
|
|
||||||
* - virtual std::string getFunction(enum class FUNCTION) = 0;
|
|
||||||
* - virtual std::string saturate(const std::string &code) = 0;
|
|
||||||
* - virtual std::string compareFunction(enum class COMPARE, const std::string &, const std::string &) = 0;
|
|
||||||
* - virtual void insertHeader(std::stringstream &OS) = 0;
|
|
||||||
* - virtual void insertInputs(std::stringstream &OS) = 0;
|
|
||||||
* - virtual void insertOutputs(std::stringstream &OS) = 0;
|
|
||||||
* - virtual void insertConstants(std::stringstream &OS) = 0;
|
|
||||||
* - virtual void insertMainStart(std::stringstream &OS) = 0;
|
|
||||||
* - virtual void insertMainEnd(std::stringstream &OS) = 0;
|
|
||||||
*/
|
|
||||||
class FragmentProgramDecompiler
|
|
||||||
{
|
{
|
||||||
struct temp_register
|
|
||||||
{
|
|
||||||
bool aliased_r0 = false;
|
bool aliased_r0 = false;
|
||||||
bool aliased_h0 = false;
|
bool aliased_h0 = false;
|
||||||
bool aliased_h1 = false;
|
bool aliased_h1 = false;
|
||||||
@ -124,8 +109,25 @@ class FragmentProgramDecompiler
|
|||||||
|
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This class is used to translate RSX Fragment program to GLSL/HLSL code
|
||||||
|
* Backend with text based shader can subclass this class and implement :
|
||||||
|
* - virtual std::string getFloatTypeName(size_t elementCount) = 0;
|
||||||
|
* - virtual std::string getHalfTypeName(size_t elementCount) = 0;
|
||||||
|
* - virtual std::string getFunction(enum class FUNCTION) = 0;
|
||||||
|
* - virtual std::string saturate(const std::string &code) = 0;
|
||||||
|
* - virtual std::string compareFunction(enum class COMPARE, const std::string &, const std::string &) = 0;
|
||||||
|
* - virtual void insertHeader(std::stringstream &OS) = 0;
|
||||||
|
* - virtual void insertInputs(std::stringstream &OS) = 0;
|
||||||
|
* - virtual void insertOutputs(std::stringstream &OS) = 0;
|
||||||
|
* - virtual void insertConstants(std::stringstream &OS) = 0;
|
||||||
|
* - virtual void insertMainStart(std::stringstream &OS) = 0;
|
||||||
|
* - virtual void insertMainEnd(std::stringstream &OS) = 0;
|
||||||
|
*/
|
||||||
|
class FragmentProgramDecompiler
|
||||||
|
{
|
||||||
OPDEST dst;
|
OPDEST dst;
|
||||||
SRC0 src0;
|
SRC0 src0;
|
||||||
SRC1 src1;
|
SRC1 src1;
|
||||||
@ -148,8 +150,8 @@ class FragmentProgramDecompiler
|
|||||||
|
|
||||||
void SetDst(std::string code, bool append_mask = true);
|
void SetDst(std::string code, bool append_mask = true);
|
||||||
void AddCode(const std::string& code);
|
void AddCode(const std::string& code);
|
||||||
std::string AddReg(u32 index, int fp16);
|
std::string AddReg(u32 index, bool fp16);
|
||||||
bool HasReg(u32 index, int fp16);
|
bool HasReg(u32 index, bool fp16);
|
||||||
std::string AddCond();
|
std::string AddCond();
|
||||||
std::string AddConst();
|
std::string AddConst();
|
||||||
std::string AddTex();
|
std::string AddTex();
|
||||||
@ -184,18 +186,11 @@ class FragmentProgramDecompiler
|
|||||||
u32 GetData(const u32 d) const { return d << 16 | d >> 16; }
|
u32 GetData(const u32 d) const { return d << 16 | d >> 16; }
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Emits code if opcode is an SCT one and returns true,
|
* Emits code if opcode is an SCT/SCB one and returns true,
|
||||||
* otherwise do nothing and return false.
|
* otherwise do nothing and return false.
|
||||||
* NOTE: What does SCT means ???
|
* NOTE: What does SCT means ???
|
||||||
*/
|
*/
|
||||||
bool handle_sct(u32 opcode);
|
bool handle_sct_scb(u32 opcode);
|
||||||
|
|
||||||
/**
|
|
||||||
* Emits code if opcode is an SCB one and returns true,
|
|
||||||
* otherwise do nothing and return false.
|
|
||||||
* NOTE: What does SCB means ???
|
|
||||||
*/
|
|
||||||
bool handle_scb(u32 opcode);
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Emits code if opcode is an TEX SRB one and returns true,
|
* Emits code if opcode is an TEX SRB one and returns true,
|
||||||
@ -203,6 +198,7 @@ class FragmentProgramDecompiler
|
|||||||
* NOTE: What does TEX SRB means ???
|
* NOTE: What does TEX SRB means ???
|
||||||
*/
|
*/
|
||||||
bool handle_tex_srb(u32 opcode);
|
bool handle_tex_srb(u32 opcode);
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
const RSXFragmentProgram &m_prog;
|
const RSXFragmentProgram &m_prog;
|
||||||
u32 m_ctrl = 0;
|
u32 m_ctrl = 0;
|
||||||
@ -214,6 +210,10 @@ protected:
|
|||||||
*/
|
*/
|
||||||
virtual std::string getFloatTypeName(size_t elementCount) = 0;
|
virtual std::string getFloatTypeName(size_t elementCount) = 0;
|
||||||
|
|
||||||
|
/** returns the type name of half vectors.
|
||||||
|
*/
|
||||||
|
virtual std::string getHalfTypeName(size_t elementCount) = 0;
|
||||||
|
|
||||||
/** returns string calling function where arguments are passed via
|
/** returns string calling function where arguments are passed via
|
||||||
* $0 $1 $2 substring.
|
* $0 $1 $2 substring.
|
||||||
*/
|
*/
|
||||||
@ -259,6 +259,12 @@ public:
|
|||||||
}
|
}
|
||||||
properties;
|
properties;
|
||||||
|
|
||||||
|
struct
|
||||||
|
{
|
||||||
|
bool has_native_half_support = false;
|
||||||
|
}
|
||||||
|
device_props;
|
||||||
|
|
||||||
ParamArray m_parr;
|
ParamArray m_parr;
|
||||||
FragmentProgramDecompiler(const RSXFragmentProgram &prog, u32& size);
|
FragmentProgramDecompiler(const RSXFragmentProgram &prog, u32& size);
|
||||||
FragmentProgramDecompiler(const FragmentProgramDecompiler&) = delete;
|
FragmentProgramDecompiler(const FragmentProgramDecompiler&) = delete;
|
||||||
|
@ -111,6 +111,23 @@ namespace glsl
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static std::string getHalfTypeNameImpl(size_t elementCount)
|
||||||
|
{
|
||||||
|
switch (elementCount)
|
||||||
|
{
|
||||||
|
default:
|
||||||
|
abort();
|
||||||
|
case 1:
|
||||||
|
return "float16_t";
|
||||||
|
case 2:
|
||||||
|
return "f16vec2";
|
||||||
|
case 3:
|
||||||
|
return "f16vec3";
|
||||||
|
case 4:
|
||||||
|
return "f16vec4";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
static std::string compareFunctionImpl(COMPARE f, const std::string &Op0, const std::string &Op1, bool scalar = false)
|
static std::string compareFunctionImpl(COMPARE f, const std::string &Op0, const std::string &Op1, bool scalar = false)
|
||||||
{
|
{
|
||||||
if (scalar)
|
if (scalar)
|
||||||
@ -372,7 +389,7 @@ namespace glsl
|
|||||||
"}\n\n";
|
"}\n\n";
|
||||||
}
|
}
|
||||||
|
|
||||||
static void insert_rop(std::ostream& OS, bool _32_bit_exports)
|
static void insert_rop(std::ostream& OS, bool _32_bit_exports, bool native_half_support)
|
||||||
{
|
{
|
||||||
const std::string reg0 = _32_bit_exports ? "r0" : "h0";
|
const std::string reg0 = _32_bit_exports ? "r0" : "h0";
|
||||||
const std::string reg1 = _32_bit_exports ? "r2" : "h4";
|
const std::string reg1 = _32_bit_exports ? "r2" : "h4";
|
||||||
@ -398,7 +415,20 @@ namespace glsl
|
|||||||
|
|
||||||
if (!_32_bit_exports)
|
if (!_32_bit_exports)
|
||||||
{
|
{
|
||||||
//Tested using NPUB90375; some shaders (32-bit output only?) do not obey srgb flags
|
// Tested using NPUB90375; some shaders (32-bit output only?) do not obey srgb flags
|
||||||
|
if (native_half_support)
|
||||||
|
{
|
||||||
|
OS <<
|
||||||
|
" else if (srgb_convert)\n"
|
||||||
|
" {\n"
|
||||||
|
" " << reg0 << ".rgb = clamp16(linear_to_srgb(" << reg0 << ")).rgb;\n"
|
||||||
|
" " << reg1 << ".rgb = clamp16(linear_to_srgb(" << reg1 << ")).rgb;\n"
|
||||||
|
" " << reg2 << ".rgb = clamp16(linear_to_srgb(" << reg2 << ")).rgb;\n"
|
||||||
|
" " << reg3 << ".rgb = clamp16(linear_to_srgb(" << reg3 << ")).rgb;\n"
|
||||||
|
" }\n";
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
OS <<
|
OS <<
|
||||||
" else if (srgb_convert)\n"
|
" else if (srgb_convert)\n"
|
||||||
" {\n"
|
" {\n"
|
||||||
@ -408,6 +438,7 @@ namespace glsl
|
|||||||
" " << reg3 << ".rgb = linear_to_srgb(" << reg3 << ").rgb;\n"
|
" " << reg3 << ".rgb = linear_to_srgb(" << reg3 << ").rgb;\n"
|
||||||
" }\n";
|
" }\n";
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
OS <<
|
OS <<
|
||||||
" }\n\n"
|
" }\n\n"
|
||||||
@ -468,7 +499,7 @@ namespace glsl
|
|||||||
// Alpha lower than the real threshold (e.g 0.25 for 4 samples) gets a randomized chance to make it to the lowest transparency state
|
// Alpha lower than the real threshold (e.g 0.25 for 4 samples) gets a randomized chance to make it to the lowest transparency state
|
||||||
// Helps to avoid A2C tested foliage disappearing in the distance
|
// Helps to avoid A2C tested foliage disappearing in the distance
|
||||||
OS <<
|
OS <<
|
||||||
"bool coverage_test_passes(inout vec4 _sample, uint control)\n"
|
"bool coverage_test_passes(/*inout*/in vec4 _sample, uint control)\n"
|
||||||
"{\n"
|
"{\n"
|
||||||
" if ((control & 0x1) == 0) return false;\n"
|
" if ((control & 0x1) == 0) return false;\n"
|
||||||
"\n"
|
"\n"
|
||||||
|
@ -18,6 +18,11 @@ std::string D3D12FragmentDecompiler::getFloatTypeName(size_t elementCount)
|
|||||||
return getFloatTypeNameImp(elementCount);
|
return getFloatTypeNameImp(elementCount);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
std::string D3D12FragmentDecompiler::getHalfTypeName(size_t elementCount)
|
||||||
|
{
|
||||||
|
return getFloatTypeNameImp(elementCount);
|
||||||
|
}
|
||||||
|
|
||||||
std::string D3D12FragmentDecompiler::getFunction(enum class FUNCTION f)
|
std::string D3D12FragmentDecompiler::getFunction(enum class FUNCTION f)
|
||||||
{
|
{
|
||||||
return getFunctionImp(f);
|
return getFunctionImp(f);
|
||||||
|
@ -9,6 +9,7 @@ class D3D12FragmentDecompiler : public FragmentProgramDecompiler
|
|||||||
{
|
{
|
||||||
protected:
|
protected:
|
||||||
virtual std::string getFloatTypeName(size_t elementCount) override;
|
virtual std::string getFloatTypeName(size_t elementCount) override;
|
||||||
|
virtual std::string getHalfTypeName(size_t elementCount) override;
|
||||||
virtual std::string getFunction(enum class FUNCTION) override;
|
virtual std::string getFunction(enum class FUNCTION) override;
|
||||||
virtual std::string saturate(const std::string &code) override;
|
virtual std::string saturate(const std::string &code) override;
|
||||||
virtual std::string compareFunction(enum class COMPARE, const std::string &, const std::string &) override;
|
virtual std::string compareFunction(enum class COMPARE, const std::string &, const std::string &) override;
|
||||||
|
@ -2,8 +2,8 @@
|
|||||||
#include <set>
|
#include <set>
|
||||||
#include "Emu/Memory/vm.h"
|
#include "Emu/Memory/vm.h"
|
||||||
#include "Emu/System.h"
|
#include "Emu/System.h"
|
||||||
|
#include "GLHelpers.h"
|
||||||
#include "GLFragmentProgram.h"
|
#include "GLFragmentProgram.h"
|
||||||
#include "../Common/ProgramStateCache.h"
|
|
||||||
#include "GLCommonDecompiler.h"
|
#include "GLCommonDecompiler.h"
|
||||||
#include "../GCM.h"
|
#include "../GCM.h"
|
||||||
|
|
||||||
@ -13,6 +13,11 @@ std::string GLFragmentDecompilerThread::getFloatTypeName(size_t elementCount)
|
|||||||
return glsl::getFloatTypeNameImpl(elementCount);
|
return glsl::getFloatTypeNameImpl(elementCount);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
std::string GLFragmentDecompilerThread::getHalfTypeName(size_t elementCount)
|
||||||
|
{
|
||||||
|
return glsl::getHalfTypeNameImpl(elementCount);
|
||||||
|
}
|
||||||
|
|
||||||
std::string GLFragmentDecompilerThread::getFunction(FUNCTION f)
|
std::string GLFragmentDecompilerThread::getFunction(FUNCTION f)
|
||||||
{
|
{
|
||||||
return glsl::getFunctionImpl(f);
|
return glsl::getFunctionImpl(f);
|
||||||
@ -31,6 +36,19 @@ std::string GLFragmentDecompilerThread::compareFunction(COMPARE f, const std::st
|
|||||||
void GLFragmentDecompilerThread::insertHeader(std::stringstream & OS)
|
void GLFragmentDecompilerThread::insertHeader(std::stringstream & OS)
|
||||||
{
|
{
|
||||||
OS << "#version 430\n";
|
OS << "#version 430\n";
|
||||||
|
|
||||||
|
if (device_props.has_native_half_support)
|
||||||
|
{
|
||||||
|
const auto driver_caps = gl::get_driver_caps();
|
||||||
|
if (driver_caps.NV_gpu_shader5_supported)
|
||||||
|
{
|
||||||
|
OS << "#extension GL_NV_gpu_shader5: require\n";
|
||||||
|
}
|
||||||
|
else if (driver_caps.AMD_gpu_shader_half_float_supported)
|
||||||
|
{
|
||||||
|
OS << "#extension GL_AMD_gpu_shader_half_float: require\n";
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void GLFragmentDecompilerThread::insertInputs(std::stringstream & OS)
|
void GLFragmentDecompilerThread::insertInputs(std::stringstream & OS)
|
||||||
@ -92,9 +110,10 @@ void GLFragmentDecompilerThread::insertOutputs(std::stringstream & OS)
|
|||||||
{ "ocol3", m_ctrl & CELL_GCM_SHADER_CONTROL_32_BITS_EXPORTS ? "r4" : "h8" },
|
{ "ocol3", m_ctrl & CELL_GCM_SHADER_CONTROL_32_BITS_EXPORTS ? "r4" : "h8" },
|
||||||
};
|
};
|
||||||
|
|
||||||
|
const auto reg_type = (m_ctrl & CELL_GCM_SHADER_CONTROL_32_BITS_EXPORTS) ? "vec4" : getHalfTypeName(4);
|
||||||
for (int i = 0; i < std::size(table); ++i)
|
for (int i = 0; i < std::size(table); ++i)
|
||||||
{
|
{
|
||||||
if (m_parr.HasParam(PF_PARAM_NONE, "vec4", table[i].second))
|
if (m_parr.HasParam(PF_PARAM_NONE, reg_type, table[i].second))
|
||||||
OS << "layout(location=" << i << ") out vec4 " << table[i].first << ";\n";
|
OS << "layout(location=" << i << ") out vec4 " << table[i].first << ";\n";
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -206,14 +225,16 @@ void GLFragmentDecompilerThread::insertMainStart(std::stringstream & OS)
|
|||||||
};
|
};
|
||||||
|
|
||||||
std::string parameters = "";
|
std::string parameters = "";
|
||||||
|
const auto half4 = getHalfTypeName(4);
|
||||||
for (auto ®_name : output_values)
|
for (auto ®_name : output_values)
|
||||||
{
|
{
|
||||||
if (m_parr.HasParam(PF_PARAM_NONE, "vec4", reg_name))
|
const auto type = (reg_name[0] == 'r' || !device_props.has_native_half_support)? "vec4" : half4;
|
||||||
|
if (m_parr.HasParam(PF_PARAM_NONE, type, reg_name))
|
||||||
{
|
{
|
||||||
if (parameters.length())
|
if (parameters.length())
|
||||||
parameters += ", ";
|
parameters += ", ";
|
||||||
|
|
||||||
parameters += "inout vec4 " + reg_name;
|
parameters += "inout " + type + " " + reg_name;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -307,21 +328,24 @@ void GLFragmentDecompilerThread::insertMainEnd(std::stringstream & OS)
|
|||||||
OS << "{\n";
|
OS << "{\n";
|
||||||
|
|
||||||
std::string parameters = "";
|
std::string parameters = "";
|
||||||
|
const auto half4 = getHalfTypeName(4);
|
||||||
|
|
||||||
for (auto ®_name : output_values)
|
for (auto ®_name : output_values)
|
||||||
{
|
{
|
||||||
if (m_parr.HasParam(PF_PARAM_NONE, "vec4", reg_name))
|
const std::string type = (reg_name[0] == 'r' || !device_props.has_native_half_support)? "vec4" : half4;
|
||||||
|
if (m_parr.HasParam(PF_PARAM_NONE, type, reg_name))
|
||||||
{
|
{
|
||||||
if (parameters.length())
|
if (parameters.length())
|
||||||
parameters += ", ";
|
parameters += ", ";
|
||||||
|
|
||||||
parameters += reg_name;
|
parameters += reg_name;
|
||||||
OS << " vec4 " << reg_name << " = vec4(0.);\n";
|
OS << " " << type << " " << reg_name << " = " << type << "(0.);\n";
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
OS << "\n" << " fs_main(" + parameters + ");\n\n";
|
OS << "\n" << " fs_main(" + parameters + ");\n\n";
|
||||||
|
|
||||||
glsl::insert_rop(OS, !!(m_ctrl & CELL_GCM_SHADER_CONTROL_32_BITS_EXPORTS));
|
glsl::insert_rop(OS, !!(m_ctrl & CELL_GCM_SHADER_CONTROL_32_BITS_EXPORTS), device_props.has_native_half_support);
|
||||||
|
|
||||||
if (m_ctrl & CELL_GCM_SHADER_CONTROL_DEPTH_EXPORT)
|
if (m_ctrl & CELL_GCM_SHADER_CONTROL_DEPTH_EXPORT)
|
||||||
{
|
{
|
||||||
@ -359,7 +383,15 @@ void GLFragmentProgram::Decompile(const RSXFragmentProgram& prog)
|
|||||||
{
|
{
|
||||||
u32 size;
|
u32 size;
|
||||||
GLFragmentDecompilerThread decompiler(shader, parr, prog, size);
|
GLFragmentDecompilerThread decompiler(shader, parr, prog, size);
|
||||||
|
|
||||||
|
if (!g_cfg.video.disable_native_float16)
|
||||||
|
{
|
||||||
|
const auto driver_caps = gl::get_driver_caps();
|
||||||
|
decompiler.device_props.has_native_half_support = driver_caps.NV_gpu_shader5_supported || driver_caps.AMD_gpu_shader_half_float_supported;
|
||||||
|
}
|
||||||
|
|
||||||
decompiler.Task();
|
decompiler.Task();
|
||||||
|
|
||||||
for (const ParamType& PT : decompiler.m_parr.params[PF_PARAM_UNIFORM])
|
for (const ParamType& PT : decompiler.m_parr.params[PF_PARAM_UNIFORM])
|
||||||
{
|
{
|
||||||
for (const ParamItem& PI : PT.items)
|
for (const ParamItem& PI : PT.items)
|
||||||
|
@ -20,6 +20,7 @@ public:
|
|||||||
|
|
||||||
protected:
|
protected:
|
||||||
virtual std::string getFloatTypeName(size_t elementCount) override;
|
virtual std::string getFloatTypeName(size_t elementCount) override;
|
||||||
|
virtual std::string getHalfTypeName(size_t elementCount) override;
|
||||||
virtual std::string getFunction(FUNCTION) override;
|
virtual std::string getFunction(FUNCTION) override;
|
||||||
virtual std::string saturate(const std::string &code) override;
|
virtual std::string saturate(const std::string &code) override;
|
||||||
virtual std::string compareFunction(COMPARE, const std::string&, const std::string&) override;
|
virtual std::string compareFunction(COMPARE, const std::string&, const std::string&) override;
|
||||||
|
@ -102,15 +102,28 @@ namespace gl
|
|||||||
bool ARB_depth_buffer_float_supported = false;
|
bool ARB_depth_buffer_float_supported = false;
|
||||||
bool ARB_texture_barrier_supported = false;
|
bool ARB_texture_barrier_supported = false;
|
||||||
bool NV_texture_barrier_supported = false;
|
bool NV_texture_barrier_supported = false;
|
||||||
|
bool NV_gpu_shader5_supported = false;
|
||||||
|
bool AMD_gpu_shader_half_float_supported = false;
|
||||||
bool initialized = false;
|
bool initialized = false;
|
||||||
bool vendor_INTEL = false; //has broken GLSL compiler
|
bool vendor_INTEL = false; // has broken GLSL compiler
|
||||||
bool vendor_AMD = false; //has broken ARB_multidraw
|
bool vendor_AMD = false; // has broken ARB_multidraw
|
||||||
bool vendor_NVIDIA = false; //has NaN poisoning issues
|
bool vendor_NVIDIA = false; // has NaN poisoning issues
|
||||||
bool vendor_MESA = false; //requires CLIENT_STORAGE bit set for streaming buffers
|
bool vendor_MESA = false; // requires CLIENT_STORAGE bit set for streaming buffers
|
||||||
|
|
||||||
|
bool check(const std::string& ext_name, const char* test)
|
||||||
|
{
|
||||||
|
if (ext_name == test)
|
||||||
|
{
|
||||||
|
LOG_NOTICE(RSX, "Extension %s is supported", ext_name);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
void initialize()
|
void initialize()
|
||||||
{
|
{
|
||||||
int find_count = 8;
|
int find_count = 10;
|
||||||
int ext_count = 0;
|
int ext_count = 0;
|
||||||
glGetIntegerv(GL_NUM_EXTENSIONS, &ext_count);
|
glGetIntegerv(GL_NUM_EXTENSIONS, &ext_count);
|
||||||
|
|
||||||
@ -120,64 +133,78 @@ namespace gl
|
|||||||
|
|
||||||
const std::string ext_name = reinterpret_cast<const char*>(glGetStringi(GL_EXTENSIONS, i));
|
const std::string ext_name = reinterpret_cast<const char*>(glGetStringi(GL_EXTENSIONS, i));
|
||||||
|
|
||||||
if (ext_name == "GL_ARB_shader_draw_parameters")
|
if (check(ext_name, "GL_ARB_shader_draw_parameters"))
|
||||||
{
|
{
|
||||||
ARB_shader_draw_parameters_supported = true;
|
ARB_shader_draw_parameters_supported = true;
|
||||||
find_count--;
|
find_count--;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (ext_name == "GL_EXT_direct_state_access")
|
if (check(ext_name, "GL_EXT_direct_state_access"))
|
||||||
{
|
{
|
||||||
EXT_dsa_supported = true;
|
EXT_dsa_supported = true;
|
||||||
find_count--;
|
find_count--;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (ext_name == "GL_ARB_direct_state_access")
|
if (check(ext_name, "GL_ARB_direct_state_access"))
|
||||||
{
|
{
|
||||||
ARB_dsa_supported = true;
|
ARB_dsa_supported = true;
|
||||||
find_count--;
|
find_count--;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (ext_name == "GL_ARB_buffer_storage")
|
if (check(ext_name, "GL_ARB_buffer_storage"))
|
||||||
{
|
{
|
||||||
ARB_buffer_storage_supported = true;
|
ARB_buffer_storage_supported = true;
|
||||||
find_count--;
|
find_count--;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (ext_name == "GL_ARB_texture_buffer_object")
|
if (check(ext_name, "GL_ARB_texture_buffer_object"))
|
||||||
{
|
{
|
||||||
ARB_texture_buffer_supported = true;
|
ARB_texture_buffer_supported = true;
|
||||||
find_count--;
|
find_count--;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (ext_name == "GL_ARB_depth_buffer_float")
|
if (check(ext_name, "GL_ARB_depth_buffer_float"))
|
||||||
{
|
{
|
||||||
ARB_depth_buffer_float_supported = true;
|
ARB_depth_buffer_float_supported = true;
|
||||||
find_count--;
|
find_count--;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (ext_name == "GL_ARB_texture_barrier")
|
if (check(ext_name, "GL_ARB_texture_barrier"))
|
||||||
{
|
{
|
||||||
ARB_texture_barrier_supported = true;
|
ARB_texture_barrier_supported = true;
|
||||||
find_count--;
|
find_count--;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (ext_name == "GL_NV_texture_barrier")
|
if (check(ext_name, "GL_NV_texture_barrier"))
|
||||||
{
|
{
|
||||||
NV_texture_barrier_supported = true;
|
NV_texture_barrier_supported = true;
|
||||||
find_count--;
|
find_count--;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (check(ext_name, "GL_NV_gpu_shader5"))
|
||||||
|
{
|
||||||
|
NV_gpu_shader5_supported = true;
|
||||||
|
find_count--;
|
||||||
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
//Workaround for intel drivers which have terrible capability reporting
|
if (check(ext_name, "GL_AMD_gpu_shader_half_float"))
|
||||||
|
{
|
||||||
|
AMD_gpu_shader_half_float_supported = true;
|
||||||
|
find_count--;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Workaround for intel drivers which have terrible capability reporting
|
||||||
std::string vendor_string = reinterpret_cast<const char*>(glGetString(GL_VENDOR));
|
std::string vendor_string = reinterpret_cast<const char*>(glGetString(GL_VENDOR));
|
||||||
if (!vendor_string.empty())
|
if (!vendor_string.empty())
|
||||||
{
|
{
|
||||||
|
@ -12,6 +12,11 @@ std::string VKFragmentDecompilerThread::getFloatTypeName(size_t elementCount)
|
|||||||
return glsl::getFloatTypeNameImpl(elementCount);
|
return glsl::getFloatTypeNameImpl(elementCount);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
std::string VKFragmentDecompilerThread::getHalfTypeName(size_t elementCount)
|
||||||
|
{
|
||||||
|
return glsl::getHalfTypeNameImpl(elementCount);
|
||||||
|
}
|
||||||
|
|
||||||
std::string VKFragmentDecompilerThread::getFunction(FUNCTION f)
|
std::string VKFragmentDecompilerThread::getFunction(FUNCTION f)
|
||||||
{
|
{
|
||||||
return glsl::getFunctionImpl(f);
|
return glsl::getFunctionImpl(f);
|
||||||
@ -29,7 +34,16 @@ std::string VKFragmentDecompilerThread::compareFunction(COMPARE f, const std::st
|
|||||||
|
|
||||||
void VKFragmentDecompilerThread::insertHeader(std::stringstream & OS)
|
void VKFragmentDecompilerThread::insertHeader(std::stringstream & OS)
|
||||||
{
|
{
|
||||||
|
if (device_props.has_native_half_support)
|
||||||
|
{
|
||||||
|
OS << "#version 450\n";
|
||||||
|
OS << "#extension GL_KHX_shader_explicit_arithmetic_types_float16: enable\n";
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
OS << "#version 420\n";
|
OS << "#version 420\n";
|
||||||
|
}
|
||||||
|
|
||||||
OS << "#extension GL_ARB_separate_shader_objects: enable\n\n";
|
OS << "#extension GL_ARB_separate_shader_objects: enable\n\n";
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -93,9 +107,10 @@ void VKFragmentDecompilerThread::insertOutputs(std::stringstream & OS)
|
|||||||
|
|
||||||
//NOTE: We do not skip outputs, the only possible combinations are a(0), b(0), ab(0,1), abc(0,1,2), abcd(0,1,2,3)
|
//NOTE: We do not skip outputs, the only possible combinations are a(0), b(0), ab(0,1), abc(0,1,2), abcd(0,1,2,3)
|
||||||
u8 output_index = 0;
|
u8 output_index = 0;
|
||||||
|
const auto reg_type = (m_ctrl & CELL_GCM_SHADER_CONTROL_32_BITS_EXPORTS) ? "vec4" : getHalfTypeName(4);
|
||||||
for (int i = 0; i < std::size(table); ++i)
|
for (int i = 0; i < std::size(table); ++i)
|
||||||
{
|
{
|
||||||
if (m_parr.HasParam(PF_PARAM_NONE, "vec4", table[i].second))
|
if (m_parr.HasParam(PF_PARAM_NONE, reg_type, table[i].second))
|
||||||
{
|
{
|
||||||
OS << "layout(location=" << std::to_string(output_index++) << ") " << "out vec4 " << table[i].first << ";\n";
|
OS << "layout(location=" << std::to_string(output_index++) << ") " << "out vec4 " << table[i].first << ";\n";
|
||||||
vk_prog->output_color_masks[i] = UINT32_MAX;
|
vk_prog->output_color_masks[i] = UINT32_MAX;
|
||||||
@ -242,14 +257,16 @@ void VKFragmentDecompilerThread::insertMainStart(std::stringstream & OS)
|
|||||||
};
|
};
|
||||||
|
|
||||||
std::string parameters = "";
|
std::string parameters = "";
|
||||||
|
const auto half4 = getHalfTypeName(4);
|
||||||
for (auto ®_name : output_values)
|
for (auto ®_name : output_values)
|
||||||
{
|
{
|
||||||
if (m_parr.HasParam(PF_PARAM_NONE, "vec4", reg_name))
|
const auto type = (reg_name[0] == 'r' || !device_props.has_native_half_support)? "vec4" : half4;
|
||||||
|
if (m_parr.HasParam(PF_PARAM_NONE, type, reg_name))
|
||||||
{
|
{
|
||||||
if (parameters.length())
|
if (parameters.length())
|
||||||
parameters += ", ";
|
parameters += ", ";
|
||||||
|
|
||||||
parameters += "inout vec4 " + reg_name;
|
parameters += "inout " + type + " " + reg_name;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -346,21 +363,24 @@ void VKFragmentDecompilerThread::insertMainEnd(std::stringstream & OS)
|
|||||||
OS << "{\n";
|
OS << "{\n";
|
||||||
|
|
||||||
std::string parameters = "";
|
std::string parameters = "";
|
||||||
|
const auto half4 = getHalfTypeName(4);
|
||||||
|
|
||||||
for (auto ®_name : output_values)
|
for (auto ®_name : output_values)
|
||||||
{
|
{
|
||||||
if (m_parr.HasParam(PF_PARAM_NONE, "vec4", reg_name))
|
const std::string type = (reg_name[0] == 'r' || !device_props.has_native_half_support)? "vec4" : half4;
|
||||||
|
if (m_parr.HasParam(PF_PARAM_NONE, type, reg_name))
|
||||||
{
|
{
|
||||||
if (parameters.length())
|
if (parameters.length())
|
||||||
parameters += ", ";
|
parameters += ", ";
|
||||||
|
|
||||||
parameters += reg_name;
|
parameters += reg_name;
|
||||||
OS << " vec4 " << reg_name << " = vec4(0.);\n";
|
OS << " " << type << " " << reg_name << " = " << type << "(0.);\n";
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
OS << "\n" << " fs_main(" + parameters + ");\n\n";
|
OS << "\n" << " fs_main(" + parameters + ");\n\n";
|
||||||
|
|
||||||
glsl::insert_rop(OS, !!(m_ctrl & CELL_GCM_SHADER_CONTROL_32_BITS_EXPORTS));
|
glsl::insert_rop(OS, !!(m_ctrl & CELL_GCM_SHADER_CONTROL_32_BITS_EXPORTS), device_props.has_native_half_support);
|
||||||
|
|
||||||
if (m_ctrl & CELL_GCM_SHADER_CONTROL_DEPTH_EXPORT)
|
if (m_ctrl & CELL_GCM_SHADER_CONTROL_DEPTH_EXPORT)
|
||||||
{
|
{
|
||||||
@ -400,6 +420,12 @@ void VKFragmentProgram::Decompile(const RSXFragmentProgram& prog)
|
|||||||
u32 size;
|
u32 size;
|
||||||
std::string source;
|
std::string source;
|
||||||
VKFragmentDecompilerThread decompiler(source, parr, prog, size, *this);
|
VKFragmentDecompilerThread decompiler(source, parr, prog, size, *this);
|
||||||
|
|
||||||
|
if (!g_cfg.video.disable_native_float16)
|
||||||
|
{
|
||||||
|
decompiler.device_props.has_native_half_support = vk::get_current_renderer()->get_shader_types_support().allow_float16;
|
||||||
|
}
|
||||||
|
|
||||||
decompiler.Task();
|
decompiler.Task();
|
||||||
|
|
||||||
shader.create(::glsl::program_domain::glsl_fragment_program, source);
|
shader.create(::glsl::program_domain::glsl_fragment_program, source);
|
||||||
|
@ -24,6 +24,7 @@ public:
|
|||||||
const std::vector<vk::glsl::program_input>& get_inputs() { return inputs; }
|
const std::vector<vk::glsl::program_input>& get_inputs() { return inputs; }
|
||||||
protected:
|
protected:
|
||||||
virtual std::string getFloatTypeName(size_t elementCount) override;
|
virtual std::string getFloatTypeName(size_t elementCount) override;
|
||||||
|
virtual std::string getHalfTypeName(size_t elementCount) override;
|
||||||
virtual std::string getFunction(FUNCTION) override;
|
virtual std::string getFunction(FUNCTION) override;
|
||||||
virtual std::string saturate(const std::string &code) override;
|
virtual std::string saturate(const std::string &code) override;
|
||||||
virtual std::string compareFunction(COMPARE, const std::string&, const std::string&) override;
|
virtual std::string compareFunction(COMPARE, const std::string&, const std::string&) override;
|
||||||
|
@ -201,6 +201,12 @@ namespace vk
|
|||||||
bool bgra8_linear;
|
bool bgra8_linear;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Optional shader data types that may be used natively on the current device.
// Both flags default to false so that an instance is never left with
// indeterminate values, even when it is default-initialized rather than
// value-initialized.
struct gpu_shader_types_support
{
	bool allow_float16 = false; // native 16-bit float types may be used in shaders
	bool allow_int8 = false;    // set from the driver-reported shaderInt8 feature
};
|
||||||
|
|
||||||
// Memory Allocator - base class
|
// Memory Allocator - base class
|
||||||
|
|
||||||
class mem_allocator_base
|
class mem_allocator_base
|
||||||
@ -515,9 +521,32 @@ namespace vk
|
|||||||
physical_device *pgpu = nullptr;
|
physical_device *pgpu = nullptr;
|
||||||
memory_type_mapping memory_map{};
|
memory_type_mapping memory_map{};
|
||||||
gpu_formats_support m_formats_support{};
|
gpu_formats_support m_formats_support{};
|
||||||
|
gpu_shader_types_support m_shader_types_support{};
|
||||||
std::unique_ptr<mem_allocator_base> m_allocator;
|
std::unique_ptr<mem_allocator_base> m_allocator;
|
||||||
VkDevice dev = VK_NULL_HANDLE;
|
VkDevice dev = VK_NULL_HANDLE;
|
||||||
|
|
||||||
|
// Queries the physical device's core feature set into `features` and records
// in m_shader_types_support which optional shader data types (float16 / int8)
// may be used.
void get_physical_device_features(VkPhysicalDeviceFeatures& features)
{
	// NOTE(review): this tests the address of vkGetPhysicalDeviceFeatures2 itself;
	// whether it can ever be null depends on how the Vulkan entry points are
	// linked/loaded in this build - confirm.
	if (!vkGetPhysicalDeviceFeatures2)
	{
		// Legacy query: no extended feature chain is available, so no optional
		// shader data types are reported.
		m_shader_types_support = {};
		vkGetPhysicalDeviceFeatures(*pgpu, &features);
		return;
	}

	// Extension struct chained onto the query to receive float16/int8 support.
	VkPhysicalDeviceFloat16Int8FeaturesKHR shader_support_info{};
	shader_support_info.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT16_INT8_FEATURES_KHR;

	// Value-initialized so no indeterminate bytes are handed to the driver.
	VkPhysicalDeviceFeatures2 features2{};
	features2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2;
	features2.pNext = &shader_support_info;
	vkGetPhysicalDeviceFeatures2(*pgpu, &features2);

	// Native float16 is deliberately forced off for now; the real query result
	// is kept in the trailing comment for when it gets re-enabled.
	m_shader_types_support.allow_float16 = false;//!!shader_support_info.shaderFloat16;
	m_shader_types_support.allow_int8 = !!shader_support_info.shaderInt8;

	features = features2.features;
}
|
||||||
|
|
||||||
public:
|
public:
|
||||||
render_device()
|
render_device()
|
||||||
{}
|
{}
|
||||||
@ -549,7 +578,7 @@ namespace vk
|
|||||||
//2. DXT support
|
//2. DXT support
|
||||||
//3. Indexable storage buffers
|
//3. Indexable storage buffers
|
||||||
VkPhysicalDeviceFeatures available_features;
|
VkPhysicalDeviceFeatures available_features;
|
||||||
vkGetPhysicalDeviceFeatures(*pgpu, &available_features);
|
get_physical_device_features(available_features);
|
||||||
|
|
||||||
available_features.samplerAnisotropy = VK_TRUE;
|
available_features.samplerAnisotropy = VK_TRUE;
|
||||||
available_features.textureCompressionBC = VK_TRUE;
|
available_features.textureCompressionBC = VK_TRUE;
|
||||||
@ -566,6 +595,21 @@ namespace vk
|
|||||||
device.ppEnabledExtensionNames = requested_extensions;
|
device.ppEnabledExtensionNames = requested_extensions;
|
||||||
device.pEnabledFeatures = &available_features;
|
device.pEnabledFeatures = &available_features;
|
||||||
|
|
||||||
|
VkPhysicalDeviceFloat16Int8FeaturesKHR shader_support_info{};
|
||||||
|
if (m_shader_types_support.allow_float16)
|
||||||
|
{
|
||||||
|
// Allow use of f16 type in shaders if possible
|
||||||
|
shader_support_info.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT16_INT8_FEATURES_KHR;
|
||||||
|
shader_support_info.shaderFloat16 = VK_TRUE;
|
||||||
|
device.pNext = &shader_support_info;
|
||||||
|
|
||||||
|
LOG_NOTICE(RSX, "GPU/driver supports float16 data types natively. Using native float16_t variables if possible.");
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
LOG_NOTICE(RSX, "GPU/driver lacks support for float16 data types. All float16_t arithmetic will be emulated with float32_t.");
|
||||||
|
}
|
||||||
|
|
||||||
CHECK_RESULT(vkCreateDevice(*pgpu, &device, nullptr, &dev));
|
CHECK_RESULT(vkCreateDevice(*pgpu, &device, nullptr, &dev));
|
||||||
|
|
||||||
memory_map = vk::get_memory_mapping(pdev);
|
memory_map = vk::get_memory_mapping(pdev);
|
||||||
@ -634,6 +678,11 @@ namespace vk
|
|||||||
return m_formats_support;
|
return m_formats_support;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Read-only view of the shader data-type flags (allow_float16 / allow_int8)
// stored in m_shader_types_support.
const gpu_shader_types_support& get_shader_types_support() const
{
	const gpu_shader_types_support& shader_types = m_shader_types_support;
	return shader_types;
}
|
||||||
|
|
||||||
mem_allocator_base* get_allocator() const
|
mem_allocator_base* get_allocator() const
|
||||||
{
|
{
|
||||||
return m_allocator.get();
|
return m_allocator.get();
|
||||||
|
@ -14,6 +14,21 @@
|
|||||||
#include "define_new_memleakdetect.h"
|
#include "define_new_memleakdetect.h"
|
||||||
#include "Utilities/types.h"
|
#include "Utilities/types.h"
|
||||||
|
|
||||||
|
// TODO: Remove when packages catch up, ubuntu is stuck at 1.1.73 (bionic) and 1.1.82 (cosmic)
|
||||||
|
// Do we still use libvulkan-dev package on travis??????
|
||||||
|
#if VK_HEADER_VERSION < 95
|
||||||
|
|
||||||
|
typedef struct VkPhysicalDeviceFloat16Int8FeaturesKHR {
|
||||||
|
VkStructureType sType;
|
||||||
|
void* pNext;
|
||||||
|
VkBool32 shaderFloat16;
|
||||||
|
VkBool32 shaderInt8;
|
||||||
|
} VkPhysicalDeviceFloat16Int8FeaturesKHR;
|
||||||
|
|
||||||
|
#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT16_INT8_FEATURES_KHR VkStructureType(1000082000)
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
namespace vk
|
namespace vk
|
||||||
{
|
{
|
||||||
void init();
|
void init();
|
||||||
|
@ -448,6 +448,7 @@ struct cfg_root : cfg::node
|
|||||||
cfg::_bool full_rgb_range_output{this, "Use full RGB output range", true}; // Video out dynamic range
|
cfg::_bool full_rgb_range_output{this, "Use full RGB output range", true}; // Video out dynamic range
|
||||||
cfg::_bool disable_asynchronous_shader_compiler{this, "Disable Asynchronous Shader Compiler", false};
|
cfg::_bool disable_asynchronous_shader_compiler{this, "Disable Asynchronous Shader Compiler", false};
|
||||||
cfg::_bool strict_texture_flushing{this, "Strict Texture Flushing", false};
|
cfg::_bool strict_texture_flushing{this, "Strict Texture Flushing", false};
|
||||||
|
cfg::_bool disable_native_float16{this, "Disable native float16 support", false};
|
||||||
cfg::_int<1, 8> consequtive_frames_to_draw{this, "Consecutive Frames To Draw", 1};
|
cfg::_int<1, 8> consequtive_frames_to_draw{this, "Consecutive Frames To Draw", 1};
|
||||||
cfg::_int<1, 8> consequtive_frames_to_skip{this, "Consecutive Frames To Skip", 1};
|
cfg::_int<1, 8> consequtive_frames_to_skip{this, "Consecutive Frames To Skip", 1};
|
||||||
cfg::_int<50, 800> resolution_scale_percent{this, "Resolution Scale", 100};
|
cfg::_int<50, 800> resolution_scale_percent{this, "Resolution Scale", 100};
|
||||||
|
Loading…
x
Reference in New Issue
Block a user