From d7be0a96f3423a5af1775205185e7838cd7b9e90 Mon Sep 17 00:00:00 2001 From: Nekotekina Date: Mon, 17 Dec 2018 02:05:26 +0300 Subject: [PATCH] SPU LLVM: approximate xfloat option Adapt previous SPU ASMJIT changes made by @kd-11 FM, FMA, FNMS, FMS are approximated. FCGT, FCMGT are accurate. --- rpcs3/Emu/Cell/SPURecompiler.cpp | 80 +++++++++++++++++++++++++++++++- rpcs3/Emu/System.h | 1 + 2 files changed, 79 insertions(+), 2 deletions(-) diff --git a/rpcs3/Emu/Cell/SPURecompiler.cpp b/rpcs3/Emu/Cell/SPURecompiler.cpp index 122fd275ec..cf818a85e3 100644 --- a/rpcs3/Emu/Cell/SPURecompiler.cpp +++ b/rpcs3/Emu/Cell/SPURecompiler.cpp @@ -4946,17 +4946,58 @@ public: void FCGT(spu_opcode_t op) // { if (g_cfg.core.spu_accurate_xfloat) + { set_vr(op.rt, sext(fcmp(get_vr(op.ra), get_vr(op.rb)))); + return; + } + + const auto a = get_vr(op.ra); + const auto b = get_vr(op.rb); + + // Same integer comparison technique as FCMGT, extended with sign handling (see FCMGT). + if (g_cfg.core.spu_approx_xfloat) + { + const auto ia = bitcast(fabs(a)); + const auto ib = bitcast(fabs(b)); + const auto nz = eval((ia > 0x7fffff) | (ib > 0x7fffff)); + + // Use sign bits (operand bits shifted right by 31) to invert abs values before comparison. + const auto ca = eval(ia ^ (bitcast(a) >> 31)); + const auto cb = eval(ib ^ (bitcast(b) >> 31)); + set_vr(op.rt, sext((ca > cb) & nz)); + } else - set_vr(op.rt, sext(fcmp(get_vr(op.ra), get_vr(op.rb)))); + { + set_vr(op.rt, sext(fcmp(a, b))); + } } void FCMGT(spu_opcode_t op) // { if (g_cfg.core.spu_accurate_xfloat) + { set_vr(op.rt, sext(fcmp(fabs(get_vr(op.ra)), fabs(get_vr(op.rb))))); + return; + } + + const auto a = get_vr(op.ra); + const auto b = get_vr(op.rb); + const auto abs_a = fabs(a); + const auto abs_b = fabs(b); + + // Note: this path is accurate, so it can also be used as an alternative path for accurate xfloat. + if (g_cfg.core.spu_approx_xfloat) + { + // Compare abs values as integers, but return false if both are denormals or zeros (i.e. neither exceeds 0x7fffff). 
+ const auto ia = bitcast(abs_a); + const auto ib = bitcast(abs_b); + const auto nz = eval((ia > 0x7fffff) | (ib > 0x7fffff)); + set_vr(op.rt, sext((ia > ib) & nz)); + } else - set_vr(op.rt, sext(fcmp(fabs(get_vr(op.ra)), fabs(get_vr(op.rb))))); + { + set_vr(op.rt, sext(fcmp(abs_a, abs_b))); + } } void FA(spu_opcode_t op) // @@ -4979,6 +5020,26 @@ public: { if (g_cfg.core.spu_accurate_xfloat) set_vr(op.rt, get_vr(op.ra) * get_vr(op.rb)); + else if (g_cfg.core.spu_approx_xfloat) + { + const auto a = get_vr(op.ra); + const auto b = get_vr(op.rb); + const auto m = eval(a * b); + const auto abs_a = bitcast(fabs(a)); + const auto abs_b = bitcast(fabs(b)); + const auto abs_m = bitcast(fabs(m)); + const auto sign_a = eval(bitcast(a) & 0x80000000); + const auto sign_b = eval(bitcast(b) & 0x80000000); + const auto smod_m = eval(bitcast(m) & 0x7fffffff); + const auto fmax_m = eval((sign_a ^ sign_b) | 0x7fffffff); + const auto nzero = eval((abs_a > 0x7fffff) & (abs_b > 0x7fffff) & (abs_m > 0x7fffff)); + + // If m produces Inf or NaN (magnitude bits above 0x7f7fffff), flush it to max xfloat with the combined sign of a and b (sign_a ^ sign_b) + const auto clamp = select(smod_m > 0x7f7fffff, bitcast(fmax_m), m); + + // If a, b, or a * b is a denorm or zero (abs bits not above 0x7fffff), return zero + set_vr(op.rt, select(nzero, clamp, fsplat(0.))); + } else set_vr(op.rt, get_vr(op.ra) * get_vr(op.rb)); } @@ -5040,11 +5101,22 @@ public: set_vr(op.rt, sext(fcmp(fabs(get_vr(op.ra)), fabs(get_vr(op.rb))))); } + // Returns a * b, or zero if any of the arguments is in the xfloat range (abs bits above 0x7f7fffff). + value_t mzero_if_xtended(value_t a, value_t b) + { + // Compare the larger absolute value (as integer bits) with max positive float in normal range (0x7f7fffff). + const auto aa = bitcast(fabs(a)); + const auto ab = bitcast(fabs(b)); + return select(eval(max(aa, ab) > 0x7f7fffff), fsplat(0.), eval(a * b)); + } + void FNMS(spu_opcode_t op) // { // See FMA. 
if (g_cfg.core.spu_accurate_xfloat) set_vr(op.rt4, -fmuladd(get_vr(op.ra), get_vr(op.rb), eval(-get_vr(op.rc)))); + else if (g_cfg.core.spu_approx_xfloat) + set_vr(op.rt4, get_vr(op.rc) - mzero_if_xtended(get_vr(op.ra), get_vr(op.rb))); else set_vr(op.rt4, get_vr(op.rc) - get_vr(op.ra) * get_vr(op.rb)); } @@ -5054,6 +5126,8 @@ public: // Hardware FMA produces the same result as multiple + add on the limited double range (xfloat). if (g_cfg.core.spu_accurate_xfloat) set_vr(op.rt4, fmuladd(get_vr(op.ra), get_vr(op.rb), get_vr(op.rc))); + else if (g_cfg.core.spu_approx_xfloat) + set_vr(op.rt4, mzero_if_xtended(get_vr(op.ra), get_vr(op.rb)) + get_vr(op.rc)); else set_vr(op.rt4, get_vr(op.ra) * get_vr(op.rb) + get_vr(op.rc)); } @@ -5063,6 +5137,8 @@ public: // See FMA. if (g_cfg.core.spu_accurate_xfloat) set_vr(op.rt4, fmuladd(get_vr(op.ra), get_vr(op.rb), eval(-get_vr(op.rc)))); + else if (g_cfg.core.spu_approx_xfloat) + set_vr(op.rt4, mzero_if_xtended(get_vr(op.ra), get_vr(op.rb)) - get_vr(op.rc)); else set_vr(op.rt4, get_vr(op.ra) * get_vr(op.rb) - get_vr(op.rc)); } diff --git a/rpcs3/Emu/System.h b/rpcs3/Emu/System.h index 7c5b8f04fb..7d507fa43f 100644 --- a/rpcs3/Emu/System.h +++ b/rpcs3/Emu/System.h @@ -378,6 +378,7 @@ struct cfg_root : cfg::node cfg::_bool spu_cache{this, "SPU Cache", true}; cfg::_enum enable_TSX{this, "Enable TSX", tsx_usage::enabled}; // Enable TSX. Forcing this on Haswell/Broadwell CPUs should be used carefully cfg::_bool spu_accurate_xfloat{this, "Accurate xfloat", false}; + cfg::_bool spu_approx_xfloat{this, "Approximate xfloat", true}; cfg::_enum lib_loading{this, "Lib Loader", lib_loading_type::liblv2only}; cfg::_bool hook_functions{this, "Hook static functions"};