mirror of
https://github.com/RPCS3/rpcs3.git
synced 2025-01-01 12:08:12 +00:00
Deterministic FREST and FRSQEST
This commit is contained in:
parent
db2341c842
commit
5c0113ce59
@ -8860,12 +8860,16 @@ public:
|
||||
return;
|
||||
}
|
||||
|
||||
// To avoid divergence in online play don't use divergent intel/amd intrinsics when online
|
||||
if (g_cfg.net.net_active == np_internet_status::enabled)
|
||||
if (g_cfg.core.spu_approx_xfloat)
|
||||
{
|
||||
register_intrinsic("spu_frest", [&](llvm::CallInst* ci)
|
||||
{
|
||||
return fsplat<f32[4]>(1.0) / value<f32[4]>(ci->getOperand(0));
|
||||
const auto a = value<f32[4]>(ci->getOperand(0));
|
||||
const auto acc_result = fsplat<f32[4]>(1.0) / a;
|
||||
// Determines accuracy penalty, frest result is always slightly closer to 0 than actual value and provides ~12 bits accuracy
|
||||
const auto acc_penalty = fsplat<f32[4]>(0x1p-13f) * acc_result;
|
||||
// Zeroes the last 11 bits of the mantissa so FI calculations end up correct if needed
|
||||
return bitcast<f32[4]>(bitcast<u32[4]>(acc_result - acc_penalty) & splat<u32[4]>(0xFFFFF800));
|
||||
});
|
||||
}
|
||||
else
|
||||
@ -8873,6 +8877,7 @@ public:
|
||||
register_intrinsic("spu_frest", [&](llvm::CallInst* ci)
|
||||
{
|
||||
const auto a = value<f32[4]>(ci->getOperand(0));
|
||||
// Fast but this makes the result vary per cpu
|
||||
return fre(a);
|
||||
});
|
||||
}
|
||||
@ -8895,12 +8900,16 @@ public:
|
||||
return;
|
||||
}
|
||||
|
||||
// To avoid divergence in online play don't use divergent intel/amd intrinsics when online
|
||||
if (g_cfg.net.net_active == np_internet_status::enabled)
|
||||
if (g_cfg.core.spu_approx_xfloat)
|
||||
{
|
||||
register_intrinsic("spu_frsqest", [&](llvm::CallInst* ci)
|
||||
{
|
||||
return fsplat<f32[4]>(1.0) / fsqrt(fabs(value<f32[4]>(ci->getOperand(0))));
|
||||
const auto a = value<f32[4]>(ci->getOperand(0));
|
||||
const auto acc_result = fsplat<f32[4]>(1.0) / fsqrt(fabs(a));
|
||||
// Determines accuracy penalty, frsqest result is always slightly closer to 0 than actual value and provides ~12 bits accuracy
|
||||
const auto acc_penalty = fsplat<f32[4]>(0x1p-13f) * acc_result;
|
||||
// Zeroes the last 11 bits of the mantissa so FI calculations end up correct if needed
|
||||
return bitcast<f32[4]>(bitcast<u32[4]>(acc_result - acc_penalty) & splat<u32[4]>(0xFFFFF800));
|
||||
});
|
||||
}
|
||||
else
|
||||
@ -8908,6 +8917,7 @@ public:
|
||||
register_intrinsic("spu_frsqest", [&](llvm::CallInst* ci)
|
||||
{
|
||||
const auto a = value<f32[4]>(ci->getOperand(0));
|
||||
// Fast but this makes the result vary per cpu
|
||||
return frsqe(fabs(a));
|
||||
});
|
||||
}
|
||||
@ -9633,23 +9643,29 @@ public:
|
||||
return bitcast<f32[4]>((b & 0xff800000u) | (bitcast<u32[4]>(fpcast<f32[4]>(bnew)) & ~0xff800000u)); // Inject old sign and exponent
|
||||
});
|
||||
|
||||
// To avoid divergence in online play don't use divergent intel/amd intrinsics when online
|
||||
if (g_cfg.net.net_active == np_internet_status::enabled)
|
||||
if (g_cfg.core.spu_approx_xfloat)
|
||||
{
|
||||
register_intrinsic("spu_re", [&](llvm::CallInst* ci)
|
||||
{
|
||||
const auto a = value<f32[4]>(ci->getOperand(0));
|
||||
return fsplat<f32[4]>(1.0) / a;
|
||||
const auto acc_result = fsplat<f32[4]>(1.0) / a;
|
||||
// Determines accuracy penalty, frest result is always slightly closer to 0 than actual value and provides ~12 bits accuracy
|
||||
const auto acc_penalty = fsplat<f32[4]>(0x1p-13f) * acc_result;
|
||||
return acc_result - acc_penalty;
|
||||
});
|
||||
|
||||
register_intrinsic("spu_rsqrte", [&](llvm::CallInst* ci)
|
||||
{
|
||||
const auto a = value<f32[4]>(ci->getOperand(0));
|
||||
return fsplat<f32[4]>(1.0) / fsqrt(fabs(a));
|
||||
const auto acc_result = fsplat<f32[4]>(1.0) / fsqrt(fabs(a));
|
||||
// Determines accuracy penalty, frsqest result is always slightly closer to 0 than actual value and provides ~12 bits accuracy
|
||||
const auto acc_penalty = fsplat<f32[4]>(0x1p-13f) * acc_result;
|
||||
return acc_result - acc_penalty;
|
||||
});
|
||||
}
|
||||
else
|
||||
{
|
||||
// For relaxed use intrinsics, those make the results vary per cpu
|
||||
register_intrinsic("spu_re", [&](llvm::CallInst* ci)
|
||||
{
|
||||
const auto a = value<f32[4]>(ci->getOperand(0));
|
||||
|
@ -68,7 +68,7 @@ struct cfg_root : cfg::node
|
||||
cfg::_enum<tsx_usage> enable_TSX{ this, "Enable TSX", enable_tsx_by_default() ? tsx_usage::enabled : tsx_usage::disabled }; // Enable TSX. Forcing this on Haswell/Broadwell CPUs should be used carefully
|
||||
cfg::_bool spu_accurate_xfloat{ this, "Accurate xfloat", false };
|
||||
cfg::_bool spu_approx_xfloat{ this, "Approximate xfloat", true };
|
||||
cfg::_bool spu_relaxed_xfloat{ this, "Relaxed xfloat", true }; // Approximate accuracy for only the "FCGT" and "FNMS" instructions
|
||||
cfg::_bool spu_relaxed_xfloat{ this, "Relaxed xfloat", true }; // Approximate accuracy for only the "FCGT", "FNMS", "FREST" AND "FRSQEST" instructions
|
||||
cfg::_int<-1, 14> ppu_128_reservations_loop_max_length{ this, "Accurate PPU 128-byte Reservation Op Max Length", 0, true }; // -1: Always accurate, 0: Never accurate, 1-14: max accurate loop length
|
||||
cfg::_int<-64, 64> stub_ppu_traps{ this, "Stub PPU Traps", 0, true }; // Hack, skip PPU traps for rare cases where the trap is continueable (specify relative instructions to skip)
|
||||
cfg::_bool full_width_avx512{ this, "Full Width AVX-512", false };
|
||||
|
Loading…
Reference in New Issue
Block a user