PPU: implement quasi-accurate FRSQRTE

Denormals are handled like zeros. NaN handling is inaccurate in some cases.

Co-authored-by: Nekotekina <nekotekina@gmail.com>
2024-11-17 08:11:51 +00:00 · 2019-12-25 15:12:48 +01:00 · 2019-12-25 15:12:48 +01:00 · 7b162c7513
commit 7b162c7513
parent 5b8f105308
4 changed files with 67 additions and 5 deletions
--- a/rpcs3/Emu/Cell/Common.h
+++ b/rpcs3/Emu/Cell/Common.h
@@ -16,3 +16,54 @@ inline int fexpf(float x)
 {
 	return (std::bit_cast<u32>(x) >> 23) & 0xff;
 }
+
+constexpr u32 ppu_frsqrte_mantissas[16] = // FRSQRTE estimate mantissas, positioned as the mantissa bits of a double's high 32-bit word (every value is below 0x0010'0000)
+{ // Consumed by the ppu_frsqrte_lut_t constructor with idx = 8 ^ (lut_index & 0xf); values are strictly decreasing
+	0x000f1000u, 0x000d8000u, 0x000c0000u, 0x000a8000u, // idx 0..3: exponent LSB set, top mantissa bits 0..3
+	0x00098000u, 0x00088000u, 0x00080000u, 0x00070000u, // idx 4..7: exponent LSB set, top mantissa bits 4..7
+	0x00060000u, 0x0004c000u, 0x0003c000u, 0x00030000u, // idx 8..11: exponent LSB clear, top mantissa bits 0..3
+	0x00020000u, 0x00018000u, 0x00010000u, 0x00008000u, // idx 12..15: exponent LSB clear, top mantissa bits 4..7
+};
+
+// Lookup table for the FRSQRTE instruction, built at compile time; indexed by the top 15 bits of the f64 operand (bit pattern >> 49), i.e. sign, 11 exponent bits and the top 3 mantissa bits
+inline struct ppu_frsqrte_lut_t
+{
+	// Each entry is only the high 32 bits of the result double; users shift it left by 32, so the low 32 bits of every result are zero
+	u32 data[0x8000]{};

+	constexpr ppu_frsqrte_lut_t() noexcept
+	{
+		for (u64 i = 0; i < 0x8000; i++)
+		{
+			// Decomposed LUT index: bit 14 = sign, bits 13..3 = biased exponent, bits 2..0 = top 3 mantissa bits
+			const u64 sign = i >> 14; // 0 or 1
+			const u64 expv = (i >> 3) & 0x7ff; // biased exponent field

+			// (0x3FF - (((EXP_BITS(b) - 0x3FF) >> 1) + 1)) << 52, evaluated in the high-word domain (shift by 20); 0x1c01 is -0x3ff modulo 2^13, and only the low 32 bits of the (wrapping) result are meaningful
+			const u64 exp = 0x3fe0'0000 - (((expv + 0x1c01) >> 1) << (52 - 32));

+			if (expv == 0) // ±INF on zero/denormal, not accurate (denormals are treated like zero)
+			{
+				data[i] = 0x7ff0'0000 | (sign << 31); // infinity carrying the sign of the input
+			}
+			else if (expv == 0x7ff) // exponent field all ones: infinity or NaN
+			{
+				if (i == (0x7ff << 3)) // sign clear and top 3 mantissa bits clear; this also matches NaNs whose top 3 mantissa bits are zero
+					data[i] = 0; // Zero on +INF, inaccurate
+				else
+					data[i] = 0x7ff8'0000; // QNaN (covers -INF and every other index with an all-ones exponent)
+			}
+			else if (sign) // negative input with a normal exponent
+			{
+				data[i] = 0x7ff8'0000; // QNaN
+			}
+			else
+			{
+				// ((MAN_BITS(b) >> 49) & 7ull) + (!(EXP_BITS(b) & 1) << 3): the XOR with 8 inverts the exponent LSB held in bit 3 of the index
+				const u64 idx = 8 ^ (i & 0xf);

+				data[i] = ppu_frsqrte_mantissas[idx] | exp; // narrowed to u32 on store, dropping the wrapped upper bits of exp
+			}
+		}
+	}
+} ppu_frqrte_lut; // NOTE(review): instance name is spelled "frqrte" (no 's'), unlike the type name; the interpreter and translator reference this exact spelling
--- a/rpcs3/Emu/Cell/PPUInterpreter.cpp
+++ b/rpcs3/Emu/Cell/PPUInterpreter.cpp
@@ -6534,10 +6534,13 @@ auto FRSQRTE()
 	if constexpr (Build == 0xf1a6)
 		return ppu_exec_select<Flags...>::template select<set_fpcc>();

-	static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) {
-	ppu.fpr[op.frd] = 1.0 / std::sqrt(ppu.fpr[op.frb]);
+	static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op)
+	{
+		const u64 b = std::bit_cast<u64>(ppu.fpr[op.frb]);
+		ppu.fpr[op.frd] = std::bit_cast<f64>(u64{ppu_frqrte_lut.data[b >> 49]} << 32);
 		ppu_set_fpcc<Flags...>(ppu, ppu.fpr[op.frd], 0.);
 	};
+
 	RETURN_(ppu, op);
 }

--- a/rpcs3/Emu/Cell/PPUTranslator.cpp
+++ b/rpcs3/Emu/Cell/PPUTranslator.cpp
@@ -1,6 +1,7 @@
 #ifdef LLVM_AVAILABLE

 #include "Emu/system_config.h"
+#include "Emu/Cell/Common.h"
 #include "PPUTranslator.h"
 #include "PPUThread.h"

@@ -4344,8 +4345,14 @@ void PPUTranslator::FMUL(ppu_opcode_t op)

 void PPUTranslator::FRSQRTE(ppu_opcode_t op)
 {
-	const auto b = GetFpr(op.frb, 32);
-	const auto result = m_ir->CreateFDiv(ConstantFP::get(GetType<f32>(), 1.0), Call(GetType<f32>(), "llvm.sqrt.f32", b));
+	if (!m_frsqrte_table)
+	{
+		m_frsqrte_table = new GlobalVariable(*m_module, ArrayType::get(GetType<u32>(), 0x8000), true, GlobalValue::PrivateLinkage, ConstantDataArray::get(m_context, ppu_frqrte_lut.data));
+	}
+
+	const auto b = m_ir->CreateBitCast(GetFpr(op.frb), GetType<u64>());
+	const auto v = m_ir->CreateLoad(m_ir->CreateGEP(m_frsqrte_table, {m_ir->getInt64(0), m_ir->CreateLShr(b, 49)}));
+	const auto result = m_ir->CreateBitCast(m_ir->CreateShl(ZExt(v), 32), GetType<f64>());
 	SetFpr(op.frd, result);

 	//m_ir->CreateStore(GetUndef<bool>(), m_fpscr_fr);
--- a/rpcs3/Emu/Cell/PPUTranslator.h
+++ b/rpcs3/Emu/Cell/PPUTranslator.h
@@ -55,6 +55,7 @@ class PPUTranslator final : public cpu_translator
 	llvm::StructType* m_thread_type;

 	llvm::Value* m_mtocr_table{};
+	llvm::Value* m_frsqrte_table{};

 	llvm::Value* m_globals[175];
 	llvm::Value** const m_g_cr = m_globals + 99;