From 7b162c75133663ce84272f9e339fbc8f3fb3b6af Mon Sep 17 00:00:00 2001
From: doesthisusername
Date: Wed, 25 Dec 2019 15:12:48 +0100
Subject: [PATCH] PPU: implement quasi-accurate FRSQRTE

Replace the 1/sqrt(x) computation in both the interpreter and the LLVM
translator with a 32768-entry lookup table. The table is indexed by the
top 15 bits of the f64 operand (sign, 11-bit exponent, top 3 mantissa
bits) and stores the high 32 bits of the result; the low 32 bits of the
result are always zero.

Denormals are handled like zeros.
NaN handling is inaccurate in some cases.

Co-authored-by: Nekotekina
---
 rpcs3/Emu/Cell/Common.h           | 51 +++++++++++++++++++++++++++++++
 rpcs3/Emu/Cell/PPUInterpreter.cpp |  9 ++++--
 rpcs3/Emu/Cell/PPUTranslator.cpp  | 11 +++++--
 rpcs3/Emu/Cell/PPUTranslator.h    |  1 +
 4 files changed, 67 insertions(+), 5 deletions(-)

diff --git a/rpcs3/Emu/Cell/Common.h b/rpcs3/Emu/Cell/Common.h
index 37cbba3844..be87f5abf2 100644
--- a/rpcs3/Emu/Cell/Common.h
+++ b/rpcs3/Emu/Cell/Common.h
@@ -16,3 +16,54 @@ inline int fexpf(float x)
 {
 	return (std::bit_cast<u32>(x) >> 23) & 0xff;
 }
+
+constexpr u32 ppu_frsqrte_mantissas[16] =
+{
+	0x000f1000u, 0x000d8000u, 0x000c0000u, 0x000a8000u,
+	0x00098000u, 0x00088000u, 0x00080000u, 0x00070000u,
+	0x00060000u, 0x0004c000u, 0x0003c000u, 0x00030000u,
+	0x00020000u, 0x00018000u, 0x00010000u, 0x00008000u,
+};
+
+// FRSQRTE estimate table, indexed by the top 15 bits of the f64 operand (sign : 11-bit exponent : top 3 mantissa bits)
+inline struct ppu_frsqrte_lut_t
+{
+	// High 32 bits of each result; consumers shift left by 32, so the low half is always zero
+	u32 data[0x8000]{};
+
+	constexpr ppu_frsqrte_lut_t() noexcept
+	{
+		for (u64 i = 0; i < 0x8000; i++)
+		{
+			// Decomposed LUT index
+			const u64 sign = i >> 14;
+			const u64 expv = (i >> 3) & 0x7ff;
+
+			// (0x3FF - (((EXP_BITS(b) - 0x3FF) >> 1) + 1)) << 52
+			const u64 exp = 0x3fe0'0000 - (((expv + 0x1c01) >> 1) << (52 - 32));
+
+			if (expv == 0) // ±INF on zero/denormal, not accurate
+			{
+				data[i] = 0x7ff0'0000 | (sign << 31);
+			}
+			else if (expv == 0x7ff)
+			{
+				if (i == (0x7ff << 3))
+					data[i] = 0; // Zero on +INF (index also covers NaNs whose top 3 mantissa bits are clear), inaccurate
+				else
+					data[i] = 0x7ff8'0000; // QNaN
+			}
+			else if (sign)
+			{
+				data[i] = 0x7ff8'0000; // QNaN
+			}
+			else
+			{
+				// ((MAN_BITS(b) >> 49) & 7ull) + (!(EXP_BITS(b) & 1) << 3)
+				const u64 idx = 8 ^ (i & 0xf);
+
+				data[i] = ppu_frsqrte_mantissas[idx] | exp;
+			}
+		}
+	}
+} ppu_frqrte_lut;
diff --git a/rpcs3/Emu/Cell/PPUInterpreter.cpp b/rpcs3/Emu/Cell/PPUInterpreter.cpp
index 2f78420666..e2f0af0be1 100644
--- a/rpcs3/Emu/Cell/PPUInterpreter.cpp
+++ b/rpcs3/Emu/Cell/PPUInterpreter.cpp
@@ -6534,10 +6534,13 @@ auto FRSQRTE()
 	if constexpr (Build == 0xf1a6)
 		return ppu_exec_select<Flags...>::template select<>();
 
-	static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) {
-		ppu.fpr[op.frd] = 1.0 / std::sqrt(ppu.fpr[op.frb]);
-		ppu_set_fpcc<Flags...>(ppu, ppu.fpr[op.frd], 0.);
+	static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op)
+	{
+		const u64 b = std::bit_cast<u64>(ppu.fpr[op.frb]);
+		ppu.fpr[op.frd] = std::bit_cast<f64>(u64{ppu_frqrte_lut.data[b >> 49]} << 32);
+		ppu_set_fpcc<Flags...>(ppu, ppu.fpr[op.frd], 0.);
 	};
+
 	RETURN_(ppu, op);
 }
 
diff --git a/rpcs3/Emu/Cell/PPUTranslator.cpp b/rpcs3/Emu/Cell/PPUTranslator.cpp
index 4eb4814bce..f0871b5892 100644
--- a/rpcs3/Emu/Cell/PPUTranslator.cpp
+++ b/rpcs3/Emu/Cell/PPUTranslator.cpp
@@ -1,6 +1,7 @@
 #ifdef LLVM_AVAILABLE
 
 #include "Emu/system_config.h"
+#include "Emu/Cell/Common.h"
 #include "PPUTranslator.h"
 #include "PPUThread.h"
 
@@ -4344,8 +4345,14 @@ void PPUTranslator::FMUL(ppu_opcode_t op)
 
 void PPUTranslator::FRSQRTE(ppu_opcode_t op)
 {
-	const auto b = GetFpr(op.frb, 32);
-	const auto result = m_ir->CreateFDiv(ConstantFP::get(GetType<f32>(), 1.0), Call(GetType<f32>(), "llvm.sqrt.f32", b));
+	if (!m_frsqrte_table)
+	{
+		m_frsqrte_table = new GlobalVariable(*m_module, ArrayType::get(GetType<u32>(), 0x8000), true, GlobalValue::PrivateLinkage, ConstantDataArray::get(m_context, ppu_frqrte_lut.data));
+	}
+
+	const auto b = m_ir->CreateBitCast(GetFpr(op.frb), GetType<u64>());
+	const auto v = m_ir->CreateLoad(m_ir->CreateGEP(m_frsqrte_table, {m_ir->getInt64(0), m_ir->CreateLShr(b, 49)}));
+	const auto result = m_ir->CreateBitCast(m_ir->CreateShl(ZExt(v), 32), GetType<f64>());
 	SetFpr(op.frd, result);
 
 	//m_ir->CreateStore(GetUndef<bool>(), m_fpscr_fr);
diff --git a/rpcs3/Emu/Cell/PPUTranslator.h b/rpcs3/Emu/Cell/PPUTranslator.h
index 23e2fecb84..bcaddbb389 100644
--- a/rpcs3/Emu/Cell/PPUTranslator.h
+++ b/rpcs3/Emu/Cell/PPUTranslator.h
@@ -55,6 +55,7 @@ class PPUTranslator final : public cpu_translator
 	llvm::StructType* m_thread_type;
 
 	llvm::Value* m_mtocr_table{};
+	llvm::Value* m_frsqrte_table{};
 
 	llvm::Value* m_globals[175];
 	llvm::Value** const m_g_cr = m_globals + 99;