mirror of
https://github.com/RPCS3/rpcs3.git
synced 2024-11-17 08:11:51 +00:00
PPU: implement quasi-accurate FRSQRTE
Denormals are handled like zeros. NaN handling is inaccurate in some cases. Co-authored-by: Nekotekina <nekotekina@gmail.com>
This commit is contained in:
parent
5b8f105308
commit
7b162c7513
@ -16,3 +16,54 @@ inline int fexpf(float x)
|
||||
{
|
||||
return (std::bit_cast<u32>(x) >> 23) & 0xff;
|
||||
}
|
||||
|
||||
// Mantissa seeds used when building the FRSQRTE lookup table.
// Each value holds mantissa bits positioned for the high 32-bit word of a double,
// and is OR-ed with the computed exponent bits by ppu_frsqrte_lut_t.
// Index layout (see the idx computation in ppu_frsqrte_lut_t):
//   bits 0..2 - three most significant mantissa bits of the operand
//   bit 3     - inverted lowest bit of the operand's biased exponent
constexpr u32 ppu_frsqrte_mantissas[16] =
{
	// Entries 0..7: lowest biased-exponent bit of the operand is set
	0x000f'1000u, // [0]
	0x000d'8000u, // [1]
	0x000c'0000u, // [2]
	0x000a'8000u, // [3]
	0x0009'8000u, // [4]
	0x0008'8000u, // [5]
	0x0008'0000u, // [6]
	0x0007'0000u, // [7]

	// Entries 8..15: lowest biased-exponent bit of the operand is clear
	0x0006'0000u, // [8]
	0x0004'c000u, // [9]
	0x0003'c000u, // [10]
	0x0003'0000u, // [11]
	0x0002'0000u, // [12]
	0x0001'8000u, // [13]
	0x0001'0000u, // [14]
	0x0000'8000u, // [15]
};
|
||||
|
||||
// Large lookup table for FRSQRTE instruction
|
||||
inline struct ppu_frsqrte_lut_t
|
||||
{
|
||||
// Store only high 32 bits of doubles
|
||||
u32 data[0x8000]{};
|
||||
|
||||
constexpr ppu_frsqrte_lut_t() noexcept
|
||||
{
|
||||
for (u64 i = 0; i < 0x8000; i++)
|
||||
{
|
||||
// Decomposed LUT index
|
||||
const u64 sign = i >> 14;
|
||||
const u64 expv = (i >> 3) & 0x7ff;
|
||||
|
||||
// (0x3FF - (((EXP_BITS(b) - 0x3FF) >> 1) + 1)) << 52
|
||||
const u64 exp = 0x3fe0'0000 - (((expv + 0x1c01) >> 1) << (52 - 32));
|
||||
|
||||
if (expv == 0) // ±INF on zero/denormal, not accurate
|
||||
{
|
||||
data[i] = 0x7ff0'0000 | (sign << 31);
|
||||
}
|
||||
else if (expv == 0x7ff)
|
||||
{
|
||||
if (i == (0x7ff << 3))
|
||||
data[i] = 0; // Zero on +INF, inaccurate
|
||||
else
|
||||
data[i] = 0x7ff8'0000; // QNaN
|
||||
}
|
||||
else if (sign)
|
||||
{
|
||||
data[i] = 0x7ff8'0000; // QNaN
|
||||
}
|
||||
else
|
||||
{
|
||||
// ((MAN_BITS(b) >> 49) & 7ull) + (!(EXP_BITS(b) & 1) << 3)
|
||||
const u64 idx = 8 ^ (i & 0xf);
|
||||
|
||||
data[i] = ppu_frsqrte_mantissas[idx] | exp;
|
||||
}
|
||||
}
|
||||
}
|
||||
} ppu_frqrte_lut;
|
||||
|
@ -6534,10 +6534,13 @@ auto FRSQRTE()
|
||||
if constexpr (Build == 0xf1a6)
|
||||
return ppu_exec_select<Flags...>::template select<set_fpcc>();
|
||||
|
||||
static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) {
|
||||
ppu.fpr[op.frd] = 1.0 / std::sqrt(ppu.fpr[op.frb]);
|
||||
static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op)
|
||||
{
|
||||
const u64 b = std::bit_cast<u64>(ppu.fpr[op.frb]);
|
||||
ppu.fpr[op.frd] = std::bit_cast<f64>(u64{ppu_frqrte_lut.data[b >> 49]} << 32);
|
||||
ppu_set_fpcc<Flags...>(ppu, ppu.fpr[op.frd], 0.);
|
||||
};
|
||||
|
||||
RETURN_(ppu, op);
|
||||
}
|
||||
|
||||
|
@ -1,6 +1,7 @@
|
||||
#ifdef LLVM_AVAILABLE
|
||||
|
||||
#include "Emu/system_config.h"
|
||||
#include "Emu/Cell/Common.h"
|
||||
#include "PPUTranslator.h"
|
||||
#include "PPUThread.h"
|
||||
|
||||
@ -4344,8 +4345,14 @@ void PPUTranslator::FMUL(ppu_opcode_t op)
|
||||
|
||||
void PPUTranslator::FRSQRTE(ppu_opcode_t op)
|
||||
{
|
||||
const auto b = GetFpr(op.frb, 32);
|
||||
const auto result = m_ir->CreateFDiv(ConstantFP::get(GetType<f32>(), 1.0), Call(GetType<f32>(), "llvm.sqrt.f32", b));
|
||||
if (!m_frsqrte_table)
|
||||
{
|
||||
m_frsqrte_table = new GlobalVariable(*m_module, ArrayType::get(GetType<u32>(), 0x8000), true, GlobalValue::PrivateLinkage, ConstantDataArray::get(m_context, ppu_frqrte_lut.data));
|
||||
}
|
||||
|
||||
const auto b = m_ir->CreateBitCast(GetFpr(op.frb), GetType<u64>());
|
||||
const auto v = m_ir->CreateLoad(m_ir->CreateGEP(m_frsqrte_table, {m_ir->getInt64(0), m_ir->CreateLShr(b, 49)}));
|
||||
const auto result = m_ir->CreateBitCast(m_ir->CreateShl(ZExt(v), 32), GetType<f64>());
|
||||
SetFpr(op.frd, result);
|
||||
|
||||
//m_ir->CreateStore(GetUndef<bool>(), m_fpscr_fr);
|
||||
|
@ -55,6 +55,7 @@ class PPUTranslator final : public cpu_translator
|
||||
llvm::StructType* m_thread_type;
|
||||
|
||||
llvm::Value* m_mtocr_table{};
|
||||
llvm::Value* m_frsqrte_table{};
|
||||
|
||||
llvm::Value* m_globals[175];
|
||||
llvm::Value** const m_g_cr = m_globals + 99;
|
||||
|
Loading…
Reference in New Issue
Block a user