mirror of
https://github.com/dolphin-emu/dolphin.git
synced 2025-01-28 18:33:14 +00:00
Merge pull request #7428 from MerryMage/rm-j-GenFrsqrte
GenFrsqrte: Reduce branches in fast-path and inline most behavior
This commit is contained in:
commit
0e0fd18d5e
@ -32,17 +32,14 @@ void CommonAsmRoutines::GenFrsqrte()
|
||||
// This function clobbers all three RSCRATCH.
|
||||
MOVQ_xmm(R(RSCRATCH), XMM0);
|
||||
|
||||
// Negative and zero inputs set an exception and take the complex path.
|
||||
TEST(64, R(RSCRATCH), R(RSCRATCH));
|
||||
FixupBranch zero = J_CC(CC_Z, true);
|
||||
FixupBranch negative = J_CC(CC_S, true);
|
||||
// Extract exponent
|
||||
MOV(64, R(RSCRATCH_EXTRA), R(RSCRATCH));
|
||||
SHR(64, R(RSCRATCH_EXTRA), Imm8(52));
|
||||
|
||||
// Zero and max exponents (non-normal floats) take the complex path.
|
||||
FixupBranch complex1 = J_CC(CC_Z, true);
|
||||
CMP(32, R(RSCRATCH_EXTRA), Imm32(0x7FF));
|
||||
FixupBranch complex2 = J_CC(CC_E, true);
|
||||
// Negatives, zeros, denormals, infinities and NaNs take the complex path.
|
||||
LEA(32, RSCRATCH2, MDisp(RSCRATCH_EXTRA, -1));
|
||||
CMP(32, R(RSCRATCH2), Imm32(0x7FE));
|
||||
FixupBranch complex = J_CC(CC_AE, true);
|
||||
|
||||
SUB(32, R(RSCRATCH_EXTRA), Imm32(0x3FD));
|
||||
SAR(32, R(RSCRATCH_EXTRA), Imm8(1));
|
||||
@ -75,24 +72,53 @@ void CommonAsmRoutines::GenFrsqrte()
|
||||
MOVQ_xmm(XMM0, R(RSCRATCH2));
|
||||
RET();
|
||||
|
||||
// Exception flags for zero input.
|
||||
SetJumpTarget(zero);
|
||||
SetJumpTarget(complex);
|
||||
AND(32, R(RSCRATCH_EXTRA), Imm32(0x7FF));
|
||||
CMP(32, R(RSCRATCH_EXTRA), Imm32(0x7FF));
|
||||
FixupBranch nan_or_inf = J_CC(CC_E);
|
||||
|
||||
MOV(64, R(RSCRATCH2), R(RSCRATCH));
|
||||
SHL(64, R(RSCRATCH2), Imm8(1));
|
||||
FixupBranch nonzero = J_CC(CC_NZ);
|
||||
|
||||
// +0.0 or -0.0
|
||||
TEST(32, PPCSTATE(fpscr), Imm32(FPSCR_ZX));
|
||||
FixupBranch skip_set_fx1 = J_CC(CC_NZ);
|
||||
OR(32, PPCSTATE(fpscr), Imm32(FPSCR_FX | FPSCR_ZX));
|
||||
FixupBranch complex3 = J();
|
||||
SetJumpTarget(skip_set_fx1);
|
||||
MOV(64, R(RSCRATCH2), Imm64(0x7FF0'0000'0000'0000));
|
||||
OR(64, R(RSCRATCH2), R(RSCRATCH));
|
||||
MOVQ_xmm(XMM0, R(RSCRATCH2));
|
||||
RET();
|
||||
|
||||
// Exception flags for negative input.
|
||||
// SNaN or QNaN or +Inf or -Inf
|
||||
SetJumpTarget(nan_or_inf);
|
||||
MOV(64, R(RSCRATCH2), R(RSCRATCH));
|
||||
SHL(64, R(RSCRATCH2), Imm8(12));
|
||||
FixupBranch inf = J_CC(CC_Z);
|
||||
BTS(64, R(RSCRATCH), Imm8(51));
|
||||
MOVQ_xmm(XMM0, R(RSCRATCH));
|
||||
RET();
|
||||
SetJumpTarget(inf);
|
||||
BT(64, R(RSCRATCH), Imm8(63));
|
||||
FixupBranch negative = J_CC(CC_C);
|
||||
XORPD(XMM0, R(XMM0));
|
||||
RET();
|
||||
|
||||
SetJumpTarget(nonzero);
|
||||
FixupBranch denormal = J_CC(CC_NC);
|
||||
|
||||
// Negative sign
|
||||
SetJumpTarget(negative);
|
||||
TEST(32, PPCSTATE(fpscr), Imm32(FPSCR_VXSQRT));
|
||||
FixupBranch skip_set_fx2 = J_CC(CC_NZ);
|
||||
OR(32, PPCSTATE(fpscr), Imm32(FPSCR_FX | FPSCR_VXSQRT));
|
||||
|
||||
SetJumpTarget(skip_set_fx1);
|
||||
SetJumpTarget(skip_set_fx2);
|
||||
SetJumpTarget(complex1);
|
||||
SetJumpTarget(complex2);
|
||||
SetJumpTarget(complex3);
|
||||
MOV(64, R(RSCRATCH2), Imm64(0x7FF8'0000'0000'0000));
|
||||
MOVQ_xmm(XMM0, R(RSCRATCH2));
|
||||
RET();
|
||||
|
||||
SetJumpTarget(denormal);
|
||||
ABI_PushRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, 8);
|
||||
ABI_CallFunction(Common::ApproximateReciprocalSquareRoot);
|
||||
ABI_PopRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, 8);
|
||||
|
@ -13,3 +13,7 @@ add_dolphin_test(DSPAssemblyTest
|
||||
add_dolphin_test(ESFormatsTest IOS/ES/FormatsTest.cpp IOS/ES/TestBinaryData.cpp)
|
||||
|
||||
add_dolphin_test(FileSystemTest IOS/FS/FileSystemTest.cpp)
|
||||
|
||||
# The Frsqrte test lives under PowerPC/Jit64Common, so it is only registered
# when _M_X86 is set.
if(_M_X86)
|
||||
add_dolphin_test(PowerPCTest PowerPC/Jit64Common/Frsqrte.cpp)
|
||||
endif()
|
||||
|
101
Source/UnitTests/Core/PowerPC/Jit64Common/Frsqrte.cpp
Normal file
101
Source/UnitTests/Core/PowerPC/Jit64Common/Frsqrte.cpp
Normal file
@ -0,0 +1,101 @@
|
||||
// Copyright 2018 Dolphin Emulator Project
|
||||
// Licensed under GPLv2+
|
||||
// Refer to the license.txt file included.
|
||||
|
||||
#include <cstring>
|
||||
#include <vector>
|
||||
|
||||
#include "Common/BitUtils.h"
|
||||
#include "Common/CommonTypes.h"
|
||||
#include "Common/FloatUtils.h"
|
||||
#include "Common/x64ABI.h"
|
||||
#include "Core/PowerPC/Gekko.h"
|
||||
#include "Core/PowerPC/Jit64Common/Jit64AsmCommon.h"
|
||||
#include "Core/PowerPC/Jit64Common/Jit64Base.h"
|
||||
#include "Core/PowerPC/Jit64Common/Jit64PowerPCState.h"
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
class TestCommonAsmRoutines : public CommonAsmRoutines
|
||||
{
|
||||
public:
|
||||
TestCommonAsmRoutines()
|
||||
{
|
||||
using namespace Gen;
|
||||
|
||||
AllocCodeSpace(4096);
|
||||
m_const_pool.Init(AllocChildCodeSpace(1024), 1024);
|
||||
|
||||
const auto raw_frsqrte = reinterpret_cast<double (*)(double)>(AlignCode4());
|
||||
GenFrsqrte();
|
||||
|
||||
wrapped_frsqrte = reinterpret_cast<u64 (*)(u64, UReg_FPSCR&)>(AlignCode4());
|
||||
ABI_PushRegistersAndAdjustStack(ABI_ALL_CALLEE_SAVED, 8, 16);
|
||||
|
||||
// We know the frsqrte implementation only accesses the fpscr. We manufacture a
|
||||
// PPCSTATE pointer so we read/write to our provided fpscr argument instead.
|
||||
XOR(32, R(RPPCSTATE), R(RPPCSTATE));
|
||||
LEA(64, RSCRATCH, PPCSTATE(fpscr));
|
||||
SUB(64, R(ABI_PARAM2), R(RSCRATCH));
|
||||
MOV(64, R(RPPCSTATE), R(ABI_PARAM2));
|
||||
|
||||
// Call
|
||||
MOVQ_xmm(XMM0, R(ABI_PARAM1));
|
||||
ABI_CallFunction(raw_frsqrte);
|
||||
MOVQ_xmm(R(ABI_RETURN), XMM0);
|
||||
|
||||
ABI_PopRegistersAndAdjustStack(ABI_ALL_CALLEE_SAVED, 8, 16);
|
||||
RET();
|
||||
}
|
||||
|
||||
u64 (*wrapped_frsqrte)(u64, UReg_FPSCR&);
|
||||
};
|
||||
|
||||
// Verifies that the JIT-generated frsqrte routine returns bit-identical results
// to Common::ApproximateReciprocalSquareRoot for a table of boundary inputs
// (zeros, denormals, normals, infinities and NaNs of both signs).
TEST(Jit64, Frsqrte)
{
  TestCommonAsmRoutines routines;

  const std::vector<u64> special_values{
      0x0000'0000'0000'0000,  // positive zero
      0x0000'0000'0000'0001,  // smallest positive denormal
      0x0000'0000'0100'0000,  // positive denormal with only bit 24 set
      0x000F'FFFF'FFFF'FFFF,  // largest positive denormal
      0x0010'0000'0000'0000,  // smallest positive normal
      0x0010'0000'0000'0002,  // positive normal just above the smallest
      0x3FF0'0000'0000'0000,  // 1.0
      0x7FEF'FFFF'FFFF'FFFF,  // largest positive normal
      0x7FF0'0000'0000'0000,  // positive infinity
      0x7FF0'0000'0000'0001,  // first positive SNaN
      0x7FF7'FFFF'FFFF'FFFF,  // last positive SNaN
      0x7FF8'0000'0000'0000,  // first positive QNaN
      0x7FFF'FFFF'FFFF'FFFF,  // last positive QNaN
      0x8000'0000'0000'0000,  // negative zero
      0x8000'0000'0000'0001,  // smallest negative denormal
      0x8000'0000'0100'0000,  // negative denormal with only bit 24 set
      0x800F'FFFF'FFFF'FFFF,  // largest negative denormal
      0x8010'0000'0000'0000,  // smallest negative normal
      0x8010'0000'0000'0002,  // negative normal just below the smallest
      0xBFF0'0000'0000'0000,  // -1.0
      0xFFEF'FFFF'FFFF'FFFF,  // largest negative normal
      0xFFF0'0000'0000'0000,  // negative infinity
      0xFFF0'0000'0000'0001,  // first negative SNaN
      0xFFF7'FFFF'FFFF'FFFF,  // last negative SNaN
      0xFFF8'0000'0000'0000,  // first negative QNaN
      0xFFFF'FFFF'FFFF'FFFF,  // last negative QNaN
  };

  // Value-initialize so the generated code never reads indeterminate flag bits,
  // whether or not UReg_FPSCR supplies its own default initializer. The same
  // object is passed to every call, so bits set for one input stay set for the
  // inputs that follow.
  UReg_FPSCR fpscr{};

  for (const u64 ivalue : special_values)
  {
    const double dvalue = Common::BitCast<double>(ivalue);

    const u64 expected = Common::BitCast<u64>(Common::ApproximateReciprocalSquareRoot(dvalue));

    const u64 actual = routines.wrapped_frsqrte(ivalue, fpscr);

    // %llx requires unsigned long long; u64 may be a different 64-bit type
    // (e.g. unsigned long), so cast explicitly to keep the call well-defined.
    printf("%016llx -> %016llx == %016llx\n", static_cast<unsigned long long>(ivalue),
           static_cast<unsigned long long>(actual), static_cast<unsigned long long>(expected));

    EXPECT_EQ(expected, actual);
  }
}
|
Loading…
x
Reference in New Issue
Block a user