Rearchitect a bit of our AsmCommon routines.

JitCommon is becoming a cluster of x86-specific code and code that is common to multiple recompilers.
This overlap is beginning to cause us issues.

Begin by breaking the common ASM arrays out into their own file and moving the x86-specific routines into their own folder.
Ryan Houdek 2015-01-25 01:51:18 +00:00
parent bf0293231f
commit 12b9ada268
9 changed files with 636 additions and 616 deletions
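
A condensed sketch of the resulting split, summarized from the header diffs below (member lists abbreviated): the platform-neutral declarations and the CommonAsmRoutinesBase interface stay in JitCommon/JitAsmCommon.h, while the x86-64 generators move into the new Jit64Common/Jit64AsmCommon.h and its .cpp.

// JitCommon/JitAsmCommon.h -- stays shared across recompilers
class CommonAsmRoutinesBase
{
public:
    const u8* fifoDirectWrite8;
    // ... more fifo entry points and the quantized load/store dispatch tables,
    // e.g. const u8** singleStoreQuantized;
};

// Jit64Common/Jit64AsmCommon.h -- x86-64 only
class CommonAsmRoutines : public CommonAsmRoutinesBase, public EmuCodeBlock
{
protected:
    void GenQuantizedLoads();
    void GenQuantizedStores();
    void GenQuantizedSingleStores();
public:
    void GenFifoWrite(int size);
    void GenFrsqrte();
    void GenFres();
    void GenMfcr();
};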


@@ -199,6 +199,7 @@ if(_M_X86)
PowerPC/Jit64/Jit_Paired.cpp
PowerPC/Jit64/JitRegCache.cpp
PowerPC/Jit64/Jit_SystemRegisters.cpp
PowerPC/Jit64Common/Jit64AsmCommon.cpp
PowerPC/JitCommon/JitBackpatch.cpp
PowerPC/JitCommon/Jit_Util.cpp
PowerPC/JitCommon/TrampolineCache.cpp)


@@ -235,6 +235,7 @@
<ClCompile Include="PowerPC\Jit64\Jit_LoadStorePaired.cpp" />
<ClCompile Include="PowerPC\Jit64\Jit_Paired.cpp" />
<ClCompile Include="PowerPC\Jit64\Jit_SystemRegisters.cpp" />
<ClCompile Include="PowerPC\Jit64Common\Jit64AsmCommon.cpp" />
<ClCompile Include="PowerPC\JitCommon\JitAsmCommon.cpp" />
<ClCompile Include="PowerPC\JitCommon\JitBackpatch.cpp" />
<ClCompile Include="PowerPC\JitCommon\JitBase.cpp" />
@@ -417,6 +418,7 @@
<ClInclude Include="PowerPC\Jit64\JitRegCache.h" />
<ClInclude Include="PowerPC\JitILCommon\IR.h" />
<ClInclude Include="PowerPC\JitILCommon\JitILBase.h" />
<ClInclude Include="PowerPC\Jit64Common\Jit64AsmCommon.h" />
<ClInclude Include="PowerPC\JitCommon\JitAsmCommon.h" />
<ClInclude Include="PowerPC\JitCommon\JitBase.h" />
<ClInclude Include="PowerPC\JitCommon\JitCache.h" />


@@ -631,6 +631,9 @@
<ClCompile Include="PowerPC\SignatureDB.cpp">
<Filter>PowerPC</Filter>
</ClCompile>
<ClCompile Include="PowerPC\Jit64Common\Jit64AsmCommon.cpp">
<Filter>PowerPC\Jit64Common</Filter>
</ClCompile>
<ClCompile Include="PowerPC\JitCommon\Jit_Util.cpp">
<Filter>PowerPC\JitCommon</Filter>
</ClCompile>
@@ -1184,6 +1187,9 @@
<ClInclude Include="PowerPC\SignatureDB.h">
<Filter>PowerPC</Filter>
</ClInclude>
<ClInclude Include="PowerPC\Jit64Common\Jit64AsmCommon.h">
<Filter>PowerPC\Jit64Common</Filter>
</ClInclude>
<ClInclude Include="PowerPC\JitCommon\Jit_Util.h">
<Filter>PowerPC\JitCommon</Filter>
</ClInclude>
@@ -1229,4 +1235,4 @@
<ItemGroup>
<Text Include="CMakeLists.txt" />
</ItemGroup>
</Project>
</Project>


@@ -4,7 +4,7 @@
#pragma once
#include "Core/PowerPC/JitCommon/JitAsmCommon.h"
#include "Core/PowerPC/Jit64Common/Jit64AsmCommon.h"
// In Dolphin, we don't use inline assembly. Instead, we generate all machine-near
// code at runtime. In the case of fixed code like this, after writing it, we write


@@ -0,0 +1,601 @@
// Copyright 2013 Dolphin Emulator Project
// Licensed under GPLv2
// Refer to the license.txt file included.
#include "Common/MathUtil.h"
#include "Common/x64ABI.h"
#include "Common/x64Emitter.h"
#include "Core/PowerPC/Jit64Common/Jit64AsmCommon.h"
#include "Core/PowerPC/JitCommon/JitBase.h"
#define QUANTIZED_REGS_TO_SAVE \
(ABI_ALL_CALLER_SAVED & ~BitSet32 { \
RSCRATCH, RSCRATCH2, RSCRATCH_EXTRA, XMM0+16, XMM1+16 \
})
#define QUANTIZED_REGS_TO_SAVE_LOAD (QUANTIZED_REGS_TO_SAVE | BitSet32 { RSCRATCH2 })
using namespace Gen;
void CommonAsmRoutines::GenFifoWrite(int size)
{
// Assume value in RSCRATCH
u32 gather_pipe = (u32)(u64)GPFifo::m_gatherPipe;
_assert_msg_(DYNA_REC, gather_pipe <= 0x7FFFFFFF, "Gather pipe not in low 2GB of memory!");
MOV(32, R(RSCRATCH2), M(&GPFifo::m_gatherPipeCount));
SwapAndStore(size, MDisp(RSCRATCH2, gather_pipe), RSCRATCH);
ADD(32, R(RSCRATCH2), Imm8(size >> 3));
MOV(32, M(&GPFifo::m_gatherPipeCount), R(RSCRATCH2));
RET();
}
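
// --- Illustrative sketch, not part of the committed file -------------------
// Scalar model of the routine emitted above, with the gather pipe passed in
// explicitly instead of living in GPFifo (FifoWriteSketch is a hypothetical name).
static inline void FifoWriteSketch(u8* gather_pipe, u32* count, u64 value, int size_in_bits)
{
    // SwapAndStore: write the value big-endian at the current write offset.
    for (int i = 0; i < size_in_bits / 8; i++)
        gather_pipe[*count + i] = (u8)(value >> (size_in_bits - 8 - 8 * i));
    // ADD(32, R(RSCRATCH2), Imm8(size >> 3)): advance the offset in bytes.
    *count += size_in_bits / 8;
}
// ----------------------------------------------------------------------------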
void CommonAsmRoutines::GenFrsqrte()
{
// Assume input in XMM0.
// This function clobbers all three RSCRATCH.
MOVQ_xmm(R(RSCRATCH), XMM0);
// Negative and zero inputs set an exception and take the complex path.
TEST(64, R(RSCRATCH), R(RSCRATCH));
FixupBranch zero = J_CC(CC_Z, true);
FixupBranch negative = J_CC(CC_S, true);
MOV(64, R(RSCRATCH_EXTRA), R(RSCRATCH));
SHR(64, R(RSCRATCH_EXTRA), Imm8(52));
// Zero and max exponents (non-normal floats) take the complex path.
FixupBranch complex1 = J_CC(CC_Z, true);
CMP(32, R(RSCRATCH_EXTRA), Imm32(0x7FF));
FixupBranch complex2 = J_CC(CC_E, true);
SUB(32, R(RSCRATCH_EXTRA), Imm32(0x3FD));
SAR(32, R(RSCRATCH_EXTRA), Imm8(1));
MOV(32, R(RSCRATCH2), Imm32(0x3FF));
SUB(32, R(RSCRATCH2), R(RSCRATCH_EXTRA));
SHL(64, R(RSCRATCH2), Imm8(52)); // exponent = ((0x3FFLL << 52) - ((exponent - (0x3FELL << 52)) / 2)) & (0x7FFLL << 52);
MOV(64, R(RSCRATCH_EXTRA), R(RSCRATCH));
SHR(64, R(RSCRATCH_EXTRA), Imm8(48));
AND(32, R(RSCRATCH_EXTRA), Imm8(0x1F));
XOR(32, R(RSCRATCH_EXTRA), Imm8(0x10)); // int index = i / 2048 + (odd_exponent ? 16 : 0);
SHR(64, R(RSCRATCH), Imm8(37));
AND(32, R(RSCRATCH), Imm32(0x7FF));
IMUL(32, RSCRATCH, MScaled(RSCRATCH_EXTRA, SCALE_4, (u32)(u64)MathUtil::frsqrte_expected_dec));
MOV(32, R(RSCRATCH_EXTRA), MScaled(RSCRATCH_EXTRA, SCALE_4, (u32)(u64)MathUtil::frsqrte_expected_base));
SUB(32, R(RSCRATCH_EXTRA), R(RSCRATCH));
SHL(64, R(RSCRATCH_EXTRA), Imm8(26));
OR(64, R(RSCRATCH2), R(RSCRATCH_EXTRA)); // vali |= (s64)(frsqrte_expected_base[index] - frsqrte_expected_dec[index] * (i % 2048)) << 26;
MOVQ_xmm(XMM0, R(RSCRATCH2));
RET();
// Exception flags for zero input.
SetJumpTarget(zero);
TEST(32, PPCSTATE(fpscr), Imm32(FPSCR_ZX));
FixupBranch skip_set_fx1 = J_CC(CC_NZ);
OR(32, PPCSTATE(fpscr), Imm32(FPSCR_FX | FPSCR_ZX));
FixupBranch complex3 = J();
// Exception flags for negative input.
SetJumpTarget(negative);
TEST(32, PPCSTATE(fpscr), Imm32(FPSCR_VXSQRT));
FixupBranch skip_set_fx2 = J_CC(CC_NZ);
OR(32, PPCSTATE(fpscr), Imm32(FPSCR_FX | FPSCR_VXSQRT));
SetJumpTarget(skip_set_fx1);
SetJumpTarget(skip_set_fx2);
SetJumpTarget(complex1);
SetJumpTarget(complex2);
SetJumpTarget(complex3);
ABI_PushRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, 8);
ABI_CallFunction((void *)&MathUtil::ApproximateReciprocalSquareRoot);
ABI_PopRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, 8);
RET();
}
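
// --- Illustrative sketch, not part of the committed file -------------------
// Rough scalar equivalent of the fast path above for a positive, normal input,
// assembled from the inline comments; zero, negative, denormal and NaN inputs
// fall back to MathUtil::ApproximateReciprocalSquareRoot instead.
// (std::memcpy assumes <cstring>; FrsqrteFastPathSketch is a hypothetical name.)
static inline double FrsqrteFastPathSketch(double val)
{
    s64 bits;
    std::memcpy(&bits, &val, sizeof(bits));
    s64 exponent = bits & (0x7FFLL << 52);
    bool odd_exponent = !(exponent & (1LL << 52));
    s64 result = ((0x3FFLL << 52) - ((exponent - (0x3FELL << 52)) / 2)) & (0x7FFLL << 52);
    int i = (int)((bits & ((1LL << 52) - 1)) >> 37);  // top mantissa bits
    int index = i / 2048 + (odd_exponent ? 16 : 0);   // table row
    result |= (s64)(MathUtil::frsqrte_expected_base[index] -
                    MathUtil::frsqrte_expected_dec[index] * (i % 2048)) << 26;
    std::memcpy(&val, &result, sizeof(val));
    return val;
}
// ----------------------------------------------------------------------------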
void CommonAsmRoutines::GenFres()
{
// Assume input in XMM0.
// This function clobbers all three RSCRATCH.
MOVQ_xmm(R(RSCRATCH), XMM0);
// Zero inputs set an exception and take the complex path.
TEST(64, R(RSCRATCH), R(RSCRATCH));
FixupBranch zero = J_CC(CC_Z);
MOV(64, R(RSCRATCH_EXTRA), R(RSCRATCH));
SHR(64, R(RSCRATCH_EXTRA), Imm8(52));
MOV(32, R(RSCRATCH2), R(RSCRATCH_EXTRA));
AND(32, R(RSCRATCH_EXTRA), Imm32(0x7FF)); // exp
AND(32, R(RSCRATCH2), Imm32(0x800)); // sign
CMP(32, R(RSCRATCH_EXTRA), Imm32(895));
// Take the complex path for very large/small exponents.
FixupBranch complex1 = J_CC(CC_L);
CMP(32, R(RSCRATCH_EXTRA), Imm32(1149));
FixupBranch complex2 = J_CC(CC_GE);
SUB(32, R(RSCRATCH_EXTRA), Imm32(0x7FD));
NEG(32, R(RSCRATCH_EXTRA));
OR(32, R(RSCRATCH_EXTRA), R(RSCRATCH2));
SHL(64, R(RSCRATCH_EXTRA), Imm8(52)); // vali = sign | exponent
MOV(64, R(RSCRATCH2), R(RSCRATCH));
SHR(64, R(RSCRATCH), Imm8(37));
SHR(64, R(RSCRATCH2), Imm8(47));
AND(32, R(RSCRATCH), Imm32(0x3FF)); // i % 1024
AND(32, R(RSCRATCH2), Imm8(0x1F)); // i / 1024
IMUL(32, RSCRATCH, MScaled(RSCRATCH2, SCALE_4, (u32)(u64)MathUtil::fres_expected_dec));
ADD(32, R(RSCRATCH), Imm8(1));
SHR(32, R(RSCRATCH), Imm8(1));
MOV(32, R(RSCRATCH2), MScaled(RSCRATCH2, SCALE_4, (u32)(u64)MathUtil::fres_expected_base));
SUB(32, R(RSCRATCH2), R(RSCRATCH));
SHL(64, R(RSCRATCH2), Imm8(29));
OR(64, R(RSCRATCH2), R(RSCRATCH_EXTRA)); // vali |= (s64)(fres_expected_base[i / 1024] - (fres_expected_dec[i / 1024] * (i % 1024) + 1) / 2) << 29
MOVQ_xmm(XMM0, R(RSCRATCH2));
RET();
// Exception flags for zero input.
SetJumpTarget(zero);
TEST(32, PPCSTATE(fpscr), Imm32(FPSCR_ZX));
FixupBranch skip_set_fx1 = J_CC(CC_NZ);
OR(32, PPCSTATE(fpscr), Imm32(FPSCR_FX | FPSCR_ZX));
SetJumpTarget(skip_set_fx1);
SetJumpTarget(complex1);
SetJumpTarget(complex2);
ABI_PushRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, 8);
ABI_CallFunction((void *)&MathUtil::ApproximateReciprocal);
ABI_PopRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, 8);
RET();
}
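
// --- Illustrative sketch, not part of the committed file -------------------
// Rough scalar equivalent of the fast path above, following the inline
// comments; zero inputs and exponents outside [895, 1149) fall back to
// MathUtil::ApproximateReciprocal. (FresFastPathSketch is a hypothetical name.)
static inline double FresFastPathSketch(double val)
{
    s64 bits;
    std::memcpy(&bits, &val, sizeof(bits));  // assumes <cstring>
    s64 sign = bits & (1LL << 63);
    int exponent = (int)((bits >> 52) & 0x7FF);
    s64 result = sign | ((s64)(0x7FD - exponent) << 52);  // vali = sign | exponent
    int i = (int)((bits & ((1LL << 52) - 1)) >> 37);
    result |= (s64)(MathUtil::fres_expected_base[i / 1024] -
                    (MathUtil::fres_expected_dec[i / 1024] * (i % 1024) + 1) / 2) << 29;
    std::memcpy(&val, &result, sizeof(val));
    return val;
}
// ----------------------------------------------------------------------------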
void CommonAsmRoutines::GenMfcr()
{
// Input: none
// Output: RSCRATCH
// This function clobbers all three RSCRATCH.
X64Reg dst = RSCRATCH;
X64Reg tmp = RSCRATCH2;
X64Reg cr_val = RSCRATCH_EXTRA;
XOR(32, R(dst), R(dst));
// we only need to zero the high bits of tmp once
XOR(32, R(tmp), R(tmp));
for (int i = 0; i < 8; i++)
{
static const u32 m_flagTable[8] = { 0x0, 0x1, 0x8, 0x9, 0x0, 0x1, 0x8, 0x9 };
if (i != 0)
SHL(32, R(dst), Imm8(4));
MOV(64, R(cr_val), PPCSTATE(cr_val[i]));
// EQ: Bits 31-0 == 0; set flag bit 1
TEST(32, R(cr_val), R(cr_val));
// FIXME: is there a better way to do this without the partial register merging?
SETcc(CC_Z, R(tmp));
LEA(32, dst, MComplex(dst, tmp, SCALE_2, 0));
// GT: Value > 0; set flag bit 2
TEST(64, R(cr_val), R(cr_val));
SETcc(CC_G, R(tmp));
LEA(32, dst, MComplex(dst, tmp, SCALE_4, 0));
// SO: Bit 61 set; set flag bit 0
// LT: Bit 62 set; set flag bit 3
SHR(64, R(cr_val), Imm8(61));
OR(32, R(dst), MScaled(cr_val, SCALE_4, (u32)(u64)m_flagTable));
}
RET();
}
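
// --- Illustrative sketch, not part of the committed file -------------------
// Scalar model of the loop above: each internal 64-bit CR value is folded back
// into a 4-bit PowerPC CR field (LT=8, GT=4, EQ=2, SO=1), with cr0 ending up
// in the most significant nibble. MfcrSketch is a hypothetical name.
static inline u32 MfcrSketch(const u64 cr_val[8])
{
    u32 result = 0;
    for (int i = 0; i < 8; i++)
    {
        u32 field = 0;
        if ((u32)cr_val[i] == 0)       field |= 2;  // EQ: bits 31-0 are zero
        if ((s64)cr_val[i] > 0)        field |= 4;  // GT: value is positive
        if (cr_val[i] & (1ULL << 61))  field |= 1;  // SO: bit 61
        if (cr_val[i] & (1ULL << 62))  field |= 8;  // LT: bit 62
        result = (result << 4) | field;
    }
    return result;
}
// ----------------------------------------------------------------------------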
// Safe + Fast Quantizers, originally from JITIL by magumagu
static const float GC_ALIGNED16(m_65535[4]) = {65535.0f, 65535.0f, 65535.0f, 65535.0f};
static const float GC_ALIGNED16(m_32767) = 32767.0f;
static const float GC_ALIGNED16(m_m32768) = -32768.0f;
static const float GC_ALIGNED16(m_255) = 255.0f;
static const float GC_ALIGNED16(m_127) = 127.0f;
static const float GC_ALIGNED16(m_m128) = -128.0f;
#define QUANTIZE_OVERFLOW_SAFE
// according to Intel Docs CVTPS2DQ writes 0x80000000 if the source floating point value is out of int32 range
// while it's OK for large negatives, it isn't for positives
// I don't know whether the overflow actually happens in any games
// but it potentially can cause problems, so we need some clamping
// See comment in header for in/outs.
void CommonAsmRoutines::GenQuantizedStores()
{
const u8* storePairedIllegal = AlignCode4();
UD2();
const u8* storePairedFloat = AlignCode4();
if (cpu_info.bSSSE3)
{
PSHUFB(XMM0, M((void *)pbswapShuffle2x4));
MOVQ_xmm(R(RSCRATCH), XMM0);
}
else
{
MOVQ_xmm(R(RSCRATCH), XMM0);
ROL(64, R(RSCRATCH), Imm8(32));
BSWAP(64, RSCRATCH);
}
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 64, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
RET();
const u8* storePairedU8 = AlignCode4();
SHR(32, R(RSCRATCH2), Imm8(5));
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
MULPS(XMM0, R(XMM1));
#ifdef QUANTIZE_OVERFLOW_SAFE
MINPS(XMM0, M(m_65535));
#endif
CVTTPS2DQ(XMM0, R(XMM0));
PACKSSDW(XMM0, R(XMM0));
PACKUSWB(XMM0, R(XMM0));
MOVD_xmm(R(RSCRATCH), XMM0);
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 16, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
RET();
const u8* storePairedS8 = AlignCode4();
SHR(32, R(RSCRATCH2), Imm8(5));
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
MULPS(XMM0, R(XMM1));
#ifdef QUANTIZE_OVERFLOW_SAFE
MINPS(XMM0, M(m_65535));
#endif
CVTTPS2DQ(XMM0, R(XMM0));
PACKSSDW(XMM0, R(XMM0));
PACKSSWB(XMM0, R(XMM0));
MOVD_xmm(R(RSCRATCH), XMM0);
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 16, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
RET();
const u8* storePairedU16 = AlignCode4();
SHR(32, R(RSCRATCH2), Imm8(5));
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
MULPS(XMM0, R(XMM1));
if (cpu_info.bSSE4_1)
{
#ifdef QUANTIZE_OVERFLOW_SAFE
MINPS(XMM0, M(m_65535));
#endif
CVTTPS2DQ(XMM0, R(XMM0));
PACKUSDW(XMM0, R(XMM0));
MOVD_xmm(R(RSCRATCH), XMM0);
BSWAP(32, RSCRATCH);
ROL(32, R(RSCRATCH), Imm8(16));
}
else
{
XORPS(XMM1, R(XMM1));
MAXPS(XMM0, R(XMM1));
MINPS(XMM0, M(m_65535));
CVTTPS2DQ(XMM0, R(XMM0));
PSHUFLW(XMM0, R(XMM0), 2); // AABBCCDD -> CCAA____
MOVD_xmm(R(RSCRATCH), XMM0);
BSWAP(32, RSCRATCH);
}
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 32, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
RET();
const u8* storePairedS16 = AlignCode4();
SHR(32, R(RSCRATCH2), Imm8(5));
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
MULPS(XMM0, R(XMM1));
#ifdef QUANTIZE_OVERFLOW_SAFE
MINPS(XMM0, M(m_65535));
#endif
CVTTPS2DQ(XMM0, R(XMM0));
PACKSSDW(XMM0, R(XMM0));
MOVD_xmm(R(RSCRATCH), XMM0);
BSWAP(32, RSCRATCH);
ROL(32, R(RSCRATCH), Imm8(16));
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 32, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
RET();
pairedStoreQuantized = reinterpret_cast<const u8**>(const_cast<u8*>(AlignCode16()));
ReserveCodeSpace(8 * sizeof(u8*));
pairedStoreQuantized[0] = storePairedFloat;
pairedStoreQuantized[1] = storePairedIllegal;
pairedStoreQuantized[2] = storePairedIllegal;
pairedStoreQuantized[3] = storePairedIllegal;
pairedStoreQuantized[4] = storePairedU8;
pairedStoreQuantized[5] = storePairedU16;
pairedStoreQuantized[6] = storePairedS8;
pairedStoreQuantized[7] = storePairedS16;
}
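
// --- Illustrative sketch, not part of the committed file -------------------
// Scalar model of one table entry, storePairedU8: each float is scaled by the
// GQR quantization factor (the emitted code indexes m_quantizeTableS with an
// offset derived from RSCRATCH2), then clamped and truncated. The MINPS against
// m_65535 is the QUANTIZE_OVERFLOW_SAFE clamp described above; the clamps here
// model the PACKSSDW/PACKUSWB saturation.
static inline u8 QuantizeToU8Sketch(float value, float scale)
{
    float scaled = value * scale;          // MULPS with the m_quantizeTableS entry
    if (scaled > 255.0f) scaled = 255.0f;
    if (scaled < 0.0f)   scaled = 0.0f;
    return (u8)(s32)scaled;                // CVTTPS2DQ truncation
}
// ----------------------------------------------------------------------------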
// See comment in header for in/outs.
void CommonAsmRoutines::GenQuantizedSingleStores()
{
const u8* storeSingleIllegal = AlignCode4();
UD2();
// Easy!
const u8* storeSingleFloat = AlignCode4();
MOVD_xmm(R(RSCRATCH), XMM0);
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 32, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
RET();
const u8* storeSingleU8 = AlignCode4(); // Used by MKWii
SHR(32, R(RSCRATCH2), Imm8(5));
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
XORPS(XMM1, R(XMM1));
MAXSS(XMM0, R(XMM1));
MINSS(XMM0, M(&m_255));
CVTTSS2SI(RSCRATCH, R(XMM0));
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 8, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
RET();
const u8* storeSingleS8 = AlignCode4();
SHR(32, R(RSCRATCH2), Imm8(5));
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
MAXSS(XMM0, M(&m_m128));
MINSS(XMM0, M(&m_127));
CVTTSS2SI(RSCRATCH, R(XMM0));
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 8, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
RET();
const u8* storeSingleU16 = AlignCode4(); // Used by MKWii
SHR(32, R(RSCRATCH2), Imm8(5));
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
XORPS(XMM1, R(XMM1));
MAXSS(XMM0, R(XMM1));
MINSS(XMM0, M(m_65535));
CVTTSS2SI(RSCRATCH, R(XMM0));
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 16, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
RET();
const u8* storeSingleS16 = AlignCode4();
SHR(32, R(RSCRATCH2), Imm8(5));
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
MAXSS(XMM0, M(&m_m32768));
MINSS(XMM0, M(&m_32767));
CVTTSS2SI(RSCRATCH, R(XMM0));
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 16, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
RET();
singleStoreQuantized = reinterpret_cast<const u8**>(const_cast<u8*>(AlignCode16()));
ReserveCodeSpace(8 * sizeof(u8*));
singleStoreQuantized[0] = storeSingleFloat;
singleStoreQuantized[1] = storeSingleIllegal;
singleStoreQuantized[2] = storeSingleIllegal;
singleStoreQuantized[3] = storeSingleIllegal;
singleStoreQuantized[4] = storeSingleU8;
singleStoreQuantized[5] = storeSingleU16;
singleStoreQuantized[6] = storeSingleS8;
singleStoreQuantized[7] = storeSingleS16;
}
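
// --- Illustrative sketch, not part of the committed file -------------------
// Scalar model of storeSingleS16 above: scale, clamp against m_m32768/m_32767
// (MAXSS/MINSS), then truncate with CVTTSS2SI before the 16-bit store.
static inline s16 QuantizeSingleS16Sketch(float value, float scale)
{
    float scaled = value * scale;                // MULSS with the m_quantizeTableS entry
    if (scaled < -32768.0f) scaled = -32768.0f;  // MAXSS(XMM0, M(&m_m32768))
    if (scaled >  32767.0f) scaled =  32767.0f;  // MINSS(XMM0, M(&m_32767))
    return (s16)(s32)scaled;                     // CVTTSS2SI truncation
}
// ----------------------------------------------------------------------------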
void CommonAsmRoutines::GenQuantizedLoads()
{
const u8* loadPairedIllegal = AlignCode4();
UD2();
// FIXME? This code (in non-MMU mode) assumes all accesses are directly to RAM, i.e.
// don't need hardware access handling. This will definitely crash if paired loads occur
// from non-RAM areas, but as far as I know, this never happens. I don't know if this is
// for a good reason, or merely because no game does this.
// If we find something that actually does do this, maybe this should be changed. How
// much of a performance hit would it be?
const u8* loadPairedFloatTwo = AlignCode4();
if (jit->js.memcheck)
{
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 64, 0, QUANTIZED_REGS_TO_SAVE, false, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
ROL(64, R(RSCRATCH_EXTRA), Imm8(32));
MOVQ_xmm(XMM0, R(RSCRATCH_EXTRA));
}
else if (cpu_info.bSSSE3)
{
MOVQ_xmm(XMM0, MComplex(RMEM, RSCRATCH_EXTRA, 1, 0));
PSHUFB(XMM0, M(pbswapShuffle2x4));
}
else
{
LoadAndSwap(64, RSCRATCH_EXTRA, MComplex(RMEM, RSCRATCH_EXTRA, 1, 0));
ROL(64, R(RSCRATCH_EXTRA), Imm8(32));
MOVQ_xmm(XMM0, R(RSCRATCH_EXTRA));
}
RET();
const u8* loadPairedFloatOne = AlignCode4();
if (jit->js.memcheck)
{
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 32, 0, QUANTIZED_REGS_TO_SAVE, false, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
UNPCKLPS(XMM0, M(m_one));
}
else if (cpu_info.bSSSE3)
{
MOVD_xmm(XMM0, MComplex(RMEM, RSCRATCH_EXTRA, 1, 0));
PSHUFB(XMM0, M(pbswapShuffle1x4));
UNPCKLPS(XMM0, M(m_one));
}
else
{
LoadAndSwap(32, RSCRATCH_EXTRA, MComplex(RMEM, RSCRATCH_EXTRA, 1, 0));
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
UNPCKLPS(XMM0, M(m_one));
}
RET();
const u8* loadPairedU8Two = AlignCode4();
if (jit->js.memcheck)
{
// TODO: Support not swapping in safeLoadToReg to avoid bswapping twice
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 16, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
ROR(16, R(RSCRATCH_EXTRA), Imm8(8));
}
else
{
UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0);
}
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
if (cpu_info.bSSE4_1)
{
PMOVZXBD(XMM0, R(XMM0));
}
else
{
PXOR(XMM1, R(XMM1));
PUNPCKLBW(XMM0, R(XMM1));
PUNPCKLWD(XMM0, R(XMM1));
}
CVTDQ2PS(XMM0, R(XMM0));
SHR(32, R(RSCRATCH2), Imm8(5));
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
MULPS(XMM0, R(XMM1));
RET();
const u8* loadPairedU8One = AlignCode4();
if (jit->js.memcheck)
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 8, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
else
UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 8, 0); // RSCRATCH_EXTRA = 0x000000xx
CVTSI2SS(XMM0, R(RSCRATCH_EXTRA));
SHR(32, R(RSCRATCH2), Imm8(5));
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
UNPCKLPS(XMM0, M(m_one));
RET();
const u8* loadPairedS8Two = AlignCode4();
if (jit->js.memcheck)
{
// TODO: Support not swapping in safeLoadToReg to avoid bswapping twice
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 16, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
ROR(16, R(RSCRATCH_EXTRA), Imm8(8));
}
else
{
UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0);
}
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
if (cpu_info.bSSE4_1)
{
PMOVSXBD(XMM0, R(XMM0));
}
else
{
PUNPCKLBW(XMM0, R(XMM0));
PUNPCKLWD(XMM0, R(XMM0));
PSRAD(XMM0, 24);
}
CVTDQ2PS(XMM0, R(XMM0));
SHR(32, R(RSCRATCH2), Imm8(5));
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
MULPS(XMM0, R(XMM1));
RET();
const u8* loadPairedS8One = AlignCode4();
if (jit->js.memcheck)
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 8, 0, QUANTIZED_REGS_TO_SAVE_LOAD, true, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
else
UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 8, 0, true);
CVTSI2SS(XMM0, R(RSCRATCH_EXTRA));
SHR(32, R(RSCRATCH2), Imm8(5));
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
UNPCKLPS(XMM0, M(m_one));
RET();
const u8* loadPairedU16Two = AlignCode4();
// TODO: Support not swapping in (un)safeLoadToReg to avoid bswapping twice
if (jit->js.memcheck)
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 32, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
else
UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 32, 0, false);
ROL(32, R(RSCRATCH_EXTRA), Imm8(16));
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
if (cpu_info.bSSE4_1)
{
PMOVZXWD(XMM0, R(XMM0));
}
else
{
PXOR(XMM1, R(XMM1));
PUNPCKLWD(XMM0, R(XMM1));
}
CVTDQ2PS(XMM0, R(XMM0));
SHR(32, R(RSCRATCH2), Imm8(5));
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
MULPS(XMM0, R(XMM1));
RET();
const u8* loadPairedU16One = AlignCode4();
if (jit->js.memcheck)
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 16, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
else
UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0, false);
CVTSI2SS(XMM0, R(RSCRATCH_EXTRA));
SHR(32, R(RSCRATCH2), Imm8(5));
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
UNPCKLPS(XMM0, M(m_one));
RET();
const u8* loadPairedS16Two = AlignCode4();
if (jit->js.memcheck)
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 32, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
else
UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 32, 0, false);
ROL(32, R(RSCRATCH_EXTRA), Imm8(16));
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
if (cpu_info.bSSE4_1)
{
PMOVSXWD(XMM0, R(XMM0));
}
else
{
PUNPCKLWD(XMM0, R(XMM0));
PSRAD(XMM0, 16);
}
CVTDQ2PS(XMM0, R(XMM0));
SHR(32, R(RSCRATCH2), Imm8(5));
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
MULPS(XMM0, R(XMM1));
RET();
const u8* loadPairedS16One = AlignCode4();
if (jit->js.memcheck)
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 16, 0, QUANTIZED_REGS_TO_SAVE_LOAD, true, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
else
UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0, true);
CVTSI2SS(XMM0, R(RSCRATCH_EXTRA));
SHR(32, R(RSCRATCH2), Imm8(5));
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
UNPCKLPS(XMM0, M(m_one));
RET();
pairedLoadQuantized = reinterpret_cast<const u8**>(const_cast<u8*>(AlignCode16()));
ReserveCodeSpace(16 * sizeof(u8*));
pairedLoadQuantized[0] = loadPairedFloatTwo;
pairedLoadQuantized[1] = loadPairedIllegal;
pairedLoadQuantized[2] = loadPairedIllegal;
pairedLoadQuantized[3] = loadPairedIllegal;
pairedLoadQuantized[4] = loadPairedU8Two;
pairedLoadQuantized[5] = loadPairedU16Two;
pairedLoadQuantized[6] = loadPairedS8Two;
pairedLoadQuantized[7] = loadPairedS16Two;
pairedLoadQuantized[8] = loadPairedFloatOne;
pairedLoadQuantized[9] = loadPairedIllegal;
pairedLoadQuantized[10] = loadPairedIllegal;
pairedLoadQuantized[11] = loadPairedIllegal;
pairedLoadQuantized[12] = loadPairedU8One;
pairedLoadQuantized[13] = loadPairedU16One;
pairedLoadQuantized[14] = loadPairedS8One;
pairedLoadQuantized[15] = loadPairedS16One;
}
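
To make the dispatch concrete: pairedLoadQuantized is a 16-entry table where, judging from the assignments above, the low three bits select the GQR load type (0 = float, 4 = u8, 5 = u16, 6 = s8, 7 = s16) and bit 3 selects the single-value variants. A minimal scalar sketch of one single-value entry, loadPairedS16One (DequantizeS16OneSketch is a hypothetical name):

static inline void DequantizeS16OneSketch(s16 raw, float scale, float out_ps[2])
{
    out_ps[0] = (float)raw * scale;  // CVTSI2SS + MULSS with the m_dequantizeTableS entry
    out_ps[1] = 1.0f;                // UNPCKLPS(XMM0, M(m_one)) pads ps1 with 1.0
}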


@@ -0,0 +1,22 @@
// Copyright 2013 Dolphin Emulator Project
// Licensed under GPLv2
// Refer to the license.txt file included.
#pragma once
#include "Core/PowerPC/JitCommon/Jit_Util.h"
#include "Core/PowerPC/JitCommon/JitAsmCommon.h"
class CommonAsmRoutines : public CommonAsmRoutinesBase, public EmuCodeBlock
{
protected:
void GenQuantizedLoads();
void GenQuantizedStores();
void GenQuantizedSingleStores();
public:
void GenFifoWrite(int size);
void GenFrsqrte();
void GenFres();
void GenMfcr();
};
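
A hedged sketch of how a backend might consume this class: the actual x86-64 routine manager is not part of this diff, so the subclass and Generate() hook below are assumptions for illustration only.

class AsmRoutineManagerSketch : public CommonAsmRoutines
{
public:
    void Generate()
    {
        GenQuantizedLoads();         // fills pairedLoadQuantized
        GenQuantizedStores();        // fills pairedStoreQuantized
        GenQuantizedSingleStores();  // fills singleStoreQuantized
    }
};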


@@ -2,194 +2,7 @@
// Licensed under GPLv2
// Refer to the license.txt file included.
#include "Common/CPUDetect.h"
#include "Common/MathUtil.h"
#include "Common/MemoryUtil.h"
#include "Core/PowerPC/JitCommon/JitAsmCommon.h"
#include "Core/PowerPC/JitCommon/JitBase.h"
#define QUANTIZED_REGS_TO_SAVE \
(ABI_ALL_CALLER_SAVED & ~BitSet32 { \
RSCRATCH, RSCRATCH2, RSCRATCH_EXTRA, XMM0+16, XMM1+16 \
})
#define QUANTIZED_REGS_TO_SAVE_LOAD (QUANTIZED_REGS_TO_SAVE | BitSet32 { RSCRATCH2 })
using namespace Gen;
void CommonAsmRoutines::GenFifoWrite(int size)
{
// Assume value in RSCRATCH
u32 gather_pipe = (u32)(u64)GPFifo::m_gatherPipe;
_assert_msg_(DYNA_REC, gather_pipe <= 0x7FFFFFFF, "Gather pipe not in low 2GB of memory!");
MOV(32, R(RSCRATCH2), M(&GPFifo::m_gatherPipeCount));
SwapAndStore(size, MDisp(RSCRATCH2, gather_pipe), RSCRATCH);
ADD(32, R(RSCRATCH2), Imm8(size >> 3));
MOV(32, M(&GPFifo::m_gatherPipeCount), R(RSCRATCH2));
RET();
}
void CommonAsmRoutines::GenFrsqrte()
{
// Assume input in XMM0.
// This function clobbers all three RSCRATCH.
MOVQ_xmm(R(RSCRATCH), XMM0);
// Negative and zero inputs set an exception and take the complex path.
TEST(64, R(RSCRATCH), R(RSCRATCH));
FixupBranch zero = J_CC(CC_Z, true);
FixupBranch negative = J_CC(CC_S, true);
MOV(64, R(RSCRATCH_EXTRA), R(RSCRATCH));
SHR(64, R(RSCRATCH_EXTRA), Imm8(52));
// Zero and max exponents (non-normal floats) take the complex path.
FixupBranch complex1 = J_CC(CC_Z, true);
CMP(32, R(RSCRATCH_EXTRA), Imm32(0x7FF));
FixupBranch complex2 = J_CC(CC_E, true);
SUB(32, R(RSCRATCH_EXTRA), Imm32(0x3FD));
SAR(32, R(RSCRATCH_EXTRA), Imm8(1));
MOV(32, R(RSCRATCH2), Imm32(0x3FF));
SUB(32, R(RSCRATCH2), R(RSCRATCH_EXTRA));
SHL(64, R(RSCRATCH2), Imm8(52)); // exponent = ((0x3FFLL << 52) - ((exponent - (0x3FELL << 52)) / 2)) & (0x7FFLL << 52);
MOV(64, R(RSCRATCH_EXTRA), R(RSCRATCH));
SHR(64, R(RSCRATCH_EXTRA), Imm8(48));
AND(32, R(RSCRATCH_EXTRA), Imm8(0x1F));
XOR(32, R(RSCRATCH_EXTRA), Imm8(0x10)); // int index = i / 2048 + (odd_exponent ? 16 : 0);
SHR(64, R(RSCRATCH), Imm8(37));
AND(32, R(RSCRATCH), Imm32(0x7FF));
IMUL(32, RSCRATCH, MScaled(RSCRATCH_EXTRA, SCALE_4, (u32)(u64)MathUtil::frsqrte_expected_dec));
MOV(32, R(RSCRATCH_EXTRA), MScaled(RSCRATCH_EXTRA, SCALE_4, (u32)(u64)MathUtil::frsqrte_expected_base));
SUB(32, R(RSCRATCH_EXTRA), R(RSCRATCH));
SHL(64, R(RSCRATCH_EXTRA), Imm8(26));
OR(64, R(RSCRATCH2), R(RSCRATCH_EXTRA)); // vali |= (s64)(frsqrte_expected_base[index] - frsqrte_expected_dec[index] * (i % 2048)) << 26;
MOVQ_xmm(XMM0, R(RSCRATCH2));
RET();
// Exception flags for zero input.
SetJumpTarget(zero);
TEST(32, PPCSTATE(fpscr), Imm32(FPSCR_ZX));
FixupBranch skip_set_fx1 = J_CC(CC_NZ);
OR(32, PPCSTATE(fpscr), Imm32(FPSCR_FX | FPSCR_ZX));
FixupBranch complex3 = J();
// Exception flags for negative input.
SetJumpTarget(negative);
TEST(32, PPCSTATE(fpscr), Imm32(FPSCR_VXSQRT));
FixupBranch skip_set_fx2 = J_CC(CC_NZ);
OR(32, PPCSTATE(fpscr), Imm32(FPSCR_FX | FPSCR_VXSQRT));
SetJumpTarget(skip_set_fx1);
SetJumpTarget(skip_set_fx2);
SetJumpTarget(complex1);
SetJumpTarget(complex2);
SetJumpTarget(complex3);
ABI_PushRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, 8);
ABI_CallFunction((void *)&MathUtil::ApproximateReciprocalSquareRoot);
ABI_PopRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, 8);
RET();
}
void CommonAsmRoutines::GenFres()
{
// Assume input in XMM0.
// This function clobbers all three RSCRATCH.
MOVQ_xmm(R(RSCRATCH), XMM0);
// Zero inputs set an exception and take the complex path.
TEST(64, R(RSCRATCH), R(RSCRATCH));
FixupBranch zero = J_CC(CC_Z);
MOV(64, R(RSCRATCH_EXTRA), R(RSCRATCH));
SHR(64, R(RSCRATCH_EXTRA), Imm8(52));
MOV(32, R(RSCRATCH2), R(RSCRATCH_EXTRA));
AND(32, R(RSCRATCH_EXTRA), Imm32(0x7FF)); // exp
AND(32, R(RSCRATCH2), Imm32(0x800)); // sign
CMP(32, R(RSCRATCH_EXTRA), Imm32(895));
// Take the complex path for very large/small exponents.
FixupBranch complex1 = J_CC(CC_L);
CMP(32, R(RSCRATCH_EXTRA), Imm32(1149));
FixupBranch complex2 = J_CC(CC_GE);
SUB(32, R(RSCRATCH_EXTRA), Imm32(0x7FD));
NEG(32, R(RSCRATCH_EXTRA));
OR(32, R(RSCRATCH_EXTRA), R(RSCRATCH2));
SHL(64, R(RSCRATCH_EXTRA), Imm8(52)); // vali = sign | exponent
MOV(64, R(RSCRATCH2), R(RSCRATCH));
SHR(64, R(RSCRATCH), Imm8(37));
SHR(64, R(RSCRATCH2), Imm8(47));
AND(32, R(RSCRATCH), Imm32(0x3FF)); // i % 1024
AND(32, R(RSCRATCH2), Imm8(0x1F)); // i / 1024
IMUL(32, RSCRATCH, MScaled(RSCRATCH2, SCALE_4, (u32)(u64)MathUtil::fres_expected_dec));
ADD(32, R(RSCRATCH), Imm8(1));
SHR(32, R(RSCRATCH), Imm8(1));
MOV(32, R(RSCRATCH2), MScaled(RSCRATCH2, SCALE_4, (u32)(u64)MathUtil::fres_expected_base));
SUB(32, R(RSCRATCH2), R(RSCRATCH));
SHL(64, R(RSCRATCH2), Imm8(29));
OR(64, R(RSCRATCH2), R(RSCRATCH_EXTRA)); // vali |= (s64)(fres_expected_base[i / 1024] - (fres_expected_dec[i / 1024] * (i % 1024) + 1) / 2) << 29
MOVQ_xmm(XMM0, R(RSCRATCH2));
RET();
// Exception flags for zero input.
SetJumpTarget(zero);
TEST(32, PPCSTATE(fpscr), Imm32(FPSCR_ZX));
FixupBranch skip_set_fx1 = J_CC(CC_NZ);
OR(32, PPCSTATE(fpscr), Imm32(FPSCR_FX | FPSCR_ZX));
SetJumpTarget(skip_set_fx1);
SetJumpTarget(complex1);
SetJumpTarget(complex2);
ABI_PushRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, 8);
ABI_CallFunction((void *)&MathUtil::ApproximateReciprocal);
ABI_PopRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, 8);
RET();
}
void CommonAsmRoutines::GenMfcr()
{
// Input: none
// Output: RSCRATCH
// This function clobbers all three RSCRATCH.
X64Reg dst = RSCRATCH;
X64Reg tmp = RSCRATCH2;
X64Reg cr_val = RSCRATCH_EXTRA;
XOR(32, R(dst), R(dst));
// we only need to zero the high bits of tmp once
XOR(32, R(tmp), R(tmp));
for (int i = 0; i < 8; i++)
{
static const u32 m_flagTable[8] = { 0x0, 0x1, 0x8, 0x9, 0x0, 0x1, 0x8, 0x9 };
if (i != 0)
SHL(32, R(dst), Imm8(4));
MOV(64, R(cr_val), PPCSTATE(cr_val[i]));
// EQ: Bits 31-0 == 0; set flag bit 1
TEST(32, R(cr_val), R(cr_val));
// FIXME: is there a better way to do this without the partial register merging?
SETcc(CC_Z, R(tmp));
LEA(32, dst, MComplex(dst, tmp, SCALE_2, 0));
// GT: Value > 0; set flag bit 2
TEST(64, R(cr_val), R(cr_val));
SETcc(CC_G, R(tmp));
LEA(32, dst, MComplex(dst, tmp, SCALE_4, 0));
// SO: Bit 61 set; set flag bit 0
// LT: Bit 62 set; set flag bit 3
SHR(64, R(cr_val), Imm8(61));
OR(32, R(dst), MScaled(cr_val, SCALE_4, (u32)(u64)m_flagTable));
}
RET();
}
// Safe + Fast Quantizers, originally from JITIL by magumagu
const u8 GC_ALIGNED16(pbswapShuffle1x4[16]) = { 3, 2, 1, 0, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
const u8 GC_ALIGNED16(pbswapShuffle2x4[16]) = { 3, 2, 1, 0, 7, 6, 5, 4, 8, 9, 10, 11, 12, 13, 14, 15 };
@@ -250,414 +63,4 @@ const float GC_ALIGNED16(m_dequantizeTableS[]) =
(1ULL << 4), (1ULL << 4), (1ULL << 3), (1ULL << 3), (1ULL << 2), (1ULL << 2), (1ULL << 1), (1ULL << 1),
};
static const float GC_ALIGNED16(m_65535[4]) = {65535.0f, 65535.0f, 65535.0f, 65535.0f};
static const float GC_ALIGNED16(m_32767) = 32767.0f;
static const float GC_ALIGNED16(m_m32768) = -32768.0f;
static const float GC_ALIGNED16(m_255) = 255.0f;
static const float GC_ALIGNED16(m_127) = 127.0f;
static const float GC_ALIGNED16(m_m128) = -128.0f;
const float GC_ALIGNED16(m_one[]) = { 1.0f, 0.0f, 0.0f, 0.0f };
#define QUANTIZE_OVERFLOW_SAFE
// according to Intel Docs CVTPS2DQ writes 0x80000000 if the source floating point value is out of int32 range
// while it's OK for large negatives, it isn't for positives
// I don't know whether the overflow actually happens in any games
// but it potentially can cause problems, so we need some clamping
// See comment in header for in/outs.
void CommonAsmRoutines::GenQuantizedStores()
{
const u8* storePairedIllegal = AlignCode4();
UD2();
const u8* storePairedFloat = AlignCode4();
if (cpu_info.bSSSE3)
{
PSHUFB(XMM0, M((void *)pbswapShuffle2x4));
MOVQ_xmm(R(RSCRATCH), XMM0);
}
else
{
MOVQ_xmm(R(RSCRATCH), XMM0);
ROL(64, R(RSCRATCH), Imm8(32));
BSWAP(64, RSCRATCH);
}
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 64, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
RET();
const u8* storePairedU8 = AlignCode4();
SHR(32, R(RSCRATCH2), Imm8(5));
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
MULPS(XMM0, R(XMM1));
#ifdef QUANTIZE_OVERFLOW_SAFE
MINPS(XMM0, M(m_65535));
#endif
CVTTPS2DQ(XMM0, R(XMM0));
PACKSSDW(XMM0, R(XMM0));
PACKUSWB(XMM0, R(XMM0));
MOVD_xmm(R(RSCRATCH), XMM0);
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 16, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
RET();
const u8* storePairedS8 = AlignCode4();
SHR(32, R(RSCRATCH2), Imm8(5));
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
MULPS(XMM0, R(XMM1));
#ifdef QUANTIZE_OVERFLOW_SAFE
MINPS(XMM0, M(m_65535));
#endif
CVTTPS2DQ(XMM0, R(XMM0));
PACKSSDW(XMM0, R(XMM0));
PACKSSWB(XMM0, R(XMM0));
MOVD_xmm(R(RSCRATCH), XMM0);
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 16, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
RET();
const u8* storePairedU16 = AlignCode4();
SHR(32, R(RSCRATCH2), Imm8(5));
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
MULPS(XMM0, R(XMM1));
if (cpu_info.bSSE4_1)
{
#ifdef QUANTIZE_OVERFLOW_SAFE
MINPS(XMM0, M(m_65535));
#endif
CVTTPS2DQ(XMM0, R(XMM0));
PACKUSDW(XMM0, R(XMM0));
MOVD_xmm(R(RSCRATCH), XMM0);
BSWAP(32, RSCRATCH);
ROL(32, R(RSCRATCH), Imm8(16));
}
else
{
XORPS(XMM1, R(XMM1));
MAXPS(XMM0, R(XMM1));
MINPS(XMM0, M(m_65535));
CVTTPS2DQ(XMM0, R(XMM0));
PSHUFLW(XMM0, R(XMM0), 2); // AABBCCDD -> CCAA____
MOVD_xmm(R(RSCRATCH), XMM0);
BSWAP(32, RSCRATCH);
}
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 32, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
RET();
const u8* storePairedS16 = AlignCode4();
SHR(32, R(RSCRATCH2), Imm8(5));
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
MULPS(XMM0, R(XMM1));
#ifdef QUANTIZE_OVERFLOW_SAFE
MINPS(XMM0, M(m_65535));
#endif
CVTTPS2DQ(XMM0, R(XMM0));
PACKSSDW(XMM0, R(XMM0));
MOVD_xmm(R(RSCRATCH), XMM0);
BSWAP(32, RSCRATCH);
ROL(32, R(RSCRATCH), Imm8(16));
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 32, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
RET();
pairedStoreQuantized = reinterpret_cast<const u8**>(const_cast<u8*>(AlignCode16()));
ReserveCodeSpace(8 * sizeof(u8*));
pairedStoreQuantized[0] = storePairedFloat;
pairedStoreQuantized[1] = storePairedIllegal;
pairedStoreQuantized[2] = storePairedIllegal;
pairedStoreQuantized[3] = storePairedIllegal;
pairedStoreQuantized[4] = storePairedU8;
pairedStoreQuantized[5] = storePairedU16;
pairedStoreQuantized[6] = storePairedS8;
pairedStoreQuantized[7] = storePairedS16;
}
// See comment in header for in/outs.
void CommonAsmRoutines::GenQuantizedSingleStores()
{
const u8* storeSingleIllegal = AlignCode4();
UD2();
// Easy!
const u8* storeSingleFloat = AlignCode4();
MOVD_xmm(R(RSCRATCH), XMM0);
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 32, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
RET();
const u8* storeSingleU8 = AlignCode4(); // Used by MKWii
SHR(32, R(RSCRATCH2), Imm8(5));
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
XORPS(XMM1, R(XMM1));
MAXSS(XMM0, R(XMM1));
MINSS(XMM0, M(&m_255));
CVTTSS2SI(RSCRATCH, R(XMM0));
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 8, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
RET();
const u8* storeSingleS8 = AlignCode4();
SHR(32, R(RSCRATCH2), Imm8(5));
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
MAXSS(XMM0, M(&m_m128));
MINSS(XMM0, M(&m_127));
CVTTSS2SI(RSCRATCH, R(XMM0));
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 8, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
RET();
const u8* storeSingleU16 = AlignCode4(); // Used by MKWii
SHR(32, R(RSCRATCH2), Imm8(5));
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
XORPS(XMM1, R(XMM1));
MAXSS(XMM0, R(XMM1));
MINSS(XMM0, M(m_65535));
CVTTSS2SI(RSCRATCH, R(XMM0));
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 16, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
RET();
const u8* storeSingleS16 = AlignCode4();
SHR(32, R(RSCRATCH2), Imm8(5));
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
MAXSS(XMM0, M(&m_m32768));
MINSS(XMM0, M(&m_32767));
CVTTSS2SI(RSCRATCH, R(XMM0));
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 16, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
RET();
singleStoreQuantized = reinterpret_cast<const u8**>(const_cast<u8*>(AlignCode16()));
ReserveCodeSpace(8 * sizeof(u8*));
singleStoreQuantized[0] = storeSingleFloat;
singleStoreQuantized[1] = storeSingleIllegal;
singleStoreQuantized[2] = storeSingleIllegal;
singleStoreQuantized[3] = storeSingleIllegal;
singleStoreQuantized[4] = storeSingleU8;
singleStoreQuantized[5] = storeSingleU16;
singleStoreQuantized[6] = storeSingleS8;
singleStoreQuantized[7] = storeSingleS16;
}
void CommonAsmRoutines::GenQuantizedLoads()
{
const u8* loadPairedIllegal = AlignCode4();
UD2();
// FIXME? This code (in non-MMU mode) assumes all accesses are directly to RAM, i.e.
// don't need hardware access handling. This will definitely crash if paired loads occur
// from non-RAM areas, but as far as I know, this never happens. I don't know if this is
// for a good reason, or merely because no game does this.
// If we find something that actually does do this, maybe this should be changed. How
// much of a performance hit would it be?
const u8* loadPairedFloatTwo = AlignCode4();
if (jit->js.memcheck)
{
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 64, 0, QUANTIZED_REGS_TO_SAVE, false, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
ROL(64, R(RSCRATCH_EXTRA), Imm8(32));
MOVQ_xmm(XMM0, R(RSCRATCH_EXTRA));
}
else if (cpu_info.bSSSE3)
{
MOVQ_xmm(XMM0, MComplex(RMEM, RSCRATCH_EXTRA, 1, 0));
PSHUFB(XMM0, M(pbswapShuffle2x4));
}
else
{
LoadAndSwap(64, RSCRATCH_EXTRA, MComplex(RMEM, RSCRATCH_EXTRA, 1, 0));
ROL(64, R(RSCRATCH_EXTRA), Imm8(32));
MOVQ_xmm(XMM0, R(RSCRATCH_EXTRA));
}
RET();
const u8* loadPairedFloatOne = AlignCode4();
if (jit->js.memcheck)
{
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 32, 0, QUANTIZED_REGS_TO_SAVE, false, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
UNPCKLPS(XMM0, M(m_one));
}
else if (cpu_info.bSSSE3)
{
MOVD_xmm(XMM0, MComplex(RMEM, RSCRATCH_EXTRA, 1, 0));
PSHUFB(XMM0, M(pbswapShuffle1x4));
UNPCKLPS(XMM0, M(m_one));
}
else
{
LoadAndSwap(32, RSCRATCH_EXTRA, MComplex(RMEM, RSCRATCH_EXTRA, 1, 0));
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
UNPCKLPS(XMM0, M(m_one));
}
RET();
const u8* loadPairedU8Two = AlignCode4();
if (jit->js.memcheck)
{
// TODO: Support not swapping in safeLoadToReg to avoid bswapping twice
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 16, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
ROR(16, R(RSCRATCH_EXTRA), Imm8(8));
}
else
{
UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0);
}
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
if (cpu_info.bSSE4_1)
{
PMOVZXBD(XMM0, R(XMM0));
}
else
{
PXOR(XMM1, R(XMM1));
PUNPCKLBW(XMM0, R(XMM1));
PUNPCKLWD(XMM0, R(XMM1));
}
CVTDQ2PS(XMM0, R(XMM0));
SHR(32, R(RSCRATCH2), Imm8(5));
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
MULPS(XMM0, R(XMM1));
RET();
const u8* loadPairedU8One = AlignCode4();
if (jit->js.memcheck)
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 8, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
else
UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 8, 0); // RSCRATCH_EXTRA = 0x000000xx
CVTSI2SS(XMM0, R(RSCRATCH_EXTRA));
SHR(32, R(RSCRATCH2), Imm8(5));
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
UNPCKLPS(XMM0, M(m_one));
RET();
const u8* loadPairedS8Two = AlignCode4();
if (jit->js.memcheck)
{
// TODO: Support not swapping in safeLoadToReg to avoid bswapping twice
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 16, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
ROR(16, R(RSCRATCH_EXTRA), Imm8(8));
}
else
{
UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0);
}
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
if (cpu_info.bSSE4_1)
{
PMOVSXBD(XMM0, R(XMM0));
}
else
{
PUNPCKLBW(XMM0, R(XMM0));
PUNPCKLWD(XMM0, R(XMM0));
PSRAD(XMM0, 24);
}
CVTDQ2PS(XMM0, R(XMM0));
SHR(32, R(RSCRATCH2), Imm8(5));
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
MULPS(XMM0, R(XMM1));
RET();
const u8* loadPairedS8One = AlignCode4();
if (jit->js.memcheck)
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 8, 0, QUANTIZED_REGS_TO_SAVE_LOAD, true, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
else
UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 8, 0, true);
CVTSI2SS(XMM0, R(RSCRATCH_EXTRA));
SHR(32, R(RSCRATCH2), Imm8(5));
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
UNPCKLPS(XMM0, M(m_one));
RET();
const u8* loadPairedU16Two = AlignCode4();
// TODO: Support not swapping in (un)safeLoadToReg to avoid bswapping twice
if (jit->js.memcheck)
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 32, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
else
UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 32, 0, false);
ROL(32, R(RSCRATCH_EXTRA), Imm8(16));
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
if (cpu_info.bSSE4_1)
{
PMOVZXWD(XMM0, R(XMM0));
}
else
{
PXOR(XMM1, R(XMM1));
PUNPCKLWD(XMM0, R(XMM1));
}
CVTDQ2PS(XMM0, R(XMM0));
SHR(32, R(RSCRATCH2), Imm8(5));
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
MULPS(XMM0, R(XMM1));
RET();
const u8* loadPairedU16One = AlignCode4();
if (jit->js.memcheck)
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 16, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
else
UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0, false);
CVTSI2SS(XMM0, R(RSCRATCH_EXTRA));
SHR(32, R(RSCRATCH2), Imm8(5));
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
UNPCKLPS(XMM0, M(m_one));
RET();
const u8* loadPairedS16Two = AlignCode4();
if (jit->js.memcheck)
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 32, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
else
UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 32, 0, false);
ROL(32, R(RSCRATCH_EXTRA), Imm8(16));
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
if (cpu_info.bSSE4_1)
{
PMOVSXWD(XMM0, R(XMM0));
}
else
{
PUNPCKLWD(XMM0, R(XMM0));
PSRAD(XMM0, 16);
}
CVTDQ2PS(XMM0, R(XMM0));
SHR(32, R(RSCRATCH2), Imm8(5));
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
MULPS(XMM0, R(XMM1));
RET();
const u8* loadPairedS16One = AlignCode4();
if (jit->js.memcheck)
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 16, 0, QUANTIZED_REGS_TO_SAVE_LOAD, true, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
else
UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0, true);
CVTSI2SS(XMM0, R(RSCRATCH_EXTRA));
SHR(32, R(RSCRATCH2), Imm8(5));
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
UNPCKLPS(XMM0, M(m_one));
RET();
pairedLoadQuantized = reinterpret_cast<const u8**>(const_cast<u8*>(AlignCode16()));
ReserveCodeSpace(16 * sizeof(u8*));
pairedLoadQuantized[0] = loadPairedFloatTwo;
pairedLoadQuantized[1] = loadPairedIllegal;
pairedLoadQuantized[2] = loadPairedIllegal;
pairedLoadQuantized[3] = loadPairedIllegal;
pairedLoadQuantized[4] = loadPairedU8Two;
pairedLoadQuantized[5] = loadPairedU16Two;
pairedLoadQuantized[6] = loadPairedS8Two;
pairedLoadQuantized[7] = loadPairedS16Two;
pairedLoadQuantized[8] = loadPairedFloatOne;
pairedLoadQuantized[9] = loadPairedIllegal;
pairedLoadQuantized[10] = loadPairedIllegal;
pairedLoadQuantized[11] = loadPairedIllegal;
pairedLoadQuantized[12] = loadPairedU8One;
pairedLoadQuantized[13] = loadPairedU16One;
pairedLoadQuantized[14] = loadPairedS8One;
pairedLoadQuantized[15] = loadPairedS16One;
}


@@ -4,7 +4,7 @@
#pragma once
#include "Core/PowerPC/JitCommon/Jit_Util.h"
#include "Common/CommonTypes.h"
extern const u8 GC_ALIGNED16(pbswapShuffle1x4[16]);
extern const u8 GC_ALIGNED16(pbswapShuffle2x4[16]);
@@ -15,7 +15,6 @@ extern const float GC_ALIGNED16(m_dequantizeTableS[]);
class CommonAsmRoutinesBase
{
public:
const u8 *fifoDirectWrite8;
const u8 *fifoDirectWrite16;
const u8 *fifoDirectWrite32;
@@ -51,19 +50,5 @@ public:
// In: ECX: Address to write to.
// In: XMM0: Bottom 32-bit slot holds the float to be written.
const u8 **singleStoreQuantized;
};
class CommonAsmRoutines : public CommonAsmRoutinesBase, public EmuCodeBlock
{
protected:
void GenQuantizedLoads();
void GenQuantizedStores();
void GenQuantizedSingleStores();
public:
void GenFifoWrite(int size);
void GenFrsqrte();
void GenFres();
void GenMfcr();
};


@@ -23,8 +23,8 @@
#include "Core/PowerPC/PowerPC.h"
#include "Core/PowerPC/PPCAnalyst.h"
#include "Core/PowerPC/PPCTables.h"
#include "Core/PowerPC/Jit64Common/Jit64AsmCommon.h"
#include "Core/PowerPC/JitCommon/Jit_Util.h"
#include "Core/PowerPC/JitCommon/JitAsmCommon.h"
#include "Core/PowerPC/JitCommon/JitCache.h"
#include "Core/PowerPC/JitCommon/TrampolineCache.h"