diff --git a/Source/Core/Common/x64Emitter.cpp b/Source/Core/Common/x64Emitter.cpp index a4743e2b73..c361d9e625 100644 --- a/Source/Core/Common/x64Emitter.cpp +++ b/Source/Core/Common/x64Emitter.cpp @@ -203,7 +203,7 @@ void OpArg::WriteRest(XEmitter *emit, int extraBytes, X64Reg _operandReg, { // Oh, RIP addressing. _offsetOrBaseReg = 5; - emit->WriteModRM(0, _operandReg&7, 5); + emit->WriteModRM(0, _operandReg, _offsetOrBaseReg); //TODO : add some checks #ifdef _M_X64 u64 ripAddr = (u64)emit->GetCodePtr() + 4 + extraBytes; @@ -327,7 +327,6 @@ void OpArg::WriteRest(XEmitter *emit, int extraBytes, X64Reg _operandReg, } } - // W = operand extended width (1 if 64-bit) // R = register# upper bit // X = scale amnt upper bit @@ -1390,6 +1389,10 @@ void XEmitter::PSRLQ(X64Reg reg, int shift) { Write8(shift); } +void XEmitter::PSRLQ(X64Reg reg, OpArg arg) { + WriteSSEOp(64, 0xd3, true, reg, arg); +} + void XEmitter::PSLLW(X64Reg reg, int shift) { WriteSSEOp(64, 0x71, true, (X64Reg)6, R(reg)); Write8(shift); @@ -1437,7 +1440,19 @@ void XEmitter::PSHUFB(X64Reg dest, OpArg arg) { Write8(0x0f); Write8(0x38); Write8(0x00); - arg.WriteRest(this, 0); + arg.WriteRest(this); +} + +void XEmitter::PTEST(X64Reg dest, OpArg arg) { + if (!cpu_info.bSSE4_1) { + PanicAlert("Trying to use PTEST on a system that doesn't support it. 
Nobody hears your screams."); + } + Write8(0x66); + Write8(0x0f); + Write8(0x38); + Write8(0x17); + arg.operandReg = dest; + arg.WriteRest(this); } void XEmitter::PAND(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xDB, true, dest, arg);} @@ -1458,7 +1473,7 @@ void XEmitter::PADDUSW(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xDD, true, dest void XEmitter::PSUBB(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xF8, true, dest, arg);} void XEmitter::PSUBW(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xF9, true, dest, arg);} void XEmitter::PSUBD(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xFA, true, dest, arg);} -void XEmitter::PSUBQ(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xDB, true, dest, arg);} +void XEmitter::PSUBQ(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xFB, true, dest, arg);} void XEmitter::PSUBSB(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xE8, true, dest, arg);} void XEmitter::PSUBSW(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xE9, true, dest, arg);} @@ -1497,6 +1512,8 @@ void XEmitter::VSUBSD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(64, void XEmitter::VMULSD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(64, sseMUL, false, regOp1, regOp2, arg);} void XEmitter::VDIVSD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(64, sseDIV, false, regOp1, regOp2, arg);} void XEmitter::VSQRTSD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(64, sseSQRT, false, regOp1, regOp2, arg);} +void XEmitter::VPAND(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(64, sseAND, false, regOp1, regOp2, arg);} +void XEmitter::VPANDN(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(64, sseANDN, false, regOp1, regOp2, arg);} // Prefixes @@ -1509,6 +1526,25 @@ void XEmitter::FWAIT() Write8(0x9B); } +// TODO: make this more generic +void XEmitter::WriteFloatLoadStore(int bits, FloatOp op, OpArg arg) +{ + int mf = 0; + switch (bits) { + case 32: mf = 0; break; + case 64: mf = 2; break; + default: _assert_msg_(DYNA_REC, 0, "WriteFloatLoadStore: bits is not 32 or 64"); + } + Write8(0xd9 | 
(mf << 1)); + // x87 instructions use the reg field of the ModR/M byte as opcode: + arg.WriteRest(this, 0, (X64Reg) op); +} + +void XEmitter::FLD(int bits, OpArg src) {WriteFloatLoadStore(bits, floatLD, src);} +void XEmitter::FST(int bits, OpArg dest) {WriteFloatLoadStore(bits, floatST, dest);} +void XEmitter::FSTP(int bits, OpArg dest) {WriteFloatLoadStore(bits, floatSTP, dest);} +void XEmitter::FNSTSW_AX() { Write8(0xDF); Write8(0xE0); } + void XEmitter::RTDSC() { Write8(0x0F); Write8(0x31); } // helper routines for setting pointers diff --git a/Source/Core/Common/x64Emitter.h b/Source/Core/Common/x64Emitter.h index 43d28eb9ba..368939ed05 100644 --- a/Source/Core/Common/x64Emitter.h +++ b/Source/Core/Common/x64Emitter.h @@ -100,6 +100,12 @@ enum NormalOp { nrmXCHG, }; +enum FloatOp { + floatLD = 0, + floatST = 2, + floatSTP = 3, +}; + class XEmitter; // RIP addressing does not benefit from micro op fusion on Core arch @@ -118,6 +124,7 @@ struct OpArg void WriteRex(XEmitter *emit, int opBits, int bits, int customOp = -1) const; void WriteVex(XEmitter* emit, int size, int packed, Gen::X64Reg regOp1, X64Reg regOp2) const; void WriteRest(XEmitter *emit, int extraBytes=0, X64Reg operandReg=(X64Reg)0xFF, bool warn_64bit_offset = true) const; + void WriteFloatModRM(XEmitter *emit, FloatOp op); void WriteSingleByteOp(XEmitter *emit, u8 op, X64Reg operandReg, int bits); // This one is public - must be written to u64 offset; // use RIP-relative as much as possible - 64-bit immediates are not available. 
@@ -247,6 +254,7 @@ private: void WriteSSEOp(int size, u8 sseOp, bool packed, X64Reg regOp, OpArg arg, int extrabytes = 0); void WriteAVXOp(int size, u8 sseOp, bool packed, X64Reg regOp, OpArg arg, int extrabytes = 0); void WriteAVXOp(int size, u8 sseOp, bool packed, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes = 0); + void WriteFloatLoadStore(int bits, FloatOp op, OpArg arg); void WriteNormalOp(XEmitter *emit, int bits, NormalOp op, const OpArg &a1, const OpArg &a2); protected: @@ -427,6 +435,28 @@ public: void REP(); void REPNE(); + // x87 + enum x87StatusWordBits { + x87_InvalidOperation = 0x1, + x87_DenormalizedOperand = 0x2, + x87_DivisionByZero = 0x4, + x87_Overflow = 0x8, + x87_Underflow = 0x10, + x87_Precision = 0x20, + x87_StackFault = 0x40, + x87_ErrorSummary = 0x80, + x87_C0 = 0x100, + x87_C1 = 0x200, + x87_C2 = 0x400, + x87_TopOfStack = 0x2000 | 0x1000 | 0x800, + x87_C3 = 0x4000, + x87_FPUBusy = 0x8000, + }; + + void FLD(int bits, OpArg src); + void FST(int bits, OpArg dest); + void FSTP(int bits, OpArg dest); + void FNSTSW_AX(); void FWAIT(); // SSE/SSE2: Floating point arithmetic @@ -553,6 +583,7 @@ public: void PUNPCKLWD(X64Reg dest, const OpArg &arg); void PUNPCKLDQ(X64Reg dest, const OpArg &arg); + void PTEST(X64Reg dest, OpArg arg); void PAND(X64Reg dest, OpArg arg); void PANDN(X64Reg dest, OpArg arg); void PXOR(X64Reg dest, OpArg arg); @@ -608,6 +639,7 @@ public: void PSRLW(X64Reg reg, int shift); void PSRLD(X64Reg reg, int shift); void PSRLQ(X64Reg reg, int shift); + void PSRLQ(X64Reg reg, OpArg arg); void PSLLW(X64Reg reg, int shift); void PSLLD(X64Reg reg, int shift); @@ -622,6 +654,8 @@ public: void VMULSD(X64Reg regOp1, X64Reg regOp2, OpArg arg); void VDIVSD(X64Reg regOp1, X64Reg regOp2, OpArg arg); void VSQRTSD(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VPAND(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VPANDN(X64Reg regOp1, X64Reg regOp2, OpArg arg); void RTDSC(); diff --git a/Source/Core/Common/x64FPURoundMode.cpp 
b/Source/Core/Common/x64FPURoundMode.cpp index f3da4a0233..e7dd9db0f7 100644 --- a/Source/Core/Common/x64FPURoundMode.cpp +++ b/Source/Core/Common/x64FPURoundMode.cpp @@ -15,11 +15,11 @@ static const unsigned short FPU_ROUND_MASK = 3 << 10; #endif // OR-mask for disabling FPU exceptions (bits 7-12 in the MXCSR register) -const u32 EXCEPTION_MASK = 0x1F80; +static const u32 EXCEPTION_MASK = 0x1F80; // Denormals-Are-Zero (non-IEEE mode: denormal inputs are set to +/- 0) -const u32 DAZ = 0x40; +static const u32 DAZ = 0x40; // Flush-To-Zero (non-IEEE mode: denormal outputs are set to +/- 0) -const u32 FTZ = 0x8000; +static const u32 FTZ = 0x8000; namespace FPURoundMode { @@ -100,8 +100,7 @@ namespace FPURoundMode FTZ, // flush-to-zero only FTZ | DAZ, // flush-to-zero and denormals-are-zero (may not be supported) }; - // FIXME: proper (?) non-IEEE mode emulation causes issues in lots of games - if (nonIEEEMode && false) + if (nonIEEEMode) { csr |= denormalLUT[cpu_info.bFlushToZero]; } diff --git a/Source/Core/Core/PowerPC/Interpreter/Interpreter_FPUtils.h b/Source/Core/Core/PowerPC/Interpreter/Interpreter_FPUtils.h index 48e5cc8523..fa5e6ff2f4 100644 --- a/Source/Core/Core/PowerPC/Interpreter/Interpreter_FPUtils.h +++ b/Source/Core/Core/PowerPC/Interpreter/Interpreter_FPUtils.h @@ -231,3 +231,38 @@ inline u32 ConvertToSingleFTZ(u64 x) return (x >> 32) & 0x80000000; } } + +inline u64 ConvertToDouble(u32 _x) +{ + // This is a little-endian re-implementation of the algorithm described in + // the PowerPC Programming Environments Manual for loading single + // precision floating point numbers. 
+ // See page 566 of http://www.freescale.com/files/product/doc/MPCFPE32B.pdf + + u64 x = _x; + u64 exp = (x >> 23) & 0xff; + u64 frac = x & 0x007fffff; + + if (exp > 0 && exp < 255) // Normal number + { + u64 y = !(exp >> 7); + u64 z = y << 61 | y << 60 | y << 59; + return ((x & 0xc0000000) << 32) | z | ((x & 0x3fffffff) << 29); + } + else if (exp == 0 && frac != 0) // Subnormal number + { + exp = 1023 - 126; + do + { + frac <<= 1; + exp -= 1; + } while ((frac & 0x00800000) == 0); + return ((x & 0x80000000) << 32) | (exp << 52) | ((frac & 0x007fffff) << 29); + } + else // QNaN, SNaN or Zero + { + u64 y = exp >> 7; + u64 z = y << 61 | y << 60 | y << 59; + return ((x & 0xc0000000) << 32) | z | ((x & 0x3fffffff) << 29); + } +} diff --git a/Source/Core/Core/PowerPC/Interpreter/Interpreter_LoadStore.cpp b/Source/Core/Core/PowerPC/Interpreter/Interpreter_LoadStore.cpp index 46013597fc..fcaa010f35 100644 --- a/Source/Core/Core/PowerPC/Interpreter/Interpreter_LoadStore.cpp +++ b/Source/Core/Core/PowerPC/Interpreter/Interpreter_LoadStore.cpp @@ -92,9 +92,9 @@ void Interpreter::lfs(UGeckoInstruction _inst) u32 uTemp = Memory::Read_U32(Helper_Get_EA(_inst)); if (!(PowerPC::ppcState.Exceptions & EXCEPTION_DSI)) { - double value = *(float*)&uTemp; - rPS0(_inst.FD) = value; - rPS1(_inst.FD) = value; + u64 value = ConvertToDouble(uTemp); + riPS0(_inst.FD) = value; + riPS1(_inst.FD) = value; } } @@ -104,9 +104,9 @@ void Interpreter::lfsu(UGeckoInstruction _inst) u32 uTemp = Memory::Read_U32(uAddress); if (!(PowerPC::ppcState.Exceptions & EXCEPTION_DSI)) { - double value = *(float*)&uTemp; - rPS0(_inst.FD) = value; - rPS1(_inst.FD) = value; + u64 value = ConvertToDouble(uTemp); + riPS0(_inst.FD) = value; + riPS1(_inst.FD) = value; m_GPR[_inst.RA] = uAddress; } @@ -118,9 +118,9 @@ void Interpreter::lfsux(UGeckoInstruction _inst) u32 uTemp = Memory::Read_U32(uAddress); if (!(PowerPC::ppcState.Exceptions & EXCEPTION_DSI)) { - double value = *(float*)&uTemp; - rPS0(_inst.FD) = value; 
- rPS1(_inst.FD) = value; + u64 value = ConvertToDouble(uTemp); + riPS0(_inst.FD) = value; + riPS1(_inst.FD) = value; m_GPR[_inst.RA] = uAddress; } } @@ -130,9 +130,9 @@ void Interpreter::lfsx(UGeckoInstruction _inst) u32 uTemp = Memory::Read_U32(Helper_Get_EA_X(_inst)); if (!(PowerPC::ppcState.Exceptions & EXCEPTION_DSI)) { - double value = *(float*)&uTemp; - rPS0(_inst.FD) = value; - rPS1(_inst.FD) = value; + u64 value = ConvertToDouble(uTemp); + riPS0(_inst.FD) = value; + riPS1(_inst.FD) = value; } } @@ -281,9 +281,6 @@ void Interpreter::stfdu(UGeckoInstruction _inst) void Interpreter::stfs(UGeckoInstruction _inst) { - //double value = rPS0(_inst.FS); - //float fTemp = (float)value; - //Memory::Write_U32(*(u32*)&fTemp, Helper_Get_EA(_inst)); Memory::Write_U32(ConvertToSingle(riPS0(_inst.FS)), Helper_Get_EA(_inst)); } diff --git a/Source/Core/Core/PowerPC/Jit64/JitRegCache.cpp b/Source/Core/Core/PowerPC/Jit64/JitRegCache.cpp index 73017b5ebd..05417bfcb2 100644 --- a/Source/Core/Core/PowerPC/Jit64/JitRegCache.cpp +++ b/Source/Core/Core/PowerPC/Jit64/JitRegCache.cpp @@ -374,7 +374,7 @@ void RegCache::Flush(FlushMode mode) { if (locks[i]) { - PanicAlert("Someone forgot to unlock PPC reg %i.", i); + PanicAlert("Someone forgot to unlock PPC reg %i (X64 reg %i).", i, RX(i)); } if (regs[i].away) { diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp index 5429cc7075..dac3b9aad2 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp @@ -12,6 +12,8 @@ #include "Core/PowerPC/Jit64/JitAsm.h" #include "Core/PowerPC/Jit64/JitRegCache.h" +namespace { + // pshufb todo: MOVQ const u8 GC_ALIGNED16(bswapShuffle1x4[16]) = {3, 2, 1, 0, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; const u8 GC_ALIGNED16(bswapShuffle2x4[16]) = {3, 2, 1, 0, 7, 6, 5, 4, 8, 9, 10, 11, 12, 13, 14, 15}; @@ -19,11 +21,10 @@ const u8 
GC_ALIGNED16(bswapShuffle1x8[16]) = {7, 6, 5, 4, 3, 2, 1, 0, 8, 9, 10, const u8 GC_ALIGNED16(bswapShuffle1x8Dupe[16]) = {7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0}; const u8 GC_ALIGNED16(bswapShuffle2x8[16]) = {7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8}; -namespace { - u64 GC_ALIGNED16(temp64); -u32 GC_ALIGNED16(temp32); + } + // TODO: Add peephole optimizations for multiple consecutive lfd/lfs/stfd/stfs since they are so common, // and pshufb could help a lot. // Also add hacks for things like lfs/stfs the same reg consecutively, that is, simple memory moves. @@ -46,11 +47,9 @@ void Jit64::lfs(UGeckoInstruction inst) MEMCHECK_START - MOV(32, M(&temp32), R(EAX)); fpr.Lock(d); fpr.BindToRegister(d, false); - CVTSS2SD(fpr.RX(d), M(&temp32)); - MOVDDUP(fpr.RX(d), fpr.R(d)); + ConvertSingleToDouble(fpr.RX(d), EAX, true); MEMCHECK_END @@ -226,13 +225,15 @@ void Jit64::stfs(UGeckoInstruction inst) return; } + fpr.BindToRegister(s, true, false); + ConvertDoubleToSingle(XMM0, fpr.RX(s)); + if (gpr.R(a).IsImm()) { u32 addr = (u32)(gpr.R(a).offset + offset); if (Memory::IsRAMAddress(addr)) { if (cpu_info.bSSSE3) { - CVTSD2SS(XMM0, fpr.R(s)); PSHUFB(XMM0, M((void *)bswapShuffle1x4)); WriteFloatToConstRamAddress(XMM0, addr); return; @@ -241,7 +242,6 @@ void Jit64::stfs(UGeckoInstruction inst) else if (addr == 0xCC008000) { // Float directly to write gather pipe! Fun! 
- CVTSD2SS(XMM0, fpr.R(s)); CALL((void*)asm_routines.fifoDirectWriteFloat); // TODO js.fifoBytesThisBlock += 4; @@ -251,7 +251,6 @@ void Jit64::stfs(UGeckoInstruction inst) gpr.FlushLockX(ABI_PARAM1, ABI_PARAM2); gpr.Lock(a); - fpr.Lock(s); MOV(32, R(ABI_PARAM2), gpr.R(a)); ADD(32, R(ABI_PARAM2), Imm32(offset)); if (update && offset) @@ -266,7 +265,6 @@ void Jit64::stfs(UGeckoInstruction inst) MEMCHECK_END } - CVTSD2SS(XMM0, fpr.R(s)); SafeWriteFloatToReg(XMM0, ABI_PARAM2, RegistersInUse()); gpr.UnlockAll(); gpr.UnlockAllX(); @@ -281,11 +279,14 @@ void Jit64::stfsx(UGeckoInstruction inst) // We can take a shortcut here - it's not likely that a hardware access would use this instruction. gpr.FlushLockX(ABI_PARAM1); - fpr.Lock(inst.RS); MOV(32, R(ABI_PARAM1), gpr.R(inst.RB)); if (inst.RA) ADD(32, R(ABI_PARAM1), gpr.R(inst.RA)); - CVTSD2SS(XMM0, fpr.R(inst.RS)); + + int s = inst.RS; + fpr.Lock(s); + fpr.BindToRegister(s, true, false); + ConvertDoubleToSingle(XMM0, fpr.RX(s)); MOVD_xmm(R(EAX), XMM0); SafeWriteRegToReg(EAX, ABI_PARAM1, 32, 0, RegistersInUse()); @@ -304,21 +305,20 @@ void Jit64::lfsx(UGeckoInstruction inst) { ADD(32, R(EAX), gpr.R(inst.RA)); } + fpr.Lock(inst.RS); + fpr.BindToRegister(inst.RS, false); + X64Reg s = fpr.RX(inst.RS); if (cpu_info.bSSSE3 && !js.memcheck) { - fpr.Lock(inst.RS); - fpr.BindToRegister(inst.RS, false, true); - X64Reg r = fpr.R(inst.RS).GetSimpleReg(); #ifdef _M_IX86 AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK)); - MOVD_xmm(r, MDisp(EAX, (u32)Memory::base)); + MOVD_xmm(XMM0, MDisp(EAX, (u32)Memory::base)); #else - MOVD_xmm(r, MComplex(RBX, EAX, SCALE_1, 0)); + MOVD_xmm(XMM0, MComplex(RBX, EAX, SCALE_1, 0)); #endif MEMCHECK_START - PSHUFB(r, M((void *)bswapShuffle1x4)); - CVTSS2SD(r, R(r)); - MOVDDUP(r, R(r)); + PSHUFB(XMM0, M((void *)bswapShuffle1x4)); + ConvertSingleToDouble(s, XMM0); MEMCHECK_END } else { @@ -326,11 +326,7 @@ void Jit64::lfsx(UGeckoInstruction inst) MEMCHECK_START - MOV(32, M(&temp32), R(EAX)); - CVTSS2SD(XMM0, 
M(&temp32)); - fpr.Lock(inst.RS); - fpr.BindToRegister(inst.RS, false, true); - MOVDDUP(fpr.R(inst.RS).GetSimpleReg(), R(XMM0)); + ConvertSingleToDouble(s, EAX, true); MEMCHECK_END } diff --git a/Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp b/Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp index dca36196bf..8a0843eb67 100644 --- a/Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp +++ b/Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp @@ -1290,10 +1290,12 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, u32 exitAddress) { } case DupSingleToMReg: { if (!thisUsed) break; - X64Reg reg = fregURegWithoutMov(RI, I); - Jit->CVTSS2SD(reg, fregLocForInst(RI, getOp1(I))); - Jit->MOVDDUP(reg, R(reg)); - RI.fregs[reg] = I; + + X64Reg input = fregEnsureInReg(RI, getOp1(I)); + X64Reg output = fregURegWithoutMov(RI, I); + Jit->ConvertSingleToDouble(output, input); + + RI.fregs[output] = I; fregNormalRegClear(RI, I); break; } @@ -1414,9 +1416,12 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, u32 exitAddress) { } case DoubleToSingle: { if (!thisUsed) break; - X64Reg reg = fregURegWithoutMov(RI, I); - Jit->CVTSD2SS(reg, fregLocForInst(RI, getOp1(I))); - RI.fregs[reg] = I; + + X64Reg input = fregEnsureInReg(RI, getOp1(I)); + X64Reg output = fregURegWithoutMov(RI, I); + Jit->ConvertDoubleToSingle(output, input); + + RI.fregs[output] = I; fregNormalRegClear(RI, I); break; } diff --git a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp index ec03303a97..cc1faca78a 100644 --- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp +++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp @@ -416,6 +416,200 @@ void EmuCodeBlock::ForceSinglePrecisionP(X64Reg xmm) { } } +static u32 GC_ALIGNED16(temp32); +static u64 GC_ALIGNED16(temp64); +#ifdef _WIN32 +#include +#ifdef _M_X64 +static const __m128i GC_ALIGNED16(single_qnan_bit) = _mm_set_epi64x(0, 0x0000000000400000); +static const __m128i GC_ALIGNED16(single_exponent) = _mm_set_epi64x(0, 
0x000000007f800000); +static const __m128i GC_ALIGNED16(double_qnan_bit) = _mm_set_epi64x(0, 0x0008000000000000); +static const __m128i GC_ALIGNED16(double_exponent) = _mm_set_epi64x(0, 0x7ff0000000000000); +#else +static const __m128i GC_ALIGNED16(single_qnan_bit) = _mm_set_epi32(0, 0, 0x00000000, 0x00400000); +static const __m128i GC_ALIGNED16(single_exponent) = _mm_set_epi32(0, 0, 0x00000000, 0x7f800000); +static const __m128i GC_ALIGNED16(double_qnan_bit) = _mm_set_epi32(0, 0, 0x00080000, 0x00000000); +static const __m128i GC_ALIGNED16(double_exponent) = _mm_set_epi32(0, 0, 0x7ff00000, 0x00000000); +#endif +#else +static const __uint128_t GC_ALIGNED16(single_qnan_bit) = 0x0000000000400000; +static const __uint128_t GC_ALIGNED16(single_exponent) = 0x000000007f800000; +static const __uint128_t GC_ALIGNED16(double_qnan_bit) = 0x0008000000000000; +static const __uint128_t GC_ALIGNED16(double_exponent) = 0x7ff0000000000000; +#endif + +// Since the following float conversion functions are used in non-arithmetic PPC float instructions, +// they must convert floats bitexact and never flush denormals to zero or turn SNaNs into QNaNs. +// This means we can't use CVTSS2SD/CVTSD2SS :( +// The x87 FPU doesn't even support flush-to-zero so we can use FLD+FSTP even on denormals. +// If the number is a NaN, make sure to set the QNaN bit back to its original value. + +// Another problem is that officially, converting doubles to single format results in undefined behavior. +// Relying on undefined behavior is a bug so no software should ever do this. +// In case it does happen, phire's more accurate implementation of ConvertDoubleToSingle() is reproduced below. 
+ +//#define MORE_ACCURATE_DOUBLETOSINGLE +#ifdef MORE_ACCURATE_DOUBLETOSINGLE + +#ifdef _WIN32 +#ifdef _M_X64 +static const __m128i GC_ALIGNED16(double_fraction) = _mm_set_epi64x(0, 0x000fffffffffffff); +static const __m128i GC_ALIGNED16(double_sign_bit) = _mm_set_epi64x(0, 0x8000000000000000); +static const __m128i GC_ALIGNED16(double_explicit_top_bit) = _mm_set_epi64x(0, 0x0010000000000000); +static const __m128i GC_ALIGNED16(double_top_two_bits) = _mm_set_epi64x(0, 0xc000000000000000); +static const __m128i GC_ALIGNED16(double_bottom_bits) = _mm_set_epi64x(0, 0x07ffffffe0000000); +#else +static const __m128i GC_ALIGNED16(double_fraction) = _mm_set_epi32(0, 0, 0x000fffff, 0xffffffff); +static const __m128i GC_ALIGNED16(double_sign_bit) = _mm_set_epi32(0, 0, 0x80000000, 0x00000000); +static const __m128i GC_ALIGNED16(double_explicit_top_bit) = _mm_set_epi32(0, 0, 0x00100000, 0x00000000); +static const __m128i GC_ALIGNED16(double_top_two_bits) = _mm_set_epi32(0, 0, 0xc0000000, 0x00000000); +static const __m128i GC_ALIGNED16(double_bottom_bits) = _mm_set_epi32(0, 0, 0x07ffffff, 0xe0000000); +#endif +#else +static const __uint128_t GC_ALIGNED16(double_fraction) = 0x000fffffffffffff; +static const __uint128_t GC_ALIGNED16(double_sign_bit) = 0x8000000000000000; +static const __uint128_t GC_ALIGNED16(double_explicit_top_bit) = 0x0010000000000000; +static const __uint128_t GC_ALIGNED16(double_top_two_bits) = 0xc000000000000000; +static const __uint128_t GC_ALIGNED16(double_bottom_bits) = 0x07ffffffe0000000; +#endif + +// This is the same algorithm used in the interpreter (and actual hardware) +// The documentation states that the conversion of a double with an exponent outside the +// valid range for a single (or a single denormal) is undefined. +// But testing on actual hardware shows it always picks bits 0..1 and 5..34 +// unless the exponent is in the range of 874 to 896.
+void EmuCodeBlock::ConvertDoubleToSingle(X64Reg dst, X64Reg src) +{ + MOVSD(XMM1, R(src)); + + // Grab Exponent + PAND(XMM1, M((void *)&double_exponent)); + PSRLQ(XMM1, 52); + MOVD_xmm(R(EAX), XMM1); + + + // Check if the double is in the range of valid single subnormal + CMP(16, R(EAX), Imm16(896)); + FixupBranch NoDenormalize = J_CC(CC_G); + CMP(16, R(EAX), Imm16(874)); + FixupBranch NoDenormalize2 = J_CC(CC_L); + + // Denormalise + + // shift = (905 - Exponent) plus the 21 bit double to single shift + MOV(16, R(EAX), Imm16(905 + 21)); + MOVD_xmm(XMM0, R(EAX)); + PSUBQ(XMM0, R(XMM1)); + + // xmm1 = fraction | 0x0010000000000000 + MOVSD(XMM1, R(src)); + PAND(XMM1, M((void *)&double_fraction)); + POR(XMM1, M((void *)&double_explicit_top_bit)); + + // fraction >> shift + PSRLQ(XMM1, R(XMM0)); + + // OR the sign bit in. + MOVSD(XMM0, R(src)); + PAND(XMM0, M((void *)&double_sign_bit)); + PSRLQ(XMM0, 32); + POR(XMM1, R(XMM0)); + + FixupBranch end = J(false); // Goto end + + SetJumpTarget(NoDenormalize); + SetJumpTarget(NoDenormalize2); + + // Don't Denormalize + + // We want bits 0, 1 + MOVSD(XMM1, R(src)); + PAND(XMM1, M((void *)&double_top_two_bits)); + PSRLQ(XMM1, 32); + + // And 5 through to 34 + MOVSD(XMM0, R(src)); + PAND(XMM0, M((void *)&double_bottom_bits)); + PSRLQ(XMM0, 29); + + // OR them together + POR(XMM1, R(XMM0)); + + // End + SetJumpTarget(end); + MOVDDUP(dst, R(XMM1)); +} + +#else // MORE_ACCURATE_DOUBLETOSINGLE + +void EmuCodeBlock::ConvertDoubleToSingle(X64Reg dst, X64Reg src) +{ + MOVSD(M(&temp64), src); + MOVSD(XMM1, R(src)); + FLD(64, M(&temp64)); + CCFlags cond; + if (cpu_info.bSSE4_1) { + PTEST(XMM1, M((void *)&double_exponent)); + cond = CC_NC; + } else { + FNSTSW_AX(); + TEST(16, R(AX), Imm16(x87_InvalidOperation)); + cond = CC_Z; + } + FSTP(32, M(&temp32)); + MOVSS(XMM0, M(&temp32)); + FixupBranch dont_reset_qnan_bit = J_CC(cond); + + PANDN(XMM1, M((void *)&double_qnan_bit)); + PSRLQ(XMM1, 29); + if (cpu_info.bAVX) { + VPANDN(XMM0, XMM1,
R(XMM0)); + } else { + PANDN(XMM1, R(XMM0)); + MOVSS(XMM0, R(XMM1)); + } + + SetJumpTarget(dont_reset_qnan_bit); + MOVDDUP(dst, R(XMM0)); +} +#endif // MORE_ACCURATE_DOUBLETOSINGLE + +void EmuCodeBlock::ConvertSingleToDouble(X64Reg dst, X64Reg src, bool src_is_gpr) +{ + if (src_is_gpr) { + MOV(32, M(&temp32), R(src)); + MOVD_xmm(XMM1, R(src)); + } else { + MOVSS(M(&temp32), src); + MOVSS(R(XMM1), src); + } + FLD(32, M(&temp32)); + CCFlags cond; + if (cpu_info.bSSE4_1) { + PTEST(XMM1, M((void *)&single_exponent)); + cond = CC_NC; + } else { + FNSTSW_AX(); + TEST(16, R(AX), Imm16(x87_InvalidOperation)); + cond = CC_Z; + } + FSTP(64, M(&temp64)); + MOVSD(dst, M(&temp64)); + FixupBranch dont_reset_qnan_bit = J_CC(cond); + + PANDN(XMM1, M((void *)&single_qnan_bit)); + PSLLQ(XMM1, 29); + if (cpu_info.bAVX) { + VPANDN(dst, XMM1, R(dst)); + } else { + PANDN(XMM1, R(dst)); + MOVSD(dst, R(XMM1)); + } + + SetJumpTarget(dont_reset_qnan_bit); + MOVDDUP(dst, R(dst)); +} + void EmuCodeBlock::JitClearCA() { AND(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm32(~XER_CA_MASK)); //XER.CA = 0 diff --git a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h index 67466843ad..52d0b12d39 100644 --- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h +++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h @@ -48,6 +48,10 @@ public: void ForceSinglePrecisionS(Gen::X64Reg xmm); void ForceSinglePrecisionP(Gen::X64Reg xmm); + + // AX might get trashed + void ConvertSingleToDouble(Gen::X64Reg dst, Gen::X64Reg src, bool src_is_gpr = false); + void ConvertDoubleToSingle(Gen::X64Reg dst, Gen::X64Reg src); protected: std::unordered_map registersInUseAtLoc; };