From 07e0c917c679f3170a3057fad68bed05b87b754d Mon Sep 17 00:00:00 2001 From: Fiora Date: Fri, 5 Sep 2014 10:26:30 -0700 Subject: [PATCH] Revert "JIT64: optimize CA calculations" --- Source/Core/Common/MathUtil.h | 7 +- Source/Core/Core/PowerPC/Gekko.h | 9 +- .../Interpreter/Interpreter_Tables.cpp | 8 +- Source/Core/Core/PowerPC/Jit64/Jit.h | 14 +- .../Core/Core/PowerPC/Jit64/Jit64_Tables.cpp | 4 +- .../Core/Core/PowerPC/Jit64/JitRegCache.cpp | 5 +- .../Core/Core/PowerPC/Jit64/Jit_Integer.cpp | 911 +++++++++--------- Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp | 2 +- .../Core/Core/PowerPC/JitCommon/Jit_Util.cpp | 27 +- Source/Core/Core/PowerPC/JitCommon/Jit_Util.h | 5 +- Source/Core/Core/PowerPC/PPCAnalyst.cpp | 42 +- Source/Core/Core/PowerPC/PPCAnalyst.h | 4 +- Source/Core/DolphinWX/GameListCtrl.cpp | 2 +- .../Core/VideoBackends/OGL/StreamBuffer.cpp | 2 +- .../VideoCommon/TextureConversionShader.cpp | 6 +- Source/UnitTests/Common/MathUtilTest.cpp | 16 +- 16 files changed, 534 insertions(+), 530 deletions(-) diff --git a/Source/Core/Common/MathUtil.h b/Source/Core/Common/MathUtil.h index 143398d6f7..9c0fe2c884 100644 --- a/Source/Core/Common/MathUtil.h +++ b/Source/Core/Common/MathUtil.h @@ -175,15 +175,16 @@ struct Rectangle } // namespace MathUtil +inline float pow2f(float x) {return x * x;} +inline double pow2(double x) {return x * x;} + float MathFloatVectorSum(const std::vector&); #define ROUND_UP(x, a) (((x) + (a) - 1) & ~((a) - 1)) #define ROUND_DOWN(x, a) ((x) & ~((a) - 1)) -inline bool IsPow2(u32 imm) {return (imm & (imm - 1)) == 0;} - // Rounds down. 0 -> undefined -inline int IntLog2(u64 val) +inline int Log2(u64 val) { #if defined(__GNUC__) return 63 - __builtin_clzll(val); diff --git a/Source/Core/Core/PowerPC/Gekko.h b/Source/Core/Core/PowerPC/Gekko.h index 3a97d96472..1a9e97b559 100644 --- a/Source/Core/Core/PowerPC/Gekko.h +++ b/Source/Core/Core/PowerPC/Gekko.h @@ -331,12 +331,9 @@ union UFPR float f[2]; }; -#define XER_CA_SHIFT 29 -#define XER_OV_SHIFT 30 -#define XER_SO_SHIFT 31 -#define XER_CA_MASK (1U << XER_CA_SHIFT) -#define XER_OV_MASK (1U << XER_OV_SHIFT) -#define XER_SO_MASK (1U << XER_SO_SHIFT) +#define XER_CA_MASK 0x20000000 +#define XER_OV_MASK 0x40000000 +#define XER_SO_MASK 0x80000000 // XER union UReg_XER { diff --git a/Source/Core/Core/PowerPC/Interpreter/Interpreter_Tables.cpp b/Source/Core/Core/PowerPC/Interpreter/Interpreter_Tables.cpp index 317132266d..2bf66ae99b 100644 --- a/Source/Core/Core/PowerPC/Interpreter/Interpreter_Tables.cpp +++ b/Source/Core/Core/PowerPC/Interpreter/Interpreter_Tables.cpp @@ -34,7 +34,7 @@ static GekkoOPTemplate primarytable[] = {10, Interpreter::cmpli, {"cmpli", OPTYPE_INTEGER, FL_IN_A | FL_SET_CRn, 1, 0, 0, 0}}, {11, Interpreter::cmpi, {"cmpi", OPTYPE_INTEGER, FL_IN_A | FL_SET_CRn, 1, 0, 0, 0}}, {12, Interpreter::addic, {"addic", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_SET_CA, 1, 0, 0, 0}}, - {13, Interpreter::addic_rc, {"addic_rc", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_SET_CA | FL_SET_CR0, 1, 0, 0, 0}}, + {13, Interpreter::addic_rc, {"addic_rc", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_SET_CR0, 1, 0, 0, 0}}, {14, Interpreter::addi, {"addi", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A0, 1, 0, 0, 0}}, {15, Interpreter::addis, {"addis", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A0, 1, 0, 0, 0}}, @@ -180,8 +180,8 @@ static GekkoOPTemplate table31[] = {922, Interpreter::extshx, {"extshx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_S | FL_RC_BIT, 1, 0, 0, 0}}, {954, Interpreter::extsbx, {"extsbx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_S | FL_RC_BIT, 1, 0, 0, 0}}, {536, Interpreter::srwx, {"srwx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT, 1, 0, 0, 0}}, - {792, Interpreter::srawx, {"srawx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}}, - {824, Interpreter::srawix, {"srawix", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}}, + {792, Interpreter::srawx, {"srawx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT, 1, 0, 0, 0}}, + {824, Interpreter::srawix, {"srawix", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT, 1, 0, 0, 0}}, {24, Interpreter::slwx, {"slwx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT, 1, 0, 0, 0}}, {54, Interpreter::dcbst, {"dcbst", OPTYPE_DCACHE, 0, 5, 0, 0, 0}}, @@ -260,7 +260,7 @@ static GekkoOPTemplate table31[] = {339, Interpreter::mfspr, {"mfspr", OPTYPE_SPR, FL_OUT_D, 1, 0, 0, 0}}, {467, Interpreter::mtspr, {"mtspr", OPTYPE_SPR, 0, 2, 0, 0, 0}}, {371, Interpreter::mftb, {"mftb", OPTYPE_SYSTEM, FL_OUT_D | FL_TIMER, 1, 0, 0, 0}}, - {512, Interpreter::mcrxr, {"mcrxr", OPTYPE_SYSTEM, FL_READ_CA | FL_SET_CA, 1, 0, 0, 0}}, + {512, Interpreter::mcrxr, {"mcrxr", OPTYPE_SYSTEM, 0, 1, 0, 0, 0}}, {595, Interpreter::mfsr, {"mfsr", OPTYPE_SYSTEM, FL_OUT_D, 3, 0, 0, 0}}, {659, Interpreter::mfsrin, {"mfsrin", OPTYPE_SYSTEM, FL_OUT_D, 3, 0, 0, 0}}, diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.h b/Source/Core/Core/PowerPC/Jit64/Jit.h index 17051e67a9..c0b5c73260 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.h +++ b/Source/Core/Core/PowerPC/Jit64/Jit.h @@ -100,15 +100,13 @@ public: void GenerateConstantOverflow(bool overflow); void GenerateConstantOverflow(s64 val); void GenerateOverflow(); - void FinalizeCarryOverflow(bool ca, bool oe, bool inv = false); + void FinalizeCarryOverflow(bool oe, bool inv = false); + void GetCarryEAXAndClear(); + void FinalizeCarryGenerateOverflowEAX(bool oe, bool inv = false); + void GenerateCarry(); + void GenerateRC(); void ComputeRC(const Gen::OpArg & arg); - // use to extract bytes from a register using the regcache. offset is in bytes. - Gen::OpArg ExtractFromReg(int reg, int offset); - void AndWithMask(Gen::X64Reg reg, u32 mask); - bool CheckMergedBranch(int crf); - void DoMergedBranch(); - // Reads a given bit of a given CR register part. Clobbers ABI_PARAM1, // don't forget to xlock it before. void GetCRFieldBit(int field, int bit, Gen::X64Reg out, bool negate = false); @@ -120,8 +118,6 @@ public: Gen::FixupBranch JumpIfCRFieldBit(int field, int bit, bool jump_if_set = true); void SetFPRFIfNeeded(UGeckoInstruction inst, Gen::X64Reg xmm); - void MultiplyImmediate(u32 imm, int a, int d, bool overflow); - void tri_op(int d, int a, int b, bool reversible, void (Gen::XEmitter::*op)(Gen::X64Reg, Gen::OpArg), UGeckoInstruction inst, bool roundRHS = false); typedef u32 (*Operation)(u32 a, u32 b); void regimmop(int d, int a, bool binary, u32 value, Operation doop, void (Gen::XEmitter::*op)(int, const Gen::OpArg&, const Gen::OpArg&), bool Rc = false, bool carry = false); diff --git a/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp b/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp index 20fd761be8..fa7c19aec8 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp @@ -193,8 +193,8 @@ static GekkoOPTemplate table31[] = {922, &Jit64::extshx}, //"extshx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_S | FL_RC_BIT}}, {954, &Jit64::extsbx}, //"extsbx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_S | FL_RC_BIT}}, {536, &Jit64::srwx}, //"srwx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT}}, - {792, &Jit64::srawx}, //"srawx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_SET_CA | FL_RC_BIT}}, - {824, &Jit64::srawix}, //"srawix", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_SET_CA | FL_RC_BIT}}, + {792, &Jit64::srawx}, //"srawx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT}}, + {824, &Jit64::srawix}, //"srawix", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT}}, {24, &Jit64::slwx}, //"slwx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT}}, {54, &Jit64::dcbst}, //"dcbst", OPTYPE_DCACHE, 0, 4}}, diff --git a/Source/Core/Core/PowerPC/Jit64/JitRegCache.cpp b/Source/Core/Core/PowerPC/Jit64/JitRegCache.cpp index a1979e38bc..e2e0ed6a6c 100644 --- a/Source/Core/Core/PowerPC/Jit64/JitRegCache.cpp +++ b/Source/Core/Core/PowerPC/Jit64/JitRegCache.cpp @@ -314,10 +314,7 @@ void RegCache::StoreFromRegister(size_t i, FlushMode mode) void GPRRegCache::LoadRegister(size_t preg, X64Reg newLoc) { - if (regs[preg].location.IsImm() && !regs[preg].location.offset) - emit->XOR(32, ::Gen::R(newLoc), ::Gen::R(newLoc)); - else - emit->MOV(32, ::Gen::R(newLoc), regs[preg].location); + emit->MOV(32, ::Gen::R(newLoc), regs[preg].location); } void GPRRegCache::StoreRegister(size_t preg, OpArg newLoc) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp index 0bddbec8a1..bd904f1cc5 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp @@ -5,7 +5,6 @@ #include #include -#include "Common/MathUtil.h" #include "Core/PowerPC/Jit64/Jit.h" #include "Core/PowerPC/Jit64/JitAsm.h" #include "Core/PowerPC/Jit64/JitRegCache.h" @@ -31,7 +30,6 @@ void Jit64::GenerateConstantOverflow(bool overflow) } } -// We could do overflow branchlessly, but unlike carry it seems to be quite a bit rarer. void Jit64::GenerateOverflow() { FixupBranch jno = J_CC(CC_NO); @@ -45,31 +43,87 @@ void Jit64::GenerateOverflow() } // Assumes CA,OV are clear -void Jit64::FinalizeCarryOverflow(bool ca, bool oe, bool inv) +void Jit64::FinalizeCarryOverflow(bool oe, bool inv) { // USES_XER if (oe) { - // this is slightly messy because JitSetCAIf modifies x86 flags, so we have to do it in both - // sides of the branch. FixupBranch jno = J_CC(CC_NO); - if (ca) - JitSetCAIf(inv ? CC_NC : CC_C); + // Do carry + FixupBranch carry1 = J_CC(inv ? CC_C : CC_NC); + JitSetCA(); + SetJumpTarget(carry1); //XER[OV/SO] = 1 OR(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm32(XER_SO_MASK | XER_OV_MASK)); FixupBranch exit = J(); SetJumpTarget(jno); - if (ca) - JitSetCAIf(inv ? CC_NC : CC_C); + // Do carry + FixupBranch carry2 = J_CC(inv ? CC_C : CC_NC); + JitSetCA(); + SetJumpTarget(carry2); SetJumpTarget(exit); } - else if (ca) + else { // Do carry - JitSetCAIf(inv ? CC_NC : CC_C); + FixupBranch carry1 = J_CC(inv ? CC_C : CC_NC); + JitSetCA(); + SetJumpTarget(carry1); } } +void Jit64::GetCarryEAXAndClear() +{ + MOV(32, R(EAX), M(&PowerPC::ppcState.spr[SPR_XER])); + BTR(32, R(EAX), Imm8(29)); +} + +// Assumes that XER is in EAX and that the CA bit is clear. +void Jit64::FinalizeCarryGenerateOverflowEAX(bool oe, bool inv) +{ + // USES_XER + if (oe) + { + FixupBranch jno = J_CC(CC_NO); + // Do carry + FixupBranch carry1 = J_CC(inv ? CC_C : CC_NC); + OR(32, R(EAX), Imm32(XER_CA_MASK)); + SetJumpTarget(carry1); + //XER[OV/SO] = 1 + OR(32, R(EAX), Imm32(XER_SO_MASK | XER_OV_MASK)); + FixupBranch exit = J(); + SetJumpTarget(jno); + // Do carry + FixupBranch carry2 = J_CC(inv ? CC_C : CC_NC); + OR(32, R(EAX), Imm32(XER_CA_MASK)); + SetJumpTarget(carry2); + //XER[OV] = 0 + AND(32, R(EAX), Imm32(~XER_OV_MASK)); + SetJumpTarget(exit); + } + else + { + // Do carry + FixupBranch carry1 = J_CC(inv ? CC_C : CC_NC); + OR(32, R(EAX), Imm32(XER_CA_MASK)); + SetJumpTarget(carry1); + } + // Dump EAX back into XER + MOV(32, M(&PowerPC::ppcState.spr[SPR_XER]), R(EAX)); +} + +// Assumes that the flags were just set through an addition. +void Jit64::GenerateCarry() +{ + // USES_XER + FixupBranch pNoCarry = J_CC(CC_NC); + OR(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm32(XER_CA_MASK)); + FixupBranch pContinue = J(); + SetJumpTarget(pNoCarry); + AND(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm32(~(XER_CA_MASK))); + SetJumpTarget(pContinue); +} + void Jit64::ComputeRC(const Gen::OpArg & arg) { if (arg.IsImm()) @@ -83,30 +137,6 @@ void Jit64::ComputeRC(const Gen::OpArg & arg) } } -OpArg Jit64::ExtractFromReg(int reg, int offset) -{ - OpArg src = gpr.R(reg); - // store to load forwarding should handle this case efficiently - if (offset) - { - gpr.StoreFromRegister(reg, FLUSH_MAINTAIN_STATE); - src = gpr.GetDefaultLocation(reg); - src.offset += offset; - } - return src; -} - -// we can't do this optimization in the emitter because MOVZX and AND have different effects on flags. -void Jit64::AndWithMask(X64Reg reg, u32 mask) -{ - if (mask == 0xff) - MOVZX(32, 8, reg, R(reg)); - else if (mask == 0xffff) - MOVZX(32, 16, reg, R(reg)); - else - AND(32, R(reg), Imm32(mask)); -} - // Following static functions are used in conjunction with regimmop static u32 Add(u32 a, u32 b) { @@ -131,36 +161,35 @@ static u32 Xor(u32 a, u32 b) void Jit64::regimmop(int d, int a, bool binary, u32 value, Operation doop, void (XEmitter::*op)(int, const Gen::OpArg&, const Gen::OpArg&), bool Rc, bool carry) { gpr.Lock(d, a); - carry &= js.op->wantsCA; if (a || binary || carry) // yeh nasty special case addic { - JitClearCAOV(carry, false); if (gpr.R(a).IsImm() && !carry) { gpr.SetImmediate32(d, doop((u32)gpr.R(a).offset, value)); + if (Rc) + { + ComputeRC(gpr.R(d)); + } } else if (a == d) { gpr.KillImmediate(d, true, true); (this->*op)(32, gpr.R(d), Imm32(value)); //m_GPR[d] = m_GPR[_inst.RA] + _inst.SIMM_16; + if (carry) + GenerateCarry(); + if (Rc) + ComputeRC(gpr.R(d)); } else { gpr.BindToRegister(d, false); - if (doop == Add && gpr.R(a).IsSimpleReg() && !carry) - { - LEA(32, gpr.RX(d), MDisp(gpr.RX(a), value)); - } - else - { - MOV(32, gpr.R(d), gpr.R(a)); - (this->*op)(32, gpr.R(d), Imm32(value)); //m_GPR[d] = m_GPR[_inst.RA] + _inst.SIMM_16; - } + MOV(32, gpr.R(d), gpr.R(a)); + (this->*op)(32, gpr.R(d), Imm32(value)); //m_GPR[d] = m_GPR[_inst.RA] + _inst.SIMM_16; + if (carry) + GenerateCarry(); + if (Rc) + ComputeRC(gpr.R(d)); } - if (carry) - JitSetCAIf(CC_C); - if (Rc) - ComputeRC(gpr.R(d)); } else if (doop == Add) { @@ -266,56 +295,6 @@ void Jit64::reg_imm(UGeckoInstruction inst) } } -bool Jit64::CheckMergedBranch(int crf) -{ - const UGeckoInstruction& next = js.next_inst; - if (((next.OPCD == 16 /* bcx */) || - ((next.OPCD == 19) && (next.SUBOP10 == 528) /* bcctrx */) || - ((next.OPCD == 19) && (next.SUBOP10 == 16) /* bclrx */)) && - (next.BO & BO_DONT_DECREMENT_FLAG) && - !(next.BO & BO_DONT_CHECK_CONDITION) && - (next.BI >> 2) == crf) - return true; - return false; -} - -void Jit64::DoMergedBranch() -{ - // Code that handles successful PPC branching. - if (js.next_inst.OPCD == 16) // bcx - { - if (js.next_inst.LK) - MOV(32, M(&LR), Imm32(js.compilerPC + 4)); - - u32 destination; - if (js.next_inst.AA) - destination = SignExt16(js.next_inst.BD << 2); - else - destination = js.next_compilerPC + SignExt16(js.next_inst.BD << 2); - WriteExit(destination); - } - else if ((js.next_inst.OPCD == 19) && (js.next_inst.SUBOP10 == 528)) // bcctrx - { - if (js.next_inst.LK) - MOV(32, M(&LR), Imm32(js.compilerPC + 4)); - MOV(32, R(EAX), M(&CTR)); - AND(32, R(EAX), Imm32(0xFFFFFFFC)); - WriteExitDestInEAX(); - } - else if ((js.next_inst.OPCD == 19) && (js.next_inst.SUBOP10 == 16)) // bclrx - { - MOV(32, R(EAX), M(&LR)); - AND(32, R(EAX), Imm32(0xFFFFFFFC)); - if (js.next_inst.LK) - MOV(32, M(&LR), Imm32(js.compilerPC + 4)); - WriteExitDestInEAX(); - } - else - { - PanicAlert("WTF invalid branch"); - } -} - void Jit64::cmpXX(UGeckoInstruction inst) { // USES_CR @@ -324,7 +303,23 @@ void Jit64::cmpXX(UGeckoInstruction inst) int a = inst.RA; int b = inst.RB; int crf = inst.CRFD; - bool merge_branch = CheckMergedBranch(crf); + + bool merge_branch = false; + int test_crf = js.next_inst.BI >> 2; + // Check if the next instruction is a branch - if it is, merge the two. + if (((js.next_inst.OPCD == 16 /* bcx */) || + ((js.next_inst.OPCD == 19) && (js.next_inst.SUBOP10 == 528) /* bcctrx */) || + ((js.next_inst.OPCD == 19) && (js.next_inst.SUBOP10 == 16) /* bclrx */)) && + (js.next_inst.BO & BO_DONT_DECREMENT_FLAG) && + !(js.next_inst.BO & BO_DONT_CHECK_CONDITION)) + { + // Looks like a decent conditional branch that we can merge with. + // It only test CR, not CTR. + if (test_crf == crf) + { + merge_branch = true; + } + } OpArg comparand; bool signedCompare; @@ -394,13 +389,45 @@ void Jit64::cmpXX(UGeckoInstruction inst) { gpr.Flush(); fpr.Flush(); - DoMergedBranch(); + + if (js.next_inst.OPCD == 16) // bcx + { + if (js.next_inst.LK) + MOV(32, M(&LR), Imm32(js.compilerPC + 4)); + + u32 destination; + if (js.next_inst.AA) + destination = SignExt16(js.next_inst.BD << 2); + else + destination = js.next_compilerPC + SignExt16(js.next_inst.BD << 2); + WriteExit(destination); + } + else if ((js.next_inst.OPCD == 19) && (js.next_inst.SUBOP10 == 528)) // bcctrx + { + if (js.next_inst.LK) + MOV(32, M(&LR), Imm32(js.compilerPC + 4)); + MOV(32, R(EAX), M(&CTR)); + AND(32, R(EAX), Imm32(0xFFFFFFFC)); + WriteExitDestInEAX(); + } + else if ((js.next_inst.OPCD == 19) && (js.next_inst.SUBOP10 == 16)) // bclrx + { + MOV(32, R(EAX), M(&LR)); + if (js.next_inst.LK) + MOV(32, M(&LR), Imm32(js.compilerPC + 4)); + WriteExitDestInEAX(); + } + else + { + PanicAlert("WTF invalid branch"); + } } - else if (!analyzer.HasOption(PPCAnalyst::PPCAnalyzer::OPTION_CONDITIONAL_CONTINUE)) + else { - gpr.Flush(); - fpr.Flush(); - WriteExit(js.next_compilerPC + 4); + if (!analyzer.HasOption(PPCAnalyst::PPCAnalyzer::OPTION_CONDITIONAL_CONTINUE)) + { + WriteExit(js.next_compilerPC + 4); + } } } } @@ -427,29 +454,13 @@ void Jit64::cmpXX(UGeckoInstruction inst) MOVZX(64, 32, RAX, gpr.R(a)); if (comparand.IsImm()) - { - // sign extension will ruin this, so store it in a register - if (comparand.offset & 0x80000000U) - { - MOV(32, R(ABI_PARAM1), comparand); - comparand = R(ABI_PARAM1); - } - } + MOV(32, R(ABI_PARAM1), comparand); else - { MOVZX(64, 32, ABI_PARAM1, comparand); - comparand = R(ABI_PARAM1); - } - } - if (comparand.IsImm() && !comparand.offset) - { - if (merge_branch) - TEST(64, R(RAX), R(RAX)); - } - else - { - SUB(64, R(RAX), comparand); + + comparand = R(ABI_PARAM1); } + SUB(64, R(RAX), comparand); MOV(64, M(&PowerPC::ppcState.cr_val[crf]), R(RAX)); if (merge_branch) @@ -477,12 +488,51 @@ void Jit64::cmpXX(UGeckoInstruction inst) gpr.Flush(FLUSH_MAINTAIN_STATE); fpr.Flush(FLUSH_MAINTAIN_STATE); - DoMergedBranch(); + // Code that handles successful PPC branching. + if (js.next_inst.OPCD == 16) // bcx + { + if (js.next_inst.LK) + MOV(32, M(&LR), Imm32(js.compilerPC + 4)); + + u32 destination; + if (js.next_inst.AA) + destination = SignExt16(js.next_inst.BD << 2); + else + destination = js.next_compilerPC + SignExt16(js.next_inst.BD << 2); + WriteExit(destination); + } + else if ((js.next_inst.OPCD == 19) && (js.next_inst.SUBOP10 == 528)) // bcctrx + { + if (js.next_inst.LK) + MOV(32, M(&LR), Imm32(js.compilerPC + 4)); + + MOV(32, R(EAX), M(&CTR)); + AND(32, R(EAX), Imm32(0xFFFFFFFC)); + WriteExitDestInEAX(); + } + else if ((js.next_inst.OPCD == 19) && (js.next_inst.SUBOP10 == 16)) // bclrx + { + MOV(32, R(EAX), M(&LR)); + AND(32, R(EAX), Imm32(0xFFFFFFFC)); + + if (js.next_inst.LK) + MOV(32, M(&LR), Imm32(js.compilerPC + 4)); + + WriteExitDestInEAX(); + } + else + { + PanicAlert("WTF invalid branch"); + } SetJumpTarget(pDontBranch); if (!analyzer.HasOption(PPCAnalyst::PPCAnalyzer::OPTION_CONDITIONAL_CONTINUE)) + { + gpr.Flush(); + fpr.Flush(); WriteExit(js.next_compilerPC + 4); + } } } @@ -705,7 +755,11 @@ void Jit64::extsbx(UGeckoInstruction inst) { gpr.Lock(a, s); gpr.BindToRegister(a, a == s, true); - MOVSX(32, 8, gpr.RX(a), gpr.R(s)); + // Always force moving to EAX because it isn't possible + // to refer to the lowest byte of some registers, at least in + // 32-bit mode. + MOV(32, R(EAX), gpr.R(s)); + MOVSX(32, 8, gpr.RX(a), R(AL)); // watch out for ah and friends gpr.UnlockAll(); } @@ -728,7 +782,11 @@ void Jit64::extshx(UGeckoInstruction inst) else { gpr.Lock(a, s); + gpr.KillImmediate(s, true, false); gpr.BindToRegister(a, a == s, true); + // This looks a little dangerous, but it's safe because + // every 32-bit register has a 16-bit half at the same index + // as the 32-bit register. MOVSX(32, 16, gpr.RX(a), gpr.R(s)); gpr.UnlockAll(); } @@ -751,38 +809,40 @@ void Jit64::subfic(UGeckoInstruction inst) { if (imm == 0) { - JitClearCAOV(js.op->wantsCA, false); + JitClearCA(); // Flags act exactly like subtracting from 0 NEG(32, gpr.R(d)); // Output carry is inverted - if (js.op->wantsCA) - JitSetCAIf(CC_NC); + FixupBranch carry1 = J_CC(CC_C); + JitSetCA(); + SetJumpTarget(carry1); } else if (imm == -1) { // CA is always set in this case - if (js.op->wantsCA) - JitSetCA(); + JitSetCA(); NOT(32, gpr.R(d)); } else { - JitClearCAOV(js.op->wantsCA, false); + JitClearCA(); NOT(32, gpr.R(d)); ADD(32, gpr.R(d), Imm32(imm+1)); // Output carry is normal - if (js.op->wantsCA) - JitSetCAIf(CC_C); + FixupBranch carry1 = J_CC(CC_NC); + JitSetCA(); + SetJumpTarget(carry1); } } else { - JitClearCAOV(js.op->wantsCA, false); + JitClearCA(); MOV(32, gpr.R(d), Imm32(imm)); SUB(32, gpr.R(d), gpr.R(a)); // Output carry is inverted - if (js.op->wantsCA) - JitSetCAIf(CC_NC); + FixupBranch carry1 = J_CC(CC_C); + JitSetCA(); + SetJumpTarget(carry1); } gpr.UnlockAll(); // This instruction has no RC flag @@ -795,7 +855,8 @@ void Jit64::subfcx(UGeckoInstruction inst) int a = inst.RA, b = inst.RB, d = inst.RD; gpr.Lock(a, b, d); gpr.BindToRegister(d, (d == a || d == b), true); - JitClearCAOV(js.op->wantsCA, inst.OE); + + JitClearCAOV(inst.OE); if (d == b) { SUB(32, gpr.R(d), gpr.R(a)); @@ -813,7 +874,7 @@ void Jit64::subfcx(UGeckoInstruction inst) } if (inst.Rc) ComputeRC(gpr.R(d)); - FinalizeCarryOverflow(js.op->wantsCA, inst.OE, true); + FinalizeCarryOverflow(inst.OE, true); gpr.UnlockAll(); } @@ -826,7 +887,7 @@ void Jit64::subfex(UGeckoInstruction inst) gpr.Lock(a, b, d); gpr.BindToRegister(d, (d == a || d == b), true); - JitGetAndClearCAOV(inst.OE); + GetCarryEAXAndClear(); bool invertedCarry = false; if (d == b) @@ -847,7 +908,7 @@ void Jit64::subfex(UGeckoInstruction inst) NOT(32, gpr.R(d)); ADC(32, gpr.R(d), gpr.R(b)); } - FinalizeCarryOverflow(js.op->wantsCA, inst.OE, invertedCarry); + FinalizeCarryGenerateOverflowEAX(inst.OE, invertedCarry); if (inst.Rc) ComputeRC(gpr.R(d)); @@ -863,12 +924,14 @@ void Jit64::subfmex(UGeckoInstruction inst) gpr.Lock(a, d); gpr.BindToRegister(d, d == a); - JitGetAndClearCAOV(inst.OE); + GetCarryEAXAndClear(); if (d != a) + { MOV(32, gpr.R(d), gpr.R(a)); + } NOT(32, gpr.R(d)); ADC(32, gpr.R(d), Imm32(0xFFFFFFFF)); - FinalizeCarryOverflow(js.op->wantsCA, inst.OE); + FinalizeCarryGenerateOverflowEAX(inst.OE); if (inst.Rc) ComputeRC(gpr.R(d)); gpr.UnlockAll(); @@ -884,12 +947,14 @@ void Jit64::subfzex(UGeckoInstruction inst) gpr.Lock(a, d); gpr.BindToRegister(d, d == a); - JitGetAndClearCAOV(inst.OE); + GetCarryEAXAndClear(); if (d != a) + { MOV(32, gpr.R(d), gpr.R(a)); + } NOT(32, gpr.R(d)); ADC(32, gpr.R(d), Imm8(0)); - FinalizeCarryOverflow(js.op->wantsCA, inst.OE); + FinalizeCarryGenerateOverflowEAX(inst.OE); if (inst.Rc) ComputeRC(gpr.R(d)); @@ -907,9 +972,13 @@ void Jit64::subfx(UGeckoInstruction inst) s32 i = (s32)gpr.R(b).offset, j = (s32)gpr.R(a).offset; gpr.SetImmediate32(d, i - j); if (inst.Rc) + { ComputeRC(gpr.R(d)); + } if (inst.OE) + { GenerateConstantOverflow((s64)i - (s64)j); + } } else { @@ -938,64 +1007,6 @@ void Jit64::subfx(UGeckoInstruction inst) } } -void Jit64::MultiplyImmediate(u32 imm, int a, int d, bool overflow) -{ - // simplest cases first - if (imm == 0) - { - XOR(32, gpr.R(d), gpr.R(d)); - return; - } - - if (imm == (u32)-1) - { - if (d != a) - MOV(32, gpr.R(d), gpr.R(a)); - NEG(32, gpr.R(d)); - return; - } - - // skip these if we need to check overflow flag - if (!overflow) - { - // power of 2; just a shift - if (IsPow2(imm)) - { - u32 shift = IntLog2(imm); - // use LEA if it saves an op - if (d != a && shift <= 3 && shift >= 1 && gpr.R(a).IsSimpleReg()) - { - LEA(32, gpr.RX(d), MScaled(gpr.RX(a), SCALE_1 << shift, 0)); - } - else - { - if (d != a) - MOV(32, gpr.R(d), gpr.R(a)); - if (shift) - SHL(32, gpr.R(d), Imm8(shift)); - } - return; - } - - // We could handle factors of 2^N*3, 2^N*5, and 2^N*9 using lea+shl, but testing shows - // it seems to be slower overall. - static u8 lea_scales[3] = { 3, 5, 9 }; - for (int i = 0; i < 3; i++) - { - if (imm == lea_scales[i]) - { - if (d != a) - gpr.BindToRegister(a, true, false); - LEA(32, gpr.RX(d), MComplex(gpr.RX(a), gpr.RX(a), SCALE_2 << i, 0)); - return; - } - } - } - - // if we didn't find any better options - IMUL(32, gpr.RX(d), gpr.R(a), Imm32(imm)); -} - void Jit64::mulli(UGeckoInstruction inst) { INSTRUCTION_START @@ -1011,7 +1022,46 @@ void Jit64::mulli(UGeckoInstruction inst) { gpr.Lock(a, d); gpr.BindToRegister(d, (d == a), true); - MultiplyImmediate(imm, a, d, false); + if (imm == 0) + { + XOR(32, gpr.R(d), gpr.R(d)); + } + else if (imm == (u32)-1) + { + if (d != a) + MOV(32, gpr.R(d), gpr.R(a)); + NEG(32, gpr.R(d)); + } + else if ((imm & (imm - 1)) == 0) + { + u32 shift = 0; + + if (imm & 0xFFFF0000) + shift |= 16; + + if (imm & 0xFF00FF00) + shift |= 8; + + if (imm & 0xF0F0F0F0) + shift |= 4; + + if (imm & 0xCCCCCCCC) + shift |= 2; + + if (imm & 0xAAAAAAAA) + shift |= 1; + + if (d != a) + MOV(32, gpr.R(d), gpr.R(a)); + + if (shift) + SHL(32, gpr.R(d), Imm8(shift)); + } + else + { + IMUL(32, gpr.RX(d), gpr.R(a), Imm32(imm)); + } + gpr.UnlockAll(); } } @@ -1039,7 +1089,45 @@ void Jit64::mullwx(UGeckoInstruction inst) { u32 imm = gpr.R(a).IsImm() ? (u32)gpr.R(a).offset : (u32)gpr.R(b).offset; int src = gpr.R(a).IsImm() ? b : a; - MultiplyImmediate(imm, src, d, inst.OE); + if (imm == 0) + { + XOR(32, gpr.R(d), gpr.R(d)); + } + else if (imm == (u32)-1) + { + if (d != src) + MOV(32, gpr.R(d), gpr.R(src)); + NEG(32, gpr.R(d)); + } + else if ((imm & (imm - 1)) == 0 && !inst.OE) + { + u32 shift = 0; + + if (imm & 0xFFFF0000) + shift |= 16; + + if (imm & 0xFF00FF00) + shift |= 8; + + if (imm & 0xF0F0F0F0) + shift |= 4; + + if (imm & 0xCCCCCCCC) + shift |= 2; + + if (imm & 0xAAAAAAAA) + shift |= 1; + + if (d != src) + MOV(32, gpr.R(d), gpr.R(src)); + + if (shift) + SHL(32, gpr.R(d), Imm8(shift)); + } + else + { + IMUL(32, gpr.RX(d), gpr.R(src), Imm32(imm)); + } } else if (d == a) { @@ -1326,6 +1414,13 @@ void Jit64::addx(UGeckoInstruction inst) GenerateConstantOverflow((s64)i + (s64)j); } } + else if (gpr.R(a).IsSimpleReg() && gpr.R(b).IsSimpleReg() && !inst.Rc && !inst.OE) + { + gpr.Lock(a, b, d); + gpr.BindToRegister(d, false); + LEA(32, gpr.RX(d), MComplex(gpr.RX(a), gpr.RX(b), 1, 0)); + gpr.UnlockAll(); + } else if ((d == a) || (d == b)) { int operand = ((d == a) ? b : a); @@ -1338,15 +1433,6 @@ void Jit64::addx(UGeckoInstruction inst) ComputeRC(gpr.R(d)); gpr.UnlockAll(); } - else if (gpr.R(a).IsSimpleReg() && gpr.R(b).IsSimpleReg() && !inst.OE) - { - gpr.Lock(a, b, d); - gpr.BindToRegister(d, false); - LEA(32, gpr.RX(d), MComplex(gpr.RX(a), gpr.RX(b), 1, 0)); - if (inst.Rc) - ComputeRC(gpr.R(d)); - gpr.UnlockAll(); - } else { gpr.Lock(a, b, d); @@ -1368,22 +1454,31 @@ void Jit64::addex(UGeckoInstruction inst) JITDISABLE(bJITIntegerOff); int a = inst.RA, b = inst.RB, d = inst.RD; - gpr.Lock(a, b, d); - gpr.BindToRegister(d, (d == a) || (d == b)); - JitGetAndClearCAOV(inst.OE); if ((d == a) || (d == b)) { + gpr.Lock(a, b, d); + gpr.BindToRegister(d, true); + + GetCarryEAXAndClear(); ADC(32, gpr.R(d), gpr.R((d == a) ? b : a)); + FinalizeCarryGenerateOverflowEAX(inst.OE); + if (inst.Rc) + ComputeRC(gpr.R(d)); + gpr.UnlockAll(); } else { + gpr.Lock(a, b, d); + gpr.BindToRegister(d, false); + + GetCarryEAXAndClear(); MOV(32, gpr.R(d), gpr.R(a)); ADC(32, gpr.R(d), gpr.R(b)); + FinalizeCarryGenerateOverflowEAX(inst.OE); + if (inst.Rc) + ComputeRC(gpr.R(d)); + gpr.UnlockAll(); } - FinalizeCarryOverflow(js.op->wantsCA, inst.OE); - if (inst.Rc) - ComputeRC(gpr.R(d)); - gpr.UnlockAll(); } void Jit64::addcx(UGeckoInstruction inst) @@ -1397,9 +1492,9 @@ void Jit64::addcx(UGeckoInstruction inst) int operand = ((d == a) ? b : a); gpr.Lock(a, b, d); gpr.BindToRegister(d, true); - JitClearCAOV(js.op->wantsCA, inst.OE); + JitClearCAOV(inst.OE); ADD(32, gpr.R(d), gpr.R(operand)); - FinalizeCarryOverflow(js.op->wantsCA, inst.OE); + FinalizeCarryOverflow(inst.OE); if (inst.Rc) ComputeRC(gpr.R(d)); gpr.UnlockAll(); @@ -1408,10 +1503,10 @@ void Jit64::addcx(UGeckoInstruction inst) { gpr.Lock(a, b, d); gpr.BindToRegister(d, false); - JitClearCAOV(js.op->wantsCA, inst.OE); + JitClearCAOV(inst.OE); MOV(32, gpr.R(d), gpr.R(a)); ADD(32, gpr.R(d), gpr.R(b)); - FinalizeCarryOverflow(js.op->wantsCA, inst.OE); + FinalizeCarryOverflow(inst.OE); if (inst.Rc) ComputeRC(gpr.R(d)); gpr.UnlockAll(); @@ -1425,16 +1520,31 @@ void Jit64::addmex(UGeckoInstruction inst) JITDISABLE(bJITIntegerOff); int a = inst.RA, d = inst.RD; - gpr.Lock(d); - gpr.BindToRegister(d, d == a); - JitGetAndClearCAOV(inst.OE); - if (d != a) + if (d == a) + { + gpr.Lock(d); + gpr.BindToRegister(d, true); + + GetCarryEAXAndClear(); + ADC(32, gpr.R(d), Imm32(0xFFFFFFFF)); + FinalizeCarryGenerateOverflowEAX(inst.OE); + if (inst.Rc) + ComputeRC(gpr.R(d)); + gpr.UnlockAll(); + } + else + { + gpr.Lock(a, d); + gpr.BindToRegister(d, false); + + GetCarryEAXAndClear(); MOV(32, gpr.R(d), gpr.R(a)); - ADC(32, gpr.R(d), Imm32(0xFFFFFFFF)); - FinalizeCarryOverflow(js.op->wantsCA, inst.OE); - if (inst.Rc) - ComputeRC(gpr.R(d)); - gpr.UnlockAll(); + ADC(32, gpr.R(d), Imm32(0xFFFFFFFF)); + FinalizeCarryGenerateOverflowEAX(inst.OE); + if (inst.Rc) + ComputeRC(gpr.R(d)); + gpr.UnlockAll(); + } } void Jit64::addzex(UGeckoInstruction inst) @@ -1444,16 +1554,31 @@ void Jit64::addzex(UGeckoInstruction inst) JITDISABLE(bJITIntegerOff); int a = inst.RA, d = inst.RD; - gpr.Lock(d); - gpr.BindToRegister(d, d == a); - JitGetAndClearCAOV(inst.OE); - if (d != a) + if (d == a) + { + gpr.Lock(d); + gpr.BindToRegister(d, true); + + GetCarryEAXAndClear(); + ADC(32, gpr.R(d), Imm8(0)); + FinalizeCarryGenerateOverflowEAX(inst.OE); + if (inst.Rc) + ComputeRC(gpr.R(d)); + gpr.UnlockAll(); + } + else + { + gpr.Lock(a, d); + gpr.BindToRegister(d, false); + + GetCarryEAXAndClear(); MOV(32, gpr.R(d), gpr.R(a)); - ADC(32, gpr.R(d), Imm8(0)); - FinalizeCarryOverflow(js.op->wantsCA, inst.OE); - if (inst.Rc) - ComputeRC(gpr.R(d)); - gpr.UnlockAll(); + ADC(32, gpr.R(d), Imm8(0)); + FinalizeCarryGenerateOverflowEAX(inst.OE); + if (inst.Rc) + ComputeRC(gpr.R(d)); + gpr.UnlockAll(); + } } void Jit64::rlwinmx(UGeckoInstruction inst) @@ -1462,11 +1587,6 @@ void Jit64::rlwinmx(UGeckoInstruction inst) JITDISABLE(bJITIntegerOff); int a = inst.RA; int s = inst.RS; - - // rlwinm is commonly used as a branch test, second only to the more obvious cmpw. - // since it's almost never used with any check other than beq, only support beq for simplicity. - bool merge_branch = inst.Rc && CheckMergedBranch(0) && (js.next_inst.BI & 3) == 2; - if (gpr.R(s).IsImm()) { u32 result = (int)gpr.R(s).offset; @@ -1475,104 +1595,49 @@ void Jit64::rlwinmx(UGeckoInstruction inst) result &= Helper_Mask(inst.MB, inst.ME); gpr.SetImmediate32(a, result); if (inst.Rc) + { ComputeRC(gpr.R(a)); + } } else { - bool isLeftShift = inst.SH && inst.MB == 0 && inst.ME == 31 - inst.SH; - bool isRightShift = inst.SH && inst.ME == 31 && inst.MB == 32 - inst.SH; - u32 mask = Helper_Mask(inst.MB, inst.ME); - bool simpleMask = mask == 0xff || mask == 0xffff; - // in case of a merged branch, track whether or not we've set flags. - // if not, we need to do a TEST later to get them. - bool needsTest = false; - // if we know the high bit can't be set, we can avoid doing a sign extend for flag storage - bool needsSext = true; - int maskSize = inst.ME - inst.MB + 1; - gpr.Lock(a, s); gpr.BindToRegister(a, a == s); - if (a != s && isLeftShift && gpr.R(s).IsSimpleReg() && inst.SH <= 3) + if (a != s) { - LEA(32, gpr.RX(a), MScaled(gpr.RX(s), SCALE_1 << inst.SH, 0)); - needsTest = true; + MOV(32, gpr.R(a), gpr.R(s)); } - // common optimized case: byte/word extract - else if (simpleMask && !(inst.SH & (maskSize - 1))) + + if (inst.SH && inst.MB == 0 && inst.ME==31-inst.SH) { - MOVZX(32, maskSize, gpr.RX(a), ExtractFromReg(s, inst.SH ? (32 - inst.SH) >> 3 : 0)); - needsTest = true; - needsSext = false; - } - // another optimized special case: byte/word extract plus shift - else if (((mask >> inst.SH) << inst.SH) == mask && !isLeftShift && - ((mask >> inst.SH) == 0xff || (mask >> inst.SH) == 0xffff)) - { - MOVZX(32, maskSize, gpr.RX(a), gpr.R(s)); SHL(32, gpr.R(a), Imm8(inst.SH)); - needsSext = inst.SH + maskSize >= 32; + if (inst.Rc) + ComputeRC(gpr.R(a)); + } + else if (inst.SH && inst.ME == 31 && inst.MB == 32 - inst.SH) + { + SHR(32, gpr.R(a), Imm8(inst.MB)); + if (inst.Rc) + ComputeRC(gpr.R(a)); } else { - if (a != s) - MOV(32, gpr.R(a), gpr.R(s)); + if (inst.SH != 0) + { + ROL(32, gpr.R(a), Imm8(inst.SH)); + } - if (isLeftShift) + if (!(inst.MB==0 && inst.ME==31)) { - SHL(32, gpr.R(a), Imm8(inst.SH)); + AND(32, gpr.R(a), Imm32(Helper_Mask(inst.MB, inst.ME))); + if (inst.Rc) + ComputeRC(gpr.R(a)); } - else if (isRightShift) + else if (inst.Rc) { - SHR(32, gpr.R(a), Imm8(inst.MB)); - needsSext = false; - } - else - { - if (inst.SH != 0) - ROL(32, gpr.R(a), Imm8(inst.SH)); - if (!(inst.MB == 0 && inst.ME == 31)) - { - AndWithMask(gpr.RX(a), mask); - needsSext = inst.MB == 0; - needsTest = simpleMask; - } - else - { - needsTest = true; - } + ComputeRC(gpr.R(a)); } } - if (merge_branch) - { - js.downcountAmount++; - js.skipnext = true; - - if (needsSext) - MOVSX(64, 32, gpr.RX(a), gpr.R(a)); - MOV(64, M(&PowerPC::ppcState.cr_val[0]), gpr.R(a)); - - if (needsTest) - TEST(32, gpr.R(a), gpr.R(a)); - - gpr.UnlockAll(); - FixupBranch pDontBranch = J_CC((js.next_inst.BO & BO_BRANCH_IF_TRUE) ? CC_NE : CC_E, true); - - gpr.Flush(FLUSH_MAINTAIN_STATE); - fpr.Flush(FLUSH_MAINTAIN_STATE); - - DoMergedBranch(); - - SetJumpTarget(pDontBranch); - - if (!analyzer.HasOption(PPCAnalyst::PPCAnalyzer::OPTION_CONDITIONAL_CONTINUE)) - { - gpr.Flush(); - fpr.Flush(); - WriteExit(js.next_compilerPC + 4); - } - } - else if (inst.Rc) - ComputeRC(gpr.R(a)); gpr.UnlockAll(); } } @@ -1590,89 +1655,75 @@ void Jit64::rlwimix(UGeckoInstruction inst) u32 mask = Helper_Mask(inst.MB,inst.ME); gpr.SetImmediate32(a, ((u32)gpr.R(a).offset & ~mask) | (_rotl((u32)gpr.R(s).offset,inst.SH) & mask)); if (inst.Rc) + { ComputeRC(gpr.R(a)); + } } else { gpr.Lock(a, s); + gpr.BindToRegister(a, true, true); u32 mask = Helper_Mask(inst.MB, inst.ME); if (mask == 0 || (a == s && inst.SH == 0)) { - // nothing to do + if (inst.Rc) + { + ComputeRC(gpr.R(a)); + } } else if (mask == 0xFFFFFFFF) { - gpr.BindToRegister(a, a == s, true); if (a != s) + { MOV(32, gpr.R(a), gpr.R(s)); + } + if (inst.SH) + { ROL(32, gpr.R(a), Imm8(inst.SH)); - } - else if(gpr.R(s).IsImm()) - { - gpr.BindToRegister(a, true, true); - AndWithMask(gpr.RX(a), ~mask); - OR(32, gpr.R(a), Imm32(_rotl((u32)gpr.R(s).offset, inst.SH) & mask)); + } + + if (inst.Rc) + { + ComputeRC(gpr.R(a)); + } } else if (inst.SH) { - bool isLeftShift = mask == 0U - (1U << inst.SH); - bool isRightShift = mask == (1U << inst.SH) - 1; - if (gpr.R(a).IsImm()) + if (mask == 0U - (1U << inst.SH)) { - u32 maskA = gpr.R(a).offset & ~mask; - gpr.BindToRegister(a, false, true); - MOV(32, gpr.R(a), gpr.R(s)); - if (isLeftShift) - { - SHL(32, gpr.R(a), Imm8(inst.SH)); - } - else if (isRightShift) - { - SHR(32, gpr.R(a), Imm8(32 - inst.SH)); - } - else - { - ROL(32, gpr.R(a), Imm8(inst.SH)); - AND(32, gpr.R(a), Imm32(mask)); - } - OR(32, gpr.R(a), Imm32(maskA)); + MOV(32, R(EAX), gpr.R(s)); + SHL(32, R(EAX), Imm8(inst.SH)); + AND(32, gpr.R(a), Imm32(~mask)); + OR(32, gpr.R(a), R(EAX)); + } + else if (mask == (1U << inst.SH) - 1) + { + MOV(32, R(EAX), gpr.R(s)); + SHR(32, R(EAX), Imm8(32-inst.SH)); + AND(32, gpr.R(a), Imm32(~mask)); + OR(32, gpr.R(a), R(EAX)); } else { - // TODO: common cases of this might be faster with pinsrb or abuse of AH - gpr.BindToRegister(a, true, true); MOV(32, R(EAX), gpr.R(s)); - if (isLeftShift) - { - SHL(32, R(EAX), Imm8(inst.SH)); - AndWithMask(gpr.RX(a), ~mask); - OR(32, gpr.R(a), R(EAX)); - } - else if (isRightShift) - { - SHR(32, R(EAX), Imm8(32 - inst.SH)); - AndWithMask(gpr.RX(a), ~mask); - OR(32, gpr.R(a), R(EAX)); - } - else - { - ROL(32, R(EAX), Imm8(inst.SH)); - XOR(32, R(EAX), gpr.R(a)); - AndWithMask(EAX, mask); - XOR(32, gpr.R(a), R(EAX)); - } + ROL(32, R(EAX), Imm8(inst.SH)); + XOR(32, R(EAX), gpr.R(a)); + AND(32, R(EAX), Imm32(mask)); + XOR(32, gpr.R(a), R(EAX)); } + + if (inst.Rc) + ComputeRC(gpr.R(a)); } else { - gpr.BindToRegister(a, true, true); XOR(32, gpr.R(a), gpr.R(s)); - AndWithMask(gpr.RX(a), ~mask); + AND(32, gpr.R(a), Imm32(~mask)); XOR(32, gpr.R(a), gpr.R(s)); + if (inst.Rc) + ComputeRC(gpr.R(a)); } - if (inst.Rc) - ComputeRC(gpr.R(a)); gpr.UnlockAll(); } } @@ -1696,14 +1747,14 @@ void Jit64::rlwnmx(UGeckoInstruction inst) { gpr.FlushLockX(ECX); gpr.Lock(a, b, s); + gpr.BindToRegister(a, (a == b || a == s), true); MOV(32, R(ECX), gpr.R(b)); - gpr.BindToRegister(a, (a == s), true); if (a != s) { MOV(32, gpr.R(a), gpr.R(s)); } ROL(32, gpr.R(a), R(ECX)); - AndWithMask(gpr.RX(a), mask); + AND(32, gpr.R(a), Imm32(mask)); if (inst.Rc) ComputeRC(gpr.R(a)); gpr.UnlockAll(); @@ -1763,8 +1814,8 @@ void Jit64::srwx(UGeckoInstruction inst) { gpr.FlushLockX(ECX); gpr.Lock(a, b, s); + gpr.BindToRegister(a, (a == b || a == s), true); MOV(32, R(ECX), gpr.R(b)); - gpr.BindToRegister(a, a == s, true); if (a != s) { MOV(32, gpr.R(a), gpr.R(s)); @@ -1801,10 +1852,12 @@ void Jit64::slwx(UGeckoInstruction inst) { gpr.FlushLockX(ECX); gpr.Lock(a, b, s); + gpr.BindToRegister(a, (a == b || a == s), true); MOV(32, R(ECX), gpr.R(b)); - gpr.BindToRegister(a, a == s, true); if (a != s) + { MOV(32, gpr.R(a), gpr.R(s)); + } SHL(64, gpr.R(a), R(ECX)); if (inst.Rc) { @@ -1831,27 +1884,25 @@ void Jit64::srawx(UGeckoInstruction inst) gpr.Lock(a, s, b); gpr.FlushLockX(ECX); gpr.BindToRegister(a, (a == s || a == b), true); - JitClearCAOV(js.op->wantsCA, false); + JitClearCA(); MOV(32, R(ECX), gpr.R(b)); if (a != s) MOV(32, gpr.R(a), gpr.R(s)); SHL(64, gpr.R(a), Imm8(32)); SAR(64, gpr.R(a), R(ECX)); - if (js.op->wantsCA) - { - MOV(32, R(EAX), gpr.R(a)); - SHR(64, gpr.R(a), Imm8(32)); - TEST(32, gpr.R(a), R(EAX)); - JitSetCAIf(CC_NZ); - } - else - { - SHR(64, gpr.R(a), Imm8(32)); - } + MOV(32, R(EAX), gpr.R(a)); + SHR(64, gpr.R(a), Imm8(32)); + TEST(32, gpr.R(a), R(EAX)); + FixupBranch nocarry = J_CC(CC_Z); + JitSetCA(); + SetJumpTarget(nocarry); gpr.UnlockAll(); gpr.UnlockAllX(); + if (inst.Rc) + { ComputeRC(gpr.R(a)); + } } void Jit64::srawix(UGeckoInstruction inst) @@ -1865,56 +1916,39 @@ void Jit64::srawix(UGeckoInstruction inst) { gpr.Lock(a, s); gpr.BindToRegister(a, a == s, true); - if (!js.op->wantsCA) + JitClearCA(); + MOV(32, R(EAX), gpr.R(s)); + if (a != s) { - if (a != s) - MOV(32, gpr.R(a), gpr.R(s)); - SAR(32, gpr.R(a), Imm8(amount)); - } - else - { - JitClearCAOV(true, false); - MOV(32, R(EAX), gpr.R(s)); - if (a != s) - MOV(32, gpr.R(a), R(EAX)); - // some optimized common cases that can be done in slightly fewer ops - if (amount == 31) - { - SAR(32, gpr.R(a), Imm8(31)); - NEG(32, R(EAX)); // EAX = input == INT_MIN ? INT_MIN : -input; - AND(32, R(EAX), Imm32(0x80000000)); // EAX = input < 0 && input != INT_MIN ? 0 : 0x80000000 - SHR(32, R(EAX), Imm8(31 - XER_CA_SHIFT)); - XOR(32, M(&PowerPC::ppcState.spr[SPR_XER]), R(EAX)); // XER.CA = (input < 0 && input != INT_MIN) - } - else if (amount == 1) - { - SHR(32, R(EAX), Imm8(31)); // sign - AND(32, R(EAX), gpr.R(a)); // (sign && carry) - SAR(32, gpr.R(a), Imm8(1)); - SHL(32, R(EAX), Imm8(XER_CA_SHIFT)); - OR(32, M(&PowerPC::ppcState.spr[SPR_XER]), R(EAX)); // XER.CA = sign && carry, aka (input&0x80000001) == 0x80000001 - } - else - { - SAR(32, gpr.R(a), Imm8(amount)); - SHL(32, R(EAX), Imm8(32 - amount)); - TEST(32, R(EAX), gpr.R(a)); - JitSetCAIf(CC_NZ); - } + MOV(32, gpr.R(a), R(EAX)); } + SAR(32, gpr.R(a), Imm8(amount)); + if (inst.Rc) + ComputeRC(gpr.R(a)); + SHL(32, R(EAX), Imm8(32-amount)); + TEST(32, R(EAX), gpr.R(a)); + FixupBranch nocarry = J_CC(CC_Z); + JitSetCA(); + SetJumpTarget(nocarry); + gpr.UnlockAll(); } else { gpr.Lock(a, s); - JitClearCAOV(js.op->wantsCA, false); + JitClearCA(); gpr.BindToRegister(a, a == s, true); if (a != s) + { MOV(32, gpr.R(a), gpr.R(s)); + } + + if (inst.Rc) + { + ComputeRC(gpr.R(a)); + } + gpr.UnlockAll(); } - if (inst.Rc) - ComputeRC(gpr.R(a)); - gpr.UnlockAll(); } // count leading zeroes @@ -1950,7 +1984,10 @@ void Jit64::cntlzwx(UGeckoInstruction inst) } if (inst.Rc) + { ComputeRC(gpr.R(a)); + // TODO: Check PPC manual too + } } void Jit64::twx(UGeckoInstruction inst) diff --git a/Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp b/Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp index 97b32d9357..6798f390cc 100644 --- a/Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp +++ b/Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp @@ -1104,7 +1104,7 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, u32 exitAddress) Jit->JitSetCA(); FixupBranch cont = Jit->J(); Jit->SetJumpTarget(nocarry); - Jit->JitClearCAOV(true, false); + Jit->JitClearCA(); Jit->SetJumpTarget(cont); regNormalRegClear(RI, I); break; diff --git a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp index 1c04a7c7f1..58340b072e 100644 --- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp +++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp @@ -802,11 +802,10 @@ void EmuCodeBlock::SetFPRF(Gen::X64Reg xmm) OR(32, M(&FPSCR), R(EAX)); } -void EmuCodeBlock::JitGetAndClearCAOV(bool oe) + +void EmuCodeBlock::JitClearCA() { - if (oe) - AND(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm32(~XER_OV_MASK)); //XER.OV = 0 - BTR(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm8(29)); //carry = XER.CA, XER.CA = 0 + AND(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm32(~XER_CA_MASK)); //XER.CA = 0 } void EmuCodeBlock::JitSetCA() @@ -814,20 +813,10 @@ void EmuCodeBlock::JitSetCA() OR(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm32(XER_CA_MASK)); //XER.CA = 1 } -// Some testing shows CA is set roughly ~1/3 of the time (relative to clears), so -// branchless calculation of CA is probably faster in general. -void EmuCodeBlock::JitSetCAIf(CCFlags conditionCode) +void EmuCodeBlock::JitClearCAOV(bool oe) { - SETcc(conditionCode, R(EAX)); - MOVZX(32, 8, EAX, R(AL)); - SHL(32, R(EAX), Imm8(XER_CA_SHIFT)); - OR(32, M(&PowerPC::ppcState.spr[SPR_XER]), R(EAX)); //XER.CA = 1 -} - -void EmuCodeBlock::JitClearCAOV(bool ca, bool oe) -{ - u32 mask = (ca ? ~XER_CA_MASK : 0xFFFFFFFF) & (oe ? ~XER_OV_MASK : 0xFFFFFFFF); - if (mask == 0xFFFFFFFF) - return; - AND(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm32(mask)); + if (oe) + AND(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm32(~XER_CA_MASK & ~XER_OV_MASK)); //XER.CA, XER.OV = 0 + else + AND(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm32(~XER_CA_MASK)); //XER.CA = 0 } diff --git a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h index 579215a171..addce16e93 100644 --- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h +++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h @@ -50,10 +50,9 @@ public: void SafeWriteF32ToReg(Gen::X64Reg xmm_value, Gen::X64Reg reg_addr, s32 offset, u32 registersInUse, int flags = 0); void WriteToConstRamAddress(int accessSize, Gen::X64Reg arg, u32 address, bool swap = false); - void JitGetAndClearCAOV(bool oe); + void JitClearCA(); void JitSetCA(); - void JitSetCAIf(Gen::CCFlags conditionCode); - void JitClearCAOV(bool ca, bool oe); + void JitClearCAOV(bool oe); void ForceSinglePrecisionS(Gen::X64Reg xmm); void ForceSinglePrecisionP(Gen::X64Reg xmm); diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.cpp b/Source/Core/Core/PowerPC/PPCAnalyst.cpp index 741b453739..2c81a8447a 100644 --- a/Source/Core/Core/PowerPC/PPCAnalyst.cpp +++ b/Source/Core/Core/PowerPC/PPCAnalyst.cpp @@ -430,6 +430,7 @@ void PPCAnalyzer::SetInstructionStats(CodeBlock *block, CodeOp *code, GekkoOPInf { code->wantsCR0 = false; code->wantsCR1 = false; + code->wantsPS1 = false; if (opinfo->flags & FL_USE_FPU) block->m_fpa->any = true; @@ -457,15 +458,6 @@ void PPCAnalyzer::SetInstructionStats(CodeBlock *block, CodeOp *code, GekkoOPInf code->outputFPRF = (opinfo->flags & FL_SET_FPRF) ? true : false; code->canEndBlock = (opinfo->flags & FL_ENDBLOCK) ? true : false; - code->wantsCA = (opinfo->flags & FL_READ_CA) ? true : false; - code->outputCA = (opinfo->flags & FL_SET_CA) ? true : false; - - // mfspr/mtspr can affect/use XER, so be super careful here - if (code->inst.OPCD == 31 && code->inst.SUBOP10 == 339) // mfspr - code->wantsCA = ((code->inst.SPRU << 5) | (code->inst.SPRL & 0x1F)) == SPR_XER; - if (code->inst.OPCD == 31 && code->inst.SUBOP10 == 467) // mtspr - code->outputCA = ((code->inst.SPRU << 5) | (code->inst.SPRL & 0x1F)) == SPR_XER; - int numOut = 0; int numIn = 0; if (opinfo->flags & FL_OUT_A) @@ -723,30 +715,26 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock *block, CodeBuffer *buffer, u32 block->m_broken = true; } - // Scan for flag dependencies; assume the next block (or any branch that can leave the block) - // wants flags, to be safe. + // Scan for CR0 dependency + // assume next block wants flags to be safe bool wantsCR0 = true; bool wantsCR1 = true; + bool wantsPS1 = true; bool wantsFPRF = true; - bool wantsCA = true; for (int i = block->m_num_instructions - 1; i >= 0; i--) { - bool opWantsCR0 = code[i].wantsCR0; - bool opWantsCR1 = code[i].wantsCR1; - bool opWantsFPRF = code[i].wantsFPRF; - bool opWantsCA = code[i].wantsCA; - wantsCR0 |= opWantsCR0 || code[i].canEndBlock; - wantsCR1 |= opWantsCR1 || code[i].canEndBlock; - wantsFPRF |= opWantsFPRF || code[i].canEndBlock; - wantsCA |= opWantsCA || code[i].canEndBlock; - code[i].wantsCR0 = wantsCR0; - code[i].wantsCR1 = wantsCR1; + wantsCR0 |= code[i].wantsCR0 || code[i].canEndBlock; + wantsCR1 |= code[i].wantsCR1 || code[i].canEndBlock; + wantsPS1 |= code[i].wantsPS1 || code[i].canEndBlock; + wantsFPRF |= code[i].wantsFPRF || code[i].canEndBlock; + code[i].wantsCR0 = wantsCR0; + code[i].wantsCR1 = wantsCR1; + code[i].wantsPS1 = wantsPS1; code[i].wantsFPRF = wantsFPRF; - code[i].wantsCA = wantsCA; - wantsCR0 &= !code[i].outputCR0 || opWantsCR0; - wantsCR1 &= !code[i].outputCR1 || opWantsCR1; - wantsFPRF &= !code[i].outputFPRF || opWantsFPRF; - wantsCA &= !code[i].outputCA || opWantsCA; + wantsCR0 &= !code[i].outputCR0; + wantsCR1 &= !code[i].outputCR1; + wantsPS1 &= !code[i].outputPS1; + wantsFPRF &= !code[i].outputFPRF; } return address; } diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.h b/Source/Core/Core/PowerPC/PPCAnalyst.h index 2177889336..0916e3951e 100644 --- a/Source/Core/Core/PowerPC/PPCAnalyst.h +++ b/Source/Core/Core/PowerPC/PPCAnalyst.h @@ -33,12 +33,12 @@ struct CodeOp //16B bool isBranchTarget; bool wantsCR0; bool wantsCR1; + bool wantsPS1; bool wantsFPRF; - bool wantsCA; bool outputCR0; bool outputCR1; + bool outputPS1; bool outputFPRF; - bool outputCA; bool canEndBlock; bool skip; // followed BL-s for example }; diff --git a/Source/Core/DolphinWX/GameListCtrl.cpp b/Source/Core/DolphinWX/GameListCtrl.cpp index aa7bbdd971..18c276b8fe 100644 --- a/Source/Core/DolphinWX/GameListCtrl.cpp +++ b/Source/Core/DolphinWX/GameListCtrl.cpp @@ -397,7 +397,7 @@ static wxString NiceSizeFormat(u64 _size) // Find largest power of 2 less than _size. // div 10 to get largest named unit less than _size // 10 == log2(1024) (number of B in a KiB, KiB in a MiB, etc) - const u64 unit = IntLog2(std::max(_size, 1)) / 10; + const u64 unit = Log2(std::max(_size, 1)) / 10; const u64 unit_size = (1 << (unit * 10)); // mul 1000 for 3 decimal places, add 5 to round up, div 10 for 2 decimal places diff --git a/Source/Core/VideoBackends/OGL/StreamBuffer.cpp b/Source/Core/VideoBackends/OGL/StreamBuffer.cpp index 43cb1cb61b..91d4692b08 100644 --- a/Source/Core/VideoBackends/OGL/StreamBuffer.cpp +++ b/Source/Core/VideoBackends/OGL/StreamBuffer.cpp @@ -23,7 +23,7 @@ static u32 genBuffer() } StreamBuffer::StreamBuffer(u32 type, u32 size) -: m_buffer(genBuffer()), m_buffertype(type), m_size(ROUND_UP_POW2(size)), m_bit_per_slot(IntLog2(ROUND_UP_POW2(size) / SYNC_POINTS)) +: m_buffer(genBuffer()), m_buffertype(type), m_size(ROUND_UP_POW2(size)), m_bit_per_slot(Log2(ROUND_UP_POW2(size) / SYNC_POINTS)) { m_iterator = 0; m_used_iterator = 0; diff --git a/Source/Core/VideoCommon/TextureConversionShader.cpp b/Source/Core/VideoCommon/TextureConversionShader.cpp index b0215e4c08..76e49ce464 100644 --- a/Source/Core/VideoCommon/TextureConversionShader.cpp +++ b/Source/Core/VideoCommon/TextureConversionShader.cpp @@ -91,8 +91,8 @@ static void WriteSwizzler(char*& p, u32 format, API_TYPE ApiType) WRITE(p, " int y_block_position = uv1.y & %d;\n", ~(blkH - 1)); WRITE(p, " int y_offset_in_block = uv1.y & %d;\n", blkH - 1); - WRITE(p, " int x_virtual_position = (uv1.x << %d) + y_offset_in_block * position.z;\n", IntLog2(samples)); - WRITE(p, " int x_block_position = (x_virtual_position >> %d) & %d;\n", IntLog2(blkH), ~(blkW - 1)); + WRITE(p, " int x_virtual_position = (uv1.x << %d) + y_offset_in_block * position.z;\n", Log2(samples)); + WRITE(p, " int x_block_position = (x_virtual_position >> %d) & %d;\n", Log2(blkH), ~(blkW - 1)); if (samples == 1) { // 32 bit textures (RGBA8 and Z24) are stored in 2 cache line increments @@ -100,7 +100,7 @@ static void WriteSwizzler(char*& p, u32 format, API_TYPE ApiType) WRITE(p, " x_virtual_position = x_virtual_position << 1;\n"); } WRITE(p, " int x_offset_in_block = x_virtual_position & %d;\n", blkW - 1); - WRITE(p, " int y_offset = (x_virtual_position >> %d) & %d;\n", IntLog2(blkW), blkH - 1); + WRITE(p, " int y_offset = (x_virtual_position >> %d) & %d;\n", Log2(blkW), blkH - 1); WRITE(p, " sampleUv.x = x_offset_in_block + x_block_position;\n"); WRITE(p, " sampleUv.y = y_block_position + y_offset;\n"); diff --git a/Source/UnitTests/Common/MathUtilTest.cpp b/Source/UnitTests/Common/MathUtilTest.cpp index 9549039304..8ae757962c 100644 --- a/Source/UnitTests/Common/MathUtilTest.cpp +++ b/Source/UnitTests/Common/MathUtilTest.cpp @@ -44,17 +44,17 @@ TEST(MathUtil, IsSNAN) EXPECT_TRUE(MathUtil::IsSNAN(std::numeric_limits::signaling_NaN())); } -TEST(MathUtil, IntLog2) +TEST(MathUtil, Log2) { - EXPECT_EQ(0, IntLog2(1)); - EXPECT_EQ(1, IntLog2(2)); - EXPECT_EQ(2, IntLog2(4)); - EXPECT_EQ(3, IntLog2(8)); - EXPECT_EQ(63, IntLog2(0x8000000000000000ull)); + EXPECT_EQ(0, Log2(1)); + EXPECT_EQ(1, Log2(2)); + EXPECT_EQ(2, Log2(4)); + EXPECT_EQ(3, Log2(8)); + EXPECT_EQ(63, Log2(0x8000000000000000ull)); // Rounding behavior. - EXPECT_EQ(3, IntLog2(15)); - EXPECT_EQ(63, IntLog2(0xFFFFFFFFFFFFFFFFull)); + EXPECT_EQ(3, Log2(15)); + EXPECT_EQ(63, Log2(0xFFFFFFFFFFFFFFFFull)); } TEST(MathUtil, FlushToZero)