// Patch summary (text before the first "diff --git" line is not part of any hunk).
//
// Arm64Emitter.cpp / Arm64Emitter.h
//   * Adds ARM64XEmitter::CMN(Rn, imm, shift). It mirrors the immediate CMP
//     next to it, but passes 0 instead of 1 as the first argument of
//     EncodeAddSubImmInst (presumably selecting the add form - confirm against
//     EncodeAddSubImmInst). The destination is SP/WSP depending on Is64Bit(Rn).
//   * Adds NEG/NEGS overloads taking an ArithOption, forwarded to SUB/SUBS with
//     ZR/WZR (chosen by Is64Bit(Rd)) as the first source operand.
//
// JitArm64_Integer.cpp, JitArm64::divwx
//   * Immediate dividend equal to 0: the result is set to the immediate 0.
//   * Other immediate dividends: SDIV guarded by a CBZ on the divisor, plus a
//     CMN(RB, 1) check only when the dividend is 0x80000000; the guarded paths
//     load 0xFFFFFFFF or 0 according to the dividend's sign bit.
//   * Immediate divisor: 0, 1, -1, +/-2 and +/- powers of two are emitted as
//     shift/negate sequences; every other divisor uses the multiplier/shift
//     pair returned by SignedDivisionConstants.
//   * Register/register path: the MOVI2R + CMP test is replaced by
//     NEGS(WZR, RA) + CCMN(RB, 1, 0, CC_VS), so the temporary WA is dropped.
//
// Notes on this copy of the patch
//   * The template arguments of the three static_cast expressions were missing
//     (a bare "static_cast(x)" is not valid C++). They are restored as <s64>
//     for the inner casts and <u32> for abs_val, which is declared u32.
//     NOTE(review): verify the <s64> choice against the upstream commit.
//   * Line breaks were missing. Blank context lines are placed so that every
//     hunk matches the line counts in its @@ header; indentation is assumed to
//     be two spaces per level - confirm before applying with strict whitespace.
//   * NOTE(review): in the immediate-dividend path gpr.R(a) is first requested
//     after the CBZ has been emitted, and d may alias a while
//     BindToRegister(d, d == b) only asks for a load when d == b. Whether this
//     is safe depends on how the register cache materialises immediates, which
//     this patch does not show - confirm.
diff --git a/Source/Core/Common/Arm64Emitter.cpp b/Source/Core/Common/Arm64Emitter.cpp
index ce3c0a5103..c0d19f652f 100644
--- a/Source/Core/Common/Arm64Emitter.cpp
+++ b/Source/Core/Common/Arm64Emitter.cpp
@@ -1373,6 +1373,10 @@ void ARM64XEmitter::CMP(ARM64Reg Rn, u32 imm, bool shift)
 {
   EncodeAddSubImmInst(1, true, shift, imm, Rn, Is64Bit(Rn) ? ARM64Reg::SP : ARM64Reg::WSP);
 }
+void ARM64XEmitter::CMN(ARM64Reg Rn, u32 imm, bool shift)
+{
+  EncodeAddSubImmInst(0, true, shift, imm, Rn, Is64Bit(Rn) ? ARM64Reg::SP : ARM64Reg::WSP);
+}
 
 // Data Processing (Immediate)
 void ARM64XEmitter::MOVZ(ARM64Reg Rd, u32 imm, ShiftAmount pos)
diff --git a/Source/Core/Common/Arm64Emitter.h b/Source/Core/Common/Arm64Emitter.h
index a5a4c03e4e..5ea860a77a 100644
--- a/Source/Core/Common/Arm64Emitter.h
+++ b/Source/Core/Common/Arm64Emitter.h
@@ -896,7 +896,15 @@ public:
     CSINV(Rd, zr, zr, (CCFlags)((u32)cond ^ 1));
   }
   void NEG(ARM64Reg Rd, ARM64Reg Rs) { SUB(Rd, Is64Bit(Rd) ? ARM64Reg::ZR : ARM64Reg::WZR, Rs); }
+  void NEG(ARM64Reg Rd, ARM64Reg Rs, ArithOption Option)
+  {
+    SUB(Rd, Is64Bit(Rd) ? ARM64Reg::ZR : ARM64Reg::WZR, Rs, Option);
+  }
   void NEGS(ARM64Reg Rd, ARM64Reg Rs) { SUBS(Rd, Is64Bit(Rd) ? ARM64Reg::ZR : ARM64Reg::WZR, Rs); }
+  void NEGS(ARM64Reg Rd, ARM64Reg Rs, ArithOption Option)
+  {
+    SUBS(Rd, Is64Bit(Rd) ? ARM64Reg::ZR : ARM64Reg::WZR, Rs, Option);
+  }
   // Data-Processing 1 source
   void RBIT(ARM64Reg Rd, ARM64Reg Rn);
   void REV16(ARM64Reg Rd, ARM64Reg Rn);
@@ -1006,6 +1014,7 @@ public:
   void SUB(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift = false);
   void SUBS(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift = false);
   void CMP(ARM64Reg Rn, u32 imm, bool shift = false);
+  void CMN(ARM64Reg Rn, u32 imm, bool shift = false);
 
   // Data Processing (Immediate)
   void MOVZ(ARM64Reg Rd, u32 imm, ShiftAmount pos = ShiftAmount::Shift0);
diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Integer.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Integer.cpp
index 6b2f94d80c..989a732cee 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Integer.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Integer.cpp
@@ -10,10 +10,12 @@
 #include "Core/CoreTiming.h"
 #include "Core/PowerPC/JitArm64/Jit.h"
 #include "Core/PowerPC/JitArm64/JitArm64_RegCache.h"
+#include "Core/PowerPC/JitCommon/DivUtils.h"
 #include "Core/PowerPC/PPCTables.h"
 #include "Core/PowerPC/PowerPC.h"
 
 using namespace Arm64Gen;
+using namespace JitCommon;
 
 #define CARRY_IF_NEEDED(inst_without_carry, inst_with_carry, ...) \
   do \
@@ -1327,16 +1329,145 @@ void JitArm64::divwx(UGeckoInstruction inst)
     if (inst.Rc)
       ComputeRC0(imm_d);
   }
-  else if (gpr.IsImm(b) && gpr.GetImm(b) != 0 && gpr.GetImm(b) != UINT32_C(0xFFFFFFFF))
+  else if (gpr.IsImm(a) && gpr.GetImm(a) == 0)
   {
-    ARM64Reg WA = gpr.GetReg();
-    MOVI2R(WA, gpr.GetImm(b));
+    // Zero divided by anything is always zero
+    gpr.SetImmediate(d, 0);
+    if (inst.Rc)
+      ComputeRC0(0);
+  }
+  else if (gpr.IsImm(a))
+  {
+    const u32 dividend = gpr.GetImm(a);
+
+    gpr.BindToRegister(d, d == b);
+
+    ARM64Reg RB = gpr.R(b);
+    ARM64Reg RD = gpr.R(d);
+
+    FixupBranch overflow1 = CBZ(RB);
+    FixupBranch overflow2;
+    if (dividend == 0x80000000)
+    {
+      CMN(RB, 1);
+      overflow2 = B(CC_EQ);
+    }
+    SDIV(RD, gpr.R(a), RB);
+    FixupBranch done = B();
+
+    SetJumpTarget(overflow1);
+    if (dividend == 0x80000000)
+      SetJumpTarget(overflow2);
+
+    MOVI2R(RD, dividend & 0x80000000 ? 0xFFFFFFFF : 0);
+
+    SetJumpTarget(done);
+
+    if (inst.Rc)
+      ComputeRC0(RD);
+  }
+  else if (gpr.IsImm(b))
+  {
+    const s32 divisor = s32(gpr.GetImm(b));
 
     gpr.BindToRegister(d, d == a);
 
-    SDIV(gpr.R(d), gpr.R(a), WA);
+    // Handle 0, 1, and -1 explicitly
+    if (divisor == 0)
+    {
+      ASR(gpr.R(d), gpr.R(a), 31);
+    }
+    else if (divisor == 1)
+    {
+      if (d != a)
+        MOV(gpr.R(d), gpr.R(a));
+    }
+    else if (divisor == -1)
+    {
+      // Rd = (Ra == 0x80000000) ? 0xFFFFFFFF : -Ra
+      NEGS(gpr.R(d), gpr.R(a));
+      CSINV(gpr.R(d), gpr.R(d), ARM64Reg::WZR, CCFlags::CC_VC);
+    }
+    else if (divisor == 2 || divisor == -2)
+    {
+      ARM64Reg RA = gpr.R(a);
+      ARM64Reg RD = gpr.R(d);
 
-    gpr.Unlock(WA);
+      ADD(RD, RA, RA, ArithOption(RA, ShiftType::LSR, 31));
+
+      if (divisor < 0)
+        NEG(RD, RD, ArithOption(RD, ShiftType::ASR, 1));
+      else
+        ASR(RD, RD, 1);
+    }
+    else if (MathUtil::IsPow2(divisor) || MathUtil::IsPow2(-static_cast<s64>(divisor)))
+    {
+      const u32 abs_val = static_cast<u32>(std::abs(static_cast<s64>(divisor)));
+
+      ARM64Reg RA = gpr.R(a);
+      ARM64Reg RD = gpr.R(d);
+
+      const bool allocate_reg = a == d;
+      ARM64Reg WA = allocate_reg ? gpr.GetReg() : RD;
+
+      TST(RA, RA);
+      ADDI2R(WA, RA, abs_val - 1, WA);
+      CSEL(WA, RA, WA, CCFlags::CC_PL);
+
+      if (divisor < 0)
+        NEG(RD, WA, ArithOption(WA, ShiftType::ASR, IntLog2(abs_val)));
+      else
+        ASR(RD, WA, IntLog2(abs_val));
+
+      if (allocate_reg)
+        gpr.Unlock(WA);
+    }
+    else
+    {
+      // Optimize signed 32-bit integer division by a constant
+      Magic m = SignedDivisionConstants(divisor);
+
+      ARM64Reg WA = gpr.GetReg();
+      ARM64Reg WB = gpr.GetReg();
+      ARM64Reg RD = gpr.R(d);
+
+      ARM64Reg XA = EncodeRegTo64(WA);
+      ARM64Reg XB = EncodeRegTo64(WB);
+      ARM64Reg XD = EncodeRegTo64(RD);
+
+      SXTW(XA, gpr.R(a));
+      MOVI2R(XB, s64(m.multiplier));
+
+      if (divisor > 0 && m.multiplier < 0)
+      {
+        MUL(XD, XA, XB);
+        ADD(XD, XA, XD, ArithOption(XD, ShiftType::LSR, 32));
+        LSR(WA, WA, 31);
+        ADD(RD, WA, RD, ArithOption(RD, ShiftType::ASR, m.shift));
+      }
+      else if (divisor < 0 && m.multiplier > 0)
+      {
+        MNEG(XD, XA, XB);
+        ADD(XA, XD, XA, ArithOption(XA, ShiftType::LSR, 32));
+        LSR(RD, WA, 31);
+        ADD(RD, RD, WA, ArithOption(WA, ShiftType::ASR, m.shift));
+      }
+      else if (m.multiplier > 0)
+      {
+        MUL(XD, XA, XB);
+        ASR(XD, XD, 32 + m.shift);
+        ADD(RD, RD, WA, ArithOption(WA, ShiftType::LSR, 31));
+      }
+      else
+      {
+        MUL(XD, XA, XB);
+        LSR(XA, XD, 63);
+        ASR(XD, XD, 32 + m.shift);
+        ADD(RD, WA, RD);
+      }
+
+      gpr.Unlock(WA, WB);
+    }
 
     if (inst.Rc)
       ComputeRC0(gpr.R(d));
@@ -1347,28 +1478,24 @@ void JitArm64::divwx(UGeckoInstruction inst)
 
     gpr.BindToRegister(d, d == a || d == b);
 
-    ARM64Reg WA = gpr.GetReg();
     ARM64Reg RA = gpr.R(a);
     ARM64Reg RB = gpr.R(b);
     ARM64Reg RD = gpr.R(d);
 
-    FixupBranch slow1 = CBZ(RB);
-    MOVI2R(WA, -0x80000000LL);
-    CMP(RA, WA);
-    CCMN(RB, 1, 0, CC_EQ);
-    FixupBranch slow2 = B(CC_EQ);
+    FixupBranch overflow1 = CBZ(RB);
+    NEGS(ARM64Reg::WZR, RA);  // Is RA 0x80000000?
+    CCMN(RB, 1, 0, CC_VS);    // Is RB -1?
+    FixupBranch overflow2 = B(CC_EQ);
     SDIV(RD, RA, RB);
     FixupBranch done = B();
 
-    SetJumpTarget(slow1);
-    SetJumpTarget(slow2);
+    SetJumpTarget(overflow1);
+    SetJumpTarget(overflow2);
 
     ASR(RD, RA, 31);
 
     SetJumpTarget(done);
 
-    gpr.Unlock(WA);
-
     if (inst.Rc)
       ComputeRC0(RD);
   }