diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.cpp b/Source/Core/Core/PowerPC/JitArm64/Jit.cpp
index 664f8a6525..f8de18d195 100644
--- a/Source/Core/Core/PowerPC/JitArm64/Jit.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/Jit.cpp
@@ -374,7 +374,8 @@ void JitArm64::EmitStoreMembase(const ARM64Reg& msr)
   gpr.Unlock(WD);
 }
 
-void JitArm64::WriteExit(u32 destination, bool LK, u32 exit_address_after_return)
+void JitArm64::WriteExit(u32 destination, bool LK, u32 exit_address_after_return,
+                         ARM64Reg exit_address_after_return_reg)
 {
   Cleanup();
   EndTimeProfile(js.curBlock);
@@ -386,11 +387,16 @@ void JitArm64::WriteExit(u32 destination, bool LK, u32 exit_address_after_return
   if (LK)
   {
     // Push {ARM_PC; PPC_PC} on the stack
-    MOVI2R(ARM64Reg::X1, exit_address_after_return);
+    ARM64Reg reg_to_push = exit_address_after_return_reg;
+    if (exit_address_after_return_reg == ARM64Reg::INVALID_REG)
+    {
+      MOVI2R(ARM64Reg::X1, exit_address_after_return);
+      reg_to_push = ARM64Reg::X1;
+    }
     constexpr s32 adr_offset = JitArm64BlockCache::BLOCK_LINK_SIZE + sizeof(u32) * 2;
     host_address_after_return = GetCodePtr() + adr_offset;
     ADR(ARM64Reg::X0, adr_offset);
-    STP(IndexType::Pre, ARM64Reg::X0, ARM64Reg::X1, ARM64Reg::SP, -16);
+    STP(IndexType::Pre, ARM64Reg::X0, reg_to_push, ARM64Reg::SP, -16);
   }
 
   constexpr size_t primary_farcode_size = 3 * sizeof(u32);
@@ -457,7 +463,8 @@ void JitArm64::WriteExit(u32 destination, bool LK, u32 exit_address_after_return
   SwitchToNearCode();
 }
 
-void JitArm64::WriteExit(Arm64Gen::ARM64Reg dest, bool LK, u32 exit_address_after_return)
+void JitArm64::WriteExit(Arm64Gen::ARM64Reg dest, bool LK, u32 exit_address_after_return,
+                         ARM64Reg exit_address_after_return_reg)
 {
   if (dest != DISPATCHER_PC)
     MOV(DISPATCHER_PC, dest);
@@ -475,11 +482,16 @@ void JitArm64::WriteExit(Arm64Gen::ARM64Reg dest, bool LK, u32 exit_address_afte
   else
   {
     // Push {ARM_PC, PPC_PC} on the stack
-    MOVI2R(ARM64Reg::X1, exit_address_after_return);
+    ARM64Reg reg_to_push = exit_address_after_return_reg;
+    if (exit_address_after_return_reg == ARM64Reg::INVALID_REG)
+    {
+      MOVI2R(ARM64Reg::X1, exit_address_after_return);
+      reg_to_push = ARM64Reg::X1;
+    }
     constexpr s32 adr_offset = sizeof(u32) * 3;
     const u8* host_address_after_return = GetCodePtr() + adr_offset;
     ADR(ARM64Reg::X0, adr_offset);
-    STP(IndexType::Pre, ARM64Reg::X0, ARM64Reg::X1, ARM64Reg::SP, -16);
+    STP(IndexType::Pre, ARM64Reg::X0, reg_to_push, ARM64Reg::SP, -16);
     BL(dispatcher);
 
     DEBUG_ASSERT(GetCodePtr() == host_address_after_return || HasWriteFailed());
@@ -515,26 +527,43 @@ void JitArm64::WriteExit(Arm64Gen::ARM64Reg dest, bool LK, u32 exit_address_afte
   }
 }
 
-void JitArm64::FakeLKExit(u32 exit_address_after_return)
+void JitArm64::FakeLKExit(u32 exit_address_after_return, ARM64Reg exit_address_after_return_reg)
 {
   if (!m_enable_blr_optimization)
     return;
 
   // We may need to fake the BLR stack on inlined CALL instructions.
   // Else we can't return to this location any more.
-  gpr.Lock(ARM64Reg::W30);
-  ARM64Reg after_reg = gpr.GetReg();
+  if (exit_address_after_return_reg != ARM64Reg::W30)
+  {
+    // Do not lock W30 if it is the same as the exit address register, since
+    // it's already locked. It'll only get clobbered at the BL (below) where
+    // we do not need its value anymore.
+    // NOTE: This means W30 won't contain the return address anymore after this
+    // function has been called!
+    gpr.Lock(ARM64Reg::W30);
+  }
+  ARM64Reg after_reg = exit_address_after_return_reg;
+  if (exit_address_after_return_reg == ARM64Reg::INVALID_REG)
+  {
+    after_reg = gpr.GetReg();
+    MOVI2R(after_reg, exit_address_after_return);
+  }
   ARM64Reg code_reg = gpr.GetReg();
-  MOVI2R(after_reg, exit_address_after_return);
   constexpr s32 adr_offset = sizeof(u32) * 3;
   const u8* host_address_after_return = GetCodePtr() + adr_offset;
   ADR(EncodeRegTo64(code_reg), adr_offset);
   STP(IndexType::Pre, EncodeRegTo64(code_reg), EncodeRegTo64(after_reg), ARM64Reg::SP, -16);
-  gpr.Unlock(after_reg, code_reg);
+  gpr.Unlock(code_reg);
+  if (after_reg != exit_address_after_return_reg)
+    gpr.Unlock(after_reg);
 
   FixupBranch skip_exit = BL();
   DEBUG_ASSERT(GetCodePtr() == host_address_after_return || HasWriteFailed());
-  gpr.Unlock(ARM64Reg::W30);
+  if (exit_address_after_return_reg != ARM64Reg::W30)
+  {
+    gpr.Unlock(ARM64Reg::W30);
+  }
 
   // Write the regular exit node after the return.
   JitBlock* b = js.curBlock;
diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.h b/Source/Core/Core/PowerPC/JitArm64/Jit.h
index df71e70a93..7025ffaa2d 100644
--- a/Source/Core/Core/PowerPC/JitArm64/Jit.h
+++ b/Source/Core/Core/PowerPC/JitArm64/Jit.h
@@ -315,8 +315,12 @@ protected:
   void EmitStoreMembase(const Arm64Gen::ARM64Reg& msr);
 
   // Exits
-  void WriteExit(u32 destination, bool LK = false, u32 exit_address_after_return = 0);
-  void WriteExit(Arm64Gen::ARM64Reg dest, bool LK = false, u32 exit_address_after_return = 0);
+  void
+  WriteExit(u32 destination, bool LK = false, u32 exit_address_after_return = 0,
+            Arm64Gen::ARM64Reg exit_address_after_return_reg = Arm64Gen::ARM64Reg::INVALID_REG);
+  void
+  WriteExit(Arm64Gen::ARM64Reg dest, bool LK = false, u32 exit_address_after_return = 0,
+            Arm64Gen::ARM64Reg exit_address_after_return_reg = Arm64Gen::ARM64Reg::INVALID_REG);
   void WriteExceptionExit(u32 destination, bool only_external = false,
                           bool always_exception = false);
   void WriteExceptionExit(Arm64Gen::ARM64Reg dest, bool only_external = false,
@@ -325,7 +329,9 @@ protected:
   void WriteConditionalExceptionExit(int exception, Arm64Gen::ARM64Reg temp_gpr,
                                      Arm64Gen::ARM64Reg temp_fpr = Arm64Gen::ARM64Reg::INVALID_REG,
                                      u64 increment_sp_on_exit = 0);
-  void FakeLKExit(u32 exit_address_after_return);
+  void
+  FakeLKExit(u32 exit_address_after_return,
+             Arm64Gen::ARM64Reg exit_address_after_return_reg = Arm64Gen::ARM64Reg::INVALID_REG);
   void WriteBLRExit(Arm64Gen::ARM64Reg dest);
 
   Arm64Gen::FixupBranch JumpIfCRFieldBit(int field, int bit, bool jump_if_set);
diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Branch.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Branch.cpp
index edb31f7d64..62f04584ec 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Branch.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Branch.cpp
@@ -79,12 +79,12 @@ void JitArm64::bx(UGeckoInstruction inst)
   INSTRUCTION_START
   JITDISABLE(bJITBranchOff);
 
+  ARM64Reg WA = ARM64Reg::INVALID_REG;
   if (inst.LK)
   {
-    ARM64Reg WA = gpr.GetReg();
+    WA = gpr.GetReg();
     MOVI2R(WA, js.compilerPC + 4);
     STR(IndexType::Unsigned, WA, PPC_REG, PPCSTATE_OFF_SPR(SPR_LR));
-    gpr.Unlock(WA);
   }
 
   if (!js.isLastInstruction)
@@ -94,8 +94,12 @@ void JitArm64::bx(UGeckoInstruction inst)
       // We have to fake the stack as the RET instruction was not
      // found in the same block. This is a big overhead, but still
      // better than calling the dispatcher.
-      FakeLKExit(js.compilerPC + 4);
+      FakeLKExit(js.compilerPC + 4, WA);
     }
+
+    if (WA != ARM64Reg::INVALID_REG)
+      gpr.Unlock(WA);
+
     return;
   }
 
@@ -104,19 +108,24 @@ void JitArm64::bx(UGeckoInstruction inst)
 
   if (js.op->branchIsIdleLoop)
   {
-    // make idle loops go faster
-    ARM64Reg WA = gpr.GetReg();
-    ARM64Reg XA = EncodeRegTo64(WA);
+    if (WA != ARM64Reg::INVALID_REG)
+      gpr.Unlock(WA);
 
-    MOVP2R(XA, &CoreTiming::GlobalIdle);
-    BLR(XA);
-    gpr.Unlock(WA);
+    // make idle loops go faster
+    ARM64Reg WB = gpr.GetReg();
+    ARM64Reg XB = EncodeRegTo64(WB);
+
+    MOVP2R(XB, &CoreTiming::GlobalIdle);
+    BLR(XB);
+    gpr.Unlock(WB);
 
     WriteExceptionExit(js.op->branchTo);
     return;
   }
 
-  WriteExit(js.op->branchTo, inst.LK, js.compilerPC + 4);
+  WriteExit(js.op->branchTo, inst.LK, js.compilerPC + 4, inst.LK ? WA : ARM64Reg::INVALID_REG);
+  if (WA != ARM64Reg::INVALID_REG)
+    gpr.Unlock(WA);
 }
 
@@ -125,6 +134,8 @@ void JitArm64::bcx(UGeckoInstruction inst)
 {
   INSTRUCTION_START
   JITDISABLE(bJITBranchOff);
 
   ARM64Reg WA = gpr.GetReg();
+  ARM64Reg WB = inst.LK ? gpr.GetReg() : WA;
+
   FixupBranch pCTRDontBranch;
   if ((inst.BO & BO_DONT_DECREMENT_FLAG) == 0) // Decrement and test CTR
   {
@@ -156,7 +167,7 @@ void JitArm64::bcx(UGeckoInstruction inst)
     STR(IndexType::Unsigned, WA, PPC_REG, PPCSTATE_OFF_SPR(SPR_LR));
   }
-  gpr.Flush(FlushMode::MaintainState, WA);
+  gpr.Flush(FlushMode::MaintainState, WB);
   fpr.Flush(FlushMode::MaintainState, ARM64Reg::INVALID_REG);
 
   if (js.op->branchIsIdleLoop)
   {
@@ -171,7 +182,7 @@ void JitArm64::bcx(UGeckoInstruction inst)
   }
   else
   {
-    WriteExit(js.op->branchTo, inst.LK, js.compilerPC + 4);
+    WriteExit(js.op->branchTo, inst.LK, js.compilerPC + 4, inst.LK ? WA : ARM64Reg::INVALID_REG);
   }
 
   SwitchToNearCode();
@@ -189,6 +200,8 @@ void JitArm64::bcx(UGeckoInstruction inst)
   }
 
   gpr.Unlock(WA);
+  if (WB != WA)
+    gpr.Unlock(WB);
 }
 
 void JitArm64::bcctrx(UGeckoInstruction inst)
@@ -211,12 +224,12 @@ void JitArm64::bcctrx(UGeckoInstruction inst)
   gpr.Flush(FlushMode::All, ARM64Reg::INVALID_REG);
   fpr.Flush(FlushMode::All, ARM64Reg::INVALID_REG);
 
+  ARM64Reg WB = ARM64Reg::INVALID_REG;
   if (inst.LK_3)
   {
-    ARM64Reg WB = gpr.GetReg();
+    WB = gpr.GetReg();
     MOVI2R(WB, js.compilerPC + 4);
     STR(IndexType::Unsigned, WB, PPC_REG, PPCSTATE_OFF_SPR(SPR_LR));
-    gpr.Unlock(WB);
   }
 
   ARM64Reg WA = gpr.GetReg();
@@ -224,8 +237,10 @@ void JitArm64::bcctrx(UGeckoInstruction inst)
   LDR(IndexType::Unsigned, WA, PPC_REG, PPCSTATE_OFF_SPR(SPR_CTR));
   AND(WA, WA, LogicalImm(~0x3, 32));
 
-  WriteExit(WA, inst.LK_3, js.compilerPC + 4);
+  WriteExit(WA, inst.LK_3, js.compilerPC + 4, inst.LK_3 ? WB : ARM64Reg::INVALID_REG);
 
+  if (WB != ARM64Reg::INVALID_REG)
+    gpr.Unlock(WB);
   gpr.Unlock(WA);
 }
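
Note (not part of the patch): a minimal, self-contained C++ sketch of the calling pattern the new exit_address_after_return_reg parameter enables. This is not Dolphin code; Reg, Emitter, and PushExitFrame are invented stand-ins for ARM64Reg, the JIT emitter, and the STP that pushes the exit frame. The point it illustrates is simply that a caller which already holds the PPC return address in a register passes that register down, and the callee only emits MOVI2R when it receives the INVALID_REG-style sentinel.

#include <cstdint>
#include <cstdio>

// Stand-in for Arm64Gen::ARM64Reg; INVALID plays the role of INVALID_REG.
enum class Reg { INVALID, W1, W30 };

struct Emitter
{
  int instructions = 0;
  void MOVI2R(Reg, std::uint32_t imm)
  {
    ++instructions;
    std::printf("MOVI2R 0x%08X\n", static_cast<unsigned>(imm));
  }
  void PushExitFrame(Reg r)
  {
    ++instructions;
    std::printf("STP using reg %d\n", static_cast<int>(r));
  }
};

// Mirrors the shape of the new WriteExit/FakeLKExit parameters: reuse the
// caller's register if one was provided, otherwise materialize the immediate
// into a scratch register first.
void WriteExit(Emitter& emit, std::uint32_t exit_address_after_return,
               Reg exit_address_after_return_reg = Reg::INVALID)
{
  Reg reg_to_push = exit_address_after_return_reg;
  if (reg_to_push == Reg::INVALID)
  {
    emit.MOVI2R(Reg::W1, exit_address_after_return);
    reg_to_push = Reg::W1;
  }
  emit.PushExitFrame(reg_to_push);
}

int main()
{
  Emitter without_reg, with_reg;
  WriteExit(without_reg, 0x80001234);         // sentinel path: MOVI2R + push
  WriteExit(with_reg, 0x80001234, Reg::W30);  // reuse path: push only
  std::printf("%d vs %d emitted instructions\n", without_reg.instructions, with_reg.instructions);
  return 0;
}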