From b597ec3e081a289d9ac782586617a876535183d6 Mon Sep 17 00:00:00 2001 From: comex Date: Sun, 7 Sep 2014 16:36:25 -0400 Subject: [PATCH] Opportunistically predict BLR destinations using RET. When executing a BL-type instruction, push the new LR onto the stack, then CALL the dispatcher or linked block rather than JMPing to it. When executing BLR, compare [rsp+8] to LR, and RET if it's right, which it usually will be unless the thread was switched out. If it's not right, reset RSP to avoid overflow. This both saves a trip through the dispatcher and improves branch prediction. There is a small possibility of stack overflow anyway, which should be handled... *yawn* --- Source/Core/Core/PowerPC/Jit64/Jit.cpp | 92 +++++++++++++++++-- Source/Core/Core/PowerPC/Jit64/Jit.h | 8 +- Source/Core/Core/PowerPC/Jit64/JitAsm.cpp | 35 ++++++- Source/Core/Core/PowerPC/Jit64/Jit_Branch.cpp | 10 +- .../Core/Core/PowerPC/Jit64/Jit_Integer.cpp | 6 +- .../Core/PowerPC/JitCommon/JitAsmCommon.h | 1 + .../Core/Core/PowerPC/JitCommon/JitCache.cpp | 5 +- 7 files changed, 135 insertions(+), 22 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.cpp b/Source/Core/Core/PowerPC/Jit64/Jit.cpp index 585b7e136f..b3625c6727 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit.cpp @@ -227,31 +227,55 @@ static void ImHere() been_here[PC] = 1; } -void Jit64::Cleanup() +bool Jit64::Cleanup() { + bool did_something = false; + if (jo.optimizeGatherPipe && js.fifoBytesThisBlock > 0) { ABI_PushRegistersAndAdjustStack(0, 0); ABI_CallFunction((void *)&GPFifo::CheckGatherPipe); ABI_PopRegistersAndAdjustStack(0, 0); + did_something = true; } // SPEED HACK: MMCR0/MMCR1 should be checked at run-time, not at compile time. 
if (MMCR0.Hex || MMCR1.Hex) + { ABI_CallFunctionCCC((void *)&PowerPC::UpdatePerformanceMonitor, js.downcountAmount, jit->js.numLoadStoreInst, jit->js.numFloatingPointInst); + did_something = true; + } + + return did_something; } -void Jit64::WriteExit(u32 destination) +void Jit64::WriteExit(u32 destination, bool bl, u32 after) { + // BLR optimization has similar consequences to block linking. + if (!jo.enableBlocklink) + { + bl = false; + } + Cleanup(); + if (bl) + { + MOV(32, R(RSCRATCH2), Imm32(after)); + PUSH(RSCRATCH2); + } + SUB(32, PPCSTATE(downcount), Imm32(js.downcountAmount)); + JustWriteExit(destination, bl, after); +} + +void Jit64::JustWriteExit(u32 destination, bool bl, u32 after) +{ //If nobody has taken care of this yet (this can be removed when all branches are done) JitBlock *b = js.curBlock; JitBlock::LinkData linkData; linkData.exitAddress = destination; - linkData.exitPtrs = GetWritableCodePtr(); linkData.linkStatus = false; // Link opportunity! @@ -259,24 +283,78 @@ void Jit64::WriteExit(u32 destination) if (jo.enableBlocklink && (block = blocks.GetBlockNumberFromStartAddress(destination)) >= 0) { // It exists! Joy of joy! 
- JMP(blocks.GetBlock(block)->checkedEntry, true); + JitBlock* jb = blocks.GetBlock(block); + const u8* addr = jb->checkedEntry; + linkData.exitPtrs = GetWritableCodePtr(); + if (bl) + CALL(addr); + else + JMP(addr, true); linkData.linkStatus = true; } else { MOV(32, PPCSTATE(pc), Imm32(destination)); - JMP(asm_routines.dispatcher, true); + linkData.exitPtrs = GetWritableCodePtr(); + if (bl) + CALL(asm_routines.dispatcher); + else + JMP(asm_routines.dispatcher, true); } b->linkData.push_back(linkData); + + if (bl) + { + POP(RSCRATCH); + JustWriteExit(after, false, 0); + } } -void Jit64::WriteExitDestInRSCRATCH() +void Jit64::WriteExitDestInRSCRATCH(bool bl, u32 after) { + if (!jo.enableBlocklink) + { + bl = false; + } + if (bl) + { + MOV(32, R(RSCRATCH2), Imm32(after)); + PUSH(RSCRATCH2); + } MOV(32, PPCSTATE(pc), R(RSCRATCH)); Cleanup(); SUB(32, PPCSTATE(downcount), Imm32(js.downcountAmount)); - JMP(asm_routines.dispatcher, true); + if (bl) + { + CALL(asm_routines.dispatcher); + POP(RSCRATCH); + JustWriteExit(after, false, 0); + } + else + { + JMP(asm_routines.dispatcher, true); + } +} + +void Jit64::WriteBLRExit() +{ + if (!jo.enableBlocklink) + { + WriteExitDestInRSCRATCH(); + return; + } + MOV(32, PPCSTATE(pc), R(RSCRATCH)); + bool disturbed = Cleanup(); + if (disturbed) + MOV(32, R(RSCRATCH), PPCSTATE(pc)); + CMP(64, R(RSCRATCH), MDisp(RSP, 8)); + FixupBranch nope = J_CC(CC_NE); + SUB(32, PPCSTATE(downcount), Imm32(js.downcountAmount)); + RET(); + SetJumpTarget(nope); + MOV(32, R(RSCRATCH), Imm32(js.downcountAmount)); + JMP(asm_routines.dispatcherMispredictedBLR, true); } void Jit64::WriteRfiExitDestInRSCRATCH() diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.h b/Source/Core/Core/PowerPC/Jit64/Jit.h index d444f0f834..cface00cb3 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.h +++ b/Source/Core/Core/PowerPC/Jit64/Jit.h @@ -89,13 +89,15 @@ public: // Utilities for use by opcodes - void WriteExit(u32 destination); - void WriteExitDestInRSCRATCH(); + void 
WriteExit(u32 destination, bool bl = false, u32 after = 0); + void JustWriteExit(u32 destination, bool bl, u32 after); + void WriteExitDestInRSCRATCH(bool bl = false, u32 after = 0); + void WriteBLRExit(); void WriteExceptionExit(); void WriteExternalExceptionExit(); void WriteRfiExitDestInRSCRATCH(); void WriteCallInterpreter(UGeckoInstruction _inst); - void Cleanup(); + bool Cleanup(); void GenerateConstantOverflow(bool overflow); void GenerateConstantOverflow(s64 val); diff --git a/Source/Core/Core/PowerPC/Jit64/JitAsm.cpp b/Source/Core/Core/PowerPC/Jit64/JitAsm.cpp index dfef055459..dc307540f6 100644 --- a/Source/Core/Core/PowerPC/Jit64/JitAsm.cpp +++ b/Source/Core/Core/PowerPC/Jit64/JitAsm.cpp @@ -9,6 +9,9 @@ using namespace Gen; +// Not PowerPC state. Can't put in 'this' because it's out of range... +static void* s_saved_rsp; + // PLAN: no more block numbers - crazy opcodes just contain offset within // dynarec buffer // At this offset - 4, there is an int specifying the block number. @@ -16,7 +19,13 @@ using namespace Gen; void Jit64AsmRoutineManager::Generate() { enterCode = AlignCode16(); - ABI_PushRegistersAndAdjustStack(ABI_ALL_CALLEE_SAVED, 8); + // We need to own the beginning of RSP, so we do an extra stack adjustment + // for the shadow region before calls in this function. This call will + // waste a bit of space for a second shadow, but whatever. + ABI_PushRegistersAndAdjustStack(ABI_ALL_CALLEE_SAVED, 8, /*frame*/ 16); + // something that can't pass the BLR test + MOV(64, M(&s_saved_rsp), R(RSP)); + MOV(64, MDisp(RSP, 8), Imm32((u32)-1)); // Two statically allocated registers. 
MOV(64, R(RMEM), Imm64((u64)Memory::base)); @@ -24,8 +33,22 @@ void Jit64AsmRoutineManager::Generate() MOV(64, R(RPPCSTATE), Imm64((u64)&PowerPC::ppcState + 0x80)); const u8* outerLoop = GetCodePtr(); + ABI_PushRegistersAndAdjustStack(0, 0); ABI_CallFunction(reinterpret_cast<void *>(&CoreTiming::Advance)); + ABI_PopRegistersAndAdjustStack(0, 0); FixupBranch skipToRealDispatch = J(SConfig::GetInstance().m_LocalCoreStartupParameter.bEnableDebugging); //skip the sync and compare first time + dispatcherMispredictedBLR = GetCodePtr(); + + #if 0 // debug mispredicts + MOV(32, R(ABI_PARAM1), MDisp(RSP, 8)); // guessed_pc + ABI_PushRegistersAndAdjustStack(1 << RSCRATCH, 0); + CALL(reinterpret_cast<void *>(&ReportMispredict)); + ABI_PopRegistersAndAdjustStack(1 << RSCRATCH, 0); + #endif + + MOV(64, R(RSP), M(&s_saved_rsp)); + + SUB(32, PPCSTATE(downcount), R(RSCRATCH)); dispatcher = GetCodePtr(); // The result of slice decrementation should be in flags if somebody jumped here @@ -36,10 +59,13 @@ void Jit64AsmRoutineManager::Generate() { TEST(32, M((void*)PowerPC::GetStatePtr()), Imm32(PowerPC::CPU_STEPPING)); FixupBranch notStepping = J_CC(CC_Z); + ABI_PushRegistersAndAdjustStack(0, 0); ABI_CallFunction(reinterpret_cast<void *>(&PowerPC::CheckBreakPoints)); + ABI_PopRegistersAndAdjustStack(0, 0); TEST(32, M((void*)PowerPC::GetStatePtr()), Imm32(0xFFFFFFFF)); FixupBranch noBreakpoint = J_CC(CC_Z); - ABI_PopRegistersAndAdjustStack(ABI_ALL_CALLEE_SAVED, 8); + MOV(64, R(RSP), M(&s_saved_rsp)); + ABI_PopRegistersAndAdjustStack(ABI_ALL_CALLEE_SAVED, 8, 16); RET(); SetJumpTarget(noBreakpoint); SetJumpTarget(notStepping); @@ -120,14 +146,17 @@ void Jit64AsmRoutineManager::Generate() FixupBranch noExtException = J_CC(CC_Z); MOV(32, R(RSCRATCH), PPCSTATE(pc)); MOV(32, PPCSTATE(npc), R(RSCRATCH)); + ABI_PushRegistersAndAdjustStack(0, 0); ABI_CallFunction(reinterpret_cast<void *>(&PowerPC::CheckExternalExceptions)); + ABI_PopRegistersAndAdjustStack(0, 0); SetJumpTarget(noExtException); TEST(32, 
M((void*)PowerPC::GetStatePtr()), Imm32(0xFFFFFFFF)); J_CC(CC_Z, outerLoop); //Landing pad for drec space - ABI_PopRegistersAndAdjustStack(ABI_ALL_CALLEE_SAVED, 8); + MOV(64, R(RSP), M(&s_saved_rsp)); + ABI_PopRegistersAndAdjustStack(ABI_ALL_CALLEE_SAVED, 8, 16); RET(); GenerateCommon(); diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Branch.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Branch.cpp index 8456d56b7c..2508fe1417 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Branch.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Branch.cpp @@ -92,7 +92,7 @@ void Jit64::bx(UGeckoInstruction inst) // make idle loops go faster js.downcountAmount += 8; } - WriteExit(destination); + WriteExit(destination, inst.LK, js.compilerPC + 4); } // TODO - optimize to hell and beyond @@ -133,7 +133,7 @@ void Jit64::bcx(UGeckoInstruction inst) gpr.Flush(FLUSH_MAINTAIN_STATE); fpr.Flush(FLUSH_MAINTAIN_STATE); - WriteExit(destination); + WriteExit(destination, inst.LK, js.compilerPC + 4); if ((inst.BO & BO_DONT_CHECK_CONDITION) == 0) SetJumpTarget( pConditionDontBranch ); @@ -168,7 +168,7 @@ void Jit64::bcctrx(UGeckoInstruction inst) if (inst.LK_3) MOV(32, PPCSTATE_LR, Imm32(js.compilerPC + 4)); // LR = PC + 4; AND(32, R(RSCRATCH), Imm32(0xFFFFFFFC)); - WriteExitDestInRSCRATCH(); + WriteExitDestInRSCRATCH(inst.LK_3, js.compilerPC + 4); } else { @@ -187,7 +187,7 @@ void Jit64::bcctrx(UGeckoInstruction inst) gpr.Flush(FLUSH_MAINTAIN_STATE); fpr.Flush(FLUSH_MAINTAIN_STATE); - WriteExitDestInRSCRATCH(); + WriteExitDestInRSCRATCH(inst.LK_3, js.compilerPC + 4); // Would really like to continue the block here, but it ends. TODO. 
SetJumpTarget(b); @@ -235,7 +235,7 @@ void Jit64::bclrx(UGeckoInstruction inst) gpr.Flush(FLUSH_MAINTAIN_STATE); fpr.Flush(FLUSH_MAINTAIN_STATE); - WriteExitDestInRSCRATCH(); + WriteBLRExit(); if ((inst.BO & BO_DONT_CHECK_CONDITION) == 0) SetJumpTarget( pConditionDontBranch ); diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp index dbb6a5fbf1..79a7c5d76d 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp @@ -312,7 +312,7 @@ void Jit64::DoMergedBranch() destination = SignExt16(js.next_inst.BD << 2); else destination = js.next_compilerPC + SignExt16(js.next_inst.BD << 2); - WriteExit(destination); + WriteExit(destination, js.next_inst.LK, js.next_compilerPC + 4); } else if ((js.next_inst.OPCD == 19) && (js.next_inst.SUBOP10 == 528)) // bcctrx { @@ -320,7 +320,7 @@ void Jit64::DoMergedBranch() MOV(32, M(&LR), Imm32(js.next_compilerPC + 4)); MOV(32, R(RSCRATCH), M(&CTR)); AND(32, R(RSCRATCH), Imm32(0xFFFFFFFC)); - WriteExitDestInRSCRATCH(); + WriteExitDestInRSCRATCH(js.next_inst.LK, js.next_compilerPC + 4); } else if ((js.next_inst.OPCD == 19) && (js.next_inst.SUBOP10 == 16)) // bclrx { @@ -328,7 +328,7 @@ void Jit64::DoMergedBranch() AND(32, R(RSCRATCH), Imm32(0xFFFFFFFC)); if (js.next_inst.LK) MOV(32, M(&LR), Imm32(js.next_compilerPC + 4)); - WriteExitDestInRSCRATCH(); + WriteExitDestInRSCRATCH(js.next_inst.LK, js.next_compilerPC + 4); } else { diff --git a/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.h b/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.h index 2702db95e1..c3f6a69b5c 100644 --- a/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.h +++ b/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.h @@ -17,6 +17,7 @@ public: const u8 *enterCode; + const u8 *dispatcherMispredictedBLR; const u8 *dispatcher; const u8 *dispatcherNoCheck; const u8 *dispatcherPcInRSCRATCH; diff --git a/Source/Core/Core/PowerPC/JitCommon/JitCache.cpp 
b/Source/Core/Core/PowerPC/JitCommon/JitCache.cpp index bf1ce35596..d8fc87f449 100644 --- a/Source/Core/Core/PowerPC/JitCommon/JitCache.cpp +++ b/Source/Core/Core/PowerPC/JitCommon/JitCache.cpp @@ -364,7 +364,10 @@ using namespace Gen; void JitBlockCache::WriteLinkBlock(u8* location, const u8* address) { XEmitter emit(location); - emit.JMP(address, true); + if (*location == 0xE8) + emit.CALL(address); + else + emit.JMP(address, true); } void JitBlockCache::WriteDestroyBlock(const u8* location, u32 address)