From a40ae6883a87884c45645f38ac831ffda0597e8d Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Mon, 16 Jun 2014 22:47:10 -0500 Subject: [PATCH] Move CoreTiming::downcount to PowerPC::ppcState. This isn't technically the correct place to have the downcount variable, but it is similar to what PPSSPP does to gain a bit of extra speed on ARM. We access this variable quite a bit: it is decremented at every exit of a block. On ARM this required four instructions to load and store the value, while now it only requires two. This gives an average gain of 1 FPS in most games. Examples: Crazy Taxi: 54 FPS -> 55 FPS Luigi's Mansion: 20 FPS -> 21 FPS Wind Waker (Save Screen): 27 FPS -> 28 FPS This seems to average a 6 MHz to 16 MHz CPU core emulation improvement in the few games I've tested. --- Source/Core/Core/CoreTiming.cpp | 23 +++++++++---------- Source/Core/Core/CoreTiming.h | 1 - Source/Core/Core/FifoPlayer/FifoPlayer.cpp | 4 ++-- .../Core/PowerPC/Interpreter/Interpreter.cpp | 10 ++++---- Source/Core/Core/PowerPC/Jit64/Jit.cpp | 10 ++++---- Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp | 2 +- Source/Core/Core/PowerPC/Jit64IL/JitIL.cpp | 8 +++---- Source/Core/Core/PowerPC/JitArm32/Jit.cpp | 17 ++++++-------- Source/Core/Core/PowerPC/JitArm32/JitAsm.cpp | 1 - Source/Core/Core/PowerPC/JitArmIL/JitIL.cpp | 17 ++++++-------- .../Core/Core/PowerPC/JitArmIL/JitILAsm.cpp | 1 - Source/Core/Core/PowerPC/PowerPC.h | 5 ++++ Source/Core/Core/State.cpp | 2 +- 13 files changed, 48 insertions(+), 53 deletions(-) diff --git a/Source/Core/Core/CoreTiming.cpp b/Source/Core/Core/CoreTiming.cpp index a520fa92f1..73dfd862a7 100644 --- a/Source/Core/Core/CoreTiming.cpp +++ b/Source/Core/Core/CoreTiming.cpp @@ -46,7 +46,7 @@ Common::FifoQueue tsQueue; // event pools Event *eventPool = nullptr; -int downcount, slicelength; +int slicelength; int maxSliceLength = MAX_SLICE_LENGTH; s64 globalTimer; @@ -113,7 +113,7 @@ void UnregisterAllEvents() void Init() { - downcount = maxSliceLength; + 
PowerPC::ppcState.downcount = maxSliceLength; slicelength = maxSliceLength; globalTimer = 0; idledCycles = 0; @@ -173,7 +173,6 @@ void EventDoState(PointerWrap &p, BaseEvent* ev) void DoState(PointerWrap &p) { std::lock_guard lk(tsWriteLock); - p.Do(downcount); p.Do(slicelength); p.Do(globalTimer); p.Do(idledCycles); @@ -336,10 +335,10 @@ void SetMaximumSlice(int maximumSliceLength) void ForceExceptionCheck(int cycles) { - if (downcount > cycles) + if (PowerPC::ppcState.downcount > cycles) { - slicelength -= (downcount - cycles); // Account for cycles already executed by adjusting the slicelength - downcount = cycles; + slicelength -= (PowerPC::ppcState.downcount - cycles); // Account for cycles already executed by adjusting the slicelength + PowerPC::ppcState.downcount = cycles; } } @@ -390,9 +389,9 @@ void Advance() { MoveEvents(); - int cyclesExecuted = slicelength - downcount; + int cyclesExecuted = slicelength - PowerPC::ppcState.downcount; globalTimer += cyclesExecuted; - downcount = slicelength; + PowerPC::ppcState.downcount = slicelength; while (first) { @@ -414,14 +413,14 @@ void Advance() if (!first) { WARN_LOG(POWERPC, "WARNING - no events in queue. 
Setting downcount to 10000"); - downcount += 10000; + PowerPC::ppcState.downcount += 10000; } else { slicelength = (int)(first->time - globalTimer); if (slicelength > maxSliceLength) slicelength = maxSliceLength; - downcount = slicelength; + PowerPC::ppcState.downcount = slicelength; } if (advanceCallback) @@ -451,8 +450,8 @@ void Idle() Common::YieldCPU(); } - idledCycles += downcount; - downcount = 0; + idledCycles += PowerPC::ppcState.downcount; + PowerPC::ppcState.downcount = 0; Advance(); } diff --git a/Source/Core/Core/CoreTiming.h b/Source/Core/Core/CoreTiming.h index f064cc10ca..f7b5b3a70d 100644 --- a/Source/Core/Core/CoreTiming.h +++ b/Source/Core/Core/CoreTiming.h @@ -78,7 +78,6 @@ void SetFakeTBStartTicks(u64 val); void ForceExceptionCheck(int cycles); -extern int downcount; extern int slicelength; }; // end of namespace diff --git a/Source/Core/Core/FifoPlayer/FifoPlayer.cpp b/Source/Core/Core/FifoPlayer/FifoPlayer.cpp index b1487435c6..96fada80b0 100644 --- a/Source/Core/Core/FifoPlayer/FifoPlayer.cpp +++ b/Source/Core/Core/FifoPlayer/FifoPlayer.cpp @@ -74,7 +74,7 @@ bool FifoPlayer::Play() { m_CurrentFrame = m_FrameRangeStart; - CoreTiming::downcount = 0; + PowerPC::ppcState.downcount = 0; CoreTiming::Advance(); } else @@ -301,7 +301,7 @@ void FifoPlayer::WriteFifo(u8 *data, u32 start, u32 end) u32 cyclesUsed = elapsedCycles - m_ElapsedCycles; m_ElapsedCycles = elapsedCycles; - CoreTiming::downcount -= cyclesUsed; + PowerPC::ppcState.downcount -= cyclesUsed; CoreTiming::Advance(); } } diff --git a/Source/Core/Core/PowerPC/Interpreter/Interpreter.cpp b/Source/Core/Core/PowerPC/Interpreter/Interpreter.cpp index 5f1215f06a..6b0c7db93c 100644 --- a/Source/Core/Core/PowerPC/Interpreter/Interpreter.cpp +++ b/Source/Core/Core/PowerPC/Interpreter/Interpreter.cpp @@ -199,7 +199,7 @@ void Interpreter::SingleStep() SingleStepInner(); CoreTiming::slicelength = 1; - CoreTiming::downcount = 0; + PowerPC::ppcState.downcount = 0; CoreTiming::Advance(); if 
(PowerPC::ppcState.Exceptions) @@ -233,7 +233,7 @@ void Interpreter::Run() // Debugging friendly version of inner loop. Tries to do the timing as similarly to the // JIT as possible. Does not take into account that some instructions take multiple cycles. - while (CoreTiming::downcount > 0) + while (PowerPC::ppcState.downcount > 0) { m_EndBlock = false; int i; @@ -276,13 +276,13 @@ void Interpreter::Run() } SingleStepInner(); } - CoreTiming::downcount -= i; + PowerPC::ppcState.downcount -= i; } } else { // "fast" version of inner loop. well, it's not so fast. - while (CoreTiming::downcount > 0) + while (PowerPC::ppcState.downcount > 0) { m_EndBlock = false; @@ -291,7 +291,7 @@ void Interpreter::Run() { cycles += SingleStepInner(); } - CoreTiming::downcount -= cycles; + PowerPC::ppcState.downcount -= cycles; } } diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.cpp b/Source/Core/Core/PowerPC/Jit64/Jit.cpp index 757c15c89d..7dac00d435 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit.cpp @@ -287,7 +287,7 @@ void Jit64::WriteExit(u32 destination) { Cleanup(); - SUB(32, M(&CoreTiming::downcount), js.downcountAmount > 127 ? Imm32(js.downcountAmount) : Imm8(js.downcountAmount)); + SUB(32, M(&PowerPC::ppcState.downcount), js.downcountAmount > 127 ? Imm32(js.downcountAmount) : Imm8(js.downcountAmount)); //If nobody has taken care of this yet (this can be removed when all branches are done) JitBlock *b = js.curBlock; @@ -317,7 +317,7 @@ void Jit64::WriteExitDestInEAX() { MOV(32, M(&PC), R(EAX)); Cleanup(); - SUB(32, M(&CoreTiming::downcount), js.downcountAmount > 127 ? Imm32(js.downcountAmount) : Imm8(js.downcountAmount)); + SUB(32, M(&PowerPC::ppcState.downcount), js.downcountAmount > 127 ? 
Imm32(js.downcountAmount) : Imm8(js.downcountAmount)); JMP(asm_routines.dispatcher, true); } @@ -327,7 +327,7 @@ void Jit64::WriteRfiExitDestInEAX() MOV(32, M(&NPC), R(EAX)); Cleanup(); ABI_CallFunction(reinterpret_cast(&PowerPC::CheckExceptions)); - SUB(32, M(&CoreTiming::downcount), js.downcountAmount > 127 ? Imm32(js.downcountAmount) : Imm8(js.downcountAmount)); + SUB(32, M(&PowerPC::ppcState.downcount), js.downcountAmount > 127 ? Imm32(js.downcountAmount) : Imm8(js.downcountAmount)); JMP(asm_routines.dispatcher, true); } @@ -337,7 +337,7 @@ void Jit64::WriteExceptionExit() MOV(32, R(EAX), M(&PC)); MOV(32, M(&NPC), R(EAX)); ABI_CallFunction(reinterpret_cast(&PowerPC::CheckExceptions)); - SUB(32, M(&CoreTiming::downcount), js.downcountAmount > 127 ? Imm32(js.downcountAmount) : Imm8(js.downcountAmount)); + SUB(32, M(&PowerPC::ppcState.downcount), js.downcountAmount > 127 ? Imm32(js.downcountAmount) : Imm8(js.downcountAmount)); JMP(asm_routines.dispatcher, true); } @@ -347,7 +347,7 @@ void Jit64::WriteExternalExceptionExit() MOV(32, R(EAX), M(&PC)); MOV(32, M(&NPC), R(EAX)); ABI_CallFunction(reinterpret_cast(&PowerPC::CheckExternalExceptions)); - SUB(32, M(&CoreTiming::downcount), js.downcountAmount > 127 ? Imm32(js.downcountAmount) : Imm8(js.downcountAmount)); + SUB(32, M(&PowerPC::ppcState.downcount), js.downcountAmount > 127 ? Imm32(js.downcountAmount) : Imm8(js.downcountAmount)); JMP(asm_routines.dispatcher, true); } diff --git a/Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp b/Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp index b007095761..c4a5ed44d7 100644 --- a/Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp +++ b/Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp @@ -1726,7 +1726,7 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, u32 exitAddress) { // If a FPU exception occurs, the exception handler will read // from PC. Update PC with the latest value in case that happens. 
Jit->MOV(32, M(&PC), Imm32(InstLoc)); - Jit->SUB(32, M(&CoreTiming::downcount), Jit->js.downcountAmount > 127 ? Imm32(Jit->js.downcountAmount) : Imm8(Jit->js.downcountAmount)); + Jit->SUB(32, M(&PowerPC::ppcState.downcount), Jit->js.downcountAmount > 127 ? Imm32(Jit->js.downcountAmount) : Imm8(Jit->js.downcountAmount)); Jit->OR(32, M((void *)&PowerPC::ppcState.Exceptions), Imm32(EXCEPTION_FPU_UNAVAILABLE)); Jit->WriteExceptionExit(); Jit->SetJumpTarget(b1); diff --git a/Source/Core/Core/PowerPC/Jit64IL/JitIL.cpp b/Source/Core/Core/PowerPC/Jit64IL/JitIL.cpp index f0f1404b19..8c58ae6418 100644 --- a/Source/Core/Core/PowerPC/Jit64IL/JitIL.cpp +++ b/Source/Core/Core/PowerPC/Jit64IL/JitIL.cpp @@ -388,7 +388,7 @@ void JitIL::WriteExit(u32 destination) if (SConfig::GetInstance().m_LocalCoreStartupParameter.bJITILTimeProfiling) { ABI_CallFunction((void *)JitILProfiler::End); } - SUB(32, M(&CoreTiming::downcount), js.downcountAmount > 127 ? Imm32(js.downcountAmount) : Imm8(js.downcountAmount)); + SUB(32, M(&PowerPC::ppcState.downcount), js.downcountAmount > 127 ? Imm32(js.downcountAmount) : Imm8(js.downcountAmount)); //If nobody has taken care of this yet (this can be removed when all branches are done) JitBlock *b = js.curBlock; @@ -420,7 +420,7 @@ void JitIL::WriteExitDestInOpArg(const Gen::OpArg& arg) if (SConfig::GetInstance().m_LocalCoreStartupParameter.bJITILTimeProfiling) { ABI_CallFunction((void *)JitILProfiler::End); } - SUB(32, M(&CoreTiming::downcount), js.downcountAmount > 127 ? Imm32(js.downcountAmount) : Imm8(js.downcountAmount)); + SUB(32, M(&PowerPC::ppcState.downcount), js.downcountAmount > 127 ? Imm32(js.downcountAmount) : Imm8(js.downcountAmount)); JMP(asm_routines.dispatcher, true); } @@ -433,7 +433,7 @@ void JitIL::WriteRfiExitDestInOpArg(const Gen::OpArg& arg) ABI_CallFunction((void *)JitILProfiler::End); } ABI_CallFunction(reinterpret_cast(&PowerPC::CheckExceptions)); - SUB(32, M(&CoreTiming::downcount), js.downcountAmount > 127 ? 
Imm32(js.downcountAmount) : Imm8(js.downcountAmount)); + SUB(32, M(&PowerPC::ppcState.downcount), js.downcountAmount > 127 ? Imm32(js.downcountAmount) : Imm8(js.downcountAmount)); JMP(asm_routines.dispatcher, true); } @@ -446,7 +446,7 @@ void JitIL::WriteExceptionExit() MOV(32, R(EAX), M(&PC)); MOV(32, M(&NPC), R(EAX)); ABI_CallFunction(reinterpret_cast(&PowerPC::CheckExceptions)); - SUB(32, M(&CoreTiming::downcount), js.downcountAmount > 127 ? Imm32(js.downcountAmount) : Imm8(js.downcountAmount)); + SUB(32, M(&PowerPC::ppcState.downcount), js.downcountAmount > 127 ? Imm32(js.downcountAmount) : Imm8(js.downcountAmount)); JMP(asm_routines.dispatcher, true); } diff --git a/Source/Core/Core/PowerPC/JitArm32/Jit.cpp b/Source/Core/Core/PowerPC/JitArm32/Jit.cpp index ca9def624d..5d814cedd6 100644 --- a/Source/Core/Core/PowerPC/JitArm32/Jit.cpp +++ b/Source/Core/Core/PowerPC/JitArm32/Jit.cpp @@ -134,22 +134,19 @@ void JitArm::Cleanup() void JitArm::DoDownCount() { ARMReg rA = gpr.GetReg(); - ARMReg rB = gpr.GetReg(); - MOVI2R(rA, (u32)&CoreTiming::downcount); - LDR(rB, rA); + LDR(rA, R9, PPCSTATE_OFF(downcount)); if (js.downcountAmount < 255) // We can enlarge this if we used rotations { - SUBS(rB, rB, js.downcountAmount); - STR(rB, rA); + SUBS(rA, rA, js.downcountAmount); } else { - ARMReg rC = gpr.GetReg(false); - MOVI2R(rC, js.downcountAmount); - SUBS(rB, rB, rC); - STR(rB, rA); + ARMReg rB = gpr.GetReg(false); + MOVI2R(rB, js.downcountAmount); + SUBS(rA, rA, rB); } - gpr.Unlock(rA, rB); + STR(rA, R9, PPCSTATE_OFF(downcount)); + gpr.Unlock(rA); } void JitArm::WriteExitDestInR(ARMReg Reg) { diff --git a/Source/Core/Core/PowerPC/JitArm32/JitAsm.cpp b/Source/Core/Core/PowerPC/JitArm32/JitAsm.cpp index fae765798e..2ae8edba21 100644 --- a/Source/Core/Core/PowerPC/JitArm32/JitAsm.cpp +++ b/Source/Core/Core/PowerPC/JitArm32/JitAsm.cpp @@ -95,7 +95,6 @@ void JitArmAsmRoutineManager::Generate() // consumed by CALL. 
SUB(_SP, _SP, 4); - MOVI2R(R0, (u32)&CoreTiming::downcount); MOVI2R(R9, (u32)&PowerPC::ppcState.spr[0]); FixupBranch skipToRealDispatcher = B(); diff --git a/Source/Core/Core/PowerPC/JitArmIL/JitIL.cpp b/Source/Core/Core/PowerPC/JitArmIL/JitIL.cpp index 65f6f44d27..f47b6f2536 100644 --- a/Source/Core/Core/PowerPC/JitArmIL/JitIL.cpp +++ b/Source/Core/Core/PowerPC/JitArmIL/JitIL.cpp @@ -81,22 +81,19 @@ void JitArmIL::Break(UGeckoInstruction _inst) void JitArmIL::DoDownCount() { - ARMReg rA = R14; - ARMReg rB = R12; - MOVI2R(rA, (u32)&CoreTiming::downcount); - LDR(rB, rA); + ARMReg rA = R12; + LDR(rA, R9, PPCSTATE_OFF(downcount)); if (js.downcountAmount < 255) // We can enlarge this if we used rotations { - SUBS(rB, rB, js.downcountAmount); - STR(rB, rA); + SUBS(rA, rA, js.downcountAmount); } else { - ARMReg rC = R11; - MOVI2R(rC, js.downcountAmount); - SUBS(rB, rB, rC); - STR(rB, rA); + ARMReg rB = R11; + MOVI2R(rB, js.downcountAmount); + SUBS(rA, rA, rB); } + STR(rA, R9, PPCSTATE_OFF(downcount)); } void JitArmIL::WriteExitDestInReg(ARMReg Reg) diff --git a/Source/Core/Core/PowerPC/JitArmIL/JitILAsm.cpp b/Source/Core/Core/PowerPC/JitArmIL/JitILAsm.cpp index 56e2ce1d67..8749e01a42 100644 --- a/Source/Core/Core/PowerPC/JitArmIL/JitILAsm.cpp +++ b/Source/Core/Core/PowerPC/JitArmIL/JitILAsm.cpp @@ -27,7 +27,6 @@ void JitArmILAsmRoutineManager::Generate() // consumed by CALL. SUB(_SP, _SP, 4); - MOVI2R(R0, (u32)&CoreTiming::downcount); MOVI2R(R9, (u32)&PowerPC::ppcState.spr[0]); FixupBranch skipToRealDispatcher = B(); diff --git a/Source/Core/Core/PowerPC/PowerPC.h b/Source/Core/Core/PowerPC/PowerPC.h index 8952c7f346..107c63da95 100644 --- a/Source/Core/Core/PowerPC/PowerPC.h +++ b/Source/Core/Core/PowerPC/PowerPC.h @@ -46,6 +46,11 @@ struct GC_ALIGNED64(PowerPCState) // Exception management. 
volatile u32 Exceptions; + // Downcount for determining when we need to do timing + // This isn't quite the right location for it, but it is here to accelerate the ARM JIT + // This variable should be inside of the CoreTiming namespace if we wanted to be correct. + int downcount; + u32 sr[16]; // Segment registers. u32 DebugCount; diff --git a/Source/Core/Core/State.cpp b/Source/Core/Core/State.cpp index afd56be29e..681411dbb4 100644 --- a/Source/Core/Core/State.cpp +++ b/Source/Core/Core/State.cpp @@ -63,7 +63,7 @@ static Common::Event g_compressAndDumpStateSyncEvent; static std::thread g_save_thread; // Don't forget to increase this after doing changes on the savestate system -static const u32 STATE_VERSION = 26; +static const u32 STATE_VERSION = 27; enum {