diff --git a/Source/Core/Core/Src/PowerPC/Jit64IL/IR.cpp b/Source/Core/Core/Src/PowerPC/Jit64IL/IR.cpp index 9271532cb5..3545435881 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64IL/IR.cpp +++ b/Source/Core/Core/Src/PowerPC/Jit64IL/IR.cpp @@ -78,14 +78,30 @@ on the test I've been working on (which bounded by JIT performance and doesn't use any floating-point), it's roughly 25% faster than the current JIT, with the edge over the current JIT mostly due to the fast memory optimization. +Update on perf: +I've been doing a bit more tweaking for a small perf improvement (in the +range of 5-10%). That said, it's getting to the point where I'm simply +not seeing potential for improvements to codegen, at least for long, +straightforward blocks. For one long block that's at the top of my samples, +I've managed to get the bloat% (number of instructions compared to PPC +equivalent) down to 225%, and I can't really see it going down much further. +It looks like the most promising paths to further improvement for pure +integer code are more aggresively combining blocks and dead condition +register elimination, which should be very helpful for small blocks. + TODO (in no particular order): -Floating-point JIT (both paired and unpaired): currently falls back - to the interpreter +Floating-point JIT (both paired and unpaired) + (very large win for FP code, no effect for integer code) +Inter-block dead condition register elimination (Likely significant win + combined with optimized conditions) Optimize conditions for conditional branches. -Inter-block dead register elimination, especially for CR0. +General dead register elimination. Inter-block inlining. -Track down a few correctness bugs. -Implement a select instruction +Track down a few correctness bugs (I think there's something wrong + with my branches, but I haven't been able to figure it out). +Specialized slw/srw/sraw; I think there are some tricks that could + have a non-trivial effect, and there are significantly shorter + implementations for 64-bit involving abusing 64-bit shifts. 64-bit compat (it should only be a few tweaks to register allocation and the load/store code) Scheduling to reduce register pressure: PowerPC compilers like to push @@ -93,8 +109,16 @@ Scheduling to reduce register pressure: PowerPC compilers like to push x86 processors, which are short on registers and extremely good at instruction reordering. Common subexpression elimination -Optimize load of sum using complex addressing (partially implemented) +Optimize load/store of sum using complex addressing (partially implemented) Implement idle-skipping +Loop optimizations (loop-carried registers, LICM); not sure how much + this will help on top of dead register elimination +Fold loads (both register and memory) into arithmetic operations +Code refactoring/cleanup +Investigate performance of the JIT itself; this doesn't affect + framerates significantly, but it does take a visible amount + of time for a complicated piece of code like a video decoder + to compile. */ @@ -492,6 +516,9 @@ struct RegInfo { exitNumber = 0; MakeProfile = UseProfile = false; } + + private: + RegInfo(RegInfo&); // DO NOT IMPLEMENT }; static void regMarkUse(RegInfo& R, InstLoc I, InstLoc Op, unsigned OpNum) { @@ -635,48 +662,119 @@ static void regEmitBinInst(RegInfo& RI, InstLoc I, RI.regs[reg] = I; } -static void regEmitMemLoad(RegInfo& RI, InstLoc I, unsigned Size) { - X64Reg reg; - unsigned offset; - - if (getOpcode(*getOp1(I)) == Add && isImm(*getOp2(getOp1(I)))) { - offset = RI.Build->GetImmValue(getOp2(getOp1(I))); - reg = regBinLHSReg(RI, getOp1(I)); - if (RI.IInfo[I - RI.FirstI] & 4) - regClearInst(RI, getOp1(getOp1(I))); - } else { - offset = 0; - reg = regBinLHSReg(RI, I); - if (RI.IInfo[I - RI.FirstI] & 4) - regClearInst(RI, getOp1(I)); +// Mark and calculation routines for profiled load/store addresses +// Could be extended to unprofiled addresses. +// FIXME: Finish/activate! +static void regMarkMemAddress(RegInfo& RI, InstLoc I, InstLoc AI, unsigned OpNum) { + if (isImm(*AI)) { + unsigned addr = RI.Build->GetImmValue(AI); + if (Memory::IsRAMAddress(addr)) + return; } - if (RI.UseProfile) { - unsigned curLoad = ProfiledLoads[RI.numProfiledLoads++]; - if (!(curLoad & 0x0C000000)) { - if (regReadUse(RI, I)) { - unsigned addr = (u32)Memory::base - (curLoad & 0xC0000000) + offset; - RI.Jit->MOVZX(32, Size, reg, MDisp(reg, addr)); - RI.Jit->BSWAP(Size, reg); - RI.regs[reg] = I; - } + if (getOpcode(*AI) == Add && isImm(*getOp2(AI))) { + regMarkUse(RI, I, getOp1(AI), OpNum); + return; + } + regMarkUse(RI, I, AI, OpNum); +} + +static void regClearDeadMemAddress(RegInfo& RI, InstLoc I, InstLoc AI, unsigned OpNum) { + if (!(RI.IInfo[I - RI.FirstI] & (2 << OpNum))) + return; + if (isImm(*AI)) { + unsigned addr = RI.Build->GetImmValue(AI); + if (Memory::IsRAMAddress(addr)) { return; } } - if (offset) { - RI.Jit->ADD(32, R(reg), Imm32(offset)); + InstLoc AddrBase; + if (getOpcode(*AI) == Add && isImm(*getOp2(AI))) { + AddrBase = getOp1(AI); + } else { + AddrBase = AI; } + regClearInst(RI, AddrBase); +} + +static OpArg regBuildMemAddress(RegInfo& RI, InstLoc I, InstLoc AI, + unsigned OpNum, unsigned Size, X64Reg* dest, + bool Profiled, + unsigned ProfileOffset = 0) { + if (isImm(*AI)) { + unsigned addr = RI.Build->GetImmValue(AI); + if (Memory::IsRAMAddress(addr)) { + if (dest) + *dest = regFindFreeReg(RI); + if (Profiled) + return M((void*)((u32)Memory::base + (addr & Memory::MEMVIEW32_MASK))); + return M((void*)addr); + } + } + unsigned offset; + InstLoc AddrBase; + if (getOpcode(*AI) == Add && isImm(*getOp2(AI))) { + offset = RI.Build->GetImmValue(getOp2(AI)); + AddrBase = getOp1(AI); + } else { + offset = 0; + AddrBase = AI; + } + X64Reg baseReg; + if (RI.IInfo[I - RI.FirstI] & (2 << OpNum)) { + baseReg = regEnsureInReg(RI, AddrBase); + regClearInst(RI, AddrBase); + if (dest) + *dest = baseReg; + } else if (dest) { + X64Reg reg = regFindFreeReg(RI); + if (!regLocForInst(RI, AddrBase).IsSimpleReg()) { + RI.Jit->MOV(32, R(reg), regLocForInst(RI, AddrBase)); + baseReg = reg; + } else { + baseReg = regLocForInst(RI, AddrBase).GetSimpleReg(); + } + *dest = reg; + } else { + baseReg = regEnsureInReg(RI, AddrBase); + } + + if (Profiled) { + return MDisp(baseReg, (u32)Memory::base + offset + ProfileOffset); + } + return MDisp(baseReg, offset); +} +// end FIXME + +static void regEmitMemLoad(RegInfo& RI, InstLoc I, unsigned Size) { + if (RI.UseProfile) { + unsigned curLoad = ProfiledLoads[RI.numProfiledLoads++]; + if (!(curLoad & 0x0C000000)) { + X64Reg reg; + OpArg addr = regBuildMemAddress(RI, I, getOp1(I), 1, + Size, ®, true, + -(curLoad & 0xC0000000)); + RI.Jit->MOVZX(32, Size, reg, addr); + RI.Jit->BSWAP(Size, reg); + if (regReadUse(RI, I)) + RI.regs[reg] = I; + return; + } + } + X64Reg reg; + OpArg addr = regBuildMemAddress(RI, I, getOp1(I), 1, Size, ®, false); + RI.Jit->LEA(32, ECX, addr); if (RI.MakeProfile) { - RI.Jit->MOV(32, M(&ProfiledLoads[RI.numProfiledLoads++]), R(reg)); + RI.Jit->MOV(32, M(&ProfiledLoads[RI.numProfiledLoads++]), R(ECX)); } - RI.Jit->TEST(32, R(reg), Imm32(0x0C000000)); + RI.Jit->TEST(32, R(ECX), Imm32(0x0C000000)); FixupBranch argh = RI.Jit->J_CC(CC_Z); if (reg != EAX) RI.Jit->PUSH(32, R(EAX)); switch (Size) { - case 32: RI.Jit->ABI_CallFunctionR(thunks.ProtectFunction((void *)&Memory::Read_U32, 1), reg); break; - case 16: RI.Jit->ABI_CallFunctionR(thunks.ProtectFunction((void *)&Memory::Read_U16, 1), reg); break; - case 8: RI.Jit->ABI_CallFunctionR(thunks.ProtectFunction((void *)&Memory::Read_U8, 1), reg); break; + case 32: RI.Jit->ABI_CallFunctionR(thunks.ProtectFunction((void *)&Memory::Read_U32, 1), ECX); break; + case 16: RI.Jit->ABI_CallFunctionR(thunks.ProtectFunction((void *)&Memory::Read_U16, 1), ECX); break; + case 8: RI.Jit->ABI_CallFunctionR(thunks.ProtectFunction((void *)&Memory::Read_U8, 1), ECX); break; } if (reg != EAX) { RI.Jit->MOV(32, R(reg), R(EAX)); @@ -684,41 +782,87 @@ static void regEmitMemLoad(RegInfo& RI, InstLoc I, unsigned Size) { } FixupBranch arg2 = RI.Jit->J(); RI.Jit->SetJumpTarget(argh); - RI.Jit->UnsafeLoadRegToReg(reg, reg, Size, 0, false); + RI.Jit->UnsafeLoadRegToReg(ECX, reg, Size, 0, false); RI.Jit->SetJumpTarget(arg2); if (regReadUse(RI, I)) RI.regs[reg] = I; } +static OpArg regSwappedImmForConst(RegInfo& RI, InstLoc I, unsigned Size) { + unsigned imm = RI.Build->GetImmValue(I); + if (Size == 32) { + imm = Common::swap32(imm); + return Imm32(imm); + } else if (Size == 16) { + imm = Common::swap16(imm); + return Imm16(imm); + } else { + return Imm8(imm); + } +} + +static OpArg regImmForConst(RegInfo& RI, InstLoc I, unsigned Size) { + unsigned imm = RI.Build->GetImmValue(I); + if (Size == 32) { + return Imm32(imm); + } else if (Size == 16) { + return Imm16(imm); + } else { + return Imm8(imm); + } +} + static void regEmitMemStore(RegInfo& RI, InstLoc I, unsigned Size) { if (RI.UseProfile) { unsigned curStore = ProfiledLoads[RI.numProfiledLoads++]; if (!(curStore & 0x0C000000)) { - X64Reg reg = regEnsureInReg(RI, getOp2(I)); - RI.Jit->MOV(32, R(ECX), regLocForInst(RI, getOp1(I))); - RI.Jit->BSWAP(Size, ECX); - unsigned addr = (u32)Memory::base - (curStore & 0xC0000000); - RI.Jit->MOV(Size, MDisp(reg, addr), R(ECX)); + OpArg addr = regBuildMemAddress(RI, I, getOp2(I), 2, + Size, 0, true, + -(curStore & 0xC0000000)); + if (isImm(*getOp1(I))) { + RI.Jit->MOV(Size, addr, regSwappedImmForConst(RI, getOp1(I), Size)); + } else { + RI.Jit->MOV(32, R(ECX), regLocForInst(RI, getOp1(I))); + RI.Jit->BSWAP(Size, ECX); + RI.Jit->MOV(Size, addr, R(ECX)); + } + if (RI.IInfo[I - RI.FirstI] & 4) + regClearInst(RI, getOp1(I)); return; } else if ((curStore & 0xFFFFF000) == 0xCC008000) { regSpill(RI, EAX); - RI.Jit->MOV(32, R(ECX), regLocForInst(RI, getOp1(I))); - RI.Jit->BSWAP(Size, ECX); + if (isImm(*getOp1(I))) { + RI.Jit->MOV(Size, R(ECX), regSwappedImmForConst(RI, getOp1(I), Size)); + } else { + RI.Jit->MOV(32, R(ECX), regLocForInst(RI, getOp1(I))); + RI.Jit->BSWAP(Size, ECX); + } RI.Jit->MOV(32, R(EAX), M(&GPFifo::m_gatherPipeCount)); RI.Jit->MOV(Size, MDisp(EAX, (u32)GPFifo::m_gatherPipe), R(ECX)); RI.Jit->ADD(32, R(EAX), Imm8(Size >> 3)); RI.Jit->MOV(32, M(&GPFifo::m_gatherPipeCount), R(EAX)); RI.Jit->js.fifoBytesThisBlock += Size >> 3; + if (RI.IInfo[I - RI.FirstI] & 4) + regClearInst(RI, getOp1(I)); + //regBuildMemAddress(RI, I, getOp2(I), 2, Size, 0, false); + regClearDeadMemAddress(RI, I, getOp2(I), 2); return; } } + OpArg addr = regBuildMemAddress(RI, I, getOp2(I), 2, Size, 0, false); + RI.Jit->LEA(32, ECX, addr); regSpill(RI, EAX); - RI.Jit->MOV(32, R(EAX), regLocForInst(RI, getOp1(I))); - RI.Jit->MOV(32, R(ECX), regLocForInst(RI, getOp2(I))); + if (isImm(*getOp1(I))) { + RI.Jit->MOV(Size, R(EAX), regImmForConst(RI, getOp1(I), Size)); + } else { + RI.Jit->MOV(32, R(EAX), regLocForInst(RI, getOp1(I))); + } if (RI.MakeProfile) { RI.Jit->MOV(32, M(&ProfiledLoads[RI.numProfiledLoads++]), R(ECX)); } RI.Jit->SafeWriteRegToReg(EAX, ECX, Size, 0); + if (RI.IInfo[I - RI.FirstI] & 4) + regClearInst(RI, getOp1(I)); } static void regEmitShiftInst(RegInfo& RI, InstLoc I, @@ -787,7 +931,6 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) { RI.Build = ibuild; RI.UseProfile = UseProfile; RI.MakeProfile = !RI.UseProfile; - unsigned bs = Jit->js.blockStart; // Pass to compute liveness ibuild->StartBackPass(); for (unsigned index = RI.IInfo.size() - 1; index != -1U; --index) { @@ -825,21 +968,21 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) { if (thisUsed) regMarkUse(RI, I, getOp1(I), 1); break; - case StoreCR: - case StoreCarry: case Load8: case Load16: case Load32: - if (getOpcode(*getOp1(I)) == Add && - isImm(*getOp2(getOp1(I)))) { - regMarkUse(RI, I, getOp1(getOp1(I)), 1); - break; - } + regMarkMemAddress(RI, I, getOp1(I), 1); + break; + case StoreCR: + case StoreCarry: + regMarkUse(RI, I, getOp1(I), 1); + break; case StoreGReg: case StoreLink: case StoreCTR: case StoreMSR: - regMarkUse(RI, I, getOp1(I), 1); + if (!isImm(*getOp1(I))) + regMarkUse(RI, I, getOp1(I), 1); break; case Add: case Sub: @@ -866,8 +1009,9 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) { case Store8: case Store16: case Store32: - regMarkUse(RI, I, getOp1(I), 1); - regMarkUse(RI, I, getOp2(I), 2); + if (!isImm(*getOp1(I))) + regMarkUse(RI, I, getOp1(I), 1); + regMarkMemAddress(RI, I, getOp2(I), 2); break; case BranchUncond: if (!isImm(*getOp1(I))) @@ -1238,7 +1382,11 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) { getOpcode(*I) != BranchCond && getOpcode(*I) != Load8 && getOpcode(*I) != Load16 && - getOpcode(*I) != Load32) { + getOpcode(*I) != Load32 && + getOpcode(*I) != Store8 && + getOpcode(*I) != Store16 && + getOpcode(*I) != Store32 && + 1) { if (RI.IInfo[I - RI.FirstI] & 4) regClearInst(RI, getOp1(I)); if (RI.IInfo[I - RI.FirstI] & 8) @@ -1252,7 +1400,7 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) { } } - if (RI.numSpills) + if (UseProfile && RI.numSpills) printf("Block: %x, numspills %d\n", Jit->js.blockStart, RI.numSpills); Jit->UD2(); diff --git a/Source/Core/Core/Src/PowerPC/Jit64IL/IR.h b/Source/Core/Core/Src/PowerPC/Jit64IL/IR.h index a7a1a47563..74bfaadd0f 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64IL/IR.h +++ b/Source/Core/Core/Src/PowerPC/Jit64IL/IR.h @@ -80,6 +80,69 @@ namespace IREmitter { Store16, Store32, BranchCond, + // Floating-point + // There are three floating-point formats: single, double, + // and packed. For any operation where the format of the + // operand isn't known, the ForceTo* operations are used; + // these are folded into the appropriate conversion + // (or no conversion) depending on the type of the operand. + // The "mreg" format is a pair of doubles; this is the + // most general possible represenation which is used + // in the register state. + // This might seem like overkill, but it's a huge advantage + // to keep operands in the right format because extra + // precision can screw up games. + // FIXME: Does the slight loss of precision due to not + // having a madd instruction matter? It would be a + // performance loss for singles because the operations + // would have to be done in double precision, and a completely + // accurate double madd would require an extremely expensive + // fallback. + FDAdd, + FDSub, + FDMul, + FDDiv, + FDNeg, + FSAdd, + FSSub, + FSMul, + FSDiv, + FSNeg, + FPSAdd, + FPSSub, + FPSMul, + FPSDiv, + FPSNeg, + // FP Loads + LoadSingle, + LoadDouble, + // LoadPacked, // FIXME: Work out how this instruction should + // be implemented + // FP Stores + StoreSingle, + StoreDouble, + // StorePacked, // FIXME: Work out how this instruction should + // be implemented + PackedToSingle, // Extract PS0 from packed (type-pun) + // PackedToDouble == PackedToSingle+SingleToDouble + PackedToMReg, // Convert from packed format to mreg format (CVTPS2PD) + SingleToDouble, // Widen single to double (CVTSS2SD) + SingleToPacked, // Duplicate single to packed + // SingleToMReg == SingleToPacked+PackedToMReg + MRegToPacked, // Convert from mreg format to packed format (CVTPD2PS) + MRegToDouble, // Extract bottom half from mreg format. (type-pun) + // MRegToSingle == MRegToDouble + DoubleToSingle + DoubleToMReg, // Convert from double format to mreg format + DoubleToSingle, // Convert from double to single format (CVTSD2SS) + // DoubleToPacked should never be needed + + ForceToPacked, // ForceTo* are "virtual"; they should be + // folded into the above conversions. + ForceToSingle, + ForceToDouble, + ForceToMReg, + LoadFPReg, + StoreFPReg, // "Trinary" operators // FIXME: Need to change representation! @@ -330,6 +393,7 @@ namespace IREmitter { IRBuilder() { Reset(); } private: + IRBuilder(IRBuilder&); // DO NOT IMPLEMENT std::vector InstList; // FIXME: We must ensure this is // continuous! std::vector ConstList; diff --git a/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_Branch.cpp b/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_Branch.cpp index 09596548df..61868d80a6 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_Branch.cpp +++ b/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_Branch.cpp @@ -81,14 +81,10 @@ using namespace Gen; CRTest = ibuild.EmitXor(CRTest, CRCmp); } - if ((inst.BO & BO_DONT_DECREMENT_FLAG) == 0) { + if ((inst.BO & 4) == 0) { IREmitter::InstLoc c = ibuild.EmitLoadCTR(); c = ibuild.EmitSub(c, ibuild.EmitIntConst(1)); ibuild.EmitStoreCTR(c); - } - - if ((inst.BO & 4) == 0) { - IREmitter::InstLoc c = ibuild.EmitLoadCTR(); if (!(inst.BO & 2)) { CTRTest = ibuild.EmitICmpEq(c, ibuild.EmitIntConst(0));