A tiny bit more JIT WIP work.

git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@1847 8ced0084-cf51-0410-be5f-012b33b47a6e
This commit is contained in:
magumagu9 2009-01-11 01:26:58 +00:00
parent 4acda0096b
commit 018cb993e3
2 changed files with 62 additions and 30 deletions

View File

@ -90,16 +90,15 @@ integer code are more aggressively combining blocks and dead condition
register elimination, which should be very helpful for small blocks. register elimination, which should be very helpful for small blocks.
TODO (in no particular order): TODO (in no particular order):
Floating-point JIT (both paired and unpaired) JIT for misc remaining FP instructions
(very large win for FP code, no effect for integer code) JIT for bcctrx
Inter-block dead condition register elimination (Likely significant win Misc optimizations for FP instructions
combined with optimized conditions) Inter-block dead register elimination; this seems likely to have large
Optimize conditions for conditional branches. performance benefits, although I'm not completely sure.
General dead register elimination. Inter-block inlining; also likely to have large performance benefits.
Inter-block inlining. The tricky parts are deciding which blocks to inline, and that the
Track down issues with new JIT + dual-core mode (I think I'm going to IR can't really deal with branches whose destination is in the
need help with this one; I'm not very familiar with the middle of a generated block.
dual-core code.)
Specialized slw/srw/sraw; I think there are some tricks that could Specialized slw/srw/sraw; I think there are some tricks that could
have a non-trivial effect, and there are significantly shorter have a non-trivial effect, and there are significantly shorter
implementations for 64-bit involving abusing 64-bit shifts. implementations for 64-bit involving abusing 64-bit shifts.
@ -111,15 +110,19 @@ Scheduling to reduce register pressure: PowerPC compilers like to push
instruction reordering. instruction reordering.
Common subexpression elimination Common subexpression elimination
Optimize load/store of sum using complex addressing (partially implemented) Optimize load/store of sum using complex addressing (partially implemented)
Implement idle-skipping Loop optimizations (loop-carried registers, LICM)
Loop optimizations (loop-carried registers, LICM); not sure how much Fold register loads into arithmetic operations
this will help on top of dead register elimination
Fold loads (both register and memory) into arithmetic operations
Code refactoring/cleanup Code refactoring/cleanup
Investigate performance of the JIT itself; this doesn't affect Investigate performance of the JIT itself; this doesn't affect
framerates significantly, but it does take a visible amount framerates significantly, but it does take a visible amount
of time for a complicated piece of code like a video decoder of time for a complicated piece of code like a video decoder
to compile. to compile.
Fix profiled loads/stores to work safely. On 32-bit, one solution is to
use a spare segment register, and expand the backpatch solution
to work in all the relevant situations. On 64-bit, the existing
fast memory solution should basically work. An alternative
would be to figure out a heuristic for what loads actually
vary their "type", and special-case them.
*/ */
@ -464,6 +467,12 @@ InstLoc IRBuilder::FoldBranchCond(InstLoc Op1, InstLoc Op2) {
if (branchValue == 2) if (branchValue == 2)
return FoldBranchCond(EmitICmpEq(getOp1(getOp1(Op1)), return FoldBranchCond(EmitICmpEq(getOp1(getOp1(Op1)),
getOp2(getOp1(Op1))), Op2); getOp2(getOp1(Op1))), Op2);
if (branchValue == 4)
return FoldBranchCond(EmitICmpSgt(getOp1(getOp1(Op1)),
getOp2(getOp1(Op1))), Op2);
if (branchValue == 8)
return FoldBranchCond(EmitICmpSlt(getOp1(getOp1(Op1)),
getOp2(getOp1(Op1))), Op2);
} }
if (getOpcode(*Op1) == Xor && if (getOpcode(*Op1) == Xor &&
isImm(*getOp2(Op1))) { isImm(*getOp2(Op1))) {
@ -475,10 +484,15 @@ InstLoc IRBuilder::FoldBranchCond(InstLoc Op1, InstLoc Op2) {
unsigned innerBranchValue = unsigned innerBranchValue =
GetImmValue(getOp2(XOp1)); GetImmValue(getOp2(XOp1));
if (branchValue == innerBranchValue) { if (branchValue == innerBranchValue) {
if (branchValue == 4) { if (branchValue == 2)
return FoldBranchCond(EmitICmpNe(getOp1(getOp1(XOp1)),
getOp2(getOp1(XOp1))), Op2);
if (branchValue == 4)
return FoldBranchCond(EmitICmpSle(getOp1(getOp1(XOp1)), return FoldBranchCond(EmitICmpSle(getOp1(getOp1(XOp1)),
getOp2(getOp1(XOp1))), Op2); getOp2(getOp1(XOp1))), Op2);
} if (branchValue == 8)
return FoldBranchCond(EmitICmpSge(getOp1(getOp1(XOp1)),
getOp2(getOp1(XOp1))), Op2);
} }
} }
} }
@ -493,6 +507,9 @@ InstLoc IRBuilder::FoldICmp(unsigned Opcode, InstLoc Op1, InstLoc Op2) {
case ICmpEq: case ICmpEq:
result = GetImmValue(Op1) == GetImmValue(Op2); result = GetImmValue(Op1) == GetImmValue(Op2);
break; break;
case ICmpNe:
result = GetImmValue(Op1) != GetImmValue(Op2);
break;
case ICmpUgt: case ICmpUgt:
result = GetImmValue(Op1) > GetImmValue(Op2); result = GetImmValue(Op1) > GetImmValue(Op2);
break; break;
@ -1285,9 +1302,7 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
regMarkUse(RI, I, getOp1(I), 1); regMarkUse(RI, I, getOp1(I), 1);
break; break;
case BranchCond: { case BranchCond: {
unsigned CondOpcode = getOpcode(*getOp1(I)); if (isICmp(*getOp1(I)) &&
if ((CondOpcode == ICmpEq ||
CondOpcode == ICmpSle) &&
isImm(*getOp2(getOp1(I)))) { isImm(*getOp2(getOp1(I)))) {
regMarkUse(RI, I, getOp1(getOp1(I)), 1); regMarkUse(RI, I, getOp1(getOp1(I)), 1);
} else { } else {
@ -1904,20 +1919,24 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
case BlockEnd: case BlockEnd:
break; break;
case BranchCond: { case BranchCond: {
if (getOpcode(*getOp1(I)) == ICmpEq && if (isICmp(*getOp1(I)) &&
isImm(*getOp2(getOp1(I)))) { isImm(*getOp2(getOp1(I)))) {
Jit->CMP(32, regLocForInst(RI, getOp1(getOp1(I))), Jit->CMP(32, regLocForInst(RI, getOp1(getOp1(I))),
Imm32(RI.Build->GetImmValue(getOp2(getOp1(I))))); Imm32(RI.Build->GetImmValue(getOp2(getOp1(I)))));
FixupBranch cont = Jit->J_CC(CC_NZ); CCFlags flag;
regWriteExit(RI, getOp2(I)); switch (getOpcode(*getOp1(I))) {
Jit->SetJumpTarget(cont); case ICmpEq: flag = CC_NE; break;
if (RI.IInfo[I - RI.FirstI] & 4) case ICmpNe: flag = CC_E; break;
regClearInst(RI, getOp1(getOp1(I))); case ICmpUgt: flag = CC_BE; break;
} else if (getOpcode(*getOp1(I)) == ICmpSle && case ICmpUlt: flag = CC_AE; break;
isImm(*getOp2(getOp1(I)))) { case ICmpUge: flag = CC_L; break;
Jit->CMP(32, regLocForInst(RI, getOp1(getOp1(I))), case ICmpUle: flag = CC_A; break;
Imm32(RI.Build->GetImmValue(getOp2(getOp1(I))))); case ICmpSgt: flag = CC_LE; break;
FixupBranch cont = Jit->J_CC(CC_G); case ICmpSlt: flag = CC_GE; break;
case ICmpSge: flag = CC_L; break;
case ICmpSle: flag = CC_G; break;
}
FixupBranch cont = Jit->J_CC(flag);
regWriteExit(RI, getOp2(I)); regWriteExit(RI, getOp2(I));
Jit->SetJumpTarget(cont); Jit->SetJumpTarget(cont);
if (RI.IInfo[I - RI.FirstI] & 4) if (RI.IInfo[I - RI.FirstI] & 4)

View File

@ -217,6 +217,10 @@ namespace IREmitter {
return getOpcode(i) >= CInt16 && getOpcode(i) <= CInt32; return getOpcode(i) >= CInt16 && getOpcode(i) <= CInt32;
} }
// True (nonzero) when the instruction is one of the integer-compare
// opcodes. Relies on ICmpEq..ICmpSle being a contiguous run in the
// opcode enumeration, mirroring the other is* predicates here.
unsigned inline isICmp(Inst i) {
	const unsigned op = getOpcode(i);
	return op >= ICmpEq && op <= ICmpSle;
}
unsigned inline isFResult(Inst i) { unsigned inline isFResult(Inst i) {
return getOpcode(i) > FResult_Start && return getOpcode(i) > FResult_Start &&
getOpcode(i) < FResult_End; getOpcode(i) < FResult_End;
@ -329,12 +333,21 @@ namespace IREmitter {
InstLoc EmitICmpEq(InstLoc op1, InstLoc op2) { InstLoc EmitICmpEq(InstLoc op1, InstLoc op2) {
return FoldBiOp(ICmpEq, op1, op2); return FoldBiOp(ICmpEq, op1, op2);
} }
// Build an integer "not equal" compare. Goes through FoldBiOp so
// constant operands can be folded instead of emitting a new node.
InstLoc EmitICmpNe(InstLoc op1, InstLoc op2) {
	const InstLoc folded = FoldBiOp(ICmpNe, op1, op2);
	return folded;
}
InstLoc EmitICmpUgt(InstLoc op1, InstLoc op2) { InstLoc EmitICmpUgt(InstLoc op1, InstLoc op2) {
return FoldBiOp(ICmpUgt, op1, op2); return FoldBiOp(ICmpUgt, op1, op2);
} }
InstLoc EmitICmpSgt(InstLoc op1, InstLoc op2) { InstLoc EmitICmpSgt(InstLoc op1, InstLoc op2) {
return FoldBiOp(ICmpSgt, op1, op2); return FoldBiOp(ICmpSgt, op1, op2);
} }
// Build a signed "less than" compare, routed through FoldBiOp for
// constant folding / simplification.
InstLoc EmitICmpSlt(InstLoc op1, InstLoc op2) {
	const InstLoc folded = FoldBiOp(ICmpSlt, op1, op2);
	return folded;
}
// Build a signed "greater than or equal" compare, routed through
// FoldBiOp for constant folding / simplification.
InstLoc EmitICmpSge(InstLoc op1, InstLoc op2) {
	const InstLoc folded = FoldBiOp(ICmpSge, op1, op2);
	return folded;
}
InstLoc EmitICmpSle(InstLoc op1, InstLoc op2) { InstLoc EmitICmpSle(InstLoc op1, InstLoc op2) {
return FoldBiOp(ICmpSle, op1, op2); return FoldBiOp(ICmpSle, op1, op2);
} }