Merge pull request #1021 from FioraAeterna/optimizeca3

JIT: Carry optimizations!
2025-02-04 15:40:02 +00:00 · 2014-09-14 15:08:08 -04:00 · 2014-09-14 15:08:08 -04:00 · db7617248f
commit db7617248f
parent 8605a38ba4 08ac10d00a
12 changed files with 361 additions and 317 deletions
--- a/Source/Core/Core/PowerPC/Interpreter/Interpreter_Tables.cpp
+++ b/Source/Core/Core/PowerPC/Interpreter/Interpreter_Tables.cpp
@ -34,7 +34,7 @@ static GekkoOPTemplate primarytable[] =
 	{10, Interpreter::cmpli,        {"cmpli",    OPTYPE_INTEGER, FL_IN_A | FL_SET_CRn, 1, 0, 0, 0}},
 	{11, Interpreter::cmpi,         {"cmpi",     OPTYPE_INTEGER, FL_IN_A | FL_SET_CRn, 1, 0, 0, 0}},
 	{12, Interpreter::addic,        {"addic",    OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_SET_CA, 1, 0, 0, 0}},
-	{13, Interpreter::addic_rc,     {"addic_rc", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_SET_CR0, 1, 0, 0, 0}},
+	{13, Interpreter::addic_rc,     {"addic_rc", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_SET_CA | FL_SET_CR0, 1, 0, 0, 0}},
 	{14, Interpreter::addi,         {"addi",     OPTYPE_INTEGER, FL_OUT_D | FL_IN_A0, 1, 0, 0, 0}},
 	{15, Interpreter::addis,        {"addis",    OPTYPE_INTEGER, FL_OUT_D | FL_IN_A0, 1, 0, 0, 0}},

@ -180,8 +180,8 @@ static GekkoOPTemplate table31[] =
 	{922, Interpreter::extshx,      {"extshx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_S | FL_RC_BIT, 1, 0, 0, 0}},
 	{954, Interpreter::extsbx,      {"extsbx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_S | FL_RC_BIT, 1, 0, 0, 0}},
 	{536, Interpreter::srwx,        {"srwx",   OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT, 1, 0, 0, 0}},
-	{792, Interpreter::srawx,       {"srawx",  OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT, 1, 0, 0, 0}},
-	{824, Interpreter::srawix,      {"srawix", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT, 1, 0, 0, 0}},
+	{792, Interpreter::srawx,       {"srawx",  OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
+	{824, Interpreter::srawix,      {"srawix", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
 	{24,  Interpreter::slwx,        {"slwx",   OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT, 1, 0, 0, 0}},

 	{54,   Interpreter::dcbst,      {"dcbst",  OPTYPE_DCACHE, 0, 5, 0, 0, 0}},
@ -260,7 +260,7 @@ static GekkoOPTemplate table31[] =
 	{339, Interpreter::mfspr,       {"mfspr",  OPTYPE_SPR, FL_OUT_D, 1, 0, 0, 0}},
 	{467, Interpreter::mtspr,       {"mtspr",  OPTYPE_SPR, 0, 2, 0, 0, 0}},
 	{371, Interpreter::mftb,        {"mftb",   OPTYPE_SYSTEM, FL_OUT_D | FL_TIMER, 1, 0, 0, 0}},
-	{512, Interpreter::mcrxr,       {"mcrxr",  OPTYPE_SYSTEM, 0, 1, 0, 0, 0}},
+	{512, Interpreter::mcrxr,       {"mcrxr",  OPTYPE_SYSTEM, FL_READ_CA | FL_SET_CA, 1, 0, 0, 0}},
 	{595, Interpreter::mfsr,        {"mfsr",   OPTYPE_SYSTEM, FL_OUT_D, 3, 0, 0, 0}},
 	{659, Interpreter::mfsrin,      {"mfsrin", OPTYPE_SYSTEM, FL_OUT_D, 3, 0, 0, 0}},

@ -280,26 +280,26 @@ static GekkoOPTemplate table31[] =
 static GekkoOPTemplate table31_2[] =
 {
 	{266,  Interpreter::addx,        {"addx",    OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 1, 0, 0, 0}},
-	{778,  Interpreter::addx,        {"addox",    OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 1, 0, 0, 0}},
+	{778,  Interpreter::addx,        {"addox",   OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 1, 0, 0, 0}},
 	{10,   Interpreter::addcx,       {"addcx",   OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
-	{522,  Interpreter::addcx,       {"addcox",   OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
+	{522,  Interpreter::addcx,       {"addcox",  OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
 	{138,  Interpreter::addex,       {"addex",   OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
-	{650,  Interpreter::addex,       {"addeox",   OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
+	{650,  Interpreter::addex,       {"addeox",  OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT | FL_SET_OE, 1, 0, 0, 0}},
 	{234,  Interpreter::addmex,      {"addmex",  OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
 	{202,  Interpreter::addzex,      {"addzex",  OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
 	{491,  Interpreter::divwx,       {"divwx",   OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 40, 0, 0, 0}},
-	{1003, Interpreter::divwx,       {"divwox",   OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 40, 0, 0, 0}},
+	{1003, Interpreter::divwx,       {"divwox",  OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT | FL_SET_OE, 40, 0, 0, 0}},
 	{459,  Interpreter::divwux,      {"divwux",  OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 40, 0, 0, 0}},
-	{971,  Interpreter::divwux,      {"divwuox",  OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 40, 0, 0, 0}},
+	{971,  Interpreter::divwux,      {"divwuox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT | FL_SET_OE, 40, 0, 0, 0}},
 	{75,   Interpreter::mulhwx,      {"mulhwx",  OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 5, 0, 0, 0}},
 	{11,   Interpreter::mulhwux,     {"mulhwux", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 5, 0, 0, 0}},
 	{235,  Interpreter::mullwx,      {"mullwx",  OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 5, 0, 0, 0}},
-	{747,  Interpreter::mullwx,      {"mullwox",  OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 5, 0, 0, 0}},
+	{747,  Interpreter::mullwx,      {"mullwox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT | FL_SET_OE, 5, 0, 0, 0}},
 	{104,  Interpreter::negx,        {"negx",    OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 1, 0, 0, 0}},
 	{40,   Interpreter::subfx,       {"subfx",   OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 1, 0, 0, 0}},
-	{552,  Interpreter::subfx,       {"subox",   OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 1, 0, 0, 0}},
+	{552,  Interpreter::subfx,       {"subox",   OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT | FL_SET_OE, 1, 0, 0, 0}},
 	{8,    Interpreter::subfcx,      {"subfcx",  OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
-	{520,  Interpreter::subfcx,      {"subfcox",  OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
+	{520,  Interpreter::subfcx,      {"subfcox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT | FL_SET_OE, 1, 0, 0, 0}},
 	{136,  Interpreter::subfex,      {"subfex",  OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
 	{232,  Interpreter::subfmex,     {"subfmex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
 	{200,  Interpreter::subfzex,     {"subfzex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
--- a/Source/Core/Core/PowerPC/Jit64/Jit.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit.cpp
@ -178,6 +178,8 @@ void Jit64::Init()
 	code_block.m_gpa = &js.gpa;
 	code_block.m_fpa = &js.fpa;
 	analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_CONDITIONAL_CONTINUE);
+	analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_BRANCH_MERGE);
+	analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_CARRY_MERGE);
 }

 void Jit64::ClearCache()
@ -461,6 +463,8 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBloc
 		js.downcountAmount += PatchEngine::GetSpeedhackCycles(code_block.m_address);

 	js.skipnext = false;
+	js.carryFlagSet = false;
+	js.carryFlagInverted = false;
 	js.compilerPC = nextPC;
 	// Translate instructions
 	for (u32 i = 0; i < code_block.m_num_instructions; i++)
@ -492,6 +496,7 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBloc
 			// help peephole optimizations
 			js.next_inst = ops[i + 1].inst;
 			js.next_compilerPC = ops[i + 1].address;
+			js.next_op = &ops[i + 1];
 		}

 		if (jo.optimizeGatherPipe && js.fifoBytesThisBlock >= 32)
--- a/Source/Core/Core/PowerPC/Jit64/Jit.h
+++ b/Source/Core/Core/PowerPC/Jit64/Jit.h
@ -101,6 +101,8 @@ public:
 	void GenerateConstantOverflow(s64 val);
 	void GenerateOverflow();
 	void FinalizeCarryOverflow(bool oe, bool inv = false);
+	void FinalizeCarry(Gen::CCFlags cond);
+	void FinalizeCarry(bool ca);
 	void ComputeRC(const Gen::OpArg & arg);

 	// Use to extract bytes from a register using the regcache. offset is in bytes.
@ -139,7 +141,7 @@ public:
 	void DynaRunTable63(UGeckoInstruction _inst);

 	void addx(UGeckoInstruction inst);
-	void addcx(UGeckoInstruction inst);
+	void arithcx(UGeckoInstruction inst);
 	void mulli(UGeckoInstruction inst);
 	void mulhwXx(UGeckoInstruction inst);
 	void mullwx(UGeckoInstruction inst);
@ -147,9 +149,7 @@ public:
 	void divwx(UGeckoInstruction inst);
 	void srawix(UGeckoInstruction inst);
 	void srawx(UGeckoInstruction inst);
-	void addex(UGeckoInstruction inst);
-	void addmex(UGeckoInstruction inst);
-	void addzex(UGeckoInstruction inst);
+	void arithXex(UGeckoInstruction inst);

 	void extsXx(UGeckoInstruction inst);

@ -217,11 +217,7 @@ public:
 	void dcbz(UGeckoInstruction inst);

 	void subfic(UGeckoInstruction inst);
-	void subfcx(UGeckoInstruction inst);
 	void subfx(UGeckoInstruction inst);
-	void subfex(UGeckoInstruction inst);
-	void subfmex(UGeckoInstruction inst);
-	void subfzex(UGeckoInstruction inst);

 	void twx(UGeckoInstruction inst);

--- a/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp
@ -48,7 +48,7 @@ static GekkoOPTemplate primarytable[] =
 	{10, &Jit64::cmpXX},                 //"cmpli",    OPTYPE_INTEGER, FL_IN_A | FL_SET_CRn}},
 	{11, &Jit64::cmpXX},                 //"cmpi",     OPTYPE_INTEGER, FL_IN_A | FL_SET_CRn}},
 	{12, &Jit64::reg_imm},               //"addic",    OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_SET_CA}},
-	{13, &Jit64::reg_imm},               //"addic_rc", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_SET_CR0}},
+	{13, &Jit64::reg_imm},               //"addic_rc", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_SET_CA | FL_SET_CR0}},
 	{14, &Jit64::reg_imm},               //"addi",     OPTYPE_INTEGER, FL_OUT_D | FL_IN_A0}},
 	{15, &Jit64::reg_imm},               //"addis",    OPTYPE_INTEGER, FL_OUT_D | FL_IN_A0}},

@ -193,8 +193,8 @@ static GekkoOPTemplate table31[] =
 	{922, &Jit64::extsXx},                 //"extshx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_S | FL_RC_BIT}},
 	{954, &Jit64::extsXx},                 //"extsbx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_S | FL_RC_BIT}},
 	{536, &Jit64::srwx},                   //"srwx",   OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT}},
-	{792, &Jit64::srawx},                  //"srawx",  OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT}},
-	{824, &Jit64::srawix},                 //"srawix", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT}},
+	{792, &Jit64::srawx},                  //"srawx",  OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_SET_CA | FL_RC_BIT}},
+	{824, &Jit64::srawix},                 //"srawix", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_SET_CA | FL_RC_BIT}},
 	{24,  &Jit64::slwx},                   //"slwx",   OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT}},

 	{54,   &Jit64::dcbst},                 //"dcbst",  OPTYPE_DCACHE, 0, 4}},
@ -273,7 +273,7 @@ static GekkoOPTemplate table31[] =
 	{339, &Jit64::mfspr},                  //"mfspr",  OPTYPE_SPR, FL_OUT_D}},
 	{467, &Jit64::mtspr},                  //"mtspr",  OPTYPE_SPR, 0, 2}},
 	{371, &Jit64::mftb},                   //"mftb",   OPTYPE_SYSTEM, FL_OUT_D | FL_TIMER}},
-	{512, &Jit64::mcrxr},                  //"mcrxr",  OPTYPE_SYSTEM, 0}},
+	{512, &Jit64::mcrxr},                  //"mcrxr",  OPTYPE_SYSTEM, FL_READ_CA | FL_SET_CA}},
 	{595, &Jit64::FallBackToInterpreter},  //"mfsr",   OPTYPE_SYSTEM, FL_OUT_D, 2}},
 	{659, &Jit64::FallBackToInterpreter},  //"mfsrin", OPTYPE_SYSTEM, FL_OUT_D, 2}},

@ -294,12 +294,12 @@ static GekkoOPTemplate table31_2[] =
 {
 	{266,  &Jit64::addx},                  //"addx",    OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT}},
 	{778,  &Jit64::addx},                  //"addx",    OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT}},
-	{10,   &Jit64::addcx},                 //"addcx",   OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT}},
-	{522,  &Jit64::addcx},                 //"addcox",  OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT}},
-	{138,  &Jit64::addex},                 //"addex",   OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT}},
-	{650,  &Jit64::addex},                 //"addeox",  OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT}},
-	{234,  &Jit64::addmex},                //"addmex",  OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT}},
-	{202,  &Jit64::addzex},                //"addzex",  OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT}},
+	{10,   &Jit64::arithcx},               //"addcx",   OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT}},
+	{522,  &Jit64::arithcx},               //"addcox",  OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT}},
+	{138,  &Jit64::arithXex},              //"addex",   OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT}},
+	{650,  &Jit64::arithXex},              //"addeox",  OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT}},
+	{234,  &Jit64::arithXex},              //"addmex",  OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT}},
+	{202,  &Jit64::arithXex},              //"addzex",  OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT}},
 	{491,  &Jit64::divwx},                 //"divwx",   OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 39}},
 	{1003, &Jit64::divwx},                 //"divwox",  OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 39}},
 	{459,  &Jit64::divwux},                //"divwux",  OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 39}},
@ -311,11 +311,11 @@ static GekkoOPTemplate table31_2[] =
 	{104,  &Jit64::negx},                  //"negx",    OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT}},
 	{40,   &Jit64::subfx},                 //"subfx",   OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT}},
 	{552,  &Jit64::subfx},                 //"subox",   OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT}},
-	{8,    &Jit64::subfcx},                //"subfcx",  OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT}},
-	{520,  &Jit64::subfcx},                //"subfcox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT}},
-	{136,  &Jit64::subfex},                //"subfex",  OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT}},
-	{232,  &Jit64::subfmex},               //"subfmex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT}},
-	{200,  &Jit64::subfzex},               //"subfzex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT}},
+	{8,    &Jit64::arithcx},               //"subfcx",  OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT}},
+	{520,  &Jit64::arithcx},               //"subfcox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT}},
+	{136,  &Jit64::arithXex},              //"subfex",  OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT}},
+	{232,  &Jit64::arithXex},              //"subfmex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT}},
+	{200,  &Jit64::arithXex},              //"subfzex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT}},
 };

 static GekkoOPTemplate table59[] =
--- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp
@ -44,28 +44,76 @@ void Jit64::GenerateOverflow()
 	SetJumpTarget(exit);
 }

+// Stores the carry produced by the instruction currently being compiled.
+// cond is the x86 condition code under which the PowerPC carry bit (XER.CA) must end up set.
+// When js.op->wantsCA is false nothing is emitted. Otherwise the carry is either left in the
+// x86 flags for the following instruction (recorded in js.carryFlagSet / js.carryFlagInverted)
+// or written to XER through JitSetCAIf.
+void Jit64::FinalizeCarry(CCFlags cond)
+{
+	js.carryFlagSet = false;
+	js.carryFlagInverted = false;
+	if (js.op->wantsCA)
+	{
+		// Only hand the carry over in the host flags when a following instruction exists in
+		// this block. js.next_op appears to be assigned only when ops[i + 1] exists, so for the
+		// final instruction it may still point at an older op; in that case fall back to XER.
+		// NOTE(review): relies on js.isLastInstruction being set for the final op - confirm in DoJit.
+		if (!js.isLastInstruction && js.next_op->wantsCAInFlags)
+		{
+			if (cond == CC_C || cond == CC_NC)
+			{
+				// The x86 carry flag already holds the result; only remember its polarity.
+				js.carryFlagInverted = cond == CC_NC;
+			}
+			else
+			{
+				// convert the condition to a carry flag (is there a better way?)
+				SETcc(cond, R(RSCRATCH));
+				BT(8, R(RSCRATCH), Imm8(0));
+			}
+			js.carryFlagSet = true;
+		}
+		else
+		{
+			JitSetCAIf(cond);
+		}
+	}
+}
+
+// Unconditional version: the carry value is known when the code is generated.
+// ca is the value XER.CA must take. When js.op->wantsCA is false nothing is emitted.
+// Otherwise the carry is either placed in the x86 carry flag for the following instruction
+// (js.carryFlagSet is then set) or written straight to XER.
+void Jit64::FinalizeCarry(bool ca)
+{
+	js.carryFlagSet = false;
+	js.carryFlagInverted = false;
+	if (js.op->wantsCA)
+	{
+		// Only hand the carry over in the host flags when a following instruction exists in
+		// this block. js.next_op appears to be assigned only when ops[i + 1] exists, so for the
+		// final instruction it may still point at an older op; in that case fall back to XER.
+		// NOTE(review): relies on js.isLastInstruction being set for the final op - confirm in DoJit.
+		if (!js.isLastInstruction && js.next_op->wantsCAInFlags)
+		{
+			if (ca)
+				STC();
+			else
+				CLC();
+			js.carryFlagSet = true;
+		}
+		else if (ca)
+		{
+			JitSetCA();
+		}
+		else
+		{
+			JitClearCAOV(true, false);
+		}
+	}
+}
+
 // Assumes CA,OV are clear
 void Jit64::FinalizeCarryOverflow(bool oe, bool inv)
 {
 	// USES_XER
 	if (oe)
 	{
-		// this is slightly messy because JitSetCAIf modifies x86 flags, so we have to do it in both
-		// sides of the branch.
+		// Make sure not to lose the carry flags (not a big deal, this path is rare).
+		PUSHF();
+		AND(32, PPCSTATE(spr[SPR_XER]), Imm32(~(XER_SO_MASK | XER_OV_MASK)));
 		FixupBranch jno = J_CC(CC_NO);
-		JitSetCAIf(inv ? CC_NC : CC_C);
 		//XER[OV/SO] = 1
 		OR(32, PPCSTATE(spr[SPR_XER]), Imm32(XER_SO_MASK | XER_OV_MASK));
-		FixupBranch exit = J();
 		SetJumpTarget(jno);
-		JitSetCAIf(inv ? CC_NC : CC_C);
-		SetJumpTarget(exit);
-	}
-	else
-	{
-		// Do carry
-		JitSetCAIf(inv ? CC_NC : CC_C);
+		POPF();
 	}
+	// Do carry
+	FinalizeCarry(inv ? CC_NC : CC_C);
 }

 void Jit64::ComputeRC(const Gen::OpArg & arg)
@ -129,10 +177,10 @@ static u32 Xor(u32 a, u32 b)
 void Jit64::regimmop(int d, int a, bool binary, u32 value, Operation doop, void (XEmitter::*op)(int, const Gen::OpArg&, const Gen::OpArg&), bool Rc, bool carry)
 {
 	gpr.Lock(d, a);
-	if (a || binary || carry)  // yeh nasty special case addic
+	// Be careful; addic treats r0 as r0, but addi treats r0 as zero.
+	if (a || binary || carry)
 	{
-		if (carry)
-			JitClearCAOV(false);
+		carry &= js.op->wantsCA;
 		if (gpr.R(a).IsImm() && !carry)
 		{
 			gpr.SetImmediate32(d, doop((u32)gpr.R(a).offset, value));
@ -156,7 +204,7 @@ void Jit64::regimmop(int d, int a, bool binary, u32 value, Operation doop, void
 			}
 		}
 		if (carry)
-			JitSetCAIf(CC_C);
+			FinalizeCarry(CC_C);
 		if (Rc)
 			ComputeRC(gpr.R(d));
 	}
@ -239,6 +287,9 @@ void Jit64::reg_imm(UGeckoInstruction inst)

 bool Jit64::CheckMergedBranch(int crf)
 {
+	if (!analyzer.HasOption(PPCAnalyst::PPCAnalyzer::OPTION_BRANCH_MERGE))
+		return false;
+
 	const UGeckoInstruction& next = js.next_inst;
 	return (((next.OPCD == 16 /* bcx */) ||
 	        ((next.OPCD == 19) && (next.SUBOP10 == 528) /* bcctrx */) ||
@ -721,148 +772,36 @@ void Jit64::subfic(UGeckoInstruction inst)
 	{
 		if (imm == 0)
 		{
-			JitClearCAOV(false);
 			// Flags act exactly like subtracting from 0
 			NEG(32, gpr.R(d));
 			// Output carry is inverted
-			JitSetCAIf(CC_NC);
+			FinalizeCarry(CC_NC);
 		}
 		else if (imm == -1)
 		{
-			// CA is always set in this case
-			JitSetCA();
 			NOT(32, gpr.R(d));
+			// CA is always set in this case
+			FinalizeCarry(true);
 		}
 		else
 		{
-			JitClearCAOV(false);
 			NOT(32, gpr.R(d));
 			ADD(32, gpr.R(d), Imm32(imm+1));
 			// Output carry is normal
-			JitSetCAIf(CC_C);
+			FinalizeCarry(CC_C);
 		}
 	}
 	else
 	{
-		JitClearCAOV(false);
 		MOV(32, gpr.R(d), Imm32(imm));
 		SUB(32, gpr.R(d), gpr.R(a));
 		// Output carry is inverted
-		JitSetCAIf(CC_NC);
+		FinalizeCarry(CC_NC);
 	}
 	gpr.UnlockAll();
 	// This instruction has no RC flag
 }

-void Jit64::subfcx(UGeckoInstruction inst)
-{
-	INSTRUCTION_START;
-	JITDISABLE(bJITIntegerOff);
-	int a = inst.RA, b = inst.RB, d = inst.RD;
-	gpr.Lock(a, b, d);
-	gpr.BindToRegister(d, (d == a || d == b), true);
-
-	JitClearCAOV(inst.OE);
-	if (d == b)
-	{
-		SUB(32, gpr.R(d), gpr.R(a));
-	}
-	else if (d == a)
-	{
-		MOV(32, R(RSCRATCH), gpr.R(a));
-		MOV(32, gpr.R(d), gpr.R(b));
-		SUB(32, gpr.R(d), R(RSCRATCH));
-	}
-	else
-	{
-		MOV(32, gpr.R(d), gpr.R(b));
-		SUB(32, gpr.R(d), gpr.R(a));
-	}
-	if (inst.Rc)
-		ComputeRC(gpr.R(d));
-	FinalizeCarryOverflow(inst.OE, true);
-
-	gpr.UnlockAll();
-}
-
-void Jit64::subfex(UGeckoInstruction inst)
-{
-	INSTRUCTION_START;
-	JITDISABLE(bJITIntegerOff);
-	int a = inst.RA, b = inst.RB, d = inst.RD;
-	gpr.Lock(a, b, d);
-	gpr.BindToRegister(d, (d == a || d == b), true);
-
-	JitGetAndClearCAOV(inst.OE);
-
-	bool invertedCarry = false;
-	if (d == b)
-	{
-		// Convert carry to borrow
-		CMC();
-		SBB(32, gpr.R(d), gpr.R(a));
-		invertedCarry = true;
-	}
-	else if (d == a)
-	{
-		NOT(32, gpr.R(d));
-		ADC(32, gpr.R(d), gpr.R(b));
-	}
-	else
-	{
-		MOV(32, gpr.R(d), gpr.R(a));
-		NOT(32, gpr.R(d));
-		ADC(32, gpr.R(d), gpr.R(b));
-	}
-	FinalizeCarryOverflow(inst.OE, invertedCarry);
-	if (inst.Rc)
-		ComputeRC(gpr.R(d));
-
-	gpr.UnlockAll();
-}
-
-void Jit64::subfmex(UGeckoInstruction inst)
-{
-	// USES_XER
-	INSTRUCTION_START
-	JITDISABLE(bJITIntegerOff);
-	int a = inst.RA, d = inst.RD;
-	gpr.Lock(a, d);
-	gpr.BindToRegister(d, d == a);
-
-	JitGetAndClearCAOV(inst.OE);
-	if (d != a)
-		MOV(32, gpr.R(d), gpr.R(a));
-	NOT(32, gpr.R(d));
-	ADC(32, gpr.R(d), Imm32(0xFFFFFFFF));
-	FinalizeCarryOverflow(inst.OE);
-	if (inst.Rc)
-		ComputeRC(gpr.R(d));
-	gpr.UnlockAll();
-}
-
-void Jit64::subfzex(UGeckoInstruction inst)
-{
-	// USES_XER
-	INSTRUCTION_START
-	JITDISABLE(bJITIntegerOff);
-	int a = inst.RA, d = inst.RD;
-
-	gpr.Lock(a, d);
-	gpr.BindToRegister(d, d == a);
-
-	JitGetAndClearCAOV(inst.OE);
-	if (d != a)
-		MOV(32, gpr.R(d), gpr.R(a));
-	NOT(32, gpr.R(d));
-	ADC(32, gpr.R(d), Imm8(0));
-	FinalizeCarryOverflow(inst.OE);
-	if (inst.Rc)
-		ComputeRC(gpr.R(d));
-
-	gpr.UnlockAll();
-}
-
 void Jit64::subfx(UGeckoInstruction inst)
 {
 	INSTRUCTION_START
@ -1329,96 +1268,93 @@ void Jit64::addx(UGeckoInstruction inst)
 	}
 }

-void Jit64::addex(UGeckoInstruction inst)
+// Shared emitter for the carry-consuming add/subtract instructions that the Jit64 opcode
+// table routes here: addex, addmex, addzex, subfex, subfmex and subfzex.
+// The variant is decoded from bits of the extended opcode (inst.SUBOP10).
+void Jit64::arithXex(UGeckoInstruction inst)
 {
-	// USES_XER
 	INSTRUCTION_START
 	JITDISABLE(bJITIntegerOff);
-	int a = inst.RA, b = inst.RB, d = inst.RD;
+	bool regsource = !(inst.SUBOP10 & 64); // addex or subfex
+	bool mex = !!(inst.SUBOP10 & 32);      // addmex/subfmex or addzex/subfzex
+	bool add = !!(inst.SUBOP10 & 2);       // add or sub
+	int a = inst.RA;
+	// Without a register source there is no second input; b aliases a so the checks below still work.
+	int b = regsource ? inst.RB : a;
+	int d = inst.RD;
+	bool same_input_sub = !add && regsource && a == b;

 	gpr.Lock(a, b, d);
-	gpr.BindToRegister(d, (d == a) || (d == b));
-	JitGetAndClearCAOV(inst.OE);
-	if ((d == a) || (d == b))
+	// d only needs its old value loaded when it doubles as an input that is actually read.
+	gpr.BindToRegister(d, !same_input_sub && (d == a || d == b));
+	// Skip the XER access when the previous instruction left the carry in the x86 flags.
+	// NOTE(review): JitGetAndClearCAOV presumably loads XER.CA into the x86 carry flag - confirm.
+	if (!js.carryFlagSet)
+		JitGetAndClearCAOV(inst.OE);
+
+	bool invertedCarry = false;
+	// Special case: subfe A, B, B is a common compiler idiom
+	if (same_input_sub)
 	{
-		ADC(32, gpr.R(d), gpr.R((d == a) ? b : a));
+		// Convert carry to borrow
+		if (!js.carryFlagInverted)
+			CMC();
+		SBB(32, gpr.R(d), gpr.R(d));
+		invertedCarry = true;
+	}
+	else if (!add && regsource && d == b)
+	{
+		// Subtract in place with SBB, which wants a borrow rather than a carry.
+		if (!js.carryFlagInverted)
+			CMC();
+		// NOTE(review): this test can never be true here, the enclosing branch requires d == b.
+		if (d != b)
+			MOV(32, gpr.R(d), gpr.R(b));
+		SBB(32, gpr.R(d), gpr.R(a));
+		invertedCarry = true;
 	}
 	else
 	{
-		MOV(32, gpr.R(d), gpr.R(a));
-		ADC(32, gpr.R(d), gpr.R(b));
+		// Generic path: ADC of the (optionally complemented) first operand with either the other
+		// register or the constant 0xFFFFFFFF / 0 used by the "me" / "ze" forms.
+		OpArg source = regsource ? gpr.R(d == b ? a : b) : Imm32(mex ? 0xFFFFFFFF : 0);
+		// ADC needs a true carry, so undo an inverted one left by the previous instruction.
+		if (js.carryFlagInverted)
+			CMC();
+		if (d != a && d != b)
+			MOV(32, gpr.R(d), gpr.R(a));
+		if (!add)
+			NOT(32, gpr.R(d));
+		ADC(32, gpr.R(d), source);
 	}
-	FinalizeCarryOverflow(inst.OE);
+	FinalizeCarryOverflow(inst.OE, invertedCarry);
 	if (inst.Rc)
 		ComputeRC(gpr.R(d));
 	gpr.UnlockAll();
 }

-void Jit64::addcx(UGeckoInstruction inst)
+// Shared emitter for the carry-producing add/subtract instructions that the Jit64 opcode
+// table routes here: addcx/addcox and subfcx/subfcox. These do not read the incoming carry.
+void Jit64::arithcx(UGeckoInstruction inst)
 {
 	INSTRUCTION_START
 	JITDISABLE(bJITIntegerOff);
+	bool add = !!(inst.SUBOP10 & 2); // add or sub
 	int a = inst.RA, b = inst.RB, d = inst.RD;
+	gpr.Lock(a, b, d);
+	gpr.BindToRegister(d, d == a || d == b, true);

-	if ((d == a) || (d == b))
+	// d aliases only the a operand.
+	if (d == a && d != b)
 	{
-		int operand = ((d == a) ? b : a);
-		gpr.Lock(a, b, d);
-		gpr.BindToRegister(d, true);
-		JitClearCAOV(inst.OE);
-		ADD(32, gpr.R(d), gpr.R(operand));
-		FinalizeCarryOverflow(inst.OE);
-		if (inst.Rc)
-			ComputeRC(gpr.R(d));
-		gpr.UnlockAll();
+		if (add)
+		{
+			ADD(32, gpr.R(d), gpr.R(b));
+		}
+		else
+		{
+			// special case, because sub isn't reversible
+			MOV(32, R(RSCRATCH), gpr.R(a));
+			MOV(32, gpr.R(d), gpr.R(b));
+			SUB(32, gpr.R(d), R(RSCRATCH));
+		}
 	}
 	else
 	{
-		gpr.Lock(a, b, d);
-		gpr.BindToRegister(d, false);
-		JitClearCAOV(inst.OE);
-		MOV(32, gpr.R(d), gpr.R(a));
-		ADD(32, gpr.R(d), gpr.R(b));
-		FinalizeCarryOverflow(inst.OE);
-		if (inst.Rc)
-			ComputeRC(gpr.R(d));
-		gpr.UnlockAll();
+		// d is either b itself or a fresh register: start from b, then combine with a (d = b +/- a).
+		if (d != b)
+			MOV(32, gpr.R(d), gpr.R(b));
+		if (add)
+			ADD(32, gpr.R(d), gpr.R(a));
+		else
+			SUB(32, gpr.R(d), gpr.R(a));
 	}
-}

-void Jit64::addmex(UGeckoInstruction inst)
-{
-	// USES_XER
-	INSTRUCTION_START
-	JITDISABLE(bJITIntegerOff);
-	int a = inst.RA, d = inst.RD;
-
-	gpr.Lock(d);
-	gpr.BindToRegister(d, d == a);
-	JitGetAndClearCAOV(inst.OE);
-	if (d != a)
-		MOV(32, gpr.R(d), gpr.R(a));
-	ADC(32, gpr.R(d), Imm32(0xFFFFFFFF));
-	FinalizeCarryOverflow(inst.OE);
-	if (inst.Rc)
-		ComputeRC(gpr.R(d));
-	gpr.UnlockAll();
-}
-
-void Jit64::addzex(UGeckoInstruction inst)
-{
-	// USES_XER
-	INSTRUCTION_START
-	JITDISABLE(bJITIntegerOff);
-	int a = inst.RA, d = inst.RD;
-
-	gpr.Lock(d);
-	gpr.BindToRegister(d, d == a);
-	JitGetAndClearCAOV(inst.OE);
-	if (d != a)
-		MOV(32, gpr.R(d), gpr.R(a));
-	ADC(32, gpr.R(d), Imm8(0));
-	FinalizeCarryOverflow(inst.OE);
+	// After an x86 SUB the carry flag is a borrow, so the subtract forms pass it on as inverted.
+	FinalizeCarryOverflow(inst.OE, !add);
 	if (inst.Rc)
 		ComputeRC(gpr.R(d));
 	gpr.UnlockAll();
@ -1811,16 +1747,22 @@ void Jit64::srawx(UGeckoInstruction inst)
 	gpr.FlushLockX(ECX);
 	gpr.Lock(a, s, b);
 	gpr.BindToRegister(a, (a == s || a == b), true);
-	JitClearCAOV(false);
 	MOV(32, R(ECX), gpr.R(b));
 	if (a != s)
 		MOV(32, gpr.R(a), gpr.R(s));
 	SHL(64, gpr.R(a), Imm8(32));
 	SAR(64, gpr.R(a), R(ECX));
-	MOV(32, R(RSCRATCH), gpr.R(a));
-	SHR(64, gpr.R(a), Imm8(32));
-	TEST(32, gpr.R(a), R(RSCRATCH));
-	JitSetCAIf(CC_NZ);
+	if (js.op->wantsCA)
+	{
+		MOV(32, R(RSCRATCH), gpr.R(a));
+		SHR(64, gpr.R(a), Imm8(32));
+		TEST(32, gpr.R(a), R(RSCRATCH));
+	}
+	else
+	{
+		SHR(64, gpr.R(a), Imm8(32));
+	}
+	FinalizeCarry(CC_NZ);
 	gpr.UnlockAll();
 	gpr.UnlockAllX();
 	if (inst.Rc)
@ -1838,41 +1780,50 @@ void Jit64::srawix(UGeckoInstruction inst)
 	{
 		gpr.Lock(a, s);
 		gpr.BindToRegister(a, a == s, true);
-		MOV(32, R(RSCRATCH), gpr.R(s));
-		if (a != s)
-			MOV(32, gpr.R(a), R(RSCRATCH));
-		// some optimized common cases that can be done in slightly fewer ops
-		if (amount == 31)
+		if (!js.op->wantsCA)
 		{
-			JitSetCA();
-			SAR(32, gpr.R(a), Imm8(31));
-			NEG(32, R(RSCRATCH));                                     // RSCRATCH = input == INT_MIN ? INT_MIN : -input;
-			AND(32, R(RSCRATCH), Imm32(0x80000000));                  // RSCRATCH = input < 0 && input != INT_MIN ? 0 : 0x80000000
-			SHR(32, R(RSCRATCH), Imm8(31 - XER_CA_SHIFT));
-			XOR(32, PPCSTATE(spr[SPR_XER]), R(RSCRATCH)); // XER.CA = (input < 0 && input != INT_MIN)
-		}
-		else if (amount == 1)
-		{
-			JitClearCAOV(false);
-			SHR(32, R(RSCRATCH), Imm8(31));                          // sign
-			AND(32, R(RSCRATCH), gpr.R(a));                          // (sign && carry)
-			SAR(32, gpr.R(a), Imm8(1));
-			SHL(32, R(RSCRATCH), Imm8(XER_CA_SHIFT));
-			OR(32, PPCSTATE(spr[SPR_XER]), R(RSCRATCH)); // XER.CA = sign && carry, aka (input&0x80000001) == 0x80000001
+			if (a != s)
+				MOV(32, gpr.R(a), gpr.R(s));
+			SAR(32, gpr.R(a), Imm8(amount));
 		}
 		else
 		{
-			JitClearCAOV(false);
-			SAR(32, gpr.R(a), Imm8(amount));
-			SHL(32, R(RSCRATCH), Imm8(32 - amount));
-			TEST(32, R(RSCRATCH), gpr.R(a));
-			JitSetCAIf(CC_NZ);
+			MOV(32, R(RSCRATCH), gpr.R(s));
+			if (a != s)
+				MOV(32, gpr.R(a), R(RSCRATCH));
+			// some optimized common cases that can be done in slightly fewer ops
+			if (amount == 31)
+			{
+				JitSetCA();
+				SAR(32, gpr.R(a), Imm8(31));
+				NEG(32, R(RSCRATCH));                                     // RSCRATCH = input == INT_MIN ? INT_MIN : -input;
+				AND(32, R(RSCRATCH), Imm32(0x80000000));                  // RSCRATCH = input < 0 && input != INT_MIN ? 0 : 0x80000000
+				SHR(32, R(RSCRATCH), Imm8(31 - XER_CA_SHIFT));
+				XOR(32, PPCSTATE(spr[SPR_XER]), R(RSCRATCH)); // XER.CA = (input < 0 && input != INT_MIN)
+			}
+			else if (amount == 1)
+			{
+				JitClearCAOV(true, false);
+				SHR(32, R(RSCRATCH), Imm8(31));                          // sign
+				AND(32, R(RSCRATCH), gpr.R(a));                          // (sign && carry)
+				SAR(32, gpr.R(a), Imm8(1));
+				SHL(32, R(RSCRATCH), Imm8(XER_CA_SHIFT));
+				OR(32, PPCSTATE(spr[SPR_XER]), R(RSCRATCH)); // XER.CA = sign && carry, aka (input&0x80000001) == 0x80000001
+			}
+			else
+			{
+				JitClearCAOV(true, false);
+				SAR(32, gpr.R(a), Imm8(amount));
+				SHL(32, R(RSCRATCH), Imm8(32 - amount));
+				TEST(32, R(RSCRATCH), gpr.R(a));
+				FinalizeCarry(CC_NZ);
+			}
 		}
 	}
 	else
 	{
 		gpr.Lock(a, s);
-		JitClearCAOV(false);
+		FinalizeCarry(false);
 		gpr.BindToRegister(a, a == s, true);

 		if (a != s)
--- a/Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp
+++ b/Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp
@ -1106,7 +1106,7 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, u32 exitAddress)
 			Jit->JitSetCA();
 			FixupBranch cont = Jit->J();
 			Jit->SetJumpTarget(nocarry);
-			Jit->JitClearCAOV(false);
+			Jit->JitClearCAOV(true, false);
 			Jit->SetJumpTarget(cont);
 			regNormalRegClear(RI, I);
 			break;
--- a/Source/Core/Core/PowerPC/JitCommon/JitBase.h
+++ b/Source/Core/Core/PowerPC/JitCommon/JitBase.h
@ -81,13 +81,16 @@ protected:
 		bool isLastInstruction;
 		bool memcheck;
 		bool skipnext;
+		bool carryFlagSet;
+		bool carryFlagInverted;

 		int fifoBytesThisBlock;

 		PPCAnalyst::BlockStats st;
 		PPCAnalyst::BlockRegStats gpa;
 		PPCAnalyst::BlockRegStats fpa;
-		PPCAnalyst::CodeOp *op;
+		PPCAnalyst::CodeOp* op;
+		PPCAnalyst::CodeOp* next_op;
 		u8* rewriteStart;

 		JitBlock *curBlock;
--- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp
+++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp
@ -845,13 +845,14 @@ void EmuCodeBlock::JitSetCAIf(CCFlags conditionCode)
 	SETcc(conditionCode, R(RSCRATCH));
 	MOVZX(32, 8, RSCRATCH, R(RSCRATCH));
 	SHL(32, R(RSCRATCH), Imm8(XER_CA_SHIFT));
+	AND(32, PPCSTATE(spr[SPR_XER]), Imm32(~XER_CA_MASK));
 	OR(32, PPCSTATE(spr[SPR_XER]), R(RSCRATCH)); //XER.CA = 1
 }

-void EmuCodeBlock::JitClearCAOV(bool oe)
+// Emits an AND on the stored XER that clears XER.CA when ca is true and XER.OV when oe is
+// true. Emits nothing when neither bit is requested.
+void EmuCodeBlock::JitClearCAOV(bool ca, bool oe)
 {
-	if (oe)
-		AND(32, PPCSTATE(spr[SPR_XER]), Imm32(~XER_CA_MASK & ~XER_OV_MASK)); //XER.CA, XER.OV = 0
-	else
-		AND(32, PPCSTATE(spr[SPR_XER]), Imm32(~XER_CA_MASK)); //XER.CA = 0
+	// Every mask bit stays set except the ones selected for clearing.
+	u32 mask = (ca ? ~XER_CA_MASK : 0xFFFFFFFF) & (oe ? ~XER_OV_MASK : 0xFFFFFFFF);
+	// An all-ones mask would leave XER unchanged, so skip the instruction entirely.
+	if (mask == 0xFFFFFFFF)
+		return;
+	AND(32, PPCSTATE(spr[SPR_XER]), Imm32(mask));
 }
--- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h
+++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h
@ -111,7 +111,7 @@ public:
 	void JitGetAndClearCAOV(bool oe);
 	void JitSetCA();
 	void JitSetCAIf(Gen::CCFlags conditionCode);
-	void JitClearCAOV(bool oe);
+	void JitClearCAOV(bool ca, bool oe);

 	void ForceSinglePrecisionS(Gen::X64Reg xmm);
 	void ForceSinglePrecisionP(Gen::X64Reg xmm);
--- a/Source/Core/Core/PowerPC/PPCAnalyst.cpp
+++ b/Source/Core/Core/PowerPC/PPCAnalyst.cpp
@ -213,14 +213,17 @@ static void AnalyzeFunction2(Symbol *func)
 	func->flags = flags;
 }

-// IMPORTANT - CURRENTLY ASSUMES THAT A IS A COMPARE
 static bool CanSwapAdjacentOps(const CodeOp &a, const CodeOp &b)
 {
+	const GekkoOPInfo *a_info = a.opinfo;
 	const GekkoOPInfo *b_info = b.opinfo;
+	int a_flags = a_info->flags;
 	int b_flags = b_info->flags;
-	if (b_flags & (FL_SET_CRx | FL_ENDBLOCK | FL_TIMER | FL_EVIL))
+	if (b_flags & (FL_SET_CRx | FL_ENDBLOCK | FL_TIMER | FL_EVIL | FL_SET_OE))
 		return false;
-	if ((b_flags & (FL_RC_BIT | FL_RC_BIT_F)) && (b.inst.hex & 1))
+	if ((b_flags & (FL_RC_BIT | FL_RC_BIT_F)) && (b.inst.Rc))
+		return false;
+	if ((a_flags & (FL_SET_CA | FL_READ_CA)) && (b_flags & (FL_SET_CA | FL_READ_CA)))
 		return false;

 	switch (b.inst.OPCD)
@ -250,20 +253,16 @@ static bool CanSwapAdjacentOps(const CodeOp &a, const CodeOp &b)
 	{
 		int regInA = a.regsIn[j];
 		int regInB = b.regsIn[j];
-		if (regInA >= 0 &&
-			(b.regsOut[0] == regInA ||
-			 b.regsOut[1] == regInA))
-		{
-			// reg collision! don't swap
+		// register collision: b outputs to one of a's inputs
+		if (regInA >= 0 && (b.regsOut[0] == regInA || b.regsOut[1] == regInA))
 			return false;
-		}
-		if (regInB >= 0 &&
-			(a.regsOut[0] == regInB ||
-			 a.regsOut[1] == regInB))
-		{
-			// reg collision! don't swap
+		// register collision: a outputs to one of b's inputs
+		if (regInB >= 0 && (a.regsOut[0] == regInB || a.regsOut[1] == regInB))
 			return false;
-		}
+		// register collision: b outputs to one of a's outputs (overwriting it)
+		for (int k = 0; k < 2; k++)
+			if (b.regsOut[k] >= 0 && (b.regsOut[k] == a.regsOut[0] || b.regsOut[k] == a.regsOut[1]))
+				return false;
 	}

 	return true;
@ -403,34 +402,84 @@ void FindFunctions(u32 startAddr, u32 endAddr, PPCSymbolDB *func_db)
 		leafSize, niceSize, unniceSize);
 }

-void PPCAnalyzer::ReorderInstructions(u32 instructions, CodeOp *code)
+// Returns true when `a` is an integer compare: primary opcode 10 or 11, or primary opcode 31
+// with extended opcode (SUBOP10) 0 or 32.
+static bool isCmp(const CodeOp& a)
 {
-	// Instruction Reordering Pass
-	// Bubble down compares towards branches, so that they can be merged.
-	// -2: -1 for the pair, -1 for not swapping with the final instruction which is probably the branch.
-	for (u32 i = 0; i < (instructions - 2); ++i)
+	return (a.inst.OPCD == 10 || a.inst.OPCD == 11) || (a.inst.OPCD == 31 && (a.inst.SUBOP10 == 0 || a.inst.SUBOP10 == 32));
+}
+
+// Returns true when `a` has primary opcode 21 and its Rc (record) bit set.
+static bool isRlwinm_rc(const CodeOp& a)
+{
+	if (a.inst.OPCD != 21)
+		return false;
+	return a.inst.Rc;
+}
+
+// Returns true for integer-type ops whose table flags contain FL_SET_CA but not FL_SET_OE.
+static bool isCarryOp(const CodeOp& a)
+{
+	const int op_flags = a.opinfo->flags;
+	const bool sets_ca_without_oe = (op_flags & FL_SET_CA) && !(op_flags & FL_SET_OE);
+	return sets_ca_without_oe && a.opinfo->type == OPTYPE_INTEGER;
+}
+
+// Runs bubbling passes over the `instructions` entries of `code`, swapping adjacent ops in place
+// until a whole pass makes no swap.
+//   reverse: false scans from the first op to the last and moves candidates later in the block;
+//            true scans from the last op to the first and moves candidates earlier.
+//   type:    REORDER_CARRY moves ops accepted by isCarryOp(); REORDER_CMP moves integer compares
+//            and rlwinm. (record form).
+void PPCAnalyzer::ReorderInstructionsCore(u32 instructions, CodeOp* code, bool reverse, ReorderType type)
+{
+	// Fewer than two instructions means there is no adjacent pair to swap. Bailing out here also
+	// keeps the unsigned "instructions - 1" below from wrapping around for an empty block, which
+	// would send the "i != end" loop past the bounds of code[].
+	if (instructions < 2)
+		return;
+
+	// Bubbling an instruction sometimes reveals another opportunity to bubble an instruction, so do
+	// multiple passes.
+	while (true)
 	{
-		CodeOp &a = code[i];
-		CodeOp &b = code[i + 1];
-		// All integer compares can be reordered.
-		if ((a.inst.OPCD == 10 || a.inst.OPCD == 11) ||
-			(a.inst.OPCD == 31 && (a.inst.SUBOP10 == 0 || a.inst.SUBOP10 == 32)))
+		// Instruction Reordering Pass
+		// Carry pass: bubble carry-using instructions as close to each other as possible, so we can avoid
+		// storing the carry flag.
+		// Compare pass: bubble compare instructions next to branches, so they can be merged.
+		bool swapped = false;
+		int increment = reverse ? -1 : 1;
+		int start = reverse ? instructions - 1 : 0;
+		int end = reverse ? 0 : instructions - 1;
+		for (int i = start; i != end; i += increment)
 		{
-			// Got a compare instruction.
-			if (CanSwapAdjacentOps(a, b))
+			CodeOp &a = code[i];
+			CodeOp &b = code[i + increment];
+			// Reorder integer compares, rlwinm., and carry-affecting ops
+			// (if we add more merged branch instructions, add them here!)
+			if ((type == REORDER_CARRY && isCarryOp(a)) || (type == REORDER_CMP && (isCmp(a) || isRlwinm_rc(a))))
 			{
-				// Alright, let's bubble it down!
-				std::swap(a, b);
+				// once we're next to a carry instruction, don't move away!
+				// (code[i - increment] is the neighbour on the side we came from; it only exists
+				// when i != start.)
+				if (type == REORDER_CARRY && i != start)
+				{
+					// if we read the CA flag, and the previous instruction sets it, don't move away.
+					if (!reverse && (a.opinfo->flags & FL_READ_CA) && (code[i - increment].opinfo->flags & FL_SET_CA))
+						continue;
+					// if we set the CA flag, and the next instruction reads it, don't move away.
+					if (reverse && (a.opinfo->flags & FL_SET_CA) && (code[i - increment].opinfo->flags & FL_READ_CA))
+						continue;
+				}
+
+				if (CanSwapAdjacentOps(a, b))
+				{
+					// Alright, let's bubble it!
+					std::swap(a, b);
+					swapped = true;
+				}
 			}
 		}
+		if (!swapped)
+			return;
 	}
 }

+// Applies the reordering passes that are enabled through the analyzer options.
+void PPCAnalyzer::ReorderInstructions(u32 instructions, CodeOp *code)
+{
+	if (HasOption(OPTION_CARRY_MERGE))
+	{
+		// A single direction often fails to bring pairs such as addc/adde together, so carry
+		// ops are bubbled *towards* each other: first a backward sweep, then a forward one.
+		const bool backward = true;
+		const bool forward = false;
+		ReorderInstructionsCore(instructions, code, backward, REORDER_CARRY);
+		ReorderInstructionsCore(instructions, code, forward, REORDER_CARRY);
+	}
+
+	// The compare pass only ever runs in the forward direction.
+	if (!HasOption(OPTION_BRANCH_MERGE))
+		return;
+	ReorderInstructionsCore(instructions, code, false, REORDER_CMP);
+}
+
 void PPCAnalyzer::SetInstructionStats(CodeBlock *block, CodeOp *code, GekkoOPInfo *opinfo, u32 index)
 {
 	code->wantsCR0 = false;
 	code->wantsCR1 = false;
-	code->wantsPS1 = false;

 	if (opinfo->flags & FL_USE_FPU)
 		block->m_fpa->any = true;
@ -458,6 +507,24 @@ void PPCAnalyzer::SetInstructionStats(CodeBlock *block, CodeOp *code, GekkoOPInf
 	code->outputFPRF = (opinfo->flags & FL_SET_FPRF) ? true : false;
 	code->canEndBlock = (opinfo->flags & FL_ENDBLOCK) ? true : false;

+	code->wantsCA = (opinfo->flags & FL_READ_CA) ? true : false;
+	code->outputCA = (opinfo->flags & FL_SET_CA) ? true : false;
+
+	// We're going to try to avoid storing carry in XER if we can avoid it -- keep it in the x86 carry flag!
+	// If the instruction reads CA but doesn't write it, we still need to store CA in XER; we can't
+	// leave it in flags.
+	if (HasOption(OPTION_CARRY_MERGE))
+		code->wantsCAInFlags = code->wantsCA && code->outputCA && opinfo->type == OPTYPE_INTEGER;
+	else
+		code->wantsCAInFlags = false;
+
+	// mfspr/mtspr can affect/use XER, so be super careful here
+	// we need to note specifically that mfspr needs CA in XER, not in the x86 carry flag
+	if (code->inst.OPCD == 31 && code->inst.SUBOP10 == 339) // mfspr
+		code->wantsCA = ((code->inst.SPRU << 5) | (code->inst.SPRL & 0x1F)) == SPR_XER;
+	if (code->inst.OPCD == 31 && code->inst.SUBOP10 == 467) // mtspr
+		code->outputCA = ((code->inst.SPRU << 5) | (code->inst.SPRL & 0x1F)) == SPR_XER;
+
 	int numOut = 0;
 	int numIn = 0;
 	if (opinfo->flags & FL_OUT_A)
@ -715,26 +782,30 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock *block, CodeBuffer *buffer, u32
 		block->m_broken = true;
 	}

-	// Scan for CR0 dependency
-	// assume next block wants flags to be safe
+	// Scan for flag dependencies; assume the next block (or any branch that can leave the block)
+	// wants flags, to be safe.
 	bool wantsCR0 = true;
 	bool wantsCR1 = true;
-	bool wantsPS1 = true;
 	bool wantsFPRF = true;
+	bool wantsCA = true;
 	for (int i = block->m_num_instructions - 1; i >= 0; i--)
 	{
-		wantsCR0 |= code[i].wantsCR0 || code[i].canEndBlock;
-		wantsCR1 |= code[i].wantsCR1 || code[i].canEndBlock;
-		wantsPS1 |= code[i].wantsPS1 || code[i].canEndBlock;
-		wantsFPRF |= code[i].wantsFPRF || code[i].canEndBlock;
-		code[i].wantsCR0 = wantsCR0;
-		code[i].wantsCR1 = wantsCR1;
-		code[i].wantsPS1 = wantsPS1;
-		code[i].wantsFPRF = wantsFPRF;
-		wantsCR0 &= !code[i].outputCR0;
-		wantsCR1 &= !code[i].outputCR1;
-		wantsPS1 &= !code[i].outputPS1;
-		wantsFPRF &= !code[i].outputFPRF;
+		bool opWantsCR0  = code[i].wantsCR0;
+		bool opWantsCR1  = code[i].wantsCR1;
+		bool opWantsFPRF = code[i].wantsFPRF;
+		bool opWantsCA   = code[i].wantsCA;
+		code[i].wantsCR0  = wantsCR0  || code[i].canEndBlock;
+		code[i].wantsCR1  = wantsCR1  || code[i].canEndBlock;
+		code[i].wantsFPRF = wantsFPRF || code[i].canEndBlock;
+		code[i].wantsCA   = wantsCA   || code[i].canEndBlock;
+		wantsCR0  |= opWantsCR0  || code[i].canEndBlock;
+		wantsCR1  |= opWantsCR1  || code[i].canEndBlock;
+		wantsFPRF |= opWantsFPRF || code[i].canEndBlock;
+		wantsCA   |= opWantsCA   || code[i].canEndBlock;
+		wantsCR0  &= !code[i].outputCR0  || opWantsCR0;
+		wantsCR1  &= !code[i].outputCR1  || opWantsCR1;
+		wantsFPRF &= !code[i].outputFPRF || opWantsFPRF;
+		wantsCA   &= !code[i].outputCA   || opWantsCA;
 	}
 	return address;
 }
--- a/Source/Core/Core/PowerPC/PPCAnalyst.h
+++ b/Source/Core/Core/PowerPC/PPCAnalyst.h
@ -33,12 +33,13 @@ struct CodeOp //16B
 	bool isBranchTarget;
 	bool wantsCR0;
 	bool wantsCR1;
-	bool wantsPS1;
 	bool wantsFPRF;
+	bool wantsCA;
+	bool wantsCAInFlags;
 	bool outputCR0;
 	bool outputCR1;
-	bool outputPS1;
 	bool outputFPRF;
+	bool outputCA;
 	bool canEndBlock;
 	bool skip;  // followed BL-s for example
 };
@ -143,6 +144,13 @@ class PPCAnalyzer
 {
 private:

+	// Selects which family of instructions ReorderInstructionsCore bubbles.
+	enum ReorderType
+	{
+		// Bubble carry-setting integer ops so that they end up close to each other.
+		REORDER_CARRY,
+		// Bubble integer compares and rlwinm. so that they end up next to branches.
+		REORDER_CMP
+	};
+
+	void ReorderInstructionsCore(u32 instructions, CodeOp* code, bool reverse, ReorderType type);
 	void ReorderInstructions(u32 instructions, CodeOp *code);
 	void SetInstructionStats(CodeBlock *block, CodeOp *code, GekkoOPInfo *opinfo, u32 index);

@ -175,6 +183,14 @@ public:
 		// Requires JIT support to work.
 		// XXX: NOT COMPLETE
 		OPTION_FORWARD_JUMP = (1 << 3),
+
+		// Reorder compare/Rc instructions next to their associated branches and
+		// merge in the JIT (for common cases, anyway).
+		OPTION_BRANCH_MERGE = (1 << 4),
+
+		// Reorder carry instructions next to each other and pass carry flags
+		// in the x86 flags between them, instead of in XER.
+		OPTION_CARRY_MERGE = (1 << 5),
 	};


--- a/Source/Core/Core/PowerPC/PPCTables.h
+++ b/Source/Core/Core/PowerPC/PPCTables.h
@ -38,6 +38,7 @@ enum
 	FL_LOADSTORE       = (1<<19),
 	FL_SET_FPRF        = (1<<20),
 	FL_READ_FPRF       = (1<<21),
+	FL_SET_OE          = (1<<22),
 };

 enum