From 94c20db36968bfcef67d43203ad19c4876f439d3 Mon Sep 17 00:00:00 2001 From: Fiora Date: Sun, 24 Aug 2014 11:03:07 -0700 Subject: [PATCH 01/11] Rename Log2 and add IsPow2 to MathUtils for future use Also remove unused pow2/pow2f functions. --- Source/Core/Common/MathUtil.h | 7 +++---- Source/Core/DolphinWX/GameListCtrl.cpp | 2 +- Source/Core/VideoBackends/OGL/StreamBuffer.cpp | 2 +- .../Core/VideoCommon/TextureConversionShader.cpp | 6 +++--- Source/UnitTests/Common/MathUtilTest.cpp | 16 ++++++++-------- 5 files changed, 16 insertions(+), 17 deletions(-) diff --git a/Source/Core/Common/MathUtil.h b/Source/Core/Common/MathUtil.h index 9c0fe2c884..143398d6f7 100644 --- a/Source/Core/Common/MathUtil.h +++ b/Source/Core/Common/MathUtil.h @@ -175,16 +175,15 @@ struct Rectangle } // namespace MathUtil -inline float pow2f(float x) {return x * x;} -inline double pow2(double x) {return x * x;} - float MathFloatVectorSum(const std::vector&); #define ROUND_UP(x, a) (((x) + (a) - 1) & ~((a) - 1)) #define ROUND_DOWN(x, a) ((x) & ~((a) - 1)) +inline bool IsPow2(u32 imm) {return (imm & (imm - 1)) == 0;} + // Rounds down. 0 -> undefined -inline int Log2(u64 val) +inline int IntLog2(u64 val) { #if defined(__GNUC__) return 63 - __builtin_clzll(val); diff --git a/Source/Core/DolphinWX/GameListCtrl.cpp b/Source/Core/DolphinWX/GameListCtrl.cpp index a7b64e2567..98bc658a12 100644 --- a/Source/Core/DolphinWX/GameListCtrl.cpp +++ b/Source/Core/DolphinWX/GameListCtrl.cpp @@ -397,7 +397,7 @@ static wxString NiceSizeFormat(u64 _size) // Find largest power of 2 less than _size. // div 10 to get largest named unit less than _size // 10 == log2(1024) (number of B in a KiB, KiB in a MiB, etc) - const u64 unit = Log2(std::max(_size, 1)) / 10; + const u64 unit = IntLog2(std::max(_size, 1)) / 10; const u64 unit_size = (1 << (unit * 10)); // mul 1000 for 3 decimal places, add 5 to round up, div 10 for 2 decimal places diff --git a/Source/Core/VideoBackends/OGL/StreamBuffer.cpp b/Source/Core/VideoBackends/OGL/StreamBuffer.cpp index 5406b6e14c..cc2a3d18a3 100644 --- a/Source/Core/VideoBackends/OGL/StreamBuffer.cpp +++ b/Source/Core/VideoBackends/OGL/StreamBuffer.cpp @@ -23,7 +23,7 @@ static u32 genBuffer() } StreamBuffer::StreamBuffer(u32 type, u32 size) -: m_buffer(genBuffer()), m_buffertype(type), m_size(ROUND_UP_POW2(size)), m_bit_per_slot(Log2(ROUND_UP_POW2(size) / SYNC_POINTS)) +: m_buffer(genBuffer()), m_buffertype(type), m_size(ROUND_UP_POW2(size)), m_bit_per_slot(IntLog2(ROUND_UP_POW2(size) / SYNC_POINTS)) { m_iterator = 0; m_used_iterator = 0; diff --git a/Source/Core/VideoCommon/TextureConversionShader.cpp b/Source/Core/VideoCommon/TextureConversionShader.cpp index 76e49ce464..b0215e4c08 100644 --- a/Source/Core/VideoCommon/TextureConversionShader.cpp +++ b/Source/Core/VideoCommon/TextureConversionShader.cpp @@ -91,8 +91,8 @@ static void WriteSwizzler(char*& p, u32 format, API_TYPE ApiType) WRITE(p, " int y_block_position = uv1.y & %d;\n", ~(blkH - 1)); WRITE(p, " int y_offset_in_block = uv1.y & %d;\n", blkH - 1); - WRITE(p, " int x_virtual_position = (uv1.x << %d) + y_offset_in_block * position.z;\n", Log2(samples)); - WRITE(p, " int x_block_position = (x_virtual_position >> %d) & %d;\n", Log2(blkH), ~(blkW - 1)); + WRITE(p, " int x_virtual_position = (uv1.x << %d) + y_offset_in_block * position.z;\n", IntLog2(samples)); + WRITE(p, " int x_block_position = (x_virtual_position >> %d) & %d;\n", IntLog2(blkH), ~(blkW - 1)); if (samples == 1) { // 32 bit textures (RGBA8 and Z24) are stored in 2 cache line increments @@ 
-100,7 +100,7 @@ static void WriteSwizzler(char*& p, u32 format, API_TYPE ApiType) WRITE(p, " x_virtual_position = x_virtual_position << 1;\n"); } WRITE(p, " int x_offset_in_block = x_virtual_position & %d;\n", blkW - 1); - WRITE(p, " int y_offset = (x_virtual_position >> %d) & %d;\n", Log2(blkW), blkH - 1); + WRITE(p, " int y_offset = (x_virtual_position >> %d) & %d;\n", IntLog2(blkW), blkH - 1); WRITE(p, " sampleUv.x = x_offset_in_block + x_block_position;\n"); WRITE(p, " sampleUv.y = y_block_position + y_offset;\n"); diff --git a/Source/UnitTests/Common/MathUtilTest.cpp b/Source/UnitTests/Common/MathUtilTest.cpp index 8ae757962c..9549039304 100644 --- a/Source/UnitTests/Common/MathUtilTest.cpp +++ b/Source/UnitTests/Common/MathUtilTest.cpp @@ -44,17 +44,17 @@ TEST(MathUtil, IsSNAN) EXPECT_TRUE(MathUtil::IsSNAN(std::numeric_limits::signaling_NaN())); } -TEST(MathUtil, Log2) +TEST(MathUtil, IntLog2) { - EXPECT_EQ(0, Log2(1)); - EXPECT_EQ(1, Log2(2)); - EXPECT_EQ(2, Log2(4)); - EXPECT_EQ(3, Log2(8)); - EXPECT_EQ(63, Log2(0x8000000000000000ull)); + EXPECT_EQ(0, IntLog2(1)); + EXPECT_EQ(1, IntLog2(2)); + EXPECT_EQ(2, IntLog2(4)); + EXPECT_EQ(3, IntLog2(8)); + EXPECT_EQ(63, IntLog2(0x8000000000000000ull)); // Rounding behavior. - EXPECT_EQ(3, Log2(15)); - EXPECT_EQ(63, Log2(0xFFFFFFFFFFFFFFFFull)); + EXPECT_EQ(3, IntLog2(15)); + EXPECT_EQ(63, IntLog2(0xFFFFFFFFFFFFFFFFull)); } TEST(MathUtil, FlushToZero) From 858296e1c7fe6a492a1e31d532bffea5361b0c39 Mon Sep 17 00:00:00 2001 From: Fiora Date: Sun, 24 Aug 2014 11:09:10 -0700 Subject: [PATCH 02/11] JIT64: optimize multiplication by immediate constants Factor out common code and handle a few more common cases. --- Source/Core/Core/PowerPC/Jit64/Jit.h | 2 + .../Core/Core/PowerPC/Jit64/Jit_Integer.cpp | 140 ++++++++---------- 2 files changed, 63 insertions(+), 79 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.h b/Source/Core/Core/PowerPC/Jit64/Jit.h index de95967df0..1f16d02e45 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.h +++ b/Source/Core/Core/PowerPC/Jit64/Jit.h @@ -117,6 +117,8 @@ public: Gen::FixupBranch JumpIfCRFieldBit(int field, int bit, bool jump_if_set = true); void SetFPRFIfNeeded(UGeckoInstruction inst, Gen::X64Reg xmm); + void MultiplyImmediate(u32 imm, int a, int d, bool overflow); + void tri_op(int d, int a, int b, bool reversible, void (Gen::XEmitter::*op)(Gen::X64Reg, Gen::OpArg), UGeckoInstruction inst, bool roundRHS = false); typedef u32 (*Operation)(u32 a, u32 b); void regimmop(int d, int a, bool binary, u32 value, Operation doop, void (Gen::XEmitter::*op)(int, const Gen::OpArg&, const Gen::OpArg&), bool Rc = false, bool carry = false); diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp index 2e05ce1623..3b64195a38 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp @@ -5,6 +5,7 @@ #include #include +#include "Common/MathUtil.h" #include "Core/PowerPC/Jit64/Jit.h" #include "Core/PowerPC/Jit64/JitAsm.h" #include "Core/PowerPC/Jit64/JitRegCache.h" @@ -1005,6 +1006,64 @@ void Jit64::subfx(UGeckoInstruction inst) } } +void Jit64::MultiplyImmediate(u32 imm, int a, int d, bool overflow) +{ + // simplest cases first + if (imm == 0) + { + XOR(32, gpr.R(d), gpr.R(d)); + return; + } + + if (imm == (u32)-1) + { + if (d != a) + MOV(32, gpr.R(d), gpr.R(a)); + NEG(32, gpr.R(d)); + return; + } + + // skip these if we need to check overflow flag + if (!overflow) + { + // power of 2; just a shift + if 
(IsPow2(imm)) + { + u32 shift = IntLog2(imm); + // use LEA if it saves an op + if (d != a && shift <= 3 && shift >= 1 && gpr.R(a).IsSimpleReg()) + { + LEA(32, gpr.RX(d), MScaled(gpr.RX(a), SCALE_1 << shift, 0)); + } + else + { + if (d != a) + MOV(32, gpr.R(d), gpr.R(a)); + if (shift) + SHL(32, gpr.R(d), Imm8(shift)); + } + return; + } + + // We could handle factors of 2^N*3, 2^N*5, and 2^N*9 using lea+shl, but testing shows + // it seems to be slower overall. + static u8 lea_scales[3] = { 3, 5, 9 }; + for (int i = 0; i < 3; i++) + { + if (imm == lea_scales[i]) + { + if (d != a) + gpr.BindToRegister(a, true, false); + LEA(32, gpr.RX(d), MComplex(gpr.RX(a), gpr.RX(a), SCALE_2 << i, 0)); + return; + } + } + } + + // if we didn't find any better options + IMUL(32, gpr.RX(d), gpr.R(a), Imm32(imm)); +} + void Jit64::mulli(UGeckoInstruction inst) { INSTRUCTION_START @@ -1020,46 +1079,7 @@ void Jit64::mulli(UGeckoInstruction inst) { gpr.Lock(a, d); gpr.BindToRegister(d, (d == a), true); - if (imm == 0) - { - XOR(32, gpr.R(d), gpr.R(d)); - } - else if (imm == (u32)-1) - { - if (d != a) - MOV(32, gpr.R(d), gpr.R(a)); - NEG(32, gpr.R(d)); - } - else if ((imm & (imm - 1)) == 0) - { - u32 shift = 0; - - if (imm & 0xFFFF0000) - shift |= 16; - - if (imm & 0xFF00FF00) - shift |= 8; - - if (imm & 0xF0F0F0F0) - shift |= 4; - - if (imm & 0xCCCCCCCC) - shift |= 2; - - if (imm & 0xAAAAAAAA) - shift |= 1; - - if (d != a) - MOV(32, gpr.R(d), gpr.R(a)); - - if (shift) - SHL(32, gpr.R(d), Imm8(shift)); - } - else - { - IMUL(32, gpr.RX(d), gpr.R(a), Imm32(imm)); - } - + MultiplyImmediate(imm, a, d, false); gpr.UnlockAll(); } } @@ -1087,45 +1107,7 @@ void Jit64::mullwx(UGeckoInstruction inst) { u32 imm = gpr.R(a).IsImm() ? (u32)gpr.R(a).offset : (u32)gpr.R(b).offset; int src = gpr.R(a).IsImm() ? b : a; - if (imm == 0) - { - XOR(32, gpr.R(d), gpr.R(d)); - } - else if (imm == (u32)-1) - { - if (d != src) - MOV(32, gpr.R(d), gpr.R(src)); - NEG(32, gpr.R(d)); - } - else if ((imm & (imm - 1)) == 0 && !inst.OE) - { - u32 shift = 0; - - if (imm & 0xFFFF0000) - shift |= 16; - - if (imm & 0xFF00FF00) - shift |= 8; - - if (imm & 0xF0F0F0F0) - shift |= 4; - - if (imm & 0xCCCCCCCC) - shift |= 2; - - if (imm & 0xAAAAAAAA) - shift |= 1; - - if (d != src) - MOV(32, gpr.R(d), gpr.R(src)); - - if (shift) - SHL(32, gpr.R(d), Imm8(shift)); - } - else - { - IMUL(32, gpr.RX(d), gpr.R(src), Imm32(imm)); - } + MultiplyImmediate(imm, src, d, inst.OE); } else if (d == a) { From de662a79b7fce6b309e03b85f5416ac89549db46 Mon Sep 17 00:00:00 2001 From: Fiora Date: Sun, 24 Aug 2014 11:21:00 -0700 Subject: [PATCH 03/11] JIT64: optimize rlwinmx/rlwinix and friends Take advantage of movzx as a replacement for anding with 0xff or 0xffff, and abuse loads from the register cache to save ops. --- Source/Core/Core/PowerPC/Jit64/Jit.h | 4 + .../Core/Core/PowerPC/Jit64/Jit_Integer.cpp | 174 +++++++++++------- 2 files changed, 114 insertions(+), 64 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.h b/Source/Core/Core/PowerPC/Jit64/Jit.h index 1f16d02e45..0b726521a6 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.h +++ b/Source/Core/Core/PowerPC/Jit64/Jit.h @@ -107,6 +107,10 @@ public: void GenerateRC(); void ComputeRC(const Gen::OpArg & arg); + // Use to extract bytes from a register using the regcache. offset is in bytes. + Gen::OpArg ExtractFromReg(int reg, int offset); + void AndWithMask(Gen::X64Reg reg, u32 mask); + // Reads a given bit of a given CR register part. 
void GetCRFieldBit(int field, int bit, Gen::X64Reg out, bool negate = false); // Clobbers RDX. diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp index 3b64195a38..90659d9729 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp @@ -138,6 +138,30 @@ void Jit64::ComputeRC(const Gen::OpArg & arg) } } +OpArg Jit64::ExtractFromReg(int reg, int offset) +{ + OpArg src = gpr.R(reg); + // store to load forwarding should handle this case efficiently + if (offset) + { + gpr.StoreFromRegister(reg, FLUSH_MAINTAIN_STATE); + src = gpr.GetDefaultLocation(reg); + src.offset += offset; + } + return src; +} + +// we can't do this optimization in the emitter because MOVZX and AND have different effects on flags. +void Jit64::AndWithMask(X64Reg reg, u32 mask) + { + if (mask == 0xff) + MOVZX(32, 8, reg, R(reg)); + else if (mask == 0xffff) + MOVZX(32, 16, reg, R(reg)); + else + AND(32, R(reg), Imm32(mask)); +} + // Following static functions are used in conjunction with regimmop static u32 Add(u32 a, u32 b) { @@ -1576,49 +1600,57 @@ void Jit64::rlwinmx(UGeckoInstruction inst) result &= Helper_Mask(inst.MB, inst.ME); gpr.SetImmediate32(a, result); if (inst.Rc) - { ComputeRC(gpr.R(a)); - } } else { + bool left_shift = inst.SH && inst.MB == 0 && inst.ME == 31 - inst.SH; + bool right_shift = inst.SH && inst.ME == 31 && inst.MB == 32 - inst.SH; + u32 mask = Helper_Mask(inst.MB, inst.ME); + bool simple_mask = mask == 0xff || mask == 0xffff; + int mask_size = inst.ME - inst.MB + 1; + gpr.Lock(a, s); gpr.BindToRegister(a, a == s); - if (a != s) + if (a != s && left_shift && gpr.R(s).IsSimpleReg() && inst.SH <= 3) { - MOV(32, gpr.R(a), gpr.R(s)); + LEA(32, gpr.RX(a), MScaled(gpr.RX(s), SCALE_1 << inst.SH, 0)); } - - if (inst.SH && inst.MB == 0 && inst.ME==31-inst.SH) + // common optimized case: byte/word extract + else if (simple_mask && !(inst.SH & (mask_size - 1))) { + MOVZX(32, mask_size, gpr.RX(a), ExtractFromReg(s, inst.SH ? 
(32 - inst.SH) >> 3 : 0)); + } + // another optimized special case: byte/word extract plus shift + else if (((mask >> inst.SH) << inst.SH) == mask && !left_shift && + ((mask >> inst.SH) == 0xff || (mask >> inst.SH) == 0xffff)) + { + MOVZX(32, mask_size, gpr.RX(a), gpr.R(s)); SHL(32, gpr.R(a), Imm8(inst.SH)); - if (inst.Rc) - ComputeRC(gpr.R(a)); - } - else if (inst.SH && inst.ME == 31 && inst.MB == 32 - inst.SH) - { - SHR(32, gpr.R(a), Imm8(inst.MB)); - if (inst.Rc) - ComputeRC(gpr.R(a)); } else { - if (inst.SH != 0) - { - ROL(32, gpr.R(a), Imm8(inst.SH)); - } + if (a != s) + MOV(32, gpr.R(a), gpr.R(s)); - if (!(inst.MB==0 && inst.ME==31)) + if (left_shift) { - AND(32, gpr.R(a), Imm32(Helper_Mask(inst.MB, inst.ME))); - if (inst.Rc) - ComputeRC(gpr.R(a)); + SHL(32, gpr.R(a), Imm8(inst.SH)); } - else if (inst.Rc) + else if (right_shift) { - ComputeRC(gpr.R(a)); + SHR(32, gpr.R(a), Imm8(inst.MB)); + } + else + { + if (inst.SH != 0) + ROL(32, gpr.R(a), Imm8(inst.SH)); + if (!(inst.MB == 0 && inst.ME == 31)) + AndWithMask(gpr.RX(a), mask); } } + if (inst.Rc) + ComputeRC(gpr.R(a)); gpr.UnlockAll(); } } @@ -1636,75 +1668,89 @@ void Jit64::rlwimix(UGeckoInstruction inst) u32 mask = Helper_Mask(inst.MB,inst.ME); gpr.SetImmediate32(a, ((u32)gpr.R(a).offset & ~mask) | (_rotl((u32)gpr.R(s).offset,inst.SH) & mask)); if (inst.Rc) - { ComputeRC(gpr.R(a)); - } } else { gpr.Lock(a, s); - gpr.BindToRegister(a, true, true); u32 mask = Helper_Mask(inst.MB, inst.ME); if (mask == 0 || (a == s && inst.SH == 0)) { - if (inst.Rc) - { - ComputeRC(gpr.R(a)); - } + // nothing to do } else if (mask == 0xFFFFFFFF) { + gpr.BindToRegister(a, a == s, true); if (a != s) - { MOV(32, gpr.R(a), gpr.R(s)); - } - if (inst.SH) - { ROL(32, gpr.R(a), Imm8(inst.SH)); - } - - if (inst.Rc) - { - ComputeRC(gpr.R(a)); - } + } + else if(gpr.R(s).IsImm()) + { + gpr.BindToRegister(a, true, true); + AndWithMask(gpr.RX(a), ~mask); + OR(32, gpr.R(a), Imm32(_rotl((u32)gpr.R(s).offset, inst.SH) & mask)); } else if (inst.SH) { - if (mask == 0U - (1U << inst.SH)) + bool isLeftShift = mask == 0U - (1U << inst.SH); + bool isRightShift = mask == (1U << inst.SH) - 1; + if (gpr.R(a).IsImm()) { - MOV(32, R(RSCRATCH), gpr.R(s)); - SHL(32, R(RSCRATCH), Imm8(inst.SH)); - AND(32, gpr.R(a), Imm32(~mask)); - OR(32, gpr.R(a), R(RSCRATCH)); - } - else if (mask == (1U << inst.SH) - 1) - { - MOV(32, R(RSCRATCH), gpr.R(s)); - SHR(32, R(RSCRATCH), Imm8(32-inst.SH)); - AND(32, gpr.R(a), Imm32(~mask)); - OR(32, gpr.R(a), R(RSCRATCH)); + u32 maskA = gpr.R(a).offset & ~mask; + gpr.BindToRegister(a, false, true); + MOV(32, gpr.R(a), gpr.R(s)); + if (isLeftShift) + { + SHL(32, gpr.R(a), Imm8(inst.SH)); + } + else if (isRightShift) + { + SHR(32, gpr.R(a), Imm8(32 - inst.SH)); + } + else + { + ROL(32, gpr.R(a), Imm8(inst.SH)); + AND(32, gpr.R(a), Imm32(mask)); + } + OR(32, gpr.R(a), Imm32(maskA)); } else { + // TODO: common cases of this might be faster with pinsrb or abuse of AH + gpr.BindToRegister(a, true, true); MOV(32, R(RSCRATCH), gpr.R(s)); - ROL(32, R(RSCRATCH), Imm8(inst.SH)); - XOR(32, R(RSCRATCH), gpr.R(a)); - AND(32, R(RSCRATCH), Imm32(mask)); - XOR(32, gpr.R(a), R(RSCRATCH)); + if (isLeftShift) + { + SHL(32, R(RSCRATCH), Imm8(inst.SH)); + AndWithMask(gpr.RX(a), ~mask); + OR(32, gpr.R(a), R(RSCRATCH)); + } + else if (isRightShift) + { + SHR(32, R(RSCRATCH), Imm8(32 - inst.SH)); + AndWithMask(gpr.RX(a), ~mask); + OR(32, gpr.R(a), R(RSCRATCH)); + } + else + { + ROL(32, R(RSCRATCH), Imm8(inst.SH)); + XOR(32, R(RSCRATCH), gpr.R(a)); + AndWithMask(RSCRATCH, mask); 
+ XOR(32, gpr.R(a), R(RSCRATCH)); + } } - - if (inst.Rc) - ComputeRC(gpr.R(a)); } else { + gpr.BindToRegister(a, true, true); XOR(32, gpr.R(a), gpr.R(s)); - AND(32, gpr.R(a), Imm32(~mask)); + AndWithMask(gpr.RX(a), ~mask); XOR(32, gpr.R(a), gpr.R(s)); - if (inst.Rc) - ComputeRC(gpr.R(a)); } + if (inst.Rc) + ComputeRC(gpr.R(a)); gpr.UnlockAll(); } } @@ -1736,7 +1782,7 @@ void Jit64::rlwnmx(UGeckoInstruction inst) MOV(32, gpr.R(a), gpr.R(s)); } ROL(32, gpr.R(a), R(ECX)); - AND(32, gpr.R(a), Imm32(mask)); + AndWithMask(gpr.RX(a), mask); if (inst.Rc) ComputeRC(gpr.R(a)); gpr.UnlockAll(); From faf6bdfd96b1953fd6eb40935d9162ccd986902c Mon Sep 17 00:00:00 2001 From: Fiora Date: Sun, 24 Aug 2014 11:24:27 -0700 Subject: [PATCH 04/11] JIT64: Optimize cmpXX Use TEST instead of CMP if we're comparing against 0 (rather common), and optimize the case of immediate compares further. --- .../Core/Core/PowerPC/Jit64/Jit_Integer.cpp | 48 +++++++++++++++---- 1 file changed, 38 insertions(+), 10 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp index 90659d9729..ec5f68cd18 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp @@ -460,12 +460,13 @@ void Jit64::cmpXX(UGeckoInstruction inst) } else { + X64Reg input = RSCRATCH; if (signedCompare) { if (gpr.R(a).IsImm()) - MOV(64, R(RSCRATCH), Imm32((s32)gpr.R(a).offset)); + MOV(64, R(input), Imm32((s32)gpr.R(a).offset)); else - MOVSX(64, 32, RSCRATCH, gpr.R(a)); + MOVSX(64, 32, input, gpr.R(a)); if (!comparand.IsImm()) { @@ -476,19 +477,46 @@ void Jit64::cmpXX(UGeckoInstruction inst) else { if (gpr.R(a).IsImm()) - MOV(32, R(RSCRATCH), Imm32((u32)gpr.R(a).offset)); + { + MOV(32, R(input), Imm32((u32)gpr.R(a).offset)); + } + else if (comparand.IsImm() && !comparand.offset) + { + gpr.BindToRegister(a, true, false); + input = gpr.RX(a); + } else - MOVZX(64, 32, RSCRATCH, gpr.R(a)); + { + MOVZX(64, 32, input, gpr.R(a)); + } if (comparand.IsImm()) - MOV(32, R(RSCRATCH2), comparand); + { + // sign extension will ruin this, so store it in a register + if (comparand.offset & 0x80000000U) + { + MOV(32, R(RSCRATCH2), comparand); + comparand = R(RSCRATCH2); + } + } else - MOVZX(64, 32, RSCRATCH2, comparand); - - comparand = R(RSCRATCH2); + { + gpr.BindToRegister(b, true, false); + comparand = gpr.R(b); + } + } + if (comparand.IsImm() && !comparand.offset) + { + MOV(64, PPCSTATE(cr_val[crf]), R(input)); + // Place the comparison next to the branch for macro-op fusion + if (merge_branch) + TEST(64, R(input), R(input)); + } + else + { + SUB(64, R(input), comparand); + MOV(64, PPCSTATE(cr_val[crf]), R(input)); } - SUB(64, R(RSCRATCH), comparand); - MOV(64, PPCSTATE(cr_val[crf]), R(RSCRATCH)); if (merge_branch) { From 298f85e15243c562edd984f01a5eb3d8979bd8a2 Mon Sep 17 00:00:00 2001 From: Fiora Date: Sun, 24 Aug 2014 11:25:56 -0700 Subject: [PATCH 05/11] JIT64: optimize sign-extend Remove some code duplication. Also remove some comments that no longer apply since x86_32 was dropped. 
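For reference, the PowerPC semantics the merged extsXx handler implements, matching the immediate path in the diff that follows: a minimal C++ sketch using <cstdint> types instead of the tree's u32/s8/s16 aliases, with the helper names made up for illustration.

#include <cstdint>

// extsbx: sign-extend the low byte of rS into rA; extshx: the low halfword.
static inline uint32_t extsb(uint32_t rs) { return (uint32_t)(int32_t)(int8_t)rs; }
static inline uint32_t extsh(uint32_t rs) { return (uint32_t)(int32_t)(int16_t)rs; }
// e.g. extsb(0x000000FF) == 0xFFFFFFFF, extsh(0x00007FFF) == 0x00007FFF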
--- Source/Core/Core/PowerPC/Jit64/Jit.h | 3 +- .../Core/Core/PowerPC/Jit64/Jit64_Tables.cpp | 4 +- .../Core/Core/PowerPC/Jit64/Jit_Integer.cpp | 53 ++++++------------- 3 files changed, 20 insertions(+), 40 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.h b/Source/Core/Core/PowerPC/Jit64/Jit.h index 0b726521a6..a2b261d7b3 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.h +++ b/Source/Core/Core/PowerPC/Jit64/Jit.h @@ -153,8 +153,7 @@ public: void addmex(UGeckoInstruction inst); void addzex(UGeckoInstruction inst); - void extsbx(UGeckoInstruction inst); - void extshx(UGeckoInstruction inst); + void extsXx(UGeckoInstruction inst); void sc(UGeckoInstruction _inst); void rfi(UGeckoInstruction _inst); diff --git a/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp b/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp index fa7c19aec8..927e83353f 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp @@ -190,8 +190,8 @@ static GekkoOPTemplate table31[] = {0, &Jit64::cmpXX}, //"cmp", OPTYPE_INTEGER, FL_IN_AB | FL_SET_CRn}}, {32, &Jit64::cmpXX}, //"cmpl", OPTYPE_INTEGER, FL_IN_AB | FL_SET_CRn}}, {26, &Jit64::cntlzwx}, //"cntlzwx",OPTYPE_INTEGER, FL_OUT_A | FL_IN_S | FL_RC_BIT}}, - {922, &Jit64::extshx}, //"extshx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_S | FL_RC_BIT}}, - {954, &Jit64::extsbx}, //"extsbx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_S | FL_RC_BIT}}, + {922, &Jit64::extsXx}, //"extshx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_S | FL_RC_BIT}}, + {954, &Jit64::extsXx}, //"extsbx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_S | FL_RC_BIT}}, {536, &Jit64::srwx}, //"srwx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT}}, {792, &Jit64::srawx}, //"srawx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT}}, {824, &Jit64::srawix}, //"srawix", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT}}, diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp index ec5f68cd18..01212e3065 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp @@ -796,56 +796,37 @@ void Jit64::boolX(UGeckoInstruction inst) } } -void Jit64::extsbx(UGeckoInstruction inst) +void Jit64::extsXx(UGeckoInstruction inst) { INSTRUCTION_START JITDISABLE(bJITIntegerOff); int a = inst.RA, s = inst.RS; + int size = inst.SUBOP10 == 922 ? 16 : 8; if (gpr.R(s).IsImm()) { - gpr.SetImmediate32(a, (u32)(s32)(s8)gpr.R(s).offset); + gpr.SetImmediate32(a, (u32)(s32)(size == 16 ? (s16)gpr.R(s).offset : (s8)gpr.R(s).offset)); + if (inst.Rc) + ComputeRC(gpr.R(a)); } else { gpr.Lock(a, s); gpr.BindToRegister(a, a == s, true); - MOVSX(32, 8, gpr.RX(a), gpr.R(s)); + // exts is moderately commonly used with inst.Rc, so try to optimize it. + if (inst.Rc) + { + // Only do one movsx; the movzx is free on most modern CPUs. 
+ MOVSX(64, size, gpr.RX(a), gpr.R(s)); + MOV(64, PPCSTATE(cr_val[0]), gpr.R(a)); + MOVZX(64, 32, gpr.RX(a), gpr.R(a)); + } + else + { + MOVSX(32, size, gpr.RX(a), gpr.R(s)); + } gpr.UnlockAll(); } - - if (inst.Rc) - { - ComputeRC(gpr.R(a)); - } -} - -void Jit64::extshx(UGeckoInstruction inst) -{ - INSTRUCTION_START - JITDISABLE(bJITIntegerOff); - int a = inst.RA, s = inst.RS; - - if (gpr.R(s).IsImm()) - { - gpr.SetImmediate32(a, (u32)(s32)(s16)gpr.R(s).offset); - } - else - { - gpr.Lock(a, s); - gpr.KillImmediate(s, true, false); - gpr.BindToRegister(a, a == s, true); - // This looks a little dangerous, but it's safe because - // every 32-bit register has a 16-bit half at the same index - // as the 32-bit register. - MOVSX(32, 16, gpr.RX(a), gpr.R(s)); - gpr.UnlockAll(); - } - - if (inst.Rc) - { - ComputeRC(gpr.R(a)); - } } void Jit64::subfic(UGeckoInstruction inst) From 9977da0550f7661a2cbe67c3371b123e94f65910 Mon Sep 17 00:00:00 2001 From: Fiora Date: Sun, 24 Aug 2014 11:26:46 -0700 Subject: [PATCH 06/11] JIT64: avoid using LEA for adds when not necessary --- Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp index 01212e3065..2d7cfedd8e 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp @@ -1428,13 +1428,6 @@ void Jit64::addx(UGeckoInstruction inst) GenerateConstantOverflow((s64)i + (s64)j); } } - else if (gpr.R(a).IsSimpleReg() && gpr.R(b).IsSimpleReg() && !inst.Rc && !inst.OE) - { - gpr.Lock(a, b, d); - gpr.BindToRegister(d, false); - LEA(32, gpr.RX(d), MComplex(gpr.RX(a), gpr.RX(b), 1, 0)); - gpr.UnlockAll(); - } else if ((d == a) || (d == b)) { int operand = ((d == a) ? 
b : a); @@ -1447,6 +1440,15 @@ void Jit64::addx(UGeckoInstruction inst) ComputeRC(gpr.R(d)); gpr.UnlockAll(); } + else if (gpr.R(a).IsSimpleReg() && gpr.R(b).IsSimpleReg() && !inst.OE) + { + gpr.Lock(a, b, d); + gpr.BindToRegister(d, false); + LEA(32, gpr.RX(d), MComplex(gpr.RX(a), gpr.RX(b), 1, 0)); + if (inst.Rc) + ComputeRC(gpr.R(d)); + gpr.UnlockAll(); + } else { gpr.Lock(a, b, d); From 5b7761706952998b0e9844873c0a0a84cf4329a3 Mon Sep 17 00:00:00 2001 From: Fiora Date: Sun, 24 Aug 2014 11:54:37 -0700 Subject: [PATCH 07/11] JIT64: use LEA for the "a = b + imm" case of addi --- Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp index 2d7cfedd8e..af40bbef78 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp @@ -208,8 +208,15 @@ void Jit64::regimmop(int d, int a, bool binary, u32 value, Operation doop, void else { gpr.BindToRegister(d, false); - MOV(32, gpr.R(d), gpr.R(a)); - (this->*op)(32, gpr.R(d), Imm32(value)); //m_GPR[d] = m_GPR[_inst.RA] + _inst.SIMM_16; + if (doop == Add && gpr.R(a).IsSimpleReg() && !carry) + { + LEA(32, gpr.RX(d), MDisp(gpr.RX(a), value)); + } + else + { + MOV(32, gpr.R(d), gpr.R(a)); + (this->*op)(32, gpr.R(d), Imm32(value)); //m_GPR[d] = m_GPR[_inst.RA] + _inst.SIMM_16; + } if (carry) GenerateCarry(); if (Rc) From a570c6b4a47accf098d658afe3e429e57742db78 Mon Sep 17 00:00:00 2001 From: Fiora Date: Sun, 24 Aug 2014 11:29:58 -0700 Subject: [PATCH 08/11] JIT64: tweak srwx/slwx BindToRegister arguments Register B gets immediately moved into the shift register, so even if a == b it doesn't need to be loaded. --- Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp index af40bbef78..62e6425d4a 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp @@ -1861,8 +1861,8 @@ void Jit64::srwx(UGeckoInstruction inst) // no register choice gpr.FlushLockX(ECX); gpr.Lock(a, b, s); - gpr.BindToRegister(a, (a == b || a == s), true); MOV(32, R(ECX), gpr.R(b)); + gpr.BindToRegister(a, a == s, true); if (a != s) { MOV(32, gpr.R(a), gpr.R(s)); @@ -1900,8 +1900,8 @@ void Jit64::slwx(UGeckoInstruction inst) // no register choice gpr.FlushLockX(ECX); gpr.Lock(a, b, s); - gpr.BindToRegister(a, (a == b || a == s), true); MOV(32, R(ECX), gpr.R(b)); + gpr.BindToRegister(a, a == s, true); if (a != s) { MOV(32, gpr.R(a), gpr.R(s)); From a95d8cbcb482426f36e2cd325f5aba1ee5fbef24 Mon Sep 17 00:00:00 2001 From: Fiora Date: Sun, 24 Aug 2014 11:35:57 -0700 Subject: [PATCH 09/11] JIT64: optimize carry handling Carries are rather common and unpredictable, so do them branchlessly wherever we can. 
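The branchless pattern this adds (JitSetCAIf in Jit_Util.cpp below) materializes the host condition with SETcc and ORs it into XER[CA] rather than branching. A C-level restatement of the before/after, assuming CA has already been cleared as the new JitClearCAOV/JitGetAndClearCAOV paths guarantee; the function names here are stand-ins, not emitter code.

#include <cstdint>

constexpr uint32_t XER_CA_SHIFT = 29;                 // as added to Gekko.h in this patch
constexpr uint32_t XER_CA_MASK  = 1u << XER_CA_SHIFT;

// Old: branch on the host carry flag, then set or clear XER.CA.
uint32_t UpdateCarryBranchy(uint32_t xer, bool carry)
{
    return carry ? (xer | XER_CA_MASK) : (xer & ~XER_CA_MASK);
}

// New: with CA cleared up front, the update is a single OR of the materialized
// flag (SETcc + MOVZX + SHL + OR against PPCSTATE(spr[SPR_XER])).
uint32_t UpdateCarryBranchless(uint32_t xer_with_ca_clear, bool carry)
{
    return xer_with_ca_clear | ((uint32_t)carry << XER_CA_SHIFT);
}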
--- Source/Core/Core/PowerPC/Gekko.h | 9 +- Source/Core/Core/PowerPC/Jit64/Jit.h | 4 - .../Core/Core/PowerPC/Jit64/Jit_Integer.cpp | 260 ++++-------------- Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp | 2 +- .../Core/Core/PowerPC/JitCommon/Jit_Util.cpp | 17 +- Source/Core/Core/PowerPC/JitCommon/Jit_Util.h | 3 +- 6 files changed, 84 insertions(+), 211 deletions(-) diff --git a/Source/Core/Core/PowerPC/Gekko.h b/Source/Core/Core/PowerPC/Gekko.h index 1a9e97b559..3a97d96472 100644 --- a/Source/Core/Core/PowerPC/Gekko.h +++ b/Source/Core/Core/PowerPC/Gekko.h @@ -331,9 +331,12 @@ union UFPR float f[2]; }; -#define XER_CA_MASK 0x20000000 -#define XER_OV_MASK 0x40000000 -#define XER_SO_MASK 0x80000000 +#define XER_CA_SHIFT 29 +#define XER_OV_SHIFT 30 +#define XER_SO_SHIFT 31 +#define XER_CA_MASK (1U << XER_CA_SHIFT) +#define XER_OV_MASK (1U << XER_OV_SHIFT) +#define XER_SO_MASK (1U << XER_SO_SHIFT) // XER union UReg_XER { diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.h b/Source/Core/Core/PowerPC/Jit64/Jit.h index a2b261d7b3..ac3defbb9f 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.h +++ b/Source/Core/Core/PowerPC/Jit64/Jit.h @@ -101,10 +101,6 @@ public: void GenerateConstantOverflow(s64 val); void GenerateOverflow(); void FinalizeCarryOverflow(bool oe, bool inv = false); - void GetCarryRSCRATCHAndClear(); - void FinalizeCarryGenerateOverflowRSCRATCH(bool oe, bool inv = false); - void GenerateCarry(); - void GenerateRC(); void ComputeRC(const Gen::OpArg & arg); // Use to extract bytes from a register using the regcache. offset is in bytes. diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp index 62e6425d4a..b119232bf1 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp @@ -31,6 +31,7 @@ void Jit64::GenerateConstantOverflow(bool overflow) } } +// We could do overflow branchlessly, but unlike carry it seems to be quite a bit rarer. void Jit64::GenerateOverflow() { FixupBranch jno = J_CC(CC_NO); @@ -49,82 +50,24 @@ void Jit64::FinalizeCarryOverflow(bool oe, bool inv) // USES_XER if (oe) { + // this is slightly messy because JitSetCAIf modifies x86 flags, so we have to do it in both + // sides of the branch. FixupBranch jno = J_CC(CC_NO); - // Do carry - FixupBranch carry1 = J_CC(inv ? CC_C : CC_NC); - JitSetCA(); - SetJumpTarget(carry1); + JitSetCAIf(inv ? CC_NC : CC_C); //XER[OV/SO] = 1 OR(32, PPCSTATE(spr[SPR_XER]), Imm32(XER_SO_MASK | XER_OV_MASK)); FixupBranch exit = J(); SetJumpTarget(jno); - // Do carry - FixupBranch carry2 = J_CC(inv ? CC_C : CC_NC); - JitSetCA(); - SetJumpTarget(carry2); + JitSetCAIf(inv ? CC_NC : CC_C); SetJumpTarget(exit); } else { // Do carry - FixupBranch carry1 = J_CC(inv ? CC_C : CC_NC); - JitSetCA(); - SetJumpTarget(carry1); + JitSetCAIf(inv ? CC_NC : CC_C); } } -void Jit64::GetCarryRSCRATCHAndClear() -{ - MOV(32, R(RSCRATCH), PPCSTATE(spr[SPR_XER])); - BTR(32, R(RSCRATCH), Imm8(29)); -} - -// Assumes that XER is in RSCRATCH and that the CA bit is clear. -void Jit64::FinalizeCarryGenerateOverflowRSCRATCH(bool oe, bool inv) -{ - // USES_XER - if (oe) - { - FixupBranch jno = J_CC(CC_NO); - // Do carry - FixupBranch carry1 = J_CC(inv ? CC_C : CC_NC); - OR(32, R(RSCRATCH), Imm32(XER_CA_MASK)); - SetJumpTarget(carry1); - //XER[OV/SO] = 1 - OR(32, R(RSCRATCH), Imm32(XER_SO_MASK | XER_OV_MASK)); - FixupBranch exit = J(); - SetJumpTarget(jno); - // Do carry - FixupBranch carry2 = J_CC(inv ? 
CC_C : CC_NC); - OR(32, R(RSCRATCH), Imm32(XER_CA_MASK)); - SetJumpTarget(carry2); - //XER[OV] = 0 - AND(32, R(RSCRATCH), Imm32(~XER_OV_MASK)); - SetJumpTarget(exit); - } - else - { - // Do carry - FixupBranch carry1 = J_CC(inv ? CC_C : CC_NC); - OR(32, R(RSCRATCH), Imm32(XER_CA_MASK)); - SetJumpTarget(carry1); - } - // Dump RSCRATCH back into XER - MOV(32, PPCSTATE(spr[SPR_XER]), R(RSCRATCH)); -} - -// Assumes that the flags were just set through an addition. -void Jit64::GenerateCarry() -{ - // USES_XER - FixupBranch pNoCarry = J_CC(CC_NC); - OR(32, PPCSTATE(spr[SPR_XER]), Imm32(XER_CA_MASK)); - FixupBranch pContinue = J(); - SetJumpTarget(pNoCarry); - AND(32, PPCSTATE(spr[SPR_XER]), Imm32(~(XER_CA_MASK))); - SetJumpTarget(pContinue); -} - void Jit64::ComputeRC(const Gen::OpArg & arg) { if (arg.IsImm()) @@ -153,12 +96,12 @@ OpArg Jit64::ExtractFromReg(int reg, int offset) // we can't do this optimization in the emitter because MOVZX and AND have different effects on flags. void Jit64::AndWithMask(X64Reg reg, u32 mask) - { +{ if (mask == 0xff) MOVZX(32, 8, reg, R(reg)); else if (mask == 0xffff) MOVZX(32, 16, reg, R(reg)); - else + else AND(32, R(reg), Imm32(mask)); } @@ -188,22 +131,16 @@ void Jit64::regimmop(int d, int a, bool binary, u32 value, Operation doop, void gpr.Lock(d, a); if (a || binary || carry) // yeh nasty special case addic { + if (carry) + JitClearCAOV(false); if (gpr.R(a).IsImm() && !carry) { gpr.SetImmediate32(d, doop((u32)gpr.R(a).offset, value)); - if (Rc) - { - ComputeRC(gpr.R(d)); - } } else if (a == d) { gpr.KillImmediate(d, true, true); (this->*op)(32, gpr.R(d), Imm32(value)); //m_GPR[d] = m_GPR[_inst.RA] + _inst.SIMM_16; - if (carry) - GenerateCarry(); - if (Rc) - ComputeRC(gpr.R(d)); } else { @@ -217,11 +154,11 @@ void Jit64::regimmop(int d, int a, bool binary, u32 value, Operation doop, void MOV(32, gpr.R(d), gpr.R(a)); (this->*op)(32, gpr.R(d), Imm32(value)); //m_GPR[d] = m_GPR[_inst.RA] + _inst.SIMM_16; } - if (carry) - GenerateCarry(); - if (Rc) - ComputeRC(gpr.R(d)); } + if (carry) + JitSetCAIf(CC_C); + if (Rc) + ComputeRC(gpr.R(d)); } else if (doop == Add) { @@ -848,13 +785,11 @@ void Jit64::subfic(UGeckoInstruction inst) { if (imm == 0) { - JitClearCA(); + JitClearCAOV(false); // Flags act exactly like subtracting from 0 NEG(32, gpr.R(d)); // Output carry is inverted - FixupBranch carry1 = J_CC(CC_C); - JitSetCA(); - SetJumpTarget(carry1); + JitSetCAIf(CC_NC); } else if (imm == -1) { @@ -864,24 +799,20 @@ void Jit64::subfic(UGeckoInstruction inst) } else { - JitClearCA(); + JitClearCAOV(false); NOT(32, gpr.R(d)); ADD(32, gpr.R(d), Imm32(imm+1)); // Output carry is normal - FixupBranch carry1 = J_CC(CC_NC); - JitSetCA(); - SetJumpTarget(carry1); + JitSetCAIf(CC_C); } } else { - JitClearCA(); + JitClearCAOV(false); MOV(32, gpr.R(d), Imm32(imm)); SUB(32, gpr.R(d), gpr.R(a)); // Output carry is inverted - FixupBranch carry1 = J_CC(CC_C); - JitSetCA(); - SetJumpTarget(carry1); + JitSetCAIf(CC_NC); } gpr.UnlockAll(); // This instruction has no RC flag @@ -926,7 +857,7 @@ void Jit64::subfex(UGeckoInstruction inst) gpr.Lock(a, b, d); gpr.BindToRegister(d, (d == a || d == b), true); - GetCarryRSCRATCHAndClear(); + JitGetAndClearCAOV(inst.OE); bool invertedCarry = false; if (d == b) @@ -947,7 +878,7 @@ void Jit64::subfex(UGeckoInstruction inst) NOT(32, gpr.R(d)); ADC(32, gpr.R(d), gpr.R(b)); } - FinalizeCarryGenerateOverflowRSCRATCH(inst.OE, invertedCarry); + FinalizeCarryOverflow(inst.OE, invertedCarry); if (inst.Rc) ComputeRC(gpr.R(d)); @@ -963,14 +894,12 @@ void 
Jit64::subfmex(UGeckoInstruction inst) gpr.Lock(a, d); gpr.BindToRegister(d, d == a); - GetCarryRSCRATCHAndClear(); + JitGetAndClearCAOV(inst.OE); if (d != a) - { MOV(32, gpr.R(d), gpr.R(a)); - } NOT(32, gpr.R(d)); ADC(32, gpr.R(d), Imm32(0xFFFFFFFF)); - FinalizeCarryGenerateOverflowRSCRATCH(inst.OE); + FinalizeCarryOverflow(inst.OE); if (inst.Rc) ComputeRC(gpr.R(d)); gpr.UnlockAll(); @@ -986,14 +915,12 @@ void Jit64::subfzex(UGeckoInstruction inst) gpr.Lock(a, d); gpr.BindToRegister(d, d == a); - GetCarryRSCRATCHAndClear(); + JitGetAndClearCAOV(inst.OE); if (d != a) - { MOV(32, gpr.R(d), gpr.R(a)); - } NOT(32, gpr.R(d)); ADC(32, gpr.R(d), Imm8(0)); - FinalizeCarryGenerateOverflowRSCRATCH(inst.OE); + FinalizeCarryOverflow(inst.OE); if (inst.Rc) ComputeRC(gpr.R(d)); @@ -1011,13 +938,9 @@ void Jit64::subfx(UGeckoInstruction inst) s32 i = (s32)gpr.R(b).offset, j = (s32)gpr.R(a).offset; gpr.SetImmediate32(d, i - j); if (inst.Rc) - { ComputeRC(gpr.R(d)); - } if (inst.OE) - { GenerateConstantOverflow((s64)i - (s64)j); - } } else { @@ -1477,31 +1400,22 @@ void Jit64::addex(UGeckoInstruction inst) JITDISABLE(bJITIntegerOff); int a = inst.RA, b = inst.RB, d = inst.RD; + gpr.Lock(a, b, d); + gpr.BindToRegister(d, (d == a) || (d == b)); + JitGetAndClearCAOV(inst.OE); if ((d == a) || (d == b)) { - gpr.Lock(a, b, d); - gpr.BindToRegister(d, true); - - GetCarryRSCRATCHAndClear(); ADC(32, gpr.R(d), gpr.R((d == a) ? b : a)); - FinalizeCarryGenerateOverflowRSCRATCH(inst.OE); - if (inst.Rc) - ComputeRC(gpr.R(d)); - gpr.UnlockAll(); } else { - gpr.Lock(a, b, d); - gpr.BindToRegister(d, false); - - GetCarryRSCRATCHAndClear(); MOV(32, gpr.R(d), gpr.R(a)); ADC(32, gpr.R(d), gpr.R(b)); - FinalizeCarryGenerateOverflowRSCRATCH(inst.OE); - if (inst.Rc) - ComputeRC(gpr.R(d)); - gpr.UnlockAll(); } + FinalizeCarryOverflow(inst.OE); + if (inst.Rc) + ComputeRC(gpr.R(d)); + gpr.UnlockAll(); } void Jit64::addcx(UGeckoInstruction inst) @@ -1543,31 +1457,16 @@ void Jit64::addmex(UGeckoInstruction inst) JITDISABLE(bJITIntegerOff); int a = inst.RA, d = inst.RD; - if (d == a) - { - gpr.Lock(d); - gpr.BindToRegister(d, true); - - GetCarryRSCRATCHAndClear(); - ADC(32, gpr.R(d), Imm32(0xFFFFFFFF)); - FinalizeCarryGenerateOverflowRSCRATCH(inst.OE); - if (inst.Rc) - ComputeRC(gpr.R(d)); - gpr.UnlockAll(); - } - else - { - gpr.Lock(a, d); - gpr.BindToRegister(d, false); - - GetCarryRSCRATCHAndClear(); + gpr.Lock(d); + gpr.BindToRegister(d, d == a); + JitGetAndClearCAOV(inst.OE); + if (d != a) MOV(32, gpr.R(d), gpr.R(a)); - ADC(32, gpr.R(d), Imm32(0xFFFFFFFF)); - FinalizeCarryGenerateOverflowRSCRATCH(inst.OE); - if (inst.Rc) - ComputeRC(gpr.R(d)); - gpr.UnlockAll(); - } + ADC(32, gpr.R(d), Imm32(0xFFFFFFFF)); + FinalizeCarryOverflow(inst.OE); + if (inst.Rc) + ComputeRC(gpr.R(d)); + gpr.UnlockAll(); } void Jit64::addzex(UGeckoInstruction inst) @@ -1577,31 +1476,16 @@ void Jit64::addzex(UGeckoInstruction inst) JITDISABLE(bJITIntegerOff); int a = inst.RA, d = inst.RD; - if (d == a) - { - gpr.Lock(d); - gpr.BindToRegister(d, true); - - GetCarryRSCRATCHAndClear(); - ADC(32, gpr.R(d), Imm8(0)); - FinalizeCarryGenerateOverflowRSCRATCH(inst.OE); - if (inst.Rc) - ComputeRC(gpr.R(d)); - gpr.UnlockAll(); - } - else - { - gpr.Lock(a, d); - gpr.BindToRegister(d, false); - - GetCarryRSCRATCHAndClear(); + gpr.Lock(d); + gpr.BindToRegister(d, d == a); + JitGetAndClearCAOV(inst.OE); + if (d != a) MOV(32, gpr.R(d), gpr.R(a)); - ADC(32, gpr.R(d), Imm8(0)); - FinalizeCarryGenerateOverflowRSCRATCH(inst.OE); - if (inst.Rc) - ComputeRC(gpr.R(d)); - 
gpr.UnlockAll(); - } + ADC(32, gpr.R(d), Imm8(0)); + FinalizeCarryOverflow(inst.OE); + if (inst.Rc) + ComputeRC(gpr.R(d)); + gpr.UnlockAll(); } void Jit64::rlwinmx(UGeckoInstruction inst) @@ -1793,8 +1677,8 @@ void Jit64::rlwnmx(UGeckoInstruction inst) // no register choice gpr.FlushLockX(ECX); gpr.Lock(a, b, s); - gpr.BindToRegister(a, (a == b || a == s), true); MOV(32, R(ECX), gpr.R(b)); + gpr.BindToRegister(a, (a == s), true); if (a != s) { MOV(32, gpr.R(a), gpr.R(s)); @@ -1903,9 +1787,7 @@ void Jit64::slwx(UGeckoInstruction inst) MOV(32, R(ECX), gpr.R(b)); gpr.BindToRegister(a, a == s, true); if (a != s) - { MOV(32, gpr.R(a), gpr.R(s)); - } SHL(64, gpr.R(a), R(ECX)); if (inst.Rc) { @@ -1932,7 +1814,7 @@ void Jit64::srawx(UGeckoInstruction inst) gpr.FlushLockX(ECX); gpr.Lock(a, s, b); gpr.BindToRegister(a, (a == s || a == b), true); - JitClearCA(); + JitClearCAOV(false); MOV(32, R(ECX), gpr.R(b)); if (a != s) MOV(32, gpr.R(a), gpr.R(s)); @@ -1941,16 +1823,11 @@ void Jit64::srawx(UGeckoInstruction inst) MOV(32, R(RSCRATCH), gpr.R(a)); SHR(64, gpr.R(a), Imm8(32)); TEST(32, gpr.R(a), R(RSCRATCH)); - FixupBranch nocarry = J_CC(CC_Z); - JitSetCA(); - SetJumpTarget(nocarry); + JitSetCAIf(CC_NZ); gpr.UnlockAll(); gpr.UnlockAllX(); - if (inst.Rc) - { ComputeRC(gpr.R(a)); - } } void Jit64::srawix(UGeckoInstruction inst) @@ -1964,39 +1841,27 @@ void Jit64::srawix(UGeckoInstruction inst) { gpr.Lock(a, s); gpr.BindToRegister(a, a == s, true); - JitClearCA(); + JitClearCAOV(false); MOV(32, R(RSCRATCH), gpr.R(s)); if (a != s) - { MOV(32, gpr.R(a), R(RSCRATCH)); - } SAR(32, gpr.R(a), Imm8(amount)); - if (inst.Rc) - ComputeRC(gpr.R(a)); - SHL(32, R(RSCRATCH), Imm8(32-amount)); + SHL(32, R(RSCRATCH), Imm8(32 - amount)); TEST(32, R(RSCRATCH), gpr.R(a)); - FixupBranch nocarry = J_CC(CC_Z); - JitSetCA(); - SetJumpTarget(nocarry); - gpr.UnlockAll(); + JitSetCAIf(CC_NZ); } else { gpr.Lock(a, s); - JitClearCA(); + JitClearCAOV(false); gpr.BindToRegister(a, a == s, true); if (a != s) - { MOV(32, gpr.R(a), gpr.R(s)); - } - - if (inst.Rc) - { - ComputeRC(gpr.R(a)); - } - gpr.UnlockAll(); } + if (inst.Rc) + ComputeRC(gpr.R(a)); + gpr.UnlockAll(); } // count leading zeroes @@ -2032,10 +1897,7 @@ void Jit64::cntlzwx(UGeckoInstruction inst) } if (inst.Rc) - { ComputeRC(gpr.R(a)); - // TODO: Check PPC manual too - } } void Jit64::twx(UGeckoInstruction inst) diff --git a/Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp b/Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp index d266023df5..3874c22a91 100644 --- a/Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp +++ b/Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp @@ -1106,7 +1106,7 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, u32 exitAddress) Jit->JitSetCA(); FixupBranch cont = Jit->J(); Jit->SetJumpTarget(nocarry); - Jit->JitClearCA(); + Jit->JitClearCAOV(false); Jit->SetJumpTarget(cont); regNormalRegClear(RI, I); break; diff --git a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp index 6b80fd853d..32be48fe0d 100644 --- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp +++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp @@ -803,10 +803,11 @@ void EmuCodeBlock::SetFPRF(Gen::X64Reg xmm) OR(32, PPCSTATE(fpscr), R(RSCRATCH)); } - -void EmuCodeBlock::JitClearCA() +void EmuCodeBlock::JitGetAndClearCAOV(bool oe) { - AND(32, PPCSTATE(spr[SPR_XER]), Imm32(~XER_CA_MASK)); //XER.CA = 0 + if (oe) + AND(32, PPCSTATE(spr[SPR_XER]), Imm32(~XER_OV_MASK)); //XER.OV = 0 + BTR(32, PPCSTATE(spr[SPR_XER]), Imm8(29)); //carry = XER.CA, XER.CA = 
0 } void EmuCodeBlock::JitSetCA() @@ -814,6 +815,16 @@ void EmuCodeBlock::JitSetCA() OR(32, PPCSTATE(spr[SPR_XER]), Imm32(XER_CA_MASK)); //XER.CA = 1 } +// Some testing shows CA is set roughly ~1/3 of the time (relative to clears), so +// branchless calculation of CA is probably faster in general. +void EmuCodeBlock::JitSetCAIf(CCFlags conditionCode) +{ + SETcc(conditionCode, R(RSCRATCH)); + MOVZX(32, 8, RSCRATCH, R(RSCRATCH)); + SHL(32, R(RSCRATCH), Imm8(XER_CA_SHIFT)); + OR(32, PPCSTATE(spr[SPR_XER]), R(RSCRATCH)); //XER.CA = 1 +} + void EmuCodeBlock::JitClearCAOV(bool oe) { if (oe) diff --git a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h index 73eb9ebfe8..e50eedf08f 100644 --- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h +++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h @@ -71,8 +71,9 @@ public: void SafeWriteF32ToReg(Gen::X64Reg xmm_value, Gen::X64Reg reg_addr, s32 offset, u32 registersInUse, int flags = 0); void WriteToConstRamAddress(int accessSize, Gen::X64Reg arg, u32 address, bool swap = false); - void JitClearCA(); + void JitGetAndClearCAOV(bool oe); void JitSetCA(); + void JitSetCAIf(Gen::CCFlags conditionCode); void JitClearCAOV(bool oe); void ForceSinglePrecisionS(Gen::X64Reg xmm); From b56117de05287f4bf573ed5e6d1de46efe5a3cac Mon Sep 17 00:00:00 2001 From: Fiora Date: Thu, 28 Aug 2014 10:21:46 -0700 Subject: [PATCH 10/11] JIT64: optimize some special cases of srawix Shift by 31 and 1, both of which are pretty common, can be done in a few less instructions. Tested with a hwtest. --- .../Core/Core/PowerPC/Jit64/Jit_Integer.cpp | 32 ++++++++++++++++--- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp index b119232bf1..a3f0eec8c6 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp @@ -1841,14 +1841,36 @@ void Jit64::srawix(UGeckoInstruction inst) { gpr.Lock(a, s); gpr.BindToRegister(a, a == s, true); - JitClearCAOV(false); MOV(32, R(RSCRATCH), gpr.R(s)); if (a != s) MOV(32, gpr.R(a), R(RSCRATCH)); - SAR(32, gpr.R(a), Imm8(amount)); - SHL(32, R(RSCRATCH), Imm8(32 - amount)); - TEST(32, R(RSCRATCH), gpr.R(a)); - JitSetCAIf(CC_NZ); + // some optimized common cases that can be done in slightly fewer ops + if (amount == 31) + { + JitSetCA(); + SAR(32, gpr.R(a), Imm8(31)); + NEG(32, R(RSCRATCH)); // RSCRATCH = input == INT_MIN ? INT_MIN : -input; + AND(32, R(RSCRATCH), Imm32(0x80000000)); // RSCRATCH = input < 0 && input != INT_MIN ? 0 : 0x80000000 + SHR(32, R(RSCRATCH), Imm8(31 - XER_CA_SHIFT)); + XOR(32, PPCSTATE(spr[SPR_XER]), R(RSCRATCH)); // XER.CA = (input < 0 && input != INT_MIN) + } + else if (amount == 1) + { + JitClearCAOV(false); + SHR(32, R(RSCRATCH), Imm8(31)); // sign + AND(32, R(RSCRATCH), gpr.R(a)); // (sign && carry) + SAR(32, gpr.R(a), Imm8(1)); + SHL(32, R(RSCRATCH), Imm8(XER_CA_SHIFT)); + OR(32, PPCSTATE(spr[SPR_XER]), R(RSCRATCH)); // XER.CA = sign && carry, aka (input&0x80000001) == 0x80000001 + } + else + { + JitClearCAOV(false); + SAR(32, gpr.R(a), Imm8(amount)); + SHL(32, R(RSCRATCH), Imm8(32 - amount)); + TEST(32, R(RSCRATCH), gpr.R(a)); + JitSetCAIf(CC_NZ); + } } else { From 8fc57d61ba701088c2517001066d24fbc6035b2d Mon Sep 17 00:00:00 2001 From: Fiora Date: Sun, 17 Aug 2014 23:12:16 -0700 Subject: [PATCH 11/11] JIT64: support merged branching for rlwinmx, too Not quite as common a branch instruction as cmpwi, but close. 
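The guest idiom this targets is an rlwinm. used as a bit-field test feeding a beq/bne on cr0 (only the EQ check is merged here). A hypothetical example of the pattern and its C equivalent; the registers and mask below are made up for illustration.

#include <cstdint>

// Guest code of roughly this shape:
//     rlwinm. r0, r3, 0, 27, 27    # r0 = r3 & 0x00000010; Rc=1 updates cr0 from r0
//     beq     skip                 # taken when the masked bit is clear
// is what the merged rlwinmx + bcx path compiles as one block exit.
bool BranchTaken(uint32_t r3)
{
    uint32_t r0 = r3 & 0x00000010u;  // SH=0, MB=27, ME=27 selects bit 0x10
    return r0 == 0;                  // beq tests cr0[EQ]
}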
--- Source/Core/Core/PowerPC/Jit64/Jit.h | 2 + .../Core/Core/PowerPC/Jit64/Jit_Integer.cpp | 216 ++++++++++-------- 2 files changed, 120 insertions(+), 98 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.h b/Source/Core/Core/PowerPC/Jit64/Jit.h index ac3defbb9f..821df102d1 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.h +++ b/Source/Core/Core/PowerPC/Jit64/Jit.h @@ -106,6 +106,8 @@ public: // Use to extract bytes from a register using the regcache. offset is in bytes. Gen::OpArg ExtractFromReg(int reg, int offset); void AndWithMask(Gen::X64Reg reg, u32 mask); + bool CheckMergedBranch(int crf); + void DoMergedBranch(); // Reads a given bit of a given CR register part. void GetCRFieldBit(int field, int bit, Gen::X64Reg out, bool negate = false); diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp index a3f0eec8c6..5b405de5bb 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp @@ -264,6 +264,54 @@ void Jit64::reg_imm(UGeckoInstruction inst) } } +bool Jit64::CheckMergedBranch(int crf) +{ + const UGeckoInstruction& next = js.next_inst; + return (((next.OPCD == 16 /* bcx */) || + ((next.OPCD == 19) && (next.SUBOP10 == 528) /* bcctrx */) || + ((next.OPCD == 19) && (next.SUBOP10 == 16) /* bclrx */)) && + (next.BO & BO_DONT_DECREMENT_FLAG) && + !(next.BO & BO_DONT_CHECK_CONDITION) && + (next.BI >> 2) == crf); +} + +void Jit64::DoMergedBranch() +{ + // Code that handles successful PPC branching. + if (js.next_inst.OPCD == 16) // bcx + { + if (js.next_inst.LK) + MOV(32, M(&LR), Imm32(js.next_compilerPC + 4)); + + u32 destination; + if (js.next_inst.AA) + destination = SignExt16(js.next_inst.BD << 2); + else + destination = js.next_compilerPC + SignExt16(js.next_inst.BD << 2); + WriteExit(destination); + } + else if ((js.next_inst.OPCD == 19) && (js.next_inst.SUBOP10 == 528)) // bcctrx + { + if (js.next_inst.LK) + MOV(32, M(&LR), Imm32(js.next_compilerPC + 4)); + MOV(32, R(RSCRATCH), M(&CTR)); + AND(32, R(RSCRATCH), Imm32(0xFFFFFFFC)); + WriteExitDestInRSCRATCH(); + } + else if ((js.next_inst.OPCD == 19) && (js.next_inst.SUBOP10 == 16)) // bclrx + { + MOV(32, R(RSCRATCH), M(&LR)); + AND(32, R(RSCRATCH), Imm32(0xFFFFFFFC)); + if (js.next_inst.LK) + MOV(32, M(&LR), Imm32(js.next_compilerPC + 4)); + WriteExitDestInRSCRATCH(); + } + else + { + PanicAlert("WTF invalid branch"); + } +} + void Jit64::cmpXX(UGeckoInstruction inst) { // USES_CR @@ -272,23 +320,7 @@ void Jit64::cmpXX(UGeckoInstruction inst) int a = inst.RA; int b = inst.RB; int crf = inst.CRFD; - - bool merge_branch = false; - int test_crf = js.next_inst.BI >> 2; - // Check if the next instruction is a branch - if it is, merge the two. - if (((js.next_inst.OPCD == 16 /* bcx */) || - ((js.next_inst.OPCD == 19) && (js.next_inst.SUBOP10 == 528) /* bcctrx */) || - ((js.next_inst.OPCD == 19) && (js.next_inst.SUBOP10 == 16) /* bclrx */)) && - (js.next_inst.BO & BO_DONT_DECREMENT_FLAG) && - !(js.next_inst.BO & BO_DONT_CHECK_CONDITION)) - { - // Looks like a decent conditional branch that we can merge with. - // It only test CR, not CTR. 
- if (test_crf == crf) - { - merge_branch = true; - } - } + bool merge_branch = CheckMergedBranch(crf); OpArg comparand; bool signedCompare; @@ -358,47 +390,13 @@ void Jit64::cmpXX(UGeckoInstruction inst) { gpr.Flush(); fpr.Flush(); - - if (js.next_inst.OPCD == 16) // bcx - { - if (js.next_inst.LK) - MOV(32, PPCSTATE_LR, Imm32(js.next_compilerPC + 4)); - - u32 destination; - if (js.next_inst.AA) - destination = SignExt16(js.next_inst.BD << 2); - else - destination = js.next_compilerPC + SignExt16(js.next_inst.BD << 2); - WriteExit(destination); - } - else if ((js.next_inst.OPCD == 19) && (js.next_inst.SUBOP10 == 528)) // bcctrx - { - if (js.next_inst.LK) - MOV(32, PPCSTATE_LR, Imm32(js.next_compilerPC + 4)); - MOV(32, R(RSCRATCH), PPCSTATE_CTR); - AND(32, R(RSCRATCH), Imm32(0xFFFFFFFC)); - WriteExitDestInRSCRATCH(); - } - else if ((js.next_inst.OPCD == 19) && (js.next_inst.SUBOP10 == 16)) // bclrx - { - MOV(32, R(RSCRATCH), PPCSTATE_LR); - if (js.next_inst.LK) - MOV(32, PPCSTATE_LR, Imm32(js.next_compilerPC + 4)); - WriteExitDestInRSCRATCH(); - } - else - { - PanicAlert("WTF invalid branch"); - } + DoMergedBranch(); } - else + else if (!analyzer.HasOption(PPCAnalyst::PPCAnalyzer::OPTION_CONDITIONAL_CONTINUE)) { - if (!analyzer.HasOption(PPCAnalyst::PPCAnalyzer::OPTION_CONDITIONAL_CONTINUE)) - { - gpr.Flush(); - fpr.Flush(); - WriteExit(js.next_compilerPC + 4); - } + gpr.Flush(); + fpr.Flush(); + WriteExit(js.next_compilerPC + 4); } } } @@ -487,51 +485,12 @@ void Jit64::cmpXX(UGeckoInstruction inst) gpr.Flush(FLUSH_MAINTAIN_STATE); fpr.Flush(FLUSH_MAINTAIN_STATE); - // Code that handles successful PPC branching. - if (js.next_inst.OPCD == 16) // bcx - { - if (js.next_inst.LK) - MOV(32, PPCSTATE_LR, Imm32(js.next_compilerPC + 4)); - - u32 destination; - if (js.next_inst.AA) - destination = SignExt16(js.next_inst.BD << 2); - else - destination = js.next_compilerPC + SignExt16(js.next_inst.BD << 2); - WriteExit(destination); - } - else if ((js.next_inst.OPCD == 19) && (js.next_inst.SUBOP10 == 528)) // bcctrx - { - if (js.next_inst.LK) - MOV(32, PPCSTATE_LR, Imm32(js.next_compilerPC + 4)); - - MOV(32, R(RSCRATCH), PPCSTATE_CTR); - AND(32, R(RSCRATCH), Imm32(0xFFFFFFFC)); - WriteExitDestInRSCRATCH(); - } - else if ((js.next_inst.OPCD == 19) && (js.next_inst.SUBOP10 == 16)) // bclrx - { - MOV(32, R(RSCRATCH), PPCSTATE_LR); - AND(32, R(RSCRATCH), Imm32(0xFFFFFFFC)); - - if (js.next_inst.LK) - MOV(32, PPCSTATE_LR, Imm32(js.next_compilerPC + 4)); - - WriteExitDestInRSCRATCH(); - } - else - { - PanicAlert("WTF invalid branch"); - } + DoMergedBranch(); SetJumpTarget(pDontBranch); if (!analyzer.HasOption(PPCAnalyst::PPCAnalyzer::OPTION_CONDITIONAL_CONTINUE)) - { - gpr.Flush(); - fpr.Flush(); WriteExit(js.next_compilerPC + 4); - } } } @@ -1494,6 +1453,11 @@ void Jit64::rlwinmx(UGeckoInstruction inst) JITDISABLE(bJITIntegerOff); int a = inst.RA; int s = inst.RS; + + // rlwinm is commonly used as a branch test, second only to the more obvious cmpw. + // since it's almost never used with any check other than beq, only support beq for simplicity. + bool merge_branch = inst.Rc && CheckMergedBranch(0) && (js.next_inst.BI & 3) == 2; + if (gpr.R(s).IsImm()) { u32 result = (int)gpr.R(s).offset; @@ -1510,6 +1474,11 @@ void Jit64::rlwinmx(UGeckoInstruction inst) bool right_shift = inst.SH && inst.ME == 31 && inst.MB == 32 - inst.SH; u32 mask = Helper_Mask(inst.MB, inst.ME); bool simple_mask = mask == 0xff || mask == 0xffff; + // in case of a merged branch, track whether or not we've set flags. 
+ // if not, we need to do a TEST later to get them. + bool needs_test = false; + // if we know the high bit can't be set, we can avoid doing a sign extend for flag storage + bool needs_sext = true; int mask_size = inst.ME - inst.MB + 1; gpr.Lock(a, s); @@ -1517,11 +1486,14 @@ void Jit64::rlwinmx(UGeckoInstruction inst) if (a != s && left_shift && gpr.R(s).IsSimpleReg() && inst.SH <= 3) { LEA(32, gpr.RX(a), MScaled(gpr.RX(s), SCALE_1 << inst.SH, 0)); + needs_test = true; } // common optimized case: byte/word extract else if (simple_mask && !(inst.SH & (mask_size - 1))) { MOVZX(32, mask_size, gpr.RX(a), ExtractFromReg(s, inst.SH ? (32 - inst.SH) >> 3 : 0)); + needs_test = true; + needs_sext = false; } // another optimized special case: byte/word extract plus shift else if (((mask >> inst.SH) << inst.SH) == mask && !left_shift && @@ -1529,6 +1501,7 @@ void Jit64::rlwinmx(UGeckoInstruction inst) { MOVZX(32, mask_size, gpr.RX(a), gpr.R(s)); SHL(32, gpr.R(a), Imm8(inst.SH)); + needs_sext = inst.SH + mask_size >= 32; } else { @@ -1542,17 +1515,64 @@ void Jit64::rlwinmx(UGeckoInstruction inst) else if (right_shift) { SHR(32, gpr.R(a), Imm8(inst.MB)); + needs_sext = false; } else { if (inst.SH != 0) ROL(32, gpr.R(a), Imm8(inst.SH)); if (!(inst.MB == 0 && inst.ME == 31)) - AndWithMask(gpr.RX(a), mask); + { + // we need flags if we're merging the branch + if (merge_branch) + AND(32, gpr.R(a), Imm32(mask)); + else + AndWithMask(gpr.RX(a), mask); + needs_sext = inst.MB == 0; + } + else + { + needs_test = true; + } } } - if (inst.Rc) + if (merge_branch) + { + js.downcountAmount++; + js.skipnext = true; + if (needs_sext) + { + MOVSX(64, 32, RSCRATCH, gpr.R(a)); + MOV(64, M(&PowerPC::ppcState.cr_val[0]), R(RSCRATCH)); + } + else + { + MOV(64, M(&PowerPC::ppcState.cr_val[0]), gpr.R(a)); + } + if (needs_test) + TEST(32, gpr.R(a), gpr.R(a)); + + gpr.UnlockAll(); + FixupBranch dont_branch = J_CC((js.next_inst.BO & BO_BRANCH_IF_TRUE) ? CC_NE : CC_E, true); + + gpr.Flush(FLUSH_MAINTAIN_STATE); + fpr.Flush(FLUSH_MAINTAIN_STATE); + + DoMergedBranch(); + + SetJumpTarget(dont_branch); + + if (!analyzer.HasOption(PPCAnalyst::PPCAnalyzer::OPTION_CONDITIONAL_CONTINUE)) + { + gpr.Flush(); + fpr.Flush(); + WriteExit(js.next_compilerPC + 4); + } + } + else if (inst.Rc) + { ComputeRC(gpr.R(a)); + } gpr.UnlockAll(); } }
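A small sketch of why the needs_sext shortcut above is sound: the cr0 staging value wants the 32-bit result sign-extended to 64 bits, and whenever bit 31 of the result is provably clear (e.g. after a MOVZX byte/word extract, or a right shift that cannot leave the top bit set), zero- and sign-extension coincide, so the extra MOVSX can be dropped.

#include <cassert>
#include <cstdint>

int64_t SignExtend32(uint32_t x)  { return (int64_t)(int32_t)x; }  // what MOVSX r64, r32 computes
uint64_t ZeroExtend32(uint32_t x) { return (uint64_t)x; }          // what a plain 32-bit MOV leaves

int main()
{
    for (uint32_t x : {0u, 1u, 0x7FFFFFFFu})
        assert((uint64_t)SignExtend32(x) == ZeroExtend32(x));      // equal whenever bit 31 is clear
    assert((uint64_t)SignExtend32(0x80000000u) != ZeroExtend32(0x80000000u));  // differs otherwise
    return 0;
}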