From b51aa4fa89c819f12c718a794ee8a62969e8b2d2 Mon Sep 17 00:00:00 2001 From: Fiora Date: Sun, 24 Aug 2014 11:03:07 -0700 Subject: [PATCH 01/13] Rename Log2 and add IsPow2 to MathUtils for future use Also remove unused pow2/pow2f functions. --- Source/Core/Common/MathUtil.h | 7 +++---- Source/Core/DolphinWX/GameListCtrl.cpp | 2 +- Source/Core/VideoBackends/OGL/StreamBuffer.cpp | 2 +- .../Core/VideoCommon/TextureConversionShader.cpp | 6 +++--- Source/UnitTests/Common/MathUtilTest.cpp | 16 ++++++++-------- 5 files changed, 16 insertions(+), 17 deletions(-) diff --git a/Source/Core/Common/MathUtil.h b/Source/Core/Common/MathUtil.h index db35cbedcf..013e0b9733 100644 --- a/Source/Core/Common/MathUtil.h +++ b/Source/Core/Common/MathUtil.h @@ -166,16 +166,15 @@ struct Rectangle } // namespace MathUtil -inline float pow2f(float x) {return x * x;} -inline double pow2(double x) {return x * x;} - float MathFloatVectorSum(const std::vector&); #define ROUND_UP(x, a) (((x) + (a) - 1) & ~((a) - 1)) #define ROUND_DOWN(x, a) ((x) & ~((a) - 1)) +inline bool IsPow2(u32 imm) {return (imm & (imm - 1)) == 0;} + // Rounds down. 0 -> undefined -inline int Log2(u64 val) +inline int IntLog2(u64 val) { #if defined(__GNUC__) return 63 - __builtin_clzll(val); diff --git a/Source/Core/DolphinWX/GameListCtrl.cpp b/Source/Core/DolphinWX/GameListCtrl.cpp index 53cdc272e2..84b2d8edff 100644 --- a/Source/Core/DolphinWX/GameListCtrl.cpp +++ b/Source/Core/DolphinWX/GameListCtrl.cpp @@ -397,7 +397,7 @@ static wxString NiceSizeFormat(u64 _size) // Find largest power of 2 less than _size. // div 10 to get largest named unit less than _size // 10 == log2(1024) (number of B in a KiB, KiB in a MiB, etc) - const u64 unit = Log2(std::max(_size, 1)) / 10; + const u64 unit = IntLog2(std::max(_size, 1)) / 10; const u64 unit_size = (1 << (unit * 10)); // mul 1000 for 3 decimal places, add 5 to round up, div 10 for 2 decimal places diff --git a/Source/Core/VideoBackends/OGL/StreamBuffer.cpp b/Source/Core/VideoBackends/OGL/StreamBuffer.cpp index 91d4692b08..43cb1cb61b 100644 --- a/Source/Core/VideoBackends/OGL/StreamBuffer.cpp +++ b/Source/Core/VideoBackends/OGL/StreamBuffer.cpp @@ -23,7 +23,7 @@ static u32 genBuffer() } StreamBuffer::StreamBuffer(u32 type, u32 size) -: m_buffer(genBuffer()), m_buffertype(type), m_size(ROUND_UP_POW2(size)), m_bit_per_slot(Log2(ROUND_UP_POW2(size) / SYNC_POINTS)) +: m_buffer(genBuffer()), m_buffertype(type), m_size(ROUND_UP_POW2(size)), m_bit_per_slot(IntLog2(ROUND_UP_POW2(size) / SYNC_POINTS)) { m_iterator = 0; m_used_iterator = 0; diff --git a/Source/Core/VideoCommon/TextureConversionShader.cpp b/Source/Core/VideoCommon/TextureConversionShader.cpp index 76e49ce464..b0215e4c08 100644 --- a/Source/Core/VideoCommon/TextureConversionShader.cpp +++ b/Source/Core/VideoCommon/TextureConversionShader.cpp @@ -91,8 +91,8 @@ static void WriteSwizzler(char*& p, u32 format, API_TYPE ApiType) WRITE(p, " int y_block_position = uv1.y & %d;\n", ~(blkH - 1)); WRITE(p, " int y_offset_in_block = uv1.y & %d;\n", blkH - 1); - WRITE(p, " int x_virtual_position = (uv1.x << %d) + y_offset_in_block * position.z;\n", Log2(samples)); - WRITE(p, " int x_block_position = (x_virtual_position >> %d) & %d;\n", Log2(blkH), ~(blkW - 1)); + WRITE(p, " int x_virtual_position = (uv1.x << %d) + y_offset_in_block * position.z;\n", IntLog2(samples)); + WRITE(p, " int x_block_position = (x_virtual_position >> %d) & %d;\n", IntLog2(blkH), ~(blkW - 1)); if (samples == 1) { // 32 bit textures (RGBA8 and Z24) are stored in 2 cache line increments @@ 
-100,7 +100,7 @@ static void WriteSwizzler(char*& p, u32 format, API_TYPE ApiType) WRITE(p, " x_virtual_position = x_virtual_position << 1;\n"); } WRITE(p, " int x_offset_in_block = x_virtual_position & %d;\n", blkW - 1); - WRITE(p, " int y_offset = (x_virtual_position >> %d) & %d;\n", Log2(blkW), blkH - 1); + WRITE(p, " int y_offset = (x_virtual_position >> %d) & %d;\n", IntLog2(blkW), blkH - 1); WRITE(p, " sampleUv.x = x_offset_in_block + x_block_position;\n"); WRITE(p, " sampleUv.y = y_block_position + y_offset;\n"); diff --git a/Source/UnitTests/Common/MathUtilTest.cpp b/Source/UnitTests/Common/MathUtilTest.cpp index 8ae757962c..9549039304 100644 --- a/Source/UnitTests/Common/MathUtilTest.cpp +++ b/Source/UnitTests/Common/MathUtilTest.cpp @@ -44,17 +44,17 @@ TEST(MathUtil, IsSNAN) EXPECT_TRUE(MathUtil::IsSNAN(std::numeric_limits::signaling_NaN())); } -TEST(MathUtil, Log2) +TEST(MathUtil, IntLog2) { - EXPECT_EQ(0, Log2(1)); - EXPECT_EQ(1, Log2(2)); - EXPECT_EQ(2, Log2(4)); - EXPECT_EQ(3, Log2(8)); - EXPECT_EQ(63, Log2(0x8000000000000000ull)); + EXPECT_EQ(0, IntLog2(1)); + EXPECT_EQ(1, IntLog2(2)); + EXPECT_EQ(2, IntLog2(4)); + EXPECT_EQ(3, IntLog2(8)); + EXPECT_EQ(63, IntLog2(0x8000000000000000ull)); // Rounding behavior. - EXPECT_EQ(3, Log2(15)); - EXPECT_EQ(63, Log2(0xFFFFFFFFFFFFFFFFull)); + EXPECT_EQ(3, IntLog2(15)); + EXPECT_EQ(63, IntLog2(0xFFFFFFFFFFFFFFFFull)); } TEST(MathUtil, FlushToZero) From 58dc802ce276de1d79131eea5b069157b0467fe9 Mon Sep 17 00:00:00 2001 From: Fiora Date: Sun, 24 Aug 2014 11:09:10 -0700 Subject: [PATCH 02/13] JIT64: optimize multiplication by immediate constants Factor out common code and handle a few more common cases. --- Source/Core/Core/PowerPC/Jit64/Jit.h | 2 + .../Core/Core/PowerPC/Jit64/Jit_Integer.cpp | 140 ++++++++---------- 2 files changed, 63 insertions(+), 79 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.h b/Source/Core/Core/PowerPC/Jit64/Jit.h index 5a515f1f81..47316f7944 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.h +++ b/Source/Core/Core/PowerPC/Jit64/Jit.h @@ -118,6 +118,8 @@ public: Gen::FixupBranch JumpIfCRFieldBit(int field, int bit, bool jump_if_set = true); void SetFPRFIfNeeded(UGeckoInstruction inst, Gen::X64Reg xmm); + void MultiplyImmediate(u32 imm, int a, int d, bool overflow); + void tri_op(int d, int a, int b, bool reversible, void (Gen::XEmitter::*op)(Gen::X64Reg, Gen::OpArg), UGeckoInstruction inst, bool roundRHS = false); typedef u32 (*Operation)(u32 a, u32 b); void regimmop(int d, int a, bool binary, u32 value, Operation doop, void (Gen::XEmitter::*op)(int, const Gen::OpArg&, const Gen::OpArg&), bool Rc = false, bool carry = false); diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp index ad30883018..f805a93279 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp @@ -5,6 +5,7 @@ #include #include +#include "Common/MathUtil.h" #include "Core/PowerPC/Jit64/Jit.h" #include "Core/PowerPC/Jit64/JitAsm.h" #include "Core/PowerPC/Jit64/JitRegCache.h" @@ -1007,6 +1008,64 @@ void Jit64::subfx(UGeckoInstruction inst) } } +void Jit64::MultiplyImmediate(u32 imm, int a, int d, bool overflow) +{ + // simplest cases first + if (imm == 0) + { + XOR(32, gpr.R(d), gpr.R(d)); + return; + } + + if (imm == (u32)-1) + { + if (d != a) + MOV(32, gpr.R(d), gpr.R(a)); + NEG(32, gpr.R(d)); + return; + } + + // skip these if we need to check overflow flag + if (!overflow) + { + // power of 2; just a shift + if 
(IsPow2(imm)) + { + u32 shift = IntLog2(imm); + // use LEA if it saves an op + if (d != a && shift <= 3 && shift >= 1 && gpr.R(a).IsSimpleReg()) + { + LEA(32, gpr.RX(d), MScaled(gpr.RX(a), SCALE_1 << shift, 0)); + } + else + { + if (d != a) + MOV(32, gpr.R(d), gpr.R(a)); + if (shift) + SHL(32, gpr.R(d), Imm8(shift)); + } + return; + } + + // We could handle factors of 2^N*3, 2^N*5, and 2^N*9 using lea+shl, but testing shows + // it seems to be slower overall. + static u8 lea_scales[3] = { 3, 5, 9 }; + for (int i = 0; i < 3; i++) + { + if (imm == lea_scales[i]) + { + if (d != a) + gpr.BindToRegister(a, true, false); + LEA(32, gpr.RX(d), MComplex(gpr.RX(a), gpr.RX(a), SCALE_2 << i, 0)); + return; + } + } + } + + // if we didn't find any better options + IMUL(32, gpr.RX(d), gpr.R(a), Imm32(imm)); +} + void Jit64::mulli(UGeckoInstruction inst) { INSTRUCTION_START @@ -1022,46 +1081,7 @@ void Jit64::mulli(UGeckoInstruction inst) { gpr.Lock(a, d); gpr.BindToRegister(d, (d == a), true); - if (imm == 0) - { - XOR(32, gpr.R(d), gpr.R(d)); - } - else if (imm == (u32)-1) - { - if (d != a) - MOV(32, gpr.R(d), gpr.R(a)); - NEG(32, gpr.R(d)); - } - else if ((imm & (imm - 1)) == 0) - { - u32 shift = 0; - - if (imm & 0xFFFF0000) - shift |= 16; - - if (imm & 0xFF00FF00) - shift |= 8; - - if (imm & 0xF0F0F0F0) - shift |= 4; - - if (imm & 0xCCCCCCCC) - shift |= 2; - - if (imm & 0xAAAAAAAA) - shift |= 1; - - if (d != a) - MOV(32, gpr.R(d), gpr.R(a)); - - if (shift) - SHL(32, gpr.R(d), Imm8(shift)); - } - else - { - IMUL(32, gpr.RX(d), gpr.R(a), Imm32(imm)); - } - + MultiplyImmediate(imm, a, d, false); gpr.UnlockAll(); } } @@ -1089,45 +1109,7 @@ void Jit64::mullwx(UGeckoInstruction inst) { u32 imm = gpr.R(a).IsImm() ? (u32)gpr.R(a).offset : (u32)gpr.R(b).offset; int src = gpr.R(a).IsImm() ? b : a; - if (imm == 0) - { - XOR(32, gpr.R(d), gpr.R(d)); - } - else if (imm == (u32)-1) - { - if (d != src) - MOV(32, gpr.R(d), gpr.R(src)); - NEG(32, gpr.R(d)); - } - else if ((imm & (imm - 1)) == 0 && !inst.OE) - { - u32 shift = 0; - - if (imm & 0xFFFF0000) - shift |= 16; - - if (imm & 0xFF00FF00) - shift |= 8; - - if (imm & 0xF0F0F0F0) - shift |= 4; - - if (imm & 0xCCCCCCCC) - shift |= 2; - - if (imm & 0xAAAAAAAA) - shift |= 1; - - if (d != src) - MOV(32, gpr.R(d), gpr.R(src)); - - if (shift) - SHL(32, gpr.R(d), Imm8(shift)); - } - else - { - IMUL(32, gpr.RX(d), gpr.R(src), Imm32(imm)); - } + MultiplyImmediate(imm, src, d, inst.OE); } else if (d == a) { From 41c3dde737d9dfbca01fce7ed1e9b561ab4023c8 Mon Sep 17 00:00:00 2001 From: Fiora Date: Sun, 24 Aug 2014 11:21:00 -0700 Subject: [PATCH 03/13] JIT64: optimize rlwinmx/rlwinix and friends Take advantage of movzx as a replacement for anding with 0xff or 0xffff, and abuse loads from the register cache to save ops. --- Source/Core/Core/PowerPC/Jit64/Jit.h | 4 + .../Core/Core/PowerPC/Jit64/Jit_Integer.cpp | 174 +++++++++++------- 2 files changed, 114 insertions(+), 64 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.h b/Source/Core/Core/PowerPC/Jit64/Jit.h index 47316f7944..79c1b9c36f 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.h +++ b/Source/Core/Core/PowerPC/Jit64/Jit.h @@ -107,6 +107,10 @@ public: void GenerateRC(); void ComputeRC(const Gen::OpArg & arg); + // use to extract bytes from a register using the regcache. offset is in bytes. + Gen::OpArg ExtractFromReg(int reg, int offset); + void AndWithMask(Gen::X64Reg reg, u32 mask); + // Reads a given bit of a given CR register part. Clobbers ABI_PARAM1, // don't forget to xlock it before. 
void GetCRFieldBit(int field, int bit, Gen::X64Reg out, bool negate = false); diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp index f805a93279..8398a5e97c 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp @@ -138,6 +138,30 @@ void Jit64::ComputeRC(const Gen::OpArg & arg) } } +OpArg Jit64::ExtractFromReg(int reg, int offset) +{ + OpArg src = gpr.R(reg); + // store to load forwarding should handle this case efficiently + if (offset) + { + gpr.StoreFromRegister(reg, FLUSH_MAINTAIN_STATE); + src = gpr.GetDefaultLocation(reg); + src.offset += offset; + } + return src; +} + +// we can't do this optimization in the emitter because MOVZX and AND have different effects on flags. +void Jit64::AndWithMask(X64Reg reg, u32 mask) + { + if (mask == 0xff) + MOVZX(32, 8, reg, R(reg)); + else if (mask == 0xffff) + MOVZX(32, 16, reg, R(reg)); + else + AND(32, R(reg), Imm32(mask)); +} + // Following static functions are used in conjunction with regimmop static u32 Add(u32 a, u32 b) { @@ -1577,49 +1601,57 @@ void Jit64::rlwinmx(UGeckoInstruction inst) result &= Helper_Mask(inst.MB, inst.ME); gpr.SetImmediate32(a, result); if (inst.Rc) - { ComputeRC(gpr.R(a)); - } } else { + bool isLeftShift = inst.SH && inst.MB == 0 && inst.ME == 31 - inst.SH; + bool isRightShift = inst.SH && inst.ME == 31 && inst.MB == 32 - inst.SH; + u32 mask = Helper_Mask(inst.MB, inst.ME); + bool simpleMask = mask == 0xff || mask == 0xffff; + int maskSize = inst.ME - inst.MB + 1; + gpr.Lock(a, s); gpr.BindToRegister(a, a == s); - if (a != s) + if (a != s && isLeftShift && gpr.R(s).IsSimpleReg() && inst.SH <= 3) { - MOV(32, gpr.R(a), gpr.R(s)); + LEA(32, gpr.RX(a), MScaled(gpr.RX(s), SCALE_1 << inst.SH, 0)); } - - if (inst.SH && inst.MB == 0 && inst.ME==31-inst.SH) + // common optimized case: byte/word extract + else if (simpleMask && !(inst.SH & (maskSize - 1))) { + MOVZX(32, maskSize, gpr.RX(a), ExtractFromReg(s, inst.SH ? 
(32 - inst.SH) >> 3 : 0)); + } + // another optimized special case: byte/word extract plus shift + else if (((mask >> inst.SH) << inst.SH) == mask && !isLeftShift && + ((mask >> inst.SH) == 0xff || (mask >> inst.SH) == 0xffff)) + { + MOVZX(32, maskSize, gpr.RX(a), gpr.R(s)); SHL(32, gpr.R(a), Imm8(inst.SH)); - if (inst.Rc) - ComputeRC(gpr.R(a)); - } - else if (inst.SH && inst.ME == 31 && inst.MB == 32 - inst.SH) - { - SHR(32, gpr.R(a), Imm8(inst.MB)); - if (inst.Rc) - ComputeRC(gpr.R(a)); } else { - if (inst.SH != 0) - { - ROL(32, gpr.R(a), Imm8(inst.SH)); - } + if (a != s) + MOV(32, gpr.R(a), gpr.R(s)); - if (!(inst.MB==0 && inst.ME==31)) + if (isLeftShift) { - AND(32, gpr.R(a), Imm32(Helper_Mask(inst.MB, inst.ME))); - if (inst.Rc) - ComputeRC(gpr.R(a)); + SHL(32, gpr.R(a), Imm8(inst.SH)); } - else if (inst.Rc) + else if (isRightShift) { - ComputeRC(gpr.R(a)); + SHR(32, gpr.R(a), Imm8(inst.MB)); + } + else + { + if (inst.SH != 0) + ROL(32, gpr.R(a), Imm8(inst.SH)); + if (!(inst.MB == 0 && inst.ME == 31)) + AndWithMask(gpr.RX(a), mask); } } + if (inst.Rc) + ComputeRC(gpr.R(a)); gpr.UnlockAll(); } } @@ -1637,75 +1669,89 @@ void Jit64::rlwimix(UGeckoInstruction inst) u32 mask = Helper_Mask(inst.MB,inst.ME); gpr.SetImmediate32(a, ((u32)gpr.R(a).offset & ~mask) | (_rotl((u32)gpr.R(s).offset,inst.SH) & mask)); if (inst.Rc) - { ComputeRC(gpr.R(a)); - } } else { gpr.Lock(a, s); - gpr.BindToRegister(a, true, true); u32 mask = Helper_Mask(inst.MB, inst.ME); if (mask == 0 || (a == s && inst.SH == 0)) { - if (inst.Rc) - { - ComputeRC(gpr.R(a)); - } + // nothing to do } else if (mask == 0xFFFFFFFF) { + gpr.BindToRegister(a, a == s, true); if (a != s) - { MOV(32, gpr.R(a), gpr.R(s)); - } - if (inst.SH) - { ROL(32, gpr.R(a), Imm8(inst.SH)); - } - - if (inst.Rc) - { - ComputeRC(gpr.R(a)); - } + } + else if(gpr.R(s).IsImm()) + { + gpr.BindToRegister(a, true, true); + AndWithMask(gpr.RX(a), ~mask); + OR(32, gpr.R(a), Imm32(_rotl((u32)gpr.R(s).offset, inst.SH) & mask)); } else if (inst.SH) { - if (mask == 0U - (1U << inst.SH)) + bool isLeftShift = mask == 0U - (1U << inst.SH); + bool isRightShift = mask == (1U << inst.SH) - 1; + if (gpr.R(a).IsImm()) { - MOV(32, R(EAX), gpr.R(s)); - SHL(32, R(EAX), Imm8(inst.SH)); - AND(32, gpr.R(a), Imm32(~mask)); - OR(32, gpr.R(a), R(EAX)); - } - else if (mask == (1U << inst.SH) - 1) - { - MOV(32, R(EAX), gpr.R(s)); - SHR(32, R(EAX), Imm8(32-inst.SH)); - AND(32, gpr.R(a), Imm32(~mask)); - OR(32, gpr.R(a), R(EAX)); + u32 maskA = gpr.R(a).offset & ~mask; + gpr.BindToRegister(a, false, true); + MOV(32, gpr.R(a), gpr.R(s)); + if (isLeftShift) + { + SHL(32, gpr.R(a), Imm8(inst.SH)); + } + else if (isRightShift) + { + SHR(32, gpr.R(a), Imm8(32 - inst.SH)); + } + else + { + ROL(32, gpr.R(a), Imm8(inst.SH)); + AND(32, gpr.R(a), Imm32(mask)); + } + OR(32, gpr.R(a), Imm32(maskA)); } else { + // TODO: common cases of this might be faster with pinsrb or abuse of AH + gpr.BindToRegister(a, true, true); MOV(32, R(EAX), gpr.R(s)); - ROL(32, R(EAX), Imm8(inst.SH)); - XOR(32, R(EAX), gpr.R(a)); - AND(32, R(EAX), Imm32(mask)); - XOR(32, gpr.R(a), R(EAX)); + if (isLeftShift) + { + SHL(32, R(EAX), Imm8(inst.SH)); + AndWithMask(gpr.RX(a), ~mask); + OR(32, gpr.R(a), R(EAX)); + } + else if (isRightShift) + { + SHR(32, R(EAX), Imm8(32 - inst.SH)); + AndWithMask(gpr.RX(a), ~mask); + OR(32, gpr.R(a), R(EAX)); + } + else + { + ROL(32, R(EAX), Imm8(inst.SH)); + XOR(32, R(EAX), gpr.R(a)); + AndWithMask(EAX, mask); + XOR(32, gpr.R(a), R(EAX)); + } } - - if (inst.Rc) - ComputeRC(gpr.R(a)); } else { + 
gpr.BindToRegister(a, true, true); XOR(32, gpr.R(a), gpr.R(s)); - AND(32, gpr.R(a), Imm32(~mask)); + AndWithMask(gpr.RX(a), ~mask); XOR(32, gpr.R(a), gpr.R(s)); - if (inst.Rc) - ComputeRC(gpr.R(a)); } + if (inst.Rc) + ComputeRC(gpr.R(a)); gpr.UnlockAll(); } } @@ -1736,7 +1782,7 @@ void Jit64::rlwnmx(UGeckoInstruction inst) MOV(32, gpr.R(a), gpr.R(s)); } ROL(32, gpr.R(a), R(ECX)); - AND(32, gpr.R(a), Imm32(mask)); + AndWithMask(gpr.RX(a), mask); if (inst.Rc) ComputeRC(gpr.R(a)); gpr.UnlockAll(); From 61af91ff163468cd9d99a9229001f614155ecc1e Mon Sep 17 00:00:00 2001 From: Fiora Date: Sun, 24 Aug 2014 11:24:27 -0700 Subject: [PATCH 04/13] JIT64: Optimize cmpXX Use TEST instead of CMP if we're comparing against 0 (rather common), and optimize the case of immediate compares further. --- .../Core/Core/PowerPC/Jit64/Jit_Integer.cpp | 24 +++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp index 8398a5e97c..1d43d2b406 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp @@ -479,13 +479,29 @@ void Jit64::cmpXX(UGeckoInstruction inst) MOVZX(64, 32, RAX, gpr.R(a)); if (comparand.IsImm()) - MOV(32, R(ABI_PARAM1), comparand); + { + // sign extension will ruin this, so store it in a register + if (comparand.offset & 0x80000000U) + { + MOV(32, R(ABI_PARAM1), comparand); + comparand = R(ABI_PARAM1); + } + } else + { MOVZX(64, 32, ABI_PARAM1, comparand); - - comparand = R(ABI_PARAM1); + comparand = R(ABI_PARAM1); + } + } + if (comparand.IsImm() && !comparand.offset) + { + if (merge_branch) + TEST(64, R(RAX), R(RAX)); + } + else + { + SUB(64, R(RAX), comparand); } - SUB(64, R(RAX), comparand); MOV(64, M(&PowerPC::ppcState.cr_val[crf]), R(RAX)); if (merge_branch) From 355850f499fd15a5bc6df987040d6c6a0987154b Mon Sep 17 00:00:00 2001 From: Fiora Date: Sun, 24 Aug 2014 11:25:56 -0700 Subject: [PATCH 05/13] JIT64: optimize sign/zero-extend Also remove some comments that no longer apply since x86_32 was dropped. --- Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp index 1d43d2b406..45a164de98 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp @@ -796,11 +796,7 @@ void Jit64::extsbx(UGeckoInstruction inst) { gpr.Lock(a, s); gpr.BindToRegister(a, a == s, true); - // Always force moving to EAX because it isn't possible - // to refer to the lowest byte of some registers, at least in - // 32-bit mode. - MOV(32, R(EAX), gpr.R(s)); - MOVSX(32, 8, gpr.RX(a), R(AL)); // watch out for ah and friends + MOVSX(32, 8, gpr.RX(a), gpr.R(s)); gpr.UnlockAll(); } @@ -823,11 +819,7 @@ void Jit64::extshx(UGeckoInstruction inst) else { gpr.Lock(a, s); - gpr.KillImmediate(s, true, false); gpr.BindToRegister(a, a == s, true); - // This looks a little dangerous, but it's safe because - // every 32-bit register has a 16-bit half at the same index - // as the 32-bit register. 
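// (On x86-64 a REX prefix makes the low byte of every GPR directly addressable
// (SIL, DIL, R8B..R15B), e.g. movsx eax, r9b encodes fine, so the old copy
// through EAX is unnecessary; only the high-byte forms AH/BH/CH/DH remain
// restricted, and nothing here uses those.)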
MOVSX(32, 16, gpr.RX(a), gpr.R(s)); gpr.UnlockAll(); } From cd0c52b537628cdfcce483cbbbdf310c62666dc0 Mon Sep 17 00:00:00 2001 From: Fiora Date: Sun, 24 Aug 2014 11:26:46 -0700 Subject: [PATCH 06/13] JIT64: avoid using LEA for adds when not necessary --- Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp index 45a164de98..bb3bd15969 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp @@ -1428,13 +1428,6 @@ void Jit64::addx(UGeckoInstruction inst) GenerateConstantOverflow((s64)i + (s64)j); } } - else if (gpr.R(a).IsSimpleReg() && gpr.R(b).IsSimpleReg() && !inst.Rc && !inst.OE) - { - gpr.Lock(a, b, d); - gpr.BindToRegister(d, false); - LEA(32, gpr.RX(d), MComplex(gpr.RX(a), gpr.RX(b), 1, 0)); - gpr.UnlockAll(); - } else if ((d == a) || (d == b)) { int operand = ((d == a) ? b : a); @@ -1447,6 +1440,15 @@ void Jit64::addx(UGeckoInstruction inst) ComputeRC(gpr.R(d)); gpr.UnlockAll(); } + else if (gpr.R(a).IsSimpleReg() && gpr.R(b).IsSimpleReg() && !inst.OE) + { + gpr.Lock(a, b, d); + gpr.BindToRegister(d, false); + LEA(32, gpr.RX(d), MComplex(gpr.RX(a), gpr.RX(b), 1, 0)); + if (inst.Rc) + ComputeRC(gpr.R(d)); + gpr.UnlockAll(); + } else { gpr.Lock(a, b, d); From 27996a65cfef64701a3b6e1a43bf982531ed000c Mon Sep 17 00:00:00 2001 From: Fiora Date: Sun, 24 Aug 2014 11:54:37 -0700 Subject: [PATCH 07/13] JIT64: use LEA for the "a = b + imm" case of addi --- Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp index bb3bd15969..a5e492b162 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp @@ -208,8 +208,15 @@ void Jit64::regimmop(int d, int a, bool binary, u32 value, Operation doop, void else { gpr.BindToRegister(d, false); - MOV(32, gpr.R(d), gpr.R(a)); - (this->*op)(32, gpr.R(d), Imm32(value)); //m_GPR[d] = m_GPR[_inst.RA] + _inst.SIMM_16; + if (doop == Add && gpr.R(a).IsSimpleReg() && !carry) + { + LEA(32, gpr.RX(d), MDisp(gpr.RX(a), value)); + } + else + { + MOV(32, gpr.R(d), gpr.R(a)); + (this->*op)(32, gpr.R(d), Imm32(value)); //m_GPR[d] = m_GPR[_inst.RA] + _inst.SIMM_16; + } if (carry) GenerateCarry(); if (Rc) From ad51fc7c4b2c7bd5df6aa86d235f573f1d0507dd Mon Sep 17 00:00:00 2001 From: Fiora Date: Sun, 24 Aug 2014 12:45:16 -0700 Subject: [PATCH 08/13] JIT64: use xor instead of mov for loading a zero regcache immediate --- Source/Core/Core/PowerPC/Jit64/JitRegCache.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Source/Core/Core/PowerPC/Jit64/JitRegCache.cpp b/Source/Core/Core/PowerPC/Jit64/JitRegCache.cpp index 695d68b578..632e1d1694 100644 --- a/Source/Core/Core/PowerPC/Jit64/JitRegCache.cpp +++ b/Source/Core/Core/PowerPC/Jit64/JitRegCache.cpp @@ -299,7 +299,10 @@ void RegCache::StoreFromRegister(size_t i, FlushMode mode) void GPRRegCache::LoadRegister(size_t preg, X64Reg newLoc) { - emit->MOV(32, ::Gen::R(newLoc), regs[preg].location); + if (regs[preg].location.IsImm() && !regs[preg].location.offset) + emit->XOR(32, ::Gen::R(newLoc), ::Gen::R(newLoc)); + else + emit->MOV(32, ::Gen::R(newLoc), regs[preg].location); } void GPRRegCache::StoreRegister(size_t preg, OpArg newLoc) From 
ee24d4714a1ed3995870927b906af4ffe39b527d Mon Sep 17 00:00:00 2001 From: Fiora Date: Sun, 24 Aug 2014 11:29:58 -0700 Subject: [PATCH 09/13] JIT64: tweak srwx/slwx BindToRegister arguments Register B gets immediately moved into the shift register, so even if a == b it doesn't need to be loaded. --- Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp index a5e492b162..1dd80562de 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp @@ -1859,8 +1859,8 @@ void Jit64::srwx(UGeckoInstruction inst) { gpr.FlushLockX(ECX); gpr.Lock(a, b, s); - gpr.BindToRegister(a, (a == b || a == s), true); MOV(32, R(ECX), gpr.R(b)); + gpr.BindToRegister(a, a == s, true); if (a != s) { MOV(32, gpr.R(a), gpr.R(s)); @@ -1897,8 +1897,8 @@ void Jit64::slwx(UGeckoInstruction inst) { gpr.FlushLockX(ECX); gpr.Lock(a, b, s); - gpr.BindToRegister(a, (a == b || a == s), true); MOV(32, R(ECX), gpr.R(b)); + gpr.BindToRegister(a, a == s, true); if (a != s) { MOV(32, gpr.R(a), gpr.R(s)); From 805be80f1277a6a11d789b7c742c59048daa84a1 Mon Sep 17 00:00:00 2001 From: Fiora Date: Sun, 24 Aug 2014 11:35:57 -0700 Subject: [PATCH 10/13] JIT64: Optimize carry handling Carries are rather common and unpredictable, so do them branchlessly wherever we can. --- Source/Core/Core/PowerPC/Gekko.h | 9 +- Source/Core/Core/PowerPC/Jit64/Jit.h | 4 - .../Core/Core/PowerPC/Jit64/Jit_Integer.cpp | 260 ++++-------------- Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp | 2 +- .../Core/Core/PowerPC/JitCommon/Jit_Util.cpp | 17 +- Source/Core/Core/PowerPC/JitCommon/Jit_Util.h | 3 +- 6 files changed, 84 insertions(+), 211 deletions(-) diff --git a/Source/Core/Core/PowerPC/Gekko.h b/Source/Core/Core/PowerPC/Gekko.h index 99cc750ee1..9354cc9738 100644 --- a/Source/Core/Core/PowerPC/Gekko.h +++ b/Source/Core/Core/PowerPC/Gekko.h @@ -331,9 +331,12 @@ union UFPR float f[2]; }; -#define XER_CA_MASK 0x20000000 -#define XER_OV_MASK 0x40000000 -#define XER_SO_MASK 0x80000000 +#define XER_CA_SHIFT 29 +#define XER_OV_SHIFT 30 +#define XER_SO_SHIFT 31 +#define XER_CA_MASK (1U << XER_CA_SHIFT) +#define XER_OV_MASK (1U << XER_OV_SHIFT) +#define XER_SO_MASK (1U << XER_SO_SHIFT) // XER union UReg_XER { diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.h b/Source/Core/Core/PowerPC/Jit64/Jit.h index 79c1b9c36f..d6eb895b47 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.h +++ b/Source/Core/Core/PowerPC/Jit64/Jit.h @@ -101,10 +101,6 @@ public: void GenerateConstantOverflow(s64 val); void GenerateOverflow(); void FinalizeCarryOverflow(bool oe, bool inv = false); - void GetCarryEAXAndClear(); - void FinalizeCarryGenerateOverflowEAX(bool oe, bool inv = false); - void GenerateCarry(); - void GenerateRC(); void ComputeRC(const Gen::OpArg & arg); // use to extract bytes from a register using the regcache. offset is in bytes. diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp index 1dd80562de..8f11862754 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp @@ -31,6 +31,7 @@ void Jit64::GenerateConstantOverflow(bool overflow) } } +// We could do overflow branchlessly, but unlike carry it seems to be quite a bit rarer. 
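+// (A branchless version is possible, e.g. a sketch mirroring JitSetCAIf below:
+//      SETcc(CC_O, R(EAX));
+//      MOVZX(32, 8, EAX, R(AL));
+//      SHL(32, R(EAX), Imm8(XER_OV_SHIFT));
+//      OR(32, M(&PowerPC::ppcState.spr[SPR_XER]), R(EAX));
+//  but that only covers setting OV; clearing OV and keeping SO sticky would
+//  still need handling, and a well-predicted branch is nearly free anyway.)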
void Jit64::GenerateOverflow() { FixupBranch jno = J_CC(CC_NO); @@ -49,82 +50,24 @@ void Jit64::FinalizeCarryOverflow(bool oe, bool inv) // USES_XER if (oe) { + // this is slightly messy because JitSetCAIf modifies x86 flags, so we have to do it in both + // sides of the branch. FixupBranch jno = J_CC(CC_NO); - // Do carry - FixupBranch carry1 = J_CC(inv ? CC_C : CC_NC); - JitSetCA(); - SetJumpTarget(carry1); + JitSetCAIf(inv ? CC_NC : CC_C); //XER[OV/SO] = 1 OR(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm32(XER_SO_MASK | XER_OV_MASK)); FixupBranch exit = J(); SetJumpTarget(jno); - // Do carry - FixupBranch carry2 = J_CC(inv ? CC_C : CC_NC); - JitSetCA(); - SetJumpTarget(carry2); + JitSetCAIf(inv ? CC_NC : CC_C); SetJumpTarget(exit); } else { // Do carry - FixupBranch carry1 = J_CC(inv ? CC_C : CC_NC); - JitSetCA(); - SetJumpTarget(carry1); + JitSetCAIf(inv ? CC_NC : CC_C); } } -void Jit64::GetCarryEAXAndClear() -{ - MOV(32, R(EAX), M(&PowerPC::ppcState.spr[SPR_XER])); - BTR(32, R(EAX), Imm8(29)); -} - -// Assumes that XER is in EAX and that the CA bit is clear. -void Jit64::FinalizeCarryGenerateOverflowEAX(bool oe, bool inv) -{ - // USES_XER - if (oe) - { - FixupBranch jno = J_CC(CC_NO); - // Do carry - FixupBranch carry1 = J_CC(inv ? CC_C : CC_NC); - OR(32, R(EAX), Imm32(XER_CA_MASK)); - SetJumpTarget(carry1); - //XER[OV/SO] = 1 - OR(32, R(EAX), Imm32(XER_SO_MASK | XER_OV_MASK)); - FixupBranch exit = J(); - SetJumpTarget(jno); - // Do carry - FixupBranch carry2 = J_CC(inv ? CC_C : CC_NC); - OR(32, R(EAX), Imm32(XER_CA_MASK)); - SetJumpTarget(carry2); - //XER[OV] = 0 - AND(32, R(EAX), Imm32(~XER_OV_MASK)); - SetJumpTarget(exit); - } - else - { - // Do carry - FixupBranch carry1 = J_CC(inv ? CC_C : CC_NC); - OR(32, R(EAX), Imm32(XER_CA_MASK)); - SetJumpTarget(carry1); - } - // Dump EAX back into XER - MOV(32, M(&PowerPC::ppcState.spr[SPR_XER]), R(EAX)); -} - -// Assumes that the flags were just set through an addition. -void Jit64::GenerateCarry() -{ - // USES_XER - FixupBranch pNoCarry = J_CC(CC_NC); - OR(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm32(XER_CA_MASK)); - FixupBranch pContinue = J(); - SetJumpTarget(pNoCarry); - AND(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm32(~(XER_CA_MASK))); - SetJumpTarget(pContinue); -} - void Jit64::ComputeRC(const Gen::OpArg & arg) { if (arg.IsImm()) @@ -153,12 +96,12 @@ OpArg Jit64::ExtractFromReg(int reg, int offset) // we can't do this optimization in the emitter because MOVZX and AND have different effects on flags. 
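// (Specifically, AND sets SF/ZF/PF from its result and clears CF/OF, while
// MOVZX leaves every flag untouched, so a caller that wants flags from the
// masked value must TEST it afterwards.)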
void Jit64::AndWithMask(X64Reg reg, u32 mask) - { +{ if (mask == 0xff) MOVZX(32, 8, reg, R(reg)); else if (mask == 0xffff) MOVZX(32, 16, reg, R(reg)); - else + else AND(32, R(reg), Imm32(mask)); } @@ -188,22 +131,16 @@ void Jit64::regimmop(int d, int a, bool binary, u32 value, Operation doop, void gpr.Lock(d, a); if (a || binary || carry) // yeh nasty special case addic { + if (carry) + JitClearCAOV(false); if (gpr.R(a).IsImm() && !carry) { gpr.SetImmediate32(d, doop((u32)gpr.R(a).offset, value)); - if (Rc) - { - ComputeRC(gpr.R(d)); - } } else if (a == d) { gpr.KillImmediate(d, true, true); (this->*op)(32, gpr.R(d), Imm32(value)); //m_GPR[d] = m_GPR[_inst.RA] + _inst.SIMM_16; - if (carry) - GenerateCarry(); - if (Rc) - ComputeRC(gpr.R(d)); } else { @@ -217,11 +154,11 @@ void Jit64::regimmop(int d, int a, bool binary, u32 value, Operation doop, void MOV(32, gpr.R(d), gpr.R(a)); (this->*op)(32, gpr.R(d), Imm32(value)); //m_GPR[d] = m_GPR[_inst.RA] + _inst.SIMM_16; } - if (carry) - GenerateCarry(); - if (Rc) - ComputeRC(gpr.R(d)); } + if (carry) + JitSetCAIf(CC_C); + if (Rc) + ComputeRC(gpr.R(d)); } else if (doop == Add) { @@ -849,13 +786,11 @@ void Jit64::subfic(UGeckoInstruction inst) { if (imm == 0) { - JitClearCA(); + JitClearCAOV(false); // Flags act exactly like subtracting from 0 NEG(32, gpr.R(d)); // Output carry is inverted - FixupBranch carry1 = J_CC(CC_C); - JitSetCA(); - SetJumpTarget(carry1); + JitSetCAIf(CC_NC); } else if (imm == -1) { @@ -865,24 +800,20 @@ void Jit64::subfic(UGeckoInstruction inst) } else { - JitClearCA(); + JitClearCAOV(false); NOT(32, gpr.R(d)); ADD(32, gpr.R(d), Imm32(imm+1)); // Output carry is normal - FixupBranch carry1 = J_CC(CC_NC); - JitSetCA(); - SetJumpTarget(carry1); + JitSetCAIf(CC_C); } } else { - JitClearCA(); + JitClearCAOV(false); MOV(32, gpr.R(d), Imm32(imm)); SUB(32, gpr.R(d), gpr.R(a)); // Output carry is inverted - FixupBranch carry1 = J_CC(CC_C); - JitSetCA(); - SetJumpTarget(carry1); + JitSetCAIf(CC_NC); } gpr.UnlockAll(); // This instruction has no RC flag @@ -927,7 +858,7 @@ void Jit64::subfex(UGeckoInstruction inst) gpr.Lock(a, b, d); gpr.BindToRegister(d, (d == a || d == b), true); - GetCarryEAXAndClear(); + JitGetAndClearCAOV(inst.OE); bool invertedCarry = false; if (d == b) @@ -948,7 +879,7 @@ void Jit64::subfex(UGeckoInstruction inst) NOT(32, gpr.R(d)); ADC(32, gpr.R(d), gpr.R(b)); } - FinalizeCarryGenerateOverflowEAX(inst.OE, invertedCarry); + FinalizeCarryOverflow(inst.OE, invertedCarry); if (inst.Rc) ComputeRC(gpr.R(d)); @@ -964,14 +895,12 @@ void Jit64::subfmex(UGeckoInstruction inst) gpr.Lock(a, d); gpr.BindToRegister(d, d == a); - GetCarryEAXAndClear(); + JitGetAndClearCAOV(inst.OE); if (d != a) - { MOV(32, gpr.R(d), gpr.R(a)); - } NOT(32, gpr.R(d)); ADC(32, gpr.R(d), Imm32(0xFFFFFFFF)); - FinalizeCarryGenerateOverflowEAX(inst.OE); + FinalizeCarryOverflow(inst.OE); if (inst.Rc) ComputeRC(gpr.R(d)); gpr.UnlockAll(); @@ -987,14 +916,12 @@ void Jit64::subfzex(UGeckoInstruction inst) gpr.Lock(a, d); gpr.BindToRegister(d, d == a); - GetCarryEAXAndClear(); + JitGetAndClearCAOV(inst.OE); if (d != a) - { MOV(32, gpr.R(d), gpr.R(a)); - } NOT(32, gpr.R(d)); ADC(32, gpr.R(d), Imm8(0)); - FinalizeCarryGenerateOverflowEAX(inst.OE); + FinalizeCarryOverflow(inst.OE); if (inst.Rc) ComputeRC(gpr.R(d)); @@ -1012,13 +939,9 @@ void Jit64::subfx(UGeckoInstruction inst) s32 i = (s32)gpr.R(b).offset, j = (s32)gpr.R(a).offset; gpr.SetImmediate32(d, i - j); if (inst.Rc) - { ComputeRC(gpr.R(d)); - } if (inst.OE) - { GenerateConstantOverflow((s64)i - 
(s64)j); - } } else { @@ -1477,31 +1400,22 @@ void Jit64::addex(UGeckoInstruction inst) JITDISABLE(bJITIntegerOff); int a = inst.RA, b = inst.RB, d = inst.RD; + gpr.Lock(a, b, d); + gpr.BindToRegister(d, (d == a) || (d == b)); + JitGetAndClearCAOV(inst.OE); if ((d == a) || (d == b)) { - gpr.Lock(a, b, d); - gpr.BindToRegister(d, true); - - GetCarryEAXAndClear(); ADC(32, gpr.R(d), gpr.R((d == a) ? b : a)); - FinalizeCarryGenerateOverflowEAX(inst.OE); - if (inst.Rc) - ComputeRC(gpr.R(d)); - gpr.UnlockAll(); } else { - gpr.Lock(a, b, d); - gpr.BindToRegister(d, false); - - GetCarryEAXAndClear(); MOV(32, gpr.R(d), gpr.R(a)); ADC(32, gpr.R(d), gpr.R(b)); - FinalizeCarryGenerateOverflowEAX(inst.OE); - if (inst.Rc) - ComputeRC(gpr.R(d)); - gpr.UnlockAll(); } + FinalizeCarryOverflow(inst.OE); + if (inst.Rc) + ComputeRC(gpr.R(d)); + gpr.UnlockAll(); } void Jit64::addcx(UGeckoInstruction inst) @@ -1543,31 +1457,16 @@ void Jit64::addmex(UGeckoInstruction inst) JITDISABLE(bJITIntegerOff); int a = inst.RA, d = inst.RD; - if (d == a) - { - gpr.Lock(d); - gpr.BindToRegister(d, true); - - GetCarryEAXAndClear(); - ADC(32, gpr.R(d), Imm32(0xFFFFFFFF)); - FinalizeCarryGenerateOverflowEAX(inst.OE); - if (inst.Rc) - ComputeRC(gpr.R(d)); - gpr.UnlockAll(); - } - else - { - gpr.Lock(a, d); - gpr.BindToRegister(d, false); - - GetCarryEAXAndClear(); + gpr.Lock(d); + gpr.BindToRegister(d, d == a); + JitGetAndClearCAOV(inst.OE); + if (d != a) MOV(32, gpr.R(d), gpr.R(a)); - ADC(32, gpr.R(d), Imm32(0xFFFFFFFF)); - FinalizeCarryGenerateOverflowEAX(inst.OE); - if (inst.Rc) - ComputeRC(gpr.R(d)); - gpr.UnlockAll(); - } + ADC(32, gpr.R(d), Imm32(0xFFFFFFFF)); + FinalizeCarryOverflow(inst.OE); + if (inst.Rc) + ComputeRC(gpr.R(d)); + gpr.UnlockAll(); } void Jit64::addzex(UGeckoInstruction inst) @@ -1577,31 +1476,16 @@ void Jit64::addzex(UGeckoInstruction inst) JITDISABLE(bJITIntegerOff); int a = inst.RA, d = inst.RD; - if (d == a) - { - gpr.Lock(d); - gpr.BindToRegister(d, true); - - GetCarryEAXAndClear(); - ADC(32, gpr.R(d), Imm8(0)); - FinalizeCarryGenerateOverflowEAX(inst.OE); - if (inst.Rc) - ComputeRC(gpr.R(d)); - gpr.UnlockAll(); - } - else - { - gpr.Lock(a, d); - gpr.BindToRegister(d, false); - - GetCarryEAXAndClear(); + gpr.Lock(d); + gpr.BindToRegister(d, d == a); + JitGetAndClearCAOV(inst.OE); + if (d != a) MOV(32, gpr.R(d), gpr.R(a)); - ADC(32, gpr.R(d), Imm8(0)); - FinalizeCarryGenerateOverflowEAX(inst.OE); - if (inst.Rc) - ComputeRC(gpr.R(d)); - gpr.UnlockAll(); - } + ADC(32, gpr.R(d), Imm8(0)); + FinalizeCarryOverflow(inst.OE); + if (inst.Rc) + ComputeRC(gpr.R(d)); + gpr.UnlockAll(); } void Jit64::rlwinmx(UGeckoInstruction inst) @@ -1792,8 +1676,8 @@ void Jit64::rlwnmx(UGeckoInstruction inst) { gpr.FlushLockX(ECX); gpr.Lock(a, b, s); - gpr.BindToRegister(a, (a == b || a == s), true); MOV(32, R(ECX), gpr.R(b)); + gpr.BindToRegister(a, (a == s), true); if (a != s) { MOV(32, gpr.R(a), gpr.R(s)); @@ -1900,9 +1784,7 @@ void Jit64::slwx(UGeckoInstruction inst) MOV(32, R(ECX), gpr.R(b)); gpr.BindToRegister(a, a == s, true); if (a != s) - { MOV(32, gpr.R(a), gpr.R(s)); - } SHL(64, gpr.R(a), R(ECX)); if (inst.Rc) { @@ -1929,7 +1811,7 @@ void Jit64::srawx(UGeckoInstruction inst) gpr.Lock(a, s, b); gpr.FlushLockX(ECX); gpr.BindToRegister(a, (a == s || a == b), true); - JitClearCA(); + JitClearCAOV(false); MOV(32, R(ECX), gpr.R(b)); if (a != s) MOV(32, gpr.R(a), gpr.R(s)); @@ -1938,16 +1820,11 @@ void Jit64::srawx(UGeckoInstruction inst) MOV(32, R(EAX), gpr.R(a)); SHR(64, gpr.R(a), Imm8(32)); TEST(32, gpr.R(a), R(EAX)); - 
FixupBranch nocarry = J_CC(CC_Z); - JitSetCA(); - SetJumpTarget(nocarry); + JitSetCAIf(CC_NZ); gpr.UnlockAll(); gpr.UnlockAllX(); - if (inst.Rc) - { ComputeRC(gpr.R(a)); - } } void Jit64::srawix(UGeckoInstruction inst) @@ -1961,21 +1838,14 @@ void Jit64::srawix(UGeckoInstruction inst) { gpr.Lock(a, s); gpr.BindToRegister(a, a == s, true); - JitClearCA(); + JitClearCAOV(false); MOV(32, R(EAX), gpr.R(s)); if (a != s) - { MOV(32, gpr.R(a), R(EAX)); - } SAR(32, gpr.R(a), Imm8(amount)); - if (inst.Rc) - ComputeRC(gpr.R(a)); - SHL(32, R(EAX), Imm8(32-amount)); + SHL(32, R(EAX), Imm8(32 - amount)); TEST(32, R(EAX), gpr.R(a)); - FixupBranch nocarry = J_CC(CC_Z); - JitSetCA(); - SetJumpTarget(nocarry); - gpr.UnlockAll(); + JitSetCAIf(CC_NZ); } else { @@ -1983,20 +1853,15 @@ void Jit64::srawix(UGeckoInstruction inst) FALLBACK_IF(true); gpr.Lock(a, s); - JitClearCA(); + JitClearCAOV(false); gpr.BindToRegister(a, a == s, true); if (a != s) - { MOV(32, gpr.R(a), gpr.R(s)); - } - - if (inst.Rc) - { - ComputeRC(gpr.R(a)); - } - gpr.UnlockAll(); } + if (inst.Rc) + ComputeRC(gpr.R(a)); + gpr.UnlockAll(); } // count leading zeroes @@ -2032,10 +1897,7 @@ void Jit64::cntlzwx(UGeckoInstruction inst) } if (inst.Rc) - { ComputeRC(gpr.R(a)); - // TODO: Check PPC manual too - } } void Jit64::twx(UGeckoInstruction inst) diff --git a/Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp b/Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp index a3b4a91881..c0abd0242c 100644 --- a/Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp +++ b/Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp @@ -1110,7 +1110,7 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, u32 exitAddress) Jit->JitSetCA(); FixupBranch cont = Jit->J(); Jit->SetJumpTarget(nocarry); - Jit->JitClearCA(); + Jit->JitClearCAOV(false); Jit->SetJumpTarget(cont); regNormalRegClear(RI, I); break; diff --git a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp index 58340b072e..f701c95ee9 100644 --- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp +++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp @@ -802,10 +802,11 @@ void EmuCodeBlock::SetFPRF(Gen::X64Reg xmm) OR(32, M(&FPSCR), R(EAX)); } - -void EmuCodeBlock::JitClearCA() +void EmuCodeBlock::JitGetAndClearCAOV(bool oe) { - AND(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm32(~XER_CA_MASK)); //XER.CA = 0 + if (oe) + AND(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm32(~XER_OV_MASK)); //XER.OV = 0 + BTR(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm8(29)); //carry = XER.CA, XER.CA = 0 } void EmuCodeBlock::JitSetCA() @@ -813,6 +814,16 @@ void EmuCodeBlock::JitSetCA() OR(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm32(XER_CA_MASK)); //XER.CA = 1 } +// Some testing shows CA is set roughly ~1/3 of the time (relative to clears), so +// branchless calculation of CA is probably faster in general. 
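+// In plain C++ the pattern below is roughly:
+//     u32 ca = condition ? 1u : 0u;   // SETcc + MOVZX
+//     XER |= ca << XER_CA_SHIFT;      // SHL + OR, no branch needed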
+void EmuCodeBlock::JitSetCAIf(CCFlags conditionCode) +{ + SETcc(conditionCode, R(EAX)); + MOVZX(32, 8, EAX, R(AL)); + SHL(32, R(EAX), Imm8(XER_CA_SHIFT)); + OR(32, M(&PowerPC::ppcState.spr[SPR_XER]), R(EAX)); //XER.CA = 1 +} + void EmuCodeBlock::JitClearCAOV(bool oe) { if (oe) diff --git a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h index addce16e93..2ce315d20e 100644 --- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h +++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h @@ -50,8 +50,9 @@ public: void SafeWriteF32ToReg(Gen::X64Reg xmm_value, Gen::X64Reg reg_addr, s32 offset, u32 registersInUse, int flags = 0); void WriteToConstRamAddress(int accessSize, Gen::X64Reg arg, u32 address, bool swap = false); - void JitClearCA(); + void JitGetAndClearCAOV(bool oe); void JitSetCA(); + void JitSetCAIf(Gen::CCFlags conditionCode); void JitClearCAOV(bool oe); void ForceSinglePrecisionS(Gen::X64Reg xmm); From 10d691a2779135f4a6e4683c5a501809d921e329 Mon Sep 17 00:00:00 2001 From: Fiora Date: Thu, 28 Aug 2014 10:21:46 -0700 Subject: [PATCH 11/13] JIT64: optimize some special cases of srawix Shift by 31 and 1, both of which are pretty common, can be done in a few less instructions. Tested with a hwtest. --- .../Core/Core/PowerPC/Jit64/Jit_Integer.cpp | 28 ++++++++++++++++--- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp index 8f11862754..893921f5e8 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp @@ -1842,10 +1842,30 @@ void Jit64::srawix(UGeckoInstruction inst) MOV(32, R(EAX), gpr.R(s)); if (a != s) MOV(32, gpr.R(a), R(EAX)); - SAR(32, gpr.R(a), Imm8(amount)); - SHL(32, R(EAX), Imm8(32 - amount)); - TEST(32, R(EAX), gpr.R(a)); - JitSetCAIf(CC_NZ); + // some optimized common cases that can be done in slightly fewer ops + if (amount == 31) + { + SAR(32, gpr.R(a), Imm8(31)); + NEG(32, R(EAX)); // EAX = input == INT_MIN ? INT_MIN : -input; + AND(32, R(EAX), Imm32(0x80000000)); // EAX = input < 0 && input != INT_MIN ? 0 : 0x80000000 + SHR(32, R(EAX), Imm8(31 - XER_CA_SHIFT)); + XOR(32, M(&PowerPC::ppcState.spr[SPR_XER]), R(EAX)); // XER.CA = (input < 0 && input != INT_MIN) + } + else if (amount == 1) + { + SHR(32, R(EAX), Imm8(31)); // sign + AND(32, R(EAX), gpr.R(a)); // (sign && carry) + SAR(32, gpr.R(a), Imm8(1)); + SHL(32, R(EAX), Imm8(XER_CA_SHIFT)); + OR(32, M(&PowerPC::ppcState.spr[SPR_XER]), R(EAX)); // XER.CA = sign && carry, aka (input&0x80000001) == 0x80000001 + } + else + { + SAR(32, gpr.R(a), Imm8(amount)); + SHL(32, R(EAX), Imm8(32 - amount)); + TEST(32, R(EAX), gpr.R(a)); + JitSetCAIf(CC_NZ); + } } else { From a40278b1c452a734fd131add07c560b59a5438f6 Mon Sep 17 00:00:00 2001 From: Fiora Date: Sun, 17 Aug 2014 23:12:16 -0700 Subject: [PATCH 12/13] JIT64: support merged branching for rlwinmx, too Not quite as common a branch instruction as cmpwi, but close. --- Source/Core/Core/PowerPC/Jit64/Jit.h | 2 + .../Core/Core/PowerPC/Jit64/Jit_Integer.cpp | 205 ++++++++++-------- 2 files changed, 112 insertions(+), 95 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.h b/Source/Core/Core/PowerPC/Jit64/Jit.h index d6eb895b47..2d23bc8fbb 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.h +++ b/Source/Core/Core/PowerPC/Jit64/Jit.h @@ -106,6 +106,8 @@ public: // use to extract bytes from a register using the regcache. offset is in bytes. 
Gen::OpArg ExtractFromReg(int reg, int offset); void AndWithMask(Gen::X64Reg reg, u32 mask); + bool CheckMergedBranch(int crf); + void DoMergedBranch(); // Reads a given bit of a given CR register part. Clobbers ABI_PARAM1, // don't forget to xlock it before. diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp index 893921f5e8..b187ff3a71 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp @@ -264,6 +264,56 @@ void Jit64::reg_imm(UGeckoInstruction inst) } } +bool Jit64::CheckMergedBranch(int crf) +{ + const UGeckoInstruction& next = js.next_inst; + if (((next.OPCD == 16 /* bcx */) || + ((next.OPCD == 19) && (next.SUBOP10 == 528) /* bcctrx */) || + ((next.OPCD == 19) && (next.SUBOP10 == 16) /* bclrx */)) && + (next.BO & BO_DONT_DECREMENT_FLAG) && + !(next.BO & BO_DONT_CHECK_CONDITION) && + (next.BI >> 2) == crf) + return true; + return false; +} + +void Jit64::DoMergedBranch() +{ + // Code that handles successful PPC branching. + if (js.next_inst.OPCD == 16) // bcx + { + if (js.next_inst.LK) + MOV(32, M(&LR), Imm32(js.compilerPC + 4)); + + u32 destination; + if (js.next_inst.AA) + destination = SignExt16(js.next_inst.BD << 2); + else + destination = js.next_compilerPC + SignExt16(js.next_inst.BD << 2); + WriteExit(destination); + } + else if ((js.next_inst.OPCD == 19) && (js.next_inst.SUBOP10 == 528)) // bcctrx + { + if (js.next_inst.LK) + MOV(32, M(&LR), Imm32(js.compilerPC + 4)); + MOV(32, R(EAX), M(&CTR)); + AND(32, R(EAX), Imm32(0xFFFFFFFC)); + WriteExitDestInEAX(); + } + else if ((js.next_inst.OPCD == 19) && (js.next_inst.SUBOP10 == 16)) // bclrx + { + MOV(32, R(EAX), M(&LR)); + AND(32, R(EAX), Imm32(0xFFFFFFFC)); + if (js.next_inst.LK) + MOV(32, M(&LR), Imm32(js.compilerPC + 4)); + WriteExitDestInEAX(); + } + else + { + PanicAlert("WTF invalid branch"); + } +} + void Jit64::cmpXX(UGeckoInstruction inst) { // USES_CR @@ -272,23 +322,7 @@ void Jit64::cmpXX(UGeckoInstruction inst) int a = inst.RA; int b = inst.RB; int crf = inst.CRFD; - - bool merge_branch = false; - int test_crf = js.next_inst.BI >> 2; - // Check if the next instruction is a branch - if it is, merge the two. - if (((js.next_inst.OPCD == 16 /* bcx */) || - ((js.next_inst.OPCD == 19) && (js.next_inst.SUBOP10 == 528) /* bcctrx */) || - ((js.next_inst.OPCD == 19) && (js.next_inst.SUBOP10 == 16) /* bclrx */)) && - (js.next_inst.BO & BO_DONT_DECREMENT_FLAG) && - !(js.next_inst.BO & BO_DONT_CHECK_CONDITION)) - { - // Looks like a decent conditional branch that we can merge with. - // It only test CR, not CTR. 
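// (This inline check is what CheckMergedBranch() above now factors out.)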
- if (test_crf == crf) - { - merge_branch = true; - } - } + bool merge_branch = CheckMergedBranch(crf); OpArg comparand; bool signedCompare; @@ -358,45 +392,13 @@ void Jit64::cmpXX(UGeckoInstruction inst) { gpr.Flush(); fpr.Flush(); - - if (js.next_inst.OPCD == 16) // bcx - { - if (js.next_inst.LK) - MOV(32, M(&LR), Imm32(js.compilerPC + 4)); - - u32 destination; - if (js.next_inst.AA) - destination = SignExt16(js.next_inst.BD << 2); - else - destination = js.next_compilerPC + SignExt16(js.next_inst.BD << 2); - WriteExit(destination); - } - else if ((js.next_inst.OPCD == 19) && (js.next_inst.SUBOP10 == 528)) // bcctrx - { - if (js.next_inst.LK) - MOV(32, M(&LR), Imm32(js.compilerPC + 4)); - MOV(32, R(EAX), M(&CTR)); - AND(32, R(EAX), Imm32(0xFFFFFFFC)); - WriteExitDestInEAX(); - } - else if ((js.next_inst.OPCD == 19) && (js.next_inst.SUBOP10 == 16)) // bclrx - { - MOV(32, R(EAX), M(&LR)); - if (js.next_inst.LK) - MOV(32, M(&LR), Imm32(js.compilerPC + 4)); - WriteExitDestInEAX(); - } - else - { - PanicAlert("WTF invalid branch"); - } + DoMergedBranch(); } - else + else if (!analyzer.HasOption(PPCAnalyst::PPCAnalyzer::OPTION_CONDITIONAL_CONTINUE)) { - if (!analyzer.HasOption(PPCAnalyst::PPCAnalyzer::OPTION_CONDITIONAL_CONTINUE)) - { - WriteExit(js.next_compilerPC + 4); - } + gpr.Flush(); + fpr.Flush(); + WriteExit(js.next_compilerPC + 4); } } } @@ -473,51 +475,12 @@ void Jit64::cmpXX(UGeckoInstruction inst) gpr.Flush(FLUSH_MAINTAIN_STATE); fpr.Flush(FLUSH_MAINTAIN_STATE); - // Code that handles successful PPC branching. - if (js.next_inst.OPCD == 16) // bcx - { - if (js.next_inst.LK) - MOV(32, M(&LR), Imm32(js.compilerPC + 4)); - - u32 destination; - if (js.next_inst.AA) - destination = SignExt16(js.next_inst.BD << 2); - else - destination = js.next_compilerPC + SignExt16(js.next_inst.BD << 2); - WriteExit(destination); - } - else if ((js.next_inst.OPCD == 19) && (js.next_inst.SUBOP10 == 528)) // bcctrx - { - if (js.next_inst.LK) - MOV(32, M(&LR), Imm32(js.compilerPC + 4)); - - MOV(32, R(EAX), M(&CTR)); - AND(32, R(EAX), Imm32(0xFFFFFFFC)); - WriteExitDestInEAX(); - } - else if ((js.next_inst.OPCD == 19) && (js.next_inst.SUBOP10 == 16)) // bclrx - { - MOV(32, R(EAX), M(&LR)); - AND(32, R(EAX), Imm32(0xFFFFFFFC)); - - if (js.next_inst.LK) - MOV(32, M(&LR), Imm32(js.compilerPC + 4)); - - WriteExitDestInEAX(); - } - else - { - PanicAlert("WTF invalid branch"); - } + DoMergedBranch(); SetJumpTarget(pDontBranch); if (!analyzer.HasOption(PPCAnalyst::PPCAnalyzer::OPTION_CONDITIONAL_CONTINUE)) - { - gpr.Flush(); - fpr.Flush(); WriteExit(js.next_compilerPC + 4); - } } } @@ -1494,6 +1457,11 @@ void Jit64::rlwinmx(UGeckoInstruction inst) JITDISABLE(bJITIntegerOff); int a = inst.RA; int s = inst.RS; + + // rlwinm is commonly used as a branch test, second only to the more obvious cmpw. + // since it's almost never used with any check other than beq, only support beq for simplicity. + bool merge_branch = inst.Rc && CheckMergedBranch(0) && (js.next_inst.BI & 3) == 2; + if (gpr.R(s).IsImm()) { u32 result = (int)gpr.R(s).offset; @@ -1510,6 +1478,11 @@ void Jit64::rlwinmx(UGeckoInstruction inst) bool isRightShift = inst.SH && inst.ME == 31 && inst.MB == 32 - inst.SH; u32 mask = Helper_Mask(inst.MB, inst.ME); bool simpleMask = mask == 0xff || mask == 0xffff; + // in case of a merged branch, track whether or not we've set flags. + // if not, we need to do a TEST later to get them. 
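+	// (TEST reg,reg sets SF/ZF from the value without modifying it, which is
+	// all the merged beq/bne check needs.)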
+ bool needsTest = false; + // if we know the high bit can't be set, we can avoid doing a sign extend for flag storage + bool needsSext = true; int maskSize = inst.ME - inst.MB + 1; gpr.Lock(a, s); @@ -1517,11 +1490,14 @@ void Jit64::rlwinmx(UGeckoInstruction inst) if (a != s && isLeftShift && gpr.R(s).IsSimpleReg() && inst.SH <= 3) { LEA(32, gpr.RX(a), MScaled(gpr.RX(s), SCALE_1 << inst.SH, 0)); + needsTest = true; } // common optimized case: byte/word extract else if (simpleMask && !(inst.SH & (maskSize - 1))) { MOVZX(32, maskSize, gpr.RX(a), ExtractFromReg(s, inst.SH ? (32 - inst.SH) >> 3 : 0)); + needsTest = true; + needsSext = false; } // another optimized special case: byte/word extract plus shift else if (((mask >> inst.SH) << inst.SH) == mask && !isLeftShift && @@ -1529,6 +1505,7 @@ void Jit64::rlwinmx(UGeckoInstruction inst) { MOVZX(32, maskSize, gpr.RX(a), gpr.R(s)); SHL(32, gpr.R(a), Imm8(inst.SH)); + needsSext = inst.SH + maskSize >= 32; } else { @@ -1542,16 +1519,54 @@ void Jit64::rlwinmx(UGeckoInstruction inst) else if (isRightShift) { SHR(32, gpr.R(a), Imm8(inst.MB)); + needsSext = false; } else { if (inst.SH != 0) ROL(32, gpr.R(a), Imm8(inst.SH)); if (!(inst.MB == 0 && inst.ME == 31)) + { AndWithMask(gpr.RX(a), mask); + needsSext = inst.MB == 0; + needsTest = simpleMask; + } + else + { + needsTest = true; + } } } - if (inst.Rc) + if (merge_branch) + { + js.downcountAmount++; + js.skipnext = true; + + if (needsSext) + MOVSX(64, 32, gpr.RX(a), gpr.R(a)); + MOV(64, M(&PowerPC::ppcState.cr_val[0]), gpr.R(a)); + + if (needsTest) + TEST(32, gpr.R(a), gpr.R(a)); + + gpr.UnlockAll(); + FixupBranch pDontBranch = J_CC((js.next_inst.BO & BO_BRANCH_IF_TRUE) ? CC_NE : CC_E, true); + + gpr.Flush(FLUSH_MAINTAIN_STATE); + fpr.Flush(FLUSH_MAINTAIN_STATE); + + DoMergedBranch(); + + SetJumpTarget(pDontBranch); + + if (!analyzer.HasOption(PPCAnalyst::PPCAnalyzer::OPTION_CONDITIONAL_CONTINUE)) + { + gpr.Flush(); + fpr.Flush(); + WriteExit(js.next_compilerPC + 4); + } + } + else if (inst.Rc) ComputeRC(gpr.R(a)); gpr.UnlockAll(); } From 3aa40dab001319b9f488c8e9f9a2ab3cd2f2d661 Mon Sep 17 00:00:00 2001 From: Fiora Date: Thu, 21 Aug 2014 13:56:18 -0700 Subject: [PATCH 13/13] JIT64: optimize carry calculations Omit carry calculations that get overwritten later in the block before they're used. Very common in the case of srawix and friends. 
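The liveness computation itself is a single backwards pass over the block. As
a sketch in plain C++ (abbreviated names, not the literal PPCAnalyst code):

    bool wantsCA = true;  // conservatively assume CA is live at block end
    for (int i = numOps - 1; i >= 0; i--)
    {
        ops[i].wantsCA = wantsCA;   // will the CA this op produces be read?
        if (ops[i].flags & FL_SET_CA)
            wantsCA = false;        // overwritten before any read: dead
        if (ops[i].flags & FL_READ_CA)
            wantsCA = true;         // read first (adde and friends): live
    }

Emitters then consult js.op->wantsCA and skip the XER update entirely when
nothing downstream will read it.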
--- .../Interpreter/Interpreter_Tables.cpp | 8 +- Source/Core/Core/PowerPC/Jit64/Jit.h | 2 +- .../Core/Core/PowerPC/Jit64/Jit64_Tables.cpp | 4 +- .../Core/Core/PowerPC/Jit64/Jit_Integer.cpp | 131 ++++++++++-------- Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp | 2 +- .../Core/Core/PowerPC/JitCommon/Jit_Util.cpp | 10 +- Source/Core/Core/PowerPC/JitCommon/Jit_Util.h | 2 +- Source/Core/Core/PowerPC/PPCAnalyst.cpp | 42 ++++-- Source/Core/Core/PowerPC/PPCAnalyst.h | 4 +- 9 files changed, 119 insertions(+), 86 deletions(-) diff --git a/Source/Core/Core/PowerPC/Interpreter/Interpreter_Tables.cpp b/Source/Core/Core/PowerPC/Interpreter/Interpreter_Tables.cpp index 2bf66ae99b..317132266d 100644 --- a/Source/Core/Core/PowerPC/Interpreter/Interpreter_Tables.cpp +++ b/Source/Core/Core/PowerPC/Interpreter/Interpreter_Tables.cpp @@ -34,7 +34,7 @@ static GekkoOPTemplate primarytable[] = {10, Interpreter::cmpli, {"cmpli", OPTYPE_INTEGER, FL_IN_A | FL_SET_CRn, 1, 0, 0, 0}}, {11, Interpreter::cmpi, {"cmpi", OPTYPE_INTEGER, FL_IN_A | FL_SET_CRn, 1, 0, 0, 0}}, {12, Interpreter::addic, {"addic", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_SET_CA, 1, 0, 0, 0}}, - {13, Interpreter::addic_rc, {"addic_rc", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_SET_CR0, 1, 0, 0, 0}}, + {13, Interpreter::addic_rc, {"addic_rc", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_SET_CA | FL_SET_CR0, 1, 0, 0, 0}}, {14, Interpreter::addi, {"addi", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A0, 1, 0, 0, 0}}, {15, Interpreter::addis, {"addis", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A0, 1, 0, 0, 0}}, @@ -180,8 +180,8 @@ static GekkoOPTemplate table31[] = {922, Interpreter::extshx, {"extshx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_S | FL_RC_BIT, 1, 0, 0, 0}}, {954, Interpreter::extsbx, {"extsbx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_S | FL_RC_BIT, 1, 0, 0, 0}}, {536, Interpreter::srwx, {"srwx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT, 1, 0, 0, 0}}, - {792, Interpreter::srawx, {"srawx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT, 1, 0, 0, 0}}, - {824, Interpreter::srawix, {"srawix", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT, 1, 0, 0, 0}}, + {792, Interpreter::srawx, {"srawx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}}, + {824, Interpreter::srawix, {"srawix", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}}, {24, Interpreter::slwx, {"slwx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT, 1, 0, 0, 0}}, {54, Interpreter::dcbst, {"dcbst", OPTYPE_DCACHE, 0, 5, 0, 0, 0}}, @@ -260,7 +260,7 @@ static GekkoOPTemplate table31[] = {339, Interpreter::mfspr, {"mfspr", OPTYPE_SPR, FL_OUT_D, 1, 0, 0, 0}}, {467, Interpreter::mtspr, {"mtspr", OPTYPE_SPR, 0, 2, 0, 0, 0}}, {371, Interpreter::mftb, {"mftb", OPTYPE_SYSTEM, FL_OUT_D | FL_TIMER, 1, 0, 0, 0}}, - {512, Interpreter::mcrxr, {"mcrxr", OPTYPE_SYSTEM, 0, 1, 0, 0, 0}}, + {512, Interpreter::mcrxr, {"mcrxr", OPTYPE_SYSTEM, FL_READ_CA | FL_SET_CA, 1, 0, 0, 0}}, {595, Interpreter::mfsr, {"mfsr", OPTYPE_SYSTEM, FL_OUT_D, 3, 0, 0, 0}}, {659, Interpreter::mfsrin, {"mfsrin", OPTYPE_SYSTEM, FL_OUT_D, 3, 0, 0, 0}}, diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.h b/Source/Core/Core/PowerPC/Jit64/Jit.h index 2d23bc8fbb..a07968eabb 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.h +++ b/Source/Core/Core/PowerPC/Jit64/Jit.h @@ -100,7 +100,7 @@ public: void GenerateConstantOverflow(bool overflow); void GenerateConstantOverflow(s64 val); void GenerateOverflow(); - void FinalizeCarryOverflow(bool oe, bool inv = false); + void 
FinalizeCarryOverflow(bool ca, bool oe, bool inv = false); void ComputeRC(const Gen::OpArg & arg); // use to extract bytes from a register using the regcache. offset is in bytes. diff --git a/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp b/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp index 098c8f61e4..b965d6e91b 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp @@ -193,8 +193,8 @@ static GekkoOPTemplate table31[] = {922, &Jit64::extshx}, //"extshx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_S | FL_RC_BIT}}, {954, &Jit64::extsbx}, //"extsbx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_S | FL_RC_BIT}}, {536, &Jit64::srwx}, //"srwx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT}}, - {792, &Jit64::srawx}, //"srawx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT}}, - {824, &Jit64::srawix}, //"srawix", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT}}, + {792, &Jit64::srawx}, //"srawx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_SET_CA | FL_RC_BIT}}, + {824, &Jit64::srawix}, //"srawix", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_SET_CA | FL_RC_BIT}}, {24, &Jit64::slwx}, //"slwx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT}}, {54, &Jit64::dcbst}, //"dcbst", OPTYPE_DCACHE, 0, 4}}, diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp index b187ff3a71..d0c6983c20 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp @@ -45,7 +45,7 @@ void Jit64::GenerateOverflow() } // Assumes CA,OV are clear -void Jit64::FinalizeCarryOverflow(bool oe, bool inv) +void Jit64::FinalizeCarryOverflow(bool ca, bool oe, bool inv) { // USES_XER if (oe) @@ -53,15 +53,17 @@ void Jit64::FinalizeCarryOverflow(bool oe, bool inv) // this is slightly messy because JitSetCAIf modifies x86 flags, so we have to do it in both // sides of the branch. FixupBranch jno = J_CC(CC_NO); - JitSetCAIf(inv ? CC_NC : CC_C); + if (ca) + JitSetCAIf(inv ? CC_NC : CC_C); //XER[OV/SO] = 1 OR(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm32(XER_SO_MASK | XER_OV_MASK)); FixupBranch exit = J(); SetJumpTarget(jno); - JitSetCAIf(inv ? CC_NC : CC_C); + if (ca) + JitSetCAIf(inv ? CC_NC : CC_C); SetJumpTarget(exit); } - else + else if (ca) { // Do carry JitSetCAIf(inv ? 
CC_NC : CC_C); @@ -129,10 +131,10 @@ static u32 Xor(u32 a, u32 b) void Jit64::regimmop(int d, int a, bool binary, u32 value, Operation doop, void (XEmitter::*op)(int, const Gen::OpArg&, const Gen::OpArg&), bool Rc, bool carry) { gpr.Lock(d, a); + carry &= js.op->wantsCA; if (a || binary || carry) // yeh nasty special case addic { - if (carry) - JitClearCAOV(false); + JitClearCAOV(carry, false); if (gpr.R(a).IsImm() && !carry) { gpr.SetImmediate32(d, doop((u32)gpr.R(a).offset, value)); @@ -749,34 +751,38 @@ void Jit64::subfic(UGeckoInstruction inst) { if (imm == 0) { - JitClearCAOV(false); + JitClearCAOV(js.op->wantsCA, false); // Flags act exactly like subtracting from 0 NEG(32, gpr.R(d)); // Output carry is inverted - JitSetCAIf(CC_NC); + if (js.op->wantsCA) + JitSetCAIf(CC_NC); } else if (imm == -1) { // CA is always set in this case - JitSetCA(); + if (js.op->wantsCA) + JitSetCA(); NOT(32, gpr.R(d)); } else { - JitClearCAOV(false); + JitClearCAOV(js.op->wantsCA, false); NOT(32, gpr.R(d)); ADD(32, gpr.R(d), Imm32(imm+1)); // Output carry is normal - JitSetCAIf(CC_C); + if (js.op->wantsCA) + JitSetCAIf(CC_C); } } else { - JitClearCAOV(false); + JitClearCAOV(js.op->wantsCA, false); MOV(32, gpr.R(d), Imm32(imm)); SUB(32, gpr.R(d), gpr.R(a)); // Output carry is inverted - JitSetCAIf(CC_NC); + if (js.op->wantsCA) + JitSetCAIf(CC_NC); } gpr.UnlockAll(); // This instruction has no RC flag @@ -789,8 +795,7 @@ void Jit64::subfcx(UGeckoInstruction inst) int a = inst.RA, b = inst.RB, d = inst.RD; gpr.Lock(a, b, d); gpr.BindToRegister(d, (d == a || d == b), true); - - JitClearCAOV(inst.OE); + JitClearCAOV(js.op->wantsCA, inst.OE); if (d == b) { SUB(32, gpr.R(d), gpr.R(a)); @@ -808,7 +813,7 @@ void Jit64::subfcx(UGeckoInstruction inst) } if (inst.Rc) ComputeRC(gpr.R(d)); - FinalizeCarryOverflow(inst.OE, true); + FinalizeCarryOverflow(js.op->wantsCA, inst.OE, true); gpr.UnlockAll(); } @@ -842,7 +847,7 @@ void Jit64::subfex(UGeckoInstruction inst) NOT(32, gpr.R(d)); ADC(32, gpr.R(d), gpr.R(b)); } - FinalizeCarryOverflow(inst.OE, invertedCarry); + FinalizeCarryOverflow(js.op->wantsCA, inst.OE, invertedCarry); if (inst.Rc) ComputeRC(gpr.R(d)); @@ -863,7 +868,7 @@ void Jit64::subfmex(UGeckoInstruction inst) MOV(32, gpr.R(d), gpr.R(a)); NOT(32, gpr.R(d)); ADC(32, gpr.R(d), Imm32(0xFFFFFFFF)); - FinalizeCarryOverflow(inst.OE); + FinalizeCarryOverflow(js.op->wantsCA, inst.OE); if (inst.Rc) ComputeRC(gpr.R(d)); gpr.UnlockAll(); @@ -884,7 +889,7 @@ void Jit64::subfzex(UGeckoInstruction inst) MOV(32, gpr.R(d), gpr.R(a)); NOT(32, gpr.R(d)); ADC(32, gpr.R(d), Imm8(0)); - FinalizeCarryOverflow(inst.OE); + FinalizeCarryOverflow(js.op->wantsCA, inst.OE); if (inst.Rc) ComputeRC(gpr.R(d)); @@ -1375,7 +1380,7 @@ void Jit64::addex(UGeckoInstruction inst) MOV(32, gpr.R(d), gpr.R(a)); ADC(32, gpr.R(d), gpr.R(b)); } - FinalizeCarryOverflow(inst.OE); + FinalizeCarryOverflow(js.op->wantsCA, inst.OE); if (inst.Rc) ComputeRC(gpr.R(d)); gpr.UnlockAll(); @@ -1392,9 +1397,9 @@ void Jit64::addcx(UGeckoInstruction inst) int operand = ((d == a) ? 
@@ -1392,9 +1397,9 @@ void Jit64::addcx(UGeckoInstruction inst)
         int operand = ((d == a) ? b : a);
         gpr.Lock(a, b, d);
         gpr.BindToRegister(d, true);
-        JitClearCAOV(inst.OE);
+        JitClearCAOV(js.op->wantsCA, inst.OE);
         ADD(32, gpr.R(d), gpr.R(operand));
-        FinalizeCarryOverflow(inst.OE);
+        FinalizeCarryOverflow(js.op->wantsCA, inst.OE);
         if (inst.Rc)
             ComputeRC(gpr.R(d));
         gpr.UnlockAll();
@@ -1403,10 +1408,10 @@ void Jit64::addcx(UGeckoInstruction inst)
     {
         gpr.Lock(a, b, d);
         gpr.BindToRegister(d, false);
-        JitClearCAOV(inst.OE);
+        JitClearCAOV(js.op->wantsCA, inst.OE);
         MOV(32, gpr.R(d), gpr.R(a));
         ADD(32, gpr.R(d), gpr.R(b));
-        FinalizeCarryOverflow(inst.OE);
+        FinalizeCarryOverflow(js.op->wantsCA, inst.OE);
         if (inst.Rc)
             ComputeRC(gpr.R(d));
         gpr.UnlockAll();
@@ -1426,7 +1431,7 @@ void Jit64::addmex(UGeckoInstruction inst)
     if (d != a)
         MOV(32, gpr.R(d), gpr.R(a));
     ADC(32, gpr.R(d), Imm32(0xFFFFFFFF));
-    FinalizeCarryOverflow(inst.OE);
+    FinalizeCarryOverflow(js.op->wantsCA, inst.OE);
     if (inst.Rc)
         ComputeRC(gpr.R(d));
     gpr.UnlockAll();
@@ -1445,7 +1450,7 @@ void Jit64::addzex(UGeckoInstruction inst)
     if (d != a)
         MOV(32, gpr.R(d), gpr.R(a));
     ADC(32, gpr.R(d), Imm8(0));
-    FinalizeCarryOverflow(inst.OE);
+    FinalizeCarryOverflow(js.op->wantsCA, inst.OE);
     if (inst.Rc)
         ComputeRC(gpr.R(d));
     gpr.UnlockAll();
@@ -1826,16 +1831,23 @@ void Jit64::srawx(UGeckoInstruction inst)
     gpr.Lock(a, s, b);
     gpr.FlushLockX(ECX);
     gpr.BindToRegister(a, (a == s || a == b), true);
-    JitClearCAOV(false);
+    JitClearCAOV(js.op->wantsCA, false);
     MOV(32, R(ECX), gpr.R(b));
     if (a != s)
         MOV(32, gpr.R(a), gpr.R(s));
     SHL(64, gpr.R(a), Imm8(32));
    SAR(64, gpr.R(a), R(ECX));
-    MOV(32, R(EAX), gpr.R(a));
-    SHR(64, gpr.R(a), Imm8(32));
-    TEST(32, gpr.R(a), R(EAX));
-    JitSetCAIf(CC_NZ);
+    if (js.op->wantsCA)
+    {
+        MOV(32, R(EAX), gpr.R(a));
+        SHR(64, gpr.R(a), Imm8(32));
+        TEST(32, gpr.R(a), R(EAX));
+        JitSetCAIf(CC_NZ);
+    }
+    else
+    {
+        SHR(64, gpr.R(a), Imm8(32));
+    }
     gpr.UnlockAll();
     gpr.UnlockAllX();
     if (inst.Rc)
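
The carry test kept in the wantsCA path above relies on the 64-bit shift trick: after SHL reg,32 and SAR reg,cl, the upper half of the register is the 32-bit shift result and the lower half holds exactly the bits that were shifted out. A scalar model of the test (illustrative only, not part of the patch; assumes arithmetic right shift of signed values, which holds on the compilers Dolphin targets):

    // CA for sraw: set iff the input was negative and a 1 bit was shifted out.
    static bool SrawCarryModel(s32 input, u32 amount)  // amount = rB & 0x3f
    {
        s64 wide = (s64)input << 32;
        wide >>= amount;                      // arithmetic shift, like SAR
        u32 shifted_out = (u32)wide;          // low half: the bits shifted out
        u32 result = (u32)((u64)wide >> 32);  // high half: the shift result
        // shifted_out keeps the lost bits in its top bits (its low bits are
        // zero), and result's corresponding top bits are sign copies, so the
        // AND is nonzero exactly when a 1 bit fell off a negative input.
        return (result & shifted_out) != 0;
    }
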
@@ -1853,33 +1865,42 @@ void Jit64::srawix(UGeckoInstruction inst)
     {
         gpr.Lock(a, s);
         gpr.BindToRegister(a, a == s, true);
-        JitClearCAOV(false);
-        MOV(32, R(EAX), gpr.R(s));
-        if (a != s)
-            MOV(32, gpr.R(a), R(EAX));
-        // some optimized common cases that can be done in slightly fewer ops
-        if (amount == 31)
+        if (!js.op->wantsCA)
         {
-            SAR(32, gpr.R(a), Imm8(31));
-            NEG(32, R(EAX));  // EAX = input == INT_MIN ? INT_MIN : -input;
-            AND(32, R(EAX), Imm32(0x80000000));  // EAX = input < 0 && input != INT_MIN ? 0 : 0x80000000
-            SHR(32, R(EAX), Imm8(31 - XER_CA_SHIFT));
-            XOR(32, M(&PowerPC::ppcState.spr[SPR_XER]), R(EAX));  // XER.CA = (input < 0 && input != INT_MIN)
-        }
-        else if (amount == 1)
-        {
-            SHR(32, R(EAX), Imm8(31));  // sign
-            AND(32, R(EAX), gpr.R(a));  // (sign && carry)
-            SAR(32, gpr.R(a), Imm8(1));
-            SHL(32, R(EAX), Imm8(XER_CA_SHIFT));
-            OR(32, M(&PowerPC::ppcState.spr[SPR_XER]), R(EAX));  // XER.CA = sign && carry, aka (input&0x80000001) == 0x80000001
+            if (a != s)
+                MOV(32, gpr.R(a), gpr.R(s));
+            SAR(32, gpr.R(a), Imm8(amount));
         }
         else
         {
-            SAR(32, gpr.R(a), Imm8(amount));
-            SHL(32, R(EAX), Imm8(32 - amount));
-            TEST(32, R(EAX), gpr.R(a));
-            JitSetCAIf(CC_NZ);
+            JitClearCAOV(true, false);
+            MOV(32, R(EAX), gpr.R(s));
+            if (a != s)
+                MOV(32, gpr.R(a), R(EAX));
+            // some optimized common cases that can be done in slightly fewer ops
+            if (amount == 31)
+            {
+                SAR(32, gpr.R(a), Imm8(31));
+                NEG(32, R(EAX));  // EAX = input == INT_MIN ? INT_MIN : -input;
+                AND(32, R(EAX), Imm32(0x80000000));  // EAX = input < 0 && input != INT_MIN ? 0 : 0x80000000
+                SHR(32, R(EAX), Imm8(31 - XER_CA_SHIFT));
+                XOR(32, M(&PowerPC::ppcState.spr[SPR_XER]), R(EAX));  // XER.CA = (input < 0 && input != INT_MIN)
+            }
+            else if (amount == 1)
+            {
+                SHR(32, R(EAX), Imm8(31));  // sign
+                AND(32, R(EAX), gpr.R(a));  // (sign && carry)
+                SAR(32, gpr.R(a), Imm8(1));
+                SHL(32, R(EAX), Imm8(XER_CA_SHIFT));
+                OR(32, M(&PowerPC::ppcState.spr[SPR_XER]), R(EAX));  // XER.CA = sign && carry, aka (input&0x80000001) == 0x80000001
+            }
+            else
+            {
+                SAR(32, gpr.R(a), Imm8(amount));
+                SHL(32, R(EAX), Imm8(32 - amount));
+                TEST(32, R(EAX), gpr.R(a));
+                JitSetCAIf(CC_NZ);
+            }
         }
     }
     else
@@ -1888,7 +1909,7 @@ void Jit64::srawix(UGeckoInstruction inst)
         FALLBACK_IF(true);
 
         gpr.Lock(a, s);
-        JitClearCAOV(false);
+        JitClearCAOV(js.op->wantsCA, false);
         gpr.BindToRegister(a, a == s, true);
         if (a != s)
diff --git a/Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp b/Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp
index c0abd0242c..f34b22774d 100644
--- a/Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp
+++ b/Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp
@@ -1110,7 +1110,7 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, u32 exitAddress)
             Jit->JitSetCA();
             FixupBranch cont = Jit->J();
             Jit->SetJumpTarget(nocarry);
-            Jit->JitClearCAOV(false);
+            Jit->JitClearCAOV(true, false);
             Jit->SetJumpTarget(cont);
             regNormalRegClear(RI, I);
             break;
diff --git a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp
index f701c95ee9..1c04a7c7f1 100644
--- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp
+++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp
@@ -824,10 +824,10 @@ void EmuCodeBlock::JitSetCAIf(CCFlags conditionCode)
     OR(32, M(&PowerPC::ppcState.spr[SPR_XER]), R(EAX));  //XER.CA = 1
 }
 
-void EmuCodeBlock::JitClearCAOV(bool oe)
+void EmuCodeBlock::JitClearCAOV(bool ca, bool oe)
 {
-    if (oe)
-        AND(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm32(~XER_CA_MASK & ~XER_OV_MASK));  //XER.CA, XER.OV = 0
-    else
-        AND(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm32(~XER_CA_MASK));  //XER.CA = 0
+    u32 mask = (ca ? ~XER_CA_MASK : 0xFFFFFFFF) & (oe ? ~XER_OV_MASK : 0xFFFFFFFF);
+    if (mask == 0xFFFFFFFF)
+        return;
+    AND(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm32(mask));
 }
diff --git a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h
index 2ce315d20e..579215a171 100644
--- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h
+++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h
@@ -53,7 +53,7 @@ public:
     void JitGetAndClearCAOV(bool oe);
     void JitSetCA();
     void JitSetCAIf(Gen::CCFlags conditionCode);
-    void JitClearCAOV(bool oe);
+    void JitClearCAOV(bool ca, bool oe);
 
     void ForceSinglePrecisionS(Gen::X64Reg xmm);
     void ForceSinglePrecisionP(Gen::X64Reg xmm);
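
The JitClearCAOV rework above replaces the old two-way branch (which always cleared CA) with one mask built at JIT-compile time, and gains a fast path that emits no code when there is nothing to clear. The four cases, in isolation (illustrative only; `ClearMask` is a hypothetical helper, with XER_CA_MASK/XER_OV_MASK as defined in PowerPC.h):

    // The mask JitClearCAOV now computes before deciding whether to emit AND.
    static u32 ClearMask(bool ca, bool oe)
    {
        return (ca ? ~XER_CA_MASK : 0xFFFFFFFF) & (oe ? ~XER_OV_MASK : 0xFFFFFFFF);
    }
    // ClearMask(false, false) == 0xFFFFFFFF  -> no instruction emitted at all
    // ClearMask(true,  false) == ~XER_CA_MASK
    // ClearMask(false, true)  == ~XER_OV_MASK  (OV-only clear, new with this patch)
    // ClearMask(true,  true)  == ~XER_CA_MASK & ~XER_OV_MASK
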
diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.cpp b/Source/Core/Core/PowerPC/PPCAnalyst.cpp
index 2c81a8447a..741b453739 100644
--- a/Source/Core/Core/PowerPC/PPCAnalyst.cpp
+++ b/Source/Core/Core/PowerPC/PPCAnalyst.cpp
@@ -430,7 +430,6 @@ void PPCAnalyzer::SetInstructionStats(CodeBlock *block, CodeOp *code, GekkoOPInf
 {
     code->wantsCR0 = false;
     code->wantsCR1 = false;
-    code->wantsPS1 = false;
 
     if (opinfo->flags & FL_USE_FPU)
         block->m_fpa->any = true;
@@ -458,6 +457,15 @@ void PPCAnalyzer::SetInstructionStats(CodeBlock *block, CodeOp *code, GekkoOPInf
     code->outputFPRF = (opinfo->flags & FL_SET_FPRF) ? true : false;
     code->canEndBlock = (opinfo->flags & FL_ENDBLOCK) ? true : false;
 
+    code->wantsCA = (opinfo->flags & FL_READ_CA) ? true : false;
+    code->outputCA = (opinfo->flags & FL_SET_CA) ? true : false;
+
+    // mfspr/mtspr can affect/use XER, so be super careful here
+    if (code->inst.OPCD == 31 && code->inst.SUBOP10 == 339)  // mfspr
+        code->wantsCA = ((code->inst.SPRU << 5) | (code->inst.SPRL & 0x1F)) == SPR_XER;
+    if (code->inst.OPCD == 31 && code->inst.SUBOP10 == 467)  // mtspr
+        code->outputCA = ((code->inst.SPRU << 5) | (code->inst.SPRL & 0x1F)) == SPR_XER;
+
     int numOut = 0;
     int numIn = 0;
     if (opinfo->flags & FL_OUT_A)
@@ -715,26 +723,30 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock *block, CodeBuffer *buffer, u32
         block->m_broken = true;
     }
 
-    // Scan for CR0 dependency
-    // assume next block wants flags to be safe
+    // Scan for flag dependencies; assume the next block (or any branch that can leave the block)
+    // wants flags, to be safe.
     bool wantsCR0 = true;
     bool wantsCR1 = true;
-    bool wantsPS1 = true;
     bool wantsFPRF = true;
+    bool wantsCA = true;
     for (int i = block->m_num_instructions - 1; i >= 0; i--)
     {
-        wantsCR0 |= code[i].wantsCR0 || code[i].canEndBlock;
-        wantsCR1 |= code[i].wantsCR1 || code[i].canEndBlock;
-        wantsPS1 |= code[i].wantsPS1 || code[i].canEndBlock;
-        wantsFPRF |= code[i].wantsFPRF || code[i].canEndBlock;
-        code[i].wantsCR0 = wantsCR0;
-        code[i].wantsCR1 = wantsCR1;
-        code[i].wantsPS1 = wantsPS1;
+        bool opWantsCR0 = code[i].wantsCR0;
+        bool opWantsCR1 = code[i].wantsCR1;
+        bool opWantsFPRF = code[i].wantsFPRF;
+        bool opWantsCA = code[i].wantsCA;
+        wantsCR0 |= opWantsCR0 || code[i].canEndBlock;
+        wantsCR1 |= opWantsCR1 || code[i].canEndBlock;
+        wantsFPRF |= opWantsFPRF || code[i].canEndBlock;
+        wantsCA |= opWantsCA || code[i].canEndBlock;
+        code[i].wantsCR0 = wantsCR0;
+        code[i].wantsCR1 = wantsCR1;
         code[i].wantsFPRF = wantsFPRF;
-        wantsCR0 &= !code[i].outputCR0;
-        wantsCR1 &= !code[i].outputCR1;
-        wantsPS1 &= !code[i].outputPS1;
-        wantsFPRF &= !code[i].outputFPRF;
+        code[i].wantsCA = wantsCA;
+        wantsCR0 &= !code[i].outputCR0 || opWantsCR0;
+        wantsCR1 &= !code[i].outputCR1 || opWantsCR1;
+        wantsFPRF &= !code[i].outputFPRF || opWantsFPRF;
+        wantsCA &= !code[i].outputCA || opWantsCA;
     }
     return address;
 }
diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.h b/Source/Core/Core/PowerPC/PPCAnalyst.h
index 0916e3951e..2177889336 100644
--- a/Source/Core/Core/PowerPC/PPCAnalyst.h
+++ b/Source/Core/Core/PowerPC/PPCAnalyst.h
@@ -33,12 +33,12 @@ struct CodeOp //16B
     bool isBranchTarget;
     bool wantsCR0;
     bool wantsCR1;
-    bool wantsPS1;
     bool wantsFPRF;
+    bool wantsCA;
     bool outputCR0;
     bool outputCR1;
-    bool outputPS1;
     bool outputFPRF;
+    bool outputCA;
     bool canEndBlock;
     bool skip;  // followed BL-s for example
 };
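
Taken together, the PPCAnalyst changes are a standard backward liveness scan: CA is assumed live at the bottom of the block, stays live across any instruction that reads it (or can exit the block), and dies at an instruction that overwrites it without reading it first. The opWants* snapshots fix the ordering so an op that both reads and writes a flag, like adde, keeps it live above itself. The CA slice of the loop in isolation (an illustrative sketch of the code above, not new logic):

    bool wantsCA = true;  // conservative: the next block may read XER.CA
    for (int i = block->m_num_instructions - 1; i >= 0; i--)
    {
        bool opWantsCA = code[i].wantsCA;   // this op reads CA itself
        // live here if read here, needed later, or control can leave the block
        wantsCA |= opWantsCA || code[i].canEndBlock;
        code[i].wantsCA = wantsCA;          // what the JIT checks at emit time
        // a write that is not also a read kills liveness above this op
        wantsCA &= !code[i].outputCA || opWantsCA;
    }
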