diff --git a/Source/Core/Common/x64Emitter.cpp b/Source/Core/Common/x64Emitter.cpp
index 280e58d82a..f2763d83a8 100644
--- a/Source/Core/Common/x64Emitter.cpp
+++ b/Source/Core/Common/x64Emitter.cpp
@@ -887,21 +887,46 @@ void XEmitter::WriteMOVBE(int bits, u8 op, X64Reg reg, const OpArg& arg)
 void XEmitter::MOVBE(int bits, X64Reg dest, const OpArg& src) {WriteMOVBE(bits, 0xF0, dest, src);}
 void XEmitter::MOVBE(int bits, const OpArg& dest, X64Reg src) {WriteMOVBE(bits, 0xF1, src, dest);}
 
-void XEmitter::LoadAndSwap(int size, X64Reg dst, const OpArg& src)
+void XEmitter::LoadAndSwap(int size, X64Reg dst, const OpArg& src, bool sign_extend)
 {
-	if (cpu_info.bMOVBE)
+	switch (size)
 	{
-		MOVBE(size, dst, src);
-	}
-	else
-	{
-		MOV(size, R(dst), src);
-		BSWAP(size, dst);
+	case 8:
+		if (sign_extend)
+			MOVSX(32, 8, dst, src);
+		else
+			MOVZX(32, 8, dst, src);
+		break;
+	case 16:
+		MOVZX(32, 16, dst, src);
+		if (sign_extend)
+		{
+			BSWAP(32, dst);
+			SAR(32, R(dst), Imm8(16));
+		}
+		else
+		{
+			ROL(16, R(dst), Imm8(8));
+		}
+		break;
+	case 32:
+	case 64:
+		if (cpu_info.bMOVBE)
+		{
+			MOVBE(size, dst, src);
+		}
+		else
+		{
+			MOV(size, R(dst), src);
+			BSWAP(size, dst);
+		}
+		break;
 	}
 }
 
-void XEmitter::SwapAndStore(int size, const OpArg& dst, X64Reg src)
+u8* XEmitter::SwapAndStore(int size, const OpArg& dst, X64Reg src)
 {
+	u8* mov_location = GetWritableCodePtr();
 	if (cpu_info.bMOVBE)
 	{
 		MOVBE(size, dst, src);
@@ -909,8 +934,10 @@ void XEmitter::SwapAndStore(int size, const OpArg& dst, X64Reg src)
 	else
 	{
 		BSWAP(size, src);
+		mov_location = GetWritableCodePtr();
 		MOV(size, dst, R(src));
 	}
+	return mov_location;
 }
 
 
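A minimal usage sketch of the new signatures (illustrative only, not part of this patch; the register and offset choices are assumptions, and the calls are assumed to run inside a Gen::XEmitter-derived code block):

	void ExampleEndianAccess()
	{
		// 16-bit big-endian load into EAX, zero-extended: MOVZX + ROL ax, 8.
		LoadAndSwap(16, RAX, MDisp(RBX, 0));

		// Same load but sign-extended: MOVZX + BSWAP + SAR eax, 16.
		LoadAndSwap(16, RAX, MDisp(RBX, 0), true);

		// 32-/64-bit loads keep the old behaviour: MOVBE when available,
		// otherwise MOV followed by BSWAP.
		LoadAndSwap(32, RAX, MDisp(RBX, 0));

		// SwapAndStore now returns a pointer to the instruction that actually
		// touches memory (the MOV, or the MOVBE itself), which is what a caller
		// needs if it wants to backpatch the access later.
		u8* mov_location = SwapAndStore(32, MDisp(RBX, 0), RAX);
		(void)mov_location;
	}
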
diff --git a/Source/Core/Common/x64Emitter.h b/Source/Core/Common/x64Emitter.h
index aa21956453..d444724705 100644
--- a/Source/Core/Common/x64Emitter.h
+++ b/Source/Core/Common/x64Emitter.h
@@ -480,8 +480,8 @@ public:
 	// Available only on Atom or >= Haswell so far. Test with cpu_info.bMOVBE.
 	void MOVBE(int bits, X64Reg dest, const OpArg& src);
 	void MOVBE(int bits, const OpArg& dest, X64Reg src);
-	void LoadAndSwap(int size, X64Reg dst, const OpArg& src);
-	void SwapAndStore(int size, const OpArg& dst, X64Reg src);
+	void LoadAndSwap(int size, X64Reg dst, const OpArg& src, bool sign_extend = false);
+	u8* SwapAndStore(int size, const OpArg& dst, X64Reg src);
 
 	// Available only on AMD >= Phenom or Intel >= Haswell
 	void LZCNT(int bits, X64Reg dest, const OpArg& src);
diff --git a/Source/Core/Core/PowerPC/JitCommon/JitBackpatch.cpp b/Source/Core/Core/PowerPC/JitCommon/JitBackpatch.cpp
index 81b22972f4..fcfb6d4db1 100644
--- a/Source/Core/Core/PowerPC/JitCommon/JitBackpatch.cpp
+++ b/Source/Core/Core/PowerPC/JitCommon/JitBackpatch.cpp
@@ -86,33 +86,35 @@ bool Jitx86Base::BackPatch(u32 emAddress, SContext* ctx)
 
 	// Compute the start and length of the memory operation, including
 	// any byteswapping.
-	int totalSize;
+	int totalSize = info.instructionSize;
 	u8 *start = codePtr;
 	if (!info.isMemoryWrite)
 	{
-		int bswapNopCount;
-		if (info.byteSwap || info.operandSize == 1)
-			bswapNopCount = 0;
-		// Check the following BSWAP for REX byte
-		else if ((codePtr[info.instructionSize] & 0xF0) == 0x40)
-			bswapNopCount = 3;
-		else
-			bswapNopCount = 2;
-
-		totalSize = info.instructionSize + bswapNopCount;
-		if (info.operandSize == 2 && !info.byteSwap)
+		// MOVBE loads are already in the right byte order, and single bytes have nothing to swap.
+		if (!info.byteSwap && info.operandSize > 1)
 		{
+			// REX
 			if ((codePtr[totalSize] & 0xF0) == 0x40)
+				totalSize++;
+
+			// BSWAP
+			if (codePtr[totalSize] == 0x0F && (codePtr[totalSize + 1] & 0xF8) == 0xC8)
+				totalSize += 2;
+
+			if (info.operandSize == 2)
 			{
-				++totalSize;
+				// operand size override
+				if (codePtr[totalSize] == 0x66)
+					totalSize++;
+				// REX
+				if ((codePtr[totalSize] & 0xF0) == 0x40)
+					totalSize++;
+				// SAR (sign-extending) or ROL (zero-extending swap)
+				_assert_(codePtr[totalSize] == 0xC1 && (codePtr[totalSize + 2] == 0x10 ||
+				                                        codePtr[totalSize + 2] == 0x08));
+				info.signExtend = (codePtr[totalSize + 1] & 0x10) != 0;
+				totalSize += 3;
 			}
-			if (codePtr[totalSize] != 0xc1 || codePtr[totalSize + 2] != 0x10)
-			{
-				PanicAlert("BackPatch: didn't find expected shift %p", codePtr);
-				return false;
-			}
-			info.signExtend = (codePtr[totalSize + 1] & 0x10) != 0;
-			totalSize += 3;
 		}
 	}
 	else
@@ -120,7 +122,6 @@ bool Jitx86Base::BackPatch(u32 emAddress, SContext* ctx)
 		if (info.byteSwap || info.hasImmediate)
 		{
 			// The instruction is a MOVBE but it failed so the value is still in little-endian byte order.
-			totalSize = info.instructionSize;
 		}
 		else
 		{
@@ -146,7 +147,7 @@ bool Jitx86Base::BackPatch(u32 emAddress, SContext* ctx)
 				break;
 			}
 			start = codePtr - bswapSize;
-			totalSize = info.instructionSize + bswapSize;
+			totalSize += bswapSize;
 		}
 	}
 
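For reference, the byte patterns the backpatcher now scans past can be restated as a standalone helper (a hypothetical illustration, not code from this patch; it mirrors the checks above under the assumption that the load was emitted by LoadAndSwap without MOVBE):

	#include <cstddef>
	#include <cstdint>

	// Returns how many bytes the swap/extend sequence after a non-MOVBE load
	// occupies, and whether it sign-extends, mirroring Jitx86Base::BackPatch.
	static size_t SkipSwapSequence(const uint8_t* p, int operand_size, bool* sign_extend)
	{
		size_t n = 0;
		if ((p[n] & 0xF0) == 0x40)                      // optional REX prefix (40-4F)
			n++;
		if (p[n] == 0x0F && (p[n + 1] & 0xF8) == 0xC8)  // BSWAP r32/r64 (0F C8+rd)
			n += 2;
		if (operand_size == 2)
		{
			if (p[n] == 0x66)                           // operand-size override (ROL r16)
				n++;
			if ((p[n] & 0xF0) == 0x40)                  // optional REX prefix
				n++;
			// C1 /7 ib with imm 0x10 is SAR reg, 16; C1 /0 ib with imm 0x08 is ROL reg, 8.
			// Bit 0x10 of the ModRM byte is set for /7 (SAR) and clear for /0 (ROL).
			*sign_extend = (p[n + 1] & 0x10) != 0;
			n += 3;
		}
		return n;
	}
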
diff --git a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp
index a4a8a6212c..c9911ae853 100644
--- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp
+++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp
@@ -24,24 +24,8 @@ void EmuCodeBlock::MemoryExceptionCheck()
 
 void EmuCodeBlock::UnsafeLoadRegToReg(X64Reg reg_addr, X64Reg reg_value, int accessSize, s32 offset, bool signExtend)
 {
-	MOVZX(32, accessSize, reg_value, MComplex(RMEM, reg_addr, SCALE_1, offset));
-	if (accessSize == 32)
-	{
-		BSWAP(32, reg_value);
-	}
-	else if (accessSize == 16)
-	{
-		BSWAP(32, reg_value);
-		if (signExtend)
-			SAR(32, R(reg_value), Imm8(16));
-		else
-			SHR(32, R(reg_value), Imm8(16));
-	}
-	else if (signExtend)
-	{
-		// TODO: bake 8-bit into the original load.
-		MOVSX(32, accessSize, reg_value, R(reg_value));
-	}
+	OpArg src = MComplex(RMEM, reg_addr, SCALE_1, offset);
+	LoadAndSwap(accessSize, reg_value, src, signExtend);
 }
 
 void EmuCodeBlock::UnsafeLoadRegToRegNoSwap(X64Reg reg_addr, X64Reg reg_value, int accessSize, s32 offset, bool signExtend)
@@ -84,34 +68,7 @@ u8 *EmuCodeBlock::UnsafeLoadToReg(X64Reg reg_value, OpArg opAddress, int accessS
 	}
 
 	result = GetWritableCodePtr();
-	if (accessSize == 8 && signExtend)
-		MOVSX(32, accessSize, reg_value, memOperand);
-	else
-		MOVZX(64, accessSize, reg_value, memOperand);
-
-	switch (accessSize)
-	{
-	case 8:
-		_dbg_assert_(DYNA_REC, BACKPATCH_SIZE - (GetCodePtr() - result <= 0));
-		break;
-
-	case 16:
-		BSWAP(32, reg_value);
-		if (signExtend)
-			SAR(32, R(reg_value), Imm8(16));
-		else
-			SHR(32, R(reg_value), Imm8(16));
-		break;
-
-	case 32:
-		BSWAP(32, reg_value);
-		break;
-
-	case 64:
-		BSWAP(64, reg_value);
-		break;
-	}
-
+	LoadAndSwap(accessSize, reg_value, memOperand, signExtend);
 	return result;
 }
 
@@ -415,17 +372,7 @@ u8 *EmuCodeBlock::UnsafeWriteRegToReg(OpArg reg_value, X64Reg reg_addr, int acce
 	}
 	else if (swap)
 	{
-		if (cpu_info.bMOVBE)
-		{
-			MOVBE(accessSize, dest, reg_value.GetSimpleReg());
-		}
-		else
-		{
-			if (accessSize > 8)
-				BSWAP(accessSize, reg_value.GetSimpleReg());
-			result = GetWritableCodePtr();
-			MOV(accessSize, dest, reg_value);
-		}
+		result = SwapAndStore(accessSize, dest, reg_value.GetSimpleReg());
 	}
 	else
 	{
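The load hunks above drop an explicit BSWAP + SAR/SHR sequence in favour of LoadAndSwap; as a quick worked example of the value transformation involved (plain C++, not emitter code; __builtin_bswap32 assumes GCC/Clang):

	#include <cassert>
	#include <cstdint>

	int main()
	{
		// Guest memory holds the big-endian halfword FF 80, i.e. -128 as s16.
		// An x86 MOVZX(32, 16) reads those bytes little-endian:
		uint32_t reg = 0x000080FF;

		// Sign-extending path: BSWAP(32) then SAR(32, 16).
		int32_t sign_extended = static_cast<int32_t>(__builtin_bswap32(reg)) >> 16;
		assert(sign_extended == -128);

		// Zero-extending path: ROL(16, 8) swaps just the low two bytes and
		// leaves the already-zeroed upper half of the register alone.
		uint16_t lo = static_cast<uint16_t>(reg);
		uint16_t zero_extended = static_cast<uint16_t>((lo << 8) | (lo >> 8));
		assert(zero_extended == 0xFF80);
		return 0;
	}
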
diff --git a/Source/Core/VideoCommon/VertexLoaderX64.cpp b/Source/Core/VideoCommon/VertexLoaderX64.cpp
index a298d7e1dd..dd9cd0528d 100644
--- a/Source/Core/VideoCommon/VertexLoaderX64.cpp
+++ b/Source/Core/VideoCommon/VertexLoaderX64.cpp
@@ -53,21 +53,12 @@ OpArg VertexLoaderX64::GetVertexAddr(int array, u64 attribute)
 	OpArg data = MDisp(src_reg, m_src_ofs);
 	if (attribute & MASK_INDEXED)
 	{
-		if (attribute == INDEX8)
-		{
-			MOVZX(64, 8, scratch1, data);
-			m_src_ofs += 1;
-		}
-		else
-		{
-			MOV(16, R(scratch1), data);
-			m_src_ofs += 2;
-			BSWAP(16, scratch1);
-			MOVZX(64, 16, scratch1, R(scratch1));
-		}
+		int bits = attribute == INDEX8 ? 8 : 16;
+		LoadAndSwap(bits, scratch1, data);
+		m_src_ofs += bits / 8;
 		if (array == ARRAY_POSITION)
 		{
-			CMP(attribute == INDEX8 ? 8 : 16, R(scratch1), Imm8(-1));
+			CMP(bits, R(scratch1), Imm8(-1));
 			m_skip_vertex = J_CC(CC_E, true);
 		}
 		IMUL(32, scratch1, MPIC(&g_main_cp_state.array_strides[array]));
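Restated in plain C++ (a rough sketch with illustrative names, not emitted code; the sentinel is only checked for the position array), the 16-bit indexed path above does approximately the following:

	#include <cstdint>
	#include <cstring>

	// Hypothetical helper mirroring the emitted sequence: read a big-endian
	// index from the vertex stream, check the "skip vertex" sentinel, then
	// scale by the array stride to find the element.
	static const uint8_t* LookUpIndexed16(const uint8_t* src, const uint8_t* array_base,
	                                      uint32_t stride, bool* skip_vertex)
	{
		uint16_t index;
		std::memcpy(&index, src, sizeof(index));
		index = static_cast<uint16_t>((index << 8) | (index >> 8));  // LoadAndSwap(16, ...)
		*skip_vertex = (index == 0xFFFF);                            // CMP(bits, ..., Imm8(-1))
		return array_base + index * stride;                          // IMUL with array_strides[array]
	}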