PPU LLVM: rewrite and optimize saturation bit

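Use a vector accumulator: ppu.sat becomes a v128, the LLVM translator ORs per-lane saturation masks into it, and MFVSCR reduces it to the single VSCR.SAT bit on read.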
2025-03-14 01:27:00 +00:00 · 2021-11-26 20:01:29 +03:00 · 2021-11-26 20:01:29 +03:00 · e3e39e8de3
commit e3e39e8de3
parent 209b14fbac
5 changed files with 110 additions and 129 deletions
--- a/rpcs3/Emu/Cell/PPUInterpreter.cpp
+++ b/rpcs3/Emu/Cell/PPUInterpreter.cpp
@ -528,14 +528,14 @@ inline v128 vec_handle_denormal(ppu_thread& ppu, v128 a)

 bool ppu_interpreter::MFVSCR(ppu_thread& ppu, ppu_opcode_t op)
 {
-	ppu.vr[op.vd] = v128::from32(0, 0, 0, u32{ppu.sat} | (u32{ppu.nj} << 16));
+	ppu.vr[op.vd] = v128::from32(0, 0, 0, u32{ppu.sat != v128{}} | (u32{ppu.nj} << 16));
 	return true;
 }

 bool ppu_interpreter::MTVSCR(ppu_thread& ppu, ppu_opcode_t op)
 {
 	const u32 vscr = ppu.vr[op.vb]._u32[3];
-	ppu.sat = (vscr & 1) != 0;
+	ppu.sat = v128::from32((vscr & 1) != 0);
 	ppu.nj  = (vscr & 0x10000) != 0;
 	ppu.jm_mask = ppu.nj ? ppu_inf_u32 : 0x7fff'ffff;
 	return true;
@ -577,12 +577,12 @@ bool ppu_interpreter_precise::VADDSBS(ppu_thread& ppu, ppu_opcode_t op)
 		if (sum < INT8_MIN)
 		{
 			d._s8[i] = INT8_MIN;
-			ppu.sat = true;
+			ppu.sat._u32[0] = 1;
 		}
 		else if (sum > INT8_MAX)
 		{
 			d._s8[i] = INT8_MAX;
-			ppu.sat = true;
+			ppu.sat._u32[0] = 1;
 		}
 		else
 		{
@ -612,12 +612,12 @@ bool ppu_interpreter_precise::VADDSHS(ppu_thread& ppu, ppu_opcode_t op)
 		if (sum < INT16_MIN)
 		{
 			d._s16[i] = INT16_MIN;
-			ppu.sat = true;
+			ppu.sat._u32[0] = 1;
 		}
 		else if (sum > INT16_MAX)
 		{
 			d._s16[i] = INT16_MAX;
-			ppu.sat = true;
+			ppu.sat._u32[0] = 1;
 		}
 		else
 		{
@ -654,12 +654,12 @@ bool ppu_interpreter_precise::VADDSWS(ppu_thread& ppu, ppu_opcode_t op)
 		if (sum < INT32_MIN)
 		{
 			d._s32[i] = INT32_MIN;
-			ppu.sat = true;
+			ppu.sat._u32[0] = 1;
 		}
 		else if (sum > INT32_MAX)
 		{
 			d._s32[i] = INT32_MAX;
-			ppu.sat = true;
+			ppu.sat._u32[0] = 1;
 		}
 		else
 		{
@ -695,7 +695,7 @@ bool ppu_interpreter_precise::VADDUBS(ppu_thread& ppu, ppu_opcode_t op)
 		if (sum > UINT8_MAX)
 		{
 			d._u8[i] = UINT8_MAX;
-			ppu.sat = true;
+			ppu.sat._u32[0] = 1;
 		}
 		else
 		{
@ -731,7 +731,7 @@ bool ppu_interpreter_precise::VADDUHS(ppu_thread& ppu, ppu_opcode_t op)
 		if (sum > UINT16_MAX)
 		{
 			d._u16[i] = UINT16_MAX;
-			ppu.sat = true;
+			ppu.sat._u32[0] = 1;
 		}
 		else
 		{
@ -770,7 +770,7 @@ bool ppu_interpreter_precise::VADDUWS(ppu_thread& ppu, ppu_opcode_t op)
 		if (sum > UINT32_MAX)
 		{
 			d._u32[i] = UINT32_MAX;
-			ppu.sat = true;
+			ppu.sat._u32[0] = 1;
 		}
 		else
 		{
@ -988,13 +988,13 @@ bool ppu_interpreter_precise::VCTSXS(ppu_thread& ppu, ppu_opcode_t op)
 			}
 			else
 			{
-				ppu.sat = true;
+				ppu.sat._u32[0] = 1;
 				d._s32[i] = sign ? 0x80000000 : 0x7FFFFFFF;
 			}
 		}
 		else if (exp2 > 30)
 		{
-			ppu.sat = true;
+			ppu.sat._u32[0] = 1;
 			d._s32[i] = sign ? 0x80000000 : 0x7FFFFFFF;
 		}
 		else if (exp2 < 0)
@ -1041,13 +1041,13 @@ bool ppu_interpreter_precise::VCTUXS(ppu_thread& ppu, ppu_opcode_t op)
 			}
 			else
 			{
-				ppu.sat = true;
+				ppu.sat._u32[0] = 1;
 				d._u32[i] = sign ? 0 : 0xFFFFFFFF;
 			}
 		}
 		else if (exp2 > 31)
 		{
-			ppu.sat = true;
+			ppu.sat._u32[0] = 1;
 			d._u32[i] = sign ? 0 : 0xFFFFFFFF;
 		}
 		else if (exp2 < 0)
@ -1056,7 +1056,7 @@ bool ppu_interpreter_precise::VCTUXS(ppu_thread& ppu, ppu_opcode_t op)
 		}
 		else if (sign)
 		{
-			ppu.sat = true;
+			ppu.sat._u32[0] = 1;
 			d._u32[i] = 0;
 		}
 		else
@ -1180,12 +1180,12 @@ bool ppu_interpreter_precise::VMHADDSHS(ppu_thread& ppu, ppu_opcode_t op)
 		if (sum < INT16_MIN)
 		{
 			d._s16[i] = INT16_MIN;
-			ppu.sat = true;
+			ppu.sat._u32[0] = 1;
 		}
 		else if (sum > INT16_MAX)
 		{
 			d._s16[i] = INT16_MAX;
-			ppu.sat = true;
+			ppu.sat._u32[0] = 1;
 		}
 		else
 		{
@ -1229,12 +1229,12 @@ bool ppu_interpreter_precise::VMHRADDSHS(ppu_thread& ppu, ppu_opcode_t op)
 		if (sum < INT16_MIN)
 		{
 			d._s16[i] = INT16_MIN;
-			ppu.sat = true;
+			ppu.sat._u32[0] = 1;
 		}
 		else if (sum > INT16_MAX)
 		{
 			d._s16[i] = INT16_MAX;
-			ppu.sat = true;
+			ppu.sat._u32[0] = 1;
 		}
 		else
 		{
@ -1420,12 +1420,12 @@ bool ppu_interpreter_precise::VMSUMSHS(ppu_thread& ppu, ppu_opcode_t op)
 		if (result > 0x7fffffff)
 		{
 			saturated = 0x7fffffff;
-			ppu.sat = true;
+			ppu.sat._u32[0] = 1;
 		}
 		else if (result < INT32_MIN)
 		{
 			saturated = 0x80000000;
-			ppu.sat = true;
+			ppu.sat._u32[0] = 1;
 		}
 		else
 			saturated = static_cast<s32>(result);
@ -1517,7 +1517,7 @@ bool ppu_interpreter_precise::VMSUMUHS(ppu_thread& ppu, ppu_opcode_t op)
 		if (result > 0xffffffffu)
 		{
 			saturated = 0xffffffff;
-			ppu.sat = true;
+			ppu.sat._u32[0] = 1;
 		}
 		else
 			saturated = static_cast<u32>(result);
@ -1666,12 +1666,12 @@ bool ppu_interpreter_precise::VPKSHSS(ppu_thread& ppu, ppu_opcode_t op)
 		if (result < INT8_MIN)
 		{
 			d._s8[i + 8] = INT8_MIN;
-			ppu.sat = true;
+			ppu.sat._u32[0] = 1;
 		}
 		else if (result > INT8_MAX)
 		{
 			d._s8[i + 8] = INT8_MAX;
-			ppu.sat = true;
+			ppu.sat._u32[0] = 1;
 		}
 		else
 		{
@ -1683,12 +1683,12 @@ bool ppu_interpreter_precise::VPKSHSS(ppu_thread& ppu, ppu_opcode_t op)
 		if (result < INT8_MIN)
 		{
 			d._s8[i] = INT8_MIN;
-			ppu.sat = true;
+			ppu.sat._u32[0] = 1;
 		}
 		else if (result > INT8_MAX)
 		{
 			d._s8[i] = INT8_MAX;
-			ppu.sat = true;
+			ppu.sat._u32[0] = 1;
 		}
 		else
 		{
@ -1717,7 +1717,7 @@ bool ppu_interpreter_precise::VPKSHUS(ppu_thread& ppu, ppu_opcode_t op)
 		const auto all_bits = a | b;
 		if ((all_bits._u64[0] | all_bits._u64[1]) & mask)
 		{
-			ppu.sat = true;
+			ppu.sat._u32[0] = 1;
 		}
 	}

@ -1744,12 +1744,12 @@ bool ppu_interpreter_precise::VPKSWSS(ppu_thread& ppu, ppu_opcode_t op)
 		if (result < INT16_MIN)
 		{
 			d._s16[i + 4] = INT16_MIN;
-			ppu.sat = true;
+			ppu.sat._u32[0] = 1;
 		}
 		else if (result > INT16_MAX)
 		{
 			d._s16[i + 4] = INT16_MAX;
-			ppu.sat = true;
+			ppu.sat._u32[0] = 1;
 		}
 		else
 		{
@ -1761,12 +1761,12 @@ bool ppu_interpreter_precise::VPKSWSS(ppu_thread& ppu, ppu_opcode_t op)
 		if (result < INT16_MIN)
 		{
 			d._s16[i] = INT16_MIN;
-			ppu.sat = true;
+			ppu.sat._u32[0] = 1;
 		}
 		else if (result > INT16_MAX)
 		{
 			d._s16[i] = INT16_MAX;
-			ppu.sat = true;
+			ppu.sat._u32[0] = 1;
 		}
 		else
 		{
@ -1828,12 +1828,12 @@ bool ppu_interpreter_precise::VPKSWUS(ppu_thread& ppu, ppu_opcode_t op)
 		if (result > UINT16_MAX)
 		{
 			result = UINT16_MAX;
-			ppu.sat = true;
+			ppu.sat._u32[0] = 1;
 		}
 		else if (result < 0)
 		{
 			result = 0;
-			ppu.sat = true;
+			ppu.sat._u32[0] = 1;
 		}

 		d._u16[h + 4] = result;
@ -1843,12 +1843,12 @@ bool ppu_interpreter_precise::VPKSWUS(ppu_thread& ppu, ppu_opcode_t op)
 		if (result > UINT16_MAX)
 		{
 			result = UINT16_MAX;
-			ppu.sat = true;
+			ppu.sat._u32[0] = 1;
 		}
 		else if (result < 0)
 		{
 			result = 0;
-			ppu.sat = true;
+			ppu.sat._u32[0] = 1;
 		}

 		d._u16[h] = result;
@ -1909,7 +1909,7 @@ bool ppu_interpreter_precise::VPKUHUS(ppu_thread& ppu, ppu_opcode_t op)
 		if (result > UINT8_MAX)
 		{
 			result = UINT8_MAX;
-			ppu.sat = true;
+			ppu.sat._u32[0] = 1;
 		}

 		d._u8[b + 8] = static_cast<u8>(result);
@ -1919,7 +1919,7 @@ bool ppu_interpreter_precise::VPKUHUS(ppu_thread& ppu, ppu_opcode_t op)
 		if (result > UINT8_MAX)
 		{
 			result = UINT8_MAX;
-			ppu.sat = true;
+			ppu.sat._u32[0] = 1;
 		}

 		d._u8[b] = static_cast<u8>(result);
@ -1980,7 +1980,7 @@ bool ppu_interpreter_precise::VPKUWUS(ppu_thread& ppu, ppu_opcode_t op)
 		if (result > UINT16_MAX)
 		{
 			result = UINT16_MAX;
-			ppu.sat = true;
+			ppu.sat._u32[0] = 1;
 		}

 		d._u16[h + 4] = result;
@ -1990,7 +1990,7 @@ bool ppu_interpreter_precise::VPKUWUS(ppu_thread& ppu, ppu_opcode_t op)
 		if (result > UINT16_MAX)
 		{
 			result = UINT16_MAX;
-			ppu.sat = true;
+			ppu.sat._u32[0] = 1;
 		}

 		d._u16[h] = result;
@ -2430,12 +2430,12 @@ bool ppu_interpreter_precise::VSUBSBS(ppu_thread& ppu, ppu_opcode_t op)
 		if (diff < INT8_MIN)
 		{
 			d._s8[i] = INT8_MIN;
-			ppu.sat = true;
+			ppu.sat._u32[0] = 1;
 		}
 		else if (diff > INT8_MAX)
 		{
 			d._s8[i] = INT8_MAX;
-			ppu.sat = true;
+			ppu.sat._u32[0] = 1;
 		}
 		else
 		{
@ -2465,12 +2465,12 @@ bool ppu_interpreter_precise::VSUBSHS(ppu_thread& ppu, ppu_opcode_t op)
 		if (diff < INT16_MIN)
 		{
 			d._s16[i] = INT16_MIN;
-			ppu.sat = true;
+			ppu.sat._u32[0] = 1;
 		}
 		else if (diff > INT16_MAX)
 		{
 			d._s16[i] = INT16_MAX;
-			ppu.sat = true;
+			ppu.sat._u32[0] = 1;
 		}
 		else
 		{
@ -2518,12 +2518,12 @@ bool ppu_interpreter_precise::VSUBSWS(ppu_thread& ppu, ppu_opcode_t op)
 		if (result < INT32_MIN)
 		{
 			d._s32[w] = INT32_MIN;
-			ppu.sat = true;
+			ppu.sat._u32[0] = 1;
 		}
 		else if (result > INT32_MAX)
 		{
 			d._s32[w] = INT32_MAX;
-			ppu.sat = true;
+			ppu.sat._u32[0] = 1;
 		}
 		else
 			d._s32[w] = static_cast<s32>(result);
@ -2556,12 +2556,12 @@ bool ppu_interpreter_precise::VSUBUBS(ppu_thread& ppu, ppu_opcode_t op)
 		if (diff < 0)
 		{
 			d._u8[i] = 0;
-			ppu.sat = true;
+			ppu.sat._u32[0] = 1;
 		}
 		else if (diff > UINT8_MAX)
 		{
 			d._u8[i] = UINT8_MAX;
-			ppu.sat = true;
+			ppu.sat._u32[0] = 1;
 		}
 		else
 		{
@ -2597,12 +2597,12 @@ bool ppu_interpreter_precise::VSUBUHS(ppu_thread& ppu, ppu_opcode_t op)
 		if (diff < 0)
 		{
 			d._u16[i] = 0;
-			ppu.sat = true;
+			ppu.sat._u32[0] = 1;
 		}
 		else if (diff > UINT16_MAX)
 		{
 			d._u16[i] = UINT16_MAX;
-			ppu.sat = true;
+			ppu.sat._u32[0] = 1;
 		}
 		else
 		{
@ -2652,7 +2652,7 @@ bool ppu_interpreter_precise::VSUBUWS(ppu_thread& ppu, ppu_opcode_t op)
 		if (result < 0)
 		{
 			d._u32[w] = 0;
-			ppu.sat = true;
+			ppu.sat._u32[0] = 1;
 		}
 		else
 			d._u32[w] = static_cast<u32>(result);
@ -2704,12 +2704,12 @@ bool ppu_interpreter_precise::VSUMSWS(ppu_thread& ppu, ppu_opcode_t op)
 	if (sum > INT32_MAX)
 	{
 		d._s32[0] = INT32_MAX;
-		ppu.sat = true;
+		ppu.sat._u32[0] = 1;
 	}
 	else if (sum < INT32_MIN)
 	{
 		d._s32[0] = INT32_MIN;
-		ppu.sat = true;
+		ppu.sat._u32[0] = 1;
 	}
 	else
 		d._s32[0] = static_cast<s32>(sum);
@ -2756,12 +2756,12 @@ bool ppu_interpreter_precise::VSUM2SWS(ppu_thread& ppu, ppu_opcode_t op)
 		if (sum > INT32_MAX)
 		{
 			d._s32[n * 2] = INT32_MAX;
-			ppu.sat = true;
+			ppu.sat._u32[0] = 1;
 		}
 		else if (sum < INT32_MIN)
 		{
 			d._s32[n * 2] = INT32_MIN;
-			ppu.sat = true;
+			ppu.sat._u32[0] = 1;
 		}
 		else
 			d._s32[n * 2] = static_cast<s32>(sum);
@ -2820,12 +2820,12 @@ bool ppu_interpreter_precise::VSUM4SBS(ppu_thread& ppu, ppu_opcode_t op)
 		if (sum > INT32_MAX)
 		{
 			d._s32[w] = INT32_MAX;
-			ppu.sat = true;
+			ppu.sat._u32[0] = 1;
 		}
 		else if (sum < INT32_MIN)
 		{
 			d._s32[w] = INT32_MIN;
-			ppu.sat = true;
+			ppu.sat._u32[0] = 1;
 		}
 		else
 			d._s32[w] = static_cast<s32>(sum);
@ -2880,12 +2880,12 @@ bool ppu_interpreter_precise::VSUM4SHS(ppu_thread& ppu, ppu_opcode_t op)
 		if (sum > INT32_MAX)
 		{
 			d._s32[w] = INT32_MAX;
-			ppu.sat = true;
+			ppu.sat._u32[0] = 1;
 		}
 		else if (sum < INT32_MIN)
 		{
 			d._s32[w] = INT32_MIN;
-			ppu.sat = true;
+			ppu.sat._u32[0] = 1;
 		}
 		else
 			d._s32[w] = static_cast<s32>(sum);
@ -2936,7 +2936,7 @@ bool ppu_interpreter_precise::VSUM4UBS(ppu_thread& ppu, ppu_opcode_t op)
 		if (sum > UINT32_MAX)
 		{
 			d._u32[w] = UINT32_MAX;
-			ppu.sat = true;
+			ppu.sat._u32[0] = 1;
 		}
 		else
 			d._u32[w] = static_cast<u32>(sum);
--- a/rpcs3/Emu/Cell/PPUThread.cpp
+++ b/rpcs3/Emu/Cell/PPUThread.cpp
@ -65,8 +65,6 @@
 #include "util/v128sse.hpp"
 #include "util/sysinfo.hpp"

-const bool s_use_ssse3 = utils::has_ssse3();
-
 extern atomic_t<u64> g_watchdog_hold_ctr;

 // Should be of the same type
--- a/rpcs3/Emu/Cell/PPUThread.h
+++ b/rpcs3/Emu/Cell/PPUThread.h
@ -222,27 +222,6 @@ public:
 	}
 	xer;

-	/*
-		Saturation. A sticky status bit indicating that some field in a saturating instruction saturated since the last
-		time SAT was cleared. In other words when SAT = '1' it remains set to '1' until it is cleared to '0' by an
-		mtvscr instruction.
-		1	The vector saturate instruction implicitly sets when saturation has occurred on the results one of
-			the vector instructions having saturate in its name:
-			Move To VSCR (mtvscr)
-			Vector Add Integer with Saturation (vaddubs, vadduhs, vadduws, vaddsbs, vaddshs,
-			vaddsws)
-			Vector Subtract Integer with Saturation (vsububs, vsubuhs, vsubuws, vsubsbs, vsubshs,
-			vsubsws)
-			Vector Multiply-Add Integer with Saturation (vmhaddshs, vmhraddshs)
-			Vector Multiply-Sum with Saturation (vmsumuhs, vmsumshs, vsumsws)
-			Vector Sum-Across with Saturation (vsumsws, vsum2sws, vsum4sbs, vsum4shs,
-			vsum4ubs)
-			Vector Pack with Saturation (vpkuhus, vpkuwus, vpkshus, vpkswus, vpkshss, vpkswss)
-			Vector Convert to Fixed-Point with Saturation (vctuxs, vctsxs)
-		0	Indicates no saturation occurred; mtvscr can explicitly clear this bit.
-	*/
-	bool sat{};
-
 	/*
 		Non-Java. A mode control bit that determines whether vector floating-point operations will be performed
 		in a Java-IEEE-C9X-compliant mode or a possibly faster non-Java/non-IEEE mode.
@ -255,6 +234,9 @@ public:
 	*/
 	bool nj = true;

+	// Sticky saturation state (VSCR.SAT) kept as a vector accumulator: SAT reads as 1 when any bit is non-zero
+	v128 sat{};
+
 	// Optimization: precomputed java-mode mask for handling denormals
 	u32 jm_mask = 0x7f80'0000;

--- a/rpcs3/Emu/Cell/PPUTranslator.cpp
+++ b/rpcs3/Emu/Cell/PPUTranslator.cpp
@ -42,8 +42,9 @@ PPUTranslator::PPUTranslator(LLVMContext& context, Module* _module, const ppu_mo
 	thread_struct.insert(thread_struct.end(), 2, GetType<u32>()); // vrsave, cia
 	thread_struct.insert(thread_struct.end(), 3, GetType<bool>()); // so, ov, ca
 	thread_struct.insert(thread_struct.end(), 1, GetType<u8>()); // cnt
-	thread_struct.insert(thread_struct.end(), 2, GetType<bool>()); // sat, nj
-	thread_struct.emplace_back(ArrayType::get(GetType<char>(), 2)); // Padding
+	thread_struct.insert(thread_struct.end(), 1, GetType<bool>()); // nj
+	thread_struct.emplace_back(ArrayType::get(GetType<char>(), 3)); // Padding
+	thread_struct.insert(thread_struct.end(), 1, GetType<u32[4]>()); // sat
 	thread_struct.insert(thread_struct.end(), 1, GetType<u32>()); // jm_mask

 	m_thread_type = StructType::create(m_context, thread_struct, "context_t");
@ -639,7 +640,7 @@ void PPUTranslator::CompilationError(const std::string& error)

 void PPUTranslator::MFVSCR(ppu_opcode_t op)
 {
-	const auto vscr = m_ir->CreateOr(ZExt(RegLoad(m_sat), GetType<u32>()), m_ir->CreateShl(ZExt(RegLoad(m_nj), GetType<u32>()), 16));
+	const auto vscr = m_ir->CreateOr(ZExt(IsNotZero(RegLoad(m_sat)), GetType<u32>()), m_ir->CreateShl(ZExt(RegLoad(m_nj), GetType<u32>()), 16));
 	SetVr(op.vd, m_ir->CreateInsertElement(ConstantAggregateZero::get(GetType<u32[4]>()), vscr, m_ir->getInt32(m_is_be ? 3 : 0)));
 }

@ -649,7 +650,7 @@ void PPUTranslator::MTVSCR(ppu_opcode_t op)
 	const auto nj = Trunc(m_ir->CreateLShr(vscr, 16), GetType<bool>());
 	RegStore(nj, m_nj);
 	if (g_cfg.core.llvm_ppu_jm_handling) RegStore(m_ir->CreateSelect(nj, m_ir->getInt32(0x7f80'0000), m_ir->getInt32(0x7fff'ffff)), m_jm_mask);
-	RegStore(Trunc(vscr, GetType<bool>()), m_sat);
+	RegStore(m_ir->CreateInsertElement(ConstantAggregateZero::get(GetType<u32[4]>()), m_ir->CreateAnd(vscr, 1), m_ir->getInt32(0)), m_sat);
 }

 void PPUTranslator::VADDCUW(ppu_opcode_t op)
@ -669,7 +670,7 @@ void PPUTranslator::VADDSBS(ppu_opcode_t op)
 	const auto [a, b] = get_vrs<s8[16]>(op.va, op.vb);
 	const auto r = add_sat(a, b);
 	set_vr(op.vd, r);
-	SetSat(IsNotZero(eval(r != (a + b)).value));
+	set_sat(r ^ (a + b));
 }

 void PPUTranslator::VADDSHS(ppu_opcode_t op)
@ -677,7 +678,7 @@ void PPUTranslator::VADDSHS(ppu_opcode_t op)
 	const auto [a, b] = get_vrs<s16[8]>(op.va, op.vb);
 	const auto r = add_sat(a, b);
 	set_vr(op.vd, r);
-	SetSat(IsNotZero(eval(r != (a + b)).value));
+	set_sat(r ^ (a + b));
 }

 void PPUTranslator::VADDSWS(ppu_opcode_t op)
@ -685,7 +686,7 @@ void PPUTranslator::VADDSWS(ppu_opcode_t op)
 	const auto [a, b] = get_vrs<s32[4]>(op.va, op.vb);
 	const auto r = add_sat(a, b);
 	set_vr(op.vd, r);
-	SetSat(IsNotZero(eval(r != (a + b)).value));
+	set_sat(r ^ (a + b));
 }

 void PPUTranslator::VADDUBM(ppu_opcode_t op)
@ -699,7 +700,7 @@ void PPUTranslator::VADDUBS(ppu_opcode_t op)
 	const auto [a, b] = get_vrs<u8[16]>(op.va, op.vb);
 	const auto r = add_sat(a, b);
 	set_vr(op.vd, r);
-	SetSat(IsNotZero(eval(r != (a + b)).value));
+	set_sat(r ^ (a + b));
 }

 void PPUTranslator::VADDUHM(ppu_opcode_t op)
@ -713,7 +714,7 @@ void PPUTranslator::VADDUHS(ppu_opcode_t op)
 	const auto [a, b] = get_vrs<u16[8]>(op.va, op.vb);
 	const auto r = add_sat(a, b);
 	set_vr(op.vd, r);
-	SetSat(IsNotZero(eval(r != (a + b)).value));
+	set_sat(r ^ (a + b));
 }

 void PPUTranslator::VADDUWM(ppu_opcode_t op)
@ -727,7 +728,7 @@ void PPUTranslator::VADDUWS(ppu_opcode_t op)
 	const auto [a, b] = get_vrs<u32[4]>(op.va, op.vb);
 	const auto r = add_sat(a, b);
 	set_vr(op.vd, r);
-	SetSat(IsNotZero(eval(r != (a + b)).value));
+	set_sat(r ^ (a + b));
 }

 void PPUTranslator::VAND(ppu_opcode_t op)
@ -906,7 +907,7 @@ void PPUTranslator::VCTSXS(ppu_opcode_t op)
 	const auto sat_h = fcmp_ord(scaled >= fsplat<f32[4]>(std::pow(2, 31)));
 	const auto converted = fpcast<s32[4]>(select(sat_l, const1, scaled));
 	set_vr(op.vd, select(sat_h, splat<s32[4]>(0x7fff'ffff), converted));
-	SetSat(IsNotZero(eval(sat_l | sat_h).value));
+	set_sat(sext<s32[4]>(sat_l) | sext<s32[4]>(sat_h));
 }

 void PPUTranslator::VCTUXS(ppu_opcode_t op)
@ -919,7 +920,7 @@ void PPUTranslator::VCTUXS(ppu_opcode_t op)
 	const auto sat_h = fcmp_ord(scaled >= fsplat<f32[4]>(std::pow(2, 32))); // TODO ???
 	const auto converted = fpcast<u32[4]>(select(sat_l, const0, scaled));
 	set_vr(op.vd, select(sat_h, splat<u32[4]>(0xffff'ffff), converted));
-	SetSat(IsNotZero(eval(sat_l | sat_h).value));
+	set_sat(sext<s32[4]>(sat_l) | sext<s32[4]>(sat_h));
 }

 void PPUTranslator::VEXPTEFP(ppu_opcode_t op)
@ -1038,7 +1039,7 @@ void PPUTranslator::VMHADDSHS(ppu_opcode_t op)
 	const auto m = ((sext<s32[8]>(a) * sext<s32[8]>(b)) >> 15) + sext<s32[8]>(c);
 	const auto r = trunc<u16[8]>(min(max(m, splat<s32[8]>(-0x8000)), splat<s32[8]>(0x7fff)));
 	set_vr(op.vd, r);
-	SetSat(IsNotZero(eval((m + 0x8000) >> 16).value));
+	set_sat(trunc<u16[8]>((m + 0x8000) >> 16));
 }

 void PPUTranslator::VMHRADDSHS(ppu_opcode_t op)
@ -1048,7 +1049,7 @@ void PPUTranslator::VMHRADDSHS(ppu_opcode_t op)
 	const auto m = ((sext<s32[8]>(a) * sext<s32[8]>(b) + splat<s32[8]>(0x4000)) >> 15) + sext<s32[8]>(c);
 	const auto r = trunc<u16[8]>(min(max(m, splat<s32[8]>(-0x8000)), splat<s32[8]>(0x7fff)));
 	set_vr(op.vd, r);
-	SetSat(IsNotZero(eval((m + 0x8000) >> 16).value));
+	set_sat(trunc<u16[8]>((m + 0x8000) >> 16));
 }

 void PPUTranslator::VMINFP(ppu_opcode_t op)
@ -1164,7 +1165,7 @@ void PPUTranslator::VMSUMSHS(ppu_opcode_t op)
 	const auto mx = eval(m ^ sext<s32[4]>(m == 0x80000000u));
 	const auto x = eval(((mx ^ s) & ~(c ^ mx)) >> 31);
 	set_vr(op.vd, eval((z & x) | (s & ~x)));
-	SetSat(IsNotZero(x.value));
+	set_sat(x);
 }

 void PPUTranslator::VMSUMUBM(ppu_opcode_t op)
@ -1193,7 +1194,7 @@ void PPUTranslator::VMSUMUHS(ppu_opcode_t op)
 	const auto s2 = eval(s + c);
 	const auto x = eval((s < ml) | (s2 < s));
 	set_vr(op.vd, select(x, splat<u32[4]>(-1), s2));
-	SetSat(IsNotZero(x.value));
+	set_sat(x);
 }

 void PPUTranslator::VMULESB(ppu_opcode_t op)
@ -1333,7 +1334,7 @@ void PPUTranslator::VPKSHSS(ppu_opcode_t op)
 	const auto ab = shuffle2(b, a, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
 	const auto r = trunc<u8[16]>(min(max(ab, splat<s16[16]>(-0x80)), splat<s16[16]>(0x7f)));
 	set_vr(op.vd, r);
-	SetSat(IsNotZero(eval(((a + 0x80) | (b + 0x80)) >> 8).value));
+	set_sat(((a + 0x80) | (b + 0x80)) >> 8);
 }

 void PPUTranslator::VPKSHUS(ppu_opcode_t op)
@ -1343,7 +1344,7 @@ void PPUTranslator::VPKSHUS(ppu_opcode_t op)
 	const auto ab = shuffle2(b, a, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
 	const auto r = trunc<u8[16]>(min(max(ab, splat<s16[16]>(0)), splat<s16[16]>(0xff)));
 	set_vr(op.vd, r);
-	SetSat(IsNotZero(eval((a | b) >> 8).value));
+	set_sat((a | b) >> 8);
 }

 void PPUTranslator::VPKSWSS(ppu_opcode_t op)
@ -1353,7 +1354,7 @@ void PPUTranslator::VPKSWSS(ppu_opcode_t op)
 	const auto ab = shuffle2(b, a, 0, 1, 2, 3, 4, 5, 6, 7);
 	const auto r = trunc<u16[8]>(min(max(ab, splat<s32[8]>(-0x8000)), splat<s32[8]>(0x7fff)));
 	set_vr(op.vd, r);
-	SetSat(IsNotZero(eval(((a + 0x8000) | (b + 0x8000)) >> 16).value));
+	set_sat(((a + 0x8000) | (b + 0x8000)) >> 16);
 }

 void PPUTranslator::VPKSWUS(ppu_opcode_t op)
@ -1363,7 +1364,7 @@ void PPUTranslator::VPKSWUS(ppu_opcode_t op)
 	const auto ab = shuffle2(b, a, 0, 1, 2, 3, 4, 5, 6, 7);
 	const auto r = trunc<u16[8]>(min(max(ab, splat<s32[8]>(0)), splat<s32[8]>(0xffff)));
 	set_vr(op.vd, r);
-	SetSat(IsNotZero(eval((a | b) >> 16).value));
+	set_sat((a | b) >> 16);
 }

 void PPUTranslator::VPKUHUM(ppu_opcode_t op)
@ -1380,7 +1381,7 @@ void PPUTranslator::VPKUHUS(ppu_opcode_t op)
 	const auto tb = bitcast<u8[16]>(min(b, splat<u16[8]>(0xff)));
 	const auto r = shuffle2(tb, ta, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
 	set_vr(op.vd, r);
-	SetSat(IsNotZero(eval((a | b) >> 8).value));
+	set_sat((a | b) >> 8);
 }

 void PPUTranslator::VPKUWUM(ppu_opcode_t op)
@ -1397,7 +1398,7 @@ void PPUTranslator::VPKUWUS(ppu_opcode_t op)
 	const auto tb = bitcast<u16[8]>(min(b, splat<u32[4]>(0xffff)));
 	const auto r = shuffle2(tb, ta, 0, 2, 4, 6, 8, 10, 12, 14);
 	set_vr(op.vd, r);
-	SetSat(IsNotZero(eval((a | b) >> 16).value));
+	set_sat((a | b) >> 16);
 }

 void PPUTranslator::VREFP(ppu_opcode_t op)
@ -1670,7 +1671,7 @@ void PPUTranslator::VSUBSBS(ppu_opcode_t op)
 	const auto [a, b] = get_vrs<s8[16]>(op.va, op.vb);
 	const auto r = sub_sat(a, b);
 	set_vr(op.vd, r);
-	SetSat(IsNotZero(eval(r != (a - b)).value));
+	set_sat(r ^ (a - b));
 }

 void PPUTranslator::VSUBSHS(ppu_opcode_t op)
@ -1678,7 +1679,7 @@ void PPUTranslator::VSUBSHS(ppu_opcode_t op)
 	const auto [a, b] = get_vrs<s16[8]>(op.va, op.vb);
 	const auto r = sub_sat(a, b);
 	set_vr(op.vd, r);
-	SetSat(IsNotZero(eval(r != (a - b)).value));
+	set_sat(r ^ (a - b));
 }

 void PPUTranslator::VSUBSWS(ppu_opcode_t op)
@ -1686,7 +1687,7 @@ void PPUTranslator::VSUBSWS(ppu_opcode_t op)
 	const auto [a, b] = get_vrs<s32[4]>(op.va, op.vb);
 	const auto r = sub_sat(a, b);
 	set_vr(op.vd, r);
-	SetSat(IsNotZero(eval(r != (a - b)).value));
+	set_sat(r ^ (a - b));
 }

 void PPUTranslator::VSUBUBM(ppu_opcode_t op)
@ -1700,7 +1701,7 @@ void PPUTranslator::VSUBUBS(ppu_opcode_t op)
 	const auto [a, b] = get_vrs<u8[16]>(op.va, op.vb);
 	const auto r = sub_sat(a, b);
 	set_vr(op.vd, r);
-	SetSat(IsNotZero(eval(r != (a - b)).value));
+	set_sat(r ^ (a - b));
 }

 void PPUTranslator::VSUBUHM(ppu_opcode_t op)
@ -1714,7 +1715,7 @@ void PPUTranslator::VSUBUHS(ppu_opcode_t op)
 	const auto [a, b] = get_vrs<u16[8]>(op.va, op.vb);
 	const auto r = sub_sat(a, b);
 	set_vr(op.vd, r);
-	SetSat(IsNotZero(eval(r != (a - b)).value));
+	set_sat(r ^ (a - b));
 }

 void PPUTranslator::VSUBUWM(ppu_opcode_t op)
@ -1728,7 +1729,7 @@ void PPUTranslator::VSUBUWS(ppu_opcode_t op)
 	const auto [a, b] = get_vrs<u32[4]>(op.va, op.vb);
 	const auto r = sub_sat(a, b);
 	set_vr(op.vd, r);
-	SetSat(IsNotZero(eval(r != (a - b)).value));
+	set_sat(r ^ (a - b));
 }

 void PPUTranslator::VSUMSWS(ppu_opcode_t op)
@ -1740,7 +1741,7 @@ void PPUTranslator::VSUMSWS(ppu_opcode_t op)
 	const auto s = eval(x + y + z);
 	const auto r = min(max(zshuffle(s, 0, 2) + zshuffle(s, 1, 2), splat<s64[2]>(-0x8000'0000ll)), splat<s64[2]>(0x7fff'ffff));
 	set_vr(op.vd, zshuffle(bitcast<u32[4]>(r), 0, 4, 4, 4));
-	SetSat(IsNotZero(eval((r + 0x8000'0000) >> 32).value));
+	set_sat((r + 0x8000'0000) >> 32);
 }

 void PPUTranslator::VSUM2SWS(ppu_opcode_t op)
@ -1751,7 +1752,7 @@ void PPUTranslator::VSUM2SWS(ppu_opcode_t op)
 	const auto z = b >> 32;
 	const auto r = min(max(x + y + z, splat<s64[2]>(-0x8000'0000ll)), splat<s64[2]>(0x7fff'ffff));
 	set_vr(op.vd, zshuffle(bitcast<u32[4]>(r), 0, 4, 2, 4));
-	SetSat(IsNotZero(eval((r + 0x8000'0000) >> 32).value));
+	set_sat((r + 0x8000'0000) >> 32);
 }

 void PPUTranslator::VSUM4SBS(ppu_opcode_t op)
@ -1765,7 +1766,7 @@ void PPUTranslator::VSUM4SBS(ppu_opcode_t op)
 	const auto s = eval(x + y + z + w); // Can't overflow
 	const auto r = add_sat(s, b);
 	set_vr(op.vd, r);
-	SetSat(IsNotZero(eval(r != (s + b)).value));
+	set_sat(r ^ (s + b));
 }

 void PPUTranslator::VSUM4SHS(ppu_opcode_t op)
@ -1777,7 +1778,7 @@ void PPUTranslator::VSUM4SHS(ppu_opcode_t op)
 	const auto s = eval(x + y); // Can't overflow
 	const auto r = add_sat(s, b);
 	set_vr(op.vd, r);
-	SetSat(IsNotZero(eval(r != (s + b)).value));
+	set_sat(r ^ (s + b));
 }

 void PPUTranslator::VSUM4UBS(ppu_opcode_t op)
@ -1791,7 +1792,7 @@ void PPUTranslator::VSUM4UBS(ppu_opcode_t op)
 	const auto s = eval(x + y + z + w); // Can't overflow
 	const auto r = add_sat(s, b);
 	set_vr(op.vd, r);
-	SetSat(IsNotZero(eval(r != (s + b)).value));
+	set_sat(r ^ (s + b));
 }

 #define UNPACK_PIXEL_OP(px) (px & 0xff00001f) | ((px << 6) & 0x1f0000) | ((px << 3) & 0x1f00)
@ -4813,14 +4814,6 @@ void PPUTranslator::SetOverflow(Value* bit)
 	RegStore(m_ir->CreateOr(RegLoad(m_so), bit), m_so);
 }

-void PPUTranslator::SetSat(Value* bit)
-{
-	if (m_attr & ppu_attr::has_mfvscr)
-	{
-		RegStore(m_ir->CreateOr(RegLoad(m_sat), bit), m_sat);
-	}
-}
-
 Value* PPUTranslator::CheckTrapCondition(u32 to, Value* left, Value* right)
 {
 	Value* trap_condition = m_ir->getFalse();
--- a/rpcs3/Emu/Cell/PPUTranslator.h
+++ b/rpcs3/Emu/Cell/PPUTranslator.h
@ -79,8 +79,8 @@ class PPUTranslator final : public cpu_translator
 	DEF_VALUE(m_ov, m_g_ov, 168) // XER.OV bit, overflow flag
 	DEF_VALUE(m_ca, m_g_ca, 169) // XER.CA bit, carry flag
 	DEF_VALUE(m_cnt, m_g_cnt, 170) // XER.CNT
-	DEF_VALUE(m_sat, m_g_sat, 171) // VSCR.SAT bit, sticky saturation flag
-	DEF_VALUE(m_nj, m_g_nj, 172) // VSCR.NJ bit, non-Java mode
+	DEF_VALUE(m_nj, m_g_nj, 171) // VSCR.NJ bit, non-Java mode
+	DEF_VALUE(m_sat, m_g_sat, 173) // VSCR.SAT, sticky saturation accumulator (u32[4]; non-zero means set)
 	DEF_VALUE(m_jm_mask, m_g_jm_mask, 174) // Java-Mode helper mask

 #undef DEF_VALUE
@ -118,6 +118,17 @@ public:
 		return result;
 	}

+	// Update sticky VSCR.SAT accumulator (|=); the store is only emitted when ppu_attr::has_mfvscr is set
+	template <typename T>
+	void set_sat(T&& expr)
+	{
+		if (m_attr & ppu_attr::has_mfvscr)
+		{
+			const auto val = expr.eval(m_ir);
+			RegStore(m_ir->CreateOr(m_ir->CreateBitCast(RegLoad(m_sat), val->getType()), val), m_sat);
+		}
+	}
+
 	// Get current instruction address
 	llvm::Value* GetAddr(u64 _add = 0);

@ -265,9 +276,6 @@ public:
 	// Set XER.OV bit, and update XER.SO bit (|=)
 	void SetOverflow(llvm::Value*);

-	// Update sticky VSCR.SAT bit (|=)
-	void SetSat(llvm::Value*);
-
 	// Check condition for trap instructions
 	llvm::Value* CheckTrapCondition(u32 to, llvm::Value* left, llvm::Value* right);