PPU: some instructions replaced

2025-03-15 22:21:25 +00:00 · 2015-03-26 21:42:12 +03:00 · 2015-03-26 21:42:12 +03:00 · d640aba903
commit d640aba903
parent 75fa95c7ff
4 changed files with 38 additions and 118 deletions
--- a/Utilities/BEType.h
+++ b/Utilities/BEType.h
@@ -307,16 +307,6 @@ union _CRT_ALIGN(16) u128
 		return fromV(_mm_cmpeq_epi8(left.vi, right.vi));
 	}

-	static __forceinline u128 gtu8(const u128& left, const u128& right)
-	{
-		return fromV(_mm_cmpgt_epu8(left.vi, right.vi));
-	}
-
-	static __forceinline u128 leu8(const u128& left, const u128& right)
-	{
-		return fromV(_mm_cmple_epu8(left.vi, right.vi));
-	}
-
 	bool operator == (const u128& right) const
 	{
 		return (_u64[0] == right._u64[0]) && (_u64[1] == right._u64[1]);
--- a/Utilities/GNU.h
+++ b/Utilities/GNU.h
@@ -342,15 +342,21 @@ static __forceinline uint64_t cntlz64(uint64_t arg)
 }

 // compare 16 packed unsigned bytes (greater than)
-static __forceinline __m128i _mm_cmpgt_epu8(__m128i A, __m128i B)
+inline __m128i sse_cmpgt_epu8(__m128i A, __m128i B)
 {
 	// (A xor 0x80) > (B xor 0x80)
-	return _mm_cmpgt_epi8(_mm_xor_si128(A, _mm_set1_epi8(-128)), _mm_xor_si128(B, _mm_set1_epi8(-128)));
+	const auto sign = _mm_set1_epi32(0x80808080);
+	return _mm_cmpgt_epi8(_mm_xor_si128(A, sign), _mm_xor_si128(B, sign));
 }

-// compare 16 packed unsigned bytes (less or equal)
-static __forceinline __m128i _mm_cmple_epu8(__m128i A, __m128i B)
+inline __m128i sse_cmpgt_epu16(__m128i A, __m128i B)
 {
-	// ((B xor 0x80) > (A xor 0x80)) || A == B
-	return _mm_or_si128(_mm_cmpgt_epu8(B, A), _mm_cmpeq_epi8(A, B));
+	const auto sign = _mm_set1_epi32(0x80008000);
+	return _mm_cmpgt_epi16(_mm_xor_si128(A, sign), _mm_xor_si128(B, sign));
+}
+
+inline __m128i sse_cmpgt_epu32(__m128i A, __m128i B)
+{
+	const auto sign = _mm_set1_epi32(0x80000000);
+	return _mm_cmpgt_epi32(_mm_xor_si128(A, sign), _mm_xor_si128(B, sign));
 }
--- a/rpcs3/Emu/Cell/PPUInterpreter.cpp
+++ b/rpcs3/Emu/Cell/PPUInterpreter.cpp
@@ -64,10 +64,9 @@ void ppu_interpreter::MTVSCR(PPUThread& CPU, ppu_opcode_t op)

 void ppu_interpreter::VADDCUW(PPUThread& CPU, ppu_opcode_t op)
 {
-	for (uint w = 0; w < 4; w++)
-	{
-		CPU.VPR[op.vd]._u32[w] = ~CPU.VPR[op.va]._u32[w] < CPU.VPR[op.vb]._u32[w];
-	}
+	const auto a = CPU.VPR[op.va].vi;
+	const auto b = CPU.VPR[op.vb].vi;
+	CPU.VPR[op.vd].vi = _mm_srli_epi32(_mm_cmpgt_epi32(_mm_xor_si128(b, _mm_set1_epi32(0x80000000)), _mm_xor_si128(a, _mm_set1_epi32(0x7fffffff))), 31);
 }

 void ppu_interpreter::VADDFP(PPUThread& CPU, ppu_opcode_t op)
@@ -77,59 +76,23 @@ void ppu_interpreter::VADDFP(PPUThread& CPU, ppu_opcode_t op)

 void ppu_interpreter::VADDSBS(PPUThread& CPU, ppu_opcode_t op)
 {
-	for (u32 b = 0; b < 16; ++b)
-	{
-		s16 result = (s16)CPU.VPR[op.va]._s8[b] + (s16)CPU.VPR[op.vb]._s8[b];
-
-		if (result > 0x7f)
-		{
-			CPU.VPR[op.vd]._s8[b] = 0x7f;
-		}
-		else if (result < -0x80)
-		{
-			CPU.VPR[op.vd]._s8[b] = -0x80;
-		}
-		else
-			CPU.VPR[op.vd]._s8[b] = (s8)result;
-	}
+	CPU.VPR[op.vd].vi = _mm_adds_epi8(CPU.VPR[op.va].vi, CPU.VPR[op.vb].vi);
 }

 void ppu_interpreter::VADDSHS(PPUThread& CPU, ppu_opcode_t op)
 {
-	for (uint h = 0; h < 8; h++)
-	{
-		s32 result = (s32)CPU.VPR[op.va]._s16[h] + (s32)CPU.VPR[op.vb]._s16[h];
-
-		if (result > 0x7fff)
-		{
-			CPU.VPR[op.vd]._s16[h] = 0x7fff;
-		}
-		else if (result < -0x8000)
-		{
-			CPU.VPR[op.vd]._s16[h] = -0x8000;
-		}
-		else
-			CPU.VPR[op.vd]._s16[h] = result;
-	}
+	CPU.VPR[op.vd].vi = _mm_adds_epi16(CPU.VPR[op.va].vi, CPU.VPR[op.vb].vi);
 }

 void ppu_interpreter::VADDSWS(PPUThread& CPU, ppu_opcode_t op)
 {
-	for (uint w = 0; w < 4; w++)
-	{
-		s64 result = (s64)CPU.VPR[op.va]._s32[w] + (s64)CPU.VPR[op.vb]._s32[w];
-
-		if (result > 0x7fffffff)
-		{
-			CPU.VPR[op.vd]._s32[w] = 0x7fffffff;
-		}
-		else if (result < (s32)0x80000000)
-		{
-			CPU.VPR[op.vd]._s32[w] = 0x80000000;
-		}
-		else
-			CPU.VPR[op.vd]._s32[w] = (s32)result;
-	}
+	const auto a = CPU.VPR[op.va];
+	const auto b = CPU.VPR[op.vb];
+	const auto s = u128::add32(a, b); // a + b
+	const auto m = (a ^ s) & (b ^ s); // overflow bit
+	const auto x = _mm_srai_epi32(m.vi, 31); // saturation mask
+	const auto y = _mm_srai_epi32(_mm_and_si128(s.vi, m.vi), 31); // positive saturation mask
+	CPU.VPR[op.vd].vi = _mm_xor_si128(_mm_xor_si128(_mm_srli_epi32(x, 1), y), _mm_or_si128(s.vi, x));
 }

 void ppu_interpreter::VADDUBM(PPUThread& CPU, ppu_opcode_t op)
@@ -139,17 +102,7 @@ void ppu_interpreter::VADDUBM(PPUThread& CPU, ppu_opcode_t op)

 void ppu_interpreter::VADDUBS(PPUThread& CPU, ppu_opcode_t op)
 {
-	for (uint b = 0; b < 16; b++)
-	{
-		u16 result = (u16)CPU.VPR[op.va]._u8[b] + (u16)CPU.VPR[op.vb]._u8[b];
-
-		if (result > 0xff)
-		{
-			CPU.VPR[op.vd]._u8[b] = 0xff;
-		}
-		else
-			CPU.VPR[op.vd]._u8[b] = (u8)result;
-	}
+	CPU.VPR[op.vd].vi = _mm_adds_epu8(CPU.VPR[op.va].vi, CPU.VPR[op.vb].vi);
 }

 void ppu_interpreter::VADDUHM(PPUThread& CPU, ppu_opcode_t op)
@@ -159,17 +112,7 @@ void ppu_interpreter::VADDUHM(PPUThread& CPU, ppu_opcode_t op)

 void ppu_interpreter::VADDUHS(PPUThread& CPU, ppu_opcode_t op)
 {
-	for (uint h = 0; h < 8; h++)
-	{
-		u32 result = (u32)CPU.VPR[op.va]._u16[h] + (u32)CPU.VPR[op.vb]._u16[h];
-
-		if (result > 0xffff)
-		{
-			CPU.VPR[op.vd]._u16[h] = 0xffff;
-		}
-		else
-			CPU.VPR[op.vd]._u16[h] = result;
-	}
+	CPU.VPR[op.vd].vi = _mm_adds_epu16(CPU.VPR[op.va].vi, CPU.VPR[op.vb].vi);
 }

 void ppu_interpreter::VADDUWM(PPUThread& CPU, ppu_opcode_t op)
@@ -179,17 +122,9 @@ void ppu_interpreter::VADDUWM(PPUThread& CPU, ppu_opcode_t op)

 void ppu_interpreter::VADDUWS(PPUThread& CPU, ppu_opcode_t op)
 {
-	for (uint w = 0; w < 4; w++)
-	{
-		u64 result = (u64)CPU.VPR[op.va]._u32[w] + (u64)CPU.VPR[op.vb]._u32[w];
-
-		if (result > 0xffffffff)
-		{
-			CPU.VPR[op.vd]._u32[w] = 0xffffffff;
-		}
-		else
-			CPU.VPR[op.vd]._u32[w] = (u32)result;
-	}
+	const auto a = CPU.VPR[op.va].vi;
+	const auto b = CPU.VPR[op.vb].vi;
+	CPU.VPR[op.vd].vi = _mm_or_si128(_mm_add_epi32(a, b), _mm_cmpgt_epi32(_mm_xor_si128(b, _mm_set1_epi32(0x80000000)), _mm_xor_si128(a, _mm_set1_epi32(0x7fffffff))));
 }

 void ppu_interpreter::VAND(PPUThread& CPU, ppu_opcode_t op)
@@ -228,16 +163,12 @@ void ppu_interpreter::VAVGSW(PPUThread& CPU, ppu_opcode_t op)

 void ppu_interpreter::VAVGUB(PPUThread& CPU, ppu_opcode_t op)
 {
-	for (uint b = 0; b < 16; b++)
-		CPU.VPR[op.vd]._u8[b] = (CPU.VPR[op.va]._u8[b] + CPU.VPR[op.vb]._u8[b] + 1) >> 1;
+	CPU.VPR[op.vd].vi = _mm_avg_epu8(CPU.VPR[op.va].vi, CPU.VPR[op.vb].vi);
 }

 void ppu_interpreter::VAVGUH(PPUThread& CPU, ppu_opcode_t op)
 {
-	for (uint h = 0; h < 8; h++)
-	{
-		CPU.VPR[op.vd]._u16[h] = (CPU.VPR[op.va]._u16[h] + CPU.VPR[op.vb]._u16[h] + 1) >> 1;
-	}
+	CPU.VPR[op.vd].vi = _mm_avg_epu16(CPU.VPR[op.va].vi, CPU.VPR[op.vb].vi);
 }

 void ppu_interpreter::VAVGUW(PPUThread& CPU, ppu_opcode_t op)
--- a/rpcs3/Emu/Cell/SPUInterpreter.cpp
+++ b/rpcs3/Emu/Cell/SPUInterpreter.cpp
@@ -91,10 +91,7 @@ void spu_interpreter::OR(SPUThread& CPU, spu_opcode_t op)

 void spu_interpreter::BG(SPUThread& CPU, spu_opcode_t op)
 {
-	for (u32 i = 0; i < 4; i++)
-	{
-		CPU.GPR[op.rt]._u32[i] = CPU.GPR[op.ra]._u32[i] <= CPU.GPR[op.rb]._u32[i];
-	}
+	CPU.GPR[op.rt].vi = _mm_add_epi32(sse_cmpgt_epu32(CPU.GPR[op.ra].vi, CPU.GPR[op.rb].vi), _mm_set1_epi32(1));
 }

 void spu_interpreter::SFH(SPUThread& CPU, spu_opcode_t op)
@@ -264,10 +261,9 @@ void spu_interpreter::AND(SPUThread& CPU, spu_opcode_t op)

 void spu_interpreter::CG(SPUThread& CPU, spu_opcode_t op)
 {
-	for (u32 i = 0; i < 4; i++)
-	{
-		CPU.GPR[op.rt]._u32[i] = ~CPU.GPR[op.ra]._u32[i] < CPU.GPR[op.rb]._u32[i];
-	}
+	const auto a = _mm_xor_si128(CPU.GPR[op.ra].vi, _mm_set1_epi32(0x7fffffff));
+	const auto b = _mm_xor_si128(CPU.GPR[op.rb].vi, _mm_set1_epi32(0x80000000));
+	CPU.GPR[op.rt].vi = _mm_srli_epi32(_mm_cmpgt_epi32(b, a), 31);
 }

 void spu_interpreter::AH(SPUThread& CPU, spu_opcode_t op)
@@ -665,8 +661,7 @@ void spu_interpreter::XSBH(SPUThread& CPU, spu_opcode_t op)

 void spu_interpreter::CLGT(SPUThread& CPU, spu_opcode_t op)
 {
-	const auto sign = _mm_set1_epi32(0x80000000);
-	CPU.GPR[op.rt].vi = _mm_cmpgt_epi32(_mm_xor_si128(CPU.GPR[op.ra].vi, sign), _mm_xor_si128(CPU.GPR[op.rb].vi, sign));
+	CPU.GPR[op.rt].vi = sse_cmpgt_epu32(CPU.GPR[op.ra].vi, CPU.GPR[op.rb].vi);
 }

 void spu_interpreter::ANDC(SPUThread& CPU, spu_opcode_t op)
@@ -701,8 +696,7 @@ void spu_interpreter::FM(SPUThread& CPU, spu_opcode_t op)

 void spu_interpreter::CLGTH(SPUThread& CPU, spu_opcode_t op)
 {
-	const auto sign = _mm_set1_epi32(0x80008000);
-	CPU.GPR[op.rt].vi = _mm_cmpgt_epi16(_mm_xor_si128(CPU.GPR[op.ra].vi, sign), _mm_xor_si128(CPU.GPR[op.rb].vi, sign));
+	CPU.GPR[op.rt].vi = sse_cmpgt_epu16(CPU.GPR[op.ra].vi, CPU.GPR[op.rb].vi);
 }

 void spu_interpreter::ORC(SPUThread& CPU, spu_opcode_t op)
@@ -738,8 +732,7 @@ void spu_interpreter::DFM(SPUThread& CPU, spu_opcode_t op)

 void spu_interpreter::CLGTB(SPUThread& CPU, spu_opcode_t op)
 {
-	const auto sign = _mm_set1_epi32(0x80808080);
-	CPU.GPR[op.rt].vi = _mm_cmpgt_epi8(_mm_xor_si128(CPU.GPR[op.ra].vi, sign), _mm_xor_si128(CPU.GPR[op.rb].vi, sign));
+	CPU.GPR[op.rt].vi = sse_cmpgt_epu8(CPU.GPR[op.ra].vi, CPU.GPR[op.rb].vi);
 }

 void spu_interpreter::HLGT(SPUThread& CPU, spu_opcode_t op)