PPU: some instructions replaced

This commit is contained in:
Nekotekina 2015-03-26 21:42:12 +03:00
parent 75fa95c7ff
commit d640aba903
4 changed files with 38 additions and 118 deletions

View File

@ -307,16 +307,6 @@ union _CRT_ALIGN(16) u128
return fromV(_mm_cmpeq_epi8(left.vi, right.vi));
}
// Byte-wise unsigned "left > right" over all 16 lanes.
// The comparison itself is delegated to _mm_cmpgt_epu8; its result is
// handed to fromV to build the returned u128.
static __forceinline u128 gtu8(const u128& left, const u128& right)
{
    const __m128i mask = _mm_cmpgt_epu8(left.vi, right.vi);
    return fromV(mask);
}
// Byte-wise unsigned "left <= right" over all 16 lanes.
// The comparison itself is delegated to _mm_cmple_epu8; its result is
// handed to fromV to build the returned u128.
static __forceinline u128 leu8(const u128& left, const u128& right)
{
    const __m128i mask = _mm_cmple_epu8(left.vi, right.vi);
    return fromV(mask);
}
bool operator == (const u128& right) const
{
return (_u64[0] == right._u64[0]) && (_u64[1] == right._u64[1]);

View File

@ -342,15 +342,21 @@ static __forceinline uint64_t cntlz64(uint64_t arg)
}
// Compare 16 packed unsigned bytes (greater than).
// SSE2 only offers a signed byte compare, so the top bit of every byte in
// both operands is flipped first: (A xor 0x80) > (B xor 0x80) as signed is
// the same ordering as A > B as unsigned.
// Each result byte is 0xFF where A > B and 0x00 otherwise.
inline __m128i sse_cmpgt_epu8(__m128i A, __m128i B)
{
    const auto sign = _mm_set1_epi32(0x80808080);
    return _mm_cmpgt_epi8(_mm_xor_si128(A, sign), _mm_xor_si128(B, sign));
}

// Older spelling of sse_cmpgt_epu8, kept as a thin forwarder so that code
// still calling the previous name keeps building.
inline __m128i _mm_cmpgt_epu8(__m128i A, __m128i B)
{
    return sse_cmpgt_epu8(A, B);
}
// Compare 16 packed unsigned bytes (less or equal).
// Each result byte is 0xFF where A <= B and 0x00 otherwise.
// Kept under its previous name for code that still calls it.
inline __m128i _mm_cmple_epu8(__m128i A, __m128i B)
{
    // A <= B holds exactly when the unsigned minimum of the pair is A itself
    return _mm_cmpeq_epi8(_mm_min_epu8(A, B), A);
}

// Compare 8 packed unsigned 16-bit words (greater than).
// Bit 15 of every word is flipped in both operands so that the signed
// 16-bit compare yields the unsigned ordering.
// Each result word is 0xFFFF where A > B and 0x0000 otherwise.
inline __m128i sse_cmpgt_epu16(__m128i A, __m128i B)
{
    const auto sign = _mm_set1_epi32(0x80008000);
    return _mm_cmpgt_epi16(_mm_xor_si128(A, sign), _mm_xor_si128(B, sign));
}
// Compare 4 packed unsigned 32-bit words (greater than).
// Each result word is all ones where A > B and zero otherwise.
inline __m128i sse_cmpgt_epu32(__m128i A, __m128i B)
{
    // flipping bit 31 of both operands turns unsigned order into signed order
    const __m128i bias = _mm_set1_epi32(0x80000000);
    const __m128i lhs = _mm_xor_si128(A, bias);
    const __m128i rhs = _mm_xor_si128(B, bias);
    return _mm_cmpgt_epi32(lhs, rhs);
}

View File

@ -64,10 +64,9 @@ void ppu_interpreter::MTVSCR(PPUThread& CPU, ppu_opcode_t op)
// For each of the four 32-bit lanes, writes 1 to VPR[vd] when the unsigned
// sum VPR[va] + VPR[vb] carries out of 32 bits, and 0 otherwise.
// Both sources are loaded before VPR[vd] is written, so vd may alias va or vb.
void ppu_interpreter::VADDCUW(PPUThread& CPU, ppu_opcode_t op)
{
    const auto a = CPU.VPR[op.va].vi;
    const auto b = CPU.VPR[op.vb].vi;
    // A carry occurs when b > ~a (unsigned). SSE2 only has a signed compare,
    // so bit 31 is flipped on both sides; (~a) ^ 0x80000000 == a ^ 0x7fffffff.
    const auto carry = _mm_cmpgt_epi32(_mm_xor_si128(b, _mm_set1_epi32(0x80000000)), _mm_xor_si128(a, _mm_set1_epi32(0x7fffffff)));
    // reduce the all-ones compare mask to the value 1
    CPU.VPR[op.vd].vi = _mm_srli_epi32(carry, 31);
}
void ppu_interpreter::VADDFP(PPUThread& CPU, ppu_opcode_t op)
@ -77,59 +76,23 @@ void ppu_interpreter::VADDFP(PPUThread& CPU, ppu_opcode_t op)
// Adds the 16 signed bytes of VPR[va] and VPR[vb] lane by lane and stores
// the sums in VPR[vd], clamping each sum to the range [-0x80, 0x7f].
// The sources are read by the single vector add before VPR[vd] is written,
// so vd may alias va or vb.
void ppu_interpreter::VADDSBS(PPUThread& CPU, ppu_opcode_t op)
{
    CPU.VPR[op.vd].vi = _mm_adds_epi8(CPU.VPR[op.va].vi, CPU.VPR[op.vb].vi);
}
// Adds the 8 signed 16-bit lanes of VPR[va] and VPR[vb] and stores the sums
// in VPR[vd], clamping each sum to the range [-0x8000, 0x7fff].
// The sources are read by the single vector add before VPR[vd] is written,
// so vd may alias va or vb.
void ppu_interpreter::VADDSHS(PPUThread& CPU, ppu_opcode_t op)
{
    CPU.VPR[op.vd].vi = _mm_adds_epi16(CPU.VPR[op.va].vi, CPU.VPR[op.vb].vi);
}
// Adds the 4 signed 32-bit lanes of VPR[va] and VPR[vb] and stores the sums
// in VPR[vd], clamping each sum to [0x80000000, 0x7fffffff] (signed).
// SSE2 has no 32-bit saturating add, so saturation is rebuilt from the
// wrapped sum with bit masks. Both sources are copied before VPR[vd] is
// written, so vd may alias va or vb.
void ppu_interpreter::VADDSWS(PPUThread& CPU, ppu_opcode_t op)
{
    const auto a = CPU.VPR[op.va];
    const auto b = CPU.VPR[op.vb];
    const auto s = u128::add32(a, b); // a + b (presumably wrapping per lane - confirm against u128::add32)
    const auto m = (a ^ s) & (b ^ s); // bit 31 set when both inputs differ in sign from the sum, i.e. overflow
    const auto x = _mm_srai_epi32(m.vi, 31); // all ones in lanes that must saturate
    const auto y = _mm_srai_epi32(_mm_and_si128(s.vi, m.vi), 31); // all ones where the overflowed sum went negative (positive saturation)
    // No overflow: x = y = 0, so the result is s.
    // Overflow: (s | x) is all ones; xor with 0x7fffffff gives 0x80000000,
    // and xor with y turns that into 0x7fffffff for positive saturation.
    CPU.VPR[op.vd].vi = _mm_xor_si128(_mm_xor_si128(_mm_srli_epi32(x, 1), y), _mm_or_si128(s.vi, x));
}
void ppu_interpreter::VADDUBM(PPUThread& CPU, ppu_opcode_t op)
@ -139,17 +102,7 @@ void ppu_interpreter::VADDUBM(PPUThread& CPU, ppu_opcode_t op)
// Adds the 16 unsigned bytes of VPR[va] and VPR[vb] lane by lane and stores
// the sums in VPR[vd], clamping each sum to at most 0xff.
// The sources are read by the single vector add before VPR[vd] is written,
// so vd may alias va or vb.
void ppu_interpreter::VADDUBS(PPUThread& CPU, ppu_opcode_t op)
{
    CPU.VPR[op.vd].vi = _mm_adds_epu8(CPU.VPR[op.va].vi, CPU.VPR[op.vb].vi);
}
void ppu_interpreter::VADDUHM(PPUThread& CPU, ppu_opcode_t op)
@ -159,17 +112,7 @@ void ppu_interpreter::VADDUHM(PPUThread& CPU, ppu_opcode_t op)
// Adds the 8 unsigned 16-bit lanes of VPR[va] and VPR[vb] and stores the
// sums in VPR[vd], clamping each sum to at most 0xffff.
// The sources are read by the single vector add before VPR[vd] is written,
// so vd may alias va or vb.
void ppu_interpreter::VADDUHS(PPUThread& CPU, ppu_opcode_t op)
{
    CPU.VPR[op.vd].vi = _mm_adds_epu16(CPU.VPR[op.va].vi, CPU.VPR[op.vb].vi);
}
void ppu_interpreter::VADDUWM(PPUThread& CPU, ppu_opcode_t op)
@ -179,17 +122,9 @@ void ppu_interpreter::VADDUWM(PPUThread& CPU, ppu_opcode_t op)
// Adds the 4 unsigned 32-bit lanes of VPR[va] and VPR[vb] and stores the
// sums in VPR[vd], clamping each sum to at most 0xffffffff.
// Both sources are loaded before VPR[vd] is written, so vd may alias va or vb.
void ppu_interpreter::VADDUWS(PPUThread& CPU, ppu_opcode_t op)
{
    const auto a = CPU.VPR[op.va].vi;
    const auto b = CPU.VPR[op.vb].vi;
    // The sum overflows when b > ~a (unsigned). Bit 31 is flipped on both
    // sides so the signed compare can be used; (~a) ^ 0x80000000 == a ^ 0x7fffffff.
    const auto overflow = _mm_cmpgt_epi32(_mm_xor_si128(b, _mm_set1_epi32(0x80000000)), _mm_xor_si128(a, _mm_set1_epi32(0x7fffffff)));
    // OR-ing the all-ones overflow mask into the wrapped sum forces 0xffffffff
    CPU.VPR[op.vd].vi = _mm_or_si128(_mm_add_epi32(a, b), overflow);
}
void ppu_interpreter::VAND(PPUThread& CPU, ppu_opcode_t op)
@ -228,16 +163,12 @@ void ppu_interpreter::VAVGSW(PPUThread& CPU, ppu_opcode_t op)
// Rounded average of the 16 unsigned bytes of VPR[va] and VPR[vb]:
// each lane of VPR[vd] receives (a + b + 1) >> 1, computed without losing
// the carry out of 8 bits.
// The sources are read by the single vector op before VPR[vd] is written,
// so vd may alias va or vb.
void ppu_interpreter::VAVGUB(PPUThread& CPU, ppu_opcode_t op)
{
    CPU.VPR[op.vd].vi = _mm_avg_epu8(CPU.VPR[op.va].vi, CPU.VPR[op.vb].vi);
}
// Rounded average of the 8 unsigned 16-bit lanes of VPR[va] and VPR[vb]:
// each lane of VPR[vd] receives (a + b + 1) >> 1, computed without losing
// the carry out of 16 bits.
// The sources are read by the single vector op before VPR[vd] is written,
// so vd may alias va or vb.
void ppu_interpreter::VAVGUH(PPUThread& CPU, ppu_opcode_t op)
{
    CPU.VPR[op.vd].vi = _mm_avg_epu16(CPU.VPR[op.va].vi, CPU.VPR[op.vb].vi);
}
void ppu_interpreter::VAVGUW(PPUThread& CPU, ppu_opcode_t op)

View File

@ -91,10 +91,7 @@ void spu_interpreter::OR(SPUThread& CPU, spu_opcode_t op)
// For each of the four 32-bit lanes, writes 1 to GPR[rt] when
// GPR[ra] <= GPR[rb] (unsigned) and 0 otherwise.
// Both sources are read before GPR[rt] is written, so rt may alias ra or rb.
void spu_interpreter::BG(SPUThread& CPU, spu_opcode_t op)
{
    // sse_cmpgt_epu32 yields -1 where ra > rb and 0 elsewhere; adding 1
    // maps that to 0 where ra > rb and 1 where ra <= rb.
    const auto greater = sse_cmpgt_epu32(CPU.GPR[op.ra].vi, CPU.GPR[op.rb].vi);
    CPU.GPR[op.rt].vi = _mm_add_epi32(greater, _mm_set1_epi32(1));
}
void spu_interpreter::SFH(SPUThread& CPU, spu_opcode_t op)
@ -264,10 +261,9 @@ void spu_interpreter::AND(SPUThread& CPU, spu_opcode_t op)
// For each of the four 32-bit lanes, writes 1 to GPR[rt] when the unsigned
// sum GPR[ra] + GPR[rb] carries out of 32 bits, and 0 otherwise.
// Both sources are read before GPR[rt] is written, so rt may alias ra or rb.
void spu_interpreter::CG(SPUThread& CPU, spu_opcode_t op)
{
    // A carry occurs when rb > ~ra (unsigned). Bit 31 is flipped on both
    // sides so the signed compare applies; (~ra) ^ 0x80000000 == ra ^ 0x7fffffff.
    const auto a = _mm_xor_si128(CPU.GPR[op.ra].vi, _mm_set1_epi32(0x7fffffff));
    const auto b = _mm_xor_si128(CPU.GPR[op.rb].vi, _mm_set1_epi32(0x80000000));
    // reduce the all-ones compare mask to the value 1
    CPU.GPR[op.rt].vi = _mm_srli_epi32(_mm_cmpgt_epi32(b, a), 31);
}
void spu_interpreter::AH(SPUThread& CPU, spu_opcode_t op)
@ -665,8 +661,7 @@ void spu_interpreter::XSBH(SPUThread& CPU, spu_opcode_t op)
// Unsigned "greater than" on the four 32-bit lanes: each lane of GPR[rt]
// becomes all ones where GPR[ra] > GPR[rb] and zero otherwise.
// Both sources are passed to the helper before GPR[rt] is written, so rt
// may alias ra or rb.
void spu_interpreter::CLGT(SPUThread& CPU, spu_opcode_t op)
{
    CPU.GPR[op.rt].vi = sse_cmpgt_epu32(CPU.GPR[op.ra].vi, CPU.GPR[op.rb].vi);
}
void spu_interpreter::ANDC(SPUThread& CPU, spu_opcode_t op)
@ -701,8 +696,7 @@ void spu_interpreter::FM(SPUThread& CPU, spu_opcode_t op)
// Unsigned "greater than" on the eight 16-bit lanes: each lane of GPR[rt]
// becomes all ones where GPR[ra] > GPR[rb] and zero otherwise.
// Both sources are passed to the helper before GPR[rt] is written, so rt
// may alias ra or rb.
void spu_interpreter::CLGTH(SPUThread& CPU, spu_opcode_t op)
{
    CPU.GPR[op.rt].vi = sse_cmpgt_epu16(CPU.GPR[op.ra].vi, CPU.GPR[op.rb].vi);
}
void spu_interpreter::ORC(SPUThread& CPU, spu_opcode_t op)
@ -738,8 +732,7 @@ void spu_interpreter::DFM(SPUThread& CPU, spu_opcode_t op)
// Unsigned "greater than" on the sixteen byte lanes: each byte of GPR[rt]
// becomes 0xff where GPR[ra] > GPR[rb] and zero otherwise.
// Both sources are passed to the helper before GPR[rt] is written, so rt
// may alias ra or rb.
void spu_interpreter::CLGTB(SPUThread& CPU, spu_opcode_t op)
{
    CPU.GPR[op.rt].vi = sse_cmpgt_epu8(CPU.GPR[op.ra].vi, CPU.GPR[op.rb].vi);
}
void spu_interpreter::HLGT(SPUThread& CPU, spu_opcode_t op)