PPU: rewrite LVSL/LVSR instructions

Make the tables endian-agnostic.
This commit is contained in:
Nekotekina 2022-01-20 22:57:21 +03:00
parent 628354ba92
commit aa7c9dd15d

View File

@ -500,70 +500,6 @@ inline u32 ppu_record_call(ppu_thread& ppu, u32 new_cia, ppu_opcode_t op, bool i
}
}
// Scalar byte-shuffle helper.
// For each of the 16 lanes, the low four bits of the matching byte in `index`
// select which byte of `data` is copied to the result. Lanes whose `index`
// byte is not greater than -1 (as a signed byte) are cleared to zero.
extern SAFE_BUFFERS(__m128i) sse_pshufb(__m128i data, __m128i index)
{
	// All-ones in every lane whose selector is non-negative, zero elsewhere.
	const __m128i keep_mask = _mm_cmpgt_epi8(index, _mm_set1_epi8(-1));

	// Only the low nibble of each selector picks a source byte.
	v128 selector = _mm_and_si128(index, _mm_set1_epi8(0xf));
	v128 source = data;
	v128 shuffled;

	for (int lane = 0; lane != 16; ++lane)
	{
		const auto pick = selector._u8[lane];
		shuffled._u8[lane] = source._u8[pick];
	}

	// Clear the lanes that were flagged for zeroing.
	return _mm_and_si128(shuffled, keep_mask);
}
// Returns the LVSL constant for the low four bits of `addr`.
// In memory order, byte j of the result is (addr & 0xf) + 15 - j, i.e. the
// values run from shift + 0x0f at byte 0 down to shift + 0x00 at byte 15.
extern __m128i sse_altivec_lvsl(u64 addr)
{
	// Pattern for shift 0: 0x0f at the lowest memory byte down to 0x00 at the highest.
	const __m128i descending = _mm_setr_epi8(
		0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08,
		0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00);

	// Every other shift is the same pattern with the shift added to each byte.
	// The largest value produced is 0x0f + 0x0f = 0x1e, so no byte wraps around.
	const __m128i shift = _mm_set1_epi8(static_cast<char>(addr & 0xf));

	return _mm_add_epi8(descending, shift);
}
// Returns the LVSR constant for the low four bits of `addr`.
// In memory order, byte j of the result is 31 - j - (addr & 0xf), i.e. the
// values run from 0x1f - shift at byte 0 down to 0x10 - shift at byte 15.
extern __m128i sse_altivec_lvsr(u64 addr)
{
	// Pattern for shift 0: 0x1f at the lowest memory byte down to 0x10 at the highest.
	const __m128i descending = _mm_setr_epi8(
		0x1f, 0x1e, 0x1d, 0x1c, 0x1b, 0x1a, 0x19, 0x18,
		0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10);

	// Every other shift is the same pattern with the shift subtracted from each byte.
	// The smallest value produced is 0x10 - 0x0f = 0x01, so no byte wraps around.
	const __m128i shift = _mm_set1_epi8(static_cast<char>(addr & 0xf));

	return _mm_sub_epi8(descending, shift);
}
template<typename T>
struct add_flags_result_t
{
@ -3739,16 +3675,40 @@ auto TW()
};
}
// Base pattern holding the byte values 0x00..0x0f, shared by the LVSL and LVSR tables.
// NOTE(review): from64r presumably places the two 64-bit halves so that the pattern reads
// the same regardless of host byte order (the commit intends endian-agnostic tables) - confirm.
const v128 s_lvsl_base = v128::from64r(0x0001020304050607, 0x08090a0b0c0d0e0f);
// LVSL result table, indexed by (addr % 16) in LVSL().
// Entry i is s_lvsl_base combined with a broadcast of i through gv_add8
// (presumably a per-byte addition, giving the values i..i+15 - confirm against gv_add8).
const v128 s_lvsl_consts[16] =
{
gv_add8(s_lvsl_base, gv_bcst8(0)),
gv_add8(s_lvsl_base, gv_bcst8(1)),
gv_add8(s_lvsl_base, gv_bcst8(2)),
gv_add8(s_lvsl_base, gv_bcst8(3)),
gv_add8(s_lvsl_base, gv_bcst8(4)),
gv_add8(s_lvsl_base, gv_bcst8(5)),
gv_add8(s_lvsl_base, gv_bcst8(6)),
gv_add8(s_lvsl_base, gv_bcst8(7)),
gv_add8(s_lvsl_base, gv_bcst8(8)),
gv_add8(s_lvsl_base, gv_bcst8(9)),
gv_add8(s_lvsl_base, gv_bcst8(10)),
gv_add8(s_lvsl_base, gv_bcst8(11)),
gv_add8(s_lvsl_base, gv_bcst8(12)),
gv_add8(s_lvsl_base, gv_bcst8(13)),
gv_add8(s_lvsl_base, gv_bcst8(14)),
gv_add8(s_lvsl_base, gv_bcst8(15)),
};
// LVSL handler factory.
// When Build == 0xf1a6 it returns ppu_exec_select<Flags...>::select<>() instead of building exec.
// The exec lambda forms the address as gpr[ra] + gpr[rb] (or gpr[rb] alone when the ra field
// is 0) and writes a 16-byte constant chosen by the address into vr[vd].
// NOTE(review): this span shows two lambda openings back to back - one calling
// sse_altivec_lvsl(addr) and one indexing s_lvsl_consts[addr % 16]. They look like the
// pre-change and post-change lines of a diff rendered without +/- markers; confirm which
// version is live before relying on this text as compilable source.
template <u32 Build, ppu_exec_bit... Flags>
auto LVSL()
{
if constexpr (Build == 0xf1a6)
return ppu_exec_select<Flags...>::template select<>();
static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) {
const u64 addr = op.ra ? ppu.gpr[op.ra] + ppu.gpr[op.rb] : ppu.gpr[op.rb];
ppu.vr[op.vd] = sse_altivec_lvsl(addr);
static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op)
{
const u64 addr = op.ra ? ppu.gpr[op.ra] + ppu.gpr[op.rb] : ppu.gpr[op.rb];
// Only the low four bits of the address select the table entry.
ppu.vr[op.vd] = s_lvsl_consts[addr % 16];
};
RETURN_(ppu, op);
}
@ -3982,16 +3942,38 @@ auto CMPL()
RETURN_(ppu, op);
}
// LVSR result table, indexed by (addr % 16) in LVSR().
// Entry i is s_lvsl_base combined with a broadcast of (16 - i) through gv_add8
// (presumably a per-byte addition, giving the values (16 - i)..(31 - i) - confirm against gv_add8).
// The broadcast operand therefore counts down from 16 to 1 across the table.
const v128 s_lvsr_consts[16] =
{
gv_add8(s_lvsl_base, gv_bcst8(16)),
gv_add8(s_lvsl_base, gv_bcst8(15)),
gv_add8(s_lvsl_base, gv_bcst8(14)),
gv_add8(s_lvsl_base, gv_bcst8(13)),
gv_add8(s_lvsl_base, gv_bcst8(12)),
gv_add8(s_lvsl_base, gv_bcst8(11)),
gv_add8(s_lvsl_base, gv_bcst8(10)),
gv_add8(s_lvsl_base, gv_bcst8(9)),
gv_add8(s_lvsl_base, gv_bcst8(8)),
gv_add8(s_lvsl_base, gv_bcst8(7)),
gv_add8(s_lvsl_base, gv_bcst8(6)),
gv_add8(s_lvsl_base, gv_bcst8(5)),
gv_add8(s_lvsl_base, gv_bcst8(4)),
gv_add8(s_lvsl_base, gv_bcst8(3)),
gv_add8(s_lvsl_base, gv_bcst8(2)),
gv_add8(s_lvsl_base, gv_bcst8(1)),
};
// LVSR handler factory.
// When Build == 0xf1a6 it returns ppu_exec_select<Flags...>::select<>() instead of building exec.
// The exec lambda forms the address as gpr[ra] + gpr[rb] (or gpr[rb] alone when the ra field
// is 0) and writes a 16-byte constant chosen by the address into vr[vd].
// NOTE(review): this span shows two lambda openings back to back - one calling
// sse_altivec_lvsr(addr) and one indexing s_lvsr_consts[addr % 16]. They look like the
// pre-change and post-change lines of a diff rendered without +/- markers; confirm which
// version is live before relying on this text as compilable source.
template <u32 Build, ppu_exec_bit... Flags>
auto LVSR()
{
if constexpr (Build == 0xf1a6)
return ppu_exec_select<Flags...>::template select<>();
static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) {
const u64 addr = op.ra ? ppu.gpr[op.ra] + ppu.gpr[op.rb] : ppu.gpr[op.rb];
ppu.vr[op.vd] = sse_altivec_lvsr(addr);
static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op)
{
const u64 addr = op.ra ? ppu.gpr[op.ra] + ppu.gpr[op.rb] : ppu.gpr[op.rb];
// Only the low four bits of the address select the table entry.
ppu.vr[op.vd] = s_lvsr_consts[addr % 16];
};
RETURN_(ppu, op);
}