From aa7c9dd15d85b51506516f6e4c68906005d3b987 Mon Sep 17 00:00:00 2001
From: Nekotekina <nekotekina@gmail.com>
Date: Thu, 20 Jan 2022 22:57:21 +0300
Subject: [PATCH] PPU: rewrite LVSL/LVSR instructions

Make the tables endian-agnostic.
---
 rpcs3/Emu/Cell/PPUInterpreter.cpp | 122 +++++++++++++-----------------
 1 file changed, 52 insertions(+), 70 deletions(-)

diff --git a/rpcs3/Emu/Cell/PPUInterpreter.cpp b/rpcs3/Emu/Cell/PPUInterpreter.cpp
index 2a615a5146..eda93c446e 100644
--- a/rpcs3/Emu/Cell/PPUInterpreter.cpp
+++ b/rpcs3/Emu/Cell/PPUInterpreter.cpp
@@ -500,70 +500,6 @@ inline u32 ppu_record_call(ppu_thread& ppu, u32 new_cia, ppu_opcode_t op, bool i
 	}
 }
 
-extern SAFE_BUFFERS(__m128i) sse_pshufb(__m128i data, __m128i index)
-{
-	v128 m = _mm_and_si128(index, _mm_set1_epi8(0xf));
-	v128 a = data;
-	v128 r;
-
-	for (int i = 0; i < 16; i++)
-	{
-		r._u8[i] = a._u8[m._u8[i]];
-	}
-
-	return _mm_and_si128(r, _mm_cmpgt_epi8(index, _mm_set1_epi8(-1)));
-}
-
-extern __m128i sse_altivec_lvsl(u64 addr)
-{
-	alignas(16) static const u8 lvsl_values[0x10][0x10] =
-	{
-		{ 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00 },
-		{ 0x10, 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01 },
-		{ 0x11, 0x10, 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02 },
-		{ 0x12, 0x11, 0x10, 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03 },
-		{ 0x13, 0x12, 0x11, 0x10, 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04 },
-		{ 0x14, 0x13, 0x12, 0x11, 0x10, 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08, 0x07, 0x06, 0x05 },
-		{ 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08, 0x07, 0x06 },
-		{ 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08, 0x07 },
-		{ 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08 },
-		{ 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09 },
-		{ 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a },
-		{ 0x1a, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0f, 0x0e, 0x0d, 0x0c, 0x0b },
-		{ 0x1b, 0x1a, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0f, 0x0e, 0x0d, 0x0c },
-		{ 0x1c, 0x1b, 0x1a, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0f, 0x0e, 0x0d },
-		{ 0x1d, 0x1c, 0x1b, 0x1a, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0f, 0x0e },
-		{ 0x1e, 0x1d, 0x1c, 0x1b, 0x1a, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0f },
-	};
-
-	return _mm_load_si128(reinterpret_cast<const __m128i*>(+lvsl_values[addr & 0xf]));
-}
-
-extern __m128i sse_altivec_lvsr(u64 addr)
-{
-	alignas(16) static const u8 lvsr_values[0x10][0x10] =
-	{
-		{ 0x1f, 0x1e, 0x1d, 0x1c, 0x1b, 0x1a, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10 },
-		{ 0x1e, 0x1d, 0x1c, 0x1b, 0x1a, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0f },
-		{ 0x1d, 0x1c, 0x1b, 0x1a, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0f, 0x0e },
-		{ 0x1c, 0x1b, 0x1a, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0f, 0x0e, 0x0d },
-		{ 0x1b, 0x1a, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0f, 0x0e, 0x0d, 0x0c },
-		{ 0x1a, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0f, 0x0e, 0x0d, 0x0c, 0x0b },
-		{ 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a },
-		{ 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09 },
-		{ 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08 },
-		{ 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08, 0x07 },
-		{ 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08, 0x07, 0x06 },
-		{ 0x14, 0x13, 0x12, 0x11, 0x10, 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08, 0x07, 0x06, 0x05 },
-		{ 0x13, 0x12, 0x11, 0x10, 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04 },
-		{ 0x12, 0x11, 0x10, 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03 },
-		{ 0x11, 0x10, 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02 },
-		{ 0x10, 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01 },
-	};
-
-	return _mm_load_si128(reinterpret_cast<const __m128i*>(+lvsr_values[addr & 0xf]));
-}
-
 template <typename T>
 struct add_flags_result_t
 {
@@ -3739,16 +3675,40 @@ auto TW()
 	};
 }
 
+const v128 s_lvsl_base = v128::from64r(0x0001020304050607, 0x08090a0b0c0d0e0f);
+
+const v128 s_lvsl_consts[16] =
+{
+	gv_add8(s_lvsl_base, gv_bcst8(0)),
+	gv_add8(s_lvsl_base, gv_bcst8(1)),
+	gv_add8(s_lvsl_base, gv_bcst8(2)),
+	gv_add8(s_lvsl_base, gv_bcst8(3)),
+	gv_add8(s_lvsl_base, gv_bcst8(4)),
+	gv_add8(s_lvsl_base, gv_bcst8(5)),
+	gv_add8(s_lvsl_base, gv_bcst8(6)),
+	gv_add8(s_lvsl_base, gv_bcst8(7)),
+	gv_add8(s_lvsl_base, gv_bcst8(8)),
+	gv_add8(s_lvsl_base, gv_bcst8(9)),
+	gv_add8(s_lvsl_base, gv_bcst8(10)),
+	gv_add8(s_lvsl_base, gv_bcst8(11)),
+	gv_add8(s_lvsl_base, gv_bcst8(12)),
+	gv_add8(s_lvsl_base, gv_bcst8(13)),
+	gv_add8(s_lvsl_base, gv_bcst8(14)),
+	gv_add8(s_lvsl_base, gv_bcst8(15)),
+};
+
 template <u32 Build, ppu_exec_bit... Flags>
 auto LVSL()
 {
 	if constexpr (Build == 0xf1a6)
 		return ppu_exec_select<Flags...>::template select<>();
 
-	static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) {
-		const u64 addr = op.ra ? ppu.gpr[op.ra] + ppu.gpr[op.rb] : ppu.gpr[op.rb];
-		ppu.vr[op.vd] = sse_altivec_lvsl(addr);
+	static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op)
+	{
+		const u64 addr = op.ra ? ppu.gpr[op.ra] + ppu.gpr[op.rb] : ppu.gpr[op.rb];
+		ppu.vr[op.vd] = s_lvsl_consts[addr % 16];
 	};
+
 	RETURN_(ppu, op);
 }
@@ -3982,16 +3942,38 @@ auto CMPL()
 	RETURN_(ppu, op);
 }
 
+const v128 s_lvsr_consts[16] =
+{
+	gv_add8(s_lvsl_base, gv_bcst8(16)),
+	gv_add8(s_lvsl_base, gv_bcst8(15)),
+	gv_add8(s_lvsl_base, gv_bcst8(14)),
+	gv_add8(s_lvsl_base, gv_bcst8(13)),
+	gv_add8(s_lvsl_base, gv_bcst8(12)),
+	gv_add8(s_lvsl_base, gv_bcst8(11)),
+	gv_add8(s_lvsl_base, gv_bcst8(10)),
+	gv_add8(s_lvsl_base, gv_bcst8(9)),
+	gv_add8(s_lvsl_base, gv_bcst8(8)),
+	gv_add8(s_lvsl_base, gv_bcst8(7)),
+	gv_add8(s_lvsl_base, gv_bcst8(6)),
+	gv_add8(s_lvsl_base, gv_bcst8(5)),
+	gv_add8(s_lvsl_base, gv_bcst8(4)),
+	gv_add8(s_lvsl_base, gv_bcst8(3)),
+	gv_add8(s_lvsl_base, gv_bcst8(2)),
+	gv_add8(s_lvsl_base, gv_bcst8(1)),
+};
+
 template <u32 Build, ppu_exec_bit... Flags>
 auto LVSR()
 {
 	if constexpr (Build == 0xf1a6)
 		return ppu_exec_select<Flags...>::template select<>();
 
-	static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) {
-		const u64 addr = op.ra ? ppu.gpr[op.ra] + ppu.gpr[op.rb] : ppu.gpr[op.rb];
-		ppu.vr[op.vd] = sse_altivec_lvsr(addr);
+	static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op)
+	{
+		const u64 addr = op.ra ? ppu.gpr[op.ra] + ppu.gpr[op.rb] : ppu.gpr[op.rb];
+		ppu.vr[op.vd] = s_lvsr_consts[addr % 16];
 	};
+
 	RETURN_(ppu, op);
 }