SPU: rewrite spu_interpreter::SHUFB

Use ASMJIT to generate SSSE3+ code at runtime
Remove static SSSE3 code from spu_interpreter
Nekotekina 2018-05-27 23:37:01 +03:00
parent bebb1bdeda
commit bdf6545571
3 changed files with 167 additions and 193 deletions
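
Editorial note: the mechanism introduced here is a one-time, CPU-feature-based selection of the SHUFB handler at static initialization, replacing the separate compile-time SSSE3 interpreter variants. Below is a minimal, self-contained sketch of that dispatch shape (illustrative names only; the real SSSE3 path is emitted with ASMJIT at runtime rather than compiled in, and utils::has_ssse3() is stood in for by a GCC/Clang builtin):

#include <cstdio>

static int impl_sse2()  { return 2; } // stand-in for the SSE2 fallback (SHUFB_ below)
static int impl_ssse3() { return 3; } // stand-in for the runtime-generated SSSE3 code

using impl_t = int (*)();

// Probe the CPU once and bind the pointer during static initialization, the same
// shape as the `spu_interpreter::SHUFB = !utils::has_ssse3() ? ... : ...` definition
// in this commit. __builtin_cpu_init() keeps the probe valid inside a static initializer.
static const impl_t g_impl = []
{
    __builtin_cpu_init();
    return __builtin_cpu_supports("ssse3") ? &impl_ssse3 : &impl_sse2;
}();

int main()
{
    std::printf("selected implementation: %d\n", g_impl());
}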

View File

@@ -1,6 +1,8 @@
#include "stdafx.h"
#include "Emu/Memory/Memory.h"
#include "Emu/System.h"
#include "Utilities/JIT.h"
#include "Utilities/sysinfo.h"
#include "SPUThread.h"
#include "SPUInterpreter.h"
@@ -8,10 +10,6 @@
#include <cmath>
#include <cfenv>
#if !defined(_MSC_VER) && !defined(__SSSE3__)
#define _mm_shuffle_epi8
#endif
// Compare 16 packed unsigned bytes (greater than)
inline __m128i sse_cmpgt_epu8(__m128i A, __m128i B)
{
@@ -32,6 +30,59 @@ inline __m128i sse_cmpgt_epu32(__m128i A, __m128i B)
return _mm_cmpgt_epi32(_mm_xor_si128(A, sign), _mm_xor_si128(B, sign));
}
namespace asmjit
{
template <uint I, uint N>
static void build_spu_gpr_load(X86Assembler& c, X86Xmm x, const bf_t<u32, I, N>& reg, bool store = false)
{
static_assert(N == 7, "Invalid bitfield");
#ifdef _WIN32
const auto& spu = x86::rcx;
const auto& op = x86::edx;
#else
const auto& spu = x86::rdi;
const auto& op = x86::esi;
#endif
c.mov(x86::eax, op);
if (I >= 4)
{
c.shr(x86::eax, I - 4);
c.and_(x86::eax, 0x7f << 4);
}
else
{
c.and_(x86::eax, 0x7f);
c.shl(x86::eax, I + 4);
}
const auto ptr = x86::oword_ptr(spu, x86::rax, 0, ::offset32(&SPUThread::gpr));
if (utils::has_avx())
{
if (store)
c.vmovdqa(ptr, x);
else
c.vmovdqa(x, ptr);
}
else
{
if (store)
c.movdqa(ptr, x);
else
c.movdqa(x, ptr);
}
}
template <uint I, uint N>
static void build_spu_gpr_store(X86Assembler& c, X86Xmm x, const bf_t<u32, I, N>& reg, bool store = true)
{
build_spu_gpr_load(c, x, reg, store);
}
}
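// Editor's note (illustrative sketch, not part of the commit): the helper above
// forms the address of spu.gpr[(op >> I) & 0x7f]; the asymmetric shift/mask pair
// merely folds the *16 scaling of a 16-byte v128 register into the mask so one
// shift suffices. A scalar restatement of the byte offset it computes (u32 is
// assumed to be the project's 32-bit alias):
static inline u32 spu_gpr_byte_offset(u32 op, u32 bitpos)
{
    return ((op >> bitpos) & 0x7f) * 16; // added to offset32(&SPUThread::gpr)
}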
bool spu_interpreter::UNK(SPUThread& spu, spu_opcode_t op)
{
fmt::throw_exception("Unknown/Illegal instruction (0x%08x)" HERE, op.opcode);
@@ -497,7 +548,7 @@ bool spu_interpreter::LQX(SPUThread& spu, spu_opcode_t op)
return true;
}
bool spu_interpreter_precise::ROTQBYBI(SPUThread& spu, spu_opcode_t op)
bool spu_interpreter::ROTQBYBI(SPUThread& spu, spu_opcode_t op)
{
const auto a = spu.gpr[op.ra].vi;
alignas(32) const __m128i buf[2]{a, a};
@@ -505,13 +556,7 @@ bool spu_interpreter_precise::ROTQBYBI(SPUThread& spu, spu_opcode_t op)
return true;
}
bool spu_interpreter_fast::ROTQBYBI(SPUThread& spu, spu_opcode_t op)
{
spu.gpr[op.rt].vi = _mm_shuffle_epi8(spu.gpr[op.ra].vi, g_spu_imm.rldq_pshufb[spu.gpr[op.rb]._u32[3] >> 3 & 0xf].vi);
return true;
}
bool spu_interpreter_precise::ROTQMBYBI(SPUThread& spu, spu_opcode_t op)
bool spu_interpreter::ROTQMBYBI(SPUThread& spu, spu_opcode_t op)
{
const auto a = spu.gpr[op.ra].vi;
alignas(64) const __m128i buf[3]{a, _mm_setzero_si128(), _mm_setzero_si128()};
@@ -519,13 +564,7 @@ bool spu_interpreter_precise::ROTQMBYBI(SPUThread& spu, spu_opcode_t op)
return true;
}
bool spu_interpreter_fast::ROTQMBYBI(SPUThread& spu, spu_opcode_t op)
{
spu.gpr[op.rt].vi = _mm_shuffle_epi8(spu.gpr[op.ra].vi, g_spu_imm.srdq_pshufb[spu.gpr[op.rb]._s32[3] >> 3 & 0x1f].vi);
return true;
}
bool spu_interpreter_precise::SHLQBYBI(SPUThread& spu, spu_opcode_t op)
bool spu_interpreter::SHLQBYBI(SPUThread& spu, spu_opcode_t op)
{
const auto a = spu.gpr[op.ra].vi;
alignas(64) const __m128i buf[3]{_mm_setzero_si128(), _mm_setzero_si128(), a};
@@ -533,12 +572,6 @@ bool spu_interpreter_precise::SHLQBYBI(SPUThread& spu, spu_opcode_t op)
return true;
}
bool spu_interpreter_fast::SHLQBYBI(SPUThread& spu, spu_opcode_t op)
{
spu.gpr[op.rt].vi = _mm_shuffle_epi8(spu.gpr[op.ra].vi, g_spu_imm.sldq_pshufb[spu.gpr[op.rb]._u32[3] >> 3 & 0x1f].vi);
return true;
}
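// Editor's note (illustrative sketch, not part of the commit): the retained SSE2
// versions of ROTQBYBI/ROTQMBYBI/SHLQBYBI above build an aligned scratch buffer:
// two adjacent copies of the value make any byte rotation a single unaligned
// 16-byte load, and zero padding on one side turns the same load into a byte
// shift. The offset arithmetic sits in the lines hidden by the hunk headers, so
// the direction handling here is assumed rather than copied from the commit:
static inline __m128i rotate_bytes_sse2(__m128i a, u32 n)
{
    alignas(32) const __m128i buf[2]{a, a};
    // A 16-byte read starting n bytes into the doubled buffer returns the value
    // rotated by n byte positions (n masked to [0, 15]).
    return _mm_loadu_si128(reinterpret_cast<const __m128i*>(reinterpret_cast<const u8*>(buf) + (n & 0xf)));
}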
bool spu_interpreter::CBX(SPUThread& spu, spu_opcode_t op)
{
if (op.ra == 1 && (spu.gpr[1]._u32[3] & 0xF))
@@ -615,7 +648,7 @@ bool spu_interpreter::SHLQBI(SPUThread& spu, spu_opcode_t op)
return true;
}
bool spu_interpreter_precise::ROTQBY(SPUThread& spu, spu_opcode_t op)
bool spu_interpreter::ROTQBY(SPUThread& spu, spu_opcode_t op)
{
const auto a = spu.gpr[op.ra].vi;
alignas(32) const __m128i buf[2]{a, a};
@@ -623,13 +656,7 @@ bool spu_interpreter_precise::ROTQBY(SPUThread& spu, spu_opcode_t op)
return true;
}
bool spu_interpreter_fast::ROTQBY(SPUThread& spu, spu_opcode_t op)
{
spu.gpr[op.rt].vi = _mm_shuffle_epi8(spu.gpr[op.ra].vi, g_spu_imm.rldq_pshufb[spu.gpr[op.rb]._u32[3] & 0xf].vi);
return true;
}
bool spu_interpreter_precise::ROTQMBY(SPUThread& spu, spu_opcode_t op)
bool spu_interpreter::ROTQMBY(SPUThread& spu, spu_opcode_t op)
{
const auto a = spu.gpr[op.ra].vi;
alignas(64) const __m128i buf[3]{a, _mm_setzero_si128(), _mm_setzero_si128()};
@@ -637,13 +664,7 @@ bool spu_interpreter_precise::ROTQMBY(SPUThread& spu, spu_opcode_t op)
return true;
}
bool spu_interpreter_fast::ROTQMBY(SPUThread& spu, spu_opcode_t op)
{
spu.gpr[op.rt].vi = _mm_shuffle_epi8(spu.gpr[op.ra].vi, g_spu_imm.srdq_pshufb[spu.gpr[op.rb]._s32[3] & 0x1f].vi);
return true;
}
bool spu_interpreter_precise::SHLQBY(SPUThread& spu, spu_opcode_t op)
bool spu_interpreter::SHLQBY(SPUThread& spu, spu_opcode_t op)
{
const auto a = spu.gpr[op.ra].vi;
alignas(64) const __m128i buf[3]{_mm_setzero_si128(), _mm_setzero_si128(), a};
@@ -651,12 +672,6 @@ bool spu_interpreter_precise::SHLQBY(SPUThread& spu, spu_opcode_t op)
return true;
}
bool spu_interpreter_fast::SHLQBY(SPUThread& spu, spu_opcode_t op)
{
spu.gpr[op.rt].vi = _mm_shuffle_epi8(spu.gpr[op.ra].vi, g_spu_imm.sldq_pshufb[spu.gpr[op.rb]._u32[3] & 0x1f].vi);
return true;
}
bool spu_interpreter::ORX(SPUThread& spu, spu_opcode_t op)
{
spu.gpr[op.rt] = v128::from32r(spu.gpr[op.ra]._u32[0] | spu.gpr[op.ra]._u32[1] | spu.gpr[op.ra]._u32[2] | spu.gpr[op.ra]._u32[3]);
@@ -739,7 +754,7 @@ bool spu_interpreter::SHLQBII(SPUThread& spu, spu_opcode_t op)
return true;
}
bool spu_interpreter_precise::ROTQBYI(SPUThread& spu, spu_opcode_t op)
bool spu_interpreter::ROTQBYI(SPUThread& spu, spu_opcode_t op)
{
const auto a = spu.gpr[op.ra].vi;
alignas(32) const __m128i buf[2]{a, a};
@@ -747,13 +762,7 @@ bool spu_interpreter_precise::ROTQBYI(SPUThread& spu, spu_opcode_t op)
return true;
}
bool spu_interpreter_fast::ROTQBYI(SPUThread& spu, spu_opcode_t op)
{
spu.gpr[op.rt].vi = _mm_shuffle_epi8(spu.gpr[op.ra].vi, g_spu_imm.rldq_pshufb[op.i7 & 0xf].vi);
return true;
}
bool spu_interpreter_precise::ROTQMBYI(SPUThread& spu, spu_opcode_t op)
bool spu_interpreter::ROTQMBYI(SPUThread& spu, spu_opcode_t op)
{
const auto a = spu.gpr[op.ra].vi;
alignas(64) const __m128i buf[3]{a, _mm_setzero_si128(), _mm_setzero_si128()};
@@ -761,13 +770,7 @@ bool spu_interpreter_precise::ROTQMBYI(SPUThread& spu, spu_opcode_t op)
return true;
}
bool spu_interpreter_fast::ROTQMBYI(SPUThread& spu, spu_opcode_t op)
{
spu.gpr[op.rt].vi = _mm_shuffle_epi8(spu.gpr[op.ra].vi, g_spu_imm.srdq_pshufb[op.i7 & 0x1f].vi);
return true;
}
bool spu_interpreter_precise::SHLQBYI(SPUThread& spu, spu_opcode_t op)
bool spu_interpreter::SHLQBYI(SPUThread& spu, spu_opcode_t op)
{
const auto a = spu.gpr[op.ra].vi;
alignas(64) const __m128i buf[3]{_mm_setzero_si128(), _mm_setzero_si128(), a};
@@ -775,12 +778,6 @@ bool spu_interpreter_precise::SHLQBYI(SPUThread& spu, spu_opcode_t op)
return true;
}
bool spu_interpreter_fast::SHLQBYI(SPUThread& spu, spu_opcode_t op)
{
spu.gpr[op.rt].vi = _mm_shuffle_epi8(spu.gpr[op.ra].vi, g_spu_imm.sldq_pshufb[op.i7 & 0x1f].vi);
return true;
}
bool spu_interpreter::NOP(SPUThread& spu, spu_opcode_t op)
{
return true;
@@ -1637,64 +1634,100 @@ bool spu_interpreter::SELB(SPUThread& spu, spu_opcode_t op)
return true;
}
bool spu_interpreter_precise::SHUFB(SPUThread& spu, spu_opcode_t op)
static bool SHUFB_(SPUThread& spu, spu_opcode_t op)
{
alignas(16) static thread_local u8 s_lut[256]
{
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
};
const auto _a = spu.gpr[op.ra].vi;
const auto _b = spu.gpr[op.rb].vi;
_mm_store_si128((__m128i*)(s_lut + 0x00), _a);
_mm_store_si128((__m128i*)(s_lut + 0x10), _b);
_mm_store_si128((__m128i*)(s_lut + 0x20), _a);
_mm_store_si128((__m128i*)(s_lut + 0x30), _b);
_mm_store_si128((__m128i*)(s_lut + 0x40), _a);
_mm_store_si128((__m128i*)(s_lut + 0x50), _b);
_mm_store_si128((__m128i*)(s_lut + 0x60), _a);
_mm_store_si128((__m128i*)(s_lut + 0x70), _b);
v128 mask = v128::fromV(_mm_xor_si128(spu.gpr[op.rc].vi, _mm_set1_epi8(0xf)));
auto& t = spu.gpr[op.rt4];
__m128i ab[2]{spu.gpr[op.rb].vi, spu.gpr[op.ra].vi};
v128 c = spu.gpr[op.rc];
v128 x = v128::fromV(_mm_andnot_si128(c.vi, _mm_set1_epi8(0x1f)));
v128 res;
// Select bytes
for (int i = 0; i < 16; i++)
{
t._u8[i] = s_lut[mask._u8[i]];
res._u8[i] = ((u8*)+ab)[x._u8[i]];
}
// Select special values
const auto xc0 = _mm_set1_epi8(0xc0);
const auto xe0 = _mm_set1_epi8(0xe0);
const auto cmp0 = _mm_cmpgt_epi8(_mm_setzero_si128(), c.vi);
const auto cmp1 = _mm_cmpeq_epi8(_mm_and_si128(c.vi, xc0), xc0);
const auto cmp2 = _mm_cmpeq_epi8(_mm_and_si128(c.vi, xe0), xc0);
spu.gpr[op.rt4].vi = _mm_or_si128(_mm_andnot_si128(cmp0, res.vi), _mm_avg_epu8(cmp1, cmp2));
return true;
}
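// Editor's note (illustrative sketch, not part of the commit): SHUFB_ above is the
// SSE2 fallback for the SHUFB control-byte rules. A scalar restatement of the
// per-byte semantics (logical byte numbering; the vector code additionally
// compensates for the in-memory byte order, hence the andnot/xor index masks):
static inline u8 shufb_byte(const u8* a, const u8* b, u8 c)
{
    if ((c & 0xe0) == 0xc0) return 0xff; // 110xxxxx -> all ones
    if ((c & 0xe0) == 0xe0) return 0x80; // 111xxxxx -> 0x80
    if (c & 0x80)           return 0x00; // 10xxxxxx -> zero
    return (c & 0x10) ? b[c & 0xf] : a[c & 0xf]; // low 5 bits index the a:b concatenation
}
// The branchless version gets the three constants from a single _mm_avg_epu8:
// avg(0xff, 0xff) = 0xff for 110xxxxx, avg(0xff, 0x00) rounds up to 0x80 for
// 111xxxxx, and avg(0, 0) = 0 otherwise, while cmp0 masks the selected byte out.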
bool spu_interpreter_fast::SHUFB(SPUThread& spu, spu_opcode_t op)
const spu_inter_func_t spu_interpreter::SHUFB = !utils::has_ssse3() ? &SHUFB_ : build_function_asm<spu_inter_func_t>([](asmjit::X86Assembler& c, auto& args)
{
const auto index = _mm_xor_si128(spu.gpr[op.rc].vi, _mm_set1_epi32(0x0f0f0f0f));
const auto res1 = _mm_shuffle_epi8(spu.gpr[op.ra].vi, index);
const auto bit4 = _mm_set1_epi32(0x10101010);
const auto k1 = _mm_cmpeq_epi8(_mm_and_si128(index, bit4), bit4);
const auto res2 = _mm_or_si128(_mm_and_si128(k1, _mm_shuffle_epi8(spu.gpr[op.rb].vi, index)), _mm_andnot_si128(k1, res1));
const auto bit67 = _mm_set1_epi32(0xc0c0c0c0);
const auto k2 = _mm_cmpeq_epi8(_mm_and_si128(index, bit67), bit67);
const auto res3 = _mm_or_si128(res2, k2);
const auto bit567 = _mm_set1_epi32(0xe0e0e0e0);
const auto k3 = _mm_cmpeq_epi8(_mm_and_si128(index, bit567), bit567);
spu.gpr[op.rt4].vi = _mm_sub_epi8(res3, _mm_and_si128(k3, _mm_set1_epi32(0x7f7f7f7f)));
return true;
}
using namespace asmjit;
const auto& va = x86::xmm0;
const auto& vb = x86::xmm1;
const auto& vc = x86::xmm2;
const auto& vt = x86::xmm3;
const auto& vm = x86::xmm4;
const auto& v5 = x86::xmm5;
Label xc0 = c.newLabel();
Label xe0 = c.newLabel();
Label x0f = c.newLabel();
build_spu_gpr_load(c, va, decltype(spu_opcode_t::ra)());
build_spu_gpr_load(c, vb, decltype(spu_opcode_t::rb)());
build_spu_gpr_load(c, vc, decltype(spu_opcode_t::rc)());
if (utils::has_avx())
{
c.vpand(v5, vc, x86::oword_ptr(xe0));
c.vpxor(vc, vc, x86::oword_ptr(x0f));
c.vpshufb(va, va, vc);
c.vpslld(vt, vc, 3);
c.vmovdqa(vm, x86::oword_ptr(xc0));
c.vpcmpeqb(v5, v5, vm);
c.vpshufb(vb, vb, vc);
c.vpand(vc, vc, vm);
c.vpblendvb(vb, va, vb, vt);
c.vpcmpeqb(vt, vc, vm);
c.vpavgb(vt, vt, v5);
c.vpor(vt, vt, vb);
}
else
{
c.movdqa(v5, vc);
c.pand(v5, x86::oword_ptr(xe0));
c.movdqa(vt, vc);
c.movdqa(vm, x86::oword_ptr(xc0));
c.pand(vt, vm);
c.pxor(vc, x86::oword_ptr(x0f));
c.pshufb(va, vc);
c.pshufb(vb, vc);
c.pslld(vc, 3);
c.pcmpeqb(v5, vm);
c.pcmpeqb(vt, vm);
c.pcmpeqb(vm, vm);
c.pcmpgtb(vc, vm);
c.pand(va, vc);
c.pandn(vc, vb);
c.por(vc, va);
c.pavgb(vt, v5);
c.por(vt, vc);
}
build_spu_gpr_store(c, vt, decltype(spu_opcode_t::rt4)());
c.mov(x86::eax, 1);
c.ret();
c.align(kAlignData, 16);
c.bind(xc0);
c.dq(0xc0c0c0c0c0c0c0c0);
c.dq(0xc0c0c0c0c0c0c0c0);
c.bind(xe0);
c.dq(0xe0e0e0e0e0e0e0e0);
c.dq(0xe0e0e0e0e0e0e0e0);
c.bind(x0f);
c.dq(0x0f0f0f0f0f0f0f0f);
c.dq(0x0f0f0f0f0f0f0f0f);
});
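// Editor's note (illustrative sketch, not part of the commit): the generated SHUFB
// carries its own constant pool: the three masks are emitted after the ret, aligned
// to 16 bytes, and addressed by label, so the function needs no external data. The
// same pattern reduced to its essentials, using only calls already used above
// ('load_mask_example' is a hypothetical name, not RPCS3 code):
static const auto load_mask_example = build_function_asm<spu_inter_func_t>([](asmjit::X86Assembler& c, auto& args)
{
    using namespace asmjit;
    Label k = c.newLabel();
    c.movdqa(x86::xmm0, x86::oword_ptr(k)); // label-relative load of the embedded constant
    c.mov(x86::eax, 1);
    c.ret();
    c.align(kAlignData, 16);                // movdqa requires 16-byte alignment
    c.bind(k);
    c.dq(0x0f0f0f0f0f0f0f0f);
    c.dq(0x0f0f0f0f0f0f0f0f);
});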
bool spu_interpreter::MPYA(SPUThread& spu, spu_opcode_t op)
{
@@ -2551,3 +2584,7 @@ bool spu_interpreter_precise::FNMS(SPUThread& spu, spu_opcode_t op) { ::FMA(spu,
bool spu_interpreter_precise::FMA(SPUThread& spu, spu_opcode_t op) { ::FMA(spu, op, false, false); return true; }
bool spu_interpreter_precise::FMS(SPUThread& spu, spu_opcode_t op) { ::FMA(spu, op, false, true); return true; }
extern const spu_decoder<spu_interpreter_precise> g_spu_interpreter_precise{};
extern const spu_decoder<spu_interpreter_fast> g_spu_interpreter_fast{};

View File

@@ -66,6 +66,9 @@ struct spu_interpreter
static bool FSMH(SPUThread&, spu_opcode_t);
static bool FSMB(SPUThread&, spu_opcode_t);
static bool LQX(SPUThread&, spu_opcode_t);
static bool ROTQBYBI(SPUThread&, spu_opcode_t);
static bool ROTQMBYBI(SPUThread&, spu_opcode_t);
static bool SHLQBYBI(SPUThread&, spu_opcode_t);
static bool CBX(SPUThread&, spu_opcode_t);
static bool CHX(SPUThread&, spu_opcode_t);
static bool CWX(SPUThread&, spu_opcode_t);
@@ -73,6 +76,9 @@ struct spu_interpreter
static bool ROTQBI(SPUThread&, spu_opcode_t);
static bool ROTQMBI(SPUThread&, spu_opcode_t);
static bool SHLQBI(SPUThread&, spu_opcode_t);
static bool ROTQBY(SPUThread&, spu_opcode_t);
static bool ROTQMBY(SPUThread&, spu_opcode_t);
static bool SHLQBY(SPUThread&, spu_opcode_t);
static bool ORX(SPUThread&, spu_opcode_t);
static bool CBD(SPUThread&, spu_opcode_t);
static bool CHD(SPUThread&, spu_opcode_t);
@@ -81,6 +87,9 @@ struct spu_interpreter
static bool ROTQBII(SPUThread&, spu_opcode_t);
static bool ROTQMBII(SPUThread&, spu_opcode_t);
static bool SHLQBII(SPUThread&, spu_opcode_t);
static bool ROTQBYI(SPUThread&, spu_opcode_t);
static bool ROTQMBYI(SPUThread&, spu_opcode_t);
static bool SHLQBYI(SPUThread&, spu_opcode_t);
static bool NOP(SPUThread&, spu_opcode_t);
static bool CGT(SPUThread&, spu_opcode_t);
static bool XOR(SPUThread&, spu_opcode_t);
@@ -166,6 +175,7 @@ struct spu_interpreter
static bool HBRR(SPUThread&, spu_opcode_t);
static bool ILA(SPUThread&, spu_opcode_t);
static bool SELB(SPUThread&, spu_opcode_t);
static const spu_inter_func_t SHUFB;
static bool MPYA(SPUThread&, spu_opcode_t);
static bool DFCGT(SPUThread&, spu_opcode_t);
static bool DFCMGT(SPUThread&, spu_opcode_t);
@@ -176,17 +186,6 @@ struct spu_interpreter
struct spu_interpreter_fast final : spu_interpreter
{
static bool ROTQBYBI(SPUThread&, spu_opcode_t);
static bool ROTQMBYBI(SPUThread&, spu_opcode_t);
static bool SHLQBYBI(SPUThread&, spu_opcode_t);
static bool ROTQBY(SPUThread&, spu_opcode_t);
static bool ROTQMBY(SPUThread&, spu_opcode_t);
static bool SHLQBY(SPUThread&, spu_opcode_t);
static bool ROTQBYI(SPUThread&, spu_opcode_t);
static bool ROTQMBYI(SPUThread&, spu_opcode_t);
static bool SHLQBYI(SPUThread&, spu_opcode_t);
static bool SHUFB(SPUThread&, spu_opcode_t);
static bool FREST(SPUThread&, spu_opcode_t);
static bool FRSQEST(SPUThread&, spu_opcode_t);
static bool FCGT(SPUThread&, spu_opcode_t);
@@ -219,17 +218,6 @@ struct spu_interpreter_fast final : spu_interpreter
struct spu_interpreter_precise final : spu_interpreter
{
static bool ROTQBYBI(SPUThread&, spu_opcode_t);
static bool ROTQMBYBI(SPUThread&, spu_opcode_t);
static bool SHLQBYBI(SPUThread&, spu_opcode_t);
static bool ROTQBY(SPUThread&, spu_opcode_t);
static bool ROTQMBY(SPUThread&, spu_opcode_t);
static bool SHLQBY(SPUThread&, spu_opcode_t);
static bool ROTQBYI(SPUThread&, spu_opcode_t);
static bool ROTQMBYI(SPUThread&, spu_opcode_t);
static bool SHLQBYI(SPUThread&, spu_opcode_t);
static bool SHUFB(SPUThread&, spu_opcode_t);
static bool FREST(SPUThread&, spu_opcode_t);
static bool FRSQEST(SPUThread&, spu_opcode_t);
static bool FCGT(SPUThread&, spu_opcode_t);

View File

@@ -45,63 +45,12 @@ bool operator ==(const u128& lhs, const u128& rhs)
extern u64 get_timebased_time();
extern u64 get_system_time();
extern const spu_decoder<spu_interpreter_precise> g_spu_interpreter_precise;
extern const spu_decoder<spu_interpreter_fast> g_spu_interpreter_fast;
extern thread_local u64 g_tls_fault_spu;
// Table of identical interpreter functions when precise contains SSE2 version, and fast contains SSSE3 functions
const std::pair<spu_inter_func_t, spu_inter_func_t> s_spu_dispatch_table[]
{
#define FUNC(x) {&spu_interpreter_precise::x, &spu_interpreter_fast::x}
FUNC(ROTQBYBI),
FUNC(ROTQMBYBI),
FUNC(SHLQBYBI),
FUNC(ROTQBY),
FUNC(ROTQMBY),
FUNC(SHLQBY),
FUNC(ROTQBYI),
FUNC(ROTQMBYI),
FUNC(SHLQBYI),
FUNC(SHUFB),
#undef FUNC
};
extern const spu_decoder<spu_interpreter_precise> g_spu_interpreter_precise([](auto& table)
{
if (s_use_ssse3)
{
for (auto& func : table)
{
for (const auto& pair : s_spu_dispatch_table)
{
if (pair.first == func)
{
func = pair.second;
break;
}
}
}
}
});
extern const spu_decoder<spu_interpreter_fast> g_spu_interpreter_fast([](auto& table)
{
if (!s_use_ssse3)
{
for (auto& func : table)
{
for (const auto& pair : s_spu_dispatch_table)
{
if (pair.second == func)
{
func = pair.first;
break;
}
}
}
}
});
std::atomic<u64> g_num_spu_threads{0ull};
template <>
void fmt_class_string<spu_decoder_type>::format(std::string& out, u64 arg)
{