SPU: rewrite spu_interpreter::SHUFB

Use ASMJIT to generate SSSE3+ code at runtime
Remove static SSSE3 code from spu_interpreter
Nekotekina 2018-05-27 23:37:01 +03:00
parent bebb1bdeda
commit bdf6545571
3 changed files with 167 additions and 193 deletions
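
Editorial note: the mechanism introduced here is a one-time, CPU-feature-based selection of the SHUFB handler at static initialization, replacing the separate compile-time SSSE3 interpreter variants. Below is a minimal, self-contained sketch of that dispatch shape (illustrative names only; the real SSSE3 path is emitted with ASMJIT at runtime rather than compiled in, and utils::has_ssse3() is stood in for by a GCC/Clang builtin):

#include <cstdio>

static int impl_sse2()  { return 2; } // stand-in for the SSE2 fallback (SHUFB_ below)
static int impl_ssse3() { return 3; } // stand-in for the runtime-generated SSSE3 code

using impl_t = int (*)();

// Probe the CPU once and bind the pointer during static initialization, the same
// shape as the `spu_interpreter::SHUFB = !utils::has_ssse3() ? ... : ...` definition
// in this commit. __builtin_cpu_init() keeps the probe valid inside a static initializer.
static const impl_t g_impl = []
{
    __builtin_cpu_init();
    return __builtin_cpu_supports("ssse3") ? &impl_ssse3 : &impl_sse2;
}();

int main()
{
    std::printf("selected implementation: %d\n", g_impl());
}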

View File

@@ -1,6 +1,8 @@
#include "stdafx.h"
#include "Emu/Memory/Memory.h"
#include "Emu/System.h"
#include "Utilities/JIT.h"
#include "Utilities/sysinfo.h"
#include "SPUThread.h"
#include "SPUInterpreter.h"
@@ -8,10 +10,6 @@
#include <cmath>
#include <cfenv>
#if !defined(_MSC_VER) && !defined(__SSSE3__)
#define _mm_shuffle_epi8
#endif
// Compare 16 packed unsigned bytes (greater than)
inline __m128i sse_cmpgt_epu8(__m128i A, __m128i B)
{
@@ -32,6 +30,59 @@ inline __m128i sse_cmpgt_epu32(__m128i A, __m128i B)
return _mm_cmpgt_epi32(_mm_xor_si128(A, sign), _mm_xor_si128(B, sign));
}
namespace asmjit
{
template <uint I, uint N>
static void build_spu_gpr_load(X86Assembler& c, X86Xmm x, const bf_t<u32, I, N>& reg, bool store = false)
{
static_assert(N == 7, "Invalid bitfield");
#ifdef _WIN32
const auto& spu = x86::rcx;
const auto& op = x86::edx;
#else
const auto& spu = x86::rdi;
const auto& op = x86::esi;
#endif
c.mov(x86::eax, op);
if (I >= 4)
{
c.shr(x86::eax, I - 4);
c.and_(x86::eax, 0x7f << 4);
}
else
{
c.and_(x86::eax, 0x7f);
c.shl(x86::eax, I + 4);
}
const auto ptr = x86::oword_ptr(spu, x86::rax, 0, ::offset32(&SPUThread::gpr));
if (utils::has_avx())
{
if (store)
c.vmovdqa(ptr, x);
else
c.vmovdqa(x, ptr);
}
else
{
if (store)
c.movdqa(ptr, x);
else
c.movdqa(x, ptr);
}
}
template <uint I, uint N>
static void build_spu_gpr_store(X86Assembler& c, X86Xmm x, const bf_t<u32, I, N>& reg, bool store = true)
{
build_spu_gpr_load(c, x, reg, store);
}
}
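// Editor's note (illustrative sketch, not part of the commit): the helper above
// forms the address of spu.gpr[(op >> I) & 0x7f]; the asymmetric shift/mask pair
// merely folds the *16 scaling of a 16-byte v128 register into the mask so one
// shift suffices. A scalar restatement of the byte offset it computes (u32 is
// assumed to be the project's 32-bit alias):
static inline u32 spu_gpr_byte_offset(u32 op, u32 bitpos)
{
    return ((op >> bitpos) & 0x7f) * 16; // added to offset32(&SPUThread::gpr)
}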
bool spu_interpreter::UNK(SPUThread& spu, spu_opcode_t op)
{
fmt::throw_exception("Unknown/Illegal instruction (0x%08x)" HERE, op.opcode);
@@ -497,7 +548,7 @@ bool spu_interpreter::LQX(SPUThread& spu, spu_opcode_t op)
return true;
}
bool spu_interpreter_precise::ROTQBYBI(SPUThread& spu, spu_opcode_t op)
bool spu_interpreter::ROTQBYBI(SPUThread& spu, spu_opcode_t op)
{
const auto a = spu.gpr[op.ra].vi;
alignas(32) const __m128i buf[2]{a, a};
@@ -505,13 +556,7 @@ bool spu_interpreter_precise::ROTQBYBI(SPUThread& spu, spu_opcode_t op)
return true;
}
bool spu_interpreter_fast::ROTQBYBI(SPUThread& spu, spu_opcode_t op)
{
spu.gpr[op.rt].vi = _mm_shuffle_epi8(spu.gpr[op.ra].vi, g_spu_imm.rldq_pshufb[spu.gpr[op.rb]._u32[3] >> 3 & 0xf].vi);
return true;
}
bool spu_interpreter_precise::ROTQMBYBI(SPUThread& spu, spu_opcode_t op)
bool spu_interpreter::ROTQMBYBI(SPUThread& spu, spu_opcode_t op)
{
const auto a = spu.gpr[op.ra].vi;
alignas(64) const __m128i buf[3]{a, _mm_setzero_si128(), _mm_setzero_si128()};
@@ -519,13 +564,7 @@ bool spu_interpreter_precise::ROTQMBYBI(SPUThread& spu, spu_opcode_t op)
return true;
}
bool spu_interpreter_fast::ROTQMBYBI(SPUThread& spu, spu_opcode_t op)
{
spu.gpr[op.rt].vi = _mm_shuffle_epi8(spu.gpr[op.ra].vi, g_spu_imm.srdq_pshufb[spu.gpr[op.rb]._s32[3] >> 3 & 0x1f].vi);
return true;
}
bool spu_interpreter_precise::SHLQBYBI(SPUThread& spu, spu_opcode_t op)
bool spu_interpreter::SHLQBYBI(SPUThread& spu, spu_opcode_t op)
{
const auto a = spu.gpr[op.ra].vi;
alignas(64) const __m128i buf[3]{_mm_setzero_si128(), _mm_setzero_si128(), a};
@@ -533,12 +572,6 @@ bool spu_interpreter_precise::SHLQBYBI(SPUThread& spu, spu_opcode_t op)
return true;
}
bool spu_interpreter_fast::SHLQBYBI(SPUThread& spu, spu_opcode_t op)
{
spu.gpr[op.rt].vi = _mm_shuffle_epi8(spu.gpr[op.ra].vi, g_spu_imm.sldq_pshufb[spu.gpr[op.rb]._u32[3] >> 3 & 0x1f].vi);
return true;
}
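// Editor's note (illustrative sketch, not part of the commit): the retained SSE2
// versions of ROTQBYBI/ROTQMBYBI/SHLQBYBI above build an aligned scratch buffer:
// two adjacent copies of the value make any byte rotation a single unaligned
// 16-byte load, and zero padding on one side turns the same load into a byte
// shift. The offset arithmetic sits in the lines hidden by the hunk headers, so
// the direction handling here is assumed rather than copied from the commit:
static inline __m128i rotate_bytes_sse2(__m128i a, u32 n)
{
    alignas(32) const __m128i buf[2]{a, a};
    // A 16-byte read starting n bytes into the doubled buffer returns the value
    // rotated by n byte positions (n masked to [0, 15]).
    return _mm_loadu_si128(reinterpret_cast<const __m128i*>(reinterpret_cast<const u8*>(buf) + (n & 0xf)));
}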
bool spu_interpreter::CBX(SPUThread& spu, spu_opcode_t op)
{
if (op.ra == 1 && (spu.gpr[1]._u32[3] & 0xF))
@@ -615,7 +648,7 @@ bool spu_interpreter::SHLQBI(SPUThread& spu, spu_opcode_t op)
return true;
}
bool spu_interpreter_precise::ROTQBY(SPUThread& spu, spu_opcode_t op)
bool spu_interpreter::ROTQBY(SPUThread& spu, spu_opcode_t op)
{
const auto a = spu.gpr[op.ra].vi;
alignas(32) const __m128i buf[2]{a, a};
@@ -623,13 +656,7 @@ bool spu_interpreter_precise::ROTQBY(SPUThread& spu, spu_opcode_t op)
return true;
}
bool spu_interpreter_fast::ROTQBY(SPUThread& spu, spu_opcode_t op)
{
spu.gpr[op.rt].vi = _mm_shuffle_epi8(spu.gpr[op.ra].vi, g_spu_imm.rldq_pshufb[spu.gpr[op.rb]._u32[3] & 0xf].vi);
return true;
}
bool spu_interpreter_precise::ROTQMBY(SPUThread& spu, spu_opcode_t op)
bool spu_interpreter::ROTQMBY(SPUThread& spu, spu_opcode_t op)
{
const auto a = spu.gpr[op.ra].vi;
alignas(64) const __m128i buf[3]{a, _mm_setzero_si128(), _mm_setzero_si128()};
@@ -637,13 +664,7 @@ bool spu_interpreter_precise::ROTQMBY(SPUThread& spu, spu_opcode_t op)
return true;
}
bool spu_interpreter_fast::ROTQMBY(SPUThread& spu, spu_opcode_t op)
{
spu.gpr[op.rt].vi = _mm_shuffle_epi8(spu.gpr[op.ra].vi, g_spu_imm.srdq_pshufb[spu.gpr[op.rb]._s32[3] & 0x1f].vi);
return true;
}
bool spu_interpreter_precise::SHLQBY(SPUThread& spu, spu_opcode_t op)
bool spu_interpreter::SHLQBY(SPUThread& spu, spu_opcode_t op)
{
const auto a = spu.gpr[op.ra].vi;
alignas(64) const __m128i buf[3]{_mm_setzero_si128(), _mm_setzero_si128(), a};
@@ -651,12 +672,6 @@ bool spu_interpreter_precise::SHLQBY(SPUThread& spu, spu_opcode_t op)
return true;
}
bool spu_interpreter_fast::SHLQBY(SPUThread& spu, spu_opcode_t op)
{
spu.gpr[op.rt].vi = _mm_shuffle_epi8(spu.gpr[op.ra].vi, g_spu_imm.sldq_pshufb[spu.gpr[op.rb]._u32[3] & 0x1f].vi);
return true;
}
bool spu_interpreter::ORX(SPUThread& spu, spu_opcode_t op)
{
spu.gpr[op.rt] = v128::from32r(spu.gpr[op.ra]._u32[0] | spu.gpr[op.ra]._u32[1] | spu.gpr[op.ra]._u32[2] | spu.gpr[op.ra]._u32[3]);
@@ -739,7 +754,7 @@ bool spu_interpreter::SHLQBII(SPUThread& spu, spu_opcode_t op)
return true;
}
bool spu_interpreter_precise::ROTQBYI(SPUThread& spu, spu_opcode_t op)
bool spu_interpreter::ROTQBYI(SPUThread& spu, spu_opcode_t op)
{
const auto a = spu.gpr[op.ra].vi;
alignas(32) const __m128i buf[2]{a, a};
@@ -747,13 +762,7 @@ bool spu_interpreter_precise::ROTQBYI(SPUThread& spu, spu_opcode_t op)
return true;
}
bool spu_interpreter_fast::ROTQBYI(SPUThread& spu, spu_opcode_t op)
{
spu.gpr[op.rt].vi = _mm_shuffle_epi8(spu.gpr[op.ra].vi, g_spu_imm.rldq_pshufb[op.i7 & 0xf].vi);
return true;
}
bool spu_interpreter_precise::ROTQMBYI(SPUThread& spu, spu_opcode_t op)
bool spu_interpreter::ROTQMBYI(SPUThread& spu, spu_opcode_t op)
{
const auto a = spu.gpr[op.ra].vi;
alignas(64) const __m128i buf[3]{a, _mm_setzero_si128(), _mm_setzero_si128()};
@@ -761,13 +770,7 @@ bool spu_interpreter_precise::ROTQMBYI(SPUThread& spu, spu_opcode_t op)
return true;
}
bool spu_interpreter_fast::ROTQMBYI(SPUThread& spu, spu_opcode_t op)
{
spu.gpr[op.rt].vi = _mm_shuffle_epi8(spu.gpr[op.ra].vi, g_spu_imm.srdq_pshufb[op.i7 & 0x1f].vi);
return true;
}
bool spu_interpreter_precise::SHLQBYI(SPUThread& spu, spu_opcode_t op)
bool spu_interpreter::SHLQBYI(SPUThread& spu, spu_opcode_t op)
{
const auto a = spu.gpr[op.ra].vi;
alignas(64) const __m128i buf[3]{_mm_setzero_si128(), _mm_setzero_si128(), a};
@@ -775,12 +778,6 @@ bool spu_interpreter_precise::SHLQBYI(SPUThread& spu, spu_opcode_t op)
return true;
}
bool spu_interpreter_fast::SHLQBYI(SPUThread& spu, spu_opcode_t op)
{
spu.gpr[op.rt].vi = _mm_shuffle_epi8(spu.gpr[op.ra].vi, g_spu_imm.sldq_pshufb[op.i7 & 0x1f].vi);
return true;
}
bool spu_interpreter::NOP(SPUThread& spu, spu_opcode_t op)
{
return true;
@@ -1637,64 +1634,100 @@ bool spu_interpreter::SELB(SPUThread& spu, spu_opcode_t op)
return true;
}
bool spu_interpreter_precise::SHUFB(SPUThread& spu, spu_opcode_t op)
static bool SHUFB_(SPUThread& spu, spu_opcode_t op)
{
alignas(16) static thread_local u8 s_lut[256]
{
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
};
const auto _a = spu.gpr[op.ra].vi;
const auto _b = spu.gpr[op.rb].vi;
_mm_store_si128((__m128i*)(s_lut + 0x00), _a);
_mm_store_si128((__m128i*)(s_lut + 0x10), _b);
_mm_store_si128((__m128i*)(s_lut + 0x20), _a);
_mm_store_si128((__m128i*)(s_lut + 0x30), _b);
_mm_store_si128((__m128i*)(s_lut + 0x40), _a);
_mm_store_si128((__m128i*)(s_lut + 0x50), _b);
_mm_store_si128((__m128i*)(s_lut + 0x60), _a);
_mm_store_si128((__m128i*)(s_lut + 0x70), _b);
v128 mask = v128::fromV(_mm_xor_si128(spu.gpr[op.rc].vi, _mm_set1_epi8(0xf)));
auto& t = spu.gpr[op.rt4];
__m128i ab[2]{spu.gpr[op.rb].vi, spu.gpr[op.ra].vi};
v128 c = spu.gpr[op.rc];
v128 x = v128::fromV(_mm_andnot_si128(c.vi, _mm_set1_epi8(0x1f)));
v128 res;
// Select bytes
for (int i = 0; i < 16; i++)
{
t._u8[i] = s_lut[mask._u8[i]];
res._u8[i] = ((u8*)+ab)[x._u8[i]];
}
// Select special values
const auto xc0 = _mm_set1_epi8(0xc0);
const auto xe0 = _mm_set1_epi8(0xe0);
const auto cmp0 = _mm_cmpgt_epi8(_mm_setzero_si128(), c.vi);
const auto cmp1 = _mm_cmpeq_epi8(_mm_and_si128(c.vi, xc0), xc0);
const auto cmp2 = _mm_cmpeq_epi8(_mm_and_si128(c.vi, xe0), xc0);
spu.gpr[op.rt4].vi = _mm_or_si128(_mm_andnot_si128(cmp0, res.vi), _mm_avg_epu8(cmp1, cmp2));
return true;
}
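// Editor's note (illustrative sketch, not part of the commit): SHUFB_ above is the
// SSE2 fallback for the SHUFB control-byte rules. A scalar restatement of the
// per-byte semantics (logical byte numbering; the vector code additionally
// compensates for the in-memory byte order, hence the andnot/xor index masks):
static inline u8 shufb_byte(const u8* a, const u8* b, u8 c)
{
    if ((c & 0xe0) == 0xc0) return 0xff; // 110xxxxx -> all ones
    if ((c & 0xe0) == 0xe0) return 0x80; // 111xxxxx -> 0x80
    if (c & 0x80)           return 0x00; // 10xxxxxx -> zero
    return (c & 0x10) ? b[c & 0xf] : a[c & 0xf]; // low 5 bits index the a:b concatenation
}
// The branchless version gets the three constants from a single _mm_avg_epu8:
// avg(0xff, 0xff) = 0xff for 110xxxxx, avg(0xff, 0x00) rounds up to 0x80 for
// 111xxxxx, and avg(0, 0) = 0 otherwise, while cmp0 masks the selected byte out.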
bool spu_interpreter_fast::SHUFB(SPUThread& spu, spu_opcode_t op)
const spu_inter_func_t spu_interpreter::SHUFB = !utils::has_ssse3() ? &SHUFB_ : build_function_asm<spu_inter_func_t>([](asmjit::X86Assembler& c, auto& args)
{
const auto index = _mm_xor_si128(spu.gpr[op.rc].vi, _mm_set1_epi32(0x0f0f0f0f));
const auto res1 = _mm_shuffle_epi8(spu.gpr[op.ra].vi, index);
const auto bit4 = _mm_set1_epi32(0x10101010);
const auto k1 = _mm_cmpeq_epi8(_mm_and_si128(index, bit4), bit4);
const auto res2 = _mm_or_si128(_mm_and_si128(k1, _mm_shuffle_epi8(spu.gpr[op.rb].vi, index)), _mm_andnot_si128(k1, res1));
const auto bit67 = _mm_set1_epi32(0xc0c0c0c0);
const auto k2 = _mm_cmpeq_epi8(_mm_and_si128(index, bit67), bit67);
const auto res3 = _mm_or_si128(res2, k2);
const auto bit567 = _mm_set1_epi32(0xe0e0e0e0);
const auto k3 = _mm_cmpeq_epi8(_mm_and_si128(index, bit567), bit567);
spu.gpr[op.rt4].vi = _mm_sub_epi8(res3, _mm_and_si128(k3, _mm_set1_epi32(0x7f7f7f7f)));
return true;
}
using namespace asmjit;
const auto& va = x86::xmm0;
const auto& vb = x86::xmm1;
const auto& vc = x86::xmm2;
const auto& vt = x86::xmm3;
const auto& vm = x86::xmm4;
const auto& v5 = x86::xmm5;
Label xc0 = c.newLabel();
Label xe0 = c.newLabel();
Label x0f = c.newLabel();
build_spu_gpr_load(c, va, decltype(spu_opcode_t::ra)());
build_spu_gpr_load(c, vb, decltype(spu_opcode_t::rb)());
build_spu_gpr_load(c, vc, decltype(spu_opcode_t::rc)());
if (utils::has_avx())
{
c.vpand(v5, vc, x86::oword_ptr(xe0));
c.vpxor(vc, vc, x86::oword_ptr(x0f));
c.vpshufb(va, va, vc);
c.vpslld(vt, vc, 3);
c.vmovdqa(vm, x86::oword_ptr(xc0));
c.vpcmpeqb(v5, v5, vm);
c.vpshufb(vb, vb, vc);
c.vpand(vc, vc, vm);
c.vpblendvb(vb, va, vb, vt);
c.vpcmpeqb(vt, vc, vm);
c.vpavgb(vt, vt, v5);
c.vpor(vt, vt, vb);
}
else
{
c.movdqa(v5, vc);
c.pand(v5, x86::oword_ptr(xe0));
c.movdqa(vt, vc);
c.movdqa(vm, x86::oword_ptr(xc0));
c.pand(vt, vm);
c.pxor(vc, x86::oword_ptr(x0f));
c.pshufb(va, vc);
c.pshufb(vb, vc);
c.pslld(vc, 3);
c.pcmpeqb(v5, vm);
c.pcmpeqb(vt, vm);
c.pcmpeqb(vm, vm);
c.pcmpgtb(vc, vm);
c.pand(va, vc);
c.pandn(vc, vb);
c.por(vc, va);
c.pavgb(vt, v5);
c.por(vt, vc);
}
build_spu_gpr_store(c, vt, decltype(spu_opcode_t::rt4)());
c.mov(x86::eax, 1);
c.ret();
c.align(kAlignData, 16);
c.bind(xc0);
c.dq(0xc0c0c0c0c0c0c0c0);
c.dq(0xc0c0c0c0c0c0c0c0);
c.bind(xe0);
c.dq(0xe0e0e0e0e0e0e0e0);
c.dq(0xe0e0e0e0e0e0e0e0);
c.bind(x0f);
c.dq(0x0f0f0f0f0f0f0f0f);
c.dq(0x0f0f0f0f0f0f0f0f);
});
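// Editor's note (illustrative sketch, not part of the commit): the generated SHUFB
// carries its own constant pool: the three masks are emitted after the ret, aligned
// to 16 bytes, and addressed by label, so the function needs no external data. The
// same pattern reduced to its essentials, using only calls already used above
// ('load_mask_example' is a hypothetical name, not RPCS3 code):
static const auto load_mask_example = build_function_asm<spu_inter_func_t>([](asmjit::X86Assembler& c, auto& args)
{
    using namespace asmjit;
    Label k = c.newLabel();
    c.movdqa(x86::xmm0, x86::oword_ptr(k)); // label-relative load of the embedded constant
    c.mov(x86::eax, 1);
    c.ret();
    c.align(kAlignData, 16);                // movdqa requires 16-byte alignment
    c.bind(k);
    c.dq(0x0f0f0f0f0f0f0f0f);
    c.dq(0x0f0f0f0f0f0f0f0f);
});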
bool spu_interpreter::MPYA(SPUThread& spu, spu_opcode_t op)
{
@@ -2551,3 +2584,7 @@ bool spu_interpreter_precise::FNMS(SPUThread& spu, spu_opcode_t op) { ::FMA(spu,
bool spu_interpreter_precise::FMA(SPUThread& spu, spu_opcode_t op) { ::FMA(spu, op, false, false); return true; }
bool spu_interpreter_precise::FMS(SPUThread& spu, spu_opcode_t op) { ::FMA(spu, op, false, true); return true; }
extern const spu_decoder<spu_interpreter_precise> g_spu_interpreter_precise{};
extern const spu_decoder<spu_interpreter_fast> g_spu_interpreter_fast{};

View File

@@ -66,6 +66,9 @@ struct spu_interpreter
static bool FSMH(SPUThread&, spu_opcode_t);
static bool FSMB(SPUThread&, spu_opcode_t);
static bool LQX(SPUThread&, spu_opcode_t);
static bool ROTQBYBI(SPUThread&, spu_opcode_t);
static bool ROTQMBYBI(SPUThread&, spu_opcode_t);
static bool SHLQBYBI(SPUThread&, spu_opcode_t);
static bool CBX(SPUThread&, spu_opcode_t);
static bool CHX(SPUThread&, spu_opcode_t);
static bool CWX(SPUThread&, spu_opcode_t);
@@ -73,6 +76,9 @@ struct spu_interpreter
static bool ROTQBI(SPUThread&, spu_opcode_t);
static bool ROTQMBI(SPUThread&, spu_opcode_t);
static bool SHLQBI(SPUThread&, spu_opcode_t);
static bool ROTQBY(SPUThread&, spu_opcode_t);
static bool ROTQMBY(SPUThread&, spu_opcode_t);
static bool SHLQBY(SPUThread&, spu_opcode_t);
static bool ORX(SPUThread&, spu_opcode_t);
static bool CBD(SPUThread&, spu_opcode_t);
static bool CHD(SPUThread&, spu_opcode_t);
@@ -81,6 +87,9 @@ struct spu_interpreter
static bool ROTQBII(SPUThread&, spu_opcode_t);
static bool ROTQMBII(SPUThread&, spu_opcode_t);
static bool SHLQBII(SPUThread&, spu_opcode_t);
static bool ROTQBYI(SPUThread&, spu_opcode_t);
static bool ROTQMBYI(SPUThread&, spu_opcode_t);
static bool SHLQBYI(SPUThread&, spu_opcode_t);
static bool NOP(SPUThread&, spu_opcode_t);
static bool CGT(SPUThread&, spu_opcode_t);
static bool XOR(SPUThread&, spu_opcode_t);
@@ -166,6 +175,7 @@ struct spu_interpreter
static bool HBRR(SPUThread&, spu_opcode_t);
static bool ILA(SPUThread&, spu_opcode_t);
static bool SELB(SPUThread&, spu_opcode_t);
static const spu_inter_func_t SHUFB;
static bool MPYA(SPUThread&, spu_opcode_t);
static bool DFCGT(SPUThread&, spu_opcode_t);
static bool DFCMGT(SPUThread&, spu_opcode_t);
@@ -176,17 +186,6 @@ struct spu_interpreter
struct spu_interpreter_fast final : spu_interpreter
{
static bool ROTQBYBI(SPUThread&, spu_opcode_t);
static bool ROTQMBYBI(SPUThread&, spu_opcode_t);
static bool SHLQBYBI(SPUThread&, spu_opcode_t);
static bool ROTQBY(SPUThread&, spu_opcode_t);
static bool ROTQMBY(SPUThread&, spu_opcode_t);
static bool SHLQBY(SPUThread&, spu_opcode_t);
static bool ROTQBYI(SPUThread&, spu_opcode_t);
static bool ROTQMBYI(SPUThread&, spu_opcode_t);
static bool SHLQBYI(SPUThread&, spu_opcode_t);
static bool SHUFB(SPUThread&, spu_opcode_t);
static bool FREST(SPUThread&, spu_opcode_t);
static bool FRSQEST(SPUThread&, spu_opcode_t);
static bool FCGT(SPUThread&, spu_opcode_t);
@@ -219,17 +218,6 @@ struct spu_interpreter_fast final : spu_interpreter
struct spu_interpreter_precise final : spu_interpreter
{
static bool ROTQBYBI(SPUThread&, spu_opcode_t);
static bool ROTQMBYBI(SPUThread&, spu_opcode_t);
static bool SHLQBYBI(SPUThread&, spu_opcode_t);
static bool ROTQBY(SPUThread&, spu_opcode_t);
static bool ROTQMBY(SPUThread&, spu_opcode_t);
static bool SHLQBY(SPUThread&, spu_opcode_t);
static bool ROTQBYI(SPUThread&, spu_opcode_t);
static bool ROTQMBYI(SPUThread&, spu_opcode_t);
static bool SHLQBYI(SPUThread&, spu_opcode_t);
static bool SHUFB(SPUThread&, spu_opcode_t);
static bool FREST(SPUThread&, spu_opcode_t);
static bool FRSQEST(SPUThread&, spu_opcode_t);
static bool FCGT(SPUThread&, spu_opcode_t);

View File

@@ -45,63 +45,12 @@ bool operator ==(const u128& lhs, const u128& rhs)
extern u64 get_timebased_time();
extern u64 get_system_time();
extern const spu_decoder<spu_interpreter_precise> g_spu_interpreter_precise;
extern const spu_decoder<spu_interpreter_fast> g_spu_interpreter_fast;
extern thread_local u64 g_tls_fault_spu;
// Table of identical interpreter functions when precise contains SSE2 version, and fast contains SSSE3 functions
const std::pair<spu_inter_func_t, spu_inter_func_t> s_spu_dispatch_table[]
{
#define FUNC(x) {&spu_interpreter_precise::x, &spu_interpreter_fast::x}
FUNC(ROTQBYBI),
FUNC(ROTQMBYBI),
FUNC(SHLQBYBI),
FUNC(ROTQBY),
FUNC(ROTQMBY),
FUNC(SHLQBY),
FUNC(ROTQBYI),
FUNC(ROTQMBYI),
FUNC(SHLQBYI),
FUNC(SHUFB),
#undef FUNC
};
extern const spu_decoder<spu_interpreter_precise> g_spu_interpreter_precise([](auto& table)
{
if (s_use_ssse3)
{
for (auto& func : table)
{
for (const auto& pair : s_spu_dispatch_table)
{
if (pair.first == func)
{
func = pair.second;
break;
}
}
}
}
});
extern const spu_decoder<spu_interpreter_fast> g_spu_interpreter_fast([](auto& table)
{
if (!s_use_ssse3)
{
for (auto& func : table)
{
for (const auto& pair : s_spu_dispatch_table)
{
if (pair.second == func)
{
func = pair.first;
break;
}
}
}
}
});
std::atomic<u64> g_num_spu_threads{0ull};
template <>
void fmt_class_string<spu_decoder_type>::format(std::string& out, u64 arg)
{