diff --git a/rpcs3/Emu/Cell/PPUInterpreter.h b/rpcs3/Emu/Cell/PPUInterpreter.h
index 6613a70109..c049a7e95a 100644
--- a/rpcs3/Emu/Cell/PPUInterpreter.h
+++ b/rpcs3/Emu/Cell/PPUInterpreter.h
@@ -181,7 +181,7 @@ private:
 		CPU.VSCR.VSCR = CPU.VPR[vb]._u32[0];
 		CPU.VSCR.X = CPU.VSCR.Y = 0;
 	}
-	void VADDCUW(u32 vd, u32 va, u32 vb)
+	void VADDCUW(u32 vd, u32 va, u32 vb) //nf
 	{
 		for (uint w = 0; w < 4; w++)
 		{
@@ -195,7 +195,7 @@ private:
 			CPU.VPR[vd]._f[w] = CPU.VPR[va]._f[w] + CPU.VPR[vb]._f[w];
 		}
 	}
-	void VADDSBS(u32 vd, u32 va, u32 vb)
+	void VADDSBS(u32 vd, u32 va, u32 vb) //nf
 	{
 		for(u32 b=0; b<16; ++b)
 		{
@@ -235,7 +235,7 @@ private:
 			CPU.VPR[vd]._s16[h] = result;
 		}
 	}
-	void VADDSWS(u32 vd, u32 va, u32 vb)
+	void VADDSWS(u32 vd, u32 va, u32 vb) //nf
 	{
 		for (uint w = 0; w < 4; w++)
 		{
@@ -335,21 +335,21 @@ private:
 			CPU.VPR[vd]._u32[w] = CPU.VPR[va]._u32[w] & (~CPU.VPR[vb]._u32[w]);
 		}
 	}
-	void VAVGSB(u32 vd, u32 va, u32 vb)
+	void VAVGSB(u32 vd, u32 va, u32 vb) //nf
 	{
 		for (uint b = 0; b < 16; b++)
 		{
 			CPU.VPR[vd]._s8[b] = (CPU.VPR[va]._s8[b] + CPU.VPR[vb]._s8[b] + 1) >> 1;
 		}
 	}
-	void VAVGSH(u32 vd, u32 va, u32 vb)
+	void VAVGSH(u32 vd, u32 va, u32 vb) //nf
 	{
 		for (uint h = 0; h < 8; h++)
 		{
 			CPU.VPR[vd]._s16[h] = (CPU.VPR[va]._s16[h] + CPU.VPR[vb]._s16[h] + 1) >> 1;
 		}
 	}
-	void VAVGSW(u32 vd, u32 va, u32 vb)
+	void VAVGSW(u32 vd, u32 va, u32 vb) //nf
 	{
 		for (uint w = 0; w < 4; w++)
 		{
@@ -361,14 +361,14 @@ private:
 		for (uint b = 0; b < 16; b++)
 			CPU.VPR[vd]._u8[b] = (CPU.VPR[va]._u8[b] + CPU.VPR[vb]._u8[b] + 1) >> 1;
 	}
-	void VAVGUH(u32 vd, u32 va, u32 vb)
+	void VAVGUH(u32 vd, u32 va, u32 vb) //nf
 	{
 		for (uint h = 0; h < 8; h++)
 		{
 			CPU.VPR[vd]._u16[h] = (CPU.VPR[va]._u16[h] + CPU.VPR[vb]._u16[h] + 1) >> 1;
 		}
 	}
-	void VAVGUW(u32 vd, u32 va, u32 vb)
+	void VAVGUW(u32 vd, u32 va, u32 vb) //nf
 	{
 		for (uint w = 0; w < 4; w++)
 		{
@@ -487,14 +487,14 @@ private:
 		CPU.CR.cr6 = all_equal | none_equal;
 	}
 
-	void VCMPEQUH(u32 vd, u32 va, u32 vb)
+	void VCMPEQUH(u32 vd, u32 va, u32 vb) //nf
 	{
 		for (uint h = 0; h < 8; h++)
 		{
 			CPU.VPR[vd]._u16[h] = CPU.VPR[va]._u16[h] == CPU.VPR[vb]._u16[h] ? 0xffff : 0;
 		}
 	}
-	void VCMPEQUH_(u32 vd, u32 va, u32 vb)
+	void VCMPEQUH_(u32 vd, u32 va, u32 vb) //nf
 	{
 		int all_equal = 0x8;
 		int none_equal = 0x2;
@@ -599,7 +599,7 @@ private:
 		CPU.CR.cr6 = all_ge | none_ge;
 	}
 
-	void VCMPGTSB(u32 vd, u32 va, u32 vb)
+	void VCMPGTSB(u32 vd, u32 va, u32 vb) //nf
 	{
 		for (uint b = 0; b < 16; b++)
 		{
@@ -833,7 +833,7 @@ private:
 			CPU.VPR[vd]._f[w] = max(CPU.VPR[va]._f[w], CPU.VPR[vb]._f[w]);
 		}
 	}
-	void VMAXSB(u32 vd, u32 va, u32 vb)
+	void VMAXSB(u32 vd, u32 va, u32 vb) //nf
 	{
 		for (uint b = 0; b < 16; b++)
 			CPU.VPR[vd]._s8[b] = max(CPU.VPR[va]._s8[b], CPU.VPR[vb]._s8[b]);
@@ -918,7 +918,7 @@ private:
 			CPU.VPR[vd]._f[w] = min(CPU.VPR[va]._f[w], CPU.VPR[vb]._f[w]);
 		}
 	}
-	void VMINSB(u32 vd, u32 va, u32 vb)
+	void VMINSB(u32 vd, u32 va, u32 vb) //nf
 	{
 		for (uint b = 0; b < 16; b++)
 		{
@@ -1021,7 +1021,7 @@ private:
 			CPU.VPR[vd]._u32[3 - d*2 - 1] = CPU.VPR[vb]._u32[1 - d];
 		}
 	}
-	void VMSUMMBM(u32 vd, u32 va, u32 vb, u32 vc)
+	void VMSUMMBM(u32 vd, u32 va, u32 vb, u32 vc) //nf
 	{
 		for (uint w = 0; w < 4; w++)
 		{
@@ -1036,7 +1036,7 @@ private:
 			CPU.VPR[vd]._s32[w] = result;
 		}
 	}
-	void VMSUMSHM(u32 vd, u32 va, u32 vb, u32 vc)
+	void VMSUMSHM(u32 vd, u32 va, u32 vb, u32 vc) //nf
 	{
 		for (uint w = 0; w < 4; w++)
 		{
@@ -1051,7 +1051,7 @@ private:
 			CPU.VPR[vd]._s32[w] = result;
 		}
 	}
-	void VMSUMSHS(u32 vd, u32 va, u32 vb, u32 vc)
+	void VMSUMSHS(u32 vd, u32 va, u32 vb, u32 vc) //nf
 	{
 		for (uint w = 0; w < 4; w++)
 		{
@@ -1096,7 +1096,7 @@ private:
 			CPU.VPR[vd]._u32[w] = result;
 		}
 	}
-	void VMSUMUHM(u32 vd, u32 va, u32 vb, u32 vc)
+	void VMSUMUHM(u32 vd, u32 va, u32 vb, u32 vc) //nf
 	{
 		for (uint w = 0; w < 4; w++)
 		{
@@ -1111,7 +1111,7 @@ private:
 			CPU.VPR[vd]._u32[w] = result;
 		}
 	}
-	void VMSUMUHS(u32 vd, u32 va, u32 vb, u32 vc)
+	void VMSUMUHS(u32 vd, u32 va, u32 vb, u32 vc) //nf
 	{
 		for (uint w = 0; w < 4; w++)
 		{
@@ -1136,7 +1136,7 @@ private:
 			CPU.VPR[vd]._u32[w] = saturated;
 		}
 	}
-	void VMULESB(u32 vd, u32 va, u32 vb)
+	void VMULESB(u32 vd, u32 va, u32 vb) //nf
 	{
 		for (uint h = 0; h < 8; h++)
 		{
@@ -1164,7 +1164,7 @@ private:
 			CPU.VPR[vd]._u32[w] = (u32)CPU.VPR[va]._u16[w*2+1] * (u32)CPU.VPR[vb]._u16[w*2+1];
 		}
 	}
-	void VMULOSB(u32 vd, u32 va, u32 vb)
+	void VMULOSB(u32 vd, u32 va, u32 vb) //nf
 	{
 		for (uint h = 0; h < 8; h++)
 		{
@@ -1243,7 +1243,7 @@ private:
 			CPU.VPR[vd]._u16[4 + (3 - h)] = (ab7 << 15) | (ab8 << 10) | (ab16 << 5) | ab24;
 		}
 	}
-	void VPKSHSS(u32 vd, u32 va, u32 vb)
+	void VPKSHSS(u32 vd, u32 va, u32 vb) //nf
 	{
 		for (uint b = 0; b < 8; b++)
 		{
@@ -1348,7 +1348,7 @@ private:
 			CPU.VPR[vd]._s16[h] = result;
 		}
 	}
-	void VPKSWUS(u32 vd, u32 va, u32 vb)
+	void VPKSWUS(u32 vd, u32 va, u32 vb) //nf
 	{
 		for (uint h = 0; h < 4; h++)
 		{
@@ -1383,7 +1383,7 @@ private:
 			CPU.VPR[vd]._u16[h] = result;
 		}
 	}
-	void VPKUHUM(u32 vd, u32 va, u32 vb)
+	void VPKUHUM(u32 vd, u32 va, u32 vb) //nf
 	{
 		for (uint b = 0; b < 8; b++)
 		{
@@ -1424,7 +1424,7 @@ private:
 			CPU.VPR[vd]._u16[h ] = CPU.VPR[vb]._u16[h*2];
 		}
 	}
-	void VPKUWUS(u32 vd, u32 va, u32 vb)
+	void VPKUWUS(u32 vd, u32 va, u32 vb) //nf
 	{
 		for (uint h = 0; h < 4; h++)
 		{
@@ -1486,7 +1486,7 @@ private:
 			CPU.VPR[vd]._f[w] = f;
 		}
 	}
-	void VRLB(u32 vd, u32 va, u32 vb)
+	void VRLB(u32 vd, u32 va, u32 vb) //nf
 	{
 		for (uint b = 0; b < 16; b++)
 		{
@@ -1495,7 +1495,7 @@ private:
 			CPU.VPR[vd]._u8[b] = (CPU.VPR[va]._u8[b] << nRot) | (CPU.VPR[va]._u8[b] >> (8 - nRot));
 		}
 	}
-	void VRLH(u32 vd, u32 va, u32 vb)
+	void VRLH(u32 vd, u32 va, u32 vb) //nf
 	{
 		for (uint h = 0; h < 8; h++)
 		{
@@ -1524,7 +1524,7 @@ private:
 			CPU.VPR[vd]._u8[b] = (CPU.VPR[vb]._u8[b] & CPU.VPR[vc]._u8[b]) | (CPU.VPR[va]._u8[b] & (~CPU.VPR[vc]._u8[b]));
 		}
 	}
-	void VSL(u32 vd, u32 va, u32 vb)
+	void VSL(u32 vd, u32 va, u32 vb) //nf
 	{
 		u8 sh = CPU.VPR[vb]._u8[0] & 0x7;
@@ -1648,7 +1648,7 @@ private:
 			CPU.VPR[vd]._u32[w] = word;
 		}
 	}
-	void VSR(u32 vd, u32 va, u32 vb)
+	void VSR(u32 vd, u32 va, u32 vb) //nf
 	{
 		u8 sh = CPU.VPR[vb]._u8[0] & 0x7;
 		u32 t = 1;
@@ -1676,7 +1676,7 @@ private:
 			CPU.VPR[vd]._u32[3] = 0xCDCDCDCD;
 		}
 	}
-	void VSRAB(u32 vd, u32 va, u32 vb)
+	void VSRAB(u32 vd, u32 va, u32 vb) //nf
 	{
 		for (uint b = 0; b < 16; b++)
 		{
@@ -1729,7 +1729,7 @@ private:
 			CPU.VPR[vd]._u32[w] = CPU.VPR[va]._u32[w] >> (CPU.VPR[vb]._u8[w*4] & 0x1f);
 		}
 	}
-	void VSUBCUW(u32 vd, u32 va, u32 vb)
+	void VSUBCUW(u32 vd, u32 va, u32 vb) //nf
 	{
 		for (uint w = 0; w < 4; w++)
 		{
@@ -1743,7 +1743,7 @@ private:
 			CPU.VPR[vd]._f[w] = CPU.VPR[va]._f[w] - CPU.VPR[vb]._f[w];
 		}
 	}
-	void VSUBSBS(u32 vd, u32 va, u32 vb)
+	void VSUBSBS(u32 vd, u32 va, u32 vb) //nf
 	{
 		for (uint b = 0; b < 16; b++)
 		{
@@ -1832,7 +1832,7 @@ private:
 			CPU.VPR[vd]._u16[h] = CPU.VPR[va]._u16[h] - CPU.VPR[vb]._u16[h];
 		}
 	}
-	void VSUBUHS(u32 vd, u32 va, u32 vb)
+	void VSUBUHS(u32 vd, u32 va, u32 vb) //nf
 	{
 		for (uint h = 0; h < 8; h++)
 		{
@@ -1915,7 +1915,7 @@ private:
 		CPU.VPR[vd]._s32[1] = 0;
 		CPU.VPR[vd]._s32[3] = 0;
 	}
-	void VSUM4SBS(u32 vd, u32 va, u32 vb)
+	void VSUM4SBS(u32 vd, u32 va, u32 vb) //nf
 	{
 		for (uint w = 0; w < 4; w++)
 		{
@@ -2019,7 +2019,7 @@ private:
 			CPU.VPR[vd]._u8[(3 - w)*4 + 0] = CPU.VPR[vb]._u8[8 + w*2 + 1] & 0x1f;
 		}
 	}
-	void VUPKLSB(u32 vd, u32 vb)
+	void VUPKLSB(u32 vd, u32 vb) //nf
 	{
 		for (uint h = 0; h < 8; h++)
 		{
diff --git a/rpcs3/Emu/Cell/SPURecompiler.h b/rpcs3/Emu/Cell/SPURecompiler.h
index 5d6e3ca906..4d07456c0c 100644
--- a/rpcs3/Emu/Cell/SPURecompiler.h
+++ b/rpcs3/Emu/Cell/SPURecompiler.h
@@ -14,6 +14,58 @@ using namespace asmjit::host;
 
 #define UNIMPLEMENTED() UNK(__FUNCTION__)
 
+struct g_imm_table_struct
+{
+	u16 cntb_table[65536];
+
+	__m128i fsmb_table[65536];
+	__m128i fsmh_table[256];
+	__m128i fsm_table[16];
+
+	__m128i sldq_pshufb[32];
+	__m128i srdq_pshufb[32];
+	__m128i rldq_pshufb[16];
+
+	g_imm_table_struct()
+	{
+		static_assert(offsetof(g_imm_table_struct, cntb_table) == 0, "offsetof(cntb_table) != 0");
+		for (u32 i = 0; i < sizeof(cntb_table) / sizeof(cntb_table[0]); i++)
+		{
+			u32 cnt_low = 0, cnt_high = 0;
+			for (u32 j = 0; j < 8; j++)
+			{
+				cnt_low += (i >> j) & 1;
+				cnt_high += (i >> (j + 8)) & 1;
+			}
+			cntb_table[i] = (cnt_high << 8) | cnt_low;
+		}
+		for (u32 i = 0; i < sizeof(fsm_table) / sizeof(fsm_table[0]); i++)
+		{
+			for (u32 j = 0; j < 4; j++) fsm_table[i].m128i_u32[j] = (i & (1 << j)) ? ~0 : 0;
+		}
+		for (u32 i = 0; i < sizeof(fsmh_table) / sizeof(fsmh_table[0]); i++)
+		{
+			for (u32 j = 0; j < 8; j++) fsmh_table[i].m128i_u16[j] = (i & (1 << j)) ? ~0 : 0;
+		}
+		for (u32 i = 0; i < sizeof(fsmb_table) / sizeof(fsmb_table[0]); i++)
+		{
+			for (u32 j = 0; j < 16; j++) fsmb_table[i].m128i_u8[j] = (i & (1 << j)) ? ~0 : 0;
+		}
+		for (u32 i = 0; i < sizeof(sldq_pshufb) / sizeof(sldq_pshufb[0]); i++)
+		{
+			for (u32 j = 0; j < 16; j++) sldq_pshufb[i].m128i_u8[j] = (u8)(j - i);
+		}
+		for (u32 i = 0; i < sizeof(srdq_pshufb) / sizeof(srdq_pshufb[0]); i++)
+		{
+			for (u32 j = 0; j < 16; j++) srdq_pshufb[i].m128i_u8[j] = (j + i > 15) ? 0xff : (u8)(j + i);
+		}
+		for (u32 i = 0; i < sizeof(rldq_pshufb) / sizeof(rldq_pshufb[0]); i++)
+		{
+			for (u32 j = 0; j < 16; j++) rldq_pshufb[i].m128i_u8[j] = (u8)(j - i) & 0xf;
+		}
+	}
+};
+
 class SPURecompiler;
 
 class SPURecompilerCore : public CPUDecoder
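The constructor above bakes every immediate-dependent bit pattern into flat lookup tables: fsm_table/fsmh_table/fsmb_table expand a 4/8/16-bit mask into word/halfword/byte masks, cntb_table packs per-byte popcounts for a 16-bit value, and the *_pshufb tables hold PSHUFB control vectors for byte shifts and rotates. A minimal standalone sketch (not part of the patch) of the two less obvious encodings, in plain C++:

#include <cstdint>
#include <cassert>

int main()
{
	// fsmb: bit j of a 16-bit mask selects 0xff or 0x00 for byte j.
	uint8_t fsmb[16];
	const uint16_t mask = 0x00A5;
	for (int j = 0; j < 16; j++) fsmb[j] = (mask & (1 << j)) ? 0xff : 0x00;
	assert(fsmb[0] == 0xff && fsmb[1] == 0x00 && fsmb[7] == 0xff && fsmb[8] == 0x00);

	// cntb: popcounts of the low and high byte, packed into one u16, so a
	// single 16-bit lookup handles two SPU bytes at once (see CNTB below).
	const uint16_t x = 0x0F01;
	uint16_t cnt_low = 0, cnt_high = 0;
	for (int j = 0; j < 8; j++) { cnt_low += (x >> j) & 1; cnt_high += (x >> (j + 8)) & 1; }
	assert(((cnt_high << 8) | cnt_low) == 0x0401);
}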
@@ -57,6 +109,9 @@ public:
 #define cpu_word(x) word_ptr(*cpu_var, (sizeof((*(SPUThread*)nullptr).x) == 2) ? offsetof(SPUThread, x) : throw "sizeof("#x") != 2")
 #define cpu_byte(x) byte_ptr(*cpu_var, (sizeof((*(SPUThread*)nullptr).x) == 1) ? offsetof(SPUThread, x) : throw "sizeof("#x") != 1")
 
+#define g_imm_xmm(x) oword_ptr(*g_imm_var, offsetof(g_imm_table_struct, x))
+#define g_imm2_xmm(x, y) oword_ptr(*g_imm_var, y, 0, offsetof(g_imm_table_struct, x))
+
 #define LOG_OPCODE(...) //ConLog.Write("Compiled "__FUNCTION__"(): "__VA_ARGS__)
 
 #define LOG3_OPCODE(...) //ConLog.Write("Linked "__FUNCTION__"(): "__VA_ARGS__)
@@ -97,12 +152,14 @@ public:
 	GpVar* cpu_var;
 	GpVar* ls_var;
 	GpVar* imm_var;
-	// (input) output:
+	GpVar* g_imm_var;
+	// output:
 	GpVar* pos_var;
 	// temporary:
 	GpVar* addr;
 	GpVar* qw0;
 	GpVar* qw1;
+	GpVar* qw2;
 
 	struct XmmLink
 	{
@@ -578,30 +635,41 @@ private:
 	}
 	void ROT(u32 rt, u32 ra, u32 rb)
 	{
-		WRAPPER_BEGIN(rt, ra, rb, zz);
-		CPU.GPR[rt]._u32[0] = (CPU.GPR[ra]._u32[0] << (CPU.GPR[rb]._u32[0] & 0x1f)) | (CPU.GPR[ra]._u32[0] >> (32 - (CPU.GPR[rb]._u32[0] & 0x1f)));
-		CPU.GPR[rt]._u32[1] = (CPU.GPR[ra]._u32[1] << (CPU.GPR[rb]._u32[1] & 0x1f)) | (CPU.GPR[ra]._u32[1] >> (32 - (CPU.GPR[rb]._u32[1] & 0x1f)));
-		CPU.GPR[rt]._u32[2] = (CPU.GPR[ra]._u32[2] << (CPU.GPR[rb]._u32[2] & 0x1f)) | (CPU.GPR[ra]._u32[2] >> (32 - (CPU.GPR[rb]._u32[2] & 0x1f)));
-		CPU.GPR[rt]._u32[3] = (CPU.GPR[ra]._u32[3] << (CPU.GPR[rb]._u32[3] & 0x1f)) | (CPU.GPR[ra]._u32[3] >> (32 - (CPU.GPR[rb]._u32[3] & 0x1f)));
-		WRAPPER_END(rt, ra, rb, 0);
+		XmmInvalidate(rt);
+		for (u32 i = 0; i < 4; i++)
+		{
+			c.mov(qw0->r32(), cpu_dword(GPR[ra]._u32[i]));
+			c.mov(*addr, cpu_dword(GPR[rb]._u32[i]));
+			c.rol(qw0->r32(), *addr);
+			c.mov(cpu_dword(GPR[rt]._u32[i]), qw0->r32());
+		}
+		LOG_OPCODE();
 	}
 	void ROTM(u32 rt, u32 ra, u32 rb)
 	{
-		WRAPPER_BEGIN(rt, ra, rb, zz);
-		CPU.GPR[rt]._u32[0] = ((0 - CPU.GPR[rb]._u32[0]) & 0x3f) < 32 ? CPU.GPR[ra]._u32[0] >> ((0 - CPU.GPR[rb]._u32[0]) & 0x3f) : 0;
-		CPU.GPR[rt]._u32[1] = ((0 - CPU.GPR[rb]._u32[1]) & 0x3f) < 32 ? CPU.GPR[ra]._u32[1] >> ((0 - CPU.GPR[rb]._u32[1]) & 0x3f) : 0;
-		CPU.GPR[rt]._u32[2] = ((0 - CPU.GPR[rb]._u32[2]) & 0x3f) < 32 ? CPU.GPR[ra]._u32[2] >> ((0 - CPU.GPR[rb]._u32[2]) & 0x3f) : 0;
-		CPU.GPR[rt]._u32[3] = ((0 - CPU.GPR[rb]._u32[3]) & 0x3f) < 32 ? CPU.GPR[ra]._u32[3] >> ((0 - CPU.GPR[rb]._u32[3]) & 0x3f) : 0;
-		WRAPPER_END(rt, ra, rb, 0);
+		XmmInvalidate(rt);
+		for (u32 i = 0; i < 4; i++)
+		{
+			c.mov(qw0->r32(), cpu_dword(GPR[ra]._u32[i]));
+			c.mov(*addr, cpu_dword(GPR[rb]._u32[i]));
+			c.neg(*addr);
+			c.shr(*qw0, *addr);
+			c.mov(cpu_dword(GPR[rt]._u32[i]), qw0->r32());
+		}
+		LOG_OPCODE();
 	}
 	void ROTMA(u32 rt, u32 ra, u32 rb)
 	{
-		WRAPPER_BEGIN(rt, ra, rb, zz);
-		CPU.GPR[rt]._i32[0] = ((0 - CPU.GPR[rb]._u32[0]) & 0x3f) < 32 ? CPU.GPR[ra]._i32[0] >> ((0 - CPU.GPR[rb]._u32[0]) & 0x3f) : CPU.GPR[ra]._i32[0] >> 31;
-		CPU.GPR[rt]._i32[1] = ((0 - CPU.GPR[rb]._u32[1]) & 0x3f) < 32 ? CPU.GPR[ra]._i32[1] >> ((0 - CPU.GPR[rb]._u32[1]) & 0x3f) : CPU.GPR[ra]._i32[1] >> 31;
-		CPU.GPR[rt]._i32[2] = ((0 - CPU.GPR[rb]._u32[2]) & 0x3f) < 32 ? CPU.GPR[ra]._i32[2] >> ((0 - CPU.GPR[rb]._u32[2]) & 0x3f) : CPU.GPR[ra]._i32[2] >> 31;
-		CPU.GPR[rt]._i32[3] = ((0 - CPU.GPR[rb]._u32[3]) & 0x3f) < 32 ? CPU.GPR[ra]._i32[3] >> ((0 - CPU.GPR[rb]._u32[3]) & 0x3f) : CPU.GPR[ra]._i32[3] >> 31;
-		WRAPPER_END(rt, ra, rb, 0);
+		XmmInvalidate(rt);
+		for (u32 i = 0; i < 4; i++)
+		{
+			c.movsxd(*qw0, cpu_dword(GPR[ra]._u32[i]));
+			c.mov(*addr, cpu_dword(GPR[rb]._u32[i]));
+			c.neg(*addr);
+			c.sar(*qw0, *addr);
+			c.mov(cpu_dword(GPR[rt]._u32[i]), qw0->r32());
+		}
+		LOG_OPCODE();
 	}
 	void SHL(u32 rt, u32 ra, u32 rb)
 	{
@@ -617,31 +685,53 @@ private:
 	}
 	void ROTH(u32 rt, u32 ra, u32 rb)
 	{
-		WRAPPER_BEGIN(rt, ra, rb, zz);
-		for (int h = 0; h < 8; h++)
-			CPU.GPR[rt]._u16[h] = (CPU.GPR[ra]._u16[h] << (CPU.GPR[rb]._u16[h] & 0xf)) | (CPU.GPR[ra]._u16[h] >> (16 - (CPU.GPR[rb]._u16[h] & 0xf)));
-		WRAPPER_END(rt, ra, rb, 0);
+		XmmInvalidate(rt);
+		for (u32 i = 0; i < 8; i++)
+		{
+			c.movzx(qw0->r32(), cpu_word(GPR[ra]._u16[i]));
+			c.movzx(*addr, cpu_word(GPR[rb]._u16[i]));
+			c.rol(qw0->r16(), *addr);
+			c.mov(cpu_word(GPR[rt]._u16[i]), qw0->r16());
+		}
+		LOG_OPCODE();
 	}
 	void ROTHM(u32 rt, u32 ra, u32 rb)
 	{
-		WRAPPER_BEGIN(rt, ra, rb, zz);
-		for (int h = 0; h < 8; h++)
-			CPU.GPR[rt]._u16[h] = ((0 - CPU.GPR[rb]._u16[h]) & 0x1f) < 16 ? CPU.GPR[ra]._u16[h] >> ((0 - CPU.GPR[rb]._u16[h]) & 0x1f) : 0;
-		WRAPPER_END(rt, ra, rb, 0);
+		XmmInvalidate(rt);
+		for (u32 i = 0; i < 8; i++)
+		{
+			c.movzx(qw0->r32(), cpu_word(GPR[ra]._u16[i]));
+			c.movzx(*addr, cpu_word(GPR[rb]._u16[i]));
+			c.neg(*addr);
+			c.shr(qw0->r32(), *addr);
+			c.mov(cpu_word(GPR[rt]._u16[i]), qw0->r16());
+		}
+		LOG_OPCODE();
 	}
 	void ROTMAH(u32 rt, u32 ra, u32 rb)
 	{
-		WRAPPER_BEGIN(rt, ra, rb, zz);
-		for (int h = 0; h < 8; h++)
-			CPU.GPR[rt]._i16[h] = ((0 - CPU.GPR[rb]._u16[h]) & 0x1f) < 16 ? CPU.GPR[ra]._i16[h] >> ((0 - CPU.GPR[rb]._u16[h]) & 0x1f) : CPU.GPR[ra]._i16[h] >> 15;
-		WRAPPER_END(rt, ra, rb, 0);
+		XmmInvalidate(rt);
+		for (u32 i = 0; i < 8; i++)
+		{
+			c.movsx(qw0->r32(), cpu_word(GPR[ra]._u16[i]));
+			c.movzx(*addr, cpu_word(GPR[rb]._u16[i]));
+			c.neg(*addr);
+			c.sar(qw0->r32(), *addr);
+			c.mov(cpu_word(GPR[rt]._u16[i]), qw0->r16());
+		}
+		LOG_OPCODE();
 	}
 	void SHLH(u32 rt, u32 ra, u32 rb)
 	{
-		WRAPPER_BEGIN(rt, ra, rb, zz);
-		for (int h = 0; h < 8; h++)
-			CPU.GPR[rt]._u16[h] = (CPU.GPR[rb]._u16[h] & 0x1f) > 15 ? 0 : CPU.GPR[ra]._u16[h] << (CPU.GPR[rb]._u16[h] & 0x1f);
-		WRAPPER_END(rt, ra, rb, 0);
+		XmmInvalidate(rt);
+		for (u32 i = 0; i < 8; i++)
+		{
+			c.movzx(qw0->r32(), cpu_word(GPR[ra]._u16[i]));
+			c.movzx(*addr, cpu_word(GPR[rb]._u16[i]));
+			c.shl(qw0->r32(), *addr);
+			c.mov(cpu_word(GPR[rt]._u16[i]), qw0->r16());
+		}
+		LOG_OPCODE();
 	}
 	void ROTI(u32 rt, u32 ra, s32 i7)
 	{
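The rotate/shift group above now emits scalar code per element instead of calling back into the interpreter. The SPU semantics ((0 - n) & 0x3f, with counts 32..63 producing 0 or the sign fill) fall out of x86 behavior for free: a mov to a 32-bit register zero-extends into the full 64-bit register, and hardware masks the count of a 64-bit shift to 6 bits. A standalone sketch (not part of the patch) checking that equivalence for one ROTM lane:

#include <cstdint>
#include <cassert>

// SPU reference semantics for one ROTM element.
uint32_t rotm_reference(uint32_t a, uint32_t n)
{
	const uint32_t s = (0 - n) & 0x3f;
	return s < 32 ? a >> s : 0;
}

// What the emitted mov/neg/shr sequence computes: the value is
// zero-extended to 64 bits, and counts 32..63 shift it out entirely.
uint32_t rotm_lowered(uint32_t a, uint32_t n)
{
	const uint64_t q = a;
	return (uint32_t)(q >> ((0 - n) & 0x3f));
}

int main()
{
	for (uint32_t n = 0; n < 256; n++)
		assert(rotm_reference(0xdeadbeef, n) == rotm_lowered(0xdeadbeef, n));
}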
@@ -1186,27 +1276,33 @@ private:
 	}
 	void FSM(u32 rt, u32 ra)
 	{
-		WRAPPER_BEGIN(rt, ra, yy, zz);
-		const u32 pref = CPU.GPR[ra]._u32[3];
-		for (int w = 0; w < 4; w++)
-			CPU.GPR[rt]._u32[w] = (pref & (1 << w)) ? ~0 : 0;
-		WRAPPER_END(rt, ra, 0, 0);
+		const XmmLink& vr = XmmAlloc(rt);
+		c.mov(*addr, cpu_dword(GPR[ra]._u32[3]));
+		c.and_(*addr, 0xf);
+		c.shl(*addr, 4);
+		c.movdqa(vr.get(), g_imm2_xmm(fsm_table[0], *addr));
+		XmmFinalize(vr, rt);
+		LOG_OPCODE();
 	}
 	void FSMH(u32 rt, u32 ra)
 	{
-		WRAPPER_BEGIN(rt, ra, yy, zz);
-		const u32 pref = CPU.GPR[ra]._u32[3];
-		for (int h = 0; h < 8; h++)
-			CPU.GPR[rt]._u16[h] = (pref & (1 << h)) ? ~0 : 0;
-		WRAPPER_END(rt, ra, 0, 0);
+		const XmmLink& vr = XmmAlloc(rt);
+		c.mov(*addr, cpu_dword(GPR[ra]._u32[3]));
+		c.and_(*addr, 0xff);
+		c.shl(*addr, 4);
+		c.movdqa(vr.get(), g_imm2_xmm(fsmh_table[0], *addr));
+		XmmFinalize(vr, rt);
+		LOG_OPCODE();
 	}
 	void FSMB(u32 rt, u32 ra)
 	{
-		WRAPPER_BEGIN(rt, ra, yy, zz);
-		const u32 pref = CPU.GPR[ra]._u32[3];
-		for (int b = 0; b < 16; b++)
-			CPU.GPR[rt]._u8[b] = (pref & (1 << b)) ? ~0 : 0;
-		WRAPPER_END(rt, ra, 0, 0);
+		const XmmLink& vr = XmmAlloc(rt);
+		c.mov(*addr, cpu_dword(GPR[ra]._u32[3]));
+		c.and_(*addr, 0xffff);
+		c.shl(*addr, 4);
+		c.movdqa(vr.get(), g_imm2_xmm(fsmb_table[0], *addr));
+		XmmFinalize(vr, rt);
+		LOG_OPCODE();
 	}
 	void FREST(u32 rt, u32 ra)
 	{
@@ -1247,32 +1343,35 @@ private:
 	}
 	void ROTQBYBI(u32 rt, u32 ra, u32 rb)
 	{
-		WRAPPER_BEGIN(rt, ra, rb, zz);
-		const int s = (CPU.GPR[rb]._u32[3] >> 3) & 0xf;
-		const SPU_GPR_hdr temp = CPU.GPR[ra];
-		for (int b = 0; b < 16; b++)
-			CPU.GPR[rt]._u8[b] = temp._u8[(b - s) & 0xf];
-		WRAPPER_END(rt, ra, rb, 0);
+		const XmmLink& va = XmmGet(ra, rt);
+		c.mov(*addr, cpu_dword(GPR[rb]._u32[3]));
+		c.and_(*addr, 0xf << 3);
+		c.shl(*addr, 1);
+		c.pshufb(va.get(), g_imm2_xmm(rldq_pshufb[0], *addr));
+		XmmFinalize(va, rt);
+		LOG_OPCODE();
 	}
 	void ROTQMBYBI(u32 rt, u32 ra, u32 rb)
 	{
-		WRAPPER_BEGIN(rt, ra, rb, zz);
-		const int s = (0 - (CPU.GPR[rb]._u32[3] >> 3)) & 0x1f;
-		const SPU_GPR_hdr temp = CPU.GPR[ra];
-		CPU.GPR[rt].Reset();
-		for (int b = 0; b < 16 - s; b++)
-			CPU.GPR[rt]._u8[b] = temp._u8[(b + s) & 0xf];
-		WRAPPER_END(rt, ra, rb, 0);
+		const XmmLink& va = XmmGet(ra, rt);
+		c.mov(*addr, cpu_dword(GPR[rb]._u32[3]));
+		c.shr(*addr, 3);
+		c.neg(*addr);
+		c.and_(*addr, 0x1f);
+		c.shl(*addr, 4);
+		c.pshufb(va.get(), g_imm2_xmm(srdq_pshufb[0], *addr));
+		XmmFinalize(va, rt);
+		LOG_OPCODE();
 	}
 	void SHLQBYBI(u32 rt, u32 ra, u32 rb)
 	{
-		WRAPPER_BEGIN(rt, ra, rb, zz);
-		const int s = (CPU.GPR[rb]._u32[3] >> 3) & 0x1f;
-		const SPU_GPR_hdr temp = CPU.GPR[ra];
-		CPU.GPR[rt].Reset();
-		for (int b = s; b < 16; b++)
-			CPU.GPR[rt]._u8[b] = temp._u8[b - s];
-		WRAPPER_END(rt, ra, rb, 0);
+		const XmmLink& va = XmmGet(ra, rt);
+		c.mov(*addr, cpu_dword(GPR[rb]._u32[3]));
+		c.and_(*addr, 0x1f << 3);
+		c.shl(*addr, 1);
+		c.pshufb(va.get(), g_imm2_xmm(sldq_pshufb[0], *addr));
+		XmmFinalize(va, rt);
+		LOG_OPCODE();
 	}
 	void CBX(u32 rt, u32 ra, u32 rb)
 	{
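Both the FSM family and the byte-shift family above reduce to one indexed 16-byte load: g_imm2_xmm(x, y) forms the address g_imm + offsetof(x) + y with no hardware scale, so each emitter first converts the slot number into a byte offset (shl by 4, or by 1 when the value already carries a factor of 8 from the bit-count encoding). A standalone sketch (not part of the patch) of the PSHUFB trick behind srdq_pshufb, assuming SSSE3:

#include <tmmintrin.h> // SSSE3: _mm_shuffle_epi8
#include <cstdint>
#include <cassert>

int main()
{
	// srdq_pshufb[s][j] = (j + s > 15) ? 0xff : j + s. Any control byte with
	// the top bit set makes PSHUFB write zero, producing the zero fill of a
	// right shift by s bytes.
	const int s = 3;
	uint8_t ctl[16], in[16], out[16];
	for (int j = 0; j < 16; j++)
	{
		in[j] = (uint8_t)(0x10 + j);
		ctl[j] = (j + s > 15) ? 0xff : (uint8_t)(j + s);
	}
	const __m128i r = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i*)in),
	                                   _mm_loadu_si128((const __m128i*)ctl));
	_mm_storeu_si128((__m128i*)out, r);
	for (int j = 0; j < 16; j++)
		assert(out[j] == (j + s > 15 ? 0 : 0x10 + j + s));
}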
@@ -1361,73 +1460,89 @@ private:
 	}
 	void ROTQBI(u32 rt, u32 ra, u32 rb)
 	{
-		WRAPPER_BEGIN(rt, ra, rb, zz);
-		const int t = CPU.GPR[rb]._u32[3] & 0x7;
-		const SPU_GPR_hdr temp = CPU.GPR[ra];
-		CPU.GPR[rt]._u32[0] = (temp._u32[0] << t) | (temp._u32[3] >> (32 - t));
-		CPU.GPR[rt]._u32[1] = (temp._u32[1] << t) | (temp._u32[0] >> (32 - t));
-		CPU.GPR[rt]._u32[2] = (temp._u32[2] << t) | (temp._u32[1] >> (32 - t));
-		CPU.GPR[rt]._u32[3] = (temp._u32[3] << t) | (temp._u32[2] >> (32 - t));
-		WRAPPER_END(rt, ra, rb, 0);
+		XmmInvalidate(rt);
+		c.mov(*qw0, cpu_qword(GPR[ra]._u64[0]));
+		c.mov(*qw1, cpu_qword(GPR[ra]._u64[1]));
+		c.mov(*qw2, *qw0);
+		c.mov(*addr, cpu_dword(GPR[rb]._u32[3]));
+		c.and_(*addr, 7);
+		c.shld(*qw0, *qw1, *addr);
+		c.shld(*qw1, *qw2, *addr);
+		c.mov(cpu_qword(GPR[rt]._u64[0]), *qw0);
+		c.mov(cpu_qword(GPR[rt]._u64[1]), *qw1);
+		LOG_OPCODE();
 	}
 	void ROTQMBI(u32 rt, u32 ra, u32 rb)
 	{
-		WRAPPER_BEGIN(rt, ra, rb, zz);
-		const int t = (0 - CPU.GPR[rb]._u32[3]) & 0x7;
-		const SPU_GPR_hdr temp = CPU.GPR[ra];
-		CPU.GPR[rt]._u32[0] = (temp._u32[0] >> t) | (temp._u32[1] << (32 - t));
-		CPU.GPR[rt]._u32[1] = (temp._u32[1] >> t) | (temp._u32[2] << (32 - t));
-		CPU.GPR[rt]._u32[2] = (temp._u32[2] >> t) | (temp._u32[3] << (32 - t));
-		CPU.GPR[rt]._u32[3] = (temp._u32[3] >> t);
-		WRAPPER_END(rt, ra, rb, 0);
+		XmmInvalidate(rt);
+		c.mov(*qw0, cpu_qword(GPR[ra]._u64[0]));
+		c.mov(*qw1, cpu_qword(GPR[ra]._u64[1]));
+		c.mov(*addr, cpu_dword(GPR[rb]._u32[3]));
+		c.neg(*addr);
+		c.and_(*addr, 7);
+		c.shrd(*qw0, *qw1, *addr);
+		c.shr(*qw1, *addr);
+		c.mov(cpu_qword(GPR[rt]._u64[0]), *qw0);
+		c.mov(cpu_qword(GPR[rt]._u64[1]), *qw1);
+		LOG_OPCODE();
 	}
 	void SHLQBI(u32 rt, u32 ra, u32 rb)
 	{
-		WRAPPER_BEGIN(rt, ra, rb, zz);
-		const int t = CPU.GPR[rb]._u32[3] & 0x7;
-		const SPU_GPR_hdr temp = CPU.GPR[ra];
-		CPU.GPR[rt]._u32[0] = (temp._u32[0] << t);
-		CPU.GPR[rt]._u32[1] = (temp._u32[1] << t) | (temp._u32[0] >> (32 - t));
-		CPU.GPR[rt]._u32[2] = (temp._u32[2] << t) | (temp._u32[1] >> (32 - t));
-		CPU.GPR[rt]._u32[3] = (temp._u32[3] << t) | (temp._u32[2] >> (32 - t));
-		WRAPPER_END(rt, ra, rb, 0);
+		XmmInvalidate(rt);
+		c.mov(*qw0, cpu_qword(GPR[ra]._u64[0]));
+		c.mov(*qw1, cpu_qword(GPR[ra]._u64[1]));
+		c.mov(*addr, cpu_dword(GPR[rb]._u32[3]));
+		c.and_(*addr, 7);
+		c.shld(*qw1, *qw0, *addr);
+		c.shl(*qw0, *addr);
+		c.mov(cpu_qword(GPR[rt]._u64[0]), *qw0);
+		c.mov(cpu_qword(GPR[rt]._u64[1]), *qw1);
+		LOG_OPCODE();
 	}
 	void ROTQBY(u32 rt, u32 ra, u32 rb)
 	{
-		WRAPPER_BEGIN(rt, ra, rb, zz);
-		const int s = CPU.GPR[rb]._u32[3] & 0xf;
-		const SPU_GPR_hdr temp = CPU.GPR[ra];
-		for (int b = 0; b < 16; ++b)
-			CPU.GPR[rt]._u8[b] = temp._u8[(b - s) & 0xf];
-		WRAPPER_END(rt, ra, rb, 0);
+		const XmmLink& va = XmmGet(ra, rt);
+		c.mov(*addr, cpu_dword(GPR[rb]._u32[3]));
+		c.and_(*addr, 0xf);
+		c.shl(*addr, 4);
+		c.pshufb(va.get(), g_imm2_xmm(rldq_pshufb[0], *addr));
+		XmmFinalize(va, rt);
+		LOG_OPCODE();
 	}
 	void ROTQMBY(u32 rt, u32 ra, u32 rb)
 	{
-		WRAPPER_BEGIN(rt, ra, rb, zz);
-		const int s = (0 - CPU.GPR[rb]._u32[3]) & 0x1f;
-		const SPU_GPR_hdr temp = CPU.GPR[ra];
-		CPU.GPR[rt].Reset();
-		for (int b = 0; b < 16 - s; b++)
-			CPU.GPR[rt]._u8[b] = temp._u8[(b + s) & 0xf];
-		WRAPPER_END(rt, ra, rb, 0);
+		const XmmLink& va = XmmGet(ra, rt);
+		c.mov(*addr, cpu_dword(GPR[rb]._u32[3]));
+		c.neg(*addr);
+		c.and_(*addr, 0x1f);
+		c.shl(*addr, 4);
+		c.pshufb(va.get(), g_imm2_xmm(srdq_pshufb[0], *addr));
+		XmmFinalize(va, rt);
+		LOG_OPCODE();
 	}
 	void SHLQBY(u32 rt, u32 ra, u32 rb)
 	{
-		WRAPPER_BEGIN(rt, ra, rb, zz);
-		const int s = CPU.GPR[rb]._u32[3] & 0x1f;
-		const SPU_GPR_hdr temp = CPU.GPR[ra];
-		CPU.GPR[rt].Reset();
-		for (int b = s; b < 16; b++)
-			CPU.GPR[rt]._u8[b] = temp._u8[b - s];
-		WRAPPER_END(rt, ra, rb, 0);
+		const XmmLink& va = XmmGet(ra, rt);
+		c.mov(*addr, cpu_dword(GPR[rb]._u32[3]));
+		c.and_(*addr, 0x1f);
+		c.shl(*addr, 4);
+		c.pshufb(va.get(), g_imm2_xmm(sldq_pshufb[0], *addr));
+		XmmFinalize(va, rt);
+		LOG_OPCODE();
 	}
 	void ORX(u32 rt, u32 ra)
 	{
-		WRAPPER_BEGIN(rt, ra, yy, zz);
-		CPU.GPR[rt]._u32[3] = CPU.GPR[ra]._u32[0] | CPU.GPR[ra]._u32[1] | CPU.GPR[ra]._u32[2] | CPU.GPR[ra]._u32[3];
-		CPU.GPR[rt]._u32[2] = 0;
-		CPU.GPR[rt]._u64[0] = 0;
-		WRAPPER_END(rt, ra, 0, 0);
+		XmmInvalidate(rt);
+		c.mov(*addr, cpu_dword(GPR[ra]._u32[0]));
+		c.or_(*addr, cpu_dword(GPR[ra]._u32[1]));
+		c.or_(*addr, cpu_dword(GPR[ra]._u32[2]));
+		c.or_(*addr, cpu_dword(GPR[ra]._u32[3]));
+		c.mov(cpu_dword(GPR[rt]._u32[3]), *addr);
+		c.xor_(*addr, *addr);
+		c.mov(cpu_dword(GPR[rt]._u32[0]), *addr);
+		c.mov(cpu_dword(GPR[rt]._u32[1]), *addr);
+		c.mov(cpu_dword(GPR[rt]._u32[2]), *addr);
+		LOG_OPCODE();
 	}
 	void CBD(u32 rt, u32 ra, s32 i7)
 	{
@@ -1488,36 +1603,37 @@ private:
 	}
 	void ROTQBII(u32 rt, u32 ra, s32 i7)
 	{
-		WRAPPER_BEGIN(rt, ra, i7, zz);
-		const int s = i7 & 0x7;
-		const SPU_GPR_hdr temp = CPU.GPR[ra];
-		CPU.GPR[rt]._u32[0] = (temp._u32[0] << s) | (temp._u32[3] >> (32 - s));
-		CPU.GPR[rt]._u32[1] = (temp._u32[1] << s) | (temp._u32[0] >> (32 - s));
-		CPU.GPR[rt]._u32[2] = (temp._u32[2] << s) | (temp._u32[1] >> (32 - s));
-		CPU.GPR[rt]._u32[3] = (temp._u32[3] << s) | (temp._u32[2] >> (32 - s));
-		WRAPPER_END(rt, ra, i7, 0);
+		XmmInvalidate(rt);
+		c.mov(*qw0, cpu_qword(GPR[ra]._u64[0]));
+		c.mov(*qw1, cpu_qword(GPR[ra]._u64[1]));
+		c.mov(*qw2, *qw0);
+		c.shld(*qw0, *qw1, i7 & 0x7);
+		c.shld(*qw1, *qw2, i7 & 0x7);
+		c.mov(cpu_qword(GPR[rt]._u64[0]), *qw0);
+		c.mov(cpu_qword(GPR[rt]._u64[1]), *qw1);
+		LOG_OPCODE();
 	}
 	void ROTQMBII(u32 rt, u32 ra, s32 i7)
 	{
-		WRAPPER_BEGIN(rt, ra, i7, zz);
-		const int s = (0 - (s32)i7) & 0x7;
-		const SPU_GPR_hdr temp = CPU.GPR[ra];
-		CPU.GPR[rt]._u32[0] = (temp._u32[0] >> s) | (temp._u32[1] << (32 - s));
-		CPU.GPR[rt]._u32[1] = (temp._u32[1] >> s) | (temp._u32[2] << (32 - s));
-		CPU.GPR[rt]._u32[2] = (temp._u32[2] >> s) | (temp._u32[3] << (32 - s));
-		CPU.GPR[rt]._u32[3] = (temp._u32[3] >> s);
-		WRAPPER_END(rt, ra, i7, 0);
+		XmmInvalidate(rt);
+		c.mov(*qw0, cpu_qword(GPR[ra]._u64[0]));
+		c.mov(*qw1, cpu_qword(GPR[ra]._u64[1]));
+		c.shrd(*qw0, *qw1, (0 - i7) & 0x7);
+		c.shr(*qw1, (0 - i7) & 0x7);
+		c.mov(cpu_qword(GPR[rt]._u64[0]), *qw0);
+		c.mov(cpu_qword(GPR[rt]._u64[1]), *qw1);
+		LOG_OPCODE();
 	}
 	void SHLQBII(u32 rt, u32 ra, s32 i7)
 	{
-		WRAPPER_BEGIN(rt, ra, i7, zz);
-		const int s = i7 & 0x7;
-		const SPU_GPR_hdr temp = CPU.GPR[ra];
-		CPU.GPR[rt]._u32[0] = (temp._u32[0] << s);
-		CPU.GPR[rt]._u32[1] = (temp._u32[1] << s) | (temp._u32[0] >> (32 - s));
-		CPU.GPR[rt]._u32[2] = (temp._u32[2] << s) | (temp._u32[1] >> (32 - s));
-		CPU.GPR[rt]._u32[3] = (temp._u32[3] << s) | (temp._u32[2] >> (32 - s));
-		WRAPPER_END(rt, ra, i7, 0);
+		XmmInvalidate(rt);
+		c.mov(*qw0, cpu_qword(GPR[ra]._u64[0]));
+		c.mov(*qw1, cpu_qword(GPR[ra]._u64[1]));
+		c.shld(*qw1, *qw0, i7 & 0x7);
+		c.shl(*qw0, i7 & 0x7);
+		c.mov(cpu_qword(GPR[rt]._u64[0]), *qw0);
+		c.mov(cpu_qword(GPR[rt]._u64[1]), *qw1);
+		LOG_OPCODE();
 	}
 	void ROTQBYI(u32 rt, u32 ra, s32 i7)
 	{
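The quadword bit-shift group keeps the 128-bit value in two 64-bit GPRs and uses shld/shrd as a funnel shift; qw2 preserves a copy of the low half because the first shld overwrites an input needed by the second. A plain C++ sketch (not part of the patch) of the ROTQBI/ROTQBII pattern:

#include <cstdint>
#include <cassert>

// Rotate a 128-bit value (hi:lo) left by n bits, n in 0..7, mirroring
// "shld qw0, qw1, n" / "shld qw1, qw2, n" with lo_copy playing qw2.
void rotl128(uint64_t& lo, uint64_t& hi, unsigned n)
{
	const uint64_t lo_copy = lo; // saved before lo is overwritten
	if (n) // shld with a count of 0 leaves its operands unchanged
	{
		lo = (lo << n) | (hi >> (64 - n));
		hi = (hi << n) | (lo_copy >> (64 - n));
	}
}

int main()
{
	uint64_t lo = 0x8000000000000001ull, hi = 0x0123456789abcdefull;
	rotl128(lo, hi, 1);
	assert(lo == 0x0000000000000002ull);
	assert(hi == 0x02468acf13579bdfull);
}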
@@ -1729,7 +1845,7 @@ private:
 	}
 	void SUMB(u32 rt, u32 ra, u32 rb)
 	{
-		WRAPPER_BEGIN(rt, ra, rb, zz);
+		/*WRAPPER_BEGIN(rt, ra, rb, zz);
 		const SPU_GPR_hdr _a = CPU.GPR[ra];
 		const SPU_GPR_hdr _b = CPU.GPR[rb];
 		for (int w = 0; w < 4; w++)
@@ -1737,7 +1853,46 @@ private:
 			CPU.GPR[rt]._u16[w*2] = _a._u8[w*4] + _a._u8[w*4 + 1] + _a._u8[w*4 + 2] + _a._u8[w*4 + 3];
 			CPU.GPR[rt]._u16[w*2 + 1] = _b._u8[w*4] + _b._u8[w*4 + 1] + _b._u8[w*4 + 2] + _b._u8[w*4 + 3];
 		}
-		WRAPPER_END(rt, ra, rb, 0);
+		WRAPPER_END(rt, ra, rb, 0);*/
+
+		const XmmLink& va = XmmGet(ra);
+		const XmmLink& vb = (ra == rb) ? XmmCopy(va) : XmmGet(rb);
+		const XmmLink& v1 = XmmCopy(vb, rt);
+		const XmmLink& v2 = XmmCopy(vb);
+		const XmmLink& vFF = XmmAlloc();
+		c.movdqa(vFF.get(), XmmConst(_mm_set1_epi32(0xff)));
+		c.pand(v1.get(), vFF.get());
+		c.psrld(v2.get(), 8);
+		c.pand(v2.get(), vFF.get());
+		c.paddd(v1.get(), v2.get());
+		c.movdqa(v2.get(), vb.get());
+		c.psrld(v2.get(), 16);
+		c.pand(v2.get(), vFF.get());
+		c.paddd(v1.get(), v2.get());
+		c.movdqa(v2.get(), vb.get());
+		c.psrld(v2.get(), 24);
+		c.paddd(v1.get(), v2.get());
+		c.pslld(v1.get(), 16);
+		c.movdqa(v2.get(), va.get());
+		c.pand(v2.get(), vFF.get());
+		c.por(v1.get(), v2.get());
+		c.movdqa(v2.get(), va.get());
+		c.psrld(v2.get(), 8);
+		c.pand(v2.get(), vFF.get());
+		c.paddd(v1.get(), v2.get());
+		c.movdqa(v2.get(), va.get());
+		c.psrld(v2.get(), 16);
+		c.pand(v2.get(), vFF.get());
+		c.paddd(v1.get(), v2.get());
+		c.movdqa(v2.get(), va.get());
+		c.psrld(v2.get(), 24);
+		c.paddd(v1.get(), v2.get());
+		XmmFinalize(vb);
+		XmmFinalize(va);
+		XmmFinalize(v1, rt);
+		XmmFinalize(v2);
+		XmmFinalize(vFF);
+		LOG_OPCODE();
 	}
 	//HGT uses signed values. HLGT uses unsigned values
 	void HGT(u32 rt, s32 ra, s32 rb)
@@ -1754,18 +1909,16 @@ private:
 	}
 	void CLZ(u32 rt, u32 ra)
 	{
-		WRAPPER_BEGIN(rt, ra, yy, zz);
-		for (int w = 0; w < 4; w++)
+		XmmInvalidate(rt);
+		for (u32 i = 0; i < 4; i++)
 		{
-			int nPos;
-
-			for (nPos = 0; nPos < 32; nPos++)
-				if (CPU.GPR[ra]._u32[w] & (1 << (31 - nPos)))
-					break;
-
-			CPU.GPR[rt]._u32[w] = nPos;
+			c.bsr(*addr, cpu_dword(GPR[ra]._u32[i]));
+			c.cmovz(*addr, dword_ptr(*g_imm_var, offsetof(g_imm_table_struct, fsmb_table[0xffff]))); // load 0xffffffff
+			c.neg(*addr);
+			c.add(*addr, 31);
+			c.mov(cpu_dword(GPR[rt]._u32[i]), *addr);
 		}
-		WRAPPER_END(rt, ra, 0, 0);
+		LOG_OPCODE();
 	}
 	void XSWD(u32 rt, u32 ra)
 	{
@@ -1786,13 +1939,14 @@ private:
 	}
 	void CNTB(u32 rt, u32 ra)
 	{
-		WRAPPER_BEGIN(rt, ra, yy, zz);
-		const SPU_GPR_hdr temp = CPU.GPR[ra];
-		CPU.GPR[rt].Reset();
-		for (int b = 0; b < 16; b++)
-			for (int i = 0; i < 8; i++)
-				CPU.GPR[rt]._u8[b] += (temp._u8[b] & (1 << i)) ? 1 : 0;
-		WRAPPER_END(rt, ra, 0, 0);
+		XmmInvalidate(rt);
+		for (u32 i = 0; i < 8; i++)
+		{
+			c.movzx(*addr, cpu_word(GPR[ra]._u16[i]));
+			c.movzx(*addr, word_ptr(*g_imm_var, *addr, 1, offsetof(g_imm_table_struct, cntb_table[0])));
+			c.mov(cpu_word(GPR[rt]._u16[i]), addr->r16());
+		}
+		LOG_OPCODE();
 	}
 	void XSBH(u32 rt, u32 ra)
 	{
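CLZ maps to bsr plus a fixup: bsr yields the index of the highest set bit and sets ZF on zero input, cmovz substitutes 0xffffffff (conveniently available in memory as fsmb_table[0xffff]), and neg/add 31 convert the bit index into a leading-zero count, with the zero case coming out as 32. A scalar sketch (not part of the patch) of that mapping:

#include <cstdint>
#include <cassert>

// Scalar model of the emitted bsr/cmovz/neg/add sequence.
uint32_t clz_lowered(uint32_t x)
{
	uint32_t r = 0xffffffff; // cmovz source: the all-ones table entry
	if (x != 0)
	{
		r = 31;
		while (!(x & (1u << r))) r--; // bsr: index of the highest set bit
	}
	return 31 - r; // neg + add 31; the zero case wraps to 32
}

int main()
{
	assert(clz_lowered(0) == 32);
	assert(clz_lowered(1) == 31);
	assert(clz_lowered(0x80000000u) == 0);
	assert(clz_lowered(0x00010000u) == 15);
}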
@@ -2228,14 +2382,14 @@ private:
 		XmmFinalize(vt);
 		LOG_OPCODE();
 	}
-	void CGX(u32 rt, u32 ra, u32 rb)
+	void CGX(u32 rt, u32 ra, u32 rb) //nf
 	{
 		WRAPPER_BEGIN(rt, ra, rb, zz);
 		for (int w = 0; w < 4; w++)
 			CPU.GPR[rt]._u32[w] = ((u64)CPU.GPR[ra]._u32[w] + (u64)CPU.GPR[rb]._u32[w] + (u64)(CPU.GPR[rt]._u32[w] & 1)) >> 32;
 		WRAPPER_END(rt, ra, rb, 0);
 	}
-	void BGX(u32 rt, u32 ra, u32 rb)
+	void BGX(u32 rt, u32 ra, u32 rb) //nf
 	{
 		WRAPPER_BEGIN(rt, ra, rb, zz);
 		s64 nResult;
@@ -2299,7 +2453,7 @@ private:
 	{
 		UNIMPLEMENTED();
 	}
-	void DFTSV(u32 rt, u32 ra, s32 i7)
+	void DFTSV(u32 rt, u32 ra, s32 i7) //nf
 	{
 		WRAPPER_BEGIN(rt, ra, i7, zz);
 		const u64 DoubleExpMask = 0x7ff0000000000000;
@@ -2721,12 +2875,7 @@ private:
 		else
 		{
 			const XmmLink& vr = XmmAlloc(rt);
-			__m128i fsmbi_mask;
-			for (u32 j = 0; j < 16; j++)
-			{
-				fsmbi_mask.m128i_i8[j] = ((i16 >> j) & 0x1) ? 0xff : 0;
-			}
-			c.movdqa(vr.get(), XmmConst(fsmbi_mask));
+			c.movdqa(vr.get(), g_imm_xmm(fsmb_table[i16 & 0xffff]));
 			XmmFinalize(vr, rt);
 		}
 		LOG_OPCODE();
diff --git a/rpcs3/Emu/Cell/SPURecompilerCore.cpp b/rpcs3/Emu/Cell/SPURecompilerCore.cpp
index 5a6cd5e880..56d7e65d38 100644
--- a/rpcs3/Emu/Cell/SPURecompilerCore.cpp
+++ b/rpcs3/Emu/Cell/SPURecompilerCore.cpp
@@ -4,6 +4,8 @@
 #include "SPUInterpreter.h"
 #include "SPURecompiler.h"
 
+static const g_imm_table_struct g_imm_table;
+
 SPURecompilerCore::SPURecompilerCore(SPUThread& cpu)
 : m_enc(new SPURecompiler(cpu, *this))
 , inter(new SPUInterpreter(cpu))
@@ -58,16 +60,21 @@ void SPURecompilerCore::Compile(u16 pos)
 	compiler.alloc(imm_var);
 	m_enc->imm_var = &imm_var;
 
-	GpVar pos_var(compiler, kVarTypeUInt32, "pos");
-	compiler.setArg(3, pos_var);
-	m_enc->pos_var = &pos_var;
+	GpVar g_imm_var(compiler, kVarTypeIntPtr, "g_imm");
+	compiler.setArg(3, g_imm_var);
+	compiler.alloc(g_imm_var);
+	m_enc->g_imm_var = &g_imm_var;
 
+	GpVar pos_var(compiler, kVarTypeUInt32, "pos");
+	m_enc->pos_var = &pos_var;
 	GpVar addr_var(compiler, kVarTypeUInt32, "addr");
 	m_enc->addr = &addr_var;
 	GpVar qw0_var(compiler, kVarTypeUInt64, "qw0");
 	m_enc->qw0 = &qw0_var;
 	GpVar qw1_var(compiler, kVarTypeUInt64, "qw1");
 	m_enc->qw1 = &qw1_var;
+	GpVar qw2_var(compiler, kVarTypeUInt64, "qw2");
+	m_enc->qw2 = &qw2_var;
 
 	for (u32 i = 0; i < 16; i++)
 	{
@@ -198,7 +205,7 @@ u8 SPURecompilerCore::DecodeMemory(const u64 address)
 		return 0;
 	}
 
-	typedef u32(*Func)(void* _cpu, void* _ls, const void* _imm, u32 _pos);
+	typedef u32(*Func)(const void* _cpu, const void* _ls, const void* _imm, const void* _g_imm);
 
 	Func func = asmjit_cast<Func>(entry[pos].pointer);
 
@@ -215,7 +222,7 @@ u8 SPURecompilerCore::DecodeMemory(const u64 address)
 	}
 
 	u32 res = pos;
-	res = func(cpu, &Memory[m_offset], imm_table.data(), res);
+	res = func(cpu, &Memory[m_offset], imm_table.data(), &g_imm_table);
 
 	if (res > 0xffff)
 	{
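With pos dropped from the argument list, a compiled block now receives the shared table block as its fourth pointer and materializes pos itself (pos_var is output-only after this change). A sketch of the revised call contract; g_imm_table_struct_mock and run_block are illustrative names, not from the patch:

#include <cstdint>

struct g_imm_table_struct_mock { uint16_t cntb_table[65536]; /* ...the other tables... */ };
static const g_imm_table_struct_mock g_imm_table_mock = {};

typedef uint32_t(*Func)(const void* _cpu, const void* _ls, const void* _imm, const void* _g_imm);

uint32_t run_block(Func func, void* cpu, void* ls, const void* imm)
{
	// Result is the next PC, or a status code above 0xffff (checked by the caller).
	return func(cpu, ls, imm, &g_imm_table_mock);
}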