mirror of
https://github.com/RPCS3/rpcs3.git
synced 2024-12-28 09:23:34 +00:00
SPURS kernel v1
This commit is contained in:
parent
67bc9acbe0
commit
237ab974dd
@ -1,20 +1,92 @@
|
||||
#pragma once
|
||||
|
||||
#include <emmintrin.h>
|
||||
|
||||
union u128
|
||||
{
|
||||
__m128 vf;
|
||||
__m128i vi;
|
||||
|
||||
u64 _u64[2];
|
||||
s64 _s64[2];
|
||||
|
||||
class u64_reversed_array_2
|
||||
{
|
||||
u64 data[2];
|
||||
|
||||
public:
|
||||
u64& operator [] (s32 index)
|
||||
{
|
||||
return data[1 - index];
|
||||
}
|
||||
|
||||
const u64& operator [] (s32 index) const
|
||||
{
|
||||
return data[1 - index];
|
||||
}
|
||||
|
||||
} u64r;
|
||||
|
||||
u32 _u32[4];
|
||||
s32 _s32[4];
|
||||
|
||||
class u32_reversed_array_4
|
||||
{
|
||||
u32 data[4];
|
||||
|
||||
public:
|
||||
u32& operator [] (s32 index)
|
||||
{
|
||||
return data[3 - index];
|
||||
}
|
||||
|
||||
const u32& operator [] (s32 index) const
|
||||
{
|
||||
return data[3 - index];
|
||||
}
|
||||
|
||||
} u32r;
|
||||
|
||||
u16 _u16[8];
|
||||
s16 _s16[8];
|
||||
|
||||
class u16_reversed_array_8
|
||||
{
|
||||
u16 data[8];
|
||||
|
||||
public:
|
||||
u16& operator [] (s32 index)
|
||||
{
|
||||
return data[7 - index];
|
||||
}
|
||||
|
||||
const u16& operator [] (s32 index) const
|
||||
{
|
||||
return data[7 - index];
|
||||
}
|
||||
|
||||
} u16r;
|
||||
|
||||
u8 _u8[16];
|
||||
s8 _s8[16];
|
||||
|
||||
class u8_reversed_array_16
|
||||
{
|
||||
u8 data[16];
|
||||
|
||||
public:
|
||||
u8& operator [] (s32 index)
|
||||
{
|
||||
return data[15 - index];
|
||||
}
|
||||
|
||||
const u8& operator [] (s32 index) const
|
||||
{
|
||||
return data[15 - index];
|
||||
}
|
||||
|
||||
} u8r;
|
||||
|
||||
float _f[4];
|
||||
double _d[2];
|
||||
__m128 xmm;
|
||||
|
||||
class bit_array_128
|
||||
{
|
||||
@ -94,6 +166,11 @@ union u128
|
||||
return ret;
|
||||
}
|
||||
|
||||
// Reverse-order counterpart of from64: takes (_1, _0) and forwards them to from64 as (_0, _1).
static u128 from64r(u64 _1, u64 _0 = 0)
{
	u128 result = from64(_0, _1);
	return result;
}
|
||||
|
||||
static u128 from32(u32 _0, u32 _1 = 0, u32 _2 = 0, u32 _3 = 0)
|
||||
{
|
||||
u128 ret;
|
||||
@ -106,12 +183,7 @@ union u128
|
||||
|
||||
// Reverse-order counterpart of from32: takes (_3, _2, _1, _0) and forwards them to
// from32 as (_0, _1, _2, _3), mirroring how from64r delegates to from64.
static u128 from32r(u32 _3, u32 _2 = 0, u32 _1 = 0, u32 _0 = 0)
{
	return from32(_0, _1, _2, _3);
}
|
||||
|
||||
static u128 fromBit(u32 bit)
|
||||
|
@ -1,5 +1,7 @@
|
||||
#pragma once
|
||||
|
||||
#include <emmintrin.h>
|
||||
|
||||
#ifdef _WIN32
|
||||
#define thread_local __declspec(thread)
|
||||
#elif __APPLE__
|
||||
@ -222,3 +224,27 @@ static __forceinline uint64_t cntlz64(uint64_t arg)
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
// Bitwise AND of two 128-bit integer vectors; thin wrapper over _mm_and_si128.
static __forceinline __m128i operator & (__m128i A, __m128i B)
{
	const __m128i both = _mm_and_si128(A, B);
	return both;
}
|
||||
|
||||
// Bitwise OR of two 128-bit integer vectors; thin wrapper over _mm_or_si128.
static __forceinline __m128i operator | (__m128i A, __m128i B)
{
	const __m128i either = _mm_or_si128(A, B);
	return either;
}
|
||||
|
||||
// compare 16 packed unsigned byte values (greater than)
|
||||
static __forceinline __m128i _mm_cmpgt_epu8(__m128i A, __m128i B)
|
||||
{
|
||||
// (A xor 0x80) > (B xor 0x80)
|
||||
return _mm_cmpgt_epi8(_mm_xor_si128(A, _mm_set1_epi8(-128)), _mm_xor_si128(B, _mm_set1_epi8(-128)));
|
||||
}
|
||||
|
||||
// compare 16 packed unsigned byte values (less or equal)
|
||||
static __forceinline __m128i _mm_cmple_epu8(__m128i A, __m128i B)
|
||||
{
|
||||
// ((B xor 0x80) > (A xor 0x80)) || A == B
|
||||
return _mm_cmpgt_epu8(B, A) | _mm_cmpeq_epi8(A, B);
|
||||
}
|
||||
|
@ -9,64 +9,6 @@ using namespace asmjit::host;
|
||||
|
||||
#define UNIMPLEMENTED() UNK(__FUNCTION__)
|
||||
|
||||
// View the storage of the lvalue x as an array of u64 / u32 / u16 / u8 elements.
// The argument is parenthesised so that compound lvalue expressions (for example a
// conditional expression) have their address taken as a whole.
#define mmToU64Ptr(x) ((u64*)(&(x)))
#define mmToU32Ptr(x) ((u32*)(&(x)))
#define mmToU16Ptr(x) ((u16*)(&(x)))
#define mmToU8Ptr(x) ((u8*)(&(x)))
|
||||
|
||||
struct g_imm_table_struct
|
||||
{
|
||||
//u16 cntb_table[65536];
|
||||
|
||||
__m128i fsmb_table[65536];
|
||||
__m128i fsmh_table[256];
|
||||
__m128i fsm_table[16];
|
||||
|
||||
__m128i sldq_pshufb[32];
|
||||
__m128i srdq_pshufb[32];
|
||||
__m128i rldq_pshufb[16];
|
||||
|
||||
g_imm_table_struct()
|
||||
{
|
||||
/*static_assert(offsetof(g_imm_table_struct, cntb_table) == 0, "offsetof(cntb_table) != 0");
|
||||
for (u32 i = 0; i < sizeof(cntb_table) / sizeof(cntb_table[0]); i++)
|
||||
{
|
||||
u32 cnt_low = 0, cnt_high = 0;
|
||||
for (u32 j = 0; j < 8; j++)
|
||||
{
|
||||
cnt_low += (i >> j) & 1;
|
||||
cnt_high += (i >> (j + 8)) & 1;
|
||||
}
|
||||
cntb_table[i] = (cnt_high << 8) | cnt_low;
|
||||
}*/
|
||||
for (u32 i = 0; i < sizeof(fsm_table) / sizeof(fsm_table[0]); i++)
|
||||
{
|
||||
|
||||
for (u32 j = 0; j < 4; j++) mmToU32Ptr(fsm_table[i])[j] = (i & (1 << j)) ? ~0 : 0;
|
||||
}
|
||||
for (u32 i = 0; i < sizeof(fsmh_table) / sizeof(fsmh_table[0]); i++)
|
||||
{
|
||||
for (u32 j = 0; j < 8; j++) mmToU16Ptr(fsmh_table[i])[j] = (i & (1 << j)) ? ~0 : 0;
|
||||
}
|
||||
for (u32 i = 0; i < sizeof(fsmb_table) / sizeof(fsmb_table[0]); i++)
|
||||
{
|
||||
for (u32 j = 0; j < 16; j++) mmToU8Ptr(fsmb_table[i])[j] = (i & (1 << j)) ? ~0 : 0;
|
||||
}
|
||||
for (u32 i = 0; i < sizeof(sldq_pshufb) / sizeof(sldq_pshufb[0]); i++)
|
||||
{
|
||||
for (u32 j = 0; j < 16; j++) mmToU8Ptr(sldq_pshufb[i])[j] = (u8)(j - i);
|
||||
}
|
||||
for (u32 i = 0; i < sizeof(srdq_pshufb) / sizeof(srdq_pshufb[0]); i++)
|
||||
{
|
||||
for (u32 j = 0; j < 16; j++) mmToU8Ptr(srdq_pshufb[i])[j] = (j + i > 15) ? 0xff : (u8)(j + i);
|
||||
}
|
||||
for (u32 i = 0; i < sizeof(rldq_pshufb) / sizeof(rldq_pshufb[0]); i++)
|
||||
{
|
||||
for (u32 j = 0; j < 16; j++) mmToU8Ptr(rldq_pshufb[i])[j] = (u8)(j - i) & 0xf;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
class SPURecompiler;
|
||||
|
||||
class SPURecompilerCore : public CPUDecoder
|
||||
|
@ -1018,7 +1018,7 @@ void SPUThread::StopAndSignal(u32 code)
|
||||
|
||||
case 0x003:
{
	// Invoke the handler exactly once and keep its full 64-bit result: _u64[1]
	// overlaps _u32[2] and _u32[3], so a separate 32-bit store is not needed.
	GPR[3]._u64[1] = m_code3_func(*this);
	break;
}
|
||||
|
||||
|
@ -105,6 +105,66 @@ enum
|
||||
SPU_RdSigNotify2_offs = 0x1C00C,
|
||||
};
|
||||
|
||||
// View the storage of the lvalue x as an array of u64 / u32 / u16 / u8 elements.
// The argument is parenthesised so that compound lvalue expressions (for example a
// conditional expression) have their address taken as a whole.
#define mmToU64Ptr(x) ((u64*)(&(x)))
#define mmToU32Ptr(x) ((u32*)(&(x)))
#define mmToU16Ptr(x) ((u16*)(&(x)))
#define mmToU8Ptr(x) ((u8*)(&(x)))
|
||||
|
||||
struct g_imm_table_struct
|
||||
{
|
||||
//u16 cntb_table[65536];
|
||||
|
||||
__m128i fsmb_table[65536];
|
||||
__m128i fsmh_table[256];
|
||||
__m128i fsm_table[16];
|
||||
|
||||
__m128i sldq_pshufb[32];
|
||||
__m128i srdq_pshufb[32];
|
||||
__m128i rldq_pshufb[16];
|
||||
|
||||
g_imm_table_struct()
|
||||
{
|
||||
/*static_assert(offsetof(g_imm_table_struct, cntb_table) == 0, "offsetof(cntb_table) != 0");
|
||||
for (u32 i = 0; i < sizeof(cntb_table) / sizeof(cntb_table[0]); i++)
|
||||
{
|
||||
u32 cnt_low = 0, cnt_high = 0;
|
||||
for (u32 j = 0; j < 8; j++)
|
||||
{
|
||||
cnt_low += (i >> j) & 1;
|
||||
cnt_high += (i >> (j + 8)) & 1;
|
||||
}
|
||||
cntb_table[i] = (cnt_high << 8) | cnt_low;
|
||||
}*/
|
||||
for (u32 i = 0; i < sizeof(fsm_table) / sizeof(fsm_table[0]); i++)
|
||||
{
|
||||
|
||||
for (u32 j = 0; j < 4; j++) mmToU32Ptr(fsm_table[i])[j] = (i & (1 << j)) ? ~0 : 0;
|
||||
}
|
||||
for (u32 i = 0; i < sizeof(fsmh_table) / sizeof(fsmh_table[0]); i++)
|
||||
{
|
||||
for (u32 j = 0; j < 8; j++) mmToU16Ptr(fsmh_table[i])[j] = (i & (1 << j)) ? ~0 : 0;
|
||||
}
|
||||
for (u32 i = 0; i < sizeof(fsmb_table) / sizeof(fsmb_table[0]); i++)
|
||||
{
|
||||
for (u32 j = 0; j < 16; j++) mmToU8Ptr(fsmb_table[i])[j] = (i & (1 << j)) ? ~0 : 0;
|
||||
}
|
||||
for (u32 i = 0; i < sizeof(sldq_pshufb) / sizeof(sldq_pshufb[0]); i++)
|
||||
{
|
||||
for (u32 j = 0; j < 16; j++) mmToU8Ptr(sldq_pshufb[i])[j] = (u8)(j - i);
|
||||
}
|
||||
for (u32 i = 0; i < sizeof(srdq_pshufb) / sizeof(srdq_pshufb[0]); i++)
|
||||
{
|
||||
for (u32 j = 0; j < 16; j++) mmToU8Ptr(srdq_pshufb[i])[j] = (j + i > 15) ? 0xff : (u8)(j + i);
|
||||
}
|
||||
for (u32 i = 0; i < sizeof(rldq_pshufb) / sizeof(rldq_pshufb[0]); i++)
|
||||
{
|
||||
for (u32 j = 0; j < 16; j++) mmToU8Ptr(rldq_pshufb[i])[j] = (u8)(j - i) & 0xf;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
extern const g_imm_table_struct g_imm_table;
|
||||
|
||||
//Floating point status and control register. Unsure if this is one of the GPRs or SPRs
|
||||
//Is 128 bits, but bits 0-19, 24-28, 32-49, 56-60, 64-81, 88-92, 96-115, 120-124 are unused
|
||||
class FPSCR
|
||||
@ -451,7 +511,7 @@ public:
|
||||
void WriteLS128(const u32 lsa, const u128& data) const { vm::write128(lsa + m_offset, data); }
|
||||
|
||||
std::function<void(SPUThread& SPU)> m_custom_task;
|
||||
std::function<u32(SPUThread& SPU)> m_code3_func;
|
||||
std::function<u64(SPUThread& SPU)> m_code3_func;
|
||||
|
||||
public:
|
||||
SPUThread(CPUThreadType type = CPU_THREAD_SPU);
|
||||
|
@ -170,13 +170,138 @@ s64 spursInit(
|
||||
SPU.GPR[4]._u64[1] = spurs.addr();
|
||||
return SPU.FastCall(SPU.PC);
|
||||
#endif
|
||||
//SPU.WriteLS32(0x808, 2); // hack for cellSpursModuleExit
|
||||
//SPU.WriteLS32(0x260, 3); // hack for cellSpursModulePollStatus
|
||||
//SPU.WriteLS32(0x264, 0x35000000); // bi $0
|
||||
SPU.WriteLS32(SPU.ReadLS32(0x1e0), 2); // hack for cellSpursModuleExit
|
||||
|
||||
/*if (!isSecond)*/ SPU.m_code3_func = [spurs, num](SPUThread& SPU) -> u64 // first variant
{
	// Workload selection handler installed as SPU.m_code3_func.
	// Input:  SPU.GPR[3]._u32[3] (arg1).
	// Return: (selected workload index << 32) | status byte; index 0x20 with zero
	//         status means that no workload was selected.
	// Reads and updates both the shared SPURS structure (spurs.addr() + offset) and
	// this SPU's local storage (ReadLS*/WriteLS*), all under LV2_LOCK.
	LV2_LOCK(0);

	// Right shift that yields 0 for counts >= 32. A plain ">>" with such a count is
	// undefined behaviour in C++, and 0x20 is a regular "none" value for var1/vNUM here.
	const auto shr32 = [](u32 value, u32 count) -> u32
	{
		return count < 32 ? value >> count : 0;
	};

	const u32 arg1 = SPU.GPR[3]._u32[3];
	u32 var0 = SPU.ReadLS32(0x1d8);
	u32 var1 = SPU.ReadLS32(0x1dc);
	u128 wklA = vm::read128(spurs.addr() + 0x20);
	u128 wklB = vm::read128(spurs.addr() + 0x30);
	u128 savedA = SPU.ReadLS128(0x180);
	u128 savedB = SPU.ReadLS128(0x190);
	u128 vAA; vAA.vi = _mm_sub_epi32(wklA.vi, savedA.vi);
	u128 vBB; vBB.vi = _mm_sub_epi32(wklB.vi, savedB.vi);
	u128 vAABB; vAABB.vi = (arg1 == 0)
		? _mm_add_epi32(vAA.vi, _mm_andnot_si128(g_imm_table.fsmb_table[shr32(0x8000, var1)], vBB.vi))
		: vAA.vi;

	u32 vNUM = 0x20;           // selected workload index; 0x20 = none
	u64 vRES = 0x20ull << 32;  // default result: index 0x20, status 0
	u128 vSET = {};            // byte vNUM is set to 1 once a workload is selected

	if (spurs->m.x72.read_relaxed() & (1 << num))
	{
		SPU.WriteLS8(0x1eb, 0); // var4
		if (arg1 && var1 != 0x20)
		{
			spurs->m.x72._and_not(1 << num);
		}
	}
	else
	{
		u128 wklReadyCount0 = vm::read128(spurs.addr() + 0x0);
		u128 wklReadyCount1 = vm::read128(spurs.addr() + 0x10);
		u128 savedC = SPU.ReadLS128(0x1A0);
		u128 savedD = SPU.ReadLS128(0x1B0);
		u128 vRC; vRC.vi = _mm_add_epi32(_mm_min_epu8(wklReadyCount0.vi, _mm_set1_epi8(8)), _mm_min_epu8(wklReadyCount1.vi, _mm_set1_epi8(8)));
		u32 wklFlag = spurs->m.wklFlag.flag.read_relaxed();
		u32 flagRecv = spurs->m.flagRecv.read_relaxed();

		// Single-bit mask for flagRecv, used only while wklFlag is 0.
		const u32 flagMask = (wklFlag == 0) ? shr32(0x8000, flagRecv) : 0;

		u128 vFM; vFM.vi = g_imm_table.fsmb_table[flagMask];
		u128 wklSet1; wklSet1.vi = g_imm_table.fsmb_table[spurs->m.wklSet1.read_relaxed()];
		u128 vFMS1; vFMS1.vi = vFM.vi | wklSet1.vi;
		u128 vFMV1; vFMV1.vi = g_imm_table.fsmb_table[shr32(flagMask, var1)];
		u32 var5 = SPU.ReadLS32(0x1ec);
		u128 wklMinCnt = vm::read128(spurs.addr() + 0x40);
		u128 wklMaxCnt = vm::read128(spurs.addr() + 0x50);

		// Per-workload exclusion mask: a byte with bit 0 set is skipped by the loop below.
		u128 vCC; vCC.vi = _mm_andnot_si128(vFMS1.vi,
			_mm_cmpeq_epi8(wklReadyCount0.vi, _mm_set1_epi8(0)) | _mm_cmple_epu8(vRC.vi, vAABB.vi)) |
			_mm_cmple_epu8(wklMaxCnt.vi, vAABB.vi) |
			_mm_cmpeq_epi8(savedC.vi, _mm_set1_epi8(0)) |
			g_imm_table.fsmb_table[(~var5) >> 16];

		// High and low bytes of the per-workload comparison key used by the loop below.
		u128 vCCH1; vCCH1.vi = _mm_andnot_si128(vCC.vi,
			(_mm_set1_epi8((char)0x80) & (vFMS1.vi | _mm_cmpgt_epu8(wklReadyCount0.vi, vAABB.vi))) |
			(_mm_set1_epi8(0x7f) & savedC.vi));
		u128 vCCL1; vCCL1.vi = _mm_andnot_si128(vCC.vi,
			(_mm_set1_epi8((char)0x80) & vFMV1.vi) |
			(_mm_set1_epi8(0x40) & _mm_cmpgt_epu8(vAABB.vi, _mm_set1_epi8(0)) & _mm_cmpgt_epu8(wklMinCnt.vi, vAABB.vi)) |
			(_mm_set1_epi8(0x3c) & _mm_slli_epi32(_mm_sub_epi32(_mm_set1_epi8(8), vAABB.vi), 2)) |
			(_mm_set1_epi8(0x02) & _mm_cmpeq_epi8(savedD.vi, _mm_set1_epi8((s8)var0))) |
			_mm_set1_epi8(0x01));

		// Status byte reported for the selected workload.
		u128 vSTAT; vSTAT.vi =
			(_mm_set1_epi8(0x01) & _mm_cmpgt_epu8(wklReadyCount0.vi, vAABB.vi)) |
			(_mm_set1_epi8(0x02) & wklSet1.vi) |
			(_mm_set1_epi8(0x04) & vFM.vi);

		// Choose the non-excluded workload with the largest 16-bit key
		// (vCCH1 byte high, vCCL1 byte low); on ties the lowest index wins.
		for (s32 i = 0, max = -1; i < 0x10; i++)
		{
			const s32 value = ((s32)vCCH1.u8r[i] << 8) | ((s32)vCCL1.u8r[i]);
			if (value > max && (vCC.u8r[i] & 1) == 0)
			{
				vNUM = i;
				max = value;
			}
		}

		if (vNUM < 0x10)
		{
			// Report the selection: index in the upper 32 bits, status byte in the lower.
			vRES = ((u64)vNUM << 32) | vSTAT.u8r[vNUM];
			vSET.u8r[vNUM] = 0x01;
		}

		SPU.WriteLS8(0x1eb, vNUM == 0x20);

		if (!arg1 || var1 == vNUM)
		{
			// vNUM may still be 0x20 here, in which case the mask is 0 and no bit is cleared.
			spurs->m.wklSet1._and_not(be_t<u16>::make((u16)shr32(0x8000, vNUM)));
		}

		if (vNUM == flagRecv)
		{
			spurs->m.wklFlag.flag |= be_t<u32>::make(-1);
		}
	}

	if (arg1 == 0)
	{
		vAA.vi = _mm_add_epi32(vAA.vi, vSET.vi);
		vm::write128(spurs.addr() + 0x20, vAA); // update wklA

		SPU.WriteLS128(0x180, vSET); // update savedA
		SPU.WriteLS32(0x1dc, vNUM); // update var1
	}

	if (arg1 == 1 && vNUM != var1)
	{
		vBB.vi = _mm_add_epi32(vBB.vi, vSET.vi);
		vm::write128(spurs.addr() + 0x30, vBB); // update wklB

		SPU.WriteLS128(0x190, vSET); // update savedB
	}
	else
	{
		SPU.WriteLS128(0x190, {}); // update savedB
	}

	return vRES;
};
|
||||
//else SPU.m_code3_func = [spurs, num](SPUThread& SPU) -> u64 // second variant
|
||||
//{
|
||||
//
|
||||
//};
|
||||
|
||||
if (SPU.m_code3_func)
|
||||
{
|
||||
const u32 addr = SPU.ReadLS32(0x1e4);
|
||||
SPU.WriteLS32(addr + 0, 3); // hack for cellSpursModulePollStatus
|
||||
SPU.WriteLS32(addr + 4, 0x35000000); // bi $0
|
||||
}
|
||||
|
||||
SPU.WriteLS128(0x1c0, u128::from32r(0, spurs.addr(), num, 0x1f));
|
||||
|
||||
u32 wid = 0x20;
|
||||
u32 stat = 0;
|
||||
while (true)
|
||||
{
|
||||
if (Emu.IsStopped())
|
||||
@ -199,11 +324,19 @@ s64 spursInit(
|
||||
if (!isSecond) SPU.WriteLS16(0x1e8, 0);
|
||||
|
||||
// run workload:
|
||||
SPU.GPR[1]._u32[3] = 0x3FFB0;
|
||||
SPU.GPR[3]._u32[3] = 0x100;
|
||||
SPU.GPR[4]._u64[1] = wkl.data;
|
||||
SPU.GPR[5]._u32[3] = 0;
|
||||
SPU.FastCall(0xa00);
|
||||
if (wid <= 0x20)
|
||||
{
|
||||
SPU.GPR[1]._u32[3] = 0x3FFB0;
|
||||
SPU.GPR[3]._u32[3] = 0x100;
|
||||
SPU.GPR[4]._u64[1] = wkl.data;
|
||||
SPU.GPR[5]._u32[3] = stat;
|
||||
SPU.FastCall(0xa00);
|
||||
}
|
||||
else
|
||||
{
|
||||
// hack
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(1));
|
||||
}
|
||||
|
||||
// check status:
|
||||
auto status = SPU.SPU.Status.GetValue();
|
||||
@ -217,8 +350,19 @@ s64 spursInit(
|
||||
}
|
||||
|
||||
// get workload id:
|
||||
//SPU.GPR[3].clear();
|
||||
//wid = SPU.m_code3_func(SPU);
|
||||
SPU.GPR[3].clear();
|
||||
if (SPU.m_code3_func)
|
||||
{
|
||||
u64 res = SPU.m_code3_func(SPU);
|
||||
stat = (u32)(res);
|
||||
wid = (u32)(res >> 32);
|
||||
}
|
||||
else
|
||||
{
|
||||
SPU.FastCall(0x290);
|
||||
stat = SPU.GPR[3]._u32[2];
|
||||
wid = SPU.GPR[3]._u32[3];
|
||||
}
|
||||
}
|
||||
|
||||
})->GetId();
|
||||
|
Loading…
Reference in New Issue
Block a user