SPURS kernel v1

This commit is contained in:
Nekotekina 2014-10-07 17:35:44 +04:00
parent 67bc9acbe0
commit 237ab974dd
6 changed files with 323 additions and 79 deletions

View File

@ -1,20 +1,92 @@
#pragma once
#include <emmintrin.h>
union u128
{
__m128 vf;
__m128i vi;
u64 _u64[2];
s64 _s64[2];
class u64_reversed_array_2
{
u64 data[2];
public:
u64& operator [] (s32 index)
{
return data[1 - index];
}
const u64& operator [] (s32 index) const
{
return data[1 - index];
}
} u64r;
u32 _u32[4];
s32 _s32[4];
class u32_reversed_array_4
{
u32 data[4];
public:
u32& operator [] (s32 index)
{
return data[3 - index];
}
const u32& operator [] (s32 index) const
{
return data[3 - index];
}
} u32r;
u16 _u16[8];
s16 _s16[8];
class u16_reversed_array_8
{
u16 data[8];
public:
u16& operator [] (s32 index)
{
return data[7 - index];
}
const u16& operator [] (s32 index) const
{
return data[7 - index];
}
} u16r;
u8 _u8[16];
s8 _s8[16];
class u8_reversed_array_16
{
u8 data[16];
public:
u8& operator [] (s32 index)
{
return data[15 - index];
}
const u8& operator [] (s32 index) const
{
return data[15 - index];
}
} u8r;
float _f[4];
double _d[2];
__m128 xmm;
class bit_array_128
{
@ -94,6 +166,11 @@ union u128
return ret;
}
static u128 from64r(u64 _1, u64 _0 = 0)
{
return from64(_0, _1);
}
static u128 from32(u32 _0, u32 _1 = 0, u32 _2 = 0, u32 _3 = 0)
{
u128 ret;
@ -106,12 +183,7 @@ union u128
static u128 from32r(u32 _3, u32 _2 = 0, u32 _1 = 0, u32 _0 = 0)
{
u128 ret;
ret._u32[0] = _0;
ret._u32[1] = _1;
ret._u32[2] = _2;
ret._u32[3] = _3;
return ret;
return from32(_0, _1, _2, _3);
}
static u128 fromBit(u32 bit)

View File

@ -1,5 +1,7 @@
#pragma once
#include <emmintrin.h>
#ifdef _WIN32
#define thread_local __declspec(thread)
#elif __APPLE__
@ -222,3 +224,27 @@ static __forceinline uint64_t cntlz64(uint64_t arg)
}
#endif
}
static __forceinline __m128i operator & (__m128i A, __m128i B)
{
return _mm_and_si128(A, B);
}
static __forceinline __m128i operator | (__m128i A, __m128i B)
{
return _mm_or_si128(A, B);
}
// compare 16 packed unsigned byte values (greater than)
static __forceinline __m128i _mm_cmpgt_epu8(__m128i A, __m128i B)
{
// (A xor 0x80) > (B xor 0x80)
return _mm_cmpgt_epi8(_mm_xor_si128(A, _mm_set1_epi8(-128)), _mm_xor_si128(B, _mm_set1_epi8(-128)));
}
// compare 16 packed unsigned byte values (less or equal)
static __forceinline __m128i _mm_cmple_epu8(__m128i A, __m128i B)
{
// ((B xor 0x80) > (A xor 0x80)) || A == B
return _mm_cmpgt_epu8(B, A) | _mm_cmpeq_epi8(A, B);
}

View File

@ -9,64 +9,6 @@ using namespace asmjit::host;
#define UNIMPLEMENTED() UNK(__FUNCTION__)
#define mmToU64Ptr(x) ((u64*)(&x))
#define mmToU32Ptr(x) ((u32*)(&x))
#define mmToU16Ptr(x) ((u16*)(&x))
#define mmToU8Ptr(x) ((u8*)(&x))
struct g_imm_table_struct
{
//u16 cntb_table[65536];
__m128i fsmb_table[65536];
__m128i fsmh_table[256];
__m128i fsm_table[16];
__m128i sldq_pshufb[32];
__m128i srdq_pshufb[32];
__m128i rldq_pshufb[16];
g_imm_table_struct()
{
/*static_assert(offsetof(g_imm_table_struct, cntb_table) == 0, "offsetof(cntb_table) != 0");
for (u32 i = 0; i < sizeof(cntb_table) / sizeof(cntb_table[0]); i++)
{
u32 cnt_low = 0, cnt_high = 0;
for (u32 j = 0; j < 8; j++)
{
cnt_low += (i >> j) & 1;
cnt_high += (i >> (j + 8)) & 1;
}
cntb_table[i] = (cnt_high << 8) | cnt_low;
}*/
for (u32 i = 0; i < sizeof(fsm_table) / sizeof(fsm_table[0]); i++)
{
for (u32 j = 0; j < 4; j++) mmToU32Ptr(fsm_table[i])[j] = (i & (1 << j)) ? ~0 : 0;
}
for (u32 i = 0; i < sizeof(fsmh_table) / sizeof(fsmh_table[0]); i++)
{
for (u32 j = 0; j < 8; j++) mmToU16Ptr(fsmh_table[i])[j] = (i & (1 << j)) ? ~0 : 0;
}
for (u32 i = 0; i < sizeof(fsmb_table) / sizeof(fsmb_table[0]); i++)
{
for (u32 j = 0; j < 16; j++) mmToU8Ptr(fsmb_table[i])[j] = (i & (1 << j)) ? ~0 : 0;
}
for (u32 i = 0; i < sizeof(sldq_pshufb) / sizeof(sldq_pshufb[0]); i++)
{
for (u32 j = 0; j < 16; j++) mmToU8Ptr(sldq_pshufb[i])[j] = (u8)(j - i);
}
for (u32 i = 0; i < sizeof(srdq_pshufb) / sizeof(srdq_pshufb[0]); i++)
{
for (u32 j = 0; j < 16; j++) mmToU8Ptr(srdq_pshufb[i])[j] = (j + i > 15) ? 0xff : (u8)(j + i);
}
for (u32 i = 0; i < sizeof(rldq_pshufb) / sizeof(rldq_pshufb[0]); i++)
{
for (u32 j = 0; j < 16; j++) mmToU8Ptr(rldq_pshufb[i])[j] = (u8)(j - i) & 0xf;
}
}
};
class SPURecompiler;
class SPURecompilerCore : public CPUDecoder

View File

@ -1018,7 +1018,7 @@ void SPUThread::StopAndSignal(u32 code)
case 0x003:
{
GPR[3]._u32[3] = m_code3_func(*this);
GPR[3]._u64[1] = m_code3_func(*this);
break;
}

View File

@ -105,6 +105,66 @@ enum
SPU_RdSigNotify2_offs = 0x1C00C,
};
#define mmToU64Ptr(x) ((u64*)(&x))
#define mmToU32Ptr(x) ((u32*)(&x))
#define mmToU16Ptr(x) ((u16*)(&x))
#define mmToU8Ptr(x) ((u8*)(&x))
struct g_imm_table_struct
{
//u16 cntb_table[65536];
__m128i fsmb_table[65536];
__m128i fsmh_table[256];
__m128i fsm_table[16];
__m128i sldq_pshufb[32];
__m128i srdq_pshufb[32];
__m128i rldq_pshufb[16];
g_imm_table_struct()
{
/*static_assert(offsetof(g_imm_table_struct, cntb_table) == 0, "offsetof(cntb_table) != 0");
for (u32 i = 0; i < sizeof(cntb_table) / sizeof(cntb_table[0]); i++)
{
u32 cnt_low = 0, cnt_high = 0;
for (u32 j = 0; j < 8; j++)
{
cnt_low += (i >> j) & 1;
cnt_high += (i >> (j + 8)) & 1;
}
cntb_table[i] = (cnt_high << 8) | cnt_low;
}*/
for (u32 i = 0; i < sizeof(fsm_table) / sizeof(fsm_table[0]); i++)
{
for (u32 j = 0; j < 4; j++) mmToU32Ptr(fsm_table[i])[j] = (i & (1 << j)) ? ~0 : 0;
}
for (u32 i = 0; i < sizeof(fsmh_table) / sizeof(fsmh_table[0]); i++)
{
for (u32 j = 0; j < 8; j++) mmToU16Ptr(fsmh_table[i])[j] = (i & (1 << j)) ? ~0 : 0;
}
for (u32 i = 0; i < sizeof(fsmb_table) / sizeof(fsmb_table[0]); i++)
{
for (u32 j = 0; j < 16; j++) mmToU8Ptr(fsmb_table[i])[j] = (i & (1 << j)) ? ~0 : 0;
}
for (u32 i = 0; i < sizeof(sldq_pshufb) / sizeof(sldq_pshufb[0]); i++)
{
for (u32 j = 0; j < 16; j++) mmToU8Ptr(sldq_pshufb[i])[j] = (u8)(j - i);
}
for (u32 i = 0; i < sizeof(srdq_pshufb) / sizeof(srdq_pshufb[0]); i++)
{
for (u32 j = 0; j < 16; j++) mmToU8Ptr(srdq_pshufb[i])[j] = (j + i > 15) ? 0xff : (u8)(j + i);
}
for (u32 i = 0; i < sizeof(rldq_pshufb) / sizeof(rldq_pshufb[0]); i++)
{
for (u32 j = 0; j < 16; j++) mmToU8Ptr(rldq_pshufb[i])[j] = (u8)(j - i) & 0xf;
}
}
};
extern const g_imm_table_struct g_imm_table;
//Floating point status and control register. Unsure if this is one of the GPRs or SPRs
//Is 128 bits, but bits 0-19, 24-28, 32-49, 56-60, 64-81, 88-92, 96-115, 120-124 are unused
class FPSCR
@ -451,7 +511,7 @@ public:
void WriteLS128(const u32 lsa, const u128& data) const { vm::write128(lsa + m_offset, data); }
std::function<void(SPUThread& SPU)> m_custom_task;
std::function<u32(SPUThread& SPU)> m_code3_func;
std::function<u64(SPUThread& SPU)> m_code3_func;
public:
SPUThread(CPUThreadType type = CPU_THREAD_SPU);

View File

@ -170,13 +170,138 @@ s64 spursInit(
SPU.GPR[4]._u64[1] = spurs.addr();
return SPU.FastCall(SPU.PC);
#endif
//SPU.WriteLS32(0x808, 2); // hack for cellSpursModuleExit
//SPU.WriteLS32(0x260, 3); // hack for cellSpursModulePollStatus
//SPU.WriteLS32(0x264, 0x35000000); // bi $0
SPU.WriteLS32(SPU.ReadLS32(0x1e0), 2); // hack for cellSpursModuleExit
/*if (!isSecond)*/ SPU.m_code3_func = [spurs, num](SPUThread& SPU) -> u64 // first variant
{
LV2_LOCK(0);
const u32 arg1 = SPU.GPR[3]._u32[3];
u32 var0 = SPU.ReadLS32(0x1d8);
u32 var1 = SPU.ReadLS32(0x1dc);
u128 wklA = vm::read128(spurs.addr() + 0x20);
u128 wklB = vm::read128(spurs.addr() + 0x30);
u128 savedA = SPU.ReadLS128(0x180);
u128 savedB = SPU.ReadLS128(0x190);
u128 vAA; vAA.vi = _mm_sub_epi32(wklA.vi, savedA.vi);
u128 vBB; vBB.vi = _mm_sub_epi32(wklB.vi, savedB.vi);
u128 vAABB; vAABB.vi = (arg1 == 0) ? _mm_add_epi32(vAA.vi, _mm_andnot_si128(g_imm_table.fsmb_table[0x8000 >> var1], vBB.vi)) : vAA.vi;
u32 vNUM = 0x20;
u64 vRES = 0x20ull << 32;
u128 vSET = {};
if (spurs->m.x72.read_relaxed() & (1 << num))
{
SPU.WriteLS8(0x1eb, 0); // var4
if (arg1 && var1 != 0x20)
{
spurs->m.x72._and_not(1 << num);
}
}
else
{
u128 wklReadyCount0 = vm::read128(spurs.addr() + 0x0);
u128 wklReadyCount1 = vm::read128(spurs.addr() + 0x10);
u128 savedC = SPU.ReadLS128(0x1A0);
u128 savedD = SPU.ReadLS128(0x1B0);
u128 vRC; vRC.vi = _mm_add_epi32(_mm_min_epu8(wklReadyCount0.vi, _mm_set1_epi8(8)), _mm_min_epu8(wklReadyCount1.vi, _mm_set1_epi8(8)));
u32 wklFlag = spurs->m.wklFlag.flag.read_relaxed();
u32 flagRecv = spurs->m.flagRecv.read_relaxed();
u128 vFM; vFM.vi = g_imm_table.fsmb_table[wklFlag == 0 ? 0x8000 >> flagRecv : 0];
u128 wklSet1; wklSet1.vi = g_imm_table.fsmb_table[spurs->m.wklSet1.read_relaxed()];
u128 vFMS1; vFMS1.vi = vFM.vi | wklSet1.vi;
u128 vFMV1; vFMV1.vi = g_imm_table.fsmb_table[(wklFlag == 0 ? 0x8000 >> flagRecv : 0) >> var1];
u32 var5 = SPU.ReadLS32(0x1ec);
u128 wklMinCnt = vm::read128(spurs.addr() + 0x40);
u128 wklMaxCnt = vm::read128(spurs.addr() + 0x50);
u128 vCC; vCC.vi = _mm_andnot_si128(vFMS1.vi,
_mm_cmpeq_epi8(wklReadyCount0.vi, _mm_set1_epi8(0)) | _mm_cmple_epu8(vRC.vi, vAABB.vi)) |
_mm_cmple_epu8(wklMaxCnt.vi, vAABB.vi) |
_mm_cmpeq_epi8(savedC.vi, _mm_set1_epi8(0)) |
g_imm_table.fsmb_table[(~var5) >> 16];
u128 vCCH1; vCCH1.vi = _mm_andnot_si128(vCC.vi,
_mm_set1_epi8((char)0x80) & (vFMS1.vi | _mm_cmpgt_epu8(wklReadyCount0.vi, vAABB.vi)) |
_mm_set1_epi8(0x7f) & savedC.vi);
u128 vCCL1; vCCL1.vi = _mm_andnot_si128(vCC.vi,
_mm_set1_epi8((char)0x80) & vFMV1.vi |
_mm_set1_epi8(0x40) & _mm_cmpgt_epu8(vAABB.vi, _mm_set1_epi8(0)) & _mm_cmpgt_epu8(wklMinCnt.vi, vAABB.vi) |
_mm_set1_epi8(0x3c) & _mm_slli_epi32(_mm_sub_epi32(_mm_set1_epi8(8), vAABB.vi), 2) |
_mm_set1_epi8(0x02) & _mm_cmpeq_epi8(savedD.vi, _mm_set1_epi8((s8)var0)) |
_mm_set1_epi8(0x01));
u128 vSTAT; vSTAT.vi =
_mm_set1_epi8(0x01) & _mm_cmpgt_epu8(wklReadyCount0.vi, vAABB.vi) |
_mm_set1_epi8(0x02) & wklSet1.vi |
_mm_set1_epi8(0x04) & vFM.vi;
for (s32 i = 0, max = -1; i < 0x10; i++)
{
const s32 value = ((s32)vCCH1.u8r[i] << 8) | ((s32)vCCL1.u8r[i]);
if (value > max && (vCC.u8r[i] & 1) == 0)
{
vNUM = i;
max = value;
}
}
if (vNUM < 0x10)
{
vRES == ((u64)vNUM << 32) | vSTAT.u8r[vNUM];
vSET.u8r[vNUM] = 0x01;
}
SPU.WriteLS8(0x1eb, vNUM == 0x20);
if (!arg1 || var1 == vNUM)
{
spurs->m.wklSet1._and_not(be_t<u16>::make(0x8000 >> vNUM));
}
if (vNUM == flagRecv)
{
spurs->m.wklFlag.flag |= be_t<u32>::make(-1);
}
}
if (arg1 == 0)
{
vAA.vi = _mm_add_epi32(vAA.vi, vSET.vi);
vm::write128(spurs.addr() + 0x20, vAA); // update wklA
SPU.WriteLS128(0x180, vSET); // update savedA
SPU.WriteLS32(0x1dc, vNUM); // update var1
}
if (arg1 == 1 && vNUM != var1)
{
vBB.vi = _mm_add_epi32(vBB.vi, vSET.vi);
vm::write128(spurs.addr() + 0x30, vBB); // update wklB
SPU.WriteLS128(0x190, vSET); // update savedB
}
else
{
SPU.WriteLS128(0x190, {}); // update savedB
}
return vRES;
};
//else SPU.m_code3_func = [spurs, num](SPUThread& SPU) -> u64 // second variant
//{
//
//};
if (SPU.m_code3_func)
{
const u32 addr = SPU.ReadLS32(0x1e4);
SPU.WriteLS32(addr + 0, 3); // hack for cellSpursModulePollStatus
SPU.WriteLS32(addr + 4, 0x35000000); // bi $0
}
SPU.WriteLS128(0x1c0, u128::from32r(0, spurs.addr(), num, 0x1f));
u32 wid = 0x20;
u32 stat = 0;
while (true)
{
if (Emu.IsStopped())
@ -199,11 +324,19 @@ s64 spursInit(
if (!isSecond) SPU.WriteLS16(0x1e8, 0);
// run workload:
if (wid <= 0x20)
{
SPU.GPR[1]._u32[3] = 0x3FFB0;
SPU.GPR[3]._u32[3] = 0x100;
SPU.GPR[4]._u64[1] = wkl.data;
SPU.GPR[5]._u32[3] = 0;
SPU.GPR[5]._u32[3] = stat;
SPU.FastCall(0xa00);
}
else
{
// hack
std::this_thread::sleep_for(std::chrono::milliseconds(1));
}
// check status:
auto status = SPU.SPU.Status.GetValue();
@ -217,8 +350,19 @@ s64 spursInit(
}
// get workload id:
//SPU.GPR[3].clear();
//wid = SPU.m_code3_func(SPU);
SPU.GPR[3].clear();
if (SPU.m_code3_func)
{
u64 res = SPU.m_code3_func(SPU);
stat = (u32)(res);
wid = (u32)(res >> 32);
}
else
{
SPU.FastCall(0x290);
stat = SPU.GPR[3]._u32[2];
wid = SPU.GPR[3]._u32[3];
}
}
})->GetId();