From 237ab974dd026638fe4ad886f87e5b4249dedacd Mon Sep 17 00:00:00 2001 From: Nekotekina Date: Tue, 7 Oct 2014 17:35:44 +0400 Subject: [PATCH] SPURS kernel v1 --- Utilities/BEType.h | 90 +++++++++++-- Utilities/GNU.h | 26 ++++ rpcs3/Emu/Cell/SPURecompiler.h | 58 -------- rpcs3/Emu/Cell/SPUThread.cpp | 2 +- rpcs3/Emu/Cell/SPUThread.h | 62 ++++++++- rpcs3/Emu/SysCalls/Modules/cellSpurs.cpp | 164 +++++++++++++++++++++-- 6 files changed, 323 insertions(+), 79 deletions(-) diff --git a/Utilities/BEType.h b/Utilities/BEType.h index 29848bc4ae..12bb176531 100644 --- a/Utilities/BEType.h +++ b/Utilities/BEType.h @@ -1,20 +1,92 @@ #pragma once -#include - union u128 { + __m128 vf; + __m128i vi; + u64 _u64[2]; s64 _s64[2]; + + class u64_reversed_array_2 + { + u64 data[2]; + + public: + u64& operator [] (s32 index) + { + return data[1 - index]; + } + + const u64& operator [] (s32 index) const + { + return data[1 - index]; + } + + } u64r; + u32 _u32[4]; s32 _s32[4]; + + class u32_reversed_array_4 + { + u32 data[4]; + + public: + u32& operator [] (s32 index) + { + return data[3 - index]; + } + + const u32& operator [] (s32 index) const + { + return data[3 - index]; + } + + } u32r; + u16 _u16[8]; s16 _s16[8]; + + class u16_reversed_array_8 + { + u16 data[8]; + + public: + u16& operator [] (s32 index) + { + return data[7 - index]; + } + + const u16& operator [] (s32 index) const + { + return data[7 - index]; + } + + } u16r; + u8 _u8[16]; s8 _s8[16]; + + class u8_reversed_array_16 + { + u8 data[16]; + + public: + u8& operator [] (s32 index) + { + return data[15 - index]; + } + + const u8& operator [] (s32 index) const + { + return data[15 - index]; + } + + } u8r; + float _f[4]; double _d[2]; - __m128 xmm; class bit_array_128 { @@ -94,6 +166,11 @@ union u128 return ret; } + static u128 from64r(u64 _1, u64 _0 = 0) + { + return from64(_0, _1); + } + static u128 from32(u32 _0, u32 _1 = 0, u32 _2 = 0, u32 _3 = 0) { u128 ret; @@ -106,12 +183,7 @@ union u128 static u128 from32r(u32 _3, 
u32 _2 = 0, u32 _1 = 0, u32 _0 = 0) { - u128 ret; - ret._u32[0] = _0; - ret._u32[1] = _1; - ret._u32[2] = _2; - ret._u32[3] = _3; - return ret; + return from32(_0, _1, _2, _3); } static u128 fromBit(u32 bit) diff --git a/Utilities/GNU.h b/Utilities/GNU.h index b1bcbfb982..a56df8db94 100644 --- a/Utilities/GNU.h +++ b/Utilities/GNU.h @@ -1,5 +1,7 @@ #pragma once +#include + #ifdef _WIN32 #define thread_local __declspec(thread) #elif __APPLE__ @@ -222,3 +224,27 @@ static __forceinline uint64_t cntlz64(uint64_t arg) } #endif } + +static __forceinline __m128i operator & (__m128i A, __m128i B) +{ + return _mm_and_si128(A, B); +} + +static __forceinline __m128i operator | (__m128i A, __m128i B) +{ + return _mm_or_si128(A, B); +} + +// compare 16 packed unsigned byte values (greater than) +static __forceinline __m128i _mm_cmpgt_epu8(__m128i A, __m128i B) +{ + // (A xor 0x80) > (B xor 0x80) + return _mm_cmpgt_epi8(_mm_xor_si128(A, _mm_set1_epi8(-128)), _mm_xor_si128(B, _mm_set1_epi8(-128))); +} + +// compare 16 packed unsigned byte values (less or equal) +static __forceinline __m128i _mm_cmple_epu8(__m128i A, __m128i B) +{ + // ((B xor 0x80) > (A xor 0x80)) || A == B + return _mm_cmpgt_epu8(B, A) | _mm_cmpeq_epi8(A, B); +} diff --git a/rpcs3/Emu/Cell/SPURecompiler.h b/rpcs3/Emu/Cell/SPURecompiler.h index dafae1842b..12930680a9 100644 --- a/rpcs3/Emu/Cell/SPURecompiler.h +++ b/rpcs3/Emu/Cell/SPURecompiler.h @@ -9,64 +9,6 @@ using namespace asmjit::host; #define UNIMPLEMENTED() UNK(__FUNCTION__) -#define mmToU64Ptr(x) ((u64*)(&x)) -#define mmToU32Ptr(x) ((u32*)(&x)) -#define mmToU16Ptr(x) ((u16*)(&x)) -#define mmToU8Ptr(x) ((u8*)(&x)) - -struct g_imm_table_struct -{ - //u16 cntb_table[65536]; - - __m128i fsmb_table[65536]; - __m128i fsmh_table[256]; - __m128i fsm_table[16]; - - __m128i sldq_pshufb[32]; - __m128i srdq_pshufb[32]; - __m128i rldq_pshufb[16]; - - g_imm_table_struct() - { - /*static_assert(offsetof(g_imm_table_struct, cntb_table) == 0, "offsetof(cntb_table) != 
0"); - for (u32 i = 0; i < sizeof(cntb_table) / sizeof(cntb_table[0]); i++) - { - u32 cnt_low = 0, cnt_high = 0; - for (u32 j = 0; j < 8; j++) - { - cnt_low += (i >> j) & 1; - cnt_high += (i >> (j + 8)) & 1; - } - cntb_table[i] = (cnt_high << 8) | cnt_low; - }*/ - for (u32 i = 0; i < sizeof(fsm_table) / sizeof(fsm_table[0]); i++) - { - - for (u32 j = 0; j < 4; j++) mmToU32Ptr(fsm_table[i])[j] = (i & (1 << j)) ? ~0 : 0; - } - for (u32 i = 0; i < sizeof(fsmh_table) / sizeof(fsmh_table[0]); i++) - { - for (u32 j = 0; j < 8; j++) mmToU16Ptr(fsmh_table[i])[j] = (i & (1 << j)) ? ~0 : 0; - } - for (u32 i = 0; i < sizeof(fsmb_table) / sizeof(fsmb_table[0]); i++) - { - for (u32 j = 0; j < 16; j++) mmToU8Ptr(fsmb_table[i])[j] = (i & (1 << j)) ? ~0 : 0; - } - for (u32 i = 0; i < sizeof(sldq_pshufb) / sizeof(sldq_pshufb[0]); i++) - { - for (u32 j = 0; j < 16; j++) mmToU8Ptr(sldq_pshufb[i])[j] = (u8)(j - i); - } - for (u32 i = 0; i < sizeof(srdq_pshufb) / sizeof(srdq_pshufb[0]); i++) - { - for (u32 j = 0; j < 16; j++) mmToU8Ptr(srdq_pshufb[i])[j] = (j + i > 15) ? 
0xff : (u8)(j + i); - } - for (u32 i = 0; i < sizeof(rldq_pshufb) / sizeof(rldq_pshufb[0]); i++) - { - for (u32 j = 0; j < 16; j++) mmToU8Ptr(rldq_pshufb[i])[j] = (u8)(j - i) & 0xf; - } - } -}; - class SPURecompiler; class SPURecompilerCore : public CPUDecoder diff --git a/rpcs3/Emu/Cell/SPUThread.cpp b/rpcs3/Emu/Cell/SPUThread.cpp index 04bb14b88f..ba5f75d67e 100644 --- a/rpcs3/Emu/Cell/SPUThread.cpp +++ b/rpcs3/Emu/Cell/SPUThread.cpp @@ -1018,7 +1018,7 @@ void SPUThread::StopAndSignal(u32 code) case 0x003: { - GPR[3]._u32[3] = m_code3_func(*this); + GPR[3]._u64[1] = m_code3_func(*this); break; } diff --git a/rpcs3/Emu/Cell/SPUThread.h b/rpcs3/Emu/Cell/SPUThread.h index 8eef83d71f..5553d2fc28 100644 --- a/rpcs3/Emu/Cell/SPUThread.h +++ b/rpcs3/Emu/Cell/SPUThread.h @@ -105,6 +105,66 @@ enum SPU_RdSigNotify2_offs = 0x1C00C, }; +#define mmToU64Ptr(x) ((u64*)(&x)) +#define mmToU32Ptr(x) ((u32*)(&x)) +#define mmToU16Ptr(x) ((u16*)(&x)) +#define mmToU8Ptr(x) ((u8*)(&x)) + +struct g_imm_table_struct +{ + //u16 cntb_table[65536]; + + __m128i fsmb_table[65536]; + __m128i fsmh_table[256]; + __m128i fsm_table[16]; + + __m128i sldq_pshufb[32]; + __m128i srdq_pshufb[32]; + __m128i rldq_pshufb[16]; + + g_imm_table_struct() + { + /*static_assert(offsetof(g_imm_table_struct, cntb_table) == 0, "offsetof(cntb_table) != 0"); + for (u32 i = 0; i < sizeof(cntb_table) / sizeof(cntb_table[0]); i++) + { + u32 cnt_low = 0, cnt_high = 0; + for (u32 j = 0; j < 8; j++) + { + cnt_low += (i >> j) & 1; + cnt_high += (i >> (j + 8)) & 1; + } + cntb_table[i] = (cnt_high << 8) | cnt_low; + }*/ + for (u32 i = 0; i < sizeof(fsm_table) / sizeof(fsm_table[0]); i++) + { + + for (u32 j = 0; j < 4; j++) mmToU32Ptr(fsm_table[i])[j] = (i & (1 << j)) ? ~0 : 0; + } + for (u32 i = 0; i < sizeof(fsmh_table) / sizeof(fsmh_table[0]); i++) + { + for (u32 j = 0; j < 8; j++) mmToU16Ptr(fsmh_table[i])[j] = (i & (1 << j)) ? 
~0 : 0; + } + for (u32 i = 0; i < sizeof(fsmb_table) / sizeof(fsmb_table[0]); i++) + { + for (u32 j = 0; j < 16; j++) mmToU8Ptr(fsmb_table[i])[j] = (i & (1 << j)) ? ~0 : 0; + } + for (u32 i = 0; i < sizeof(sldq_pshufb) / sizeof(sldq_pshufb[0]); i++) + { + for (u32 j = 0; j < 16; j++) mmToU8Ptr(sldq_pshufb[i])[j] = (u8)(j - i); + } + for (u32 i = 0; i < sizeof(srdq_pshufb) / sizeof(srdq_pshufb[0]); i++) + { + for (u32 j = 0; j < 16; j++) mmToU8Ptr(srdq_pshufb[i])[j] = (j + i > 15) ? 0xff : (u8)(j + i); + } + for (u32 i = 0; i < sizeof(rldq_pshufb) / sizeof(rldq_pshufb[0]); i++) + { + for (u32 j = 0; j < 16; j++) mmToU8Ptr(rldq_pshufb[i])[j] = (u8)(j - i) & 0xf; + } + } +}; + +extern const g_imm_table_struct g_imm_table; + //Floating point status and control register. Unsure if this is one of the GPRs or SPRs //Is 128 bits, but bits 0-19, 24-28, 32-49, 56-60, 64-81, 88-92, 96-115, 120-124 are unused class FPSCR @@ -451,7 +511,7 @@ public: void WriteLS128(const u32 lsa, const u128& data) const { vm::write128(lsa + m_offset, data); } std::function m_custom_task; - std::function m_code3_func; + std::function m_code3_func; public: SPUThread(CPUThreadType type = CPU_THREAD_SPU); diff --git a/rpcs3/Emu/SysCalls/Modules/cellSpurs.cpp b/rpcs3/Emu/SysCalls/Modules/cellSpurs.cpp index ac46056b38..50c6842697 100644 --- a/rpcs3/Emu/SysCalls/Modules/cellSpurs.cpp +++ b/rpcs3/Emu/SysCalls/Modules/cellSpurs.cpp @@ -170,13 +170,138 @@ s64 spursInit( SPU.GPR[4]._u64[1] = spurs.addr(); return SPU.FastCall(SPU.PC); #endif - //SPU.WriteLS32(0x808, 2); // hack for cellSpursModuleExit - //SPU.WriteLS32(0x260, 3); // hack for cellSpursModulePollStatus - //SPU.WriteLS32(0x264, 0x35000000); // bi $0 + SPU.WriteLS32(SPU.ReadLS32(0x1e0), 2); // hack for cellSpursModuleExit + + /*if (!isSecond)*/ SPU.m_code3_func = [spurs, num](SPUThread& SPU) -> u64 // first variant + { + LV2_LOCK(0); + + const u32 arg1 = SPU.GPR[3]._u32[3]; + u32 var0 = SPU.ReadLS32(0x1d8); + u32 var1 = SPU.ReadLS32(0x1dc); + 
u128 wklA = vm::read128(spurs.addr() + 0x20); + u128 wklB = vm::read128(spurs.addr() + 0x30); + u128 savedA = SPU.ReadLS128(0x180); + u128 savedB = SPU.ReadLS128(0x190); + u128 vAA; vAA.vi = _mm_sub_epi32(wklA.vi, savedA.vi); + u128 vBB; vBB.vi = _mm_sub_epi32(wklB.vi, savedB.vi); + u128 vAABB; vAABB.vi = (arg1 == 0) ? _mm_add_epi32(vAA.vi, _mm_andnot_si128(g_imm_table.fsmb_table[0x8000 >> var1], vBB.vi)) : vAA.vi; + + u32 vNUM = 0x20; + u64 vRES = 0x20ull << 32; + u128 vSET = {}; + + if (spurs->m.x72.read_relaxed() & (1 << num)) + { + SPU.WriteLS8(0x1eb, 0); // var4 + if (arg1 && var1 != 0x20) + { + spurs->m.x72._and_not(1 << num); + } + } + else + { + u128 wklReadyCount0 = vm::read128(spurs.addr() + 0x0); + u128 wklReadyCount1 = vm::read128(spurs.addr() + 0x10); + u128 savedC = SPU.ReadLS128(0x1A0); + u128 savedD = SPU.ReadLS128(0x1B0); + u128 vRC; vRC.vi = _mm_add_epi32(_mm_min_epu8(wklReadyCount0.vi, _mm_set1_epi8(8)), _mm_min_epu8(wklReadyCount1.vi, _mm_set1_epi8(8))); + u32 wklFlag = spurs->m.wklFlag.flag.read_relaxed(); + u32 flagRecv = spurs->m.flagRecv.read_relaxed(); + u128 vFM; vFM.vi = g_imm_table.fsmb_table[wklFlag == 0 ? 0x8000 >> flagRecv : 0]; + u128 wklSet1; wklSet1.vi = g_imm_table.fsmb_table[spurs->m.wklSet1.read_relaxed()]; + u128 vFMS1; vFMS1.vi = vFM.vi | wklSet1.vi; + u128 vFMV1; vFMV1.vi = g_imm_table.fsmb_table[(wklFlag == 0 ? 
0x8000 >> flagRecv : 0) >> var1]; + u32 var5 = SPU.ReadLS32(0x1ec); + u128 wklMinCnt = vm::read128(spurs.addr() + 0x40); + u128 wklMaxCnt = vm::read128(spurs.addr() + 0x50); + u128 vCC; vCC.vi = _mm_andnot_si128(vFMS1.vi, + _mm_cmpeq_epi8(wklReadyCount0.vi, _mm_set1_epi8(0)) | _mm_cmple_epu8(vRC.vi, vAABB.vi)) | + _mm_cmple_epu8(wklMaxCnt.vi, vAABB.vi) | + _mm_cmpeq_epi8(savedC.vi, _mm_set1_epi8(0)) | + g_imm_table.fsmb_table[(~var5) >> 16]; + u128 vCCH1; vCCH1.vi = _mm_andnot_si128(vCC.vi, + _mm_set1_epi8((char)0x80) & (vFMS1.vi | _mm_cmpgt_epu8(wklReadyCount0.vi, vAABB.vi)) | + _mm_set1_epi8(0x7f) & savedC.vi); + u128 vCCL1; vCCL1.vi = _mm_andnot_si128(vCC.vi, + _mm_set1_epi8((char)0x80) & vFMV1.vi | + _mm_set1_epi8(0x40) & _mm_cmpgt_epu8(vAABB.vi, _mm_set1_epi8(0)) & _mm_cmpgt_epu8(wklMinCnt.vi, vAABB.vi) | + _mm_set1_epi8(0x3c) & _mm_slli_epi32(_mm_sub_epi32(_mm_set1_epi8(8), vAABB.vi), 2) | + _mm_set1_epi8(0x02) & _mm_cmpeq_epi8(savedD.vi, _mm_set1_epi8((s8)var0)) | + _mm_set1_epi8(0x01)); + u128 vSTAT; vSTAT.vi = + _mm_set1_epi8(0x01) & _mm_cmpgt_epu8(wklReadyCount0.vi, vAABB.vi) | + _mm_set1_epi8(0x02) & wklSet1.vi | + _mm_set1_epi8(0x04) & vFM.vi; + + for (s32 i = 0, max = -1; i < 0x10; i++) + { + const s32 value = ((s32)vCCH1.u8r[i] << 8) | ((s32)vCCL1.u8r[i]); + if (value > max && (vCC.u8r[i] & 1) == 0) + { + vNUM = i; + max = value; + } + } + + if (vNUM < 0x10) + { + vRES = ((u64)vNUM << 32) | vSTAT.u8r[vNUM]; // assign (was a discarded '==' comparison): selected index in the high 32 bits, its vSTAT byte in the low bits + vSET.u8r[vNUM] = 0x01; + } + + SPU.WriteLS8(0x1eb, vNUM == 0x20); + + if (!arg1 || var1 == vNUM) + { + spurs->m.wklSet1._and_not(be_t::make(vNUM < 0x10 ? 0x8000 >> vNUM : 0)); // vNUM is still 0x20 when the loop selected nothing; shifting an int by 32 is undefined, so clear no bit in that case + } + + if (vNUM == flagRecv) + { + spurs->m.wklFlag.flag |= be_t::make(-1); + } + } + + if (arg1 == 0) + { + vAA.vi = _mm_add_epi32(vAA.vi, vSET.vi); + vm::write128(spurs.addr() + 0x20, vAA); // update wklA + + SPU.WriteLS128(0x180, vSET); // update savedA + SPU.WriteLS32(0x1dc, vNUM); // update var1 + } + + if (arg1 == 1 && vNUM != var1) + { + vBB.vi = _mm_add_epi32(vBB.vi, vSET.vi); + 
vm::write128(spurs.addr() + 0x30, vBB); // update wklB + + SPU.WriteLS128(0x190, vSET); // update savedB + } + else + { + SPU.WriteLS128(0x190, {}); // update savedB + } + + return vRES; + }; + //else SPU.m_code3_func = [spurs, num](SPUThread& SPU) -> u64 // second variant + //{ + // + //}; + + if (SPU.m_code3_func) + { + const u32 addr = SPU.ReadLS32(0x1e4); + SPU.WriteLS32(addr + 0, 3); // hack for cellSpursModulePollStatus + SPU.WriteLS32(addr + 4, 0x35000000); // bi $0 + } SPU.WriteLS128(0x1c0, u128::from32r(0, spurs.addr(), num, 0x1f)); u32 wid = 0x20; + u32 stat = 0; while (true) { if (Emu.IsStopped()) @@ -199,11 +324,19 @@ s64 spursInit( if (!isSecond) SPU.WriteLS16(0x1e8, 0); // run workload: - SPU.GPR[1]._u32[3] = 0x3FFB0; - SPU.GPR[3]._u32[3] = 0x100; - SPU.GPR[4]._u64[1] = wkl.data; - SPU.GPR[5]._u32[3] = 0; - SPU.FastCall(0xa00); + if (wid <= 0x20) + { + SPU.GPR[1]._u32[3] = 0x3FFB0; + SPU.GPR[3]._u32[3] = 0x100; + SPU.GPR[4]._u64[1] = wkl.data; + SPU.GPR[5]._u32[3] = stat; + SPU.FastCall(0xa00); + } + else + { + // hack + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + } // check status: auto status = SPU.SPU.Status.GetValue(); @@ -217,8 +350,19 @@ s64 spursInit( } // get workload id: - //SPU.GPR[3].clear(); - //wid = SPU.m_code3_func(SPU); + SPU.GPR[3].clear(); + if (SPU.m_code3_func) + { + u64 res = SPU.m_code3_func(SPU); + stat = (u32)(res); + wid = (u32)(res >> 32); + } + else + { + SPU.FastCall(0x290); + stat = SPU.GPR[3]._u32[2]; + wid = SPU.GPR[3]._u32[3]; + } } })->GetId();