mirror of
https://github.com/RPCS3/rpcs3.git
synced 2024-11-17 08:11:51 +00:00
SPURecompiler improved
This commit is contained in:
parent
d2883fc554
commit
d1fbccc9ce
@ -18,26 +18,6 @@
|
||||
#define rotl32(x,r) (((u32)(x) << (r)) | ((u32)(x) >> (32 - (r))))
|
||||
#endif
|
||||
|
||||
class spu_scale_table_t
|
||||
{
|
||||
std::array<__m128, 155 + 174> m_data;
|
||||
|
||||
public:
|
||||
spu_scale_table_t()
|
||||
{
|
||||
for (s32 i = -155; i < 174; i++)
|
||||
{
|
||||
m_data[i + 155] = _mm_set1_ps(static_cast<float>(exp2(i)));
|
||||
}
|
||||
}
|
||||
|
||||
__forceinline __m128 operator [] (s32 scale) const
|
||||
{
|
||||
return m_data[scale + 155];
|
||||
}
|
||||
}
|
||||
const g_spu_scale_table;
|
||||
|
||||
|
||||
void spu_interpreter::DEFAULT(SPUThread& CPU, spu_opcode_t op)
|
||||
{
|
||||
@ -405,17 +385,17 @@ void spu_interpreter::GBB(SPUThread& CPU, spu_opcode_t op)
|
||||
|
||||
void spu_interpreter::FSM(SPUThread& CPU, spu_opcode_t op)
|
||||
{
|
||||
CPU.GPR[op.rt].vi = g_imm_table.fsm_table[CPU.GPR[op.ra]._u32[3] & 0xf];
|
||||
CPU.GPR[op.rt] = g_spu_imm.fsm[CPU.GPR[op.ra]._u32[3] & 0xf];
|
||||
}
|
||||
|
||||
void spu_interpreter::FSMH(SPUThread& CPU, spu_opcode_t op)
|
||||
{
|
||||
CPU.GPR[op.rt].vi = g_imm_table.fsmh_table[CPU.GPR[op.ra]._u32[3] & 0xff];
|
||||
CPU.GPR[op.rt] = g_spu_imm.fsmh[CPU.GPR[op.ra]._u32[3] & 0xff];
|
||||
}
|
||||
|
||||
void spu_interpreter::FSMB(SPUThread& CPU, spu_opcode_t op)
|
||||
{
|
||||
CPU.GPR[op.rt].vi = g_imm_table.fsmb_table[CPU.GPR[op.ra]._u32[3] & 0xffff];
|
||||
CPU.GPR[op.rt] = g_spu_imm.fsmb[CPU.GPR[op.ra]._u32[3] & 0xffff];
|
||||
}
|
||||
|
||||
void spu_interpreter::FREST(SPUThread& CPU, spu_opcode_t op)
|
||||
@ -436,17 +416,17 @@ void spu_interpreter::LQX(SPUThread& CPU, spu_opcode_t op)
|
||||
|
||||
void spu_interpreter::ROTQBYBI(SPUThread& CPU, spu_opcode_t op)
|
||||
{
|
||||
CPU.GPR[op.rt].vi = _mm_shuffle_epi8(CPU.GPR[op.ra].vi, g_imm_table.rldq_pshufb[CPU.GPR[op.rb]._u32[3] >> 3 & 0xf]);
|
||||
CPU.GPR[op.rt].vi = _mm_shuffle_epi8(CPU.GPR[op.ra].vi, g_spu_imm.rldq_pshufb[CPU.GPR[op.rb]._u32[3] >> 3 & 0xf].vi);
|
||||
}
|
||||
|
||||
void spu_interpreter::ROTQMBYBI(SPUThread& CPU, spu_opcode_t op)
|
||||
{
|
||||
CPU.GPR[op.rt].vi = _mm_shuffle_epi8(CPU.GPR[op.ra].vi, g_imm_table.srdq_pshufb[-(CPU.GPR[op.rb]._s32[3] >> 3) & 0x1f]);
|
||||
CPU.GPR[op.rt].vi = _mm_shuffle_epi8(CPU.GPR[op.ra].vi, g_spu_imm.srdq_pshufb[-(CPU.GPR[op.rb]._s32[3] >> 3) & 0x1f].vi);
|
||||
}
|
||||
|
||||
void spu_interpreter::SHLQBYBI(SPUThread& CPU, spu_opcode_t op)
|
||||
{
|
||||
CPU.GPR[op.rt].vi = _mm_shuffle_epi8(CPU.GPR[op.ra].vi, g_imm_table.sldq_pshufb[CPU.GPR[op.rb]._u32[3] >> 3 & 0x1f]);
|
||||
CPU.GPR[op.rt].vi = _mm_shuffle_epi8(CPU.GPR[op.ra].vi, g_spu_imm.sldq_pshufb[CPU.GPR[op.rb]._u32[3] >> 3 & 0x1f].vi);
|
||||
}
|
||||
|
||||
void spu_interpreter::CBX(SPUThread& CPU, spu_opcode_t op)
|
||||
@ -500,17 +480,17 @@ void spu_interpreter::SHLQBI(SPUThread& CPU, spu_opcode_t op)
|
||||
|
||||
void spu_interpreter::ROTQBY(SPUThread& CPU, spu_opcode_t op)
|
||||
{
|
||||
CPU.GPR[op.rt].vi = _mm_shuffle_epi8(CPU.GPR[op.ra].vi, g_imm_table.rldq_pshufb[CPU.GPR[op.rb]._u32[3] & 0xf]);
|
||||
CPU.GPR[op.rt].vi = _mm_shuffle_epi8(CPU.GPR[op.ra].vi, g_spu_imm.rldq_pshufb[CPU.GPR[op.rb]._u32[3] & 0xf].vi);
|
||||
}
|
||||
|
||||
void spu_interpreter::ROTQMBY(SPUThread& CPU, spu_opcode_t op)
|
||||
{
|
||||
CPU.GPR[op.rt].vi = _mm_shuffle_epi8(CPU.GPR[op.ra].vi, g_imm_table.srdq_pshufb[-CPU.GPR[op.rb]._s32[3] & 0x1f]);
|
||||
CPU.GPR[op.rt].vi = _mm_shuffle_epi8(CPU.GPR[op.ra].vi, g_spu_imm.srdq_pshufb[-CPU.GPR[op.rb]._s32[3] & 0x1f].vi);
|
||||
}
|
||||
|
||||
void spu_interpreter::SHLQBY(SPUThread& CPU, spu_opcode_t op)
|
||||
{
|
||||
CPU.GPR[op.rt].vi = _mm_shuffle_epi8(CPU.GPR[op.ra].vi, g_imm_table.sldq_pshufb[CPU.GPR[op.rb]._u32[3] & 0x1f]);
|
||||
CPU.GPR[op.rt].vi = _mm_shuffle_epi8(CPU.GPR[op.ra].vi, g_spu_imm.sldq_pshufb[CPU.GPR[op.rb]._u32[3] & 0x1f].vi);
|
||||
}
|
||||
|
||||
void spu_interpreter::ORX(SPUThread& CPU, spu_opcode_t op)
|
||||
@ -569,17 +549,17 @@ void spu_interpreter::SHLQBII(SPUThread& CPU, spu_opcode_t op)
|
||||
|
||||
void spu_interpreter::ROTQBYI(SPUThread& CPU, spu_opcode_t op)
|
||||
{
|
||||
CPU.GPR[op.rt].vi = _mm_shuffle_epi8(CPU.GPR[op.ra].vi, g_imm_table.rldq_pshufb[op.i7 & 0xf]);
|
||||
CPU.GPR[op.rt].vi = _mm_shuffle_epi8(CPU.GPR[op.ra].vi, g_spu_imm.rldq_pshufb[op.i7 & 0xf].vi);
|
||||
}
|
||||
|
||||
void spu_interpreter::ROTQMBYI(SPUThread& CPU, spu_opcode_t op)
|
||||
{
|
||||
CPU.GPR[op.rt].vi = _mm_shuffle_epi8(CPU.GPR[op.ra].vi, g_imm_table.srdq_pshufb[-op.si7 & 0x1f]);
|
||||
CPU.GPR[op.rt].vi = _mm_shuffle_epi8(CPU.GPR[op.ra].vi, g_spu_imm.srdq_pshufb[-op.si7 & 0x1f].vi);
|
||||
}
|
||||
|
||||
void spu_interpreter::SHLQBYI(SPUThread& CPU, spu_opcode_t op)
|
||||
{
|
||||
CPU.GPR[op.rt].vi = _mm_shuffle_epi8(CPU.GPR[op.ra].vi, g_imm_table.sldq_pshufb[op.i7 & 0x1f]);
|
||||
CPU.GPR[op.rt].vi = _mm_shuffle_epi8(CPU.GPR[op.ra].vi, g_spu_imm.sldq_pshufb[op.i7 & 0x1f].vi);
|
||||
}
|
||||
|
||||
void spu_interpreter::NOP(SPUThread& CPU, spu_opcode_t op)
|
||||
@ -770,9 +750,9 @@ void spu_interpreter::CEQ(SPUThread& CPU, spu_opcode_t op)
|
||||
|
||||
void spu_interpreter::MPYHHU(SPUThread& CPU, spu_opcode_t op)
|
||||
{
|
||||
const auto a = _mm_srli_epi32(CPU.GPR[op.ra].vi, 16);
|
||||
const auto b = _mm_srli_epi32(CPU.GPR[op.rb].vi, 16);
|
||||
CPU.GPR[op.rt].vi = _mm_or_si128(_mm_slli_epi32(_mm_mulhi_epu16(a, b), 16), _mm_mullo_epi16(a, b));
|
||||
const auto a = CPU.GPR[op.ra].vi;
|
||||
const auto b = CPU.GPR[op.rb].vi;
|
||||
CPU.GPR[op.rt].vi = _mm_or_si128(_mm_srli_epi32(_mm_mullo_epi16(a, b), 16), _mm_and_si128(_mm_mulhi_epu16(a, b), _mm_set1_epi32(0xffff0000)));
|
||||
}
|
||||
|
||||
void spu_interpreter::ADDX(SPUThread& CPU, spu_opcode_t op)
|
||||
@ -810,9 +790,9 @@ void spu_interpreter::MPYHHA(SPUThread& CPU, spu_opcode_t op)
|
||||
|
||||
void spu_interpreter::MPYHHAU(SPUThread& CPU, spu_opcode_t op)
|
||||
{
|
||||
const auto a = _mm_srli_epi32(CPU.GPR[op.ra].vi, 16);
|
||||
const auto b = _mm_srli_epi32(CPU.GPR[op.rb].vi, 16);
|
||||
CPU.GPR[op.rt].vi = _mm_add_epi32(CPU.GPR[op.rt].vi, _mm_or_si128(_mm_slli_epi32(_mm_mulhi_epu16(a, b), 16), _mm_mullo_epi16(a, b)));
|
||||
const auto a = CPU.GPR[op.ra].vi;
|
||||
const auto b = CPU.GPR[op.rb].vi;
|
||||
CPU.GPR[op.rt].vi = _mm_add_epi32(CPU.GPR[op.rt].vi, _mm_or_si128(_mm_srli_epi32(_mm_mullo_epi16(a, b), 16), _mm_and_si128(_mm_mulhi_epu16(a, b), _mm_set1_epi32(0xffff0000))));
|
||||
}
|
||||
|
||||
void spu_interpreter::FSCRRD(SPUThread& CPU, spu_opcode_t op)
|
||||
@ -890,9 +870,9 @@ void spu_interpreter::DFCMEQ(SPUThread& CPU, spu_opcode_t op)
|
||||
|
||||
void spu_interpreter::MPYU(SPUThread& CPU, spu_opcode_t op)
|
||||
{
|
||||
const auto a = _mm_and_si128(CPU.GPR[op.ra].vi, _mm_set1_epi32(0xffff));
|
||||
const auto b = _mm_and_si128(CPU.GPR[op.rb].vi, _mm_set1_epi32(0xffff));
|
||||
CPU.GPR[op.rt].vi = _mm_or_si128(_mm_slli_epi32(_mm_mulhi_epu16(a, b), 16), _mm_mullo_epi16(a, b));
|
||||
const auto a = CPU.GPR[op.ra].vi;
|
||||
const auto b = CPU.GPR[op.rb].vi;
|
||||
CPU.GPR[op.rt].vi = _mm_or_si128(_mm_slli_epi32(_mm_mulhi_epu16(a, b), 16), _mm_and_si128(_mm_mullo_epi16(a, b), _mm_set1_epi32(0xffff)));
|
||||
}
|
||||
|
||||
void spu_interpreter::CEQB(SPUThread& CPU, spu_opcode_t op)
|
||||
@ -907,8 +887,8 @@ void spu_interpreter::FI(SPUThread& CPU, spu_opcode_t op)
|
||||
const auto mask_sf = _mm_set1_epi32(0x000003ff); // step fraction mask
|
||||
const auto mask_yf = _mm_set1_epi32(0x0007ffff); // Y fraction mask (bits 13..31)
|
||||
const auto base = _mm_or_ps(_mm_and_ps(CPU.GPR[op.rb].vf, mask_bf), _mm_castsi128_ps(_mm_set1_epi32(0x3f800000)));
|
||||
const auto step = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(CPU.GPR[op.rb].vi, mask_sf)), g_spu_scale_table[-13]);
|
||||
const auto y = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(CPU.GPR[op.ra].vi, mask_yf)), g_spu_scale_table[-19]);
|
||||
const auto step = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(CPU.GPR[op.rb].vi, mask_sf)), _mm_set1_ps(exp2f(-13)));
|
||||
const auto y = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(CPU.GPR[op.ra].vi, mask_yf)), _mm_set1_ps(exp2f(-19)));
|
||||
CPU.GPR[op.rt].vf = _mm_or_ps(_mm_and_ps(mask_se, CPU.GPR[op.rb].vf), _mm_andnot_ps(mask_se, _mm_sub_ps(base, _mm_mul_ps(step, y))));
|
||||
}
|
||||
|
||||
@ -923,27 +903,27 @@ void spu_interpreter::HEQ(SPUThread& CPU, spu_opcode_t op)
|
||||
|
||||
void spu_interpreter::CFLTS(SPUThread& CPU, spu_opcode_t op)
|
||||
{
|
||||
const auto scaled = _mm_mul_ps(CPU.GPR[op.ra].vf, g_spu_scale_table[173 - op.i8]);
|
||||
const auto scaled = _mm_mul_ps(CPU.GPR[op.ra].vf, g_spu_imm.scale[173 - op.i8]);
|
||||
CPU.GPR[op.rt].vi = _mm_xor_si128(_mm_cvttps_epi32(scaled), _mm_castps_si128(_mm_cmpge_ps(scaled, _mm_set1_ps(0x80000000))));
|
||||
}
|
||||
|
||||
void spu_interpreter::CFLTU(SPUThread& CPU, spu_opcode_t op)
|
||||
{
|
||||
const auto scaled1 = _mm_max_ps(_mm_mul_ps(CPU.GPR[op.ra].vf, g_spu_scale_table[173 - op.i8]), _mm_set1_ps(0.0f));
|
||||
const auto scaled1 = _mm_max_ps(_mm_mul_ps(CPU.GPR[op.ra].vf, g_spu_imm.scale[173 - op.i8]), _mm_set1_ps(0.0f));
|
||||
const auto scaled2 = _mm_and_ps(_mm_sub_ps(scaled1, _mm_set1_ps(0x80000000)), _mm_cmpge_ps(scaled1, _mm_set1_ps(0x80000000)));
|
||||
CPU.GPR[op.rt].vi = _mm_or_si128(_mm_or_si128(_mm_cvttps_epi32(scaled1), _mm_cvttps_epi32(scaled2)), _mm_castps_si128(_mm_cmpge_ps(scaled1, _mm_set1_ps(0x100000000))));
|
||||
}
|
||||
|
||||
void spu_interpreter::CSFLT(SPUThread& CPU, spu_opcode_t op)
|
||||
{
|
||||
CPU.GPR[op.rt].vf = _mm_mul_ps(_mm_cvtepi32_ps(CPU.GPR[op.ra].vi), g_spu_scale_table[op.i8 - 155]);
|
||||
CPU.GPR[op.rt].vf = _mm_mul_ps(_mm_cvtepi32_ps(CPU.GPR[op.ra].vi), g_spu_imm.scale[op.i8 - 155]);
|
||||
}
|
||||
|
||||
void spu_interpreter::CUFLT(SPUThread& CPU, spu_opcode_t op)
|
||||
{
|
||||
const auto a = CPU.GPR[op.ra].vi;
|
||||
const auto fix = _mm_and_ps(_mm_castsi128_ps(_mm_srai_epi32(a, 31)), _mm_set1_ps(0x80000000));
|
||||
CPU.GPR[op.rt].vf = _mm_mul_ps(_mm_add_ps(_mm_cvtepi32_ps(_mm_and_si128(a, _mm_set1_epi32(0x7fffffff))), fix), g_spu_scale_table[op.i8 - 155]);
|
||||
CPU.GPR[op.rt].vf = _mm_mul_ps(_mm_add_ps(_mm_cvtepi32_ps(_mm_and_si128(a, _mm_set1_epi32(0x7fffffff))), fix), g_spu_imm.scale[op.i8 - 155]);
|
||||
}
|
||||
|
||||
|
||||
@ -1013,7 +993,7 @@ void spu_interpreter::BR(SPUThread& CPU, spu_opcode_t op)
|
||||
|
||||
void spu_interpreter::FSMBI(SPUThread& CPU, spu_opcode_t op)
|
||||
{
|
||||
CPU.GPR[op.rt].vi = g_imm_table.fsmb_table[op.i16];
|
||||
CPU.GPR[op.rt] = g_spu_imm.fsmb[op.i16];
|
||||
}
|
||||
|
||||
void spu_interpreter::BRSL(SPUThread& CPU, spu_opcode_t op)
|
||||
@ -1177,7 +1157,7 @@ void spu_interpreter::MPYI(SPUThread& CPU, spu_opcode_t op)
|
||||
|
||||
void spu_interpreter::MPYUI(SPUThread& CPU, spu_opcode_t op)
|
||||
{
|
||||
const auto a = _mm_and_si128(CPU.GPR[op.ra].vi, _mm_set1_epi32(0xffff));
|
||||
const auto a = CPU.GPR[op.ra].vi;
|
||||
const auto i = _mm_set1_epi32(op.si10 & 0xffff);
|
||||
CPU.GPR[op.rt].vi = _mm_or_si128(_mm_slli_epi32(_mm_mulhi_epu16(a, i), 16), _mm_mullo_epi16(a, i));
|
||||
}
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -13,8 +13,6 @@
|
||||
#include "SPUInterpreter.h"
|
||||
#include "SPURecompiler.h"
|
||||
|
||||
const g_imm_table_struct g_imm_table;
|
||||
|
||||
SPURecompilerCore::SPURecompilerCore(SPUThread& cpu)
|
||||
: m_enc(new SPURecompiler(cpu, *this))
|
||||
, inter(new SPUInterpreter(cpu))
|
||||
@ -25,11 +23,6 @@ SPURecompilerCore::SPURecompilerCore(SPUThread& cpu)
|
||||
memset(entry, 0, sizeof(entry));
|
||||
X86CpuInfo inf;
|
||||
X86CpuUtil::detect(&inf);
|
||||
if (!inf.hasFeature(kX86CpuFeatureSSE4_1))
|
||||
{
|
||||
LOG_ERROR(SPU, "SPU JIT requires SSE4.1 instruction set support");
|
||||
Emu.Pause();
|
||||
}
|
||||
}
|
||||
|
||||
SPURecompilerCore::~SPURecompilerCore()
|
||||
@ -51,12 +44,12 @@ void SPURecompilerCore::Compile(u16 pos)
|
||||
SPUDisAsm dis_asm(CPUDisAsm_InterpreterMode);
|
||||
dis_asm.offset = vm::get_ptr<u8>(CPU.offset);
|
||||
|
||||
StringLogger stringLogger;
|
||||
stringLogger.setOption(kLoggerOptionBinaryForm, true);
|
||||
//StringLogger stringLogger;
|
||||
//stringLogger.setOption(kLoggerOptionBinaryForm, true);
|
||||
|
||||
X86Compiler compiler(&runtime);
|
||||
m_enc->compiler = &compiler;
|
||||
compiler.setLogger(&stringLogger);
|
||||
//compiler.setLogger(&stringLogger);
|
||||
|
||||
compiler.addFunc(kFuncConvHost, FuncBuilder4<u32, void*, void*, void*, u32>());
|
||||
const u16 start = pos;
|
||||
@ -154,28 +147,22 @@ void SPURecompilerCore::Compile(u16 pos)
|
||||
entry[start].pointer = compiler.make();
|
||||
compiler.setLogger(nullptr); // crashes without it
|
||||
|
||||
rFile log;
|
||||
log.Open(fmt::Format("SPUjit_%d.log", GetCurrentSPUThread().GetId()), first ? rFile::write : rFile::write_append);
|
||||
log.Write(fmt::Format("========== START POSITION 0x%x ==========\n\n", start * 4));
|
||||
log.Write(std::string(stringLogger.getString()));
|
||||
if (!entry[start].pointer)
|
||||
{
|
||||
LOG_ERROR(Log::SPU, "SPURecompilerCore::Compile(pos=0x%x) failed", start * sizeof(u32));
|
||||
log.Write("========== FAILED ============\n\n");
|
||||
Emu.Pause();
|
||||
}
|
||||
else
|
||||
{
|
||||
log.Write(fmt::Format("========== COMPILED %d (excess %d), time: [start=%lld (decoding=%lld), finalize=%lld]\n\n",
|
||||
entry[start].count, excess, stamp1 - stamp0, time0, get_system_time() - stamp1));
|
||||
#ifdef _WIN32
|
||||
//if (!RtlAddFunctionTable(&info, 1, (u64)entry[start].pointer))
|
||||
//{
|
||||
// LOG_ERROR(Log::SPU, "RtlAddFunctionTable() failed");
|
||||
//}
|
||||
#endif
|
||||
}
|
||||
log.Close();
|
||||
//rFile log;
|
||||
//log.Open(fmt::Format("SPUjit_%d.log", GetCurrentSPUThread().GetId()), first ? rFile::write : rFile::write_append);
|
||||
//log.Write(fmt::Format("========== START POSITION 0x%x ==========\n\n", start * 4));
|
||||
//log.Write(std::string(stringLogger.getString()));
|
||||
//if (!entry[start].pointer)
|
||||
//{
|
||||
// LOG_ERROR(Log::SPU, "SPURecompilerCore::Compile(pos=0x%x) failed", start * sizeof(u32));
|
||||
// log.Write("========== FAILED ============\n\n");
|
||||
// Emu.Pause();
|
||||
//}
|
||||
//else
|
||||
//{
|
||||
// log.Write(fmt::Format("========== COMPILED %d (excess %d), time: [start=%lld (decoding=%lld), finalize=%lld]\n\n",
|
||||
// entry[start].count, excess, stamp1 - stamp0, time0, get_system_time() - stamp1));
|
||||
//}
|
||||
//log.Close();
|
||||
m_enc->compiler = nullptr;
|
||||
first = false;
|
||||
}
|
||||
@ -217,9 +204,6 @@ u32 SPURecompilerCore::DecodeMemory(const u32 address)
|
||||
i < (u32)pos + (u32)entry[pos].count)
|
||||
{
|
||||
runtime.release(entry[i].pointer);
|
||||
#ifdef _WIN32
|
||||
//RtlDeleteFunctionTable(&entry[i].info);
|
||||
#endif
|
||||
entry[i].pointer = nullptr;
|
||||
for (u32 j = i; j < i + (u32)entry[i].count; j++)
|
||||
{
|
||||
@ -264,7 +248,7 @@ u32 SPURecompilerCore::DecodeMemory(const u32 address)
|
||||
}
|
||||
|
||||
u32 res = pos;
|
||||
res = func(cpu, vm::get_ptr<void>(m_offset), imm_table.data(), &g_imm_table);
|
||||
res = func(cpu, vm::get_ptr<void>(m_offset), imm_table.data(), &g_spu_imm);
|
||||
|
||||
if (res & 0x1000000)
|
||||
{
|
||||
|
@ -22,6 +22,8 @@
|
||||
|
||||
#include <cfenv>
|
||||
|
||||
const g_spu_imm_table_t g_spu_imm;
|
||||
|
||||
class spu_inter_func_list_t
|
||||
{
|
||||
std::array<spu_inter_func_t, 2048> funcs;
|
||||
|
@ -315,51 +315,89 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
#define mmToU64Ptr(x) ((u64*)(&x))
|
||||
#define mmToU32Ptr(x) ((u32*)(&x))
|
||||
#define mmToU16Ptr(x) ((u16*)(&x))
|
||||
#define mmToU8Ptr(x) ((u8*)(&x))
|
||||
|
||||
struct g_imm_table_struct
|
||||
struct g_spu_imm_table_t
|
||||
{
|
||||
__m128i fsmb_table[65536];
|
||||
__m128i fsmh_table[256];
|
||||
__m128i fsm_table[16];
|
||||
u128 fsmb[65536]; // table for FSMB, FSMBI instructions
|
||||
u128 fsmh[256]; // table for FSMH instruction
|
||||
u128 fsm[16]; // table for FSM instruction
|
||||
|
||||
__m128i sldq_pshufb[32];
|
||||
__m128i srdq_pshufb[32];
|
||||
__m128i rldq_pshufb[16];
|
||||
u128 sldq_pshufb[32]; // table for SHLQBYBI, SHLQBY, SHLQBYI instructions
|
||||
u128 srdq_pshufb[32]; // table for ROTQMBYBI, ROTQMBY, ROTQMBYI instructions
|
||||
u128 rldq_pshufb[16]; // table for ROTQBYBI, ROTQBY, ROTQBYI instructions
|
||||
|
||||
g_imm_table_struct()
|
||||
class scale_table_t
|
||||
{
|
||||
for (u32 i = 0; i < sizeof(fsm_table) / sizeof(fsm_table[0]); i++)
|
||||
std::array<__m128, 155 + 174> m_data;
|
||||
|
||||
public:
|
||||
scale_table_t()
|
||||
{
|
||||
for (u32 j = 0; j < 4; j++) mmToU32Ptr(fsm_table[i])[j] = (i & (1 << j)) ? ~0 : 0;
|
||||
for (s32 i = -155; i < 174; i++)
|
||||
{
|
||||
m_data[i + 155] = _mm_set1_ps(static_cast<float>(exp2(i)));
|
||||
}
|
||||
}
|
||||
for (u32 i = 0; i < sizeof(fsmh_table) / sizeof(fsmh_table[0]); i++)
|
||||
|
||||
__forceinline __m128 operator [] (s32 scale) const
|
||||
{
|
||||
for (u32 j = 0; j < 8; j++) mmToU16Ptr(fsmh_table[i])[j] = (i & (1 << j)) ? ~0 : 0;
|
||||
return m_data[scale + 155];
|
||||
}
|
||||
for (u32 i = 0; i < sizeof(fsmb_table) / sizeof(fsmb_table[0]); i++)
|
||||
}
|
||||
const scale;
|
||||
|
||||
g_spu_imm_table_t()
|
||||
{
|
||||
for (u32 i = 0; i < sizeof(fsm) / sizeof(fsm[0]); i++)
|
||||
{
|
||||
for (u32 j = 0; j < 16; j++) mmToU8Ptr(fsmb_table[i])[j] = (i & (1 << j)) ? ~0 : 0;
|
||||
for (u32 j = 0; j < 4; j++)
|
||||
{
|
||||
fsm[i]._u32[j] = (i & (1 << j)) ? 0xffffffff : 0;
|
||||
}
|
||||
}
|
||||
|
||||
for (u32 i = 0; i < sizeof(fsmh) / sizeof(fsmh[0]); i++)
|
||||
{
|
||||
for (u32 j = 0; j < 8; j++)
|
||||
{
|
||||
fsmh[i]._u16[j] = (i & (1 << j)) ? 0xffff : 0;
|
||||
}
|
||||
}
|
||||
|
||||
for (u32 i = 0; i < sizeof(fsmb) / sizeof(fsmb[0]); i++)
|
||||
{
|
||||
for (u32 j = 0; j < 16; j++)
|
||||
{
|
||||
fsmb[i]._u8[j] = (i & (1 << j)) ? 0xff : 0;
|
||||
}
|
||||
}
|
||||
|
||||
for (u32 i = 0; i < sizeof(sldq_pshufb) / sizeof(sldq_pshufb[0]); i++)
|
||||
{
|
||||
for (u32 j = 0; j < 16; j++) mmToU8Ptr(sldq_pshufb[i])[j] = (u8)(j - i);
|
||||
for (u32 j = 0; j < 16; j++)
|
||||
{
|
||||
sldq_pshufb[i]._u8[j] = static_cast<u8>(j - i);
|
||||
}
|
||||
}
|
||||
|
||||
for (u32 i = 0; i < sizeof(srdq_pshufb) / sizeof(srdq_pshufb[0]); i++)
|
||||
{
|
||||
for (u32 j = 0; j < 16; j++) mmToU8Ptr(srdq_pshufb[i])[j] = (j + i > 15) ? 0xff : (u8)(j + i);
|
||||
for (u32 j = 0; j < 16; j++)
|
||||
{
|
||||
srdq_pshufb[i]._u8[j] = (j + i > 15) ? 0xff : static_cast<u8>(j + i);
|
||||
}
|
||||
}
|
||||
|
||||
for (u32 i = 0; i < sizeof(rldq_pshufb) / sizeof(rldq_pshufb[0]); i++)
|
||||
{
|
||||
for (u32 j = 0; j < 16; j++) mmToU8Ptr(rldq_pshufb[i])[j] = (u8)(j - i) & 0xf;
|
||||
for (u32 j = 0; j < 16; j++)
|
||||
{
|
||||
rldq_pshufb[i]._u8[j] = static_cast<u8>((j - i) & 0xf);
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
extern const g_imm_table_struct g_imm_table;
|
||||
extern const g_spu_imm_table_t g_spu_imm;
|
||||
|
||||
enum FPSCR_EX
|
||||
{
|
||||
|
@ -96,10 +96,10 @@ s32 cellAudioInit()
|
||||
u16 buf_u16[out_buffer_size];
|
||||
for (size_t i = 0; i < out_buffer_size; i += 8)
|
||||
{
|
||||
static const __m128 float2u16 = { 0x8000, 0x8000, 0x8000, 0x8000 };
|
||||
const auto scale = _mm_set1_ps(0x8000);
|
||||
(__m128i&)(buf_u16[i]) = _mm_packs_epi32(
|
||||
_mm_cvtps_epi32(_mm_mul_ps((__m128&)(buffer[i]), float2u16)),
|
||||
_mm_cvtps_epi32(_mm_mul_ps((__m128&)(buffer[i + 4]), float2u16)));
|
||||
_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(buffer + i), scale)),
|
||||
_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(buffer + i + 4), scale)));
|
||||
}
|
||||
|
||||
if (!opened)
|
||||
|
Loading…
Reference in New Issue
Block a user