SPURecompiler improved

This commit is contained in:
Nekotekina 2015-04-05 21:01:15 +03:00
parent d2883fc554
commit d1fbccc9ce
6 changed files with 394 additions and 372 deletions

View File

@ -18,26 +18,6 @@
#define rotl32(x,r) (((u32)(x) << (r)) | ((u32)(x) >> (32 - (r))))
#endif
// Lookup table of broadcast power-of-two scale factors: the entry for `scale`
// holds 2^scale (narrowed to float) replicated across all four lanes of an __m128,
// for scale in [-155, 173].
// NOTE(review): the hunk header above (-18,26 +18,6) suggests this class is being
// removed from this file by the commit — confirm against the actual diff.
class spu_scale_table_t
{
// 155 + 174 = 329 entries; index 0 corresponds to scale -155
std::array<__m128, 155 + 174> m_data;
public:
spu_scale_table_t()
{
for (s32 i = -155; i < 174; i++)
{
// result of exp2() is cast to float, then broadcast to every lane
m_data[i + 155] = _mm_set1_ps(static_cast<float>(exp2(i)));
}
}
// Returns the broadcast factor for `scale`; the index is biased by 155 and is not range-checked here.
__forceinline __m128 operator [] (s32 scale) const
{
return m_data[scale + 155];
}
}
const g_spu_scale_table;
void spu_interpreter::DEFAULT(SPUThread& CPU, spu_opcode_t op)
{
@ -405,17 +385,17 @@ void spu_interpreter::GBB(SPUThread& CPU, spu_opcode_t op)
// FSM: writes rt with a table entry selected by the low 4 bits of ra._u32[3].
// NOTE(review): this page looks like a rendered diff with +/- markers stripped, so the two
// assignments below are presumably the removed (g_imm_table) and added (g_spu_imm) versions
// of one line rather than two statements that both execute — confirm against the commit.
void spu_interpreter::FSM(SPUThread& CPU, spu_opcode_t op)
{
CPU.GPR[op.rt].vi = g_imm_table.fsm_table[CPU.GPR[op.ra]._u32[3] & 0xf];
CPU.GPR[op.rt] = g_spu_imm.fsm[CPU.GPR[op.ra]._u32[3] & 0xf];
}
// FSMH: writes rt with a table entry selected by the low 8 bits of ra._u32[3].
// NOTE(review): the two assignments are presumably the old (g_imm_table) and new (g_spu_imm)
// versions of one line from a diff rendered without +/- markers — confirm against the commit.
void spu_interpreter::FSMH(SPUThread& CPU, spu_opcode_t op)
{
CPU.GPR[op.rt].vi = g_imm_table.fsmh_table[CPU.GPR[op.ra]._u32[3] & 0xff];
CPU.GPR[op.rt] = g_spu_imm.fsmh[CPU.GPR[op.ra]._u32[3] & 0xff];
}
// FSMB: writes rt with a table entry selected by the low 16 bits of ra._u32[3].
// NOTE(review): the two assignments are presumably the old (g_imm_table) and new (g_spu_imm)
// versions of one line from a diff rendered without +/- markers — confirm against the commit.
void spu_interpreter::FSMB(SPUThread& CPU, spu_opcode_t op)
{
CPU.GPR[op.rt].vi = g_imm_table.fsmb_table[CPU.GPR[op.ra]._u32[3] & 0xffff];
CPU.GPR[op.rt] = g_spu_imm.fsmb[CPU.GPR[op.ra]._u32[3] & 0xffff];
}
void spu_interpreter::FREST(SPUThread& CPU, spu_opcode_t op)
@ -436,17 +416,17 @@ void spu_interpreter::LQX(SPUThread& CPU, spu_opcode_t op)
// ROTQBYBI: byte-shuffles ra (pshufb) with a control vector taken from rldq_pshufb,
// indexed by (rb._u32[3] >> 3) & 0xf, and stores the result in rt.
// NOTE(review): the two statements are presumably the old (g_imm_table) and new (g_spu_imm, with
// an explicit .vi access) versions of one line from a diff without +/- markers — confirm.
void spu_interpreter::ROTQBYBI(SPUThread& CPU, spu_opcode_t op)
{
CPU.GPR[op.rt].vi = _mm_shuffle_epi8(CPU.GPR[op.ra].vi, g_imm_table.rldq_pshufb[CPU.GPR[op.rb]._u32[3] >> 3 & 0xf]);
CPU.GPR[op.rt].vi = _mm_shuffle_epi8(CPU.GPR[op.ra].vi, g_spu_imm.rldq_pshufb[CPU.GPR[op.rb]._u32[3] >> 3 & 0xf].vi);
}
// ROTQMBYBI: byte-shuffles ra (pshufb) with a control vector taken from srdq_pshufb,
// indexed by (-(rb._s32[3] >> 3)) & 0x1f, and stores the result in rt.
// NOTE(review): the two statements are presumably the old (g_imm_table) and new (g_spu_imm)
// versions of one line from a diff without +/- markers — confirm against the commit.
void spu_interpreter::ROTQMBYBI(SPUThread& CPU, spu_opcode_t op)
{
CPU.GPR[op.rt].vi = _mm_shuffle_epi8(CPU.GPR[op.ra].vi, g_imm_table.srdq_pshufb[-(CPU.GPR[op.rb]._s32[3] >> 3) & 0x1f]);
CPU.GPR[op.rt].vi = _mm_shuffle_epi8(CPU.GPR[op.ra].vi, g_spu_imm.srdq_pshufb[-(CPU.GPR[op.rb]._s32[3] >> 3) & 0x1f].vi);
}
// SHLQBYBI: byte-shuffles ra (pshufb) with a control vector taken from sldq_pshufb,
// indexed by (rb._u32[3] >> 3) & 0x1f, and stores the result in rt.
// NOTE(review): the two statements are presumably the old (g_imm_table) and new (g_spu_imm)
// versions of one line from a diff without +/- markers — confirm against the commit.
void spu_interpreter::SHLQBYBI(SPUThread& CPU, spu_opcode_t op)
{
CPU.GPR[op.rt].vi = _mm_shuffle_epi8(CPU.GPR[op.ra].vi, g_imm_table.sldq_pshufb[CPU.GPR[op.rb]._u32[3] >> 3 & 0x1f]);
CPU.GPR[op.rt].vi = _mm_shuffle_epi8(CPU.GPR[op.ra].vi, g_spu_imm.sldq_pshufb[CPU.GPR[op.rb]._u32[3] >> 3 & 0x1f].vi);
}
void spu_interpreter::CBX(SPUThread& CPU, spu_opcode_t op)
@ -500,17 +480,17 @@ void spu_interpreter::SHLQBI(SPUThread& CPU, spu_opcode_t op)
// ROTQBY: byte-shuffles ra (pshufb) with a control vector taken from rldq_pshufb,
// indexed by rb._u32[3] & 0xf, and stores the result in rt.
// NOTE(review): the two statements are presumably the old (g_imm_table) and new (g_spu_imm)
// versions of one line from a diff without +/- markers — confirm against the commit.
void spu_interpreter::ROTQBY(SPUThread& CPU, spu_opcode_t op)
{
CPU.GPR[op.rt].vi = _mm_shuffle_epi8(CPU.GPR[op.ra].vi, g_imm_table.rldq_pshufb[CPU.GPR[op.rb]._u32[3] & 0xf]);
CPU.GPR[op.rt].vi = _mm_shuffle_epi8(CPU.GPR[op.ra].vi, g_spu_imm.rldq_pshufb[CPU.GPR[op.rb]._u32[3] & 0xf].vi);
}
// ROTQMBY: byte-shuffles ra (pshufb) with a control vector taken from srdq_pshufb,
// indexed by (-rb._s32[3]) & 0x1f, and stores the result in rt.
// NOTE(review): the two statements are presumably the old (g_imm_table) and new (g_spu_imm)
// versions of one line from a diff without +/- markers — confirm against the commit.
void spu_interpreter::ROTQMBY(SPUThread& CPU, spu_opcode_t op)
{
CPU.GPR[op.rt].vi = _mm_shuffle_epi8(CPU.GPR[op.ra].vi, g_imm_table.srdq_pshufb[-CPU.GPR[op.rb]._s32[3] & 0x1f]);
CPU.GPR[op.rt].vi = _mm_shuffle_epi8(CPU.GPR[op.ra].vi, g_spu_imm.srdq_pshufb[-CPU.GPR[op.rb]._s32[3] & 0x1f].vi);
}
// SHLQBY: byte-shuffles ra (pshufb) with a control vector taken from sldq_pshufb,
// indexed by rb._u32[3] & 0x1f, and stores the result in rt.
// NOTE(review): the two statements are presumably the old (g_imm_table) and new (g_spu_imm)
// versions of one line from a diff without +/- markers — confirm against the commit.
void spu_interpreter::SHLQBY(SPUThread& CPU, spu_opcode_t op)
{
CPU.GPR[op.rt].vi = _mm_shuffle_epi8(CPU.GPR[op.ra].vi, g_imm_table.sldq_pshufb[CPU.GPR[op.rb]._u32[3] & 0x1f]);
CPU.GPR[op.rt].vi = _mm_shuffle_epi8(CPU.GPR[op.ra].vi, g_spu_imm.sldq_pshufb[CPU.GPR[op.rb]._u32[3] & 0x1f].vi);
}
void spu_interpreter::ORX(SPUThread& CPU, spu_opcode_t op)
@ -569,17 +549,17 @@ void spu_interpreter::SHLQBII(SPUThread& CPU, spu_opcode_t op)
// ROTQBYI: byte-shuffles ra (pshufb) with a control vector taken from rldq_pshufb,
// indexed by the opcode immediate op.i7 & 0xf, and stores the result in rt.
// NOTE(review): the two statements are presumably the old (g_imm_table) and new (g_spu_imm)
// versions of one line from a diff without +/- markers — confirm against the commit.
void spu_interpreter::ROTQBYI(SPUThread& CPU, spu_opcode_t op)
{
CPU.GPR[op.rt].vi = _mm_shuffle_epi8(CPU.GPR[op.ra].vi, g_imm_table.rldq_pshufb[op.i7 & 0xf]);
CPU.GPR[op.rt].vi = _mm_shuffle_epi8(CPU.GPR[op.ra].vi, g_spu_imm.rldq_pshufb[op.i7 & 0xf].vi);
}
// ROTQMBYI: byte-shuffles ra (pshufb) with a control vector taken from srdq_pshufb,
// indexed by (-op.si7) & 0x1f, and stores the result in rt.
// NOTE(review): the two statements are presumably the old (g_imm_table) and new (g_spu_imm)
// versions of one line from a diff without +/- markers — confirm against the commit.
void spu_interpreter::ROTQMBYI(SPUThread& CPU, spu_opcode_t op)
{
CPU.GPR[op.rt].vi = _mm_shuffle_epi8(CPU.GPR[op.ra].vi, g_imm_table.srdq_pshufb[-op.si7 & 0x1f]);
CPU.GPR[op.rt].vi = _mm_shuffle_epi8(CPU.GPR[op.ra].vi, g_spu_imm.srdq_pshufb[-op.si7 & 0x1f].vi);
}
// SHLQBYI: byte-shuffles ra (pshufb) with a control vector taken from sldq_pshufb,
// indexed by the opcode immediate op.i7 & 0x1f, and stores the result in rt.
// NOTE(review): the two statements are presumably the old (g_imm_table) and new (g_spu_imm)
// versions of one line from a diff without +/- markers — confirm against the commit.
void spu_interpreter::SHLQBYI(SPUThread& CPU, spu_opcode_t op)
{
CPU.GPR[op.rt].vi = _mm_shuffle_epi8(CPU.GPR[op.ra].vi, g_imm_table.sldq_pshufb[op.i7 & 0x1f]);
CPU.GPR[op.rt].vi = _mm_shuffle_epi8(CPU.GPR[op.ra].vi, g_spu_imm.sldq_pshufb[op.i7 & 0x1f].vi);
}
void spu_interpreter::NOP(SPUThread& CPU, spu_opcode_t op)
@ -770,9 +750,9 @@ void spu_interpreter::CEQ(SPUThread& CPU, spu_opcode_t op)
// MPYHHU: per 32-bit lane, forms the unsigned 32-bit product of the upper 16-bit halves
// of ra and rb and stores it in rt.
// NOTE(review): `a` and `b` are declared twice below, which would not compile as one function;
// this looks like a rendered diff with +/- markers stripped (first three statements presumably
// removed, last three added) — confirm against the commit.
void spu_interpreter::MPYHHU(SPUThread& CPU, spu_opcode_t op)
{
// Variant 1: move the upper halves into the low 16 bits first, then combine (mulhi << 16) | mullo.
const auto a = _mm_srli_epi32(CPU.GPR[op.ra].vi, 16);
const auto b = _mm_srli_epi32(CPU.GPR[op.rb].vi, 16);
CPU.GPR[op.rt].vi = _mm_or_si128(_mm_slli_epi32(_mm_mulhi_epu16(a, b), 16), _mm_mullo_epi16(a, b));
// Variant 2: multiply the unshifted operands; the wanted product pieces sit in the upper 16-bit
// lane of each word, so mullo is shifted down by 16 and mulhi is kept in place under 0xffff0000.
const auto a = CPU.GPR[op.ra].vi;
const auto b = CPU.GPR[op.rb].vi;
CPU.GPR[op.rt].vi = _mm_or_si128(_mm_srli_epi32(_mm_mullo_epi16(a, b), 16), _mm_and_si128(_mm_mulhi_epu16(a, b), _mm_set1_epi32(0xffff0000)));
}
void spu_interpreter::ADDX(SPUThread& CPU, spu_opcode_t op)
@ -810,9 +790,9 @@ void spu_interpreter::MPYHHA(SPUThread& CPU, spu_opcode_t op)
// MPYHHAU: per 32-bit lane, adds the unsigned 32-bit product of the upper 16-bit halves
// of ra and rb to the current value of rt.
// NOTE(review): `a` and `b` are declared twice below, which would not compile as one function;
// this looks like a rendered diff with +/- markers stripped (first three statements presumably
// removed, last three added) — confirm against the commit.
void spu_interpreter::MPYHHAU(SPUThread& CPU, spu_opcode_t op)
{
// Variant 1: move the upper halves into the low 16 bits first, then combine (mulhi << 16) | mullo.
const auto a = _mm_srli_epi32(CPU.GPR[op.ra].vi, 16);
const auto b = _mm_srli_epi32(CPU.GPR[op.rb].vi, 16);
CPU.GPR[op.rt].vi = _mm_add_epi32(CPU.GPR[op.rt].vi, _mm_or_si128(_mm_slli_epi32(_mm_mulhi_epu16(a, b), 16), _mm_mullo_epi16(a, b)));
// Variant 2: multiply the unshifted operands; mullo is shifted down by 16 and mulhi is kept in
// place under a 0xffff0000 mask before the accumulate.
const auto a = CPU.GPR[op.ra].vi;
const auto b = CPU.GPR[op.rb].vi;
CPU.GPR[op.rt].vi = _mm_add_epi32(CPU.GPR[op.rt].vi, _mm_or_si128(_mm_srli_epi32(_mm_mullo_epi16(a, b), 16), _mm_and_si128(_mm_mulhi_epu16(a, b), _mm_set1_epi32(0xffff0000))));
}
void spu_interpreter::FSCRRD(SPUThread& CPU, spu_opcode_t op)
@ -890,9 +870,9 @@ void spu_interpreter::DFCMEQ(SPUThread& CPU, spu_opcode_t op)
// MPYU: per 32-bit lane, forms the unsigned 32-bit product of the lower 16-bit halves
// of ra and rb and stores it in rt.
// NOTE(review): `a` and `b` are declared twice below, which would not compile as one function;
// this looks like a rendered diff with +/- markers stripped (first three statements presumably
// removed, last three added) — confirm against the commit.
void spu_interpreter::MPYU(SPUThread& CPU, spu_opcode_t op)
{
// Variant 1: clear the upper halves of both operands, then combine (mulhi << 16) | mullo.
const auto a = _mm_and_si128(CPU.GPR[op.ra].vi, _mm_set1_epi32(0xffff));
const auto b = _mm_and_si128(CPU.GPR[op.rb].vi, _mm_set1_epi32(0xffff));
CPU.GPR[op.rt].vi = _mm_or_si128(_mm_slli_epi32(_mm_mulhi_epu16(a, b), 16), _mm_mullo_epi16(a, b));
// Variant 2: multiply the unmasked operands; the shift by 16 discards the upper-lane mulhi
// result and the 0xffff mask discards the upper-lane mullo result.
const auto a = CPU.GPR[op.ra].vi;
const auto b = CPU.GPR[op.rb].vi;
CPU.GPR[op.rt].vi = _mm_or_si128(_mm_slli_epi32(_mm_mulhi_epu16(a, b), 16), _mm_and_si128(_mm_mullo_epi16(a, b), _mm_set1_epi32(0xffff)));
}
void spu_interpreter::CEQB(SPUThread& CPU, spu_opcode_t op)
@ -907,8 +887,8 @@ void spu_interpreter::FI(SPUThread& CPU, spu_opcode_t op)
const auto mask_sf = _mm_set1_epi32(0x000003ff); // step fraction mask
const auto mask_yf = _mm_set1_epi32(0x0007ffff); // Y fraction mask (bits 13..31)
const auto base = _mm_or_ps(_mm_and_ps(CPU.GPR[op.rb].vf, mask_bf), _mm_castsi128_ps(_mm_set1_epi32(0x3f800000)));
const auto step = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(CPU.GPR[op.rb].vi, mask_sf)), g_spu_scale_table[-13]);
const auto y = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(CPU.GPR[op.ra].vi, mask_yf)), g_spu_scale_table[-19]);
const auto step = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(CPU.GPR[op.rb].vi, mask_sf)), _mm_set1_ps(exp2f(-13)));
const auto y = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(CPU.GPR[op.ra].vi, mask_yf)), _mm_set1_ps(exp2f(-19)));
CPU.GPR[op.rt].vf = _mm_or_ps(_mm_and_ps(mask_se, CPU.GPR[op.rb].vf), _mm_andnot_ps(mask_se, _mm_sub_ps(base, _mm_mul_ps(step, y))));
}
@ -923,27 +903,27 @@ void spu_interpreter::HEQ(SPUThread& CPU, spu_opcode_t op)
// CFLTS: multiplies ra (as floats) by the scale-table entry for (173 - op.i8), then converts
// to signed 32-bit integers with truncation.
// NOTE(review): the two `scaled` declarations are presumably the old (g_spu_scale_table) and new
// (g_spu_imm.scale) versions of one line from a diff without +/- markers — confirm.
void spu_interpreter::CFLTS(SPUThread& CPU, spu_opcode_t op)
{
const auto scaled = _mm_mul_ps(CPU.GPR[op.ra].vf, g_spu_scale_table[173 - op.i8]);
const auto scaled = _mm_mul_ps(CPU.GPR[op.ra].vf, g_spu_imm.scale[173 - op.i8]);
// Lanes where scaled >= 2^31 (the integer literal is converted to float) get the truncated
// conversion result XORed with an all-ones mask.
CPU.GPR[op.rt].vi = _mm_xor_si128(_mm_cvttps_epi32(scaled), _mm_castps_si128(_mm_cmpge_ps(scaled, _mm_set1_ps(0x80000000))));
}
// CFLTU: multiplies ra (as floats) by the scale-table entry for (173 - op.i8), clamps the
// result to be >= 0, and converts it to an integer bit pattern built from two signed conversions.
// NOTE(review): the two `scaled1` declarations are presumably the old (g_spu_scale_table) and new
// (g_spu_imm.scale) versions of one line from a diff without +/- markers — confirm.
void spu_interpreter::CFLTU(SPUThread& CPU, spu_opcode_t op)
{
const auto scaled1 = _mm_max_ps(_mm_mul_ps(CPU.GPR[op.ra].vf, g_spu_scale_table[173 - op.i8]), _mm_set1_ps(0.0f));
const auto scaled1 = _mm_max_ps(_mm_mul_ps(CPU.GPR[op.ra].vf, g_spu_imm.scale[173 - op.i8]), _mm_set1_ps(0.0f));
// scaled2 is (scaled1 - 2^31) in lanes where scaled1 >= 2^31, and zero elsewhere
const auto scaled2 = _mm_and_ps(_mm_sub_ps(scaled1, _mm_set1_ps(0x80000000)), _mm_cmpge_ps(scaled1, _mm_set1_ps(0x80000000)));
// OR of both truncated conversions, forced to all-ones in lanes where scaled1 >= 2^32
CPU.GPR[op.rt].vi = _mm_or_si128(_mm_or_si128(_mm_cvttps_epi32(scaled1), _mm_cvttps_epi32(scaled2)), _mm_castps_si128(_mm_cmpge_ps(scaled1, _mm_set1_ps(0x100000000))));
}
// CSFLT: converts the signed 32-bit lanes of ra to float and multiplies by the scale-table
// entry for (op.i8 - 155); the result is stored in rt.
// NOTE(review): the two assignments are presumably the old (g_spu_scale_table) and new
// (g_spu_imm.scale) versions of one line from a diff without +/- markers — confirm.
void spu_interpreter::CSFLT(SPUThread& CPU, spu_opcode_t op)
{
CPU.GPR[op.rt].vf = _mm_mul_ps(_mm_cvtepi32_ps(CPU.GPR[op.ra].vi), g_spu_scale_table[op.i8 - 155]);
CPU.GPR[op.rt].vf = _mm_mul_ps(_mm_cvtepi32_ps(CPU.GPR[op.ra].vi), g_spu_imm.scale[op.i8 - 155]);
}
// CUFLT: converts the 32-bit lanes of ra, treated as unsigned, to float and multiplies by the
// scale-table entry for (op.i8 - 155); the result is stored in rt.
// NOTE(review): the two final assignments are presumably the old (g_spu_scale_table) and new
// (g_spu_imm.scale) versions of one line from a diff without +/- markers — confirm.
void spu_interpreter::CUFLT(SPUThread& CPU, spu_opcode_t op)
{
const auto a = CPU.GPR[op.ra].vi;
// fix is 2^31 (as float) in lanes whose top bit is set, zero otherwise; it restores the bit
// that is masked off with 0x7fffffff before the signed int-to-float conversion below
const auto fix = _mm_and_ps(_mm_castsi128_ps(_mm_srai_epi32(a, 31)), _mm_set1_ps(0x80000000));
CPU.GPR[op.rt].vf = _mm_mul_ps(_mm_add_ps(_mm_cvtepi32_ps(_mm_and_si128(a, _mm_set1_epi32(0x7fffffff))), fix), g_spu_scale_table[op.i8 - 155]);
CPU.GPR[op.rt].vf = _mm_mul_ps(_mm_add_ps(_mm_cvtepi32_ps(_mm_and_si128(a, _mm_set1_epi32(0x7fffffff))), fix), g_spu_imm.scale[op.i8 - 155]);
}
@ -1013,7 +993,7 @@ void spu_interpreter::BR(SPUThread& CPU, spu_opcode_t op)
// FSMBI: writes rt with the fsmb table entry selected directly by the opcode immediate op.i16.
// NOTE(review): the two assignments are presumably the old (g_imm_table) and new (g_spu_imm)
// versions of one line from a diff rendered without +/- markers — confirm against the commit.
void spu_interpreter::FSMBI(SPUThread& CPU, spu_opcode_t op)
{
CPU.GPR[op.rt].vi = g_imm_table.fsmb_table[op.i16];
CPU.GPR[op.rt] = g_spu_imm.fsmb[op.i16];
}
void spu_interpreter::BRSL(SPUThread& CPU, spu_opcode_t op)
@ -1177,7 +1157,7 @@ void spu_interpreter::MPYI(SPUThread& CPU, spu_opcode_t op)
// MPYUI: per 32-bit lane, multiplies the lower 16 bits of ra by the low 16 bits of the
// immediate op.si10 (unsigned 16x16 -> 32) and stores the product in rt.
// NOTE(review): `a` is declared twice below, which would not compile as one function; this looks
// like a rendered diff with +/- markers stripped (masked `a` presumably removed, unmasked `a`
// added) — confirm against the commit.
void spu_interpreter::MPYUI(SPUThread& CPU, spu_opcode_t op)
{
const auto a = _mm_and_si128(CPU.GPR[op.ra].vi, _mm_set1_epi32(0xffff));
const auto a = CPU.GPR[op.ra].vi;
// the upper 16 bits of every lane of `i` are zero, so the upper-lane 16-bit products are zero
const auto i = _mm_set1_epi32(op.si10 & 0xffff);
CPU.GPR[op.rt].vi = _mm_or_si128(_mm_slli_epi32(_mm_mulhi_epu16(a, i), 16), _mm_mullo_epi16(a, i));
}

File diff suppressed because it is too large Load Diff

View File

@ -13,8 +13,6 @@
#include "SPUInterpreter.h"
#include "SPURecompiler.h"
const g_imm_table_struct g_imm_table;
SPURecompilerCore::SPURecompilerCore(SPUThread& cpu)
: m_enc(new SPURecompiler(cpu, *this))
, inter(new SPUInterpreter(cpu))
@ -25,11 +23,6 @@ SPURecompilerCore::SPURecompilerCore(SPUThread& cpu)
memset(entry, 0, sizeof(entry));
X86CpuInfo inf;
X86CpuUtil::detect(&inf);
if (!inf.hasFeature(kX86CpuFeatureSSE4_1))
{
LOG_ERROR(SPU, "SPU JIT requires SSE4.1 instruction set support");
Emu.Pause();
}
}
SPURecompilerCore::~SPURecompilerCore()
@ -51,12 +44,12 @@ void SPURecompilerCore::Compile(u16 pos)
SPUDisAsm dis_asm(CPUDisAsm_InterpreterMode);
dis_asm.offset = vm::get_ptr<u8>(CPU.offset);
StringLogger stringLogger;
stringLogger.setOption(kLoggerOptionBinaryForm, true);
//StringLogger stringLogger;
//stringLogger.setOption(kLoggerOptionBinaryForm, true);
X86Compiler compiler(&runtime);
m_enc->compiler = &compiler;
compiler.setLogger(&stringLogger);
//compiler.setLogger(&stringLogger);
compiler.addFunc(kFuncConvHost, FuncBuilder4<u32, void*, void*, void*, u32>());
const u16 start = pos;
@ -154,28 +147,22 @@ void SPURecompilerCore::Compile(u16 pos)
entry[start].pointer = compiler.make();
compiler.setLogger(nullptr); // crashes without it
rFile log;
log.Open(fmt::Format("SPUjit_%d.log", GetCurrentSPUThread().GetId()), first ? rFile::write : rFile::write_append);
log.Write(fmt::Format("========== START POSITION 0x%x ==========\n\n", start * 4));
log.Write(std::string(stringLogger.getString()));
if (!entry[start].pointer)
{
LOG_ERROR(Log::SPU, "SPURecompilerCore::Compile(pos=0x%x) failed", start * sizeof(u32));
log.Write("========== FAILED ============\n\n");
Emu.Pause();
}
else
{
log.Write(fmt::Format("========== COMPILED %d (excess %d), time: [start=%lld (decoding=%lld), finalize=%lld]\n\n",
entry[start].count, excess, stamp1 - stamp0, time0, get_system_time() - stamp1));
#ifdef _WIN32
//if (!RtlAddFunctionTable(&info, 1, (u64)entry[start].pointer))
//{
// LOG_ERROR(Log::SPU, "RtlAddFunctionTable() failed");
//}
#endif
}
log.Close();
//rFile log;
//log.Open(fmt::Format("SPUjit_%d.log", GetCurrentSPUThread().GetId()), first ? rFile::write : rFile::write_append);
//log.Write(fmt::Format("========== START POSITION 0x%x ==========\n\n", start * 4));
//log.Write(std::string(stringLogger.getString()));
//if (!entry[start].pointer)
//{
// LOG_ERROR(Log::SPU, "SPURecompilerCore::Compile(pos=0x%x) failed", start * sizeof(u32));
// log.Write("========== FAILED ============\n\n");
// Emu.Pause();
//}
//else
//{
// log.Write(fmt::Format("========== COMPILED %d (excess %d), time: [start=%lld (decoding=%lld), finalize=%lld]\n\n",
// entry[start].count, excess, stamp1 - stamp0, time0, get_system_time() - stamp1));
//}
//log.Close();
m_enc->compiler = nullptr;
first = false;
}
@ -217,9 +204,6 @@ u32 SPURecompilerCore::DecodeMemory(const u32 address)
i < (u32)pos + (u32)entry[pos].count)
{
runtime.release(entry[i].pointer);
#ifdef _WIN32
//RtlDeleteFunctionTable(&entry[i].info);
#endif
entry[i].pointer = nullptr;
for (u32 j = i; j < i + (u32)entry[i].count; j++)
{
@ -264,7 +248,7 @@ u32 SPURecompilerCore::DecodeMemory(const u32 address)
}
u32 res = pos;
res = func(cpu, vm::get_ptr<void>(m_offset), imm_table.data(), &g_imm_table);
res = func(cpu, vm::get_ptr<void>(m_offset), imm_table.data(), &g_spu_imm);
if (res & 0x1000000)
{

View File

@ -22,6 +22,8 @@
#include <cfenv>
const g_spu_imm_table_t g_spu_imm;
class spu_inter_func_list_t
{
std::array<spu_inter_func_t, 2048> funcs;

View File

@ -315,51 +315,89 @@ public:
}
};
#define mmToU64Ptr(x) ((u64*)(&x))
#define mmToU32Ptr(x) ((u32*)(&x))
#define mmToU16Ptr(x) ((u16*)(&x))
#define mmToU8Ptr(x) ((u8*)(&x))
// Precomputed lookup tables indexed by the SPU interpreter handlers; a pointer to the global
// instance is also passed to compiled code in SPURecompilerCore::DecodeMemory.
// NOTE(review): the lines below interleave the old definition (g_imm_table_struct, __m128i
// members, mmToU*Ptr writes) with the new one (g_spu_imm_table_t, u128 members, nested
// scale_table_t). This looks like a rendered diff without +/- markers, so it is not compilable
// as shown — confirm against the commit before editing.
struct g_imm_table_struct
struct g_spu_imm_table_t
{
__m128i fsmb_table[65536];
__m128i fsmh_table[256];
__m128i fsm_table[16];
u128 fsmb[65536]; // table for FSMB, FSMBI instructions
u128 fsmh[256]; // table for FSMH instruction
u128 fsm[16]; // table for FSM instruction
__m128i sldq_pshufb[32];
__m128i srdq_pshufb[32];
__m128i rldq_pshufb[16];
u128 sldq_pshufb[32]; // table for SHLQBYBI, SHLQBY, SHLQBYI instructions
u128 srdq_pshufb[32]; // table for ROTQMBYBI, ROTQMBY, ROTQMBYI instructions
u128 rldq_pshufb[16]; // table for ROTQBYBI, ROTQBY, ROTQBYI instructions
g_imm_table_struct()
// Broadcast power-of-two factors: entry for `scale` holds float(2^scale) in all four lanes,
// for scale in [-155, 173]; operator[] biases the index by 155 and does not range-check it.
class scale_table_t
{
for (u32 i = 0; i < sizeof(fsm_table) / sizeof(fsm_table[0]); i++)
std::array<__m128, 155 + 174> m_data;
public:
scale_table_t()
{
for (u32 j = 0; j < 4; j++) mmToU32Ptr(fsm_table[i])[j] = (i & (1 << j)) ? ~0 : 0;
for (s32 i = -155; i < 174; i++)
{
m_data[i + 155] = _mm_set1_ps(static_cast<float>(exp2(i)));
}
}
for (u32 i = 0; i < sizeof(fsmh_table) / sizeof(fsmh_table[0]); i++)
__forceinline __m128 operator [] (s32 scale) const
{
for (u32 j = 0; j < 8; j++) mmToU16Ptr(fsmh_table[i])[j] = (i & (1 << j)) ? ~0 : 0;
return m_data[scale + 155];
}
for (u32 i = 0; i < sizeof(fsmb_table) / sizeof(fsmb_table[0]); i++)
}
const scale;
// Constructor: fills every table. For fsm/fsmh/fsmb, element j of entry i is all-ones when
// bit j of i is set and zero otherwise (32-, 16- and 8-bit elements respectively).
g_spu_imm_table_t()
{
for (u32 i = 0; i < sizeof(fsm) / sizeof(fsm[0]); i++)
{
for (u32 j = 0; j < 16; j++) mmToU8Ptr(fsmb_table[i])[j] = (i & (1 << j)) ? ~0 : 0;
for (u32 j = 0; j < 4; j++)
{
fsm[i]._u32[j] = (i & (1 << j)) ? 0xffffffff : 0;
}
}
for (u32 i = 0; i < sizeof(fsmh) / sizeof(fsmh[0]); i++)
{
for (u32 j = 0; j < 8; j++)
{
fsmh[i]._u16[j] = (i & (1 << j)) ? 0xffff : 0;
}
}
for (u32 i = 0; i < sizeof(fsmb) / sizeof(fsmb[0]); i++)
{
for (u32 j = 0; j < 16; j++)
{
fsmb[i]._u8[j] = (i & (1 << j)) ? 0xff : 0;
}
}
// pshufb control bytes: byte j of entry i is (j - i) truncated to 8 bits
for (u32 i = 0; i < sizeof(sldq_pshufb) / sizeof(sldq_pshufb[0]); i++)
{
for (u32 j = 0; j < 16; j++) mmToU8Ptr(sldq_pshufb[i])[j] = (u8)(j - i);
for (u32 j = 0; j < 16; j++)
{
sldq_pshufb[i]._u8[j] = static_cast<u8>(j - i);
}
}
// pshufb control bytes: byte j of entry i is (j + i), or 0xff once j + i exceeds 15
for (u32 i = 0; i < sizeof(srdq_pshufb) / sizeof(srdq_pshufb[0]); i++)
{
for (u32 j = 0; j < 16; j++) mmToU8Ptr(srdq_pshufb[i])[j] = (j + i > 15) ? 0xff : (u8)(j + i);
for (u32 j = 0; j < 16; j++)
{
srdq_pshufb[i]._u8[j] = (j + i > 15) ? 0xff : static_cast<u8>(j + i);
}
}
// pshufb control bytes: byte j of entry i is (j - i) reduced to the range 0..15
for (u32 i = 0; i < sizeof(rldq_pshufb) / sizeof(rldq_pshufb[0]); i++)
{
for (u32 j = 0; j < 16; j++) mmToU8Ptr(rldq_pshufb[i])[j] = (u8)(j - i) & 0xf;
for (u32 j = 0; j < 16; j++)
{
rldq_pshufb[i]._u8[j] = static_cast<u8>((j - i) & 0xf);
}
}
}
};
extern const g_imm_table_struct g_imm_table;
extern const g_spu_imm_table_t g_spu_imm;
enum FPSCR_EX
{

View File

@ -96,10 +96,10 @@ s32 cellAudioInit()
u16 buf_u16[out_buffer_size];
for (size_t i = 0; i < out_buffer_size; i += 8)
{
static const __m128 float2u16 = { 0x8000, 0x8000, 0x8000, 0x8000 };
const auto scale = _mm_set1_ps(0x8000);
(__m128i&)(buf_u16[i]) = _mm_packs_epi32(
_mm_cvtps_epi32(_mm_mul_ps((__m128&)(buffer[i]), float2u16)),
_mm_cvtps_epi32(_mm_mul_ps((__m128&)(buffer[i + 4]), float2u16)));
_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(buffer + i), scale)),
_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(buffer + i + 4), scale)));
}
if (!opened)