From 209b14fbaccabcab16722d165d887762b167bc93 Mon Sep 17 00:00:00 2001 From: Nekotekina Date: Thu, 25 Nov 2021 21:15:24 +0300 Subject: [PATCH] PPU LLVM: inline remaining vector instructions --- rpcs3/Emu/Cell/PPUThread.cpp | 27 ++------------ rpcs3/Emu/Cell/PPUTranslator.cpp | 60 +++++++++++++++++++++----------- 2 files changed, 42 insertions(+), 45 deletions(-) diff --git a/rpcs3/Emu/Cell/PPUThread.cpp b/rpcs3/Emu/Cell/PPUThread.cpp index 7fa41029e8..ebcd04cd13 100644 --- a/rpcs3/Emu/Cell/PPUThread.cpp +++ b/rpcs3/Emu/Cell/PPUThread.cpp @@ -551,7 +551,7 @@ struct ppu_far_jumps_t auto& opd = vm::_ref(target); target = opd.addr; - // We modify LR to custom values here + // We modify LR to custom values here link = false; if (ppu) @@ -566,7 +566,7 @@ struct ppu_far_jumps_t saved_info.saved_lr = std::exchange(ppu->lr, FIND_FUNC(ppu_return_from_far_jump)); saved_info.saved_r2 = std::exchange(ppu->gpr[2], opd.rtoc); } - + } if (link && ppu) @@ -1641,21 +1641,6 @@ void ppu_thread::stack_pop_verbose(u32 addr, u32 size) noexcept extern ppu_function_t ppu_get_syscall(u64 code); -extern __m128 sse_exp2_ps(__m128 A); -extern __m128 sse_log2_ps(__m128 A); -extern __m128i sse_altivec_vperm(__m128i A, __m128i B, __m128i C); -extern __m128i sse_altivec_vperm_v0(__m128i A, __m128i B, __m128i C); -extern __m128i sse_altivec_lvsl(u64 addr); -extern __m128i sse_altivec_lvsr(u64 addr); -extern __m128i sse_cellbe_lvlx(u64 addr); -extern __m128i sse_cellbe_lvrx(u64 addr); -extern void sse_cellbe_stvlx(u64 addr, __m128i a); -extern void sse_cellbe_stvrx(u64 addr, __m128i a); -extern __m128i sse_cellbe_lvlx_v0(u64 addr); -extern __m128i sse_cellbe_lvrx_v0(u64 addr); -extern void sse_cellbe_stvlx_v0(u64 addr, __m128i a); -extern void sse_cellbe_stvrx_v0(u64 addr, __m128i a); - void ppu_trap(ppu_thread& ppu, u64 addr) { ensure((addr & (~u64{0xffff'ffff} | 0x3)) == 0); @@ -2999,14 +2984,6 @@ bool ppu_initialize(const ppu_module& info, bool check_only) { "__ldarx", reinterpret_cast(ppu_ldarx) 
}, { "__stwcx", reinterpret_cast(ppu_stwcx) }, { "__stdcx", reinterpret_cast(ppu_stdcx) }, - { "__vexptefp", reinterpret_cast(sse_exp2_ps) }, - { "__vlogefp", reinterpret_cast(sse_log2_ps) }, - { "__lvsl", reinterpret_cast(sse_altivec_lvsl) }, - { "__lvsr", reinterpret_cast(sse_altivec_lvsr) }, - { "__lvlx", s_use_ssse3 ? reinterpret_cast(sse_cellbe_lvlx) : reinterpret_cast(sse_cellbe_lvlx_v0) }, - { "__lvrx", s_use_ssse3 ? reinterpret_cast(sse_cellbe_lvrx) : reinterpret_cast(sse_cellbe_lvrx_v0) }, - { "__stvlx", s_use_ssse3 ? reinterpret_cast(sse_cellbe_stvlx) : reinterpret_cast(sse_cellbe_stvlx_v0) }, - { "__stvrx", s_use_ssse3 ? reinterpret_cast(sse_cellbe_stvrx) : reinterpret_cast(sse_cellbe_stvrx_v0) }, { "__dcbz", reinterpret_cast(+[](u32 addr){ alignas(64) static constexpr u8 z[128]{}; do_cell_atomic_128_store(addr, z); }) }, { "__resupdate", reinterpret_cast(vm::reservation_update) }, { "__resinterp", reinterpret_cast(ppu_reservation_fallback) }, diff --git a/rpcs3/Emu/Cell/PPUTranslator.cpp b/rpcs3/Emu/Cell/PPUTranslator.cpp index 4c2f139fd8..62392ee844 100644 --- a/rpcs3/Emu/Cell/PPUTranslator.cpp +++ b/rpcs3/Emu/Cell/PPUTranslator.cpp @@ -922,18 +922,34 @@ void PPUTranslator::VCTUXS(ppu_opcode_t op) SetSat(IsNotZero(eval(sat_l | sat_h).value)); } -extern __m128 sse_exp2_ps(__m128); - void PPUTranslator::VEXPTEFP(ppu_opcode_t op) { - set_vr(op.vd, call("__vexptefp", &sse_exp2_ps, get_vr(op.vb))); + const auto b = get_vr(op.vb); /* Inline IR replacing the removed __vexptefp helper call; b is the vb operand. */ + const auto x0 = eval(fmax(fmin(b, fsplat(127.4999961f)), fsplat(-127.4999961f))); /* x0: b clamped to [-127.4999961, 127.4999961]. */ + const auto x1 = eval(x0 + fsplat(0.5f)); + const auto x2 = eval(llvm_calli{"llvm.x86.sse2.cvtps2dq", {x1}} - noncast(zext(fcmp_ord(x1 <= fsplat(0))))); /* x2: cvtps2dq(x1), minus 1 in lanes where x1 <= 0 (zero-extended compare result). */ + const auto x3 = eval(x0 - fpcast(x2)); /* x3: what is left of x0 after subtracting x2 converted back to float. */ + const auto x4 = eval(x3 * x3); + const auto x5 = eval(x3 * fmuladd(fmuladd(x4, fsplat(0.023093347705f), fsplat(20.20206567f)), x4, fsplat(1513.906801f))); + const auto x6 = eval(x5 * fre(fmuladd(x4, fsplat(233.1842117f), fsplat(4368.211667f)) - x5)); /* x5, x6: terms built from x3 and x4 = x3 * x3; presumably a rational approximation of 2^x3 - TODO confirm the constants. */ + 
set_vr(op.vd, (x6 + x6 + fsplat(1.0f)) * bitcast((x2 + 127) << 23)); /* Result: (2 * x6 + 1) multiplied by the value whose bits are (x2 + 127) << 23; presumably that factor is 2^x2 formed through the float exponent field - TODO confirm. */ } -extern __m128 sse_log2_ps(__m128); - void PPUTranslator::VLOGEFP(ppu_opcode_t op) { - set_vr(op.vd, call("__vlogefp", &sse_log2_ps, get_vr(op.vb))); + const auto b = get_vr(op.vb); /* Inline IR replacing the removed __vlogefp helper call; b is the vb operand. */ + const auto _1 = fsplat(1.0f); + const auto _c = fsplat(1.442695040f); /* NOTE(review): presumably log2(e) - confirm. */ + const auto x0 = eval(fmax(b, bitcast(splat(0x00800000)))); /* x0: b bounded below by the value with bit pattern 0x00800000. */ + const auto x1 = eval(bitcast((bitcast(x0) & 0x807fffff) | bitcast(_1))); /* x1: bits of x0 masked with 0x807fffff, OR-ed with the bits of _1. */ + const auto x2 = eval(fre(x1 + _1)); + const auto x3 = eval((x1 - _1) * x2); + const auto x4 = eval(x3 + x3); + const auto x5 = eval(x4 * x4); /* x2..x5: x4 = 2 * (x1 - 1) * fre(x1 + 1), x5 = x4 * x4. */ + const auto x6 = eval(fmuladd(fmuladd(x5, fsplat(-0.7895802789f), fsplat(16.38666457f)), x5, fsplat(-64.1409953f))); + const auto x7 = eval(fre(fmuladd(fmuladd(x5, fsplat(-35.67227983f), fsplat(312.0937664f)), x5, fsplat(-769.6919436f)))); /* x6, x7: a polynomial in x5 and the fre() of a second polynomial in x5. */ + const auto x8 = eval(fpcast(bitcast((bitcast(x0) >> 23) - 127))); /* x8: (bits of x0 >> 23) - 127 converted to float; presumably the unbiased exponent of x0 - TODO confirm. */ + set_vr(op.vd, fmuladd(x5 * x6 * x7 * x4, _c, fmuladd(x4, _c, x8))); /* Result: x5 * x6 * x7 * x4 * _c + (x4 * _c + x8), combined with fmuladd. */ } void PPUTranslator::VMADDFP(ppu_opcode_t op) @@ -2406,11 +2422,8 @@ void PPUTranslator::TW(ppu_opcode_t op) void PPUTranslator::LVSL(ppu_opcode_t op) { - const auto addr = op.ra ? m_ir->CreateAdd(GetGpr(op.ra), GetGpr(op.rb)) : GetGpr(op.rb); - //const auto _add = m_ir->CreateInsertElement(ConstantVector::getSplat(16, m_ir->getInt8(0)), Trunc(m_ir->CreateAnd(addr, 0xf), GetType()), m_ir->getInt32(0)); - //const auto base = ConstantDataVector::get(m_context, std::vector{15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}); - //SetVr(op.vd, m_ir->CreateAdd(base, Shuffle(_add, nullptr, {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}))); - SetVr(op.vd, Call(GetType(), m_pure_attr, "__lvsl", addr)); + const auto addr = value(op.ra ? 
m_ir->CreateAdd(GetGpr(op.ra), GetGpr(op.rb)) : GetGpr(op.rb)); /* addr: GPR[ra] + GPR[rb], or GPR[rb] alone when op.ra is 0. */ + set_vr(op.vd, build(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0) + vsplat(trunc(addr & 0xf))); /* Replaces the removed __lvsl call: the constant vector 15..0 plus a splat of (addr & 0xf). */ } void PPUTranslator::LVEBX(ppu_opcode_t op) @@ -2570,11 +2583,8 @@ void PPUTranslator::CMPL(ppu_opcode_t op) void PPUTranslator::LVSR(ppu_opcode_t op) { - const auto addr = op.ra ? m_ir->CreateAdd(GetGpr(op.ra), GetGpr(op.rb)) : GetGpr(op.rb); - //const auto _add = m_ir->CreateInsertElement(ConstantVector::getSplat(16, m_ir->getInt8(0)), Trunc(m_ir->CreateAnd(addr, 0xf), GetType()), m_ir->getInt32(0)); - //const auto base = ConstantDataVector::get(m_context, std::vector{31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16}); - //SetVr(op.vd, m_ir->CreateSub(base, Shuffle(_add, nullptr, {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}))); - SetVr(op.vd, Call(GetType(), m_pure_attr, "__lvsr", addr)); + const auto addr = value(op.ra ? m_ir->CreateAdd(GetGpr(op.ra), GetGpr(op.rb)) : GetGpr(op.rb)); /* addr: GPR[ra] + GPR[rb], or GPR[rb] alone when op.ra is 0. */ + set_vr(op.vd, build(31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16) - vsplat(trunc(addr & 0xf))); /* Replaces the removed __lvsr call: the constant vector 31..16 minus a splat of (addr & 0xf). */ } void PPUTranslator::LVEHX(ppu_opcode_t op) @@ -3191,7 +3201,8 @@ void PPUTranslator::DIVW(ppu_opcode_t op) void PPUTranslator::LVLX(ppu_opcode_t op) { const auto addr = op.ra ? m_ir->CreateAdd(GetGpr(op.ra), GetGpr(op.rb)) : GetGpr(op.rb); - SetVr(op.vd, Call(GetType(), "__lvlx", addr)); + const auto data = ReadMemory(m_ir->CreateAnd(addr, ~0xfull), GetType(), m_is_be, 16); /* Read from addr with its low 4 bits cleared. */ + set_vr(op.vd, pshufb(value(data), build(127, 126, 125, 124, 123, 122, 121, 120, 119, 118, 117, 116, 115, 114, 113, 112) + vsplat(trunc(value(addr) & 0xf)))); /* Replaces the removed __lvlx call: pshufb of the read data with indices 127..112 plus (addr & 0xf). */ } void PPUTranslator::LDBRX(ppu_opcode_t op) @@ -3235,7 +3246,8 @@ void PPUTranslator::SRD(ppu_opcode_t op) void PPUTranslator::LVRX(ppu_opcode_t op) { const auto addr = op.ra ? 
m_ir->CreateAdd(GetGpr(op.ra), GetGpr(op.rb)) : GetGpr(op.rb); - SetVr(op.vd, Call(GetType(), "__lvrx", addr)); + const auto data = ReadMemory(m_ir->CreateAnd(addr, ~0xfull), GetType(), m_is_be, 16); /* Read from addr with its low 4 bits cleared. */ + set_vr(op.vd, pshufb(value(data), build(255, 254, 253, 252, 251, 250, 249, 248, 247, 246, 245, 244, 243, 242, 241, 240) + vsplat(trunc(value(addr) & 0xf)))); /* Replaces the removed __lvrx call: pshufb of the read data with indices 255..240 plus (addr & 0xf). */ } void PPUTranslator::LSWI(ppu_opcode_t op) @@ -3308,7 +3320,11 @@ void PPUTranslator::LFDUX(ppu_opcode_t op) void PPUTranslator::STVLX(ppu_opcode_t op) { - Call(GetType(), "__stvlx", op.ra ? m_ir->CreateAdd(GetGpr(op.ra), GetGpr(op.rb)) : GetGpr(op.rb), GetVr(op.vs, VrType::vi8)); + const auto addr = op.ra ? m_ir->CreateAdd(GetGpr(op.ra), GetGpr(op.rb)) : GetGpr(op.rb); /* Inline IR replacing the removed __stvlx helper call. */ + const auto data = pshufb(get_vr(op.vs), build(127, 126, 125, 124, 123, 122, 121, 120, 119, 118, 117, 116, 115, 114, 113, 112) + vsplat(trunc(value(addr) & 0xf))); /* data: vs shuffled by pshufb with indices 127..112 plus (addr & 0xf). */ + const auto mask = sext(bitcast(splat(0xffff) << trunc(value(addr) & 0xf))); /* mask: 0xffff shifted left by (addr & 0xf), bitcast and then sign-extended. */ + const auto ptr = value(GetMemory(m_ir->CreateAnd(addr, ~0xfull), GetType())); /* ptr: memory pointer for addr with its low 4 bits cleared. */ + eval(llvm_calli{"llvm.x86.sse2.maskmov.dqu", {data, mask, ptr}}); /* Emit the llvm.x86.sse2.maskmov.dqu intrinsic with data, mask and ptr. */ } void PPUTranslator::STDBRX(ppu_opcode_t op) @@ -3333,7 +3349,11 @@ void PPUTranslator::STFSX(ppu_opcode_t op) void PPUTranslator::STVRX(ppu_opcode_t op) { - Call(GetType(), "__stvrx", op.ra ? m_ir->CreateAdd(GetGpr(op.ra), GetGpr(op.rb)) : GetGpr(op.rb), GetVr(op.vs, VrType::vi8)); + const auto addr = op.ra ? m_ir->CreateAdd(GetGpr(op.ra), GetGpr(op.rb)) : GetGpr(op.rb); /* Inline IR replacing the removed __stvrx helper call. */ + const auto data = pshufb(get_vr(op.vs), build(255, 254, 253, 252, 251, 250, 249, 248, 247, 246, 245, 244, 243, 242, 241, 240) + vsplat(trunc(value(addr) & 0xf))); /* data: vs shuffled by pshufb with indices 255..240 plus (addr & 0xf). */ + const auto mask = sext(bitcast(trunc(splat(0xffff) << (value(addr) & 0xf) >> 16))); /* mask: (0xffff << (addr & 0xf)) >> 16, truncated, bitcast and then sign-extended. */ + const auto ptr = value(GetMemory(m_ir->CreateAnd(addr, ~0xfull), GetType())); /* ptr: memory pointer for addr with its low 4 bits cleared. */ + eval(llvm_calli{"llvm.x86.sse2.maskmov.dqu", {data, mask, ptr}}); /* Emit the llvm.x86.sse2.maskmov.dqu intrinsic with data, mask and ptr. */ } void PPUTranslator::STFSUX(ppu_opcode_t op)