From abe498f35c298e55612bfd29a1bd51dc40ee797d Mon Sep 17 00:00:00 2001 From: Nekotekina Date: Wed, 10 Feb 2021 20:20:14 +0300 Subject: [PATCH] PPU LLVM: modernize some code with new DSL PPU: rewritten instructions VMHADDSHS, VMHRADDSHS PPU: added optimized path for VPERM (ra=rb) --- rpcs3/Emu/Cell/PPUThread.cpp | 2 +- rpcs3/Emu/Cell/PPUTranslator.cpp | 505 ++++++++++++++----------------- rpcs3/Emu/Cell/PPUTranslator.h | 3 - 3 files changed, 223 insertions(+), 287 deletions(-) diff --git a/rpcs3/Emu/Cell/PPUThread.cpp b/rpcs3/Emu/Cell/PPUThread.cpp index 85f65431bd..6bbb3ff56d 100644 --- a/rpcs3/Emu/Cell/PPUThread.cpp +++ b/rpcs3/Emu/Cell/PPUThread.cpp @@ -3280,7 +3280,7 @@ bool ppu_initialize(const ppu_module& info, bool check_only) settings += ppu_settings::greedy_mode; // Write version, hash, CPU, settings - fmt::append(obj_name, "v4-kusa-%s-%s-%s.obj", fmt::base57(output, 16), fmt::base57(settings), jit_compiler::cpu(g_cfg.core.llvm_cpu)); + fmt::append(obj_name, "v5-kusa-%s-%s-%s.obj", fmt::base57(output, 16), fmt::base57(settings), jit_compiler::cpu(g_cfg.core.llvm_cpu)); } if (Emu.IsStopped()) diff --git a/rpcs3/Emu/Cell/PPUTranslator.cpp b/rpcs3/Emu/Cell/PPUTranslator.cpp index a12f572b0a..90772c0518 100644 --- a/rpcs3/Emu/Cell/PPUTranslator.cpp +++ b/rpcs3/Emu/Cell/PPUTranslator.cpp @@ -547,26 +547,6 @@ std::pair PPUTranslator::SaturateSigned(Value* value, u64 min, u return{sat_h.first, m_ir->CreateOr(sat_l.second, sat_h.second)}; } -Value* PPUTranslator::Scale(Value* value, s32 scale) -{ - if (scale) - { - const auto type = value->getType(); - const auto power = std::pow(2, scale); - - if (auto v = dyn_cast(type)) - { - return m_ir->CreateFMul(value, ConstantVector::getSplat({v->getNumElements(), false}, ConstantFP::get(v->getElementType(), power))); - } - else - { - return m_ir->CreateFMul(value, ConstantFP::get(type, power)); - } - } - - return value; -} - Value* PPUTranslator::Shuffle(Value* left, Value* right, std::initializer_list indices) { const auto type = left->getType(); @@ -698,23 +678,19 @@ void PPUTranslator::MTVSCR(ppu_opcode_t op) void PPUTranslator::VADDCUW(ppu_opcode_t op) { - const auto a = get_vr(op.va); - const auto b = get_vr(op.vb); + const auto [a, b] = get_vrs(op.va, op.vb); set_vr(op.vd, zext(a + b < a)); } void PPUTranslator::VADDFP(ppu_opcode_t op) { - const auto a = get_vr(op.va); - const auto b = get_vr(op.vb); - + const auto [a, b] = get_vrs(op.va, op.vb); set_vr(op.vd, vec_handle_result(a + b)); } void PPUTranslator::VADDSBS(ppu_opcode_t op) { - const auto a = get_vr(op.va); - const auto b = get_vr(op.vb); + const auto [a, b] = get_vrs(op.va, op.vb); const auto r = add_sat(a, b); set_vr(op.vd, r); SetSat(IsNotZero(eval(r != (a + b)).value)); @@ -722,8 +698,7 @@ void PPUTranslator::VADDSBS(ppu_opcode_t op) void PPUTranslator::VADDSHS(ppu_opcode_t op) { - const auto a = get_vr(op.va); - const auto b = get_vr(op.vb); + const auto [a, b] = get_vrs(op.va, op.vb); const auto r = add_sat(a, b); set_vr(op.vd, r); SetSat(IsNotZero(eval(r != (a + b)).value)); @@ -731,8 +706,7 @@ void PPUTranslator::VADDSHS(ppu_opcode_t op) void PPUTranslator::VADDSWS(ppu_opcode_t op) { - const auto a = get_vr(op.va); - const auto b = get_vr(op.vb); + const auto [a, b] = get_vrs(op.va, op.vb); const auto r = add_sat(a, b); set_vr(op.vd, r); SetSat(IsNotZero(eval(r != (a + b)).value)); @@ -740,15 +714,13 @@ void PPUTranslator::VADDSWS(ppu_opcode_t op) void PPUTranslator::VADDUBM(ppu_opcode_t op) { - const auto a = get_vr(op.va); - const auto b = get_vr(op.vb); - set_vr(op.vd, eval(a + b)); + const auto [a, b] = get_vrs(op.va, op.vb); + set_vr(op.vd, a + b); } void PPUTranslator::VADDUBS(ppu_opcode_t op) { - const auto a = get_vr(op.va); - const auto b = get_vr(op.vb); + const auto [a, b] = get_vrs(op.va, op.vb); const auto r = add_sat(a, b); set_vr(op.vd, r); SetSat(IsNotZero(eval(r != (a + b)).value)); @@ -756,15 +728,13 @@ void PPUTranslator::VADDUBS(ppu_opcode_t op) void PPUTranslator::VADDUHM(ppu_opcode_t op) { - const auto a = get_vr(op.va); - const auto b = get_vr(op.vb); - set_vr(op.vd, eval(a + b)); + const auto [a, b] = get_vrs(op.va, op.vb); + set_vr(op.vd, a + b); } void PPUTranslator::VADDUHS(ppu_opcode_t op) { - const auto a = get_vr(op.va); - const auto b = get_vr(op.vb); + const auto [a, b] = get_vrs(op.va, op.vb); const auto r = add_sat(a, b); set_vr(op.vd, r); SetSat(IsNotZero(eval(r != (a + b)).value)); @@ -772,15 +742,13 @@ void PPUTranslator::VADDUHS(ppu_opcode_t op) void PPUTranslator::VADDUWM(ppu_opcode_t op) { - const auto a = get_vr(op.va); - const auto b = get_vr(op.vb); - set_vr(op.vd, eval(a + b)); + const auto [a, b] = get_vrs(op.va, op.vb); + set_vr(op.vd, a + b); } void PPUTranslator::VADDUWS(ppu_opcode_t op) { - const auto a = get_vr(op.va); - const auto b = get_vr(op.vb); + const auto [a, b] = get_vrs(op.va, op.vb); const auto r = add_sat(a, b); set_vr(op.vd, r); SetSat(IsNotZero(eval(r != (a + b)).value)); @@ -788,208 +756,194 @@ void PPUTranslator::VADDUWS(ppu_opcode_t op) void PPUTranslator::VAND(ppu_opcode_t op) { - const auto a = get_vr(op.va); - const auto b = get_vr(op.vb); - set_vr(op.vd, eval(a & b)); + const auto [a, b] = get_vrs(op.va, op.vb); + set_vr(op.vd, a & b); } void PPUTranslator::VANDC(ppu_opcode_t op) { - const auto a = get_vr(op.va); - const auto b = get_vr(op.vb); - set_vr(op.vd, eval(a & ~b)); + const auto [a, b] = get_vrs(op.va, op.vb); + set_vr(op.vd, a & ~b); } void PPUTranslator::VAVGSB(ppu_opcode_t op) { - const auto a = get_vr(op.va); - const auto b = get_vr(op.vb); - set_vr(op.vd, eval(avg(a, b))); + const auto [a, b] = get_vrs(op.va, op.vb); + set_vr(op.vd, avg(a, b)); } void PPUTranslator::VAVGSH(ppu_opcode_t op) { - const auto a = get_vr(op.va); - const auto b = get_vr(op.vb); - set_vr(op.vd, eval(avg(a, b))); + const auto [a, b] = get_vrs(op.va, op.vb); + set_vr(op.vd, avg(a, b)); } void PPUTranslator::VAVGSW(ppu_opcode_t op) { - const auto a = get_vr(op.va); - const auto b = get_vr(op.vb); - set_vr(op.vd, eval(avg(a, b))); + const auto [a, b] = get_vrs(op.va, op.vb); + set_vr(op.vd, avg(a, b)); } void PPUTranslator::VAVGUB(ppu_opcode_t op) { - const auto a = get_vr(op.va); - const auto b = get_vr(op.vb); - set_vr(op.vd, eval(avg(a, b))); + const auto [a, b] = get_vrs(op.va, op.vb); + set_vr(op.vd, avg(a, b)); } void PPUTranslator::VAVGUH(ppu_opcode_t op) { - const auto a = get_vr(op.va); - const auto b = get_vr(op.vb); - set_vr(op.vd, eval(avg(a, b))); + const auto [a, b] = get_vrs(op.va, op.vb); + set_vr(op.vd, avg(a, b)); } void PPUTranslator::VAVGUW(ppu_opcode_t op) { - const auto a = get_vr(op.va); - const auto b = get_vr(op.vb); - set_vr(op.vd, eval(avg(a, b))); + const auto [a, b] = get_vrs(op.va, op.vb); + set_vr(op.vd, avg(a, b)); } void PPUTranslator::VCFSX(ppu_opcode_t op) { - const auto b = GetVr(op.vb, VrType::vi32); - SetVr(op.vd, Scale(m_ir->CreateSIToFP(b, GetType()), 0 - op.vuimm)); + const auto b = get_vr(op.vb); + set_vr(op.vd, fpcast(b) * fsplat(std::pow(2, -static_cast(op.vuimm)))); } void PPUTranslator::VCFUX(ppu_opcode_t op) { - const auto b = GetVr(op.vb, VrType::vi32); - SetVr(op.vd, Scale(m_ir->CreateUIToFP(b, GetType()), 0 - op.vuimm)); + const auto b = get_vr(op.vb); + set_vr(op.vd, fpcast(b) * fsplat(std::pow(2, -static_cast(op.vuimm)))); } void PPUTranslator::VCMPBFP(ppu_opcode_t op) { - const auto ab = GetVrs(VrType::vf, op.va, op.vb); - const auto nle = m_ir->CreateFCmpUGT(ab[0], ab[1]); - const auto nge = m_ir->CreateFCmpULT(ab[0], m_ir->CreateFNeg(ab[1])); - const auto le_bit = m_ir->CreateShl(ZExt(nle, GetType()), 31); - const auto ge_bit = m_ir->CreateShl(ZExt(nge, GetType()), 30); - const auto result = m_ir->CreateOr(le_bit, ge_bit); - SetVr(op.vd, result); - if (op.oe) SetCrField(6, m_ir->getFalse(), m_ir->getFalse(), IsZero(result), m_ir->getFalse()); + const auto [a, b] = get_vrs(op.va, op.vb); + const auto nle = sext(fcmp_uno(a > b)) & 0x8000'0000; + const auto nge = sext(fcmp_uno(a < -b)) & 0x4000'0000; + const auto r = eval(nle | nge); + set_vr(op.vd, r); + if (op.oe) SetCrField(6, m_ir->getFalse(), m_ir->getFalse(), IsZero(r.value), m_ir->getFalse()); } void PPUTranslator::VCMPEQFP(ppu_opcode_t op) { - const auto ab = GetVrs(VrType::vf, op.va, op.vb); - const auto result = m_ir->CreateFCmpOEQ(ab[0], ab[1]); - SetVr(op.vd, result); - if (op.oe) SetCrField(6, IsOnes(result), m_ir->getFalse(), IsZero(result), m_ir->getFalse()); + const auto [a, b] = get_vrs(op.va, op.vb); + const auto r = eval(sext(fcmp_ord(a == b))); + set_vr(op.vd, r); + if (op.oe) SetCrField(6, IsOnes(r.value), m_ir->getFalse(), IsZero(r.value), m_ir->getFalse()); } void PPUTranslator::VCMPEQUB(ppu_opcode_t op) { - const auto ab = GetVrs(VrType::vi8, op.va, op.vb); - const auto result = m_ir->CreateICmpEQ(ab[0], ab[1]); - SetVr(op.vd, result); - if (op.oe) SetCrField(6, IsOnes(result), m_ir->getFalse(), IsZero(result), m_ir->getFalse()); + const auto [a, b] = get_vrs(op.va, op.vb); + const auto r = eval(sext(a == b)); + set_vr(op.vd, r); + if (op.oe) SetCrField(6, IsOnes(r.value), m_ir->getFalse(), IsZero(r.value), m_ir->getFalse()); } void PPUTranslator::VCMPEQUH(ppu_opcode_t op) { - const auto ab = GetVrs(VrType::vi16, op.va, op.vb); - const auto result = m_ir->CreateICmpEQ(ab[0], ab[1]); - SetVr(op.vd, result); - if (op.oe) SetCrField(6, IsOnes(result), m_ir->getFalse(), IsZero(result), m_ir->getFalse()); + const auto [a, b] = get_vrs(op.va, op.vb); + const auto r = eval(sext(a == b)); + set_vr(op.vd, r); + if (op.oe) SetCrField(6, IsOnes(r.value), m_ir->getFalse(), IsZero(r.value), m_ir->getFalse()); } void PPUTranslator::VCMPEQUW(ppu_opcode_t op) { - const auto ab = GetVrs(VrType::vi32, op.va, op.vb); - const auto result = m_ir->CreateICmpEQ(ab[0], ab[1]); - SetVr(op.vd, result); - if (op.oe) SetCrField(6, IsOnes(result), m_ir->getFalse(), IsZero(result), m_ir->getFalse()); + const auto [a, b] = get_vrs(op.va, op.vb); + const auto r = eval(sext(a == b)); + set_vr(op.vd, r); + if (op.oe) SetCrField(6, IsOnes(r.value), m_ir->getFalse(), IsZero(r.value), m_ir->getFalse()); } void PPUTranslator::VCMPGEFP(ppu_opcode_t op) { - const auto ab = GetVrs(VrType::vf, op.va, op.vb); - const auto result = m_ir->CreateFCmpOGE(ab[0], ab[1]); - SetVr(op.vd, result); - if (op.oe) SetCrField(6, IsOnes(result), m_ir->getFalse(), IsZero(result), m_ir->getFalse()); + const auto [a, b] = get_vrs(op.va, op.vb); + const auto r = eval(sext(fcmp_ord(a >= b))); + set_vr(op.vd, r); + if (op.oe) SetCrField(6, IsOnes(r.value), m_ir->getFalse(), IsZero(r.value), m_ir->getFalse()); } void PPUTranslator::VCMPGTFP(ppu_opcode_t op) { - const auto ab = GetVrs(VrType::vf, op.va, op.vb); - const auto result = m_ir->CreateFCmpOGT(ab[0], ab[1]); - SetVr(op.vd, result); - if (op.oe) SetCrField(6, IsOnes(result), m_ir->getFalse(), IsZero(result), m_ir->getFalse()); + const auto [a, b] = get_vrs(op.va, op.vb); + const auto r = eval(sext(fcmp_ord(a > b))); + set_vr(op.vd, r); + if (op.oe) SetCrField(6, IsOnes(r.value), m_ir->getFalse(), IsZero(r.value), m_ir->getFalse()); } void PPUTranslator::VCMPGTSB(ppu_opcode_t op) { - const auto ab = GetVrs(VrType::vi8, op.va, op.vb); - const auto result = m_ir->CreateICmpSGT(ab[0], ab[1]); - SetVr(op.vd, result); - if (op.oe) SetCrField(6, IsOnes(result), m_ir->getFalse(), IsZero(result), m_ir->getFalse()); + const auto [a, b] = get_vrs(op.va, op.vb); + const auto r = eval(sext(a > b)); + set_vr(op.vd, r); + if (op.oe) SetCrField(6, IsOnes(r.value), m_ir->getFalse(), IsZero(r.value), m_ir->getFalse()); } void PPUTranslator::VCMPGTSH(ppu_opcode_t op) { - const auto ab = GetVrs(VrType::vi16, op.va, op.vb); - const auto result = m_ir->CreateICmpSGT(ab[0], ab[1]); - SetVr(op.vd, result); - if (op.oe) SetCrField(6, IsOnes(result), m_ir->getFalse(), IsZero(result), m_ir->getFalse()); + const auto [a, b] = get_vrs(op.va, op.vb); + const auto r = eval(sext(a > b)); + set_vr(op.vd, r); + if (op.oe) SetCrField(6, IsOnes(r.value), m_ir->getFalse(), IsZero(r.value), m_ir->getFalse()); } void PPUTranslator::VCMPGTSW(ppu_opcode_t op) { - const auto ab = GetVrs(VrType::vi32, op.va, op.vb); - const auto result = m_ir->CreateICmpSGT(ab[0], ab[1]); - SetVr(op.vd, result); - if (op.oe) SetCrField(6, IsOnes(result), m_ir->getFalse(), IsZero(result), m_ir->getFalse()); + const auto [a, b] = get_vrs(op.va, op.vb); + const auto r = eval(sext(a > b)); + set_vr(op.vd, r); + if (op.oe) SetCrField(6, IsOnes(r.value), m_ir->getFalse(), IsZero(r.value), m_ir->getFalse()); } void PPUTranslator::VCMPGTUB(ppu_opcode_t op) { - const auto ab = GetVrs(VrType::vi8, op.va, op.vb); - const auto result = m_ir->CreateICmpUGT(ab[0], ab[1]); - SetVr(op.vd, result); - if (op.oe) SetCrField(6, IsOnes(result), m_ir->getFalse(), IsZero(result), m_ir->getFalse()); + const auto [a, b] = get_vrs(op.va, op.vb); + const auto r = eval(sext(a > b)); + set_vr(op.vd, r); + if (op.oe) SetCrField(6, IsOnes(r.value), m_ir->getFalse(), IsZero(r.value), m_ir->getFalse()); } void PPUTranslator::VCMPGTUH(ppu_opcode_t op) { - const auto ab = GetVrs(VrType::vi16, op.va, op.vb); - const auto result = m_ir->CreateICmpUGT(ab[0], ab[1]); - SetVr(op.vd, result); - if (op.oe) SetCrField(6, IsOnes(result), m_ir->getFalse(), IsZero(result), m_ir->getFalse()); + const auto [a, b] = get_vrs(op.va, op.vb); + const auto r = eval(sext(a > b)); + set_vr(op.vd, r); + if (op.oe) SetCrField(6, IsOnes(r.value), m_ir->getFalse(), IsZero(r.value), m_ir->getFalse()); } void PPUTranslator::VCMPGTUW(ppu_opcode_t op) { - const auto ab = GetVrs(VrType::vi32, op.va, op.vb); - const auto result = m_ir->CreateICmpUGT(ab[0], ab[1]); - SetVr(op.vd, result); - if (op.oe) SetCrField(6, IsOnes(result), m_ir->getFalse(), IsZero(result), m_ir->getFalse()); + const auto [a, b] = get_vrs(op.va, op.vb); + const auto r = eval(sext(a > b)); + set_vr(op.vd, r); + if (op.oe) SetCrField(6, IsOnes(r.value), m_ir->getFalse(), IsZero(r.value), m_ir->getFalse()); } -// TODO: remove this (wrong casts) -#define FP_SAT_OP(fcmp, value) m_ir->CreateSelect(fcmp, cast(cast(fcmp)->getOperand(1)), value) - void PPUTranslator::VCTSXS(ppu_opcode_t op) { - const auto b = GetVr(op.vb, VrType::vf); - const auto scaled = Scale(b, op.vuimm); - //const auto const0 = ConstantVector::getSplat(4, ConstantFP::get(GetType(), 0.0)); - const auto const1 = ConstantVector::getSplat({4, false}, ConstantFP::get(GetType(), -std::pow(2, 31))); - //const auto is_nan = m_ir->CreateFCmpUNO(b, const0); // NaN -> 0.0 - const auto sat_l = m_ir->CreateFCmpOLT(scaled, const1); // TODO ??? - const auto sat_h = m_ir->CreateFCmpOGE(scaled, ConstantVector::getSplat({4, false}, ConstantFP::get(GetType(), std::pow(2, 31)))); - const auto converted = m_ir->CreateFPToSI(m_ir->CreateSelect(sat_l, const1, scaled), GetType()); - SetVr(op.vd, m_ir->CreateSelect(sat_h, ConstantVector::getSplat({4, false}, m_ir->getInt32(0x7fffffff)), converted)); - SetSat(IsNotZero(m_ir->CreateOr(sat_l, sat_h))); + const auto b = get_vr(op.vb); + const auto scaled = b * fsplat(std::pow(2, 0 + op.vuimm)); + const auto const1 = fsplat(-std::pow(2, 31)); + //const auto is_nan = fcmp_uno(b == b); // NaN -> 0.0 + const auto sat_l = fcmp_ord(scaled < const1); // TODO ??? + const auto sat_h = fcmp_ord(scaled >= fsplat(std::pow(2, 31))); + const auto converted = fpcast(select(sat_l, const1, scaled)); + set_vr(op.vd, select(sat_h, splat(0x7fff'ffff), converted)); + SetSat(IsNotZero(eval(sat_l | sat_h).value)); } void PPUTranslator::VCTUXS(ppu_opcode_t op) { - const auto b = GetVr(op.vb, VrType::vf); - const auto scaled = Scale(b, op.vuimm); - const auto const0 = ConstantVector::getSplat({4, false}, ConstantFP::get(GetType(), 0.0)); - //const auto is_nan = m_ir->CreateFCmpUNO(b, const0); // NaN -> 0.0 - const auto sat_l = m_ir->CreateFCmpOLT(scaled, const0); - const auto sat_h = m_ir->CreateFCmpOGE(scaled, ConstantVector::getSplat({4, false}, ConstantFP::get(GetType(), std::pow(2, 32)))); // TODO ??? - const auto converted = m_ir->CreateFPToUI(m_ir->CreateSelect(sat_l, const0, scaled), GetType()); - SetVr(op.vd, m_ir->CreateSelect(sat_h, ConstantVector::getSplat({4, false}, m_ir->getInt32(0xffffffff)), converted)); - SetSat(IsNotZero(m_ir->CreateOr(sat_l, sat_h))); + const auto b = get_vr(op.vb); + const auto scaled = b * fsplat(std::pow(2, 0 + op.vuimm)); + const auto const0 = fsplat(0.); + //const auto is_nan = fcmp_uno(b == b); // NaN -> 0.0 + const auto sat_l = fcmp_ord(scaled < const0); + const auto sat_h = fcmp_ord(scaled >= fsplat(std::pow(2, 32))); // TODO ??? + const auto converted = fpcast(select(sat_l, const0, scaled)); + set_vr(op.vd, select(sat_h, splat(0xffff'ffff), converted)); + SetSat(IsNotZero(eval(sat_l | sat_h).value)); } void PPUTranslator::VEXPTEFP(ppu_opcode_t op) @@ -1026,17 +980,17 @@ void PPUTranslator::VMADDFP(ppu_opcode_t op) if (m_use_fma) { - SetVr(op.vd, VecHandleResult(m_ir->CreateCall(get_intrinsic(llvm::Intrinsic::fma), { a.value, c.value, b.value }))); + set_vr(op.vd, vec_handle_result(fmuladd(a, c, b))); return; } - // Emulated FMA via double precision - const auto xa = m_ir->CreateFPExt(a.value, get_type()); - const auto xb = m_ir->CreateFPExt(b.value, get_type()); - const auto xc = m_ir->CreateFPExt(c.value, get_type()); + // Emulated FMA via double precision (caution: out-of-lane algorithm) + const auto xa = fpcast(a); + const auto xb = fpcast(b); + const auto xc = fpcast(c); - const auto xr = m_ir->CreateCall(get_intrinsic(llvm::Intrinsic::fmuladd), {xa, xc, xb}); - SetVr(op.vd, VecHandleResult(m_ir->CreateFPTrunc(xr, get_type()))); + const auto xr = fmuladd(xa, xc, xb); + set_vr(op.vd, vec_handle_result(fpcast(xr))); } void PPUTranslator::VMAXFP(ppu_opcode_t op) @@ -1047,68 +1001,58 @@ void PPUTranslator::VMAXFP(ppu_opcode_t op) void PPUTranslator::VMAXSB(ppu_opcode_t op) { - const auto ab = GetVrs(VrType::vi8, op.va, op.vb); - SetVr(op.vd, Saturate(ab[0], CmpInst::ICMP_SLT, ab[1]).first); + const auto [a, b] = get_vrs(op.va, op.vb); + set_vr(op.vd, max(a, b)); } void PPUTranslator::VMAXSH(ppu_opcode_t op) { - const auto ab = GetVrs(VrType::vi16, op.va, op.vb); - SetVr(op.vd, Saturate(ab[0], CmpInst::ICMP_SLT, ab[1]).first); + const auto [a, b] = get_vrs(op.va, op.vb); + set_vr(op.vd, max(a, b)); } void PPUTranslator::VMAXSW(ppu_opcode_t op) { - const auto ab = GetVrs(VrType::vi32, op.va, op.vb); - SetVr(op.vd, Saturate(ab[0], CmpInst::ICMP_SLT, ab[1]).first); + const auto [a, b] = get_vrs(op.va, op.vb); + set_vr(op.vd, max(a, b)); } void PPUTranslator::VMAXUB(ppu_opcode_t op) { - const auto ab = GetVrs(VrType::vi8, op.va, op.vb); - SetVr(op.vd, Saturate(ab[0], CmpInst::ICMP_ULT, ab[1]).first); + const auto [a, b] = get_vrs(op.va, op.vb); + set_vr(op.vd, max(a, b)); } void PPUTranslator::VMAXUH(ppu_opcode_t op) { - const auto ab = GetVrs(VrType::vi16, op.va, op.vb); - SetVr(op.vd, Saturate(ab[0], CmpInst::ICMP_ULT, ab[1]).first); + const auto [a, b] = get_vrs(op.va, op.vb); + set_vr(op.vd, max(a, b)); } void PPUTranslator::VMAXUW(ppu_opcode_t op) { - const auto ab = GetVrs(VrType::vi32, op.va, op.vb); - SetVr(op.vd, Saturate(ab[0], CmpInst::ICMP_ULT, ab[1]).first); + const auto [a, b] = get_vrs(op.va, op.vb); + set_vr(op.vd, max(a, b)); } void PPUTranslator::VMHADDSHS(ppu_opcode_t op) { - const auto abc = SExt(GetVrs(VrType::vi16, op.va, op.vb, op.vc)); - const auto result = m_ir->CreateAdd(m_ir->CreateAShr(m_ir->CreateMul(abc[0], abc[1]), 15), abc[2]); - const auto saturated = SaturateSigned(result, -0x8000, 0x7fff); - SetVr(op.vd, saturated.first); - SetSat(IsNotZero(saturated.second)); - - // const auto a = get_vr(op.va); - // const auto b = get_vr(op.vb); - // const auto c = get_vr(op.vc); - // value_t m; - // m.value = m_ir->CreateShl(Trunc(m_ir->CreateAShr(m_ir->CreateMul(SExt(a.value), SExt(b.value)), 16)), 1); - // m.value = m_ir->CreateOr(m.value, m_ir->CreateLShr(m_ir->CreateMul(a.value, b.value), 15)); - // const auto s = eval(c + m); - // const auto z = eval((c >> 15) ^ 0x7fff); - // const auto x = eval(((m ^ s) & ~(c ^ m)) >> 15); - // set_vr(op.vd, eval((z & x) | (s & ~x))); - //SetSat(IsNotZero(saturated.second)); + // Caution: out-of-lane algorithm + const auto [a, b, c] = get_vrs(op.va, op.vb, op.vc); + const auto m = ((sext(a) * sext(b)) >> 15) + sext(c); + const auto r = trunc(min(max(m, splat(-0x8000)), splat(0x7fff))); + set_vr(op.vd, r); + SetSat(IsNotZero(eval((m + 0x8000) >> 16).value)); } void PPUTranslator::VMHRADDSHS(ppu_opcode_t op) { - const auto abc = SExt(GetVrs(VrType::vi16, op.va, op.vb, op.vc)); - const auto result = m_ir->CreateAdd(m_ir->CreateAShr(m_ir->CreateAdd(m_ir->CreateMul(abc[0], abc[1]), ConstantVector::getSplat({8, false}, m_ir->getInt32(0x4000))), 15), abc[2]); - const auto saturated = SaturateSigned(result, -0x8000, 0x7fff); - SetVr(op.vd, saturated.first); - SetSat(IsNotZero(saturated.second)); + // Caution: out-of-lane algorithm + const auto [a, b, c] = get_vrs(op.va, op.vb, op.vc); + const auto m = ((sext(a) * sext(b) + splat(0x4000)) >> 15) + sext(c); + const auto r = trunc(min(max(m, splat(-0x8000)), splat(0x7fff))); + set_vr(op.vd, r); + SetSat(IsNotZero(eval((m + 0x8000) >> 16).value)); } void PPUTranslator::VMINFP(ppu_opcode_t op) @@ -1119,80 +1063,80 @@ void PPUTranslator::VMINFP(ppu_opcode_t op) void PPUTranslator::VMINSB(ppu_opcode_t op) { - const auto ab = GetVrs(VrType::vi8, op.va, op.vb); - SetVr(op.vd, Saturate(ab[0], CmpInst::ICMP_SGT, ab[1]).first); + const auto [a, b] = get_vrs(op.va, op.vb); + set_vr(op.vd, min(a, b)); } void PPUTranslator::VMINSH(ppu_opcode_t op) { - const auto ab = GetVrs(VrType::vi16, op.va, op.vb); - SetVr(op.vd, Saturate(ab[0], CmpInst::ICMP_SGT, ab[1]).first); + const auto [a, b] = get_vrs(op.va, op.vb); + set_vr(op.vd, min(a, b)); } void PPUTranslator::VMINSW(ppu_opcode_t op) { - const auto ab = GetVrs(VrType::vi32, op.va, op.vb); - SetVr(op.vd, Saturate(ab[0], CmpInst::ICMP_SGT, ab[1]).first); + const auto [a, b] = get_vrs(op.va, op.vb); + set_vr(op.vd, min(a, b)); } void PPUTranslator::VMINUB(ppu_opcode_t op) { - const auto ab = GetVrs(VrType::vi8, op.va, op.vb); - SetVr(op.vd, Saturate(ab[0], CmpInst::ICMP_UGT, ab[1]).first); + const auto [a, b] = get_vrs(op.va, op.vb); + set_vr(op.vd, min(a, b)); } void PPUTranslator::VMINUH(ppu_opcode_t op) { - const auto ab = GetVrs(VrType::vi16, op.va, op.vb); - SetVr(op.vd, Saturate(ab[0], CmpInst::ICMP_UGT, ab[1]).first); + const auto [a, b] = get_vrs(op.va, op.vb); + set_vr(op.vd, min(a, b)); } void PPUTranslator::VMINUW(ppu_opcode_t op) { - const auto ab = GetVrs(VrType::vi32, op.va, op.vb); - SetVr(op.vd, Saturate(ab[0], CmpInst::ICMP_UGT, ab[1]).first); + const auto [a, b] = get_vrs(op.va, op.vb); + set_vr(op.vd, min(a, b)); } void PPUTranslator::VMLADDUHM(ppu_opcode_t op) { - const auto abc = GetVrs(VrType::vi16, op.va, op.vb, op.vc); - SetVr(op.vd, m_ir->CreateAdd(m_ir->CreateMul(abc[0], abc[1]), abc[2])); + const auto [a, b, c] = get_vrs(op.va, op.vb, op.vc); + set_vr(op.vd, a * b + c); } void PPUTranslator::VMRGHB(ppu_opcode_t op) { - const auto ab = GetVrs(VrType::vi8, op.va, op.vb); - SetVr(op.vd, Shuffle(ab[0], ab[1], { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 })); + const auto [a, b] = get_vrs(op.va, op.vb); + set_vr(op.vd, shuffle2(a, b, 24, 8, 25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15)); } void PPUTranslator::VMRGHH(ppu_opcode_t op) { - const auto ab = GetVrs(VrType::vi16, op.va, op.vb); - SetVr(op.vd, Shuffle(ab[0], ab[1], { 0, 8, 1, 9, 2, 10, 3, 11 })); + const auto [a, b] = get_vrs(op.va, op.vb); + set_vr(op.vd, shuffle2(a, b, 12, 4, 13, 5, 14, 6, 15, 7)); } void PPUTranslator::VMRGHW(ppu_opcode_t op) { - const auto ab = GetVrs(VrType::vi32, op.va, op.vb); - SetVr(op.vd, Shuffle(ab[0], ab[1], { 0, 4, 1, 5 })); + const auto [a, b] = get_vrs(op.va, op.vb); + set_vr(op.vd, shuffle2(a, b, 6, 2, 7, 3)); } void PPUTranslator::VMRGLB(ppu_opcode_t op) { - const auto ab = GetVrs(VrType::vi8, op.va, op.vb); - SetVr(op.vd, Shuffle(ab[0], ab[1], { 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31 })); + const auto [a, b] = get_vrs(op.va, op.vb); + set_vr(op.vd, shuffle2(a, b, 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7)); } void PPUTranslator::VMRGLH(ppu_opcode_t op) { - const auto ab = GetVrs(VrType::vi16, op.va, op.vb); - SetVr(op.vd, Shuffle(ab[0], ab[1], { 4, 12, 5, 13, 6, 14, 7, 15 })); + const auto [a, b] = get_vrs(op.va, op.vb); + set_vr(op.vd, shuffle2(a, b, 8, 0, 9, 1, 10, 2, 11, 3)); } void PPUTranslator::VMRGLW(ppu_opcode_t op) { - const auto ab = GetVrs(VrType::vi32, op.va, op.vb); - SetVr(op.vd, Shuffle(ab[0], ab[1], { 2, 6, 3, 7 })); + const auto [a, b] = get_vrs(op.va, op.vb); + set_vr(op.vd, shuffle2(a, b, 4, 0, 5, 1)); } void PPUTranslator::VMSUMMBM(ppu_opcode_t op) @@ -1202,26 +1146,22 @@ void PPUTranslator::VMSUMMBM(ppu_opcode_t op) const auto c = get_vr(op.vc); const auto ml = bitcast((a << 8 >> 8) * noncast(b << 8 >> 8)); const auto mh = bitcast((a >> 8) * noncast(b >> 8)); - set_vr(op.vd, eval(((ml << 16 >> 16) + (ml >> 16)) + ((mh << 16 >> 16) + (mh >> 16)) + c)); + set_vr(op.vd, ((ml << 16 >> 16) + (ml >> 16)) + ((mh << 16 >> 16) + (mh >> 16)) + c); } void PPUTranslator::VMSUMSHM(ppu_opcode_t op) { - const auto a = get_vr(op.va); - const auto b = get_vr(op.vb); - const auto c = get_vr(op.vc); - const auto ml = eval((a << 16 >> 16) * (b << 16 >> 16)); - const auto mh = eval((a >> 16) * (b >> 16)); - set_vr(op.vd, eval(ml + mh + c)); + const auto [a, b, c] = get_vrs(op.va, op.vb, op.vc); + const auto ml = (a << 16 >> 16) * (b << 16 >> 16); + const auto mh = (a >> 16) * (b >> 16); + set_vr(op.vd, ml + mh + c); } void PPUTranslator::VMSUMSHS(ppu_opcode_t op) { - const auto a = get_vr(op.va); - const auto b = get_vr(op.vb); - const auto c = get_vr(op.vc); - const auto ml = eval((a << 16 >> 16) * (b << 16 >> 16)); - const auto mh = eval((a >> 16) * (b >> 16)); + const auto [a, b, c] = get_vrs(op.va, op.vb, op.vc); + const auto ml = (a << 16 >> 16) * (b << 16 >> 16); + const auto mh = (a >> 16) * (b >> 16); const auto m = eval(ml + mh); const auto s = eval(m + c); const auto z = eval((c >> 31) ^ 0x7fffffff); @@ -1233,8 +1173,7 @@ void PPUTranslator::VMSUMSHS(ppu_opcode_t op) void PPUTranslator::VMSUMUBM(ppu_opcode_t op) { - const auto a = get_vr(op.va); - const auto b = get_vr(op.vb); + const auto [a, b] = get_vrs(op.va, op.vb); const auto c = get_vr(op.vc); const auto ml = bitcast((a << 8 >> 8) * (b << 8 >> 8)); const auto mh = bitcast((a >> 8) * (b >> 8)); @@ -1243,21 +1182,17 @@ void PPUTranslator::VMSUMUBM(ppu_opcode_t op) void PPUTranslator::VMSUMUHM(ppu_opcode_t op) { - const auto a = get_vr(op.va); - const auto b = get_vr(op.vb); - const auto c = get_vr(op.vc); - const auto ml = eval((a << 16 >> 16) * (b << 16 >> 16)); - const auto mh = eval((a >> 16) * (b >> 16)); - set_vr(op.vd, eval(ml + mh + c)); + const auto [a, b, c] = get_vrs(op.va, op.vb, op.vc); + const auto ml = (a << 16 >> 16) * (b << 16 >> 16); + const auto mh = (a >> 16) * (b >> 16); + set_vr(op.vd, ml + mh + c); } void PPUTranslator::VMSUMUHS(ppu_opcode_t op) { - const auto a = get_vr(op.va); - const auto b = get_vr(op.vb); - const auto c = get_vr(op.vc); - const auto ml = noncast((a << 16 >> 16) * (b << 16 >> 16)); - const auto mh = noncast((a >> 16) * (b >> 16)); + const auto [a, b, c] = get_vrs(op.va, op.vb, op.vc); + const auto ml = (a << 16 >> 16) * (b << 16 >> 16); + const auto mh = (a >> 16) * (b >> 16); const auto s = eval(ml + mh); const auto s2 = eval(s + c); const auto x = eval((s < ml) | (s2 < s)); @@ -1267,50 +1202,50 @@ void PPUTranslator::VMSUMUHS(ppu_opcode_t op) void PPUTranslator::VMULESB(ppu_opcode_t op) { - const auto ab = GetVrs(VrType::vi16, op.va, op.vb); - SetVr(op.vd, m_ir->CreateMul(m_ir->CreateAShr(ab[0], 8), m_ir->CreateAShr(ab[1], 8))); + const auto [a, b] = get_vrs(op.va, op.vb); + set_vr(op.vd, (a >> 8) * (b >> 8)); } void PPUTranslator::VMULESH(ppu_opcode_t op) { - const auto ab = GetVrs(VrType::vi32, op.va, op.vb); - SetVr(op.vd, m_ir->CreateMul(m_ir->CreateAShr(ab[0], 16), m_ir->CreateAShr(ab[1], 16))); + const auto [a, b] = get_vrs(op.va, op.vb); + set_vr(op.vd, (a >> 16) * (b >> 16)); } void PPUTranslator::VMULEUB(ppu_opcode_t op) { - const auto ab = GetVrs(VrType::vi16, op.va, op.vb); - SetVr(op.vd, m_ir->CreateMul(m_ir->CreateLShr(ab[0], 8), m_ir->CreateLShr(ab[1], 8))); + const auto [a, b] = get_vrs(op.va, op.vb); + set_vr(op.vd, (a >> 8) * (b >> 8)); } void PPUTranslator::VMULEUH(ppu_opcode_t op) { - const auto ab = GetVrs(VrType::vi32, op.va, op.vb); - SetVr(op.vd, m_ir->CreateMul(m_ir->CreateLShr(ab[0], 16), m_ir->CreateLShr(ab[1], 16))); + const auto [a, b] = get_vrs(op.va, op.vb); + set_vr(op.vd, (a >> 16) * (b >> 16)); } void PPUTranslator::VMULOSB(ppu_opcode_t op) { - const auto ab = GetVrs(VrType::vi16, op.va, op.vb); - SetVr(op.vd, m_ir->CreateMul(m_ir->CreateAShr(m_ir->CreateShl(ab[0], 8), 8), m_ir->CreateAShr(m_ir->CreateShl(ab[1], 8), 8))); + const auto [a, b] = get_vrs(op.va, op.vb); + set_vr(op.vd, (a << 8 >> 8) * (b << 8 >> 8)); } void PPUTranslator::VMULOSH(ppu_opcode_t op) { - const auto ab = GetVrs(VrType::vi32, op.va, op.vb); - SetVr(op.vd, m_ir->CreateMul(m_ir->CreateAShr(m_ir->CreateShl(ab[0], 16), 16), m_ir->CreateAShr(m_ir->CreateShl(ab[1], 16), 16))); + const auto [a, b] = get_vrs(op.va, op.vb); + set_vr(op.vd, (a << 16 >> 16) * (b << 16 >> 16)); } void PPUTranslator::VMULOUB(ppu_opcode_t op) { - const auto ab = GetVrs(VrType::vi16, op.va, op.vb); - SetVr(op.vd, m_ir->CreateMul(m_ir->CreateLShr(m_ir->CreateShl(ab[0], 8), 8), m_ir->CreateLShr(m_ir->CreateShl(ab[1], 8), 8))); + const auto [a, b] = get_vrs(op.va, op.vb); + set_vr(op.vd, (a << 8 >> 8) * (b << 8 >> 8)); } void PPUTranslator::VMULOUH(ppu_opcode_t op) { - const auto ab = GetVrs(VrType::vi32, op.va, op.vb); - SetVr(op.vd, m_ir->CreateMul(m_ir->CreateLShr(m_ir->CreateShl(ab[0], 16), 16), m_ir->CreateLShr(m_ir->CreateShl(ab[1], 16), 16))); + const auto [a, b] = get_vrs(op.va, op.vb); + set_vr(op.vd, (a << 16 >> 16) * (b << 16 >> 16)); } void PPUTranslator::VNMSUBFP(ppu_opcode_t op) @@ -1338,40 +1273,42 @@ void PPUTranslator::VNMSUBFP(ppu_opcode_t op) // Differs from the emulated path with regards to negative zero if (m_use_fma) { - SetVr(op.vd, VecHandleResult(m_ir->CreateCall(get_intrinsic(llvm::Intrinsic::fma), { m_ir->CreateFNeg(a.value), c.value, b.value }))); + set_vr(op.vd, vec_handle_result(fmuladd(-a, c, b))); return; } - // Emulated FMA via double precision - const auto xa = m_ir->CreateFPExt(a.value, get_type()); - const auto xb = m_ir->CreateFPExt(b.value, get_type()); - const auto xc = m_ir->CreateFPExt(c.value, get_type()); + // Emulated FMA via double precision (caution: out-of-lane algorithm) + const auto xa = fpcast(a); + const auto xb = fpcast(b); + const auto xc = fpcast(c); - const auto xr = m_ir->CreateFNeg(m_ir->CreateFSub(m_ir->CreateFMul(xa, xc), xb)); - SetVr(op.vd, VecHandleResult(m_ir->CreateFPTrunc(xr, get_type()))); + const auto nr = xa * xc - xb; + set_vr(op.vd, vec_handle_result(fpcast(-nr))); } void PPUTranslator::VNOR(ppu_opcode_t op) { - const auto a = get_vr(op.va); - const auto b = get_vr(op.vb); - set_vr(op.vd, eval(~(a | b))); + const auto [a, b] = get_vrs(op.va, op.vb); + set_vr(op.vd, ~(a | b)); } void PPUTranslator::VOR(ppu_opcode_t op) { - const auto a = get_vr(op.va); - const auto b = get_vr(op.vb); - set_vr(op.vd, eval(a | b)); + const auto [a, b] = get_vrs(op.va, op.vb); + set_vr(op.vd, a | b); } void PPUTranslator::VPERM(ppu_opcode_t op) { - const auto a = get_vr(op.va); - const auto b = get_vr(op.vb); - const auto c = get_vr(op.vc); + const auto [a, b, c] = get_vrs(op.va, op.vb, op.vc); - if (m_use_avx512_icl && op.ra != op.rb) + if (op.ra == op.rb) + { + set_vr(op.vd, pshufb(a, ~c & 0xf)); + return; + } + + if (m_use_avx512_icl) { const auto i = eval(~c); set_vr(op.vd, vperm2b256to128(b, a, i)); @@ -1384,12 +1321,13 @@ void PPUTranslator::VPERM(ppu_opcode_t op) void PPUTranslator::VPKPX(ppu_opcode_t op) { - const auto ab = GetVrs(VrType::vi32, op.va, op.vb); - const auto px = Shuffle(ab[0], ab[1], { 0, 1, 2, 3, 4, 5, 6, 7 }); - const auto e1 = m_ir->CreateLShr(m_ir->CreateAnd(px, 0x01f80000), 9); - const auto e2 = m_ir->CreateLShr(m_ir->CreateAnd(px, 0xf800), 6); - const auto e3 = m_ir->CreateLShr(m_ir->CreateAnd(px, 0xf8), 3); - SetVr(op.vd, m_ir->CreateOr(m_ir->CreateOr(e1, e2), e3)); + // Caution: out-of-lane algorithm + const auto [a, b] = get_vrs(op.va, op.vb); + const auto ab = shuffle2(b, a, 0, 1, 2, 3, 4, 5, 6, 7); + const auto e1 = (ab & 0x01f80000) >> 9; + const auto e2 = (ab & 0xf800) >> 6; + const auto e3 = (ab & 0xf8) >> 3; + set_vr(op.vd, trunc(e1 | e2 | e3)); } void PPUTranslator::VPKSHSS(ppu_opcode_t op) @@ -4703,6 +4641,7 @@ void PPUTranslator::SetVr(u32 vr, Value* value) } } + ensure(value->getType()->getPrimitiveSizeInBits() == 128); RegStore(value, m_vr[vr]); } diff --git a/rpcs3/Emu/Cell/PPUTranslator.h b/rpcs3/Emu/Cell/PPUTranslator.h index f5a14115cb..605ec97b41 100644 --- a/rpcs3/Emu/Cell/PPUTranslator.h +++ b/rpcs3/Emu/Cell/PPUTranslator.h @@ -199,9 +199,6 @@ public: // Saturate signed value (second result is the disjunction of comparison results) std::pair SaturateSigned(llvm::Value* value, u64 min, u64 max); - // Multiply FP value or vector by the pow(2, scale) - llvm::Value* Scale(llvm::Value* value, s32 scale); - // Create shuffle instruction with constant args llvm::Value* Shuffle(llvm::Value* left, llvm::Value* right, std::initializer_list indices);