PPU LLVM: rewrite and optimize saturation bit

Use vector accumulator
This commit is contained in:
Nekotekina 2021-11-26 20:01:29 +03:00
parent 209b14fbac
commit e3e39e8de3
5 changed files with 110 additions and 129 deletions

View File

@ -528,14 +528,14 @@ inline v128 vec_handle_denormal(ppu_thread& ppu, v128 a)
// MFVSCR: writes a VSCR image into ppu.vr[op.vd]. The image is a single 32-bit value passed as the
// last from32() argument (the other three are zero): SAT in bit 0, NJ in bit 16. Always returns true.
bool ppu_interpreter::MFVSCR(ppu_thread& ppu, ppu_opcode_t op)
{
// Form that converts ppu.sat directly to u32 (plain flag)
ppu.vr[op.vd] = v128::from32(0, 0, 0, u32{ppu.sat} | (u32{ppu.nj} << 16));
// Form for a v128 ppu.sat: SAT reads as 1 when the vector compares unequal to a value-initialized v128
ppu.vr[op.vd] = v128::from32(0, 0, 0, u32{ppu.sat != v128{}} | (u32{ppu.nj} << 16));
return true;
}
bool ppu_interpreter::MTVSCR(ppu_thread& ppu, ppu_opcode_t op)
{
const u32 vscr = ppu.vr[op.vb]._u32[3];
ppu.sat = (vscr & 1) != 0;
ppu.sat = v128::from32((vscr & 1) != 0);
ppu.nj = (vscr & 0x10000) != 0;
ppu.jm_mask = ppu.nj ? ppu_inf_u32 : 0x7fff'ffff;
return true;
@ -577,12 +577,12 @@ bool ppu_interpreter_precise::VADDSBS(ppu_thread& ppu, ppu_opcode_t op)
if (sum < INT8_MIN)
{
d._s8[i] = INT8_MIN;
ppu.sat = true;
ppu.sat._u32[0] = 1;
}
else if (sum > INT8_MAX)
{
d._s8[i] = INT8_MAX;
ppu.sat = true;
ppu.sat._u32[0] = 1;
}
else
{
@ -612,12 +612,12 @@ bool ppu_interpreter_precise::VADDSHS(ppu_thread& ppu, ppu_opcode_t op)
if (sum < INT16_MIN)
{
d._s16[i] = INT16_MIN;
ppu.sat = true;
ppu.sat._u32[0] = 1;
}
else if (sum > INT16_MAX)
{
d._s16[i] = INT16_MAX;
ppu.sat = true;
ppu.sat._u32[0] = 1;
}
else
{
@ -654,12 +654,12 @@ bool ppu_interpreter_precise::VADDSWS(ppu_thread& ppu, ppu_opcode_t op)
if (sum < INT32_MIN)
{
d._s32[i] = INT32_MIN;
ppu.sat = true;
ppu.sat._u32[0] = 1;
}
else if (sum > INT32_MAX)
{
d._s32[i] = INT32_MAX;
ppu.sat = true;
ppu.sat._u32[0] = 1;
}
else
{
@ -695,7 +695,7 @@ bool ppu_interpreter_precise::VADDUBS(ppu_thread& ppu, ppu_opcode_t op)
if (sum > UINT8_MAX)
{
d._u8[i] = UINT8_MAX;
ppu.sat = true;
ppu.sat._u32[0] = 1;
}
else
{
@ -731,7 +731,7 @@ bool ppu_interpreter_precise::VADDUHS(ppu_thread& ppu, ppu_opcode_t op)
if (sum > UINT16_MAX)
{
d._u16[i] = UINT16_MAX;
ppu.sat = true;
ppu.sat._u32[0] = 1;
}
else
{
@ -770,7 +770,7 @@ bool ppu_interpreter_precise::VADDUWS(ppu_thread& ppu, ppu_opcode_t op)
if (sum > UINT32_MAX)
{
d._u32[i] = UINT32_MAX;
ppu.sat = true;
ppu.sat._u32[0] = 1;
}
else
{
@ -988,13 +988,13 @@ bool ppu_interpreter_precise::VCTSXS(ppu_thread& ppu, ppu_opcode_t op)
}
else
{
ppu.sat = true;
ppu.sat._u32[0] = 1;
d._s32[i] = sign ? 0x80000000 : 0x7FFFFFFF;
}
}
else if (exp2 > 30)
{
ppu.sat = true;
ppu.sat._u32[0] = 1;
d._s32[i] = sign ? 0x80000000 : 0x7FFFFFFF;
}
else if (exp2 < 0)
@ -1041,13 +1041,13 @@ bool ppu_interpreter_precise::VCTUXS(ppu_thread& ppu, ppu_opcode_t op)
}
else
{
ppu.sat = true;
ppu.sat._u32[0] = 1;
d._u32[i] = sign ? 0 : 0xFFFFFFFF;
}
}
else if (exp2 > 31)
{
ppu.sat = true;
ppu.sat._u32[0] = 1;
d._u32[i] = sign ? 0 : 0xFFFFFFFF;
}
else if (exp2 < 0)
@ -1056,7 +1056,7 @@ bool ppu_interpreter_precise::VCTUXS(ppu_thread& ppu, ppu_opcode_t op)
}
else if (sign)
{
ppu.sat = true;
ppu.sat._u32[0] = 1;
d._u32[i] = 0;
}
else
@ -1180,12 +1180,12 @@ bool ppu_interpreter_precise::VMHADDSHS(ppu_thread& ppu, ppu_opcode_t op)
if (sum < INT16_MIN)
{
d._s16[i] = INT16_MIN;
ppu.sat = true;
ppu.sat._u32[0] = 1;
}
else if (sum > INT16_MAX)
{
d._s16[i] = INT16_MAX;
ppu.sat = true;
ppu.sat._u32[0] = 1;
}
else
{
@ -1229,12 +1229,12 @@ bool ppu_interpreter_precise::VMHRADDSHS(ppu_thread& ppu, ppu_opcode_t op)
if (sum < INT16_MIN)
{
d._s16[i] = INT16_MIN;
ppu.sat = true;
ppu.sat._u32[0] = 1;
}
else if (sum > INT16_MAX)
{
d._s16[i] = INT16_MAX;
ppu.sat = true;
ppu.sat._u32[0] = 1;
}
else
{
@ -1420,12 +1420,12 @@ bool ppu_interpreter_precise::VMSUMSHS(ppu_thread& ppu, ppu_opcode_t op)
if (result > 0x7fffffff)
{
saturated = 0x7fffffff;
ppu.sat = true;
ppu.sat._u32[0] = 1;
}
else if (result < INT32_MIN)
{
saturated = 0x80000000;
ppu.sat = true;
ppu.sat._u32[0] = 1;
}
else
saturated = static_cast<s32>(result);
@ -1517,7 +1517,7 @@ bool ppu_interpreter_precise::VMSUMUHS(ppu_thread& ppu, ppu_opcode_t op)
if (result > 0xffffffffu)
{
saturated = 0xffffffff;
ppu.sat = true;
ppu.sat._u32[0] = 1;
}
else
saturated = static_cast<u32>(result);
@ -1666,12 +1666,12 @@ bool ppu_interpreter_precise::VPKSHSS(ppu_thread& ppu, ppu_opcode_t op)
if (result < INT8_MIN)
{
d._s8[i + 8] = INT8_MIN;
ppu.sat = true;
ppu.sat._u32[0] = 1;
}
else if (result > INT8_MAX)
{
d._s8[i + 8] = INT8_MAX;
ppu.sat = true;
ppu.sat._u32[0] = 1;
}
else
{
@ -1683,12 +1683,12 @@ bool ppu_interpreter_precise::VPKSHSS(ppu_thread& ppu, ppu_opcode_t op)
if (result < INT8_MIN)
{
d._s8[i] = INT8_MIN;
ppu.sat = true;
ppu.sat._u32[0] = 1;
}
else if (result > INT8_MAX)
{
d._s8[i] = INT8_MAX;
ppu.sat = true;
ppu.sat._u32[0] = 1;
}
else
{
@ -1717,7 +1717,7 @@ bool ppu_interpreter_precise::VPKSHUS(ppu_thread& ppu, ppu_opcode_t op)
const auto all_bits = a | b;
if ((all_bits._u64[0] | all_bits._u64[1]) & mask)
{
ppu.sat = true;
ppu.sat._u32[0] = 1;
}
}
@ -1744,12 +1744,12 @@ bool ppu_interpreter_precise::VPKSWSS(ppu_thread& ppu, ppu_opcode_t op)
if (result < INT16_MIN)
{
d._s16[i + 4] = INT16_MIN;
ppu.sat = true;
ppu.sat._u32[0] = 1;
}
else if (result > INT16_MAX)
{
d._s16[i + 4] = INT16_MAX;
ppu.sat = true;
ppu.sat._u32[0] = 1;
}
else
{
@ -1761,12 +1761,12 @@ bool ppu_interpreter_precise::VPKSWSS(ppu_thread& ppu, ppu_opcode_t op)
if (result < INT16_MIN)
{
d._s16[i] = INT16_MIN;
ppu.sat = true;
ppu.sat._u32[0] = 1;
}
else if (result > INT16_MAX)
{
d._s16[i] = INT16_MAX;
ppu.sat = true;
ppu.sat._u32[0] = 1;
}
else
{
@ -1828,12 +1828,12 @@ bool ppu_interpreter_precise::VPKSWUS(ppu_thread& ppu, ppu_opcode_t op)
if (result > UINT16_MAX)
{
result = UINT16_MAX;
ppu.sat = true;
ppu.sat._u32[0] = 1;
}
else if (result < 0)
{
result = 0;
ppu.sat = true;
ppu.sat._u32[0] = 1;
}
d._u16[h + 4] = result;
@ -1843,12 +1843,12 @@ bool ppu_interpreter_precise::VPKSWUS(ppu_thread& ppu, ppu_opcode_t op)
if (result > UINT16_MAX)
{
result = UINT16_MAX;
ppu.sat = true;
ppu.sat._u32[0] = 1;
}
else if (result < 0)
{
result = 0;
ppu.sat = true;
ppu.sat._u32[0] = 1;
}
d._u16[h] = result;
@ -1909,7 +1909,7 @@ bool ppu_interpreter_precise::VPKUHUS(ppu_thread& ppu, ppu_opcode_t op)
if (result > UINT8_MAX)
{
result = UINT8_MAX;
ppu.sat = true;
ppu.sat._u32[0] = 1;
}
d._u8[b + 8] = static_cast<u8>(result);
@ -1919,7 +1919,7 @@ bool ppu_interpreter_precise::VPKUHUS(ppu_thread& ppu, ppu_opcode_t op)
if (result > UINT8_MAX)
{
result = UINT8_MAX;
ppu.sat = true;
ppu.sat._u32[0] = 1;
}
d._u8[b] = static_cast<u8>(result);
@ -1980,7 +1980,7 @@ bool ppu_interpreter_precise::VPKUWUS(ppu_thread& ppu, ppu_opcode_t op)
if (result > UINT16_MAX)
{
result = UINT16_MAX;
ppu.sat = true;
ppu.sat._u32[0] = 1;
}
d._u16[h + 4] = result;
@ -1990,7 +1990,7 @@ bool ppu_interpreter_precise::VPKUWUS(ppu_thread& ppu, ppu_opcode_t op)
if (result > UINT16_MAX)
{
result = UINT16_MAX;
ppu.sat = true;
ppu.sat._u32[0] = 1;
}
d._u16[h] = result;
@ -2430,12 +2430,12 @@ bool ppu_interpreter_precise::VSUBSBS(ppu_thread& ppu, ppu_opcode_t op)
if (diff < INT8_MIN)
{
d._s8[i] = INT8_MIN;
ppu.sat = true;
ppu.sat._u32[0] = 1;
}
else if (diff > INT8_MAX)
{
d._s8[i] = INT8_MAX;
ppu.sat = true;
ppu.sat._u32[0] = 1;
}
else
{
@ -2465,12 +2465,12 @@ bool ppu_interpreter_precise::VSUBSHS(ppu_thread& ppu, ppu_opcode_t op)
if (diff < INT16_MIN)
{
d._s16[i] = INT16_MIN;
ppu.sat = true;
ppu.sat._u32[0] = 1;
}
else if (diff > INT16_MAX)
{
d._s16[i] = INT16_MAX;
ppu.sat = true;
ppu.sat._u32[0] = 1;
}
else
{
@ -2518,12 +2518,12 @@ bool ppu_interpreter_precise::VSUBSWS(ppu_thread& ppu, ppu_opcode_t op)
if (result < INT32_MIN)
{
d._s32[w] = INT32_MIN;
ppu.sat = true;
ppu.sat._u32[0] = 1;
}
else if (result > INT32_MAX)
{
d._s32[w] = INT32_MAX;
ppu.sat = true;
ppu.sat._u32[0] = 1;
}
else
d._s32[w] = static_cast<s32>(result);
@ -2556,12 +2556,12 @@ bool ppu_interpreter_precise::VSUBUBS(ppu_thread& ppu, ppu_opcode_t op)
if (diff < 0)
{
d._u8[i] = 0;
ppu.sat = true;
ppu.sat._u32[0] = 1;
}
else if (diff > UINT8_MAX)
{
d._u8[i] = UINT8_MAX;
ppu.sat = true;
ppu.sat._u32[0] = 1;
}
else
{
@ -2597,12 +2597,12 @@ bool ppu_interpreter_precise::VSUBUHS(ppu_thread& ppu, ppu_opcode_t op)
if (diff < 0)
{
d._u16[i] = 0;
ppu.sat = true;
ppu.sat._u32[0] = 1;
}
else if (diff > UINT16_MAX)
{
d._u16[i] = UINT16_MAX;
ppu.sat = true;
ppu.sat._u32[0] = 1;
}
else
{
@ -2652,7 +2652,7 @@ bool ppu_interpreter_precise::VSUBUWS(ppu_thread& ppu, ppu_opcode_t op)
if (result < 0)
{
d._u32[w] = 0;
ppu.sat = true;
ppu.sat._u32[0] = 1;
}
else
d._u32[w] = static_cast<u32>(result);
@ -2704,12 +2704,12 @@ bool ppu_interpreter_precise::VSUMSWS(ppu_thread& ppu, ppu_opcode_t op)
if (sum > INT32_MAX)
{
d._s32[0] = INT32_MAX;
ppu.sat = true;
ppu.sat._u32[0] = 1;
}
else if (sum < INT32_MIN)
{
d._s32[0] = INT32_MIN;
ppu.sat = true;
ppu.sat._u32[0] = 1;
}
else
d._s32[0] = static_cast<s32>(sum);
@ -2756,12 +2756,12 @@ bool ppu_interpreter_precise::VSUM2SWS(ppu_thread& ppu, ppu_opcode_t op)
if (sum > INT32_MAX)
{
d._s32[n * 2] = INT32_MAX;
ppu.sat = true;
ppu.sat._u32[0] = 1;
}
else if (sum < INT32_MIN)
{
d._s32[n * 2] = INT32_MIN;
ppu.sat = true;
ppu.sat._u32[0] = 1;
}
else
d._s32[n * 2] = static_cast<s32>(sum);
@ -2820,12 +2820,12 @@ bool ppu_interpreter_precise::VSUM4SBS(ppu_thread& ppu, ppu_opcode_t op)
if (sum > INT32_MAX)
{
d._s32[w] = INT32_MAX;
ppu.sat = true;
ppu.sat._u32[0] = 1;
}
else if (sum < INT32_MIN)
{
d._s32[w] = INT32_MIN;
ppu.sat = true;
ppu.sat._u32[0] = 1;
}
else
d._s32[w] = static_cast<s32>(sum);
@ -2880,12 +2880,12 @@ bool ppu_interpreter_precise::VSUM4SHS(ppu_thread& ppu, ppu_opcode_t op)
if (sum > INT32_MAX)
{
d._s32[w] = INT32_MAX;
ppu.sat = true;
ppu.sat._u32[0] = 1;
}
else if (sum < INT32_MIN)
{
d._s32[w] = INT32_MIN;
ppu.sat = true;
ppu.sat._u32[0] = 1;
}
else
d._s32[w] = static_cast<s32>(sum);
@ -2936,7 +2936,7 @@ bool ppu_interpreter_precise::VSUM4UBS(ppu_thread& ppu, ppu_opcode_t op)
if (sum > UINT32_MAX)
{
d._u32[w] = UINT32_MAX;
ppu.sat = true;
ppu.sat._u32[0] = 1;
}
else
d._u32[w] = static_cast<u32>(sum);

View File

@ -65,8 +65,6 @@
#include "util/v128sse.hpp"
#include "util/sysinfo.hpp"
const bool s_use_ssse3 = utils::has_ssse3();
extern atomic_t<u64> g_watchdog_hold_ctr;
// Should be of the same type

View File

@ -222,27 +222,6 @@ public:
}
xer;
/*
Saturation. A sticky status bit indicating that some field in a saturating instruction saturated since the last
time SAT was cleared. In other words when SAT = '1' it remains set to '1' until it is cleared to '0' by an
mtvscr instruction.
1 A vector saturate instruction implicitly sets this bit when saturation has occurred in the result of one of
the vector instructions having saturate in its name:
Move To VSCR (mtvscr)
Vector Add Integer with Saturation (vaddubs, vadduhs, vadduws, vaddsbs, vaddshs,
vaddsws)
Vector Subtract Integer with Saturation (vsububs, vsubuhs, vsubuws, vsubsbs, vsubshs,
vsubsws)
Vector Multiply-Add Integer with Saturation (vmhaddshs, vmhraddshs)
Vector Multiply-Sum with Saturation (vmsumuhs, vmsumshs, vsumsws)
Vector Sum-Across with Saturation (vsumsws, vsum2sws, vsum4sbs, vsum4shs,
vsum4ubs)
Vector Pack with Saturation (vpkuhus, vpkuwus, vpkshus, vpkswus, vpkshss, vpkswss)
Vector Convert to Fixed-Point with Saturation (vctuxs, vctsxs)
0 Indicates no saturation occurred; mtvscr can explicitly clear this bit.
*/
bool sat{};
/*
Non-Java. A mode control bit that determines whether vector floating-point operations will be performed
in a Java-IEEE-C9X-compliant mode or a possibly faster non-Java/non-IEEE mode.
@ -255,6 +234,9 @@ public:
*/
bool nj = true;
// Sticky saturation bit (VSCR.SAT), kept as a vector accumulator: reads as set when the vector is nonzero
v128 sat{};
// Optimization: precomputed java-mode mask for handling denormals
u32 jm_mask = 0x7f80'0000;

View File

@ -42,8 +42,9 @@ PPUTranslator::PPUTranslator(LLVMContext& context, Module* _module, const ppu_mo
thread_struct.insert(thread_struct.end(), 2, GetType<u32>()); // vrsave, cia
thread_struct.insert(thread_struct.end(), 3, GetType<bool>()); // so, ov, ca
thread_struct.insert(thread_struct.end(), 1, GetType<u8>()); // cnt
thread_struct.insert(thread_struct.end(), 2, GetType<bool>()); // sat, nj
thread_struct.emplace_back(ArrayType::get(GetType<char>(), 2)); // Padding
thread_struct.insert(thread_struct.end(), 1, GetType<bool>()); // nj
thread_struct.emplace_back(ArrayType::get(GetType<char>(), 3)); // Padding
thread_struct.insert(thread_struct.end(), 1, GetType<u32[4]>()); // sat
thread_struct.insert(thread_struct.end(), 1, GetType<u32>()); // jm_mask
m_thread_type = StructType::create(m_context, thread_struct, "context_t");
@ -639,7 +640,7 @@ void PPUTranslator::CompilationError(const std::string& error)
// MFVSCR: builds the VSCR value as a u32 (SAT in bit 0, NJ shifted left by 16) and inserts it into one
// lane of an otherwise all-zero u32[4] that is written to op.vd: lane 3 when m_is_be is set, lane 0 otherwise.
void PPUTranslator::MFVSCR(ppu_opcode_t op)
{
// Form that zero-extends the loaded m_sat value directly
const auto vscr = m_ir->CreateOr(ZExt(RegLoad(m_sat), GetType<u32>()), m_ir->CreateShl(ZExt(RegLoad(m_nj), GetType<u32>()), 16));
// Form that first reduces the loaded m_sat value with IsNotZero (presumably a "any bit set" test; confirm) before zero-extending
const auto vscr = m_ir->CreateOr(ZExt(IsNotZero(RegLoad(m_sat)), GetType<u32>()), m_ir->CreateShl(ZExt(RegLoad(m_nj), GetType<u32>()), 16));
SetVr(op.vd, m_ir->CreateInsertElement(ConstantAggregateZero::get(GetType<u32[4]>()), vscr, m_ir->getInt32(m_is_be ? 3 : 0)));
}
@ -649,7 +650,7 @@ void PPUTranslator::MTVSCR(ppu_opcode_t op)
const auto nj = Trunc(m_ir->CreateLShr(vscr, 16), GetType<bool>());
RegStore(nj, m_nj);
if (g_cfg.core.llvm_ppu_jm_handling) RegStore(m_ir->CreateSelect(nj, m_ir->getInt32(0x7f80'0000), m_ir->getInt32(0x7fff'ffff)), m_jm_mask);
RegStore(Trunc(vscr, GetType<bool>()), m_sat);
RegStore(m_ir->CreateInsertElement(ConstantAggregateZero::get(GetType<u32[4]>()), m_ir->CreateAnd(vscr, 1), m_ir->getInt32(0)), m_sat);
}
void PPUTranslator::VADDCUW(ppu_opcode_t op)
@ -669,7 +670,7 @@ void PPUTranslator::VADDSBS(ppu_opcode_t op)
const auto [a, b] = get_vrs<s8[16]>(op.va, op.vb);
const auto r = add_sat(a, b);
set_vr(op.vd, r);
SetSat(IsNotZero(eval(r != (a + b)).value));
set_sat(r ^ (a + b));
}
void PPUTranslator::VADDSHS(ppu_opcode_t op)
@ -677,7 +678,7 @@ void PPUTranslator::VADDSHS(ppu_opcode_t op)
const auto [a, b] = get_vrs<s16[8]>(op.va, op.vb);
const auto r = add_sat(a, b);
set_vr(op.vd, r);
SetSat(IsNotZero(eval(r != (a + b)).value));
set_sat(r ^ (a + b));
}
void PPUTranslator::VADDSWS(ppu_opcode_t op)
@ -685,7 +686,7 @@ void PPUTranslator::VADDSWS(ppu_opcode_t op)
const auto [a, b] = get_vrs<s32[4]>(op.va, op.vb);
const auto r = add_sat(a, b);
set_vr(op.vd, r);
SetSat(IsNotZero(eval(r != (a + b)).value));
set_sat(r ^ (a + b));
}
void PPUTranslator::VADDUBM(ppu_opcode_t op)
@ -699,7 +700,7 @@ void PPUTranslator::VADDUBS(ppu_opcode_t op)
const auto [a, b] = get_vrs<u8[16]>(op.va, op.vb);
const auto r = add_sat(a, b);
set_vr(op.vd, r);
SetSat(IsNotZero(eval(r != (a + b)).value));
set_sat(r ^ (a + b));
}
void PPUTranslator::VADDUHM(ppu_opcode_t op)
@ -713,7 +714,7 @@ void PPUTranslator::VADDUHS(ppu_opcode_t op)
const auto [a, b] = get_vrs<u16[8]>(op.va, op.vb);
const auto r = add_sat(a, b);
set_vr(op.vd, r);
SetSat(IsNotZero(eval(r != (a + b)).value));
set_sat(r ^ (a + b));
}
void PPUTranslator::VADDUWM(ppu_opcode_t op)
@ -727,7 +728,7 @@ void PPUTranslator::VADDUWS(ppu_opcode_t op)
const auto [a, b] = get_vrs<u32[4]>(op.va, op.vb);
const auto r = add_sat(a, b);
set_vr(op.vd, r);
SetSat(IsNotZero(eval(r != (a + b)).value));
set_sat(r ^ (a + b));
}
void PPUTranslator::VAND(ppu_opcode_t op)
@ -906,7 +907,7 @@ void PPUTranslator::VCTSXS(ppu_opcode_t op)
const auto sat_h = fcmp_ord(scaled >= fsplat<f32[4]>(std::pow(2, 31)));
const auto converted = fpcast<s32[4]>(select(sat_l, const1, scaled));
set_vr(op.vd, select(sat_h, splat<s32[4]>(0x7fff'ffff), converted));
SetSat(IsNotZero(eval(sat_l | sat_h).value));
set_sat(sext<s32[4]>(sat_l) | sext<s32[4]>(sat_h));
}
void PPUTranslator::VCTUXS(ppu_opcode_t op)
@ -919,7 +920,7 @@ void PPUTranslator::VCTUXS(ppu_opcode_t op)
const auto sat_h = fcmp_ord(scaled >= fsplat<f32[4]>(std::pow(2, 32))); // TODO ???
const auto converted = fpcast<u32[4]>(select(sat_l, const0, scaled));
set_vr(op.vd, select(sat_h, splat<u32[4]>(0xffff'ffff), converted));
SetSat(IsNotZero(eval(sat_l | sat_h).value));
set_sat(sext<s32[4]>(sat_l) | sext<s32[4]>(sat_h));
}
void PPUTranslator::VEXPTEFP(ppu_opcode_t op)
@ -1038,7 +1039,7 @@ void PPUTranslator::VMHADDSHS(ppu_opcode_t op)
const auto m = ((sext<s32[8]>(a) * sext<s32[8]>(b)) >> 15) + sext<s32[8]>(c);
const auto r = trunc<u16[8]>(min(max(m, splat<s32[8]>(-0x8000)), splat<s32[8]>(0x7fff)));
set_vr(op.vd, r);
SetSat(IsNotZero(eval((m + 0x8000) >> 16).value));
set_sat(trunc<u16[8]>((m + 0x8000) >> 16));
}
void PPUTranslator::VMHRADDSHS(ppu_opcode_t op)
@ -1048,7 +1049,7 @@ void PPUTranslator::VMHRADDSHS(ppu_opcode_t op)
const auto m = ((sext<s32[8]>(a) * sext<s32[8]>(b) + splat<s32[8]>(0x4000)) >> 15) + sext<s32[8]>(c);
const auto r = trunc<u16[8]>(min(max(m, splat<s32[8]>(-0x8000)), splat<s32[8]>(0x7fff)));
set_vr(op.vd, r);
SetSat(IsNotZero(eval((m + 0x8000) >> 16).value));
set_sat(trunc<u16[8]>((m + 0x8000) >> 16));
}
void PPUTranslator::VMINFP(ppu_opcode_t op)
@ -1164,7 +1165,7 @@ void PPUTranslator::VMSUMSHS(ppu_opcode_t op)
const auto mx = eval(m ^ sext<s32[4]>(m == 0x80000000u));
const auto x = eval(((mx ^ s) & ~(c ^ mx)) >> 31);
set_vr(op.vd, eval((z & x) | (s & ~x)));
SetSat(IsNotZero(x.value));
set_sat(x);
}
void PPUTranslator::VMSUMUBM(ppu_opcode_t op)
@ -1193,7 +1194,7 @@ void PPUTranslator::VMSUMUHS(ppu_opcode_t op)
const auto s2 = eval(s + c);
const auto x = eval((s < ml) | (s2 < s));
set_vr(op.vd, select(x, splat<u32[4]>(-1), s2));
SetSat(IsNotZero(x.value));
set_sat(x);
}
void PPUTranslator::VMULESB(ppu_opcode_t op)
@ -1333,7 +1334,7 @@ void PPUTranslator::VPKSHSS(ppu_opcode_t op)
const auto ab = shuffle2(b, a, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
const auto r = trunc<u8[16]>(min(max(ab, splat<s16[16]>(-0x80)), splat<s16[16]>(0x7f)));
set_vr(op.vd, r);
SetSat(IsNotZero(eval(((a + 0x80) | (b + 0x80)) >> 8).value));
set_sat(((a + 0x80) | (b + 0x80)) >> 8);
}
void PPUTranslator::VPKSHUS(ppu_opcode_t op)
@ -1343,7 +1344,7 @@ void PPUTranslator::VPKSHUS(ppu_opcode_t op)
const auto ab = shuffle2(b, a, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
const auto r = trunc<u8[16]>(min(max(ab, splat<s16[16]>(0)), splat<s16[16]>(0xff)));
set_vr(op.vd, r);
SetSat(IsNotZero(eval((a | b) >> 8).value));
set_sat((a | b) >> 8);
}
void PPUTranslator::VPKSWSS(ppu_opcode_t op)
@ -1353,7 +1354,7 @@ void PPUTranslator::VPKSWSS(ppu_opcode_t op)
const auto ab = shuffle2(b, a, 0, 1, 2, 3, 4, 5, 6, 7);
const auto r = trunc<u16[8]>(min(max(ab, splat<s32[8]>(-0x8000)), splat<s32[8]>(0x7fff)));
set_vr(op.vd, r);
SetSat(IsNotZero(eval(((a + 0x8000) | (b + 0x8000)) >> 16).value));
set_sat(((a + 0x8000) | (b + 0x8000)) >> 16);
}
void PPUTranslator::VPKSWUS(ppu_opcode_t op)
@ -1363,7 +1364,7 @@ void PPUTranslator::VPKSWUS(ppu_opcode_t op)
const auto ab = shuffle2(b, a, 0, 1, 2, 3, 4, 5, 6, 7);
const auto r = trunc<u16[8]>(min(max(ab, splat<s32[8]>(0)), splat<s32[8]>(0xffff)));
set_vr(op.vd, r);
SetSat(IsNotZero(eval((a | b) >> 16).value));
set_sat((a | b) >> 16);
}
void PPUTranslator::VPKUHUM(ppu_opcode_t op)
@ -1380,7 +1381,7 @@ void PPUTranslator::VPKUHUS(ppu_opcode_t op)
const auto tb = bitcast<u8[16]>(min(b, splat<u16[8]>(0xff)));
const auto r = shuffle2(tb, ta, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
set_vr(op.vd, r);
SetSat(IsNotZero(eval((a | b) >> 8).value));
set_sat((a | b) >> 8);
}
void PPUTranslator::VPKUWUM(ppu_opcode_t op)
@ -1397,7 +1398,7 @@ void PPUTranslator::VPKUWUS(ppu_opcode_t op)
const auto tb = bitcast<u16[8]>(min(b, splat<u32[4]>(0xffff)));
const auto r = shuffle2(tb, ta, 0, 2, 4, 6, 8, 10, 12, 14);
set_vr(op.vd, r);
SetSat(IsNotZero(eval((a | b) >> 16).value));
set_sat((a | b) >> 16);
}
void PPUTranslator::VREFP(ppu_opcode_t op)
@ -1670,7 +1671,7 @@ void PPUTranslator::VSUBSBS(ppu_opcode_t op)
const auto [a, b] = get_vrs<s8[16]>(op.va, op.vb);
const auto r = sub_sat(a, b);
set_vr(op.vd, r);
SetSat(IsNotZero(eval(r != (a - b)).value));
set_sat(r ^ (a - b));
}
void PPUTranslator::VSUBSHS(ppu_opcode_t op)
@ -1678,7 +1679,7 @@ void PPUTranslator::VSUBSHS(ppu_opcode_t op)
const auto [a, b] = get_vrs<s16[8]>(op.va, op.vb);
const auto r = sub_sat(a, b);
set_vr(op.vd, r);
SetSat(IsNotZero(eval(r != (a - b)).value));
set_sat(r ^ (a - b));
}
void PPUTranslator::VSUBSWS(ppu_opcode_t op)
@ -1686,7 +1687,7 @@ void PPUTranslator::VSUBSWS(ppu_opcode_t op)
const auto [a, b] = get_vrs<s32[4]>(op.va, op.vb);
const auto r = sub_sat(a, b);
set_vr(op.vd, r);
SetSat(IsNotZero(eval(r != (a - b)).value));
set_sat(r ^ (a - b));
}
void PPUTranslator::VSUBUBM(ppu_opcode_t op)
@ -1700,7 +1701,7 @@ void PPUTranslator::VSUBUBS(ppu_opcode_t op)
const auto [a, b] = get_vrs<u8[16]>(op.va, op.vb);
const auto r = sub_sat(a, b);
set_vr(op.vd, r);
SetSat(IsNotZero(eval(r != (a - b)).value));
set_sat(r ^ (a - b));
}
void PPUTranslator::VSUBUHM(ppu_opcode_t op)
@ -1714,7 +1715,7 @@ void PPUTranslator::VSUBUHS(ppu_opcode_t op)
const auto [a, b] = get_vrs<u16[8]>(op.va, op.vb);
const auto r = sub_sat(a, b);
set_vr(op.vd, r);
SetSat(IsNotZero(eval(r != (a - b)).value));
set_sat(r ^ (a - b));
}
void PPUTranslator::VSUBUWM(ppu_opcode_t op)
@ -1728,7 +1729,7 @@ void PPUTranslator::VSUBUWS(ppu_opcode_t op)
const auto [a, b] = get_vrs<u32[4]>(op.va, op.vb);
const auto r = sub_sat(a, b);
set_vr(op.vd, r);
SetSat(IsNotZero(eval(r != (a - b)).value));
set_sat(r ^ (a - b));
}
void PPUTranslator::VSUMSWS(ppu_opcode_t op)
@ -1740,7 +1741,7 @@ void PPUTranslator::VSUMSWS(ppu_opcode_t op)
const auto s = eval(x + y + z);
const auto r = min(max(zshuffle(s, 0, 2) + zshuffle(s, 1, 2), splat<s64[2]>(-0x8000'0000ll)), splat<s64[2]>(0x7fff'ffff));
set_vr(op.vd, zshuffle(bitcast<u32[4]>(r), 0, 4, 4, 4));
SetSat(IsNotZero(eval((r + 0x8000'0000) >> 32).value));
set_sat((r + 0x8000'0000) >> 32);
}
void PPUTranslator::VSUM2SWS(ppu_opcode_t op)
@ -1751,7 +1752,7 @@ void PPUTranslator::VSUM2SWS(ppu_opcode_t op)
const auto z = b >> 32;
const auto r = min(max(x + y + z, splat<s64[2]>(-0x8000'0000ll)), splat<s64[2]>(0x7fff'ffff));
set_vr(op.vd, zshuffle(bitcast<u32[4]>(r), 0, 4, 2, 4));
SetSat(IsNotZero(eval((r + 0x8000'0000) >> 32).value));
set_sat((r + 0x8000'0000) >> 32);
}
void PPUTranslator::VSUM4SBS(ppu_opcode_t op)
@ -1765,7 +1766,7 @@ void PPUTranslator::VSUM4SBS(ppu_opcode_t op)
const auto s = eval(x + y + z + w); // Can't overflow
const auto r = add_sat(s, b);
set_vr(op.vd, r);
SetSat(IsNotZero(eval(r != (s + b)).value));
set_sat(r ^ (s + b));
}
void PPUTranslator::VSUM4SHS(ppu_opcode_t op)
@ -1777,7 +1778,7 @@ void PPUTranslator::VSUM4SHS(ppu_opcode_t op)
const auto s = eval(x + y); // Can't overflow
const auto r = add_sat(s, b);
set_vr(op.vd, r);
SetSat(IsNotZero(eval(r != (s + b)).value));
set_sat(r ^ (s + b));
}
void PPUTranslator::VSUM4UBS(ppu_opcode_t op)
@ -1791,7 +1792,7 @@ void PPUTranslator::VSUM4UBS(ppu_opcode_t op)
const auto s = eval(x + y + z + w); // Can't overflow
const auto r = add_sat(s, b);
set_vr(op.vd, r);
SetSat(IsNotZero(eval(r != (s + b)).value));
set_sat(r ^ (s + b));
}
#define UNPACK_PIXEL_OP(px) (px & 0xff00001f) | ((px << 6) & 0x1f0000) | ((px << 3) & 0x1f00)
@ -4813,14 +4814,6 @@ void PPUTranslator::SetOverflow(Value* bit)
RegStore(m_ir->CreateOr(RegLoad(m_so), bit), m_so);
}
// ORs `bit` into the stored m_sat value (sticky update).
// Emits nothing unless ppu_attr::has_mfvscr is present in m_attr.
void PPUTranslator::SetSat(Value* bit)
{
	if (!(m_attr & ppu_attr::has_mfvscr))
	{
		return;
	}

	// Load the current value, merge the new bit, and write it back
	const auto merged = m_ir->CreateOr(RegLoad(m_sat), bit);
	RegStore(merged, m_sat);
}
Value* PPUTranslator::CheckTrapCondition(u32 to, Value* left, Value* right)
{
Value* trap_condition = m_ir->getFalse();

View File

@ -79,8 +79,8 @@ class PPUTranslator final : public cpu_translator
DEF_VALUE(m_ov, m_g_ov, 168) // XER.OV bit, overflow flag
DEF_VALUE(m_ca, m_g_ca, 169) // XER.CA bit, carry flag
DEF_VALUE(m_cnt, m_g_cnt, 170) // XER.CNT
DEF_VALUE(m_sat, m_g_sat, 171) // VSCR.SAT bit, sticky saturation flag
DEF_VALUE(m_nj, m_g_nj, 172) // VSCR.NJ bit, non-Java mode
DEF_VALUE(m_nj, m_g_nj, 171) // VSCR.NJ bit, non-Java mode
DEF_VALUE(m_sat, m_g_sat, 173) // VSCR.SAT bit, sticky saturation flag
DEF_VALUE(m_jm_mask, m_g_jm_mask, 174) // Java-Mode helper mask
#undef DEF_VALUE
@ -118,6 +118,17 @@ public:
return result;
}
// Update sticky VSCR.SAT value (|=): evaluates `expr` and ORs the result into m_sat.
// The loaded m_sat value is bitcast to the type of the evaluated expression before the OR.
// Emits nothing unless ppu_attr::has_mfvscr is present in m_attr.
template <typename T>
void set_sat(T&& expr)
{
	if (!(m_attr & ppu_attr::has_mfvscr))
	{
		return;
	}

	// Evaluate the expression first, then load the accumulator (same order as the IR was emitted before)
	const auto new_bits = expr.eval(m_ir);
	const auto old_bits = m_ir->CreateBitCast(RegLoad(m_sat), new_bits->getType());
	RegStore(m_ir->CreateOr(old_bits, new_bits), m_sat);
}
// Get current instruction address
llvm::Value* GetAddr(u64 _add = 0);
@ -265,9 +276,6 @@ public:
// Set XER.OV bit, and update XER.SO bit (|=)
void SetOverflow(llvm::Value*);
// Update sticky VSCR.SAT bit (|=)
void SetSat(llvm::Value*);
// Check condition for trap instructions
llvm::Value* CheckTrapCondition(u32 to, llvm::Value* left, llvm::Value* right);