PPU Interpreter: Accurate vector instruction NaNs

Tested with the ppu_vpu test from ps3autotests: https://github.com/RPCS3/ps3autotests/tree/master/tests/cpu/ppu_vpu
This commit reduces the number of output lines that differ from real hardware from 2746 to 353.
This commit is contained in:
Nick Renieris 2020-05-03 09:32:10 +03:00 committed by Ani
parent 78ac2a86bb
commit 20d8d38e53

View File

@ -8,8 +8,9 @@
#include "Utilities/sysinfo.h"
#include "Emu/Cell/Common.h"
#include <cmath>
#include <atomic>
#include <bit>
#include <cmath>
#if !defined(_MSC_VER) && defined(__clang__)
#pragma GCC diagnostic push
@ -358,6 +359,33 @@ public:
}
const g_ppu_scale_table;
// Bit pattern substituted for NaN results by vec_handle_nan / f32_handle_nan.
// 0x7FC00000 = sign bit clear, exponent all ones, top mantissa bit set
// (a quiet NaN encoding in IEEE-754 binary32).
// NOTE(review): presumably this is the NaN value the real PPU vector unit produces - confirm.
constexpr u32 ppu_nan_u32 = 0x7FC00000u;
// The same bit pattern reinterpreted as an f32, for scalar (per-lane) code paths.
const f32 ppu_nan_f32 = std::bit_cast<f32>(ppu_nan_u32);
// Replaces every NaN lane of a 4 x f32 vector with ppu_nan_u32.
// A lane is treated as NaN when it compares unordered with itself (_mm_cmpunord_ps).
// The extra arguments (callers pass the instruction operands) are accepted but not used here.
template<typename... Args>
v128 vec_handle_nan(v128 result, Args... args)
{
	// Per-lane mask, set for the lanes of result that are NaN
	const auto nan_mask = v128::fromF(_mm_cmpunord_ps(result.vf, result.vf));
	const auto canonical = v128::from32p(ppu_nan_u32);

	// Take the canonical NaN where the mask is set; elsewhere keep result
	// (presumably v128::andnot(m, x) computes ~m & x - confirm against v128)
	return (canonical & nan_mask) | v128::andnot(nan_mask, result);
}
// Convenience overload taking a raw __m128: wraps it with v128::fromF and
// hands it to the v128 overload. The extra arguments are not forwarded.
template<typename... Args>
v128 vec_handle_nan(__m128 result, Args... args)
{
	const auto wrapped = v128::fromF(result);
	return vec_handle_nan(wrapped);
}
// Scalar NaN fix-up: yields ppu_nan_f32 when result is NaN (std::isnan),
// otherwise returns result unchanged. The extra arguments are accepted but not used.
template<typename... Args>
float f32_handle_nan(float result, Args... args)
{
	return std::isnan(result) ? ppu_nan_f32 : result;
}
bool ppu_interpreter::MFVSCR(ppu_thread& ppu, ppu_opcode_t op)
{
ppu.vr[op.vd] = v128::from32(0, 0, 0, u32{ppu.sat} | (u32{ppu.nj} << 16));
@ -382,7 +410,10 @@ bool ppu_interpreter::VADDCUW(ppu_thread& ppu, ppu_opcode_t op)
// VADDFP: writes v128::addfs(va, vb), passed through vec_handle_nan, to register vd.
// Always returns true.
bool ppu_interpreter::VADDFP(ppu_thread& ppu, ppu_opcode_t op)
{
// NOTE(review): this statement stores to vd before a and b are read below;
// it looks like the pre-change line kept by the diff view - confirm against the real file.
ppu.vr[op.vd] = v128::addfs(ppu.vr[op.va], ppu.vr[op.vb]);
const auto a = ppu.vr[op.va];
const auto b = ppu.vr[op.vb];
const auto result = v128::addfs(a, b);
// The operands are handed to vec_handle_nan as its extra arguments
ppu.vr[op.vd] = vec_handle_nan(result, a, b);
return true;
}
@ -910,7 +941,11 @@ bool ppu_interpreter::VLOGEFP(ppu_thread& ppu, ppu_opcode_t op)
// VMADDFP (fast variant): computes va * vc + vb per lane with separate SSE multiply and add,
// passes the result through vec_handle_nan and writes it to register vd.
// Always returns true.
bool ppu_interpreter_fast::VMADDFP(ppu_thread& ppu, ppu_opcode_t op)
{
// NOTE(review): this statement stores to vd before the operands are read below;
// it looks like the pre-change line kept by the diff view - confirm against the real file.
ppu.vr[op.vd].vf = _mm_add_ps(_mm_mul_ps(ppu.vr[op.va].vf, ppu.vr[op.vc].vf), ppu.vr[op.vb].vf);
const auto a = ppu.vr[op.va].vf;
// Note the naming: local b holds register vc (the multiplier), local c holds register vb (the addend)
const auto b = ppu.vr[op.vc].vf;
const auto c = ppu.vr[op.vb].vf;
const auto result = _mm_add_ps(_mm_mul_ps(a, b), c);
ppu.vr[op.vd] = vec_handle_nan(result, a, b, c);
return true;
}
@ -924,7 +959,7 @@ bool ppu_interpreter_precise::VMADDFP(ppu_thread& ppu, ppu_opcode_t op)
// TODO: Optimize
for (u32 i = 0; i < 4; i++)
{
d[i] = f32(f64{a[i]} * f64{c[i]} + f64{b[i]});
d[i] = f32_handle_nan(f32(f64{a[i]} * f64{c[i]} + f64{b[i]}));
}
return true;
@ -932,7 +967,7 @@ bool ppu_interpreter_precise::VMADDFP(ppu_thread& ppu, ppu_opcode_t op)
// VMAXFP: writes _mm_max_ps(va, vb), passed through vec_handle_nan, to register vd.
// Always returns true.
bool ppu_interpreter::VMAXFP(ppu_thread& ppu, ppu_opcode_t op)
{
// NOTE(review): this statement and the next both store to vd; this one looks like
// the pre-change line kept by the diff view - confirm against the real file.
ppu.vr[op.vd].vf = _mm_max_ps(ppu.vr[op.va].vf, ppu.vr[op.vb].vf);
// No operands are passed as extra arguments here; only the max result is given to vec_handle_nan
ppu.vr[op.vd] = vec_handle_nan(_mm_max_ps(ppu.vr[op.va].vf, ppu.vr[op.vb].vf));
return true;
}
@ -1077,7 +1112,8 @@ bool ppu_interpreter::VMINFP(ppu_thread& ppu, ppu_opcode_t op)
{
const auto a = ppu.vr[op.va].vf;
const auto b = ppu.vr[op.vb].vf;
ppu.vr[op.vd].vf = _mm_or_ps(_mm_min_ps(a, b), _mm_min_ps(b, a));
const auto result = _mm_or_ps(_mm_min_ps(a, b), _mm_min_ps(b, a));
ppu.vr[op.vd] = vec_handle_nan(result, a, b);
return true;
}
@ -1414,7 +1450,10 @@ bool ppu_interpreter::VMULOUH(ppu_thread& ppu, ppu_opcode_t op)
// VNMSUBFP: computes (va * vc) - vb per lane, XORs it with -0.0f,
// passes the result through vec_handle_nan and writes it to register vd.
// Always returns true.
bool ppu_interpreter::VNMSUBFP(ppu_thread& ppu, ppu_opcode_t op)
{
// NOTE(review): this statement stores to vd before the operands are read below;
// it looks like the pre-change line kept by the diff view - confirm against the real file.
ppu.vr[op.vd].vf = _mm_xor_ps(_mm_sub_ps(_mm_mul_ps(ppu.vr[op.va].vf, ppu.vr[op.vc].vf), ppu.vr[op.vb].vf), _mm_set1_ps(-0.0f));
// Here a is the intermediate (va * vc) - vb, not an input register
const auto a = _mm_sub_ps(_mm_mul_ps(ppu.vr[op.va].vf, ppu.vr[op.vc].vf), ppu.vr[op.vb].vf);
// -0.0f has only the sign bit set, so the XOR below flips the sign of every lane
const auto b = _mm_set1_ps(-0.0f);
const auto result = _mm_xor_ps(a, b);
ppu.vr[op.vd] = vec_handle_nan(result, a, b);
return true;
}
@ -1813,7 +1852,10 @@ bool ppu_interpreter_precise::VPKUWUS(ppu_thread& ppu, ppu_opcode_t op)
// VREFP: computes 1.0f / vb per lane with a full-precision SSE divide,
// passes the result through vec_handle_nan and writes it to register vd.
// Always returns true.
bool ppu_interpreter::VREFP(ppu_thread& ppu, ppu_opcode_t op)
{
// NOTE(review): this statement stores to vd before vb is read below;
// it looks like the pre-change line kept by the diff view - confirm against the real file.
ppu.vr[op.vd].vf = _mm_div_ps(_mm_set_ps(1.0f, 1.0f, 1.0f, 1.0f), ppu.vr[op.vb].vf);
// Numerator: 1.0f in all four lanes
const auto a = _mm_set_ps(1.0f, 1.0f, 1.0f, 1.0f);
const auto b = ppu.vr[op.vb].vf;
const auto result = _mm_div_ps(a, b);
ppu.vr[op.vd] = vec_handle_nan(result, a, b);
return true;
}
@ -1824,7 +1866,7 @@ bool ppu_interpreter::VRFIM(ppu_thread& ppu, ppu_opcode_t op)
for (uint w = 0; w < 4; w++)
{
d._f[w] = std::floor(b._f[w]);
d._f[w] = f32_handle_nan(std::floor(b._f[w]));
}
return true;
}
@ -1836,7 +1878,7 @@ bool ppu_interpreter::VRFIN(ppu_thread& ppu, ppu_opcode_t op)
for (uint w = 0; w < 4; w++)
{
d._f[w] = std::nearbyint(b._f[w]);
d._f[w] = f32_handle_nan(std::nearbyint(b._f[w]));
}
return true;
}
@ -1848,7 +1890,7 @@ bool ppu_interpreter::VRFIP(ppu_thread& ppu, ppu_opcode_t op)
for (uint w = 0; w < 4; w++)
{
d._f[w] = std::ceil(b._f[w]);
d._f[w] = f32_handle_nan(std::ceil(b._f[w]));
}
return true;
}
@ -1860,7 +1902,7 @@ bool ppu_interpreter::VRFIZ(ppu_thread& ppu, ppu_opcode_t op)
for (uint w = 0; w < 4; w++)
{
d._f[w] = std::truncf(b._f[w]);
d._f[w] = f32_handle_nan(std::truncf(b._f[w]));
}
return true;
}
@ -1906,7 +1948,10 @@ bool ppu_interpreter::VRLW(ppu_thread& ppu, ppu_opcode_t op)
// VRSQRTEFP: computes 1.0f / sqrt(vb) per lane using SSE sqrt and divide,
// passes the result through vec_handle_nan and writes it to register vd.
// Always returns true.
bool ppu_interpreter::VRSQRTEFP(ppu_thread& ppu, ppu_opcode_t op)
{
// NOTE(review): this statement stores to vd before vb is read below;
// it looks like the pre-change line kept by the diff view - confirm against the real file.
ppu.vr[op.vd].vf = _mm_div_ps(_mm_set_ps(1.0f, 1.0f, 1.0f, 1.0f), _mm_sqrt_ps(ppu.vr[op.vb].vf));
// Numerator: 1.0f in all four lanes
const auto a = _mm_set_ps(1.0f, 1.0f, 1.0f, 1.0f);
const auto b = ppu.vr[op.vb].vf;
const auto result = _mm_div_ps(a, _mm_sqrt_ps(b));
ppu.vr[op.vd] = vec_handle_nan(result, a, b);
return true;
}
@ -2203,7 +2248,10 @@ bool ppu_interpreter::VSUBCUW(ppu_thread& ppu, ppu_opcode_t op)
// VSUBFP: writes v128::subfs(va, vb), passed through vec_handle_nan, to register vd.
// Always returns true.
bool ppu_interpreter::VSUBFP(ppu_thread& ppu, ppu_opcode_t op)
{
// NOTE(review): this statement stores to vd before a and b are read below;
// it looks like the pre-change line kept by the diff view - confirm against the real file.
ppu.vr[op.vd] = v128::subfs(ppu.vr[op.va], ppu.vr[op.vb]);
const auto a = ppu.vr[op.va];
const auto b = ppu.vr[op.vb];
const auto result = v128::subfs(a, b);
// The operands are handed to vec_handle_nan as its extra arguments
ppu.vr[op.vd] = vec_handle_nan(result, a, b);
return true;
}