PPU Interpreter: Accurate vector instruction NaNs

Tested with https://github.com/RPCS3/ps3autotests/tree/master/tests/cpu/ppu_vpu. This commit gets us from 2746 to 353 different lines compared to realhw.
2025-03-15 13:21:14 +00:00 · 2020-05-03 09:32:10 +03:00 · 2020-05-03 09:32:10 +03:00 · 20d8d38e53
commit 20d8d38e53
parent 78ac2a86bb
1 changed files with 62 additions and 14 deletions
--- a/rpcs3/Emu/Cell/PPUInterpreter.cpp
+++ b/rpcs3/Emu/Cell/PPUInterpreter.cpp
@ -8,8 +8,9 @@
 #include "Utilities/sysinfo.h"
 #include "Emu/Cell/Common.h"

-#include <cmath>
 #include <atomic>
+#include <bit>
+#include <cmath>

 #if !defined(_MSC_VER) && defined(__clang__)
 #pragma GCC diagnostic push
@ -358,6 +359,33 @@ public:
 }
 const g_ppu_scale_table;

+constexpr u32 ppu_nan_u32 = 0x7FC00000u;
+const f32 ppu_nan_f32 = std::bit_cast<f32>(ppu_nan_u32);
+
+template<typename... Args>
+v128 vec_handle_nan(v128 result, Args... args)
+{
+	const auto is_nan = v128::fromF(_mm_cmpunord_ps(result.vf, result.vf));
+	const auto nans = v128::from32p(ppu_nan_u32);
+	result = (nans & is_nan) | v128::andnot(is_nan, result);
+
+	return result;
+}
+
+template<typename... Args>
+v128 vec_handle_nan(__m128 result, Args... args)
+{
+	return vec_handle_nan(v128::fromF(result));
+}
+
+template<typename... Args>
+float f32_handle_nan(float result, Args... args)
+{
+	if (std::isnan(result))
+		result = ppu_nan_f32;
+	return result;
+}
+
 bool ppu_interpreter::MFVSCR(ppu_thread& ppu, ppu_opcode_t op)
 {
 	ppu.vr[op.vd] = v128::from32(0, 0, 0, u32{ppu.sat} | (u32{ppu.nj} << 16));
@ -382,7 +410,10 @@ bool ppu_interpreter::VADDCUW(ppu_thread& ppu, ppu_opcode_t op)

 bool ppu_interpreter::VADDFP(ppu_thread& ppu, ppu_opcode_t op)
 {
-	ppu.vr[op.vd] = v128::addfs(ppu.vr[op.va], ppu.vr[op.vb]);
+	const auto a = ppu.vr[op.va];
+	const auto b = ppu.vr[op.vb];
+	const auto result = v128::addfs(a, b);
+	ppu.vr[op.vd] = vec_handle_nan(result, a, b);
 	return true;
 }

@ -910,7 +941,11 @@ bool ppu_interpreter::VLOGEFP(ppu_thread& ppu, ppu_opcode_t op)

 bool ppu_interpreter_fast::VMADDFP(ppu_thread& ppu, ppu_opcode_t op)
 {
-	ppu.vr[op.vd].vf = _mm_add_ps(_mm_mul_ps(ppu.vr[op.va].vf, ppu.vr[op.vc].vf), ppu.vr[op.vb].vf);
+	const auto a = ppu.vr[op.va].vf;
+	const auto b = ppu.vr[op.vc].vf;
+	const auto c = ppu.vr[op.vb].vf;
+	const auto result = _mm_add_ps(_mm_mul_ps(a, b), c);
+	ppu.vr[op.vd] = vec_handle_nan(result, a, b, c);
 	return true;
 }

@ -924,7 +959,7 @@ bool ppu_interpreter_precise::VMADDFP(ppu_thread& ppu, ppu_opcode_t op)
 	// TODO: Optimize
 	for (u32 i = 0; i < 4; i++)
 	{
-		d[i] = f32(f64{a[i]} * f64{c[i]} + f64{b[i]});
+		d[i] = f32_handle_nan(f32(f64{a[i]} * f64{c[i]} + f64{b[i]}));
 	}

 	return true;
@ -932,7 +967,7 @@ bool ppu_interpreter_precise::VMADDFP(ppu_thread& ppu, ppu_opcode_t op)

 bool ppu_interpreter::VMAXFP(ppu_thread& ppu, ppu_opcode_t op)
 {
-	ppu.vr[op.vd].vf = _mm_max_ps(ppu.vr[op.va].vf, ppu.vr[op.vb].vf);
+	ppu.vr[op.vd] = vec_handle_nan(_mm_max_ps(ppu.vr[op.va].vf, ppu.vr[op.vb].vf));
 	return true;
 }

@ -1077,7 +1112,8 @@ bool ppu_interpreter::VMINFP(ppu_thread& ppu, ppu_opcode_t op)
 {
 	const auto a = ppu.vr[op.va].vf;
 	const auto b = ppu.vr[op.vb].vf;
-	ppu.vr[op.vd].vf = _mm_or_ps(_mm_min_ps(a, b),  _mm_min_ps(b, a));
+	const auto result = _mm_or_ps(_mm_min_ps(a, b), _mm_min_ps(b, a));
+	ppu.vr[op.vd] = vec_handle_nan(result, a, b);
 	return true;
 }

@ -1414,7 +1450,10 @@ bool ppu_interpreter::VMULOUH(ppu_thread& ppu, ppu_opcode_t op)

 bool ppu_interpreter::VNMSUBFP(ppu_thread& ppu, ppu_opcode_t op)
 {
-	ppu.vr[op.vd].vf = _mm_xor_ps(_mm_sub_ps(_mm_mul_ps(ppu.vr[op.va].vf, ppu.vr[op.vc].vf), ppu.vr[op.vb].vf), _mm_set1_ps(-0.0f));
+	const auto a = _mm_sub_ps(_mm_mul_ps(ppu.vr[op.va].vf, ppu.vr[op.vc].vf), ppu.vr[op.vb].vf);
+	const auto b = _mm_set1_ps(-0.0f);
+	const auto result = _mm_xor_ps(a, b);
+	ppu.vr[op.vd] = vec_handle_nan(result, a, b);
 	return true;
 }

@ -1813,7 +1852,10 @@ bool ppu_interpreter_precise::VPKUWUS(ppu_thread& ppu, ppu_opcode_t op)

 bool ppu_interpreter::VREFP(ppu_thread& ppu, ppu_opcode_t op)
 {
-	ppu.vr[op.vd].vf = _mm_div_ps(_mm_set_ps(1.0f, 1.0f, 1.0f, 1.0f), ppu.vr[op.vb].vf);
+	const auto a = _mm_set_ps(1.0f, 1.0f, 1.0f, 1.0f);
+	const auto b = ppu.vr[op.vb].vf;
+	const auto result = _mm_div_ps(a, b);
+	ppu.vr[op.vd] = vec_handle_nan(result, a, b);
 	return true;
 }

@ -1824,7 +1866,7 @@ bool ppu_interpreter::VRFIM(ppu_thread& ppu, ppu_opcode_t op)

 	for (uint w = 0; w < 4; w++)
 	{
-		d._f[w] = std::floor(b._f[w]);
+		d._f[w] = f32_handle_nan(std::floor(b._f[w]));
 	}
 	return true;
 }
@ -1836,7 +1878,7 @@ bool ppu_interpreter::VRFIN(ppu_thread& ppu, ppu_opcode_t op)

 	for (uint w = 0; w < 4; w++)
 	{
-		d._f[w] = std::nearbyint(b._f[w]);
+		d._f[w] = f32_handle_nan(std::nearbyint(b._f[w]));
 	}
 	return true;
 }
@ -1848,7 +1890,7 @@ bool ppu_interpreter::VRFIP(ppu_thread& ppu, ppu_opcode_t op)

 	for (uint w = 0; w < 4; w++)
 	{
-		d._f[w] = std::ceil(b._f[w]);
+		d._f[w] = f32_handle_nan(std::ceil(b._f[w]));
 	}
 	return true;
 }
@ -1860,7 +1902,7 @@ bool ppu_interpreter::VRFIZ(ppu_thread& ppu, ppu_opcode_t op)

 	for (uint w = 0; w < 4; w++)
 	{
-		d._f[w] = std::truncf(b._f[w]);
+		d._f[w] = f32_handle_nan(std::truncf(b._f[w]));
 	}
 	return true;
 }
@ -1906,7 +1948,10 @@ bool ppu_interpreter::VRLW(ppu_thread& ppu, ppu_opcode_t op)

 bool ppu_interpreter::VRSQRTEFP(ppu_thread& ppu, ppu_opcode_t op)
 {
-	ppu.vr[op.vd].vf = _mm_div_ps(_mm_set_ps(1.0f, 1.0f, 1.0f, 1.0f), _mm_sqrt_ps(ppu.vr[op.vb].vf));
+	const auto a = _mm_set_ps(1.0f, 1.0f, 1.0f, 1.0f);
+	const auto b = ppu.vr[op.vb].vf;
+	const auto result = _mm_div_ps(a, _mm_sqrt_ps(b));
+	ppu.vr[op.vd] = vec_handle_nan(result, a, b);
 	return true;
 }

@ -2203,7 +2248,10 @@ bool ppu_interpreter::VSUBCUW(ppu_thread& ppu, ppu_opcode_t op)

 bool ppu_interpreter::VSUBFP(ppu_thread& ppu, ppu_opcode_t op)
 {
-	ppu.vr[op.vd] = v128::subfs(ppu.vr[op.va], ppu.vr[op.vb]);
+	const auto a = ppu.vr[op.va];
+	const auto b = ppu.vr[op.vb];
+	const auto result = v128::subfs(a, b);
+	ppu.vr[op.vd] = vec_handle_nan(result, a, b);
 	return true;
 }