Accurate FI

2025-02-11 15:40:51 +00:00 · 2024-01-24 05:59:17 +01:00 · 2024-01-24 05:59:17 +01:00 · a92b8acba7
commit a92b8acba7
parent 85f4c38b4e
2 changed files with 43 additions and 48 deletions
--- a/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp
+++ b/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp
@ -3988,6 +3988,7 @@ void spu_recompiler::FI(spu_opcode_t op)
 	const XmmLink& vb_base = XmmAlloc();
 	const XmmLink& ymul = XmmAlloc();
 	const XmmLink& temp_reg = XmmAlloc();
+	const XmmLink& temp_reg2 = XmmAlloc();

 	c->movdqa(vb_base, vb);
 	c->movdqa(ymul, vb);
@ -4002,16 +4003,29 @@ void spu_recompiler::FI(spu_opcode_t op)

 	c->movdqa(temp_reg, vb_base);
 	c->psubd(temp_reg, ymul);
-	c->psrld(temp_reg, 9);

-	c->pcmpgtd(vb_base, ymul);
-	c->pand(vb_base, XmmConst(v128::from32p(1 << 23)));
-	c->paddd(temp_reg, vb_base);
+	// Makes signed comparison unsigned and determines if we need to adjust exponent
+	auto xor_const = XmmConst(v128::from32p(0x80000000));
+	c->pxor(ymul, xor_const);
+	c->pxor(vb_base, xor_const);
+	c->pcmpgtd(ymul, vb_base);
+	c->movdqa(vb_base, ymul);
+
+	c->movdqa(temp_reg2, temp_reg);
+	c->pand(temp_reg2, vb_base);
+	c->psrld(temp_reg2, 8); // only shift right by 8 if exponent is adjusted
+	c->xorps(vb_base, XmmConst(v128::from32p(0xFFFFFFFF))); // Invert the mask
+	c->pand(temp_reg, vb_base);
+	c->psrld(temp_reg, 9); // shift right by 9 if not adjusted
+	c->por(temp_reg, temp_reg2);

 	c->pand(vb, XmmConst(v128::from32p(0xff800000u)));
 	c->pand(temp_reg, XmmConst(v128::from32p(~0xff800000u)));
 	c->por(vb, temp_reg);

+	c->pand(ymul, XmmConst(v128::from32p(1 << 23)));
+	c->psubd(vb, ymul);
+
 	c->movaps(SPU_OFF_128(gpr, op.rt), vb);
 }

--- a/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp
+++ b/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp
@ -6075,45 +6075,24 @@ public:
 	{
 		register_intrinsic("spu_fi", [&](llvm::CallInst* ci)
 		{
+			// TODO: adjustment for denormals (for accurate xfloat only?)
 			const auto a = bitcast<u32[4]>(value<f32[4]>(ci->getOperand(0)));
 			const auto b = bitcast<u32[4]>(value<f32[4]>(ci->getOperand(1)));

 			const auto base = (b & 0x007ffc00u) << 9; // Base fraction
 			const auto ymul = (b & 0x3ff) * (a & 0x7ffff); // Step fraction * Y fraction (fixed point at 2^-32)
-			const auto bnew = bitcast<s32[4]>((base - ymul) >> 9) + (sext<s32[4]>(ymul <= base) & (1 << 23)); // Subtract and correct invisible fraction bit
-			return bitcast<f32[4]>((b & 0xff800000u) | (bitcast<u32[4]>(fpcast<f32[4]>(bnew)) & ~0xff800000u)); // Inject old sign and exponent
+			const auto comparison = (ymul > base); // True when ymul exceeds base, i.e. the exponent must be adjusted
+			const auto bnew = (base - ymul) >> (zext<u32[4]>(comparison) ^ 9); // Shift by 8 instead of 9 (1 ^ 9 == 8) when the exponent is adjusted
+			const auto base_result = (b & 0xff800000u) | (bnew & ~0xff800000u); // Keep the old sign and exponent bits of b, take the remaining bits from bnew
+			const auto adjustment = bitcast<u32[4]>(sext<s32[4]>(comparison)) & (1 << 23); // Exponent adjustment: 1 << 23 in lanes where comparison is true, 0 elsewhere
+			return bitcast<f32[4]>(base_result - adjustment); // Subtracting 1 << 23 lowers the exponent field by one in the adjusted lanes
 		});

 		const auto [a, b] = get_vrs<f32[4]>(op.ra, op.rb);

-		if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::accurate)
+		if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::relaxed)
 		{
-			const auto r = eval(fi(a, b));
-			set_vr(op.rt, r);
-			return;
-		}
-
-		if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::approximate)
-		{
-			register_intrinsic("spu_re", [&](llvm::CallInst* ci)
-			{
-				const auto a = value<f32[4]>(ci->getOperand(0));
-				// Gives accuracy penalty, frest result is within one newton-raphson iteration for accuracy
-				const auto approx_result = fsplat<f32[4]>(0.999875069f) / a;
-				return approx_result;
-			});
-
-			register_intrinsic("spu_rsqrte", [&](llvm::CallInst* ci)
-			{
-				const auto a = value<f32[4]>(ci->getOperand(0));
-				// Gives accuracy penalty, frsqest result is within one newton-raphson iteration for accuracy
-				const auto approx_result = fsplat<f32[4]>(0.999763668f) / fsqrt(fabs(a));
-				return approx_result;
-			});
-		}
-		else
-		{
-			// For relaxed use intrinsics, those make the results vary per cpu
+			// For relaxed, aggressively optimize and use intrinsics; those make the results vary per cpu
 			register_intrinsic("spu_re", [&](llvm::CallInst* ci)
 			{
 				const auto a = value<f32[4]>(ci->getOperand(0));
@ -6125,25 +6104,27 @@ public:
 				const auto a = value<f32[4]>(ci->getOperand(0));
 				return frsqe(a);
 			});
+
+			if (const auto [ok, mb] = match_expr(b, frest(match<f32[4]>())); ok && mb.eq(a))
+			{
+				erase_stores(b);
+				set_vr(op.rt, spu_re(a));
+				return;
+			}
+
+			if (const auto [ok, mb] = match_expr(b, frsqest(match<f32[4]>())); ok && mb.eq(a))
+			{
+				erase_stores(b);
+				set_vr(op.rt, spu_rsqrte(a));
+				return;
+			}
 		}

-		if (const auto [ok, mb] = match_expr(b, frest(match<f32[4]>())); ok && mb.eq(a))
-		{
-			erase_stores(b);
-			set_vr(op.rt, spu_re(a));
-			return;
-		}
-
-		if (const auto [ok, mb] = match_expr(b, frsqest(match<f32[4]>())); ok && mb.eq(a))
-		{
-			erase_stores(b);
-			set_vr(op.rt, spu_rsqrte(a));
-			return;
-		}
+		// Do not optimize yet for approximate until we have a full accuracy sequence

 		const auto r = eval(fi(a, b));
-		if (!m_interp_magn)
-			spu_log.todo("[%s:0x%05x] Unmatched spu_fi found", m_hash, m_pos);
+		// if (!m_interp_magn)
+		// 	spu_log.todo("[%s:0x%05x] Unmatched spu_fi found", m_hash, m_pos);

 		set_vr(op.rt, r);
 	}