diff --git a/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp b/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp
index 453e5d15e2..68985632d9 100644
--- a/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp
+++ b/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp
@@ -3988,6 +3988,7 @@ void spu_recompiler::FI(spu_opcode_t op)
 	const XmmLink& vb_base = XmmAlloc();
 	const XmmLink& ymul = XmmAlloc();
 	const XmmLink& temp_reg = XmmAlloc();
+	const XmmLink& temp_reg2 = XmmAlloc();
 
 	c->movdqa(vb_base, vb);
 	c->movdqa(ymul, vb);
@@ -4002,16 +4003,29 @@ void spu_recompiler::FI(spu_opcode_t op)
 
 	c->movdqa(temp_reg, vb_base);
 	c->psubd(temp_reg, ymul);
-	c->psrld(temp_reg, 9);
-	c->pcmpgtd(vb_base, ymul);
-	c->pand(vb_base, XmmConst(v128::from32p(1 << 23)));
-	c->paddd(temp_reg, vb_base);
+	// Makes signed comparison unsigned and determines if we need to adjust exponent
+	auto xor_const = XmmConst(v128::from32p(0x80000000));
+	c->pxor(ymul, xor_const);
+	c->pxor(vb_base, xor_const);
+	c->pcmpgtd(ymul, vb_base);
+	c->movdqa(vb_base, ymul);
+
+	c->movdqa(temp_reg2, temp_reg);
+	c->pand(temp_reg2, vb_base);
+	c->psrld(temp_reg2, 8); // only shift right by 8 if exponent is adjusted
+	c->xorps(vb_base, XmmConst(v128::from32p(0xFFFFFFFF))); // Invert the mask
+	c->pand(temp_reg, vb_base);
+	c->psrld(temp_reg, 9); // shift right by 9 if not adjusted
+	c->por(temp_reg, temp_reg2);
 
 	c->pand(vb, XmmConst(v128::from32p(0xff800000u)));
 	c->pand(temp_reg, XmmConst(v128::from32p(~0xff800000u)));
 	c->por(vb, temp_reg);
 
+	c->pand(ymul, XmmConst(v128::from32p(1 << 23)));
+	c->psubd(vb, ymul);
+
 	c->movaps(SPU_OFF_128(gpr, op.rt), vb);
 }
 
diff --git a/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp b/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp
index a60baa6962..bde5f2ec0f 100644
--- a/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp
+++ b/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp
@@ -6075,45 +6075,24 @@ public:
 	{
 		register_intrinsic("spu_fi", [&](llvm::CallInst* ci)
 		{
+			// TODO: adjustment for denormals (for accurate xfloat only?)
 			const auto a = bitcast<u32[4]>(value<f32[4]>(ci->getOperand(0)));
 			const auto b = bitcast<u32[4]>(value<f32[4]>(ci->getOperand(1)));
 
 			const auto base = (b & 0x007ffc00u) << 9; // Base fraction
 			const auto ymul = (b & 0x3ff) * (a & 0x7ffff); // Step fraction * Y fraction (fixed point at 2^-32)
-			const auto bnew = bitcast<s32[4]>((base - ymul) >> 9) + (sext<s32[4]>(ymul <= base) & (1 << 23)); // Subtract and correct invisible fraction bit
-			return bitcast<f32[4]>((b & 0xff800000u) | (bitcast<u32[4]>(fpcast<f32[4]>(bnew)) & ~0xff800000u)); // Inject old sign and exponent
+			const auto comparison = (ymul > base); // Should exponent be adjusted?
+			const auto bnew = (base - ymul) >> (zext<u32[4]>(comparison) ^ 9); // Shift one less bit if exponent is adjusted
+			const auto base_result = (b & 0xff800000u) | (bnew & ~0xff800000u); // Inject old sign and exponent
+			const auto adjustment = bitcast<u32[4]>(sext<s32[4]>(comparison)) & (1 << 23); // exponent adjustment for negative bnew
+			return bitcast<f32[4]>(base_result - adjustment);
 		});
 
 		const auto [a, b] = get_vrs<f32[4]>(op.ra, op.rb);
 
-		if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::accurate)
+		if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::relaxed)
 		{
-			const auto r = eval(fi(a, b));
-			set_vr(op.rt, r);
-			return;
-		}
-
-		if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::approximate)
-		{
-			register_intrinsic("spu_re", [&](llvm::CallInst* ci)
-			{
-				const auto a = value<f32[4]>(ci->getOperand(0));
-				// Gives accuracy penalty, frest result is within one newton-raphson iteration for accuracy
-				const auto approx_result = fsplat<f32[4]>(0.999875069f) / a;
-				return approx_result;
-			});
-
-			register_intrinsic("spu_rsqrte", [&](llvm::CallInst* ci)
-			{
-				const auto a = value<f32[4]>(ci->getOperand(0));
-				// Gives accuracy penalty, frsqest result is within one newton-raphson iteration for accuracy
-				const auto approx_result = fsplat<f32[4]>(0.999763668f) / fsqrt(fabs(a));
-				return approx_result;
-			});
-		}
-		else
-		{
-			// For relaxed use intrinsics, those make the results vary per cpu
+			// For relaxed, aggressively optimize and use intrinsics, those make the results vary per cpu
 			register_intrinsic("spu_re", [&](llvm::CallInst* ci)
 			{
 				const auto a = value<f32[4]>(ci->getOperand(0));
@@ -6125,25 +6104,27 @@ public:
 				const auto a = value<f32[4]>(ci->getOperand(0));
 				return frsqe(a);
 			});
+
+			if (const auto [ok, mb] = match_expr(b, frest(match<f32[4]>())); ok && mb.eq(a))
+			{
+				erase_stores(b);
+				set_vr(op.rt, spu_re(a));
+				return;
+			}
+
+			if (const auto [ok, mb] = match_expr(b, frsqest(match<f32[4]>())); ok && mb.eq(a))
+			{
+				erase_stores(b);
+				set_vr(op.rt, spu_rsqrte(a));
+				return;
+			}
 		}
 
-		if (const auto [ok, mb] = match_expr(b, frest(match<f32[4]>())); ok && mb.eq(a))
-		{
-			erase_stores(b);
-			set_vr(op.rt, spu_re(a));
-			return;
-		}
-
-		if (const auto [ok, mb] = match_expr(b, frsqest(match<f32[4]>())); ok && mb.eq(a))
-		{
-			erase_stores(b);
-			set_vr(op.rt, spu_rsqrte(a));
-			return;
-		}
+		// Do not optimize yet for approximate until we have a full accuracy sequence
 		const auto r = eval(fi(a, b));
 
-		if (!m_interp_magn)
-			spu_log.todo("[%s:0x%05x] Unmatched spu_fi found", m_hash, m_pos);
+		// if (!m_interp_magn)
+		// 	spu_log.todo("[%s:0x%05x] Unmatched spu_fi found", m_hash, m_pos);
 
 		set_vr(op.rt, r);
 	}
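Reviewer note, not part of the patch: both hunks encode the same refinement of the FI interpolation. As a reading aid, here is a hypothetical scalar model of one 32-bit lane of the spu_fi intrinsic above; the function name and plain C++ types are illustrative only, and the real code works on u32[4]/f32[4] vectors and bitcasts the result back to float.

#include <cstdint>

// Hypothetical per-lane model of the new spu_fi lowering.
// a and b are the raw bits of the two intrinsic operands (op.ra and op.rb).
uint32_t spu_fi_lane(uint32_t a, uint32_t b)
{
	const uint32_t base = (b & 0x007ffc00u) << 9;            // Base fraction
	const uint32_t ymul = (b & 0x3ffu) * (a & 0x7ffffu);     // Step fraction * Y fraction (fixed point at 2^-32)
	const bool adjust = ymul > base;                         // Should the exponent be adjusted?
	const uint32_t bnew = (base - ymul) >> (adjust ? 8 : 9); // Shift one less bit if the exponent is adjusted
	const uint32_t out = (b & 0xff800000u) | (bnew & ~0xff800000u); // Keep old sign and exponent bits
	return out - (adjust ? (1u << 23) : 0u);                 // Drop the exponent by one when adjusted
}

When ymul exceeds base the subtraction borrows, so the fraction is shifted by 8 instead of 9 and one is subtracted from the exponent field; that is what the pcmpgtd mask selection between the two psrld results implements in the x86 hunk, and what the (zext(comparison) ^ 9) shift plus the final adjustment subtraction implement in the LLVM hunk.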