Review notes (not part of the patch; text ahead of the "diff --git" line is skipped by patch tools):
  * Target: fma32x4 in rpcs3/Emu/Cell/SPURecompiler.cpp.
  * The declaration of `r` is hoisted to the top of the function and the two later
    declarations (one per return path) are removed.
  * New fast path: when llvm::dyn_cast on `c.value` succeeds and the value read back
    through get_const_vector compares equal to `v128{}`, the function returns
    eval(ca * cb) and emits neither the fma nor the fmuladd call.
  * NOTE(review): the two hunks are not contiguous (old lines 7454-7456 are not shown),
    so the definition of `xa` lies outside the visible text.
  * NOTE(review): calls such as `llvm::dyn_cast(c.value)` and `get_type()` look like they
    lost their template arguments during extraction -- confirm against the repository
    before applying; presumably the cast targets an LLVM constant type.
  * NOTE(review): `v128{}` presumably matches only the +0.0 bit pattern. Under IEEE 754
    rules a*b + (+0.0) yields +0.0 when the product is -0.0, whereas a bare multiply
    yields -0.0 -- confirm whether the sign of zero matters for SPU semantics here.

diff --git a/rpcs3/Emu/Cell/SPURecompiler.cpp b/rpcs3/Emu/Cell/SPURecompiler.cpp
index b26834c1c8..6913e43dde 100644
--- a/rpcs3/Emu/Cell/SPURecompiler.cpp
+++ b/rpcs3/Emu/Cell/SPURecompiler.cpp
@@ -7440,14 +7440,27 @@ public:
 
     value_t fma32x4(value_t a, value_t b, value_t c)
     {
+        value_t r;
         const auto ma = eval(sext(fcmp_uno(a != fsplat(0.))));
         const auto mb = eval(sext(fcmp_uno(b != fsplat(0.))));
         const auto ca = eval(bitcast(bitcast(a) & mb));
         const auto cb = eval(bitcast(bitcast(b) & ma));
+
+        // Optimization: Emit only a floating multiply if the addend is zero
+        // This is odd since SPU code could just use the FM instruction, but it seems common enough
+        if (auto cv = llvm::dyn_cast(c.value))
+        {
+            v128 data = get_const_vector(cv, m_pos, 4000);
+
+            if (data == v128{})
+            {
+                r = eval(ca * cb);
+                return r;
+            }
+        }
 
         if (m_use_fma)
         {
-            value_t r;
             r.value = m_ir->CreateCall(get_intrinsic(llvm::Intrinsic::fma), {ca.value, cb.value, c.value});
             return r;
         }
@@ -7457,7 +7470,6 @@ public:
         const auto xb = m_ir->CreateFPExt(cb.value, get_type());
         const auto xc = m_ir->CreateFPExt(c.value, get_type());
         const auto xr = m_ir->CreateCall(get_intrinsic(llvm::Intrinsic::fmuladd), {xa, xb, xc});
-        value_t r;
         r.value = m_ir->CreateFPTrunc(xr, get_type());
         return r;
     }