SPU: Optimize FMA ops with 0 addend

This commit is contained in:
Malcolm Jestadt 2020-05-01 05:05:24 -04:00 committed by Ivan
parent a568c958af
commit c1bd154bcd

View File

@ -7440,14 +7440,27 @@ public:
value_t<f32[4]> fma32x4(value_t<f32[4]> a, value_t<f32[4]> b, value_t<f32[4]> c)
{
value_t<f32[4]> r;
const auto ma = eval(sext<s32[4]>(fcmp_uno(a != fsplat<f32[4]>(0.))));
const auto mb = eval(sext<s32[4]>(fcmp_uno(b != fsplat<f32[4]>(0.))));
const auto ca = eval(bitcast<f32[4]>(bitcast<s32[4]>(a) & mb));
const auto cb = eval(bitcast<f32[4]>(bitcast<s32[4]>(b) & ma));
// Optimization: Emit only a floating multiply if the addend is zero
// This is odd since SPU code could just use the FM instruction, but it seems common enough
if (auto cv = llvm::dyn_cast<llvm::Constant>(c.value))
{
v128 data = get_const_vector(cv, m_pos, 4000);
if (data == v128{})
{
r = eval(ca * cb);
return r;
}
}
if (m_use_fma)
{
value_t<f32[4]> r;
r.value = m_ir->CreateCall(get_intrinsic<f32[4]>(llvm::Intrinsic::fma), {ca.value, cb.value, c.value});
return r;
}
@ -7457,7 +7470,6 @@ public:
const auto xb = m_ir->CreateFPExt(cb.value, get_type<f64[4]>());
const auto xc = m_ir->CreateFPExt(c.value, get_type<f64[4]>());
const auto xr = m_ir->CreateCall(get_intrinsic<f64[4]>(llvm::Intrinsic::fmuladd), {xa, xb, xc});
value_t<f32[4]> r;
r.value = m_ir->CreateFPTrunc(xr, get_type<f32[4]>());
return r;
}