From 1685769bd98222cec07ad0e8c885d41a5cf05b58 Mon Sep 17 00:00:00 2001 From: Nekotekina Date: Wed, 1 Sep 2021 20:43:57 +0300 Subject: [PATCH] LLVM DSL: reimplement fmuladd, force hw fma if present --- rpcs3/Emu/CPU/CPUTranslator.h | 83 +++++++++++++++++++++++++++----- rpcs3/Emu/Cell/SPURecompiler.cpp | 24 ++------- 2 files changed, 75 insertions(+), 32 deletions(-) diff --git a/rpcs3/Emu/CPU/CPUTranslator.h b/rpcs3/Emu/CPU/CPUTranslator.h index 406804dcf8..01949a52f6 100644 --- a/rpcs3/Emu/CPU/CPUTranslator.h +++ b/rpcs3/Emu/CPU/CPUTranslator.h @@ -2679,6 +2679,63 @@ struct llvm_fabs } }; +template > +struct llvm_fmuladd +{ + using type = T; + + llvm_expr_t a1; + llvm_expr_t a2; + llvm_expr_t a3; + bool strict_fma; + + static_assert(llvm_value_t::is_float, "llvm_fmuladd<>: invalid type"); + + static constexpr bool is_ok = llvm_value_t::is_float; + + llvm::Value* eval(llvm::IRBuilder<>* ir) const + { + llvm::Value* v1 = a1.eval(ir); + llvm::Value* v2 = a2.eval(ir); + llvm::Value* v3 = a3.eval(ir); + + if (llvm::isa(v1) && llvm::isa(v2) && llvm::isa(v3)) + { + return llvm::ConstantFoldInstruction(ir->CreateIntrinsic(llvm::Intrinsic::fma, {v1->getType()}, {v1, v2, v3}), llvm::DataLayout("")); + } + + return ir->CreateIntrinsic(strict_fma ? llvm::Intrinsic::fma : llvm::Intrinsic::fmuladd, {v1->getType()}, {v1, v2, v3}); + } + + llvm_match_tuple match(llvm::Value*& value) const + { + llvm::Value* v1 = {}; + llvm::Value* v2 = {}; + llvm::Value* v3 = {}; + + if (auto i = llvm::dyn_cast_or_null(value); i && i->getIntrinsicID() == (strict_fma ? llvm::Intrinsic::fma : llvm::Intrinsic::fmuladd)) + { + v1 = i->getOperand(0); + v2 = i->getOperand(1); + v3 = i->getOperand(2); + + if (auto r1 = a1.match(v1); v1) + { + if (auto r2 = a2.match(v2); v2) + { + if (auto r3 = a3.match(v3); v3) + { + return std::tuple_cat(r1, r2, r3); + } + } + } + } + + value = nullptr; + return {}; + } +}; + class cpu_translator { protected: @@ -2990,6 +3047,20 @@ public: return llvm_fabs{std::forward(a)}; } + // Optionally opportunistic hardware FMA, can be used if results are identical for all possible input values + template ::is_ok>> + static auto fmuladd(T&& a, U&& b, V&& c, bool strict_fma) + { + return llvm_fmuladd{std::forward(a), std::forward(b), std::forward(c), strict_fma}; + } + + // Opportunistic hardware FMA, can be used if results are identical for all possible input values + template ::is_ok>> + auto fmuladd(T&& a, U&& b, V&& c) + { + return llvm_fmuladd{std::forward(a), std::forward(b), std::forward(c), m_use_fma}; + } + template llvm::Function* get_intrinsic(llvm::Intrinsic::ID id) { @@ -2997,18 +3068,6 @@ public: return llvm::Intrinsic::getDeclaration(_module, id, {get_type()...}); } - // Opportunistic hardware FMA, can be used if results are identical for all possible input values - template - auto fmuladd(T a, T b, T c) - { - value_t result; - const auto av = a.eval(m_ir); - const auto bv = b.eval(m_ir); - const auto cv = c.eval(m_ir); - result.value = m_ir->CreateCall(get_intrinsic(llvm::Intrinsic::fmuladd), {av, bv, cv}); - return result; - } - // TODO: Support doubles template ::esize == 32u && llvm_value_t::is_float>> auto fre(T a) diff --git a/rpcs3/Emu/Cell/SPURecompiler.cpp b/rpcs3/Emu/Cell/SPURecompiler.cpp index 63425b79cb..0faee4c347 100644 --- a/rpcs3/Emu/Cell/SPURecompiler.cpp +++ b/rpcs3/Emu/Cell/SPURecompiler.cpp @@ -7751,11 +7751,7 @@ public: const auto [a, b, c] = get_vrs(op.ra, op.rb, op.rt); if (g_cfg.core.llvm_accurate_dfma) - { - value_t r; - r.value = m_ir->CreateCall(get_intrinsic(llvm::Intrinsic::fma), {a.value, b.value, c.value}); - set_vr(op.rt, r); - } + set_vr(op.rt, fmuladd(a, b, c, true)); else set_vr(op.rt, a * b + c); } @@ -7765,11 +7761,7 @@ public: const auto [a, b, c] = get_vrs(op.ra, op.rb, op.rt); if (g_cfg.core.llvm_accurate_dfma) - { - value_t r; - r.value = m_ir->CreateCall(get_intrinsic(llvm::Intrinsic::fma), {a.value, b.value, eval(-c).value}); - set_vr(op.rt, r); - } + set_vr(op.rt, fmuladd(a, b, -c, true)); else set_vr(op.rt, a * b - c); } @@ -7779,11 +7771,7 @@ public: const auto [a, b, c] = get_vrs(op.ra, op.rb, op.rt); if (g_cfg.core.llvm_accurate_dfma) - { - value_t r; - r.value = m_ir->CreateCall(get_intrinsic(llvm::Intrinsic::fma), {eval(-a).value, b.value, c.value}); - set_vr(op.rt, r); - } + set_vr(op.rt, fmuladd(-a, b, c, true)); else set_vr(op.rt, c - (a * b)); } @@ -7793,11 +7781,7 @@ public: const auto [a, b, c] = get_vrs(op.ra, op.rb, op.rt); if (g_cfg.core.llvm_accurate_dfma) - { - value_t r; - r.value = m_ir->CreateCall(get_intrinsic(llvm::Intrinsic::fma), {a.value, b.value, c.value}); - set_vr(op.rt, -r); - } + set_vr(op.rt, -fmuladd(a, b, c, true)); else set_vr(op.rt, -(a * b + c)); }