From 065165f7498053749cfe80d29bb7b78e5b7cf308 Mon Sep 17 00:00:00 2001
From: Sintendo <3380580+Sintendo@users.noreply.github.com>
Date: Sat, 7 Dec 2024 10:53:53 +0100
Subject: [PATCH] Jit_FloatingPoint: Prefer BLENDVPD over VBLENDVPD

Prefer BLENDVPD over VBLENDVPD if the latter doesn't save any instructions.

VBLENDVPD allows separate source and destination registers, which can
eliminate a MOVAPD/MOVSD. However, on Intel since Skylake, VBLENDVPD
takes additional uops to execute compared to BLENDVPD (according to
https://uops.info). On AMD and older Intel microarchitectures there is
no difference.
---
 .../Core/PowerPC/Jit64/Jit_FloatingPoint.cpp     | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
index 4723c1c2cf..75cfbed3d6 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
@@ -644,6 +644,20 @@ void Jit64::fselx(UGeckoInstruction inst)
 
   if (cpu_info.bAVX)
   {
+    // Prefer BLENDVPD over VBLENDVPD if the latter doesn't save any
+    // instructions.
+    //
+    // VBLENDVPD allows separate source and destination registers, which can
+    // eliminate a MOVAPD/MOVSD. However, on Intel since Skylake, VBLENDVPD
+    // takes additional uops to execute compared to BLENDVPD (according to
+    // https://uops.info). On AMD and older Intel microarchitectures there is no
+    // difference.
+    if (d == c)
+    {
+      BLENDVPD(Rd, Rb);
+      return;
+    }
+
     X64Reg src1 = XMM1;
     if (Rc.IsSimpleReg())
     {
@@ -654,7 +668,7 @@ void Jit64::fselx(UGeckoInstruction inst)
       MOVAPD(XMM1, Rc);
     }
 
-    if (d == c || packed)
+    if (packed)
     {
       VBLENDVPD(Rd, src1, Rb, XMM0);
       return;