From 065165f7498053749cfe80d29bb7b78e5b7cf308 Mon Sep 17 00:00:00 2001
From: Sintendo <3380580+Sintendo@users.noreply.github.com>
Date: Sat, 7 Dec 2024 10:53:53 +0100
Subject: [PATCH] Jit_FloatingPoint: Prefer BLENDVPD over VBLENDVPD

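Prefer BLENDVPD over VBLENDVPD if the latter doesn't save any
instructions, i.e. when the destination register is also the first
source (d == c) and the blend can be done in place.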

VBLENDVPD allows separate source and destination registers, which can
eliminate a MOVAPD/MOVSD. However, on Intel since Skylake, VBLENDVPD
takes additional uops to execute compared to BLENDVPD (according to
https://uops.info). On AMD and older Intel microarchitectures there is no
difference.
---
 .../Core/PowerPC/Jit64/Jit_FloatingPoint.cpp     | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
index 4723c1c2cf..75cfbed3d6 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
@@ -644,6 +644,20 @@ void Jit64::fselx(UGeckoInstruction inst)
 
   if (cpu_info.bAVX)
   {
+    // Prefer BLENDVPD over VBLENDVPD if the latter doesn't save any
+    // instructions.
+    //
+    // VBLENDVPD allows separate source and destination registers, which can
+    // eliminate a MOVAPD/MOVSD. However, on Intel since Skylake, VBLENDVPD
+    // takes additional uops to execute compared to BLENDVPD (according to
+    // https://uops.info). On AMD and older Intel microarchitectures there is no
+    // difference.
+    if (d == c)
+    {
+      BLENDVPD(Rd, Rb);
+      return;
+    }
+
     X64Reg src1 = XMM1;
     if (Rc.IsSimpleReg())
     {
@@ -654,7 +668,7 @@ void Jit64::fselx(UGeckoInstruction inst)
       MOVAPD(XMM1, Rc);
     }
 
-    if (d == c || packed)
+    if (packed)
     {
       VBLENDVPD(Rd, src1, Rb, XMM0);
       return;