// Review notes for this patch (free text before the first "diff --git" header).
//
// Files touched:
//   rpcs3/Emu/CPU/CPUTranslator.h
//   rpcs3/Emu/Cell/SPURecompiler.cpp
//
// CPUTranslator.h
//   * Adds vpermb(a, b). Both operands are evaluated with m_ir. When the index
//     evaluates to a constant for which the inner dyn_cast succeeds, a 16-byte
//     mask is filled with (element & 0xf) for each of the 16 elements; when only
//     the isa check matches, the mask stays value-initialised (v128 mask{}).
//     The mask is then zero-extended and used as the mask of a shufflevector of
//     data0 against `zeros`. In every other case the function emits a call to
//     llvm::Intrinsic::x86_avx512_permvar_qi_128 with {data0, index}.
//   * vperm2b constant path: the removed line passed `cv` directly to CreateZExt
//     inside a branch that is also entered when `cv` is null (the isa check alone
//     is true). The added lines build a mask of (element & 0x1f) first and
//     zero-extend that instead.
//
// SPURecompiler.cpp
//   * Two hunks, located in the functions that immediately precede ROTQMBYBI and
//     ROTQMBY (their own names are outside the hunks). In both, the `& 0xf` is
//     moved off the shuffle-index computation and onto the pshufb call site, and
//     when m_use_avx512_icl is set the unmasked index is passed to vpermb instead.
//
// NOTE(review): template argument lists seem to have been stripped from this copy
//   of the patch (bare `template`, `llvm::dyn_cast(index)`, `llvm::isa(c)`,
//   `get_type()`, `reinterpret_cast(&mask)`, `value_t`), and leading indentation
//   was lost; the indentation below is reconstructed. Do not expect this text to
//   apply or compile as-is -- re-export the patch from the repository.
// NOTE(review): in both mask loops the local `const u64 b` shadows the function
//   parameter `b`; harmless here because the parameter is not used afterwards,
//   but worth renaming.
// NOTE(review): the non-constant vpermb path passes the index without masking;
//   presumably the instruction only consumes the low 4 bits of each index byte --
//   confirm against the VPERMB instruction reference.
diff --git a/rpcs3/Emu/CPU/CPUTranslator.h b/rpcs3/Emu/CPU/CPUTranslator.h
index c04292a070..102b30307a 100644
--- a/rpcs3/Emu/CPU/CPUTranslator.h
+++ b/rpcs3/Emu/CPU/CPUTranslator.h
@@ -2801,6 +2801,44 @@ public:
         return result;
     }
 
+    template
+    value_t vpermb(T1 a, T2 b)
+    {
+        value_t result;
+
+        const auto data0 = a.eval(m_ir);
+        const auto index = b.eval(m_ir);
+        const auto zeros = llvm::ConstantAggregateZero::get(get_type());
+
+        if (auto c = llvm::dyn_cast(index))
+        {
+            // Convert VPERMB index back to LLVM vector shuffle mask
+            v128 mask{};
+
+            const auto cv = llvm::dyn_cast(c);
+
+            if (cv)
+            {
+                for (u32 i = 0; i < 16; i++)
+                {
+                    const u64 b = cv->getElementAsInteger(i);
+                    mask._u8[i] = b & 0xf;
+                }
+            }
+
+            if (cv || llvm::isa(c))
+            {
+                result.value = llvm::ConstantDataVector::get(m_context, llvm::makeArrayRef(reinterpret_cast(&mask), 16));
+                result.value = m_ir->CreateZExt(result.value, get_type());
+                result.value = m_ir->CreateShuffleVector(data0, zeros, result.value);
+                return result;
+            }
+        }
+
+        result.value = m_ir->CreateCall(get_intrinsic(llvm::Intrinsic::x86_avx512_permvar_qi_128), {data0, index});
+        return result;
+    }
+
     template
     value_t vperm2b(T1 a, T2 b, T3 c)
     {
@@ -2813,11 +2851,23 @@ public:
         if (auto c = llvm::dyn_cast(index))
         {
             // Convert VPERM2B index back to LLVM vector shuffle mask
+            v128 mask{};
+
             const auto cv = llvm::dyn_cast(c);
 
+            if (cv)
+            {
+                for (u32 i = 0; i < 16; i++)
+                {
+                    const u64 b = cv->getElementAsInteger(i);
+                    mask._u8[i] = b & 0x1f;
+                }
+            }
+
             if (cv || llvm::isa(c))
             {
-                result.value = m_ir->CreateZExt(cv, get_type());
+                result.value = llvm::ConstantDataVector::get(m_context, llvm::makeArrayRef(reinterpret_cast(&mask), 16));
+                result.value = m_ir->CreateZExt(result.value, get_type());
                 result.value = m_ir->CreateShuffleVector(data0, data1, result.value);
                 return result;
             }
diff --git a/rpcs3/Emu/Cell/SPURecompiler.cpp b/rpcs3/Emu/Cell/SPURecompiler.cpp
index 29bf6c716d..6bcc5e1e91 100644
--- a/rpcs3/Emu/Cell/SPURecompiler.cpp
+++ b/rpcs3/Emu/Cell/SPURecompiler.cpp
@@ -6521,14 +6521,27 @@ public:
         {
             const auto as = byteswap(a);
             const auto sc = build(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-            const auto sh = (sc + (splat_scalar(get_vr(op.rb)) >> 3)) & 0xf;
-            set_vr(op.rt, pshufb(as, sh));
+            const auto sh = sc + (splat_scalar(get_vr(op.rb)) >> 3);
+
+            if (m_use_avx512_icl)
+            {
+                set_vr(op.rt, vpermb(as, sh));
+                return;
+            }
+
+            set_vr(op.rt, pshufb(as, (sh & 0xf)));
+            return;
+        }
+        const auto sc = build(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+        const auto sh = sc - (splat_scalar(get_vr(op.rb)) >> 3);
+
+        if (m_use_avx512_icl)
+        {
+            set_vr(op.rt, vpermb(a, sh));
             return;
         }
 
-        const auto sc = build(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
-        const auto sh = (sc - (splat_scalar(get_vr(op.rb)) >> 3)) & 0xf;
-        set_vr(op.rt, pshufb(a, sh));
+        set_vr(op.rt, pshufb(a, (sh & 0xf)));
     }
 
     void ROTQMBYBI(spu_opcode_t op)
@@ -6651,14 +6664,28 @@ public:
         {
             const auto as = byteswap(a);
             const auto sc = build(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-            const auto sh = eval((sc + splat_scalar(b)) & 0xf);
-            set_vr(op.rt, pshufb(as, sh));
+            const auto sh = eval(sc + splat_scalar(b));
+
+            if (m_use_avx512_icl)
+            {
+                set_vr(op.rt, vpermb(as, sh));
+                return;
+            }
+
+            set_vr(op.rt, pshufb(as, (sh & 0xf)));
             return;
         }
 
         const auto sc = build(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
-        const auto sh = eval((sc - splat_scalar(b)) & 0xf);
-        set_vr(op.rt, pshufb(a, sh));
+        const auto sh = eval(sc - splat_scalar(b));
+
+        if (m_use_avx512_icl)
+        {
+            set_vr(op.rt, vpermb(a, sh));
+            return;
+        }
+
+        set_vr(op.rt, pshufb(a, (sh & 0xf)));
     }
 
     void ROTQMBY(spu_opcode_t op)