diff --git a/rpcs3/Emu/CPU/CPUTranslator.h b/rpcs3/Emu/CPU/CPUTranslator.h
index ea1ee61be5..904d52b530 100644
--- a/rpcs3/Emu/CPU/CPUTranslator.h
+++ b/rpcs3/Emu/CPU/CPUTranslator.h
@@ -3442,6 +3442,70 @@ public:
 		return result;
 	}
 
+	// Emulate the behavior of VPERM2B by using a 256 bit wide VPERMB: a and b are the two source operands, c is the per-byte index vector; the permuted 128 bit result is returned as a value_t
+	template
+	value_t vperm2b256to128(T1 a, T2 b, T3 c)
+	{
+		value_t result;
+
+		const auto data0 = a.eval(m_ir);
+		const auto data1 = b.eval(m_ir);
+		const auto index = c.eval(m_ir);
+
+		// Constant-index path: emit a plain LLVM shufflevector instead of the intrinsic (may be slower than the non constant path?)
+		if (auto c = llvm::dyn_cast(index)) // NOTE(review): this local c shadows parameter c
+		{
+			// Convert the VPERM2B index back to an LLVM vector shuffle mask: only the low 5 bits (0..31) of each of the 16 index elements are kept
+			v128 mask{};
+
+			const auto cv = llvm::dyn_cast(c);
+
+			if (cv)
+			{
+				for (u32 i = 0; i < 16; i++)
+				{
+					const u64 b = cv->getElementAsInteger(i); // NOTE(review): this local b shadows parameter b
+					mask._u8[i] = b & 0x1f;
+				}
+			}
+
+			if (cv || llvm::isa(c)) // when cv is null the mask is left zero-initialized
+			{
+				result.value = llvm::ConstantDataVector::get(m_context, llvm::makeArrayRef(reinterpret_cast(&mask), 16));
+				result.value = m_ir->CreateZExt(result.value, get_type());
+				result.value = m_ir->CreateShuffleVector(data0, data1, result.value);
+				return result;
+			}
+		}
+
+		const auto zeroes = llvm::ConstantAggregateZero::get(get_type());
+		const auto zeroes32 = llvm::ConstantAggregateZero::get(get_type());
+
+		value_t intermediate;
+		value_t shuffle;
+		value_t shuffleindex;
+
+		u8 mask32[32] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
+		u8 mask16[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+
+		// Insert the second source operand into the same vector as the first source operand, expanding to 256 bit width (identity mask 0..31 over data0, data1)
+		shuffle.value = llvm::ConstantDataVector::get(m_context, llvm::makeArrayRef(reinterpret_cast(&mask32), 32));
+		shuffle.value = m_ir->CreateZExt(shuffle.value, get_type());
+		intermediate.value = m_ir->CreateShuffleVector(data0, data1, shuffle.value);
+
+		// Expand the shuffle index to 256 bits with zeroes: the same 0..31 mask over (index, zeroes) appends the zero elements after the index elements (assumes index has 16 elements - confirm)
+		shuffleindex.value = m_ir->CreateShuffleVector(index, zeroes, shuffle.value);
+
+		// Permute the combined 256 bit vector with the widened index through the x86_avx512_permvar_qi_256 intrinsic
+		intermediate.value = m_ir->CreateCall(get_intrinsic(llvm::Intrinsic::x86_avx512_permvar_qi_256), {intermediate.value, shuffleindex.value});
+
+		// Convert the 256 bit vector back to 128 bits by selecting elements 0..15; zeroes32 is only the (never selected) second shuffle operand
+		result.value = llvm::ConstantDataVector::get(m_context, llvm::makeArrayRef(reinterpret_cast(&mask16), 16));
+		result.value = m_ir->CreateZExt(result.value, get_type());
+		result.value = m_ir->CreateShuffleVector(intermediate.value, zeroes32, result.value);
+		return result;
+	}
+
 	llvm::Value* load_const(llvm::GlobalVariable* g, llvm::Value* i)
 	{
 		return m_ir->CreateLoad(m_ir->CreateGEP(g, {m_ir->getInt64(0), m_ir->CreateZExtOrTrunc(i, get_type())}));
diff --git a/rpcs3/Emu/Cell/PPUTranslator.cpp b/rpcs3/Emu/Cell/PPUTranslator.cpp
index 954149ca9b..d55f16291c 100644
--- a/rpcs3/Emu/Cell/PPUTranslator.cpp
+++ b/rpcs3/Emu/Cell/PPUTranslator.cpp
@@ -1374,7 +1374,7 @@ void PPUTranslator::VPERM(ppu_opcode_t op)
 	if (m_use_avx512_icl && op.ra != op.rb)
 	{
 		const auto i = eval(~c);
-		set_vr(op.vd, vperm2b(b, a, i));
+		set_vr(op.vd, vperm2b256to128(b, a, i));
 		return;
 	}
 
diff --git a/rpcs3/Emu/Cell/SPURecompiler.cpp b/rpcs3/Emu/Cell/SPURecompiler.cpp
index 5b56d01e11..9a76d32971 100644
--- a/rpcs3/Emu/Cell/SPURecompiler.cpp
+++ b/rpcs3/Emu/Cell/SPURecompiler.cpp
@@ -7645,13 +7645,13 @@ public:
 			{
 				if (perm_only)
 				{
-					set_vr(op.rt4, vperm2b(as, bs, c));
+					set_vr(op.rt4, vperm2b256to128(as, bs, c));
 					return;
 				}
 
 				const auto m = gf2p8affineqb(c, build(0x40, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x40, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20), 0x7f);
 				const auto mm = select(noncast(m) >= 0, splat(0), m);
-				const auto ab = vperm2b(as, bs, c);
+				const auto ab = vperm2b256to128(as, bs, c);
 				set_vr(op.rt4, select(noncast(c) >= 0, ab, mm));
 				return;
 			}
@@ -7707,14 +7707,14 @@ public:
 			{
 				if (perm_only)
 				{
-					set_vr(op.rt4, vperm2b(b, a, eval(~c)));
+					set_vr(op.rt4, vperm2b256to128(b, a, eval(~c)));
 					return;
 				}
 
 				const auto m = gf2p8affineqb(c, build(0x40, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x40, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20), 0x7f);
 				const auto mm = select(noncast(m) >= 0, splat(0), m);
 				const auto cr = eval(~c);
-				const auto ab = vperm2b(b, a, cr);
+				const auto ab = vperm2b256to128(b, a, cr);
 				set_vr(op.rt4, select(noncast(cr) >= 0, mm, ab));
 				return;
 			}