PPU LLVM: Use VPERM2B to emulate VPERM (#8704)

- The VPERM2B instructions match VPERM's behavior, except that they operate in reverse byte order
This commit is contained in:
Whatcookie 2020-08-08 20:50:26 -04:00 committed by GitHub
parent 0c85d4c0d0
commit 4ce2ad54a8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 47 additions and 0 deletions

View File

@ -57,6 +57,15 @@ void cpu_translator::initialize(llvm::LLVMContext& context, llvm::ExecutionEngin
{
m_use_fma = true;
}
// Test AVX-512_icelake features (TODO)
if (cpu == "icelake" ||
cpu == "icelake-client" ||
cpu == "icelake-server" ||
cpu == "tigerlake")
{
m_use_avx512_icl = true;
}
}
llvm::Value* cpu_translator::bitcast(llvm::Value* val, llvm::Type* type)

View File

@ -2423,6 +2423,9 @@ protected:
// Allow FMA
bool m_use_fma = false;
// Allow Icelake tier AVX-512
bool m_use_avx512_icl = false;
// IR builder
llvm::IRBuilder<>* m_ir;
@ -2782,6 +2785,33 @@ public:
return result;
}
// Two-source byte permute: every result byte is picked from the 32-byte table formed by
// `a` (table entries 0..15) followed by `b` (table entries 16..31), using the low 5 bits
// of the matching byte of `c`; the upper 3 index bits are ignored.
// A constant index is lowered to a plain LLVM shufflevector, anything else is lowered to
// the x86 vpermi2var.qi.128 intrinsic.
template <typename T1, typename T2, typename T3>
value_t<u8[16]> vperm2b(T1 a, T2 b, T3 c)
{
	value_t<u8[16]> result;

	const auto data0 = a.eval(m_ir);
	const auto data1 = b.eval(m_ir);
	const auto index = c.eval(m_ir);

	if (const auto const_index = llvm::dyn_cast<llvm::Constant>(index))
	{
		// Convert VPERM2B index back to LLVM vector shuffle mask
		const auto cv = llvm::dyn_cast<llvm::ConstantDataVector>(const_index);

		if (cv || llvm::isa<llvm::ConstantAggregateZero>(const_index))
		{
			// Stays all-zero for ConstantAggregateZero, where cv is null and must not be used
			u32 mask[16]{};

			if (cv)
			{
				for (u32 i = 0; i < 16; i++)
				{
					// shufflevector only accepts indices in [0, 32), so drop the bits the instruction ignores
					mask[i] = static_cast<u32>(cv->getElementAsInteger(i) & 0x1f);
				}
			}

			// Build the <16 x i32> mask directly instead of zero-extending a byte vector
			result.value = llvm::ConstantDataVector::get(index->getContext(), llvm::ArrayRef<u32>(mask, 16));
			result.value = m_ir->CreateShuffleVector(data0, data1, result.value);
			return result;
		}
	}

	// Non-constant (or non-foldable) index: let the hardware instruction do the lookup
	result.value = m_ir->CreateCall(get_intrinsic(llvm::Intrinsic::x86_avx512_vpermi2var_qi_128), {data0, index, data1});
	return result;
}
template <typename T1, typename T2>
value_t<u8[16]> pshufb(T1 a, T2 b)
{

View File

@ -1295,6 +1295,14 @@ void PPUTranslator::VPERM(ppu_opcode_t op)
const auto a = get_vr<u8[16]>(op.va);
const auto b = get_vr<u8[16]>(op.vb);
const auto c = get_vr<u8[16]>(op.vc);
if (m_use_avx512_icl && op.ra != op.rb)
{
const auto i = eval(~c);
set_vr(op.vd, vperm2b(b, a, i));
return;
}
const auto i = eval(~c & 0x1f);
set_vr(op.vd, select(noncast<s8[16]>(c << 3) >= 0, pshufb(a, i), pshufb(b, i)));
}