Implement cpu_translator::pshufb<>()

Remove spu_translator::pshufb<>()
Improve PSHUFB emulation (pre-SSSE3)
Emit static shufflevector for the constant mask
PPU: Inline VPERM instruction
This commit is contained in:
Nekotekina 2018-06-28 16:21:08 +03:00
parent 41eab62ed7
commit a0bf103e8b
5 changed files with 77 additions and 36 deletions

View File

@ -903,6 +903,9 @@ protected:
// Endianness, affects vector element numbering (TODO)
bool m_is_be;
// Allow PSHUFB intrinsic
bool m_use_ssse3;
// IR builder
llvm::IRBuilder<>* m_ir;
@ -1173,6 +1176,69 @@ public:
return result;
}
// Byte shuffle with PSHUFB semantics. For every lane of the selector `b`:
//   - a selector byte with its top bit set (value >= 128) produces 0;
//   - otherwise its low four bits choose which byte of `a` is copied.
//
// Three code paths are emitted, in order of preference:
//   1. constant selector  -> a static shufflevector (no intrinsic needed);
//   2. m_use_ssse3 set    -> the x86 SSSE3 PSHUFB intrinsic;
//   3. otherwise          -> a small IR loop that moves one byte per iteration.
//
// a: expression providing the 16 source bytes (evaluated with m_ir).
// b: expression providing the 16 selector bytes (evaluated with m_ir).
// Returns the shuffled vector as value_t<u8[16]>.
template <typename T1, typename T2>
value_t<u8[16]> pshufb(T1 a, T2 b)
{
	value_t<u8[16]> result;

	const auto data0 = a.eval(m_ir);
	const auto index = b.eval(m_ir);
	const auto zeros = llvm::ConstantAggregateZero::get(get_type<u8[16]>());

	if (auto c = llvm::dyn_cast<llvm::Constant>(index))
	{
		// Convert PSHUFB index back to LLVM vector shuffle mask
		v128 mask{};

		const auto cv = llvm::dyn_cast<llvm::ConstantDataVector>(c);

		if (cv)
		{
			for (u32 i = 0; i < 16; i++)
			{
				// Shuffle indices 0..15 address data0; index 16 addresses the first
				// lane of the all-zero second operand, which models "top bit set -> 0".
				const u64 sel = cv->getElementAsInteger(i);
				mask._u8[i] = static_cast<u8>(sel < 128 ? sel % 16 : 16);
			}
		}

		// An all-zero selector keeps the zero-initialized mask: every lane reads data0[0].
		if (cv || llvm::isa<llvm::ConstantAggregateZero>(c))
		{
			result.value = llvm::ConstantDataVector::get(m_context, llvm::makeArrayRef((const u8*)mask._bytes, 16));
			// The shuffle mask operand is built as a vector of 32-bit integers
			result.value = m_ir->CreateZExt(result.value, get_type<u32[16]>());
			result.value = m_ir->CreateShuffleVector(data0, zeros, result.value);
			return result;
		}

		// Any other kind of constant falls through to the generic paths below.
	}

	if (m_use_ssse3)
	{
		result.value = m_ir->CreateCall(get_intrinsic(llvm::Intrinsic::x86_ssse3_pshuf_b_128), {data0, index});
	}
	else
	{
		// Emulate PSHUFB (TODO)
		const auto mask = m_ir->CreateAnd(index, 0xf);
		const auto loop = llvm::BasicBlock::Create(m_context, "", m_ir->GetInsertBlock()->getParent());
		const auto next = llvm::BasicBlock::Create(m_context, "", m_ir->GetInsertBlock()->getParent());
		const auto prev = m_ir->GetInsertBlock();

		m_ir->CreateBr(loop);
		m_ir->SetInsertPoint(loop);

		// i: current lane (0..15); v: vector accumulated by the previous iterations
		const auto i = m_ir->CreatePHI(get_type<u32>(), 2);
		const auto v = m_ir->CreatePHI(get_type<u8[16]>(), 2);
		const auto i_next = m_ir->CreateAdd(i, m_ir->getInt32(1));
		i->addIncoming(m_ir->getInt32(0), prev);
		i->addIncoming(i_next, loop);
		v->addIncoming(zeros, prev);

		// result[i] = data0[index[i] & 0xf]
		result.value = m_ir->CreateInsertElement(v, m_ir->CreateExtractElement(data0, m_ir->CreateExtractElement(mask, i)), i);
		v->addIncoming(result.value, loop);

		// Branch on the incremented counter. Testing `i` itself would run one more
		// iteration with i == 16, whose out-of-range extract/insert makes the
		// whole vector undefined.
		m_ir->CreateCondBr(m_ir->CreateICmpULT(i_next, m_ir->getInt32(16)), loop, next);
		m_ir->SetInsertPoint(next);

		// Lanes whose selector is negative as a signed byte (top bit set) are zeroed
		result.value = m_ir->CreateSelect(m_ir->CreateICmpSLT(index, zeros), zeros, result.value);
	}

	return result;
}
template <typename R = v128>
R get_const_vector(llvm::Constant*, u32 a, u32 b);

View File

@ -1302,7 +1302,7 @@ extern void ppu_initialize(const ppu_module& info)
{ "__stdcx", (u64)&ppu_stdcx },
{ "__vexptefp", (u64)&sse_exp2_ps },
{ "__vlogefp", (u64)&sse_log2_ps },
{ "__vperm", s_use_ssse3 ? (u64)&sse_altivec_vperm : (u64)&sse_altivec_vperm_v0 },
{ "__vperm", s_use_ssse3 ? (u64)&sse_altivec_vperm : (u64)&sse_altivec_vperm_v0 }, // Obsolete
{ "__lvsl", (u64)&sse_altivec_lvsl },
{ "__lvsr", (u64)&sse_altivec_lvsr },
{ "__lvlx", s_use_ssse3 ? (u64)&sse_cellbe_lvlx : (u64)&sse_cellbe_lvlx_v0 },
@ -1685,7 +1685,7 @@ static void ppu_initialize2(jit_compiler& jit, const ppu_module& module_part, co
module->setTargetTriple(Triple::normalize(sys::getProcessTriple()));
// Initialize translator
PPUTranslator translator(jit.get_context(), module.get(), module_part);
PPUTranslator translator(jit.get_context(), module.get(), module_part, jit.has_ssse3());
// Define some types
const auto _void = Type::getVoidTy(jit.get_context());

View File

@ -11,13 +11,14 @@ using namespace llvm;
const ppu_decoder<PPUTranslator> s_ppu_decoder;
PPUTranslator::PPUTranslator(LLVMContext& context, Module* module, const ppu_module& info)
PPUTranslator::PPUTranslator(LLVMContext& context, Module* module, const ppu_module& info, bool ssse3)
: cpu_translator(module, false)
, m_info(info)
, m_pure_attr(AttributeList::get(m_context, AttributeList::FunctionIndex, {Attribute::NoUnwind, Attribute::ReadNone}))
{
// Bind context
m_context = context;
m_use_ssse3 = ssse3;
// There is no weak linkage on JIT, so let's create variables with different names for each module part
const u32 gsuffix = m_info.name.empty() ? info.funcs[0].addr : info.funcs[0].addr - m_info.segs[0].addr;
@ -1193,8 +1194,11 @@ void PPUTranslator::VOR(ppu_opcode_t op)
// Translate the VPERM opcode: store into vector register op.vd a byte vector
// assembled from registers op.va and op.vb under control of the bytes in op.vc.
void PPUTranslator::VPERM(ppu_opcode_t op)
{
// NOTE(review): the next two statements route the operation through the external
// "__vperm" helper, while the statements after them compute the same destination
// inline. This listing looks like a diff rendered without +/- markers, so presumably
// only one of the two forms exists in the real file - confirm before relying on it.
const auto abc = GetVrs(VrType::vi8, op.va, op.vb, op.vc);
SetVr(op.vd, Call(GetType<u8[16]>(), m_pure_attr, "__vperm", abc[0], abc[1], abc[2]));
// Inline form: load the two data registers and the control register as 16 x u8.
const auto a = get_vr<u8[16]>(op.va);
const auto b = get_vr<u8[16]>(op.vb);
const auto c = get_vr<u8[16]>(op.vc);
// Shuffle selector: the inverted control byte limited to its low five bits, so the
// top bit is always clear. Presumably the inversion compensates for reversed byte
// numbering of the emulated registers - TODO confirm.
const auto i = eval(~c & 0x1f);
// c << 3 moves bit 4 of each control byte into the sign position. Lanes where the
// comparison holds (bit 4 clear) presumably take pshufb(a, i) and the remaining
// lanes take pshufb(b, i) - verify against the select() helper.
set_vr(op.vd, select(bitcast<s8[16]>(c << 3) >= 0, pshufb(a, i), pshufb(b, i)));
}
void PPUTranslator::VPKPX(ppu_opcode_t op)

View File

@ -313,7 +313,7 @@ public:
// Handle compilation errors
void CompilationError(const std::string& error);
PPUTranslator(llvm::LLVMContext& context, llvm::Module* module, const ppu_module& info);
PPUTranslator(llvm::LLVMContext& context, llvm::Module* module, const ppu_module& info, bool ssse3);
~PPUTranslator();
// Get thread context struct type

View File

@ -1892,36 +1892,6 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator
m_ir->CreateRetVoid();
}
// Byte shuffle with PSHUFB semantics: each selector byte of `b` either picks a byte
// of `a` through its low four bits or, when negative as a signed byte, yields zero.
// Uses the SSSE3 intrinsic when the JIT reports support for it; otherwise emits a
// fully unrolled sequence of sixteen extract/insert pairs.
template <typename T1, typename T2>
value_t<u8[16]> pshufb(T1 a, T2 b)
{
	value_t<u8[16]> out;

	if (!m_spurt->m_jit.has_ssse3())
	{
		const auto src = a.eval(m_ir);
		const auto sel = b.eval(m_ir);

		// Only the low nibble of a selector addresses a source byte
		const auto low4 = m_ir->CreateAnd(sel, 0xf);
		const auto zero = llvm::ConstantInt::get(get_type<u8[16]>(), 0u);

		// Build the result lane by lane, starting from an all-zero vector
		llvm::Value* acc = zero;

		for (u32 lane = 0; lane != 16; ++lane)
		{
			const auto pos = m_ir->CreateExtractElement(low4, lane);
			const auto byte = m_ir->CreateExtractElement(src, pos);
			acc = m_ir->CreateInsertElement(acc, byte, lane);
		}

		// Clear every lane whose selector is below zero when read as signed
		const auto is_neg = m_ir->CreateICmpSLT(sel, zero);
		out.value = m_ir->CreateSelect(is_neg, zero, acc);
		return out;
	}

	// Native path: hand both operands straight to the PSHUFB intrinsic
	out.value = m_ir->CreateCall(get_intrinsic(llvm::Intrinsic::x86_ssse3_pshuf_b_128), {a.eval(m_ir), b.eval(m_ir)});
	return out;
}
public:
spu_llvm_recompiler()
: spu_recompiler_base()
@ -1942,6 +1912,7 @@ public:
m_cache = fxm::get<spu_cache>();
m_spurt = fxm::get_always<spu_llvm_runtime>();
m_context = m_spurt->m_jit.get_context();
m_use_ssse3 = m_spurt->m_jit.has_ssse3();
}
}