mirror of
https://github.com/RPCS3/rpcs3.git
synced 2025-03-10 16:14:29 +00:00
Implement cpu_translator::pshufb<>()
Remove spu_translator::pshufb<>(). Improve PSHUFB emulation (pre-SSSE3). Emit static shufflevector for the constant mask. PPU: Inline VPERM instruction.
This commit is contained in:
parent
41eab62ed7
commit
a0bf103e8b
@ -903,6 +903,9 @@ protected:
|
||||
// Endianness, affects vector element numbering (TODO)
|
||||
bool m_is_be;
|
||||
|
||||
// Allow PSHUFB intrinsic
|
||||
bool m_use_ssse3;
|
||||
|
||||
// IR builder
|
||||
llvm::IRBuilder<>* m_ir;
|
||||
|
||||
@ -1173,6 +1176,69 @@ public:
|
||||
return result;
|
||||
}
|
||||
|
||||
template <typename T1, typename T2>
|
||||
value_t<u8[16]> pshufb(T1 a, T2 b)
|
||||
{
|
||||
value_t<u8[16]> result;
|
||||
|
||||
const auto data0 = a.eval(m_ir);
|
||||
const auto index = b.eval(m_ir);
|
||||
const auto zeros = llvm::ConstantAggregateZero::get(get_type<u8[16]>());
|
||||
|
||||
if (auto c = llvm::dyn_cast<llvm::Constant>(index))
|
||||
{
|
||||
// Convert PSHUFB index back to LLVM vector shuffle mask
|
||||
v128 mask{};
|
||||
|
||||
const auto cv = llvm::dyn_cast<llvm::ConstantDataVector>(c);
|
||||
|
||||
if (cv)
|
||||
{
|
||||
for (u32 i = 0; i < 16; i++)
|
||||
{
|
||||
const u64 b = cv->getElementAsInteger(i);
|
||||
mask._u8[i] = b < 128 ? b % 16 : 16;
|
||||
}
|
||||
}
|
||||
|
||||
if (cv || llvm::isa<llvm::ConstantAggregateZero>(c))
|
||||
{
|
||||
result.value = llvm::ConstantDataVector::get(m_context, llvm::makeArrayRef((const u8*)mask._bytes, 16));
|
||||
result.value = m_ir->CreateZExt(result.value, get_type<u32[16]>());
|
||||
result.value = m_ir->CreateShuffleVector(data0, zeros, result.value);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
if (m_use_ssse3)
|
||||
{
|
||||
result.value = m_ir->CreateCall(get_intrinsic(llvm::Intrinsic::x86_ssse3_pshuf_b_128), {data0, index});
|
||||
}
|
||||
else
|
||||
{
|
||||
// Emulate PSHUFB (TODO)
|
||||
const auto mask = m_ir->CreateAnd(index, 0xf);
|
||||
const auto loop = llvm::BasicBlock::Create(m_context, "", m_ir->GetInsertBlock()->getParent());
|
||||
const auto next = llvm::BasicBlock::Create(m_context, "", m_ir->GetInsertBlock()->getParent());
|
||||
const auto prev = m_ir->GetInsertBlock();
|
||||
|
||||
m_ir->CreateBr(loop);
|
||||
m_ir->SetInsertPoint(loop);
|
||||
const auto i = m_ir->CreatePHI(get_type<u32>(), 2);
|
||||
const auto v = m_ir->CreatePHI(get_type<u8[16]>(), 2);
|
||||
i->addIncoming(m_ir->getInt32(0), prev);
|
||||
i->addIncoming(m_ir->CreateAdd(i, m_ir->getInt32(1)), loop);
|
||||
v->addIncoming(zeros, prev);
|
||||
result.value = m_ir->CreateInsertElement(v, m_ir->CreateExtractElement(data0, m_ir->CreateExtractElement(mask, i)), i);
|
||||
v->addIncoming(result.value, loop);
|
||||
m_ir->CreateCondBr(m_ir->CreateICmpULT(i, m_ir->getInt32(16)), loop, next);
|
||||
m_ir->SetInsertPoint(next);
|
||||
result.value = m_ir->CreateSelect(m_ir->CreateICmpSLT(index, zeros), zeros, result.value);
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
template <typename R = v128>
|
||||
R get_const_vector(llvm::Constant*, u32 a, u32 b);
|
||||
|
||||
|
@ -1302,7 +1302,7 @@ extern void ppu_initialize(const ppu_module& info)
|
||||
{ "__stdcx", (u64)&ppu_stdcx },
|
||||
{ "__vexptefp", (u64)&sse_exp2_ps },
|
||||
{ "__vlogefp", (u64)&sse_log2_ps },
|
||||
{ "__vperm", s_use_ssse3 ? (u64)&sse_altivec_vperm : (u64)&sse_altivec_vperm_v0 },
|
||||
{ "__vperm", s_use_ssse3 ? (u64)&sse_altivec_vperm : (u64)&sse_altivec_vperm_v0 }, // Obsolete
|
||||
{ "__lvsl", (u64)&sse_altivec_lvsl },
|
||||
{ "__lvsr", (u64)&sse_altivec_lvsr },
|
||||
{ "__lvlx", s_use_ssse3 ? (u64)&sse_cellbe_lvlx : (u64)&sse_cellbe_lvlx_v0 },
|
||||
@ -1685,7 +1685,7 @@ static void ppu_initialize2(jit_compiler& jit, const ppu_module& module_part, co
|
||||
module->setTargetTriple(Triple::normalize(sys::getProcessTriple()));
|
||||
|
||||
// Initialize translator
|
||||
PPUTranslator translator(jit.get_context(), module.get(), module_part);
|
||||
PPUTranslator translator(jit.get_context(), module.get(), module_part, jit.has_ssse3());
|
||||
|
||||
// Define some types
|
||||
const auto _void = Type::getVoidTy(jit.get_context());
|
||||
|
@ -11,13 +11,14 @@ using namespace llvm;
|
||||
|
||||
const ppu_decoder<PPUTranslator> s_ppu_decoder;
|
||||
|
||||
PPUTranslator::PPUTranslator(LLVMContext& context, Module* module, const ppu_module& info)
|
||||
PPUTranslator::PPUTranslator(LLVMContext& context, Module* module, const ppu_module& info, bool ssse3)
|
||||
: cpu_translator(module, false)
|
||||
, m_info(info)
|
||||
, m_pure_attr(AttributeList::get(m_context, AttributeList::FunctionIndex, {Attribute::NoUnwind, Attribute::ReadNone}))
|
||||
{
|
||||
// Bind context
|
||||
m_context = context;
|
||||
m_use_ssse3 = ssse3;
|
||||
|
||||
// There is no weak linkage on JIT, so let's create variables with different names for each module part
|
||||
const u32 gsuffix = m_info.name.empty() ? info.funcs[0].addr : info.funcs[0].addr - m_info.segs[0].addr;
|
||||
@ -1193,8 +1194,11 @@ void PPUTranslator::VOR(ppu_opcode_t op)
|
||||
|
||||
// VPERM: build vd by picking bytes from va / vb under control of vc, emitted inline via pshufb().
void PPUTranslator::VPERM(ppu_opcode_t op)
{
	const auto a = get_vr<u8[16]>(op.va);
	const auto b = get_vr<u8[16]>(op.vb);
	const auto c = get_vr<u8[16]>(op.vc);

	// Invert the low 5 bits of every control byte and drop the rest.
	// NOTE(review): presumably this converts the guest's element numbering to the host's - confirm.
	const auto i = eval(~c & 0x1f);

	// `c << 3` moves bit 4 of each control byte into the sign bit:
	// non-negative (bit 4 clear) takes the byte shuffled from va, otherwise from vb.
	// pshufb() itself only consumes the low 4 bits of `i` for lanes whose bit 7 is clear.
	set_vr(op.vd, select(bitcast<s8[16]>(c << 3) >= 0, pshufb(a, i), pshufb(b, i)));
}
|
||||
|
||||
void PPUTranslator::VPKPX(ppu_opcode_t op)
|
||||
|
@ -313,7 +313,7 @@ public:
|
||||
// Handle compilation errors
|
||||
void CompilationError(const std::string& error);
|
||||
|
||||
PPUTranslator(llvm::LLVMContext& context, llvm::Module* module, const ppu_module& info);
|
||||
PPUTranslator(llvm::LLVMContext& context, llvm::Module* module, const ppu_module& info, bool ssse3);
|
||||
~PPUTranslator();
|
||||
|
||||
// Get thread context struct type
|
||||
|
@ -1892,36 +1892,6 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator
|
||||
m_ir->CreateRetVoid();
|
||||
}
|
||||
|
||||
template <typename T1, typename T2>
|
||||
value_t<u8[16]> pshufb(T1 a, T2 b)
|
||||
{
|
||||
value_t<u8[16]> result;
|
||||
|
||||
if (m_spurt->m_jit.has_ssse3())
|
||||
{
|
||||
result.value = m_ir->CreateCall(get_intrinsic(llvm::Intrinsic::x86_ssse3_pshuf_b_128), {a.eval(m_ir), b.eval(m_ir)});
|
||||
}
|
||||
else
|
||||
{
|
||||
const auto data0 = a.eval(m_ir);
|
||||
const auto index = b.eval(m_ir);
|
||||
const auto mask = m_ir->CreateAnd(index, 0xf);
|
||||
const auto zero = llvm::ConstantInt::get(get_type<u8[16]>(), 0u);
|
||||
|
||||
result.value = zero;
|
||||
|
||||
for (u32 i = 0; i < 16; i++)
|
||||
{
|
||||
const auto x = m_ir->CreateExtractElement(data0, m_ir->CreateExtractElement(mask, i));
|
||||
result.value = m_ir->CreateInsertElement(result.value, x, i);
|
||||
}
|
||||
|
||||
result.value = m_ir->CreateSelect(m_ir->CreateICmpSLT(index, zero), zero, result.value);
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
public:
|
||||
spu_llvm_recompiler()
|
||||
: spu_recompiler_base()
|
||||
@ -1942,6 +1912,7 @@ public:
|
||||
m_cache = fxm::get<spu_cache>();
|
||||
m_spurt = fxm::get_always<spu_llvm_runtime>();
|
||||
m_context = m_spurt->m_jit.get_context();
|
||||
m_use_ssse3 = m_spurt->m_jit.has_ssse3();
|
||||
}
|
||||
}
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user