Initial PPU LLVM implementation for aarch64

kd-11 2024-08-04 05:09:06 +03:00 committed by kd-11
parent a5f9256ac6
commit 56cc5d9355
6 changed files with 223 additions and 78 deletions


@ -26,6 +26,7 @@
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/IntrinsicsX86.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/InlineAsm.h"
#ifdef _MSC_VER
#pragma warning(pop)
@ -3898,4 +3899,39 @@ struct fmt_unveil<llvm::TypeSize, void>
}
};
// Inline assembly wrappers.
// TODO: Move these to proper location and replace macros with templates
static inline
llvm::InlineAsm* compile_inline_asm(
llvm::Type* returnType,
llvm::ArrayRef<llvm::Type*> argTypes,
const std::string& code,
const std::string& constraints)
{
const auto callSig = llvm::FunctionType::get(returnType, argTypes, false);
return llvm::InlineAsm::get(callSig, code, constraints, true, false);
}
// Helper for ASM generation with dynamic number of arguments
#define LLVM_ASM(asm_, args, constraints, irb, ctx)\
do {\
std::vector<llvm::Type*> _argTypes;\
_argTypes.reserve(args.size());\
for (const auto& _arg : args) _argTypes.push_back(_arg->getType());\
auto _returnType = llvm::Type::getVoidTy(ctx); \
llvm::FunctionCallee _callee = compile_inline_asm(_returnType, _argTypes, asm_, constraints); \
auto _c = irb->CreateCall(_callee, args); \
_c->addFnAttr(llvm::Attribute::AlwaysInline); \
} while(0)
// Helper for ASM generation with 0 args
#define LLVM_ASM_0(asm_, irb, ctx)\
do {\
const auto _voidTy = llvm::Type::getVoidTy(ctx); \
auto _callee = compile_inline_asm(_voidTy, std::nullopt, asm_, ""); \
auto _c = irb->CreateCall(_callee); \
_c->setTailCall(); \
_c->addFnAttr(llvm::Attribute::AlwaysInline); \
} while(0)
#endif
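
For clarity, a minimal usage sketch of the two helpers above (hypothetical call site: irb, ctx and thread_ptr stand for an existing llvm::IRBuilder<>*, llvm::LLVMContext and a pointer-typed llvm::Value*; they are not names from this commit):

// Hypothetical: emit a parameterless inline-asm statement
LLVM_ASM_0("brk #0", irb, ctx);
// Hypothetical: pass one pointer operand through a memory constraint,
// mirroring how VMEscape below invokes the macro with std::array{ m_thread }
std::array<llvm::Value*, 1> asm_args{ thread_ptr };
LLVM_ASM("ldr x20, $0", asm_args, "m", irb, ctx);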


@ -0,0 +1,40 @@
#pragma once
#include <util/types.hpp>
namespace rpcs3
{
union alignas(16) hypervisor_context_t
{
u64 regs[16];
struct
{
u64 pc;
u64 sp;
u64 x18;
u64 x19;
u64 x20;
u64 x21;
u64 x22;
u64 x23;
u64 x24;
u64 x25;
u64 x26;
u64 x27;
u64 x28;
u64 x29;
u64 x30;
// x0-x17 unused
} aarch64;
struct
{
u64 sp;
// Other regs unused
} x86;
};
}
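
The two views of the union share storage, so C++ code can fill the named aarch64 fields while the JIT-side assembly addresses the same block as a flat u64 array through a single byte offset. A small illustration (sketch only, not part of the commit):

// regs[0] aliases aarch64.pc, regs[1] aliases aarch64.sp, regs[2..14] alias x18..x30
rpcs3::hypervisor_context_t ctx{};
ctx.aarch64.sp = 0x7fff'0000'0000ull;
// ctx.regs[1] now holds the same value; generated code reaches it via
// ::offset32(&ppu_thread::hv_ctx, &rpcs3::hypervisor_context_t::regs) plus 8 bytes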


@ -222,7 +222,7 @@ const auto ppu_gateway = build_function_asm<void(*)(ppu_thread*)>("ppu_gateway",
#endif
// Save native stack pointer for longjmp emulation
c.mov(x86::qword_ptr(args[0], ::offset32(&ppu_thread::saved_native_sp)), x86::rsp);
c.mov(x86::qword_ptr(args[0], ::offset32(&ppu_thread::hv_ctx, &rpcs3::hypervisor_context_t::regs)), x86::rsp);
// Initialize args
c.mov(x86::r13, x86::qword_ptr(reinterpret_cast<u64>(&vm::g_exec_addr)));
@ -291,37 +291,48 @@ const auto ppu_gateway = build_function_asm<void(*)(ppu_thread*)>("ppu_gateway",
// and https://developer.arm.com/documentation/den0024/a/The-ABI-for-ARM-64-bit-Architecture/Register-use-in-the-AArch64-Procedure-Call-Standard/Parameters-in-general-purpose-registers
// for AArch64 calling convention
// Save sp for native longjmp emulation
Label native_sp_offset = c.newLabel();
c.ldr(a64::x10, arm::Mem(native_sp_offset));
// sp not allowed to be used in load/stores directly
c.mov(a64::x15, a64::sp);
c.str(a64::x15, arm::Mem(args[0], a64::x10));
// Push callee saved registers to the stack
// Push callee saved registers to the hv context
// Assume our LLVM compiled code is unsafe and can clobber our stack. GHC on aarch64 treats stack as scratch.
// We also want to store the register context at a fixed place so we can read the hypervisor state from any location.
// We need to save x18-x30 = 13 x 8B each + 8 bytes for 16B alignment = 112B
c.sub(a64::sp, a64::sp, Imm(112));
c.stp(a64::x18, a64::x19, arm::Mem(a64::sp));
c.stp(a64::x20, a64::x21, arm::Mem(a64::sp, 16));
c.stp(a64::x22, a64::x23, arm::Mem(a64::sp, 32));
c.stp(a64::x24, a64::x25, arm::Mem(a64::sp, 48));
c.stp(a64::x26, a64::x27, arm::Mem(a64::sp, 64));
c.stp(a64::x28, a64::x29, arm::Mem(a64::sp, 80));
c.str(a64::x30, arm::Mem(a64::sp, 96));
// Pre-context save
// Layout:
// pc, sp
// x18, x19...x30
// NOTE: Do not touch x19..x30 before saving the registers!
const u64 hv_register_array_offset = ::offset32(&ppu_thread::hv_ctx, &rpcs3::hypervisor_context_t::regs);
Label hv_ctx_pc = c.newLabel(); // Used to hold the far jump return address
// Sanity
ensure(hv_register_array_offset < 4096); // Imm12
c.mov(a64::x15, args[0]);
c.add(a64::x14, a64::x15, Imm(hv_register_array_offset)); // Per-thread context save
c.adr(a64::x15, hv_ctx_pc); // x15 = pc
c.mov(a64::x13, a64::sp); // x13 = sp
c.stp(a64::x15, a64::x13, arm::Mem(a64::x14));
c.stp(a64::x18, a64::x19, arm::Mem(a64::x14, 16));
c.stp(a64::x20, a64::x21, arm::Mem(a64::x14, 32));
c.stp(a64::x22, a64::x23, arm::Mem(a64::x14, 48));
c.stp(a64::x24, a64::x25, arm::Mem(a64::x14, 64));
c.stp(a64::x26, a64::x27, arm::Mem(a64::x14, 80));
c.stp(a64::x28, a64::x29, arm::Mem(a64::x14, 96));
c.str(a64::x30, arm::Mem(a64::x14, 112));
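// For reference (a sketch derived from the store offsets above), the hv context
// register array ends up populated as:
//   +0    regs[0..1]   = pc (adr of hv_ctx_pc), sp
//   +16   regs[2..3]   = x18, x19
//   +32   regs[4..5]   = x20, x21
//   +48   regs[6..7]   = x22, x23
//   +64   regs[8..9]   = x24, x25
//   +80   regs[10..11] = x26, x27
//   +96   regs[12..13] = x28, x29
//   +112  regs[14]     = x30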
// Load REG_Base - use absolute jump target to bypass rel jmp range limits
Label exec_addr = c.newLabel();
c.ldr(a64::x19, arm::Mem(exec_addr));
c.mov(a64::x19, Imm(reinterpret_cast<u64>(&vm::g_exec_addr)));
c.ldr(a64::x19, arm::Mem(a64::x19));
// Load PPUThread struct base -> REG_Sp
const arm::GpX ppu_t_base = a64::x20;
c.mov(ppu_t_base, args[0]);
// Load PC
const arm::GpX pc = a64::x15;
Label cia_offset = c.newLabel();
const arm::GpX cia_addr_reg = a64::x11;
// Load offset value
c.ldr(cia_addr_reg, arm::Mem(cia_offset));
c.mov(cia_addr_reg, Imm(static_cast<u64>(::offset32(&ppu_thread::cia))));
// Load cia
c.ldr(a64::w15, arm::Mem(ppu_t_base, cia_addr_reg));
// Multiply by 2 to index into ptr table
@ -343,44 +354,45 @@ const auto ppu_gateway = build_function_asm<void(*)(ppu_thread*)>("ppu_gateway",
c.lsr(call_target, call_target, Imm(16));
// Load registers
Label base_addr = c.newLabel();
c.ldr(a64::x22, arm::Mem(base_addr));
c.mov(a64::x22, Imm(reinterpret_cast<u64>(&vm::g_base_addr)));
c.ldr(a64::x22, arm::Mem(a64::x22));
Label gpr_addr_offset = c.newLabel();
const arm::GpX gpr_addr_reg = a64::x9;
c.ldr(gpr_addr_reg, arm::Mem(gpr_addr_offset));
c.mov(gpr_addr_reg, Imm(static_cast<u64>(::offset32(&ppu_thread::gpr))));
c.add(gpr_addr_reg, gpr_addr_reg, ppu_t_base);
c.ldr(a64::x23, arm::Mem(gpr_addr_reg));
c.ldr(a64::x24, arm::Mem(gpr_addr_reg, 8));
c.ldr(a64::x25, arm::Mem(gpr_addr_reg, 16));
// GHC frame for the guest. This seems dodgy but the only thing stored on stack is actually registers before making calls to C++ code.
// Injected stack frames also work, but are not free and are completely unnecessary.
c.sub(a64::sp, a64::sp, Imm(4096));
// Execute LLE call
c.blr(call_target);
// Restore registers from the stack
c.ldp(a64::x18, a64::x19, arm::Mem(a64::sp));
c.ldp(a64::x20, a64::x21, arm::Mem(a64::sp, 16));
c.ldp(a64::x22, a64::x23, arm::Mem(a64::sp, 32));
c.ldp(a64::x24, a64::x25, arm::Mem(a64::sp, 48));
c.ldp(a64::x26, a64::x27, arm::Mem(a64::sp, 64));
c.ldp(a64::x28, a64::x29, arm::Mem(a64::sp, 80));
c.ldr(a64::x30, arm::Mem(a64::sp, 96));
// Restore stack ptr
c.add(a64::sp, a64::sp, Imm(112));
// Return
c.ret(a64::x30);
// Return address after far jump. Reset sp and start unwinding...
c.bind(hv_ctx_pc);
c.bind(exec_addr);
c.embedUInt64(reinterpret_cast<u64>(&vm::g_exec_addr));
c.bind(base_addr);
c.embedUInt64(reinterpret_cast<u64>(&vm::g_base_addr));
c.bind(cia_offset);
c.embedUInt64(static_cast<u64>(::offset32(&ppu_thread::cia)));
c.bind(gpr_addr_offset);
c.embedUInt64(static_cast<u64>(::offset32(&ppu_thread::gpr)));
c.bind(native_sp_offset);
c.embedUInt64(static_cast<u64>(::offset32(&ppu_thread::saved_native_sp)));
// Execution guard undo (unneeded since we're going to hard-reset the SP)
//c.add(a64::sp, a64::sp, Imm(4096));
// We either got here through normal "ret" which keeps our x20 intact, or we jumped here and the escape reset our x20 reg
// Either way, x20 contains our thread base and we forcefully reset the stack pointer
c.add(a64::x14, a64::x20, Imm(hv_register_array_offset)); // Per-thread context save
c.ldr(a64::x15, arm::Mem(a64::x14, 8));
c.ldp(a64::x18, a64::x19, arm::Mem(a64::x14, 16));
c.ldp(a64::x20, a64::x21, arm::Mem(a64::x14, 32));
c.ldp(a64::x22, a64::x23, arm::Mem(a64::x14, 48));
c.ldp(a64::x24, a64::x25, arm::Mem(a64::x14, 64));
c.ldp(a64::x26, a64::x27, arm::Mem(a64::x14, 80));
c.ldp(a64::x28, a64::x29, arm::Mem(a64::x14, 96));
c.ldr(a64::x30, arm::Mem(a64::x14, 112));
// Return
c.mov(a64::sp, a64::x15);
c.ret(a64::x30);
#endif
});
@ -390,11 +402,20 @@ const extern auto ppu_escape = build_function_asm<void(*)(ppu_thread*)>("ppu_esc
#if defined(ARCH_X64)
// Restore native stack pointer (longjmp emulation)
c.mov(x86::rsp, x86::qword_ptr(args[0], ::offset32(&ppu_thread::saved_native_sp)));
c.mov(x86::rsp, x86::qword_ptr(args[0], ::offset32(&ppu_thread::hv_ctx, &rpcs3::hypervisor_context_t::regs)));
// Return to the return location
c.sub(x86::rsp, 8);
c.ret();
#else
// We really shouldn't be using this, but an implementation shouldn't hurt
// Far jump return. Only clobbers x30.
const arm::GpX ppu_t_base = a64::x20;
const u64 hv_register_array_offset = ::offset32(&ppu_thread::hv_ctx, &rpcs3::hypervisor_context_t::regs);
c.mov(ppu_t_base, args[0]);
c.mov(a64::x30, Imm(hv_register_array_offset));
c.ldr(a64::x30, arm::Mem(ppu_t_base, a64::x30));
c.ret(a64::x30);
#endif
});
@ -2265,6 +2286,9 @@ void ppu_thread::exec_task()
{
if (g_cfg.core.ppu_decoder != ppu_decoder_type::_static)
{
// HVContext push to allow recursion. This happens with guest callback invocations.
const auto old_hv_ctx = hv_ctx;
while (true)
{
if (state) [[unlikely]]
@ -2276,6 +2300,8 @@ void ppu_thread::exec_task()
ppu_gateway(this);
}
// HVContext pop
hv_ctx = old_hv_ctx;
return;
}
@ -2314,6 +2340,8 @@ ppu_thread::ppu_thread(const ppu_thread_params& param, std::string_view name, u3
{
prio.raw().prio = _prio;
memset(&hv_ctx, 0, sizeof(hv_ctx));
gpr[1] = stack_addr + stack_size - ppu_stack_start_offset;
gpr[13] = param.tls_addr;
@ -3502,7 +3530,7 @@ static bool ppu_store_reservation(ppu_thread& ppu, u32 addr, u64 reg_value)
if (notify)
{
bool notified = false;
if (ppu.res_notify_time == (vm::reservation_acquire(notify) & -128))
{
@ -5277,12 +5305,14 @@ static void ppu_initialize2(jit_compiler& jit, const ppu_module& module_part, co
// Translate
if (const auto func = translator.Translate(module_part.funcs[fi]))
{
#ifdef ARCH_X64 // TODO
// Run optimization passes
#if LLVM_VERSION_MAJOR < 17
pm.run(*func);
#else
fpm.run(*func, fam);
#endif
#endif // ARCH_X64
}
else
{
@ -5297,12 +5327,14 @@ static void ppu_initialize2(jit_compiler& jit, const ppu_module& module_part, co
{
if (const auto func = translator.GetSymbolResolver(whole_module))
{
#ifdef ARCH_X64 // TODO
// Run optimization passes
#if LLVM_VERSION_MAJOR < 17
pm.run(*func);
#else
fpm.run(*func, fam);
#endif
#endif // ARCH_X64
}
else
{


@ -1,6 +1,7 @@
#pragma once
#include "../CPU/CPUThread.h"
#include "../CPU/Hypervisor.h"
#include "../Memory/vm_ptr.h"
#include "Utilities/lockless.h"
#include "Utilities/BitField.h"
@ -163,6 +164,9 @@ public:
using cpu_thread::operator=;
// Hypervisor context data
alignas(16) rpcs3::hypervisor_context_t hv_ctx; // HV context for gateway enter/exit. Keep at a low struct offset.
u64 gpr[32] = {}; // General-Purpose Registers
f64 fpr[32] = {}; // Floating Point Registers
v128 vr[32] = {}; // Vector Registers


@ -208,8 +208,7 @@ Function* PPUTranslator::Translate(const ppu_function& info)
m_ir->CreateAtomicRMW(llvm::AtomicRMWInst::Or, ptr, m_ir->getInt32((+cpu_flag::wait).operator u32()), llvm::MaybeAlign{4}, llvm::AtomicOrdering::AcquireRelease);
// Create tail call to the check function
Call(GetType<void>(), "__check", m_thread, GetAddr())->setTailCall();
m_ir->CreateRetVoid();
VMEscape(Call(GetType<void>(), "__check", m_thread, GetAddr()));
}
else
{
@ -321,7 +320,7 @@ Function* PPUTranslator::GetSymbolResolver(const ppu_module& info)
if (vec_addrs.empty())
{
// Possible special case for no functions (allowing the do-while optimization)
m_ir->CreateRetVoid();
m_ir->CreateRetVoid(); // FIXME: Aarch64. It should work fine as long as there is no callchain beyond this function with a ret path.
replace_intrinsics(*m_function);
return m_function;
}
@ -378,7 +377,7 @@ Function* PPUTranslator::GetSymbolResolver(const ppu_module& info)
// Set insertion point to afterloop_block
m_ir->SetInsertPoint(after_loop);
m_ir->CreateRetVoid();
m_ir->CreateRetVoid(); // FIXME: Aarch64 - Should be ok as long as no ret-based callchain proceeds from here
replace_intrinsics(*m_function);
return m_function;
@ -482,8 +481,8 @@ void PPUTranslator::CallFunction(u64 target, Value* indirect)
if (_target >= u32{umax})
{
Call(GetType<void>(), "__error", m_thread, GetAddr(), m_ir->getInt32(*ensure(m_info.get_ptr<u32>(::narrow<u32>(m_addr + base)))));
m_ir->CreateRetVoid();
auto c = Call(GetType<void>(), "__error", m_thread, GetAddr(), m_ir->getInt32(*ensure(m_info.get_ptr<u32>(::narrow<u32>(m_addr + base)))));
VMEscape(c);
return;
}
else if (_target >= caddr && _target <= cend)
@ -565,7 +564,7 @@ void PPUTranslator::CallFunction(u64 target, Value* indirect)
const auto c = m_ir->CreateCall(callee, {m_exec, m_thread, seg0, m_base, GetGpr(0), GetGpr(1), GetGpr(2)});
c->setTailCallKind(llvm::CallInst::TCK_Tail);
c->setCallingConv(CallingConv::GHC);
m_ir->CreateRetVoid();
VMEscape(c);
}
Value* PPUTranslator::RegInit(Value*& local)
@ -779,8 +778,8 @@ void PPUTranslator::TestAborted()
m_ir->SetInsertPoint(vcheck);
// Create tail call to the check function
Call(GetType<void>(), "__check", m_thread, GetAddr())->setTailCall();
m_ir->CreateRetVoid();
auto c = Call(GetType<void>(), "__check", m_thread, GetAddr());
VMEscape(c);
m_ir->SetInsertPoint(body);
}
@ -2206,16 +2205,14 @@ void PPUTranslator::SC(ppu_opcode_t op)
if (index < 1024)
{
Call(GetType<void>(), fmt::format("%s", ppu_syscall_code(index)), m_thread);
//Call(GetType<void>(), "__escape", m_thread)->setTailCall();
m_ir->CreateRetVoid();
auto c = Call(GetType<void>(), fmt::format("%s", ppu_syscall_code(index)), m_thread);
VMEscape(c, true);
return;
}
}
Call(GetType<void>(), op.lev ? "__lv1call" : "__syscall", m_thread, num);
//Call(GetType<void>(), "__escape", m_thread)->setTailCall();
m_ir->CreateRetVoid();
auto c = Call(GetType<void>(), op.lev ? "__lv1call" : "__syscall", m_thread, num);
VMEscape(c, true);
}
void PPUTranslator::B(ppu_opcode_t op)
@ -2776,9 +2773,9 @@ void PPUTranslator::LWARX(ppu_opcode_t op)
{
RegStore(Trunc(GetAddr()), m_cia);
FlushRegisters();
Call(GetType<void>(), "__resinterp", m_thread);
//Call(GetType<void>(), "__escape", m_thread)->setTailCall();
m_ir->CreateRetVoid();
auto inst = Call(GetType<void>(), "__resinterp", m_thread);
VMEscape(inst, true);
return;
}
@ -2928,9 +2925,9 @@ void PPUTranslator::LDARX(ppu_opcode_t op)
{
RegStore(Trunc(GetAddr()), m_cia);
FlushRegisters();
Call(GetType<void>(), "__resinterp", m_thread);
//Call(GetType<void>(), "__escape", m_thread)->setTailCall();
m_ir->CreateRetVoid();
auto inst = Call(GetType<void>(), "__resinterp", m_thread);
VMEscape(inst, true);
return;
}
@ -4998,9 +4995,8 @@ void PPUTranslator::FCFID(ppu_opcode_t op)
void PPUTranslator::UNK(ppu_opcode_t op)
{
FlushRegisters();
Call(GetType<void>(), "__error", m_thread, GetAddr(), m_ir->getInt32(op.opcode));
//Call(GetType<void>(), "__escape", m_thread)->setTailCall();
m_ir->CreateRetVoid();
auto c = Call(GetType<void>(), "__error", m_thread, GetAddr(), m_ir->getInt32(op.opcode));
VMEscape(c, true);
}
@ -5279,9 +5275,8 @@ Value* PPUTranslator::CheckTrapCondition(u32 to, Value* left, Value* right)
void PPUTranslator::Trap()
{
Call(GetType<void>(), "__trap", m_thread, GetAddr());
//Call(GetType<void>(), "__escape", m_thread)->setTailCall();
m_ir->CreateRetVoid();
auto c = Call(GetType<void>(), "__trap", m_thread, GetAddr());
VMEscape(c);
}
Value* PPUTranslator::CheckBranchCondition(u32 bo, u32 bi)
@ -5328,6 +5323,42 @@ MDNode* PPUTranslator::CheckBranchProbability(u32 bo)
return nullptr;
}
void PPUTranslator::VMEscape([[maybe_unused]] llvm::CallInst* tail_call, [[maybe_unused]] bool skip_flush)
{
//if (!skip_flush)
{
// Flush
FlushRegisters();
}
#ifdef ARCH_X64
// Optionally flag last call as a tail
if (tail_call)
{
tail_call->setTailCall();
}
// This is actually AMD64 specific but good enough for now
m_ir->CreateRetVoid();
#else
// Validation. Make sure we're escaping from a correct context. Only guest JIT should ever go through the "escape" gate.
const auto bb = m_ir->GetInsertPoint();
const auto arg = llvm::dyn_cast<llvm::Argument>(m_thread);
ensure(bb->getParent()->getName().str() == arg->getParent()->getName().str());
const u32 hv_register_array_offset = ::offset32(&ppu_thread::hv_ctx, &rpcs3::hypervisor_context_t::regs);
const std::string asm_ = fmt::format(
"ldr x20, $0;\n"
"ldr x30, [x20, #%u];\n",
hv_register_array_offset);
LLVM_ASM(asm_, std::array{ m_thread }, "m", m_ir, m_function->getContext());
m_ir->CreateRetVoid();
#endif
}
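// Taken together with ppu_gateway, the AArch64 escape path above effectively emits
// (illustrative pseudo-listing; operand placement is whatever LLVM picks for the "m" constraint):
//   ldr x20, [<slot holding the ppu_thread*>]   ; x20 = thread base
//   ldr x30, [x20, #hv_register_array_offset]   ; x30 = saved pc, i.e. hv_ctx_pc
//   ret                                         ; far jump back into ppu_gateway
// At hv_ctx_pc the gateway reloads sp and x18-x30 from hv_ctx and returns to the
// original caller with a normal ret.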
void PPUTranslator::build_interpreter()
{
#define BUILD_VEC_INST(i) { \
@ -5343,8 +5374,7 @@ void PPUTranslator::build_interpreter()
op.vb = 2; \
op.vc = 3; \
this->i(op); \
FlushRegisters(); \
m_ir->CreateRetVoid(); \
VMEscape(); \
replace_intrinsics(*m_function); \
}


@ -150,6 +150,9 @@ public:
// Emit function call
void CallFunction(u64 target, llvm::Value* indirect = nullptr);
// Emit escape sequence back to hypervisor
void VMEscape(llvm::CallInst* tail_call = nullptr, bool skip_flush = false);
// Emit state check mid-block
void TestAborted();