Complete PPU support

This commit is contained in:
kd-11 2024-08-05 06:17:33 +03:00 committed by kd-11
parent 34549445a8
commit cba658baba
5 changed files with 174 additions and 72 deletions

View File

@ -2,6 +2,21 @@
#include "AArch64JIT.h"
#include "../Hypervisor.h"
LOG_CHANNEL(jit_log, "JIT");
#define STDOUT_DEBUG
#ifndef STDOUT_DEBUG
#define DPRINT jit_log.trace
#else
#define DPRINT(...)\
do {\
printf(__VA_ARGS__);\
printf("\n");\
fflush(stdout);\
} while (0)
#endif
namespace aarch64
{
// FIXME: This really should be part of fmt
@ -23,11 +38,11 @@ namespace aarch64
using function_info_t = GHC_frame_preservation_pass::function_info_t;
GHC_frame_preservation_pass::GHC_frame_preservation_pass(
gpr base_reg,
u32 hv_ctx_offset,
const std::vector<std::pair<std::string, gpr>>& base_register_lookup,
std::function<bool(const std::string&)> exclusion_callback)
{
execution_context.base_register = base_reg;
execution_context.base_register_lookup = base_register_lookup;
execution_context.hypervisor_context_offset = hv_ctx_offset;
this->exclusion_callback = exclusion_callback;
}
@ -118,6 +133,13 @@ namespace aarch64
instruction_info_t result{};
if (auto ci = llvm::dyn_cast<llvm::CallInst>(i))
{
// Watch out for injected ASM blocks...
if (llvm::isa<llvm::InlineAsm>(ci->getCalledOperand()))
{
// Not a real call. This is just an insert of inline asm
return result;
}
result.is_call_inst = true;
result.is_returning = true;
result.preserve_stack = !ci->isTailCall();
@ -126,12 +148,15 @@ namespace aarch64
if (!result.callee)
{
// TODO: What are these?????? Patchpoints maybe? Need to check again
result.is_call_inst = f.getName() == "__spu-null";
// Indirect call (call from raw value).
result.is_indirect = true;
result.callee_is_GHC = ci->getCallingConv() == llvm::CallingConv::GHC;
result.callee_name = "__indirect_call";
}
else
{
result.callee_is_GHC = result.callee->getCallingConv() == llvm::CallingConv::GHC;
result.callee_name = result.callee->getName().str();
}
return result;
}
@ -145,7 +170,8 @@ namespace aarch64
auto targetbb = bi->getSuccessor(0);
result.callee = targetbb->getParent();
result.is_call_inst = result.callee->getName() != f.getName();
result.callee_name = result.callee->getName().str();
result.is_call_inst = result.callee_name != f.getName();
}
return result;
@ -155,10 +181,11 @@ namespace aarch64
{
// Very unlikely to be the same function. Can be considered a function exit.
ensure(bi->getNumDestinations() == 1);
auto targetbb = bi->getSuccessor(0);
auto targetbb = ensure(bi->getSuccessor(0)); // This is guaranteed to fail but I've yet to encounter this
result.callee = targetbb->getParent();
result.is_call_inst = result.callee->getName() != f.getName();
result.callee_name = result.callee->getName().str();
result.is_call_inst = result.callee_name != f.getName();
return result;
}
@ -168,7 +195,8 @@ namespace aarch64
auto targetbb = bi->getSuccessor(0);
result.callee = targetbb->getParent();
result.is_call_inst = result.callee->getName() != f.getName();
result.callee_name = result.callee->getName().str();
result.is_call_inst = result.callee_name != f.getName();
return result;
}
@ -178,13 +206,29 @@ namespace aarch64
auto targetbb = bi->getSuccessor(0);
result.callee = targetbb->getParent();
result.is_call_inst = result.callee->getName() != f.getName();
result.callee_name = result.callee->getName().str();
result.is_call_inst = result.callee_name != f.getName();
return result;
}
return result;
}
// Resolves which GPR holds the thread-context base pointer for a given callee.
// The lookup table is scanned in declaration order; the first prefix that
// matches the callee's name wins. Unmatched names fall back to x19.
gpr GHC_frame_preservation_pass::get_base_register_for_call(const std::string& callee_name)
{
    for (const auto& [prefix, reg] : execution_context.base_register_lookup)
    {
        if (callee_name.starts_with(prefix))
        {
            return reg;
        }
    }

    // No pattern matched - use the default base register (x19)
    return aarch64::x19;
}
void GHC_frame_preservation_pass::run(llvm::IRBuilder<>* irb, llvm::Function& f)
{
if (f.getCallingConv() != llvm::CallingConv::GHC)
@ -200,6 +244,14 @@ namespace aarch64
}
const auto this_name = f.getName().str();
if (visited_functions.find(this_name) != visited_functions.end())
{
// Already processed. Only useful when recursing which is currently not used.
DPRINT("Function %s was already processed. Skipping.\n", this_name.c_str());
return;
}
visited_functions.insert(this_name);
if (exclusion_callback && exclusion_callback(this_name))
{
// Function is explicitly excluded
@ -220,14 +272,6 @@ namespace aarch64
// Asm snippets for patching stack frame
std::string frame_prologue, frame_epilogue;
// Return address reload on exit. This is safer than trying to stuff things into the stack frame since the size is largely just guesswork at this time.
std::string x30_tail_restore = fmt::format(
"mov x30, #%u;\n" // Load offset to last gateway exit
"add x30, x%u, x30;\n" // Add to base register
"ldr x30, [x30];\n", // Load x30
execution_context.hypervisor_context_offset,
static_cast<u32>(execution_context.base_register));
if (function_info.stack_frame_size > 0)
{
// NOTE: The stack frame here is purely optional, we can pre-allocate scratch on the gateway.
@ -235,8 +279,12 @@ namespace aarch64
frame_prologue = fmt::format("sub sp, sp, #%u;", function_info.stack_frame_size);
frame_epilogue = fmt::format("add sp, sp, #%u;", function_info.stack_frame_size);
// Emit the frame prologue
LLVM_ASM_0(frame_prologue, irb, f.getContext());
// Emit the frame prologue. We use a BB here for extra safety as it solves the problem of backwards jumps re-executing the prologue.
auto functionStart = &f.front();
auto prologueBB = llvm::BasicBlock::Create(f.getContext(), "", &f, functionStart);
irb->SetInsertPoint(prologueBB, prologueBB->begin());
LLVM_ASM_VOID(frame_prologue, irb, f.getContext());
irb->CreateBr(functionStart);
}
// Now we start processing
@ -259,7 +307,6 @@ namespace aarch64
if (cf->hasFnAttribute(llvm::Attribute::AlwaysInline) || callee_name.starts_with("llvm."))
{
// Always inlined call. Likely inline Asm. Skip
// log("Function %s will ignore call to intrinsic function %s\n", this_name.c_str(), callee_name.c_str());
++bit;
continue;
}
@ -278,48 +325,62 @@ namespace aarch64
if (function_info.stack_frame_size > 0)
{
// 1. Nuke all scratch
LLVM_ASM_0(frame_epilogue, irb, f.getContext());
// 1. Nuke the local stack frame if any
LLVM_ASM_VOID(frame_epilogue, irb, f.getContext());
}
if (function_info.clobbers_x30)
{
// 2. Restore the gateway as the current return address
LLVM_ASM_0(x30_tail_restore, irb, f.getContext());
}
// 3. We're about to make a tail call. This means after this call, we're supposed to return immediately. In that case, don't link, lower to branch only.
// 2. We're about to make a tail call. This means after this call, we're supposed to return immediately. In that case, don't link, lower to branch only.
// Note that branches have some undesirable side-effects. For one, we lose the argument inputs, which the callee is expecting.
// This means we burn some cycles on every exit, but in return we do not require one instruction on the prologue + the ret chain is eliminated.
// No ret-chain also means two BBs can call each other indefinitely without running out of stack without relying on llvm to optimize that away.
std::string exit_fn;
auto ci = ensure(llvm::dyn_cast<llvm::CallInst>(original_inst));
auto operand_count = ci->getNumOperands();
auto operand_count = ci->getNumOperands() - 1; // The last operand is the callee, not a real operand
std::vector<std::string> constraints;
std::vector<llvm::Value*> args;
// We now load the callee args.
// FIXME: This is often times redundant and wastes cycles, we'll clean this up in a MachineFunction pass later.
int base_reg = execution_context.base_register;
int args_base_reg = instruction_info.callee_is_GHC ? aarch64::x19 : aarch64::x0; // GHC args are always x19..x25
for (unsigned i = 0; i < operand_count; ++i)
{
args.push_back(ci->getOperand(i));
exit_fn += fmt::format("mov x%d, $%u;\n", base_reg++, i);
exit_fn += fmt::format("mov x%d, $%u;\n", args_base_reg++, i);
constraints.push_back("r");
}
std::copy(ci->operands().begin(), ci->operands().end(), args.begin());
auto context_base_reg = get_base_register_for_call(instruction_info.callee_name);
if (!instruction_info.callee_is_GHC)
{
// For non-GHC calls, we have to remap the arguments to x0...
context_base_reg = static_cast<gpr>(context_base_reg - 19);
}
if (function_info.clobbers_x30)
{
// 3. Restore the exit gate as the current return address
// We want to do this after loading the arguments in case there was any spilling involved.
DPRINT("Patching call from %s to %s on register %d...",
this_name.c_str(),
instruction_info.callee_name.c_str(),
static_cast<int>(context_base_reg));
const auto x30_tail_restore = fmt::format(
"ldr x30, [x%u, #%u];\n", // Load x30 from thread context
static_cast<u32>(context_base_reg),
execution_context.hypervisor_context_offset);
exit_fn += x30_tail_restore;
}
auto target = ensure(ci->getCalledOperand());
args.push_back(target);
if (ci->isIndirectCall())
if (instruction_info.is_indirect)
{
constraints.push_back("r");
exit_fn += fmt::format(
"mov x15, $%u;\n"
"br x15",
operand_count);
exit_fn += fmt::format("br $%u;\n", operand_count);
}
else
{
@ -328,7 +389,7 @@ namespace aarch64
}
// Emit the branch
LLVM_ASM(exit_fn, args, join_strings(constraints, ","), irb, f.getContext());
llvm_asm(irb, exit_fn, args, join_strings(constraints, ","), f.getContext());
// Delete original call instruction
bit = ci->eraseFromParent();

View File

@ -39,14 +39,16 @@ namespace aarch64
bool is_returning; // This instruction "returns" to the next instruction (typically just llvm::CallInst*)
bool callee_is_GHC; // The other function is GHC
bool is_tail_call; // Tail call. Assume it is an exit/terminator.
bool is_indirect; // Indirect call. Target is the first operand.
llvm::Function* callee; // Callee if any
std::string callee_name; // Name of the callee.
};
protected:
std::unordered_set<std::string> visited_functions;
struct
{
gpr base_register;
std::vector<std::pair<std::string, gpr>> base_register_lookup;
u32 hypervisor_context_offset;
} execution_context;
@ -57,11 +59,13 @@ namespace aarch64
function_info_t preprocess_function(llvm::Function& f);
instruction_info_t decode_instruction(llvm::Function& f, llvm::Instruction* i);
gpr get_base_register_for_call(const std::string& callee_name);
public:
GHC_frame_preservation_pass(
gpr base_reg,
u32 hv_ctx_offset,
const std::vector<std::pair<std::string, gpr>>& base_register_lookup = {},
std::function<bool(const std::string&)> exclusion_callback = {});
~GHC_frame_preservation_pass() = default;

View File

@ -3089,6 +3089,9 @@ protected:
void initialize(llvm::LLVMContext& context, llvm::ExecutionEngine& engine);
// Run intrinsics replacement pass
void replace_intrinsics(llvm::Function&);
public:
// Register a transformation pass to be run before final compilation by llvm
void register_transform_pass(std::unique_ptr<translator_pass>& pass)
@ -3797,9 +3800,6 @@ public:
}
}
// Run intrinsics replacement pass
void replace_intrinsics(llvm::Function&);
// Finalize processing
void run_transforms(llvm::Function&);
@ -3935,25 +3935,39 @@ llvm::InlineAsm* compile_inline_asm(
}
// Helper for ASM generation with dynamic number of arguments
// Helper for ASM generation with a dynamic number of arguments.
// Builds an inline-asm callee whose parameter types mirror the supplied
// operand values, inserts the call at the builder's current position and
// marks it always-inline so it is folded into the caller.
static inline
llvm::CallInst* llvm_asm(
    llvm::IRBuilder<>* irb,
    std::string& asm_,
    llvm::ArrayRef<llvm::Value*> args,
    const std::string& constraints,
    llvm::LLVMContext& context)
{
    // Collect one type per operand; with no operands the ArrayRef stays empty.
    llvm::ArrayRef<llvm::Type*> types_ref = std::nullopt;
    std::vector<llvm::Type*> types;
    types.reserve(args.size());
    for (const auto& arg : args)
    {
        types.push_back(arg->getType());
    }
    if (!types.empty())
    {
        types_ref = types;
    }

    // Inline asm snippets produce no IR-level result, so the call returns void.
    const auto return_type = llvm::Type::getVoidTy(context);
    const auto callee = compile_inline_asm(return_type, types_ref, asm_, constraints);
    auto call_inst = irb->CreateCall(callee, args);
    call_inst->addFnAttr(llvm::Attribute::AlwaysInline);
    return call_inst;
}
#define LLVM_ASM(asm_, args, constraints, irb, ctx)\
do {\
std::vector<llvm::Type*> _argTypes;\
_argTypes.reserve(args.size());\
for (const auto& _arg : args) _argTypes.push_back(_arg->getType());\
auto _returnType = llvm::Type::getVoidTy(ctx); \
llvm::FunctionCallee _callee = compile_inline_asm(_returnType, _argTypes, asm_, constraints); \
auto _c = irb->CreateCall(_callee, args); \
_c->addFnAttr(llvm::Attribute::AlwaysInline); \
} while(0)
llvm_asm(irb, asm_, args, constraints, ctx)
// Helper for ASM generation with 0 args
#define LLVM_ASM_0(asm_, irb, ctx)\
do {\
const auto _voidTy = llvm::Type::getVoidTy(ctx); \
auto _callee = compile_inline_asm(_voidTy, std::nullopt, asm_, ""); \
auto _c = irb->CreateCall(_callee); \
_c->setTailCall(); \
_c->addFnAttr(llvm::Attribute::AlwaysInline); \
} while(0)
#define LLVM_ASM_VOID(asm_, irb, ctx)\
llvm_asm(irb, asm_, {}, "", ctx)
#endif

View File

@ -371,8 +371,16 @@ const auto ppu_gateway = build_function_asm<void(*)(ppu_thread*)>("ppu_gateway",
c.ldr(a64::x24, arm::Mem(gpr_addr_reg, 8));
c.ldr(a64::x25, arm::Mem(gpr_addr_reg, 16));
// GHC frame for the guest. This seems dodgy but the only thing stored on stack is actually registers before making calls to C++ code.
// Injected stack frames also work, but are not free and are completely unnecessary.
// Thread context save. This is needed for PPU because different functions can switch between x19 and x20 for the base register.
We need a different solution to ensure that no matter which version, we get the right value on far return.
c.mov(a64::x26, ppu_t_base);
// Save thread pointer to stack. SP is the only register preserved across GHC calls.
c.sub(a64::sp, a64::sp, Imm(16));
c.str(a64::x20, arm::Mem(a64::sp));
// GHC scratchpad mem. If managed correctly (i.e no returns ever), GHC functions should never require a stack frame.
// We allocate a slab to use for all functions as they tail-call into each other.
c.sub(a64::sp, a64::sp, Imm(4096));
// Execute LLE call
@ -381,11 +389,14 @@ const auto ppu_gateway = build_function_asm<void(*)(ppu_thread*)>("ppu_gateway",
// Return address after far jump. Reset sp and start unwinding...
c.bind(hv_ctx_pc);
// Execution guard undo (unneeded since we're going to hard-reset the SP)
//c.add(a64::sp, a64::sp, Imm(4096));
// Clear scratchpad allocation
c.add(a64::sp, a64::sp, Imm(4096));
c.ldr(a64::x20, arm::Mem(a64::sp));
c.add(a64::sp, a64::sp, Imm(16));
// We either got here through normal "ret" which keeps our x20 intact, or we jumped here and the escape reset our x20 reg
// Either way, x20 contains our thread base and we forcefully reset the stack pointer
// Either way, x26 contains our thread base and we forcefully reset the stack pointer
c.add(a64::x14, a64::x20, Imm(hv_register_array_offset)); // Per-thread context save
c.ldr(a64::x15, arm::Mem(a64::x14, 8));

View File

@ -36,9 +36,21 @@ PPUTranslator::PPUTranslator(LLVMContext& context, Module* _module, const ppu_mo
// Initialize transform passes
#ifdef ARCH_ARM64
std::unique_ptr<translator_pass> ghc_fixup_pass = std::make_unique<aarch64::GHC_frame_preservation_pass>(
aarch64::x20, ::offset32(&ppu_thread::hv_ctx));
// Base reg table definition
// Assume all functions named __0x... are PPU functions and take the m_exec as the first arg
std::vector<std::pair<std::string, aarch64::gpr>> base_reg_lookup = {
{ "__0x", aarch64::x20 }, // PPU blocks
{ "__indirect", aarch64::x20 }, // Indirect jumps
{ "ppu_", aarch64::x19 }, // Fixed JIT helpers (e.g ppu_gateway)
{ "__", aarch64::x19 } // Probably link table entries
};
// Create transform pass
std::unique_ptr<translator_pass> ghc_fixup_pass = std::make_unique<aarch64::GHC_frame_preservation_pass>(
::offset32(&ppu_thread::hv_ctx),
base_reg_lookup);
// Register it
register_transform_pass(ghc_fixup_pass);
#endif
@ -282,7 +294,7 @@ Function* PPUTranslator::Translate(const ppu_function& info)
}
}
replace_intrinsics(*m_function);
run_transforms(*m_function);
return m_function;
}
@ -334,7 +346,7 @@ Function* PPUTranslator::GetSymbolResolver(const ppu_module& info)
{
// Possible special case for no functions (allowing the do-while optimization)
m_ir->CreateRetVoid();
replace_intrinsics(*m_function);
run_transforms(*m_function);
return m_function;
}
@ -392,7 +404,7 @@ Function* PPUTranslator::GetSymbolResolver(const ppu_module& info)
m_ir->CreateRetVoid();
replace_intrinsics(*m_function);
run_transforms(*m_function);
return m_function;
}
@ -5357,7 +5369,7 @@ void PPUTranslator::build_interpreter()
this->i(op); \
FlushRegisters(); \
m_ir->CreateRetVoid(); \
replace_intrinsics(*m_function); \
run_transforms(*m_function); \
}
BUILD_VEC_INST(VADDCUW);