From 7979c5d9eb0a06887f5070d24a4b4fc187b7805e Mon Sep 17 00:00:00 2001
From: kd-11
Date: Mon, 5 Aug 2024 20:51:02 +0300
Subject: [PATCH] Finalize PPU migration to the frame pass system

---
 rpcs3/Emu/CPU/Backends/AArch64JIT.cpp | 70 +++++++++++++++------------
 rpcs3/Emu/CPU/Backends/AArch64JIT.h   | 21 ++++----
 rpcs3/Emu/CPU/CPUTranslator.h         |  2 +-
 rpcs3/Emu/Cell/PPUTranslator.cpp      | 37 ++++++++------
 4 files changed, 74 insertions(+), 56 deletions(-)

diff --git a/rpcs3/Emu/CPU/Backends/AArch64JIT.cpp b/rpcs3/Emu/CPU/Backends/AArch64JIT.cpp
index 6f486a578d..73aad317a0 100644
--- a/rpcs3/Emu/CPU/Backends/AArch64JIT.cpp
+++ b/rpcs3/Emu/CPU/Backends/AArch64JIT.cpp
@@ -4,17 +4,17 @@
 
 LOG_CHANNEL(jit_log, "JIT");
 
-#define STDOUT_DEBUG
+#define STDOUT_DEBUG 0
 
-#ifndef STDOUT_DEBUG
-#define DPRINT jit_log.trace
-#else
+#if STDOUT_DEBUG
 #define DPRINT(...)\
 	do {\
 		printf(__VA_ARGS__);\
 		printf("\n");\
 		fflush(stdout);\
 	} while (0)
+#else
+#define DPRINT jit_log.trace
 #endif
 
 namespace aarch64
@@ -37,15 +37,9 @@ namespace aarch64
 	using instruction_info_t = GHC_frame_preservation_pass::instruction_info_t;
 	using function_info_t = GHC_frame_preservation_pass::function_info_t;
 
-	GHC_frame_preservation_pass::GHC_frame_preservation_pass(
-		u32 hv_ctx_offset,
-		const std::vector<std::pair<std::string, gpr>>& base_register_lookup,
-		std::function<bool(const std::string&)> exclusion_callback)
-	{
-		execution_context.base_register_lookup = base_register_lookup;
-		execution_context.hypervisor_context_offset = hv_ctx_offset;
-		this->exclusion_callback = exclusion_callback;
-	}
+	GHC_frame_preservation_pass::GHC_frame_preservation_pass(const config_t& configuration)
+		: execution_context(configuration)
+	{}
 
 	void GHC_frame_preservation_pass::reset()
 	{
@@ -94,23 +88,29 @@ namespace aarch64
 			return result;
 		}
 
-		// Stack frame estimation. SPU code can be very long and consumes several KB of stack.
-		u32 stack_frame_size = 128u;
-		// Actual ratio is usually around 1:4
-		const u32 expected_compiled_instr_count = f.getInstructionCount() * 4;
-		// Because GHC doesn't preserve stack (all stack is scratch), we know we'll start to spill once we go over the number of actual regs.
-		// We use a naive allocator that just assumes each instruction consumes a register slot. We "spill" every 32 instructions.
-		// FIXME: Aggressive spill is only really a thing with vector operations. We can detect those instead.
-		// A proper fix is to port this to a MF pass, but I have PTSD from working at MF level.
-		const u32 spill_pages = (expected_compiled_instr_count + 127u) / 128u;
-		stack_frame_size *= std::min(spill_pages, 32u); // 128 to 4k dynamic. It is unlikely that any frame consumes more than 4096 bytes
+		if (execution_context.use_stack_frames)
+		{
+			// Stack frame estimation. SPU code can be very long and consumes several KB of stack.
+			u32 stack_frame_size = 128u;
+			// Actual ratio is usually around 1:4
+			const u32 expected_compiled_instr_count = f.getInstructionCount() * 4;
+			// Because GHC doesn't preserve stack (all stack is scratch), we know we'll start to spill once we go over the number of actual regs.
+			// We use a naive allocator that just assumes each instruction consumes a register slot. We "spill" every 32 instructions.
+			// FIXME: Aggressive spill is only really a thing with vector operations. We can detect those instead.
+			// A proper fix is to port this to a MF pass, but I have PTSD from working at MF level.
+			const u32 spill_pages = (expected_compiled_instr_count + 127u) / 128u;
+			stack_frame_size *= std::min(spill_pages, 32u); // 128 to 4k dynamic. It is unlikely that any frame consumes more than 4096 bytes
+
+			result.stack_frame_size = stack_frame_size;
+		}
 
-		result.stack_frame_size = stack_frame_size;
 		result.instruction_count = f.getInstructionCount();
 		result.num_external_calls = 0;
 
 		// The LR is not spared by LLVM in cases where there is a lot of spilling.
-		// This is another thing to be moved to a MachineFunction pass.
+		// This is much easier to manage with a custom LLVM branch as we can just mark X30 as off-limits as a GPR.
+		// This is another thing to be moved to a MachineFunction pass. Ideally we should check the instruction stream for writes to LR and reload it on exit.
+		// For now, assume it is dirtied if the function is of any reasonable length.
 		result.clobbers_x30 = result.instruction_count > 32;
 
 		for (auto& bb : f)
@@ -323,13 +323,7 @@ namespace aarch64
 			llvm::Instruction* original_inst = llvm::dyn_cast<llvm::Instruction>(bit);
 			irb->SetInsertPoint(ensure(llvm::dyn_cast<llvm::Instruction>(bit)));
 
-			if (function_info.stack_frame_size > 0)
-			{
-				// 1. Nuke the local stack frame if any
-				LLVM_ASM_VOID(frame_epilogue, irb, f.getContext());
-			}
-
-			// 2. We're about to make a tail call. This means after this call, we're supposed to return immediately. In that case, don't link, lower to branch only.
+			// We're about to make a tail call. This means after this call, we're supposed to return immediately. In that case, don't link, lower to branch only.
 			// Note that branches have some undesirable side-effects. For one, we lose the argument inputs, which the callee is expecting.
 			// This means we burn some cycles on every exit, but in return we do not require one instruction on the prologue + the ret chain is eliminated.
 			// No ret-chain also means two BBs can call each other indefinitely without running out of stack without relying on llvm to optimize that away.
@@ -374,6 +368,18 @@ namespace aarch64
 				exit_fn += x30_tail_restore;
 			}
 
+			// Stack cleanup. We need to do this last to allow the spiller to find its own spilled variables.
+			if (function_info.stack_frame_size > 0)
+			{
+				exit_fn += frame_epilogue;
+			}
+
+			if (execution_context.debug_info)
+			{
+				// Store x27 as our current address taking the place of LR (for debugging since bt is now useless)
+				exit_fn += "adr x27, .;\n";
+			}
+
 			auto target = ensure(ci->getCalledOperand());
 			args.push_back(target);
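The frame-size heuristic in the hunk above is easiest to sanity-check in isolation. Below is a minimal standalone sketch of the same arithmetic; estimate_stack_frame_size is an invented name, u32 is spelled as uint32_t, and the IR instruction count is taken as a plain parameter instead of being read from the llvm::Function:

#include <algorithm>
#include <cstdint>

// One 128-byte "spill page" per 128 expected compiled instructions
// (IR count * 4), clamped to 32 pages, so frames scale from 128 bytes to 4 KiB.
static uint32_t estimate_stack_frame_size(uint32_t ir_instruction_count)
{
	const uint32_t expected_compiled_instr_count = ir_instruction_count * 4;
	const uint32_t spill_pages = (expected_compiled_instr_count + 127u) / 128u;
	return 128u * std::min(spill_pages, 32u);
}

For example, a 40-instruction IR function maps to 160 expected compiled instructions, two spill pages, and a 256-byte frame; anything above roughly a thousand IR instructions hits the 4 KiB ceiling.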
diff --git a/rpcs3/Emu/CPU/Backends/AArch64JIT.h b/rpcs3/Emu/CPU/Backends/AArch64JIT.h
index e5a8958d63..fc8064f1d5 100644
--- a/rpcs3/Emu/CPU/Backends/AArch64JIT.h
+++ b/rpcs3/Emu/CPU/Backends/AArch64JIT.h
@@ -43,14 +43,20 @@ namespace aarch64
 			llvm::Function* callee;  // Callee if any
 			std::string callee_name; // Name of the callee.
 		};
+
+		struct config_t
+		{
+			bool debug_info = false;           // Record debug information
+			bool use_stack_frames = true;      // Allocate a stack frame for each function. The gateway can alternatively manage a global stack to use as scratch.
+			u32 hypervisor_context_offset = 0; // Offset within the "thread" object where we can find the hypervisor context (registers configured at gateway).
+			std::function<bool(const std::string&)> exclusion_callback;    // [Optional] Callback run on each function before transform. Return "true" to exclude from frame processing.
+			std::vector<std::pair<std::string, gpr>> base_register_lookup; // [Optional] Function lookup table to determine the location of the "thread" context.
+		};
+
 	protected:
 		std::unordered_set<std::string> visited_functions;
 
-		struct
-		{
-			std::vector<std::pair<std::string, gpr>> base_register_lookup;
-			u32 hypervisor_context_offset;
-		} execution_context;
+		config_t execution_context;
 
 		std::function<bool(const std::string&)> exclusion_callback;
 
@@ -63,10 +69,7 @@ namespace aarch64
 		gpr get_base_register_for_call(const std::string& callee_name);
 
 	public:
-		GHC_frame_preservation_pass(
-			u32 hv_ctx_offset,
-			const std::vector<std::pair<std::string, gpr>>& base_register_lookup = {},
-			std::function<bool(const std::string&)> exclusion_callback = {});
+		GHC_frame_preservation_pass(const config_t& configuration);
 		~GHC_frame_preservation_pass() = default;
 
 		void run(llvm::IRBuilder<>* irb, llvm::Function& f) override;
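Since config_t replaces the old three-argument constructor, a caller now describes the pass declaratively and can leave unused knobs at their defaults. A hypothetical caller that keeps per-function stack frames and excludes a family of helpers might look like the sketch below; the "dbg_" prefix and the helper name are invented for illustration, the include path follows the file layout in this patch, and the designated initializers must keep the member order declared above:

#include <memory>
#include <string>

#include "Emu/CPU/Backends/AArch64JIT.h" // declares config_t and the pass (path per this patch)

static std::unique_ptr<aarch64::GHC_frame_preservation_pass> make_example_pass(u32 hv_ctx_offset)
{
	aarch64::GHC_frame_preservation_pass::config_t config =
	{
		.debug_info = false,
		.use_stack_frames = true,                 // let the pass size a frame per function
		.hypervisor_context_offset = hv_ctx_offset, // e.g. ::offset32(&thread_type::hv_ctx)
		.exclusion_callback = [](const std::string& name)
		{
			return name.starts_with("dbg_");      // hypothetical prefix to skip
		},
		.base_register_lookup = {},               // no prefix table; use the pass defaults
	};

	return std::make_unique<aarch64::GHC_frame_preservation_pass>(config);
}

Because the constructor takes the struct by const reference and copies it, one config value can also be built once and reused to construct several pass instances.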
diff --git a/rpcs3/Emu/CPU/CPUTranslator.h b/rpcs3/Emu/CPU/CPUTranslator.h
index 91d14033ac..bfaa14ecaa 100644
--- a/rpcs3/Emu/CPU/CPUTranslator.h
+++ b/rpcs3/Emu/CPU/CPUTranslator.h
@@ -3938,7 +3938,7 @@ llvm::InlineAsm* compile_inline_asm(
 
 static inline llvm::CallInst* llvm_asm(
 	llvm::IRBuilder<>* irb,
-	std::string& asm_,
+	const std::string& asm_,
 	llvm::ArrayRef<llvm::Value*> args,
 	const std::string& constraints,
 	llvm::LLVMContext& context)
diff --git a/rpcs3/Emu/Cell/PPUTranslator.cpp b/rpcs3/Emu/Cell/PPUTranslator.cpp
index c9040b1b4d..98bdb83333 100644
--- a/rpcs3/Emu/Cell/PPUTranslator.cpp
+++ b/rpcs3/Emu/Cell/PPUTranslator.cpp
@@ -36,22 +36,31 @@ PPUTranslator::PPUTranslator(LLVMContext& context, Module* _module, const ppu_mo
 
 	// Initialize transform passes
 #ifdef ARCH_ARM64
-	// Base reg table definition
-	// Assume all functions named __0x... are PPU functions and take the m_exec as the first arg
-	std::vector<std::pair<std::string, aarch64::gpr>> base_reg_lookup = {
-		{ "__0x", aarch64::x20 },       // PPU blocks
-		{ "__indirect", aarch64::x20 }, // Indirect jumps
-		{ "ppu_", aarch64::x19 },       // Fixed JIT helpers (e.g. ppu_gateway)
-		{ "__", aarch64::x19 }          // Probably link table entries
-	};
+	{
+		// Base reg table definition
+		// Assume all functions named __0x... are PPU functions and take the m_exec as the first arg
+		std::vector<std::pair<std::string, aarch64::gpr>> base_reg_lookup = {
+			{ "__0x", aarch64::x20 },       // PPU blocks
+			{ "__indirect", aarch64::x20 }, // Indirect jumps
+			{ "ppu_", aarch64::x19 },       // Fixed JIT helpers (e.g. ppu_gateway)
+			{ "__", aarch64::x19 }          // Probably link table entries
+		};
 
-	// Create transform pass
-	std::unique_ptr<translator_pass> ghc_fixup_pass = std::make_unique<aarch64::GHC_frame_preservation_pass>(
-		::offset32(&ppu_thread::hv_ctx),
-		base_reg_lookup);
+		aarch64::GHC_frame_preservation_pass::config_t config =
+		{
+			.debug_info = false,       // Set to "true" to insert debug frames on x27
+			.use_stack_frames = false, // GW allocates 4k of scratch on the stack
+			.hypervisor_context_offset = ::offset32(&ppu_thread::hv_ctx),
+			.exclusion_callback = {},  // Unused, we don't have special exclusion functions on PPU
+			.base_register_lookup = base_reg_lookup
+		};
 
-	// Register it
-	register_transform_pass(ghc_fixup_pass);
+		// Create transform pass
+		std::unique_ptr<translator_pass> ghc_fixup_pass = std::make_unique<aarch64::GHC_frame_preservation_pass>(config);
+
+		// Register it
+		register_transform_pass(ghc_fixup_pass);
+	}
 #endif
 
 	// Thread context struct (TODO: safer member access)
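One subtlety in base_reg_lookup is ordering: the generic "__" entry sits after the more specific "__0x" and "__indirect" entries, which only works if the consumer applies first-match-wins semantics. A standalone sketch of that consultation under that assumption follows; find_base_register is an illustrative stand-in for the pass's get_base_register_for_call, and the gpr enum is reduced to the two registers used by the PPU table:

#include <string>
#include <utility>
#include <vector>

enum gpr { x19, x20 }; // illustrative subset of the AArch64 GPR set

static gpr find_base_register(const std::vector<std::pair<std::string, gpr>>& lookup,
                              const std::string& callee_name, gpr fallback)
{
	for (const auto& [prefix, reg] : lookup)
	{
		// First match wins, so a "__0x..." name resolves to x20 before
		// the catch-all "__" entry can map it to x19.
		if (callee_name.starts_with(prefix))
		{
			return reg;
		}
	}
	return fallback;
}

With the PPU table above, an invented block name such as "__0x10200" resolves to x20, "ppu_gateway" resolves to x19, and any other "__"-prefixed symbol falls through to the catch-all x19 entry.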