From 7979c5d9eb0a06887f5070d24a4b4fc187b7805e Mon Sep 17 00:00:00 2001
From: kd-11
Date: Mon, 5 Aug 2024 20:51:02 +0300
Subject: [PATCH] Finalize PPU migration to the frame pass system

---
 rpcs3/Emu/CPU/Backends/AArch64JIT.cpp | 70 +++++++++++++++------------
 rpcs3/Emu/CPU/Backends/AArch64JIT.h   | 21 ++++----
 rpcs3/Emu/CPU/CPUTranslator.h         |  2 +-
 rpcs3/Emu/Cell/PPUTranslator.cpp      | 37 ++++++++------
 4 files changed, 74 insertions(+), 56 deletions(-)

diff --git a/rpcs3/Emu/CPU/Backends/AArch64JIT.cpp b/rpcs3/Emu/CPU/Backends/AArch64JIT.cpp
index 6f486a578d..73aad317a0 100644
--- a/rpcs3/Emu/CPU/Backends/AArch64JIT.cpp
+++ b/rpcs3/Emu/CPU/Backends/AArch64JIT.cpp
@@ -4,17 +4,17 @@
 
 LOG_CHANNEL(jit_log, "JIT");
 
-#define STDOUT_DEBUG
+#define STDOUT_DEBUG 0
 
-#ifndef STDOUT_DEBUG
-#define DPRINT jit_log.trace
-#else
+#if STDOUT_DEBUG
 #define DPRINT(...)\
 	do {\
 		printf(__VA_ARGS__);\
 		printf("\n");\
 		fflush(stdout);\
 	} while (0)
+#else
+#define DPRINT jit_log.trace
 #endif
 
 namespace aarch64
@@ -37,15 +37,9 @@ namespace aarch64
 	using instruction_info_t = GHC_frame_preservation_pass::instruction_info_t;
 	using function_info_t = GHC_frame_preservation_pass::function_info_t;
 
-	GHC_frame_preservation_pass::GHC_frame_preservation_pass(
-		u32 hv_ctx_offset,
-		const std::vector<std::pair<std::string, gpr>>& base_register_lookup,
-		std::function<bool(const std::string&)> exclusion_callback)
-	{
-		execution_context.base_register_lookup = base_register_lookup;
-		execution_context.hypervisor_context_offset = hv_ctx_offset;
-		this->exclusion_callback = exclusion_callback;
-	}
+	GHC_frame_preservation_pass::GHC_frame_preservation_pass(const config_t& configuration)
+		: execution_context(configuration)
+	{}
 
 	void GHC_frame_preservation_pass::reset()
 	{
@@ -94,23 +88,29 @@ namespace aarch64
 			return result;
 		}
 
-		// Stack frame estimation. SPU code can be very long and consumes several KB of stack.
-		u32 stack_frame_size = 128u;
-		// Actual ratio is usually around 1:4
-		const u32 expected_compiled_instr_count = f.getInstructionCount() * 4;
-		// Because GHC doesn't preserve stack (all stack is scratch), we know we'll start to spill once we go over the number of actual regs.
-		// We use a naive allocator that just assumes each instruction consumes a register slot. We "spill" every 32 instructions.
-		// FIXME: Aggressive spill is only really a thing with vector operations. We can detect those instead.
-		// A proper fix is to port this to a MF pass, but I have PTSD from working at MF level.
-		const u32 spill_pages = (expected_compiled_instr_count + 127u) / 128u;
-		stack_frame_size *= std::min(spill_pages, 32u); // 128 to 4k dynamic. It is unlikely that any frame consumes more than 4096 bytes
+		if (execution_context.use_stack_frames)
+		{
+			// Stack frame estimation. SPU code can be very long and consumes several KB of stack.
+			u32 stack_frame_size = 128u;
+			// Actual ratio is usually around 1:4
+			const u32 expected_compiled_instr_count = f.getInstructionCount() * 4;
+			// Because GHC doesn't preserve stack (all stack is scratch), we know we'll start to spill once we go over the number of actual regs.
+			// We use a naive allocator that just assumes each instruction consumes a register slot. We "spill" every 32 instructions.
+			// FIXME: Aggressive spill is only really a thing with vector operations. We can detect those instead.
+			// A proper fix is to port this to a MF pass, but I have PTSD from working at MF level.
+			const u32 spill_pages = (expected_compiled_instr_count + 127u) / 128u;
+			stack_frame_size *= std::min(spill_pages, 32u); // 128 to 4k dynamic. It is unlikely that any frame consumes more than 4096 bytes
+
+			result.stack_frame_size = stack_frame_size;
+		}
 
-		result.stack_frame_size = stack_frame_size;
 		result.instruction_count = f.getInstructionCount();
 		result.num_external_calls = 0;
 
 		// The LR is not spared by LLVM in cases where there is a lot of spilling.
-		// This is another thing to be moved to a MachineFunction pass.
+		// This is much easier to manage with a custom LLVM branch as we can just mark X30 as off-limits as a GPR.
+		// This is another thing to be moved to a MachineFunction pass. Ideally we should check the instruction stream for writes to LR and reload it on exit.
+		// For now, assume it is dirtied if the function is of any reasonable length.
 		result.clobbers_x30 = result.instruction_count > 32;
 
 		for (auto& bb : f)
@@ -323,13 +323,7 @@ namespace aarch64
 			llvm::Instruction* original_inst = llvm::dyn_cast<llvm::Instruction>(bit);
 			irb->SetInsertPoint(ensure(llvm::dyn_cast<llvm::Instruction>(bit)));
 
-			if (function_info.stack_frame_size > 0)
-			{
-				// 1. Nuke the local stack frame if any
-				LLVM_ASM_VOID(frame_epilogue, irb, f.getContext());
-			}
-
-			// 2. We're about to make a tail call. This means after this call, we're supposed to return immediately. In that case, don't link, lower to branch only.
+			// We're about to make a tail call. This means after this call, we're supposed to return immediately. In that case, don't link, lower to branch only.
 			// Note that branches have some undesirable side-effects. For one, we lose the argument inputs, which the callee is expecting.
 			// This means we burn some cycles on every exit, but in return we do not require one instruction on the prologue + the ret chain is eliminated.
 			// No ret-chain also means two BBs can call each other indefinitely without running out of stack without relying on llvm to optimize that away.
@@ -374,6 +368,18 @@ namespace aarch64
 				exit_fn += x30_tail_restore;
 			}
 
+			// Stack cleanup. We need to do this last to allow the spiller to find its own spilled variables.
+			if (function_info.stack_frame_size > 0)
+			{
+				exit_fn += frame_epilogue;
+			}
+
+			if (execution_context.debug_info)
+			{
+				// Store x27 as our current address taking the place of LR (for debugging since bt is now useless)
+				exit_fn += "adr x27, .;\n";
+			}
+
 			auto target = ensure(ci->getCalledOperand());
 			args.push_back(target);
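The frame-size heuristic in the hunk above is easiest to sanity-check in isolation. Below is a minimal standalone sketch of the same arithmetic; estimate_stack_frame_size is an invented name, u32 is spelled as uint32_t, and the IR instruction count is taken as a plain parameter instead of being read from the llvm::Function:

#include <algorithm>
#include <cstdint>

// One 128-byte "spill page" per 128 expected compiled instructions
// (IR count * 4), clamped to 32 pages, so frames scale from 128 bytes to 4 KiB.
static uint32_t estimate_stack_frame_size(uint32_t ir_instruction_count)
{
	const uint32_t expected_compiled_instr_count = ir_instruction_count * 4;
	const uint32_t spill_pages = (expected_compiled_instr_count + 127u) / 128u;
	return 128u * std::min(spill_pages, 32u);
}

For example, a 40-instruction IR function maps to 160 expected compiled instructions, two spill pages, and a 256-byte frame; anything above roughly a thousand IR instructions hits the 4 KiB ceiling.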
diff --git a/rpcs3/Emu/CPU/Backends/AArch64JIT.h b/rpcs3/Emu/CPU/Backends/AArch64JIT.h
index e5a8958d63..fc8064f1d5 100644
--- a/rpcs3/Emu/CPU/Backends/AArch64JIT.h
+++ b/rpcs3/Emu/CPU/Backends/AArch64JIT.h
@@ -43,14 +43,20 @@ namespace aarch64
 			llvm::Function* callee;  // Callee if any
 			std::string callee_name; // Name of the callee.
 		};
+
+		struct config_t
+		{
+			bool debug_info = false;           // Record debug information
+			bool use_stack_frames = true;      // Allocate a stack frame for each function. The gateway can alternatively manage a global stack to use as scratch.
+			u32 hypervisor_context_offset = 0; // Offset within the "thread" object where we can find the hypervisor context (registers configured at gateway).
+			std::function<bool(const std::string&)> exclusion_callback;    // [Optional] Callback run on each function before transform. Return "true" to exclude from frame processing.
+			std::vector<std::pair<std::string, gpr>> base_register_lookup; // [Optional] Function lookup table to determine the location of the "thread" context.
+		};
+
 	protected:
 		std::unordered_set<std::string> visited_functions;
 
-		struct
-		{
-			std::vector<std::pair<std::string, gpr>> base_register_lookup;
-			u32 hypervisor_context_offset;
-		} execution_context;
+		config_t execution_context;
 
 		std::function<bool(const std::string&)> exclusion_callback;
 
@@ -63,10 +69,7 @@ namespace aarch64
 		gpr get_base_register_for_call(const std::string& callee_name);
 
 	public:
-		GHC_frame_preservation_pass(
-			u32 hv_ctx_offset,
-			const std::vector<std::pair<std::string, gpr>>& base_register_lookup = {},
-			std::function<bool(const std::string&)> exclusion_callback = {});
+		GHC_frame_preservation_pass(const config_t& configuration);
 		~GHC_frame_preservation_pass() = default;
 
 		void run(llvm::IRBuilder<>* irb, llvm::Function& f) override;
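Since config_t replaces the old three-argument constructor, a caller now describes the pass declaratively and can leave unused knobs at their defaults. A hypothetical caller that keeps per-function stack frames and excludes a family of helpers might look like the sketch below; the "dbg_" prefix and the helper name are invented for illustration, the include path follows the file layout in this patch, and the designated initializers must keep the member order declared above:

#include <memory>
#include <string>

#include "Emu/CPU/Backends/AArch64JIT.h" // declares config_t and the pass (path per this patch)

static std::unique_ptr<aarch64::GHC_frame_preservation_pass> make_example_pass(u32 hv_ctx_offset)
{
	aarch64::GHC_frame_preservation_pass::config_t config =
	{
		.debug_info = false,
		.use_stack_frames = true,                 // let the pass size a frame per function
		.hypervisor_context_offset = hv_ctx_offset, // e.g. ::offset32(&thread_type::hv_ctx)
		.exclusion_callback = [](const std::string& name)
		{
			return name.starts_with("dbg_");      // hypothetical prefix to skip
		},
		.base_register_lookup = {},               // no prefix table; use the pass defaults
	};

	return std::make_unique<aarch64::GHC_frame_preservation_pass>(config);
}

Because the constructor takes the struct by const reference and copies it, one config value can also be built once and reused to construct several pass instances.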
diff --git a/rpcs3/Emu/CPU/CPUTranslator.h b/rpcs3/Emu/CPU/CPUTranslator.h
index 91d14033ac..bfaa14ecaa 100644
--- a/rpcs3/Emu/CPU/CPUTranslator.h
+++ b/rpcs3/Emu/CPU/CPUTranslator.h
@@ -3938,7 +3938,7 @@ llvm::InlineAsm* compile_inline_asm(
 
 static inline llvm::CallInst* llvm_asm(
 	llvm::IRBuilder<>* irb,
-	std::string& asm_,
+	const std::string& asm_,
 	llvm::ArrayRef<llvm::Value*> args,
 	const std::string& constraints,
 	llvm::LLVMContext& context)
diff --git a/rpcs3/Emu/Cell/PPUTranslator.cpp b/rpcs3/Emu/Cell/PPUTranslator.cpp
index c9040b1b4d..98bdb83333 100644
--- a/rpcs3/Emu/Cell/PPUTranslator.cpp
+++ b/rpcs3/Emu/Cell/PPUTranslator.cpp
@@ -36,22 +36,31 @@ PPUTranslator::PPUTranslator(LLVMContext& context, Module* _module, const ppu_mo
 
 	// Initialize transform passes
 #ifdef ARCH_ARM64
-	// Base reg table definition
-	// Assume all functions named __0x... are PPU functions and take the m_exec as the first arg
-	std::vector<std::pair<std::string, aarch64::gpr>> base_reg_lookup = {
-		{ "__0x", aarch64::x20 },       // PPU blocks
-		{ "__indirect", aarch64::x20 }, // Indirect jumps
-		{ "ppu_", aarch64::x19 },       // Fixed JIT helpers (e.g. ppu_gateway)
-		{ "__", aarch64::x19 }          // Probably link table entries
-	};
+	{
+		// Base reg table definition
+		// Assume all functions named __0x... are PPU functions and take the m_exec as the first arg
+		std::vector<std::pair<std::string, aarch64::gpr>> base_reg_lookup = {
+			{ "__0x", aarch64::x20 },       // PPU blocks
+			{ "__indirect", aarch64::x20 }, // Indirect jumps
+			{ "ppu_", aarch64::x19 },       // Fixed JIT helpers (e.g. ppu_gateway)
+			{ "__", aarch64::x19 }          // Probably link table entries
+		};
 
-	// Create transform pass
-	std::unique_ptr<translator_pass> ghc_fixup_pass = std::make_unique<aarch64::GHC_frame_preservation_pass>(
-		::offset32(&ppu_thread::hv_ctx),
-		base_reg_lookup);
+		aarch64::GHC_frame_preservation_pass::config_t config =
+		{
+			.debug_info = false,       // Set to "true" to insert debug frames on x27
+			.use_stack_frames = false, // GW allocates 4k of scratch on the stack
+			.hypervisor_context_offset = ::offset32(&ppu_thread::hv_ctx),
+			.exclusion_callback = {},  // Unused, we don't have special exclusion functions on PPU
+			.base_register_lookup = base_reg_lookup
+		};
 
-	// Register it
-	register_transform_pass(ghc_fixup_pass);
+		// Create transform pass
+		std::unique_ptr<translator_pass> ghc_fixup_pass = std::make_unique<aarch64::GHC_frame_preservation_pass>(config);
+
+		// Register it
+		register_transform_pass(ghc_fixup_pass);
+	}
 #endif
 
 	// Thread context struct (TODO: safer member access)
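One subtlety in base_reg_lookup is ordering: the generic "__" entry sits after the more specific "__0x" and "__indirect" entries, which only works if the consumer applies first-match-wins semantics. A standalone sketch of that consultation under that assumption follows; find_base_register is an illustrative stand-in for the pass's get_base_register_for_call, and the gpr enum is reduced to the two registers used by the PPU table:

#include <string>
#include <utility>
#include <vector>

enum gpr { x19, x20 }; // illustrative subset of the AArch64 GPR set

static gpr find_base_register(const std::vector<std::pair<std::string, gpr>>& lookup,
                              const std::string& callee_name, gpr fallback)
{
	for (const auto& [prefix, reg] : lookup)
	{
		// First match wins, so a "__0x..." name resolves to x20 before
		// the catch-all "__" entry can map it to x19.
		if (callee_name.starts_with(prefix))
		{
			return reg;
		}
	}
	return fallback;
}

With the PPU table above, an invented block name such as "__0x10200" resolves to x20, "ppu_gateway" resolves to x19, and any other "__"-prefixed symbol falls through to the catch-all x19 entry.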