Initial PPU LLVM implementation for aarch64

kd-11 2024-08-04 05:09:06 +03:00 committed by kd-11
parent a5f9256ac6
commit 56cc5d9355
6 changed files with 223 additions and 78 deletions


@ -26,6 +26,7 @@
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/IntrinsicsX86.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/InlineAsm.h"
#ifdef _MSC_VER
#pragma warning(pop)
@ -3898,4 +3899,39 @@ struct fmt_unveil<llvm::TypeSize, void>
}
};
// Inline assembly wrappers.
// TODO: Move these to proper location and replace macros with templates
static inline
llvm::InlineAsm* compile_inline_asm(
llvm::Type* returnType,
llvm::ArrayRef<llvm::Type*> argTypes,
const std::string& code,
const std::string& constraints)
{
const auto callSig = llvm::FunctionType::get(returnType, argTypes, false);
return llvm::InlineAsm::get(callSig, code, constraints, true, false);
}
// Helper for ASM generation with dynamic number of arguments
#define LLVM_ASM(asm_, args, constraints, irb, ctx)\
do {\
std::vector<llvm::Type*> _argTypes;\
_argTypes.reserve(args.size());\
for (const auto& _arg : args) _argTypes.push_back(_arg->getType());\
auto _returnType = llvm::Type::getVoidTy(ctx); \
llvm::FunctionCallee _callee = compile_inline_asm(_returnType, _argTypes, asm_, constraints); \
auto _c = irb->CreateCall(_callee, args); \
_c->addFnAttr(llvm::Attribute::AlwaysInline); \
} while(0)
// Helper for ASM generation with 0 args
#define LLVM_ASM_0(asm_, irb, ctx)\
do {\
const auto _voidTy = llvm::Type::getVoidTy(ctx); \
auto _callee = compile_inline_asm(_voidTy, std::nullopt, asm_, ""); \
auto _c = irb->CreateCall(_callee); \
_c->setTailCall(); \
_c->addFnAttr(llvm::Attribute::AlwaysInline); \
} while(0)
#endif
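
For clarity, a minimal usage sketch of the two helpers above (hypothetical call site: irb, ctx and thread_ptr stand for an existing llvm::IRBuilder<>*, llvm::LLVMContext and a pointer-typed llvm::Value*; they are not names from this commit):

// Hypothetical: emit a parameterless inline-asm statement
LLVM_ASM_0("brk #0", irb, ctx);
// Hypothetical: pass one pointer operand through a memory constraint,
// mirroring how VMEscape below invokes the macro with std::array{ m_thread }
std::array<llvm::Value*, 1> asm_args{ thread_ptr };
LLVM_ASM("ldr x20, $0", asm_args, "m", irb, ctx);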


@ -0,0 +1,40 @@
#pragma once
#include <util/types.hpp>
namespace rpcs3
{
union alignas(16) hypervisor_context_t
{
u64 regs[16];
struct
{
u64 pc;
u64 sp;
u64 x18;
u64 x19;
u64 x20;
u64 x21;
u64 x22;
u64 x23;
u64 x24;
u64 x25;
u64 x26;
u64 x27;
u64 x28;
u64 x29;
u64 x30;
// x0-x17 unused
} aarch64;
struct
{
u64 sp;
// Other regs unused
} x86;
};
}
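
The two views of the union share storage, so C++ code can fill the named aarch64 fields while the JIT-side assembly addresses the same block as a flat u64 array through a single byte offset. A small illustration (sketch only, not part of the commit):

// regs[0] aliases aarch64.pc, regs[1] aliases aarch64.sp, regs[2..14] alias x18..x30
rpcs3::hypervisor_context_t ctx{};
ctx.aarch64.sp = 0x7fff'0000'0000ull;
// ctx.regs[1] now holds the same value; generated code reaches it via
// ::offset32(&ppu_thread::hv_ctx, &rpcs3::hypervisor_context_t::regs) plus 8 bytes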


@ -222,7 +222,7 @@ const auto ppu_gateway = build_function_asm<void(*)(ppu_thread*)>("ppu_gateway",
#endif
// Save native stack pointer for longjmp emulation
c.mov(x86::qword_ptr(args[0], ::offset32(&ppu_thread::saved_native_sp)), x86::rsp);
c.mov(x86::qword_ptr(args[0], ::offset32(&ppu_thread::hv_ctx, &rpcs3::hypervisor_context_t::regs)), x86::rsp);
// Initialize args
c.mov(x86::r13, x86::qword_ptr(reinterpret_cast<u64>(&vm::g_exec_addr)));
@ -291,37 +291,48 @@ const auto ppu_gateway = build_function_asm<void(*)(ppu_thread*)>("ppu_gateway",
// and https://developer.arm.com/documentation/den0024/a/The-ABI-for-ARM-64-bit-Architecture/Register-use-in-the-AArch64-Procedure-Call-Standard/Parameters-in-general-purpose-registers
// for AArch64 calling convention
// Save sp for native longjmp emulation
Label native_sp_offset = c.newLabel();
c.ldr(a64::x10, arm::Mem(native_sp_offset));
// sp not allowed to be used in load/stores directly
c.mov(a64::x15, a64::sp);
c.str(a64::x15, arm::Mem(args[0], a64::x10));
// Push callee saved registers to the stack
// Push callee saved registers to the hv context
// Assume our LLVM compiled code is unsafe and can clobber our stack. GHC on aarch64 treats stack as scratch.
// We also want to store the register context at a fixed place so we can read the hypervisor state from any location.
// We need to save x18-x30 = 13 x 8B each + 8 bytes for 16B alignment = 112B
c.sub(a64::sp, a64::sp, Imm(112));
c.stp(a64::x18, a64::x19, arm::Mem(a64::sp));
c.stp(a64::x20, a64::x21, arm::Mem(a64::sp, 16));
c.stp(a64::x22, a64::x23, arm::Mem(a64::sp, 32));
c.stp(a64::x24, a64::x25, arm::Mem(a64::sp, 48));
c.stp(a64::x26, a64::x27, arm::Mem(a64::sp, 64));
c.stp(a64::x28, a64::x29, arm::Mem(a64::sp, 80));
c.str(a64::x30, arm::Mem(a64::sp, 96));
// Pre-context save
// Layout:
// pc, sp
// x18, x19...x30
// NOTE: Do not touch x19..x30 before saving the registers!
const u64 hv_register_array_offset = ::offset32(&ppu_thread::hv_ctx, &rpcs3::hypervisor_context_t::regs);
Label hv_ctx_pc = c.newLabel(); // Used to hold the far jump return address
// Sanity
ensure(hv_register_array_offset < 4096); // Imm12
c.mov(a64::x15, args[0]);
c.add(a64::x14, a64::x15, Imm(hv_register_array_offset)); // Per-thread context save
c.adr(a64::x15, hv_ctx_pc); // x15 = pc
c.mov(a64::x13, a64::sp); // x13 = sp
c.stp(a64::x15, a64::x13, arm::Mem(a64::x14));
c.stp(a64::x18, a64::x19, arm::Mem(a64::x14, 16));
c.stp(a64::x20, a64::x21, arm::Mem(a64::x14, 32));
c.stp(a64::x22, a64::x23, arm::Mem(a64::x14, 48));
c.stp(a64::x24, a64::x25, arm::Mem(a64::x14, 64));
c.stp(a64::x26, a64::x27, arm::Mem(a64::x14, 80));
c.stp(a64::x28, a64::x29, arm::Mem(a64::x14, 96));
c.str(a64::x30, arm::Mem(a64::x14, 112));
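// For reference (a sketch derived from the store offsets above), the hv context
// register array ends up populated as:
//   +0    regs[0..1]   = pc (adr of hv_ctx_pc), sp
//   +16   regs[2..3]   = x18, x19
//   +32   regs[4..5]   = x20, x21
//   +48   regs[6..7]   = x22, x23
//   +64   regs[8..9]   = x24, x25
//   +80   regs[10..11] = x26, x27
//   +96   regs[12..13] = x28, x29
//   +112  regs[14]     = x30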
// Load REG_Base - use absolute jump target to bypass rel jmp range limits
Label exec_addr = c.newLabel();
c.ldr(a64::x19, arm::Mem(exec_addr));
c.mov(a64::x19, Imm(reinterpret_cast<u64>(&vm::g_exec_addr)));
c.ldr(a64::x19, arm::Mem(a64::x19));
// Load PPUThread struct base -> REG_Sp
const arm::GpX ppu_t_base = a64::x20;
c.mov(ppu_t_base, args[0]);
// Load PC
const arm::GpX pc = a64::x15;
Label cia_offset = c.newLabel();
const arm::GpX cia_addr_reg = a64::x11;
// Load offset value
c.ldr(cia_addr_reg, arm::Mem(cia_offset));
c.mov(cia_addr_reg, Imm(static_cast<u64>(::offset32(&ppu_thread::cia))));
// Load cia
c.ldr(a64::w15, arm::Mem(ppu_t_base, cia_addr_reg));
// Multiply by 2 to index into ptr table
@ -343,44 +354,45 @@ const auto ppu_gateway = build_function_asm<void(*)(ppu_thread*)>("ppu_gateway",
c.lsr(call_target, call_target, Imm(16));
// Load registers
Label base_addr = c.newLabel();
c.ldr(a64::x22, arm::Mem(base_addr));
c.mov(a64::x22, Imm(reinterpret_cast<u64>(&vm::g_base_addr)));
c.ldr(a64::x22, arm::Mem(a64::x22));
Label gpr_addr_offset = c.newLabel();
const arm::GpX gpr_addr_reg = a64::x9;
c.ldr(gpr_addr_reg, arm::Mem(gpr_addr_offset));
c.mov(gpr_addr_reg, Imm(static_cast<u64>(::offset32(&ppu_thread::gpr))));
c.add(gpr_addr_reg, gpr_addr_reg, ppu_t_base);
c.ldr(a64::x23, arm::Mem(gpr_addr_reg));
c.ldr(a64::x24, arm::Mem(gpr_addr_reg, 8));
c.ldr(a64::x25, arm::Mem(gpr_addr_reg, 16));
// GHC frame for the guest. This seems dodgy but the only thing stored on stack is actually registers before making calls to C++ code.
// Injected stack frames also work, but are not free and are completely unnecessary.
c.sub(a64::sp, a64::sp, Imm(4096));
// Execute LLE call
c.blr(call_target);
// Restore registers from the stack
c.ldp(a64::x18, a64::x19, arm::Mem(a64::sp));
c.ldp(a64::x20, a64::x21, arm::Mem(a64::sp, 16));
c.ldp(a64::x22, a64::x23, arm::Mem(a64::sp, 32));
c.ldp(a64::x24, a64::x25, arm::Mem(a64::sp, 48));
c.ldp(a64::x26, a64::x27, arm::Mem(a64::sp, 64));
c.ldp(a64::x28, a64::x29, arm::Mem(a64::sp, 80));
c.ldr(a64::x30, arm::Mem(a64::sp, 96));
// Restore stack ptr
c.add(a64::sp, a64::sp, Imm(112));
// Return
c.ret(a64::x30);
// Return address after far jump. Reset sp and start unwinding...
c.bind(hv_ctx_pc);
c.bind(exec_addr);
c.embedUInt64(reinterpret_cast<u64>(&vm::g_exec_addr));
c.bind(base_addr);
c.embedUInt64(reinterpret_cast<u64>(&vm::g_base_addr));
c.bind(cia_offset);
c.embedUInt64(static_cast<u64>(::offset32(&ppu_thread::cia)));
c.bind(gpr_addr_offset);
c.embedUInt64(static_cast<u64>(::offset32(&ppu_thread::gpr)));
c.bind(native_sp_offset);
c.embedUInt64(static_cast<u64>(::offset32(&ppu_thread::saved_native_sp)));
// Execution guard undo (unneeded since we're going to hard-reset the SP)
//c.add(a64::sp, a64::sp, Imm(4096));
// We either got here through normal "ret" which keeps our x20 intact, or we jumped here and the escape reset our x20 reg
// Either way, x20 contains our thread base and we forcefully reset the stack pointer
c.add(a64::x14, a64::x20, Imm(hv_register_array_offset)); // Per-thread context save
c.ldr(a64::x15, arm::Mem(a64::x14, 8));
c.ldp(a64::x18, a64::x19, arm::Mem(a64::x14, 16));
c.ldp(a64::x20, a64::x21, arm::Mem(a64::x14, 32));
c.ldp(a64::x22, a64::x23, arm::Mem(a64::x14, 48));
c.ldp(a64::x24, a64::x25, arm::Mem(a64::x14, 64));
c.ldp(a64::x26, a64::x27, arm::Mem(a64::x14, 80));
c.ldp(a64::x28, a64::x29, arm::Mem(a64::x14, 96));
c.ldr(a64::x30, arm::Mem(a64::x14, 112));
// Return
c.mov(a64::sp, a64::x15);
c.ret(a64::x30);
#endif
});
@ -390,11 +402,20 @@ const extern auto ppu_escape = build_function_asm<void(*)(ppu_thread*)>("ppu_esc
#if defined(ARCH_X64)
// Restore native stack pointer (longjmp emulation)
c.mov(x86::rsp, x86::qword_ptr(args[0], ::offset32(&ppu_thread::saved_native_sp)));
c.mov(x86::rsp, x86::qword_ptr(args[0], ::offset32(&ppu_thread::hv_ctx, &rpcs3::hypervisor_context_t::regs)));
// Return to the return location
c.sub(x86::rsp, 8);
c.ret();
#else
// We really shouldn't be using this, but an implementation shouldn't hurt
// Far jump return. Only clobbers x30.
const arm::GpX ppu_t_base = a64::x20;
const u64 hv_register_array_offset = ::offset32(&ppu_thread::hv_ctx, &rpcs3::hypervisor_context_t::regs);
c.mov(ppu_t_base, args[0]);
c.mov(a64::x30, Imm(hv_register_array_offset));
c.ldr(a64::x30, arm::Mem(ppu_t_base, a64::x30));
c.ret(a64::x30);
#endif
});
@ -2265,6 +2286,9 @@ void ppu_thread::exec_task()
{
if (g_cfg.core.ppu_decoder != ppu_decoder_type::_static)
{
// HVContext push to allow recursion. This happens with guest callback invocations.
const auto old_hv_ctx = hv_ctx;
while (true)
{
if (state) [[unlikely]]
@ -2276,6 +2300,8 @@ void ppu_thread::exec_task()
ppu_gateway(this);
}
// HVContext pop
hv_ctx = old_hv_ctx;
return;
}
@ -2314,6 +2340,8 @@ ppu_thread::ppu_thread(const ppu_thread_params& param, std::string_view name, u3
{
prio.raw().prio = _prio;
memset(&hv_ctx, 0, sizeof(hv_ctx));
gpr[1] = stack_addr + stack_size - ppu_stack_start_offset;
gpr[13] = param.tls_addr;
@ -3502,7 +3530,7 @@ static bool ppu_store_reservation(ppu_thread& ppu, u32 addr, u64 reg_value)
if (notify)
{
bool notified = false;
if (ppu.res_notify_time == (vm::reservation_acquire(notify) & -128))
{
@ -5277,12 +5305,14 @@ static void ppu_initialize2(jit_compiler& jit, const ppu_module& module_part, co
// Translate
if (const auto func = translator.Translate(module_part.funcs[fi]))
{
#ifdef ARCH_X64 // TODO
// Run optimization passes
#if LLVM_VERSION_MAJOR < 17
pm.run(*func);
#else
fpm.run(*func, fam);
#endif
#endif // ARCH_X64
}
else
{
@ -5297,12 +5327,14 @@ static void ppu_initialize2(jit_compiler& jit, const ppu_module& module_part, co
{
if (const auto func = translator.GetSymbolResolver(whole_module))
{
#ifdef ARCH_X64 // TODO
// Run optimization passes
#if LLVM_VERSION_MAJOR < 17
pm.run(*func);
#else
fpm.run(*func, fam);
#endif
#endif // ARCH_X64
}
else
{


@ -1,6 +1,7 @@
#pragma once
#include "../CPU/CPUThread.h"
#include "../CPU/Hypervisor.h"
#include "../Memory/vm_ptr.h"
#include "Utilities/lockless.h"
#include "Utilities/BitField.h"
@ -163,6 +164,9 @@ public:
using cpu_thread::operator=;
// Hypervisor context data
alignas(16) rpcs3::hypervisor_context_t hv_ctx; // HV context for gateway enter/exit. Keep at a low struct offset.
u64 gpr[32] = {}; // General-Purpose Registers
f64 fpr[32] = {}; // Floating Point Registers
v128 vr[32] = {}; // Vector Registers


@ -208,8 +208,7 @@ Function* PPUTranslator::Translate(const ppu_function& info)
m_ir->CreateAtomicRMW(llvm::AtomicRMWInst::Or, ptr, m_ir->getInt32((+cpu_flag::wait).operator u32()), llvm::MaybeAlign{4}, llvm::AtomicOrdering::AcquireRelease);
// Create tail call to the check function
Call(GetType<void>(), "__check", m_thread, GetAddr())->setTailCall();
m_ir->CreateRetVoid();
VMEscape(Call(GetType<void>(), "__check", m_thread, GetAddr()));
}
else
{
@ -321,7 +320,7 @@ Function* PPUTranslator::GetSymbolResolver(const ppu_module& info)
if (vec_addrs.empty())
{
// Possible special case for no functions (allowing the do-while optimization)
m_ir->CreateRetVoid();
m_ir->CreateRetVoid(); // FIXME: Aarch64. It should work fine as long as there is no callchain beyond this function with a ret path.
replace_intrinsics(*m_function);
return m_function;
}
@ -378,7 +377,7 @@ Function* PPUTranslator::GetSymbolResolver(const ppu_module& info)
// Set insertion point to afterloop_block
m_ir->SetInsertPoint(after_loop);
m_ir->CreateRetVoid();
m_ir->CreateRetVoid(); // FIXME: Aarch64 - Should be ok as long as no ret-based callchain proceeds from here
replace_intrinsics(*m_function);
return m_function;
@ -482,8 +481,8 @@ void PPUTranslator::CallFunction(u64 target, Value* indirect)
if (_target >= u32{umax})
{
Call(GetType<void>(), "__error", m_thread, GetAddr(), m_ir->getInt32(*ensure(m_info.get_ptr<u32>(::narrow<u32>(m_addr + base)))));
m_ir->CreateRetVoid();
auto c = Call(GetType<void>(), "__error", m_thread, GetAddr(), m_ir->getInt32(*ensure(m_info.get_ptr<u32>(::narrow<u32>(m_addr + base)))));
VMEscape(c);
return;
}
else if (_target >= caddr && _target <= cend)
@ -565,7 +564,7 @@ void PPUTranslator::CallFunction(u64 target, Value* indirect)
const auto c = m_ir->CreateCall(callee, {m_exec, m_thread, seg0, m_base, GetGpr(0), GetGpr(1), GetGpr(2)});
c->setTailCallKind(llvm::CallInst::TCK_Tail);
c->setCallingConv(CallingConv::GHC);
m_ir->CreateRetVoid();
VMEscape(c);
}
Value* PPUTranslator::RegInit(Value*& local)
@ -779,8 +778,8 @@ void PPUTranslator::TestAborted()
m_ir->SetInsertPoint(vcheck);
// Create tail call to the check function
Call(GetType<void>(), "__check", m_thread, GetAddr())->setTailCall();
m_ir->CreateRetVoid();
auto c = Call(GetType<void>(), "__check", m_thread, GetAddr());
VMEscape(c);
m_ir->SetInsertPoint(body);
}
@ -2206,16 +2205,14 @@ void PPUTranslator::SC(ppu_opcode_t op)
if (index < 1024)
{
Call(GetType<void>(), fmt::format("%s", ppu_syscall_code(index)), m_thread);
//Call(GetType<void>(), "__escape", m_thread)->setTailCall();
m_ir->CreateRetVoid();
auto c = Call(GetType<void>(), fmt::format("%s", ppu_syscall_code(index)), m_thread);
VMEscape(c, true);
return;
}
}
Call(GetType<void>(), op.lev ? "__lv1call" : "__syscall", m_thread, num);
//Call(GetType<void>(), "__escape", m_thread)->setTailCall();
m_ir->CreateRetVoid();
auto c = Call(GetType<void>(), op.lev ? "__lv1call" : "__syscall", m_thread, num);
VMEscape(c, true);
}
void PPUTranslator::B(ppu_opcode_t op)
@ -2776,9 +2773,9 @@ void PPUTranslator::LWARX(ppu_opcode_t op)
{
RegStore(Trunc(GetAddr()), m_cia);
FlushRegisters();
Call(GetType<void>(), "__resinterp", m_thread);
//Call(GetType<void>(), "__escape", m_thread)->setTailCall();
m_ir->CreateRetVoid();
auto inst = Call(GetType<void>(), "__resinterp", m_thread);
VMEscape(inst, true);
return;
}
@ -2928,9 +2925,9 @@ void PPUTranslator::LDARX(ppu_opcode_t op)
{
RegStore(Trunc(GetAddr()), m_cia);
FlushRegisters();
Call(GetType<void>(), "__resinterp", m_thread);
//Call(GetType<void>(), "__escape", m_thread)->setTailCall();
m_ir->CreateRetVoid();
auto inst = Call(GetType<void>(), "__resinterp", m_thread);
VMEscape(inst, true);
return;
}
@ -4998,9 +4995,8 @@ void PPUTranslator::FCFID(ppu_opcode_t op)
void PPUTranslator::UNK(ppu_opcode_t op)
{
FlushRegisters();
Call(GetType<void>(), "__error", m_thread, GetAddr(), m_ir->getInt32(op.opcode));
//Call(GetType<void>(), "__escape", m_thread)->setTailCall();
m_ir->CreateRetVoid();
auto c = Call(GetType<void>(), "__error", m_thread, GetAddr(), m_ir->getInt32(op.opcode));
VMEscape(c, true);
}
@ -5279,9 +5275,8 @@ Value* PPUTranslator::CheckTrapCondition(u32 to, Value* left, Value* right)
void PPUTranslator::Trap()
{
Call(GetType<void>(), "__trap", m_thread, GetAddr());
//Call(GetType<void>(), "__escape", m_thread)->setTailCall();
m_ir->CreateRetVoid();
auto c = Call(GetType<void>(), "__trap", m_thread, GetAddr());
VMEscape(c);
}
Value* PPUTranslator::CheckBranchCondition(u32 bo, u32 bi)
@ -5328,6 +5323,42 @@ MDNode* PPUTranslator::CheckBranchProbability(u32 bo)
return nullptr;
}
void PPUTranslator::VMEscape([[maybe_unused]] llvm::CallInst* tail_call, [[maybe_unused]] bool skip_flush)
{
//if (!skip_flush)
{
// Flush
FlushRegisters();
}
#ifdef ARCH_X64
// Optionally flag last call as a tail
if (tail_call)
{
tail_call->setTailCall();
}
// This is actually AMD64 specific but good enough for now
m_ir->CreateRetVoid();
#else
// Validation. Make sure we're escaping from a correct context. Only guest JIT should ever go through the "escape" gate.
const auto bb = m_ir->GetInsertPoint();
const auto arg = llvm::dyn_cast<llvm::Argument>(m_thread);
ensure(bb->getParent()->getName().str() == arg->getParent()->getName().str());
const u32 hv_register_array_offset = ::offset32(&ppu_thread::hv_ctx, &rpcs3::hypervisor_context_t::regs);
const std::string asm_ = fmt::format(
"ldr x20, $0;\n"
"ldr x30, [x20, #%u];\n",
hv_register_array_offset);
LLVM_ASM(asm_, std::array{ m_thread }, "m", m_ir, m_function->getContext());
m_ir->CreateRetVoid();
#endif
}
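// Taken together with ppu_gateway, the AArch64 escape path above effectively emits
// (illustrative pseudo-listing; operand placement is whatever LLVM picks for the "m" constraint):
//   ldr x20, [<slot holding the ppu_thread*>]   ; x20 = thread base
//   ldr x30, [x20, #hv_register_array_offset]   ; x30 = saved pc, i.e. hv_ctx_pc
//   ret                                         ; far jump back into ppu_gateway
// At hv_ctx_pc the gateway reloads sp and x18-x30 from hv_ctx and returns to the
// original caller with a normal ret.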
void PPUTranslator::build_interpreter()
{
#define BUILD_VEC_INST(i) { \
@ -5343,8 +5374,7 @@ void PPUTranslator::build_interpreter()
op.vb = 2; \
op.vc = 3; \
this->i(op); \
FlushRegisters(); \
m_ir->CreateRetVoid(); \
VMEscape(); \
replace_intrinsics(*m_function); \
}


@ -150,6 +150,9 @@ public:
// Emit function call
void CallFunction(u64 target, llvm::Value* indirect = nullptr);
// Emit escape sequence back to hypervisor
void VMEscape(llvm::CallInst* tail_call = nullptr, bool skip_flush = false);
// Emit state check mid-block
void TestAborted();