LLVM: Slice PPU executable memory

This commit is contained in:
Elad 2025-01-10 17:34:24 +02:00
parent 7b8fee7cdb
commit 9d5b75bb7a
7 changed files with 486 additions and 126 deletions

View File

@ -514,8 +514,8 @@ class jit_compiler final
atomic_t<usz> m_disk_space = umax;
public:
jit_compiler(const std::unordered_map<std::string, u64>& _link, const std::string& _cpu, u32 flags = 0);
~jit_compiler();
jit_compiler(const std::unordered_map<std::string, u64>& _link, const std::string& _cpu, u32 flags = 0, std::function<u64(const std::string&)> symbols_cement = {}) noexcept;
~jit_compiler() noexcept;
// Get LLVM context
auto& get_context()

View File

@ -77,8 +77,7 @@ static u64 make_null_function(const std::string& name)
if (res.ec == std::errc() && res.ptr == name.c_str() + name.size() && addr < 0x8000'0000)
{
// Point the garbage to reserved, non-executable memory
return reinterpret_cast<u64>(vm::g_sudo_addr + addr);
fmt::throw_exception("Unhandled symbols cementing! (name='%s'", name);
}
}
@ -174,18 +173,34 @@ struct JITAnnouncer : llvm::JITEventListener
struct MemoryManager1 : llvm::RTDyldMemoryManager
{
// 256 MiB for code or data
static constexpr u64 c_max_size = 0x20000000 / 2;
static constexpr u64 c_max_size = 0x1000'0000;
// Allocation unit (2M)
static constexpr u64 c_page_size = 2 * 1024 * 1024;
// Reserve 512 MiB
u8* const ptr = static_cast<u8*>(utils::memory_reserve(c_max_size * 2));
// Reserve 256 MiB blocks
void* m_code_mems = nullptr;
void* m_data_ro_mems = nullptr;
void* m_data_rw_mems = nullptr;
u64 code_ptr = 0;
u64 data_ptr = c_max_size;
u64 data_ro_ptr = 0;
u64 data_rw_ptr = 0;
MemoryManager1() = default;
// First fallback for non-existing symbols
// May be a memory container internally
std::function<u64(const std::string&)> m_symbols_cement;
MemoryManager1(std::function<u64(const std::string&)> symbols_cement = {}) noexcept
: m_symbols_cement(std::move(symbols_cement))
{
auto ptr = reinterpret_cast<u8*>(utils::memory_reserve(c_max_size * 3));
m_code_mems = ptr;
// ptr += c_max_size;
// m_data_ro_mems = ptr;
ptr += c_max_size;
m_data_rw_mems = ptr;
}
MemoryManager1(const MemoryManager1&) = delete;
@ -194,13 +209,22 @@ struct MemoryManager1 : llvm::RTDyldMemoryManager
~MemoryManager1() override
{
// Hack: don't release to prevent reuse of address space, see jit_announce
utils::memory_decommit(ptr, c_max_size * 2);
// constexpr auto how_much = [](u64 pos) { return utils::align(pos, pos < c_page_size ? c_page_size / 4 : c_page_size); };
// utils::memory_decommit(m_code_mems, how_much(code_ptr));
// utils::memory_decommit(m_data_ro_mems, how_much(data_ro_ptr));
// utils::memory_decommit(m_data_rw_mems, how_much(data_rw_ptr));
utils::memory_decommit(m_code_mems, c_max_size * 3);
}
llvm::JITSymbol findSymbol(const std::string& name) override
{
u64 addr = RTDyldMemoryManager::getSymbolAddress(name);
if (!addr && m_symbols_cement)
{
addr = m_symbols_cement(name);
}
if (!addr)
{
addr = make_null_function(name);
@ -214,45 +238,79 @@ struct MemoryManager1 : llvm::RTDyldMemoryManager
return {addr, llvm::JITSymbolFlags::Exported};
}
u8* allocate(u64& oldp, uptr size, uint align, utils::protection prot)
u8* allocate(u64& alloc_pos, void* block, uptr size, u64 align, utils::protection prot)
{
if (align > c_page_size)
align = align ? align : 16;
const u64 sizea = utils::align(size, align);
if (!size || align > c_page_size || sizea > c_max_size || sizea < size)
{
jit_log.fatal("Unsupported alignment (size=0x%x, align=0x%x)", size, align);
jit_log.fatal("Unsupported size/alignment (size=0x%x, align=0x%x)", size, align);
return nullptr;
}
const u64 olda = utils::align(oldp, align);
const u64 newp = utils::align(olda + size, align);
u64 oldp = alloc_pos;
if ((newp - 1) / c_max_size != oldp / c_max_size)
u64 olda = utils::align(oldp, align);
ensure(olda >= oldp);
ensure(olda < ~sizea);
u64 newp = olda + sizea;
if ((newp - 1) / c_max_size != (oldp - 1) / c_max_size)
{
jit_log.fatal("Out of memory (size=0x%x, align=0x%x)", size, align);
return nullptr;
constexpr usz num_of_allocations = 1;
if ((newp - 1) / c_max_size > num_of_allocations)
{
// Allocating more than one region does not work for relocations; this needs a more robust solution
fmt::throw_exception("Out of memory (size=0x%x, align=0x%x)", size, align);
}
}
if ((oldp - 1) / c_page_size != (newp - 1) / c_page_size)
// Update allocation counter
alloc_pos = newp;
constexpr usz page_quarter = c_page_size / 4;
// Optimization: commit the first 2 MiB page in 512 KiB steps for single-module compilers
if (oldp < c_page_size && align < page_quarter && (std::min(newp, c_page_size) - 1) / page_quarter != (oldp - 1) / page_quarter)
{
const u64 pagea = utils::align(oldp, page_quarter);
const u64 psize = utils::align(std::min(newp, c_page_size) - pagea, page_quarter);
utils::memory_commit(reinterpret_cast<u8*>(block) + (pagea % c_max_size), psize, prot);
// Advance
oldp = pagea + psize;
}
if ((newp - 1) / c_page_size != (oldp - 1) / c_page_size)
{
// Allocate pages on demand
const u64 pagea = utils::align(oldp, c_page_size);
const u64 psize = utils::align(newp - pagea, c_page_size);
utils::memory_commit(this->ptr + pagea, psize, prot);
utils::memory_commit(reinterpret_cast<u8*>(block) + (pagea % c_max_size), psize, prot);
}
// Update allocation counter
oldp = newp;
return this->ptr + olda;
return reinterpret_cast<u8*>(block) + (olda % c_max_size);
}
u8* allocateCodeSection(uptr size, uint align, uint /*sec_id*/, llvm::StringRef /*sec_name*/) override
{
return allocate(code_ptr, size, align, utils::protection::wx);
return allocate(code_ptr, m_code_mems, size, align, utils::protection::wx);
}
u8* allocateDataSection(uptr size, uint align, uint /*sec_id*/, llvm::StringRef /*sec_name*/, bool /*is_ro*/) override
u8* allocateDataSection(uptr size, uint align, uint /*sec_id*/, llvm::StringRef /*sec_name*/, bool is_ro) override
{
return allocate(data_ptr, size, align, utils::protection::rw);
if (is_ro)
{
// Disabled
//return allocate(data_ro_ptr, m_data_ro_mems, size, align, utils::protection::rw);
}
return allocate(data_rw_ptr, m_data_rw_mems, size, align, utils::protection::rw);
}
bool finalizeMemory(std::string* = nullptr) override
@ -272,7 +330,14 @@ struct MemoryManager1 : llvm::RTDyldMemoryManager
// Simple memory manager
struct MemoryManager2 : llvm::RTDyldMemoryManager
{
MemoryManager2() = default;
// First fallback for non-existing symbols
// May be a memory container internally
std::function<u64(const std::string&)> m_symbols_cement;
MemoryManager2(std::function<u64(const std::string&)> symbols_cement = {}) noexcept
: m_symbols_cement(std::move(symbols_cement))
{
}
~MemoryManager2() override
{
@ -282,6 +347,11 @@ struct MemoryManager2 : llvm::RTDyldMemoryManager
{
u64 addr = RTDyldMemoryManager::getSymbolAddress(name);
if (!addr && m_symbols_cement)
{
addr = m_symbols_cement(name);
}
if (!addr)
{
addr = make_null_function(name);
@ -561,7 +631,7 @@ bool jit_compiler::add_sub_disk_space(ssz space)
}).second;
}
jit_compiler::jit_compiler(const std::unordered_map<std::string, u64>& _link, const std::string& _cpu, u32 flags)
jit_compiler::jit_compiler(const std::unordered_map<std::string, u64>& _link, const std::string& _cpu, u32 flags, std::function<u64(const std::string&)> symbols_cement) noexcept
: m_context(new llvm::LLVMContext)
, m_cpu(cpu(_cpu))
{
@ -589,17 +659,17 @@ jit_compiler::jit_compiler(const std::unordered_map<std::string, u64>& _link, co
// Auxiliary JIT (does not use custom memory manager, only writes the objects)
if (flags & 0x1)
{
mem = std::make_unique<MemoryManager1>();
mem = std::make_unique<MemoryManager1>(std::move(symbols_cement));
}
else
{
mem = std::make_unique<MemoryManager2>();
mem = std::make_unique<MemoryManager2>(std::move(symbols_cement));
null_mod->setTargetTriple(jit_compiler::triple2());
}
}
else
{
mem = std::make_unique<MemoryManager1>();
mem = std::make_unique<MemoryManager1>(std::move(symbols_cement));
}
{
@ -648,7 +718,7 @@ jit_compiler::jit_compiler(const std::unordered_map<std::string, u64>& _link, co
}
}
jit_compiler::~jit_compiler()
jit_compiler::~jit_compiler() noexcept
{
}
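For context: MemoryManager1 now slices one 768 MiB reservation into separate 256 MiB blocks for code and read-write data (the read-only block is disabled for now), committing 2 MiB pages on demand as each bump pointer advances. Below is a minimal sketch of that reserve-then-commit pattern, using POSIX mmap/mprotect in place of the utils wrappers; the names and structure are illustrative, not the RPCS3 API:

#include <sys/mman.h>
#include <cstddef>
#include <cstdint>

struct bump_region
{
    static constexpr std::size_t max_size  = 0x1000'0000;     // 256 MiB block
    static constexpr std::size_t page_size = 2 * 1024 * 1024; // 2 MiB commit unit

    std::uint8_t* base = nullptr; // reserved but inaccessible
    std::size_t pos = 0;          // bump pointer
    std::size_t committed = 0;    // bytes made accessible so far

    bool reserve()
    {
        // PROT_NONE: claim address space without physical backing
        void* p = ::mmap(nullptr, max_size, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED)
            return false;
        base = static_cast<std::uint8_t*>(p);
        return true;
    }

    std::uint8_t* allocate(std::size_t size, std::size_t align, int prot)
    {
        // align must be a power of two (overflow checks omitted in this sketch)
        const std::size_t olda = (pos + align - 1) & ~(align - 1);
        const std::size_t newp = olda + size;
        if (!size || newp > max_size)
            return nullptr; // out of region

        // Commit whole pages lazily, the analogue of utils::memory_commit
        if (newp > committed)
        {
            const std::size_t upto = (newp + page_size - 1) & ~(page_size - 1);
            if (::mprotect(base + committed, upto - committed, prot) != 0)
                return nullptr;
            committed = upto;
        }

        pos = newp;
        return base + olda;
    }
};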

View File

@ -769,7 +769,7 @@ public:
}
// Move the context (if movable)
new (static_cast<void*>(m_threads + m_count - 1)) Thread(std::string(name) + std::to_string(m_count - 1), std::forward<Context>(f));
new (static_cast<void*>(m_threads + m_count - 1)) Thread(std::string(name) + std::to_string(m_count), std::forward<Context>(f));
}
// Constructor with a function performed before adding more threads

View File

@ -4,6 +4,7 @@
#include <map>
#include <set>
#include <deque>
#include <span>
#include "util/types.hpp"
#include "util/endian.hpp"
#include "util/asm.hpp"
@ -38,7 +39,51 @@ struct ppu_function
std::map<u32, u32> blocks{}; // Basic blocks: addr -> size
std::set<u32> calls{}; // Set of called functions
std::set<u32> callers{};
std::string name{}; // Function name
mutable std::string name{}; // Function name
struct iterator
{
const ppu_function* _this;
typename std::map<u32, u32>::const_iterator it;
usz index = 0;
std::pair<const u32, u32> operator*() const
{
return _this->blocks.empty() ? std::pair<const u32, u32>(_this->addr, _this->size) : *it;
}
iterator& operator++()
{
index++;
if (it != _this->blocks.end())
{
it++;
}
return *this;
}
bool operator==(const iterator& rhs) const noexcept
{
return it == rhs.it || (rhs.index == index && _this->blocks.empty());
}
bool operator!=(const iterator& rhs) const noexcept
{
return !operator==(rhs);
}
};
iterator begin() const
{
return iterator{this, blocks.begin()};
}
iterator end() const
{
return iterator{this, blocks.end(), 1};
}
};
// PPU Relocation Information
@ -87,18 +132,56 @@ struct ppu_module : public Type
ppu_module& operator=(ppu_module&&) noexcept = default;
uchar sha1[20]{};
std::string name{};
std::string path{};
uchar sha1[20]{}; // Hash
std::string name{}; // Filename
std::string path{}; // Filepath
s64 offset = 0; // Offset of file
std::string cache{};
std::vector<ppu_reloc> relocs{};
std::vector<ppu_segment> segs{};
std::vector<ppu_segment> secs{};
std::vector<ppu_function> funcs{};
std::vector<u32> applied_patches;
std::deque<std::shared_ptr<void>> allocations;
std::map<u32, u32> addr_to_seg_index;
mutable bs_t<ppu_attr> attr{}; // Shared module attributes
std::string cache{}; // Cache file path
std::vector<ppu_reloc> relocs{}; // Relocations
std::vector<ppu_segment> segs{}; // Segments
std::vector<ppu_segment> secs{}; // Segment sections
std::vector<ppu_function> funcs{}; // Function list
std::vector<u32> applied_patches; // Patch addresses
std::deque<std::shared_ptr<void>> allocations; // Segment memory allocations
std::map<u32, u32> addr_to_seg_index; // Ordered address -> segment index lookup map
ppu_module* parent = nullptr;
std::pair<u32, u32> local_bounds{0, u32{umax}}; // Module address range
std::shared_ptr<std::pair<u32, u32>> jit_bounds; // Address range of all modules in the JIT instance
template <typename T>
auto as_span(T&& arg, bool bound_local, bool bound_jit) const
{
using unref = std::remove_reference_t<T>;
using type = std::conditional_t<std::is_const_v<unref>, std::add_const_t<typename unref::value_type>, typename unref::value_type>;
if (bound_local || bound_jit)
{
// Return span bound to specified bounds
const auto [min_addr, max_addr] = bound_jit ? *jit_bounds : local_bounds;
constexpr auto compare = [](const type& a, u32 addr) { return a.addr < addr; };
const auto end = arg.data() + arg.size();
const auto start = std::lower_bound(arg.data(), end, min_addr, compare);
return std::span<type>{ start, std::lower_bound(start, end, max_addr, compare) };
}
return std::span<type>(arg.data(), arg.size());
}
auto get_funcs(bool bound_local = true, bool bound_jit = false)
{
return as_span(parent ? parent->funcs : funcs, bound_local, bound_jit);
}
auto get_funcs(bool bound_local = true, bool bound_jit = false) const
{
return as_span(parent ? parent->funcs : funcs, bound_local, bound_jit);
}
auto get_relocs(bool bound_local = false) const
{
return as_span(parent ? parent->relocs : relocs, bound_local, false);
}
// Copy info without functions
void copy_part(const ppu_module& info)
@ -106,11 +189,12 @@ struct ppu_module : public Type
std::memcpy(sha1, info.sha1, sizeof(sha1));
name = info.name;
path = info.path;
relocs = info.relocs;
segs = info.segs;
secs = info.secs;
allocations = info.allocations;
addr_to_seg_index = info.addr_to_seg_index;
parent = const_cast<ppu_module*>(&info);
attr = info.attr;
local_bounds = {u32{umax}, 0}; // Initially empty range
}
bool analyse(u32 lib_toc, u32 entry, u32 end, const std::vector<u32>& applied, const std::vector<u32>& exported_funcs = std::vector<u32>{}, std::function<bool()> check_aborted = {});

View File

@ -66,6 +66,7 @@
#include <cctype>
#include <span>
#include <optional>
#include <charconv>
#include "util/asm.hpp"
#include "util/vm.hpp"
@ -176,7 +177,7 @@ bool serialize<ppu_thread::cr_bits>(utils::serial& ar, typename ppu_thread::cr_b
extern void ppu_initialize();
extern void ppu_finalize(const ppu_module<lv2_obj>& info, bool force_mem_release = false);
extern bool ppu_initialize(const ppu_module<lv2_obj>& info, bool check_only = false, u64 file_size = 0);
static void ppu_initialize2(class jit_compiler& jit, const ppu_module<lv2_obj>& module_part, const std::string& cache_path, const std::string& obj_name, const ppu_module<lv2_obj>& whole_module);
static void ppu_initialize2(class jit_compiler& jit, const ppu_module<lv2_obj>& module_part, const std::string& cache_path, const std::string& obj_name);
extern bool ppu_load_exec(const ppu_exec_object&, bool virtual_load, const std::string&, utils::serial* = nullptr);
extern std::pair<shared_ptr<lv2_overlay>, CellError> ppu_load_overlay(const ppu_exec_object&, bool virtual_load, const std::string& path, s64 file_offset, utils::serial* = nullptr);
extern void ppu_unload_prx(const lv2_prx&);
@ -342,11 +343,10 @@ const auto ppu_gateway = build_function_asm<void(*)(ppu_thread*)>("ppu_gateway",
// Load offset value
c.mov(cia_addr_reg, Imm(static_cast<u64>(::offset32(&ppu_thread::cia))));
// Load cia
c.ldr(a64::w15, arm::Mem(ppu_t_base, cia_addr_reg));
c.ldr(pc.w(), arm::Mem(ppu_t_base, cia_addr_reg));
// Multiply by 2 to index into ptr table
const arm::GpX index_shift = a64::x12;
c.mov(index_shift, Imm(2));
c.mul(pc, pc, index_shift);
c.add(pc, pc, pc);
// Load call target
const arm::GpX call_target = a64::x13;
@ -355,7 +355,7 @@ const auto ppu_gateway = build_function_asm<void(*)(ppu_thread*)>("ppu_gateway",
const arm::GpX reg_hp = a64::x21;
c.mov(reg_hp, call_target);
c.lsr(reg_hp, reg_hp, 48);
c.lsl(a64::w21, a64::w21, 13);
c.lsl(reg_hp.w(), reg_hp.w(), 13);
// Zero top 16 bits of call target
c.lsl(call_target, call_target, Imm(16));
@ -3665,6 +3665,9 @@ struct jit_core_allocator
// Initialize global semaphore with the max number of threads
::semaphore<0x7fff> sem{std::max<s16>(thread_count, 1)};
// Mutex for special extra-large modules to compile alone
shared_mutex shared_mtx;
static s16 limit()
{
return static_cast<s16>(std::min<s32>(0x7fff, utils::get_thread_count()));
@ -3677,8 +3680,8 @@ namespace
// Compiled PPU module info
struct jit_module
{
void(*symbol_resolver)(u8*, u64) = nullptr;
std::shared_ptr<jit_compiler> pjit;
std::vector<void(*)(u8*, u64)> symbol_resolvers;
std::vector<std::shared_ptr<jit_compiler>> pjit;
bool init = false;
};
@ -3729,6 +3732,7 @@ namespace
}
to_destroy.pjit = std::move(found->second.pjit);
to_destroy.symbol_resolvers = std::move(found->second.symbol_resolvers);
bucket.map.erase(found);
}
@ -4445,7 +4449,7 @@ extern void ppu_initialize()
idm::select<lv2_obj, lv2_prx>([&](u32, lv2_prx& _module)
{
if (_module.funcs.empty())
if (_module.get_funcs().empty())
{
return;
}
@ -4556,7 +4560,7 @@ bool ppu_initialize(const ppu_module<lv2_obj>& info, bool check_only, u64 file_s
auto& ppu_toc = toc_manager.toc_map;
for (const auto& func : info.funcs)
for (const auto& func : info.get_funcs())
{
if (func.size && func.blocks.empty())
{
@ -4659,11 +4663,14 @@ bool ppu_initialize(const ppu_module<lv2_obj>& info, bool check_only, u64 file_s
jit_module& jit_mod = g_fxo->get<jit_module_manager>().get(cache_path + "_" + std::to_string(std::bit_cast<usz>(info.segs[0].ptr)));
// Compiler instance (deferred initialization)
std::shared_ptr<jit_compiler>& jit = jit_mod.pjit;
std::vector<std::shared_ptr<jit_compiler>>& jits = jit_mod.pjit;
// Split module into fragments <= 1 MiB
usz fpos = 0;
// Modules counted so far
usz module_counter = 0;
// Difference between function name and current location
const u32 reloc = info.relocs.empty() ? 0 : ::at32(info.segs, 0).addr;
@ -4684,14 +4691,14 @@ bool ppu_initialize(const ppu_module<lv2_obj>& info, bool check_only, u64 file_s
const cpu_thread* cpu = cpu_thread::get_current();
for (auto& func : info.funcs)
for (auto& func : info.get_funcs())
{
if (func.size == 0)
{
continue;
}
for (const auto& [addr, size] : func.blocks)
for (const auto [addr, size] : func)
{
if (size == 0)
{
@ -4724,26 +4731,138 @@ bool ppu_initialize(const ppu_module<lv2_obj>& info, bool check_only, u64 file_s
u32 total_compile = 0;
while (!jit_mod.init && fpos < info.funcs.size())
// Limit how many modules go into each JIT instance
// Advantage of lowering the limit:
// 1. Lower contiguous memory requirements for allocations
// Its disadvantage:
// 1. A B instruction can only branch up to 16 MiB relative to its address,
// so each additional JIT instance split downgrades roughly (100% / (N-1)) - (100% / N) percent of instructions,
// where N is the total number of JIT instances
// Subject to change
constexpr u32 c_modules_per_jit = 100;
std::shared_ptr<std::pair<u32, u32>> local_jit_bounds = std::make_shared<std::pair<u32, u32>>(u32{umax}, 0);
const auto shared_runtime = make_shared<jit_runtime>();
const auto shared_map = make_shared<std::unordered_map<u32, u64>>();
const auto shared_mtx = make_shared<shared_mutex>();
auto symbols_cement = [runtime = shared_runtime, reloc, bound = info.segs[0].addr + info.segs[0].size - reloc, func_map = shared_map, shared_mtx](const std::string& name) -> u64
{
// Initialize compiler instance
if (!jit && is_being_used_in_emulation)
u32 func_addr = umax;
if (name.starts_with("__0x"))
{
jit = std::make_shared<jit_compiler>(s_link_table, g_cfg.core.llvm_cpu);
u32 addr = umax;
auto res = std::from_chars(name.c_str() + 4, name.c_str() + name.size(), addr, 16);
if (res.ec == std::errc() && res.ptr == name.c_str() + name.size() && addr < bound)
{
func_addr = addr + reloc;
}
}
// Copy module information (TODO: optimize)
if (func_addr == umax)
{
return {};
}
reader_lock rlock(*shared_mtx);
if (auto it = func_map->find(func_addr); it != func_map->end())
{
return it->second;
}
rlock.upgrade();
u64& code_ptr = (*func_map)[func_addr];
if (code_ptr)
{
return +code_ptr;
}
using namespace asmjit;
auto func = build_function_asm<u8*(*)(ppu_thread&, u64, u8*, u64, u64, u64)>(name, [&](native_asm& c, auto& args)
{
#if defined(ARCH_X64)
c.mov(x86::rax, x86::qword_ptr(reinterpret_cast<u64>(&vm::g_exec_addr)));
c.mov(x86::edx, func_addr); // Load PC
c.mov(x86::dword_ptr(x86::rbp, ::offset32(&ppu_thread::cia)), x86::edx);
c.mov(x86::rax, x86::qword_ptr(x86::rax, x86::edx, 1, 0)); // Load call target
c.mov(x86::rdx, x86::rax);
c.shl(x86::rax, 16);
c.shr(x86::rax, 16);
c.shr(x86::rdx, 48);
c.shl(x86::edx, 13);
c.mov(x86::r12d, x86::edx); // Load relocation base
c.jmp(x86::rax);
#else
// Load REG_Base - use absolute jump target to bypass rel jmp range limits
// X19 contains vm::g_exec_addr
const arm::GpX exec_addr = a64::x19;
// X20 contains ppu_thread*
const arm::GpX ppu_t_base = a64::x20;
// Load PC
const arm::GpX pc = a64::x15;
const arm::GpX cia_addr_reg = a64::x11;
// Load offset value
c.mov(cia_addr_reg, static_cast<u64>(::offset32(&ppu_thread::cia)));
// Update CIA
c.mov(pc.w(), func_addr);
c.str(pc.w(), arm::Mem(ppu_t_base, cia_addr_reg));
// Multiply by 2 to index into ptr table
c.add(pc, pc, pc);
// Load call target
const arm::GpX call_target = a64::x13;
c.ldr(call_target, arm::Mem(exec_addr, pc));
// Compute REG_Hp
const arm::GpX reg_hp = a64::x21;
c.mov(reg_hp, call_target);
c.lsr(reg_hp, reg_hp, 48);
c.lsl(reg_hp.w(), reg_hp.w(), 13);
// Zero top 16 bits of call target
c.lsl(call_target, call_target, 16);
c.lsr(call_target, call_target, 16);
// Execute LLE call
c.br(call_target);
#endif
}, runtime.get());
code_ptr = reinterpret_cast<u64>(func);
return code_ptr;
};
if (has_mfvscr && g_cfg.core.ppu_set_sat_bit)
{
info.attr += ppu_attr::has_mfvscr;
}
while (!jit_mod.init && fpos < info.get_funcs().size())
{
// Copy module information
ppu_module<lv2_obj> part;
part.copy_part(info);
part.funcs.reserve(16000);
// Overall block size in bytes
usz bsize = 0;
usz bcount = 0;
while (fpos < info.funcs.size())
while (fpos < info.get_funcs().size())
{
auto& func = info.funcs[fpos];
auto& func = info.get_funcs()[fpos];
if (!func.size)
{
@ -4767,9 +4886,9 @@ bool ppu_initialize(const ppu_module<lv2_obj>& info, bool check_only, u64 file_s
{
auto far_jump = ensure(g_fxo->get<ppu_far_jumps_t>().gen_jump(source));
if (source == func.addr && jit)
if (source == func.addr)
{
jit->update_global_mapping(fmt::format("__0x%x", func.addr - reloc), reinterpret_cast<u64>(far_jump));
(*shared_map)[func.addr - reloc] = reinterpret_cast<u64>(far_jump);
}
ppu_register_function_at(source, 4, far_jump);
@ -4783,22 +4902,14 @@ bool ppu_initialize(const ppu_module<lv2_obj>& info, bool check_only, u64 file_s
}
}
// Copy block or function entry
ppu_function& entry = part.funcs.emplace_back(func);
local_jit_bounds->first = std::min<u32>(local_jit_bounds->first, func.addr);
local_jit_bounds->second = std::max<u32>(local_jit_bounds->second, func.addr + func.size);
part.local_bounds.first = std::min<u32>(part.local_bounds.first, func.addr);
part.local_bounds.second = std::max<u32>(part.local_bounds.second, func.addr + func.size);
// Fixup some information
entry.name = fmt::format("__0x%x", entry.addr - reloc);
if (has_mfvscr && g_cfg.core.ppu_set_sat_bit)
{
// TODO
entry.attr += ppu_attr::has_mfvscr;
}
if (entry.blocks.empty())
{
entry.blocks.emplace(func.addr, func.size);
}
func.name = fmt::format("__0x%x", func.addr - reloc);
bsize += func.size;
@ -4815,7 +4926,7 @@ bool ppu_initialize(const ppu_module<lv2_obj>& info, bool check_only, u64 file_s
int has_dcbz = !!g_cfg.core.accurate_cache_line_stores;
for (const auto& func : part.funcs)
for (const auto& func : part.get_funcs())
{
if (func.size == 0)
{
@ -4827,7 +4938,7 @@ bool ppu_initialize(const ppu_module<lv2_obj>& info, bool check_only, u64 file_s
sha1_update(&ctx, reinterpret_cast<const u8*>(&addr), sizeof(addr));
sha1_update(&ctx, reinterpret_cast<const u8*>(&size), sizeof(size));
for (const auto& block : func.blocks)
for (const auto block : func)
{
if (block.second == 0 || reloc)
{
@ -4898,7 +5009,7 @@ bool ppu_initialize(const ppu_module<lv2_obj>& info, bool check_only, u64 file_s
sha1_update(&ctx, ensure(info.get_ptr<const u8>(func.addr)), func.size);
}
if (!workload.empty() && fpos >= info.funcs.size())
if (fpos >= info.get_funcs().size() || module_counter % c_modules_per_jit == c_modules_per_jit - 1)
{
// Hash the entire group's function addresses for the integrity of the symbol resolver function
// Changes here can potentially occur during patches
@ -4906,7 +5017,13 @@ bool ppu_initialize(const ppu_module<lv2_obj>& info, bool check_only, u64 file_s
std::vector<be_t<u32>> addrs;
for (const ppu_function& func : info.funcs)
constexpr auto compare = [](const ppu_function& a, u32 addr) { return a.addr < addr; };
const auto start = std::lower_bound(info.funcs.begin(), info.funcs.end(), local_jit_bounds->first, compare);
std::span<const ppu_function> span_range{ start, std::lower_bound(start, info.funcs.end(), local_jit_bounds->second, compare) };
for (const ppu_function& func : span_range)
{
if (func.size == 0)
{
@ -4919,7 +5036,13 @@ bool ppu_initialize(const ppu_module<lv2_obj>& info, bool check_only, u64 file_s
// Hash its size too
addrs.emplace_back(::size32(addrs));
sha1_update(&ctx, reinterpret_cast<const u8*>(addrs.data()), addrs.size() * sizeof(be_t<u32>));
if (module_counter != 0)
{
sha1_update(&ctx, reinterpret_cast<const u8*>(addrs.data()), addrs.size() * sizeof(be_t<u32>));
}
part.jit_bounds = std::move(local_jit_bounds);
local_jit_bounds = std::make_shared<std::pair<u32, u32>>(u32{umax}, 0);
}
if (false)
@ -4974,7 +5097,7 @@ bool ppu_initialize(const ppu_module<lv2_obj>& info, bool check_only, u64 file_s
settings += ppu_settings::accurate_vnan, settings -= ppu_settings::fixup_vnan, fmt::throw_exception("VNAN Not implemented");
if (g_cfg.core.ppu_use_nj_bit)
settings += ppu_settings::accurate_nj_mode, settings -= ppu_settings::fixup_nj_denormals, fmt::throw_exception("NJ Not implemented");
if (fpos >= info.funcs.size())
if (fpos >= info.get_funcs().size() || module_counter % c_modules_per_jit == c_modules_per_jit - 1)
settings += ppu_settings::contains_symbol_resolver; // Avoid invalidating all modules for this purpose
// Write version, hash, CPU, settings
@ -4986,6 +5109,8 @@ bool ppu_initialize(const ppu_module<lv2_obj>& info, bool check_only, u64 file_s
break;
}
module_counter++;
if (!check_only)
{
total_compile++;
@ -4996,13 +5121,14 @@ bool ppu_initialize(const ppu_module<lv2_obj>& info, bool check_only, u64 file_s
// Check object file
if (jit_compiler::check(cache_path + obj_name))
{
if (!jit && !check_only)
if (!is_being_used_in_emulation && !check_only)
{
ppu_log.success("LLVM: Module exists: %s", obj_name);
// Already compiled, so revert the total count increase
// Avoid incrementing "pdone" instead, because that would give a false sense of progress to both the progress dialog and the user
total_compile--;
link_workload.pop_back();
}
continue;
@ -5113,11 +5239,26 @@ bool ppu_initialize(const ppu_module<lv2_obj>& info, bool check_only, u64 file_s
// Keep allocating workload
const auto& [obj_name, part] = std::as_const(workload)[i];
std::shared_lock rlock(g_fxo->get<jit_core_allocator>().shared_mtx, std::defer_lock);
std::unique_lock lock(g_fxo->get<jit_core_allocator>().shared_mtx, std::defer_lock);
if (part.jit_bounds && part.parent->funcs.size() >= 0x8000)
{
// Make a large symbol-resolving function compile alone because it has massive memory requirements
lock.lock();
}
else
{
rlock.lock();
}
ppu_log.warning("LLVM: Compiling module %s%s", cache_path, obj_name);
// Use another JIT instance
jit_compiler jit2({}, g_cfg.core.llvm_cpu, 0x1);
ppu_initialize2(jit2, part, cache_path, obj_name, i == workload.size() - 1 ? main_module : part);
{
// Use another JIT instance
jit_compiler jit2({}, g_cfg.core.llvm_cpu, 0x1);
ppu_initialize2(jit2, part, cache_path, obj_name);
}
ppu_log.success("LLVM: Compiled module %s", obj_name);
}
@ -5145,6 +5286,17 @@ bool ppu_initialize(const ppu_module<lv2_obj>& info, bool check_only, u64 file_s
g_watchdog_hold_ctr--;
}
// Initialize compiler instance
while (jits.size() < utils::aligned_div<u64>(module_counter, c_modules_per_jit) && is_being_used_in_emulation)
{
jits.emplace_back(std::make_shared<jit_compiler>(s_link_table, g_cfg.core.llvm_cpu, 0, symbols_cement));
}
if (jit_mod.symbol_resolvers.empty() && is_being_used_in_emulation)
{
jit_mod.symbol_resolvers.resize(jits.size());
}
bool failed_to_load = false;
{
if (!is_being_used_in_emulation || (cpu ? cpu->state.all_of(cpu_flag::exit) : Emu.IsStopped()))
@ -5158,14 +5310,18 @@ bool ppu_initialize(const ppu_module<lv2_obj>& info, bool check_only, u64 file_s
*progress_dialog = get_localized_string(localized_string_id::PROGRESS_DIALOG_LINKING_PPU_MODULES);
}
usz mod_index = umax;
for (const auto& [obj_name, is_compiled] : link_workload)
{
mod_index++;
if (cpu ? cpu->state.all_of(cpu_flag::exit) : Emu.IsStopped())
{
break;
}
if (!failed_to_load && !jit->add(cache_path + obj_name))
if (!failed_to_load && !jits[mod_index / c_modules_per_jit]->add(cache_path + obj_name))
{
ppu_log.error("LLVM: Failed to load module %s", obj_name);
failed_to_load = true;
@ -5205,10 +5361,10 @@ bool ppu_initialize(const ppu_module<lv2_obj>& info, bool check_only, u64 file_s
progress_dialog = get_localized_string(localized_string_id::PROGRESS_DIALOG_APPLYING_PPU_CODE);
if (!jit)
if (jits.empty())
{
// No functions - nothing to do
ensure(info.funcs.empty());
ensure(info.get_funcs().empty());
return compiled_new;
}
@ -5216,25 +5372,27 @@ bool ppu_initialize(const ppu_module<lv2_obj>& info, bool check_only, u64 file_s
if (is_first)
{
jit->fin();
}
if (is_first)
{
jit_mod.symbol_resolver = reinterpret_cast<void(*)(u8*, u64)>(jit->get("__resolve_symbols"));
ensure(jit_mod.symbol_resolver);
}
else
{
ensure(jit_mod.symbol_resolver);
for (auto& jit : jits)
{
jit->fin();
}
}
#ifdef __APPLE__
// Symbol resolver is in JIT mem, so we must enable execution
pthread_jit_write_protect_np(true);
#endif
{
usz index = umax;
jit_mod.symbol_resolver(vm::g_exec_addr, info.segs[0].addr);
for (auto& sim : jit_mod.symbol_resolvers)
{
index++;
sim = ensure(!is_first ? sim : reinterpret_cast<void(*)(u8*, u64)>(jits[index]->get("__resolve_symbols")));
sim(vm::g_exec_addr, info.segs[0].addr);
}
}
#ifdef __APPLE__
// Symbol resolver is in JIT mem, so we must enable execution
@ -5242,7 +5400,7 @@ bool ppu_initialize(const ppu_module<lv2_obj>& info, bool check_only, u64 file_s
#endif
// Find a BLR-only function in order to copy it to all BLRs (some games need it)
for (const auto& func : info.funcs)
for (const auto& func : info.get_funcs())
{
if (func.size == 4 && *info.get_ptr<u32>(func.addr) == ppu_instructions::BLR())
{
@ -5281,7 +5439,7 @@ bool ppu_initialize(const ppu_module<lv2_obj>& info, bool check_only, u64 file_s
#endif
}
static void ppu_initialize2(jit_compiler& jit, const ppu_module<lv2_obj>& module_part, const std::string& cache_path, const std::string& obj_name, const ppu_module<lv2_obj>& whole_module)
static void ppu_initialize2(jit_compiler& jit, const ppu_module<lv2_obj>& module_part, const std::string& cache_path, const std::string& obj_name)
{
#ifdef LLVM_AVAILABLE
using namespace llvm;
@ -5307,8 +5465,11 @@ static void ppu_initialize2(jit_compiler& jit, const ppu_module<lv2_obj>& module
translator.get_type<u64>(), // r2
}, false);
// Difference between function name and current location
const u32 reloc = module_part.get_relocs().empty() ? 0 : ::at32(module_part.segs, 0).addr;
// Initialize function list
for (const auto& func : module_part.funcs)
for (const auto& func : module_part.get_funcs())
{
if (func.size)
{
@ -5374,8 +5535,14 @@ static void ppu_initialize2(jit_compiler& jit, const ppu_module<lv2_obj>& module
fpm.addPass(EarlyCSEPass());
#endif
u32 guest_code_size = 0;
u32 min_addr = umax;
u32 max_addr = 0;
u32 num_func = 0;
// Translate functions
for (usz fi = 0, fmax = module_part.funcs.size(); fi < fmax; fi++)
// Start from the module's lowest bound; the function list is sorted
for (const auto& mod_func : module_part.get_funcs())
{
if (Emu.IsStopped())
{
@ -5383,10 +5550,15 @@ static void ppu_initialize2(jit_compiler& jit, const ppu_module<lv2_obj>& module
return;
}
if (module_part.funcs[fi].size)
if (mod_func.size)
{
num_func++;
guest_code_size += mod_func.size;
max_addr = std::max<u32>(max_addr, mod_func.addr + mod_func.size);
min_addr = std::min<u32>(min_addr, mod_func.addr);
// Translate
if (const auto func = translator.Translate(module_part.funcs[fi]))
if (const auto func = translator.Translate(mod_func))
{
#ifdef ARCH_X64 // TODO
// Run optimization passes
@ -5405,10 +5577,10 @@ static void ppu_initialize2(jit_compiler& jit, const ppu_module<lv2_obj>& module
}
}
// Run this only in one module for all functions
if (&whole_module != &module_part)
// Run this in only one module, covering all compiled functions
if (module_part.jit_bounds)
{
if (const auto func = translator.GetSymbolResolver(whole_module))
if (const auto func = translator.GetSymbolResolver(module_part))
{
#ifdef ARCH_X64 // TODO
// Run optimization passes
@ -5452,7 +5624,7 @@ static void ppu_initialize2(jit_compiler& jit, const ppu_module<lv2_obj>& module
return;
}
ppu_log.notice("LLVM: %zu functions generated", _module->getFunctionList().size());
ppu_log.notice("LLVM: %zu functions generated (code_size=0x%x, num_func=%d, max_addr(-)min_addr=0x%x)", _module->getFunctionList().size(), guest_code_size, num_func, max_addr - min_addr);
}
// Load or compile module
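To make the grouping above concrete: with c_modules_per_jit == 100, object files 0..99 land in jits[0], 100..199 in jits[1], and so on, and the module that closes each group carries jit_bounds so exactly one symbol resolver is emitted per JIT instance. A small sketch of that arithmetic, inferred from the code above rather than any public API:

#include <cstddef>

constexpr std::size_t c_modules_per_jit = 100;

// Which JIT instance a given object file belongs to
constexpr std::size_t jit_index_for(std::size_t module_index)
{
    return module_index / c_modules_per_jit;
}

// Mirrors the group-closing condition in ppu_initialize: the group is
// full, or this is the final module of the image
constexpr bool closes_group(std::size_t module_counter, bool is_last)
{
    return is_last || module_counter % c_modules_per_jit == c_modules_per_jit - 1;
}

static_assert(jit_index_for(99) == 0 && jit_index_for(100) == 1);
static_assert(closes_group(99, false) && !closes_group(100, false));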

View File

@ -114,7 +114,7 @@ PPUTranslator::PPUTranslator(LLVMContext& context, Module* _module, const ppu_mo
const auto caddr = m_info.segs[0].addr;
const auto cend = caddr + m_info.segs[0].size;
for (const auto& rel : m_info.relocs)
for (const auto& rel : m_info.get_relocs())
{
if (rel.addr >= caddr && rel.addr < cend)
{
@ -162,7 +162,7 @@ PPUTranslator::PPUTranslator(LLVMContext& context, Module* _module, const ppu_mo
}
}
if (!m_info.relocs.empty())
if (!m_info.get_relocs().empty())
{
m_reloc = &m_info.segs[0];
}
@ -196,7 +196,7 @@ Function* PPUTranslator::Translate(const ppu_function& info)
// Instruction address is (m_addr + base)
const u64 base = m_reloc ? m_reloc->addr : 0;
m_addr = info.addr - base;
m_attr = info.attr;
m_attr = m_info.attr + info.attr;
// Don't emit check in small blocks without terminator
bool need_check = info.size >= 16;
@ -325,6 +325,9 @@ Function* PPUTranslator::Translate(const ppu_function& info)
Function* PPUTranslator::GetSymbolResolver(const ppu_module<lv2_obj>& info)
{
ensure(m_module->getFunction("__resolve_symbols") == nullptr);
ensure(info.jit_bounds);
m_function = cast<Function>(m_module->getOrInsertFunction("__resolve_symbols", FunctionType::get(get_type<void>(), { get_type<u8*>(), get_type<u64>() }, false)).getCallee());
IRBuilder<> irb(BasicBlock::Create(m_context, "__entry", m_function));
@ -351,12 +354,13 @@ Function* PPUTranslator::GetSymbolResolver(const ppu_module<lv2_obj>& info)
// This is done in a loop instead of inlined because inlining took a tremendous amount of time to compile.
std::vector<u32> vec_addrs;
vec_addrs.reserve(info.funcs.size());
// Create an array of function pointers
std::vector<llvm::Constant*> functions;
for (const auto& f : info.funcs)
const auto [min_addr, max_addr] = *ensure(info.jit_bounds);
for (const auto& f : info.get_funcs(false, true))
{
if (!f.size)
{
@ -379,7 +383,7 @@ Function* PPUTranslator::GetSymbolResolver(const ppu_module<lv2_obj>& info)
const auto addr_array = new GlobalVariable(*m_module, addr_array_type, false, GlobalValue::PrivateLinkage, ConstantDataArray::get(m_context, vec_addrs));
// Create an array of function pointers
const auto func_table_type = ArrayType::get(ftype->getPointerTo(), info.funcs.size());
const auto func_table_type = ArrayType::get(ftype->getPointerTo(), functions.size());
const auto init_func_table = ConstantArray::get(func_table_type, functions);
const auto func_table = new GlobalVariable(*m_module, func_table_type, false, GlobalVariable::PrivateLinkage, init_func_table);
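Conceptually, the generated __resolve_symbols walks these two parallel arrays (guest addresses and compiled entry points) and patches the executable-pointer table at load time. A plain C++ model of that loop follows, assuming segment-relative addresses and the table layout implied by the gateway's multiply-by-2 indexing; this is a sketch, not the emitted IR:

#include <cstddef>
#include <cstdint>
#include <cstring>

using ppu_entry = void (*)(); // stand-in for the real function type

void resolve_symbols_model(std::uint8_t* exec_addr, std::uint64_t seg0_addr,
                           const std::uint32_t* addrs, const ppu_entry* funcs,
                           std::size_t count)
{
    for (std::size_t i = 0; i < count; i++)
    {
        // One 8-byte slot per guest address: the gateway indexes the
        // table with cia * 2, so a 4-byte-aligned guest address maps
        // to an 8-byte-aligned slot
        const std::uint64_t slot = (seg0_addr + addrs[i]) * 2;
        const auto value = reinterpret_cast<std::uintptr_t>(funcs[i]);
        std::memcpy(exec_addr + slot, &value, sizeof(value));
    }
}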

View File

@ -310,6 +310,11 @@ namespace utils
void memory_commit(void* pointer, usz size, protection prot)
{
if (!size)
{
return;
}
#ifdef _WIN32
ensure(::VirtualAlloc(pointer, size, MEM_COMMIT, +prot));
#else
@ -329,6 +334,11 @@ namespace utils
void memory_decommit(void* pointer, usz size)
{
if (!size)
{
return;
}
#ifdef _WIN32
ensure(::VirtualFree(pointer, size, MEM_DECOMMIT));
#else
@ -357,6 +367,11 @@ namespace utils
void memory_reset(void* pointer, usz size, protection prot)
{
if (!size)
{
return;
}
#ifdef _WIN32
memory_decommit(pointer, size);
memory_commit(pointer, size, prot);
@ -390,6 +405,11 @@ namespace utils
void memory_release(void* pointer, usz size)
{
if (!size)
{
return;
}
#ifdef _WIN32
unmap_mappping_memory(reinterpret_cast<u64>(pointer), size);
ensure(::VirtualFree(pointer, 0, MEM_RELEASE));
@ -400,6 +420,11 @@ namespace utils
void memory_protect(void* pointer, usz size, protection prot)
{
if (!size)
{
return;
}
#ifdef _WIN32
DWORD old;
@ -429,6 +454,11 @@ namespace utils
bool memory_lock(void* pointer, usz size)
{
if (!size)
{
return true;
}
#ifdef _WIN32
return ::VirtualLock(pointer, size);
#else