LLVM: Slice PPU executable memory

This commit is contained in:
Elad 2025-01-10 17:34:24 +02:00
parent 7b8fee7cdb
commit 9d5b75bb7a
7 changed files with 486 additions and 126 deletions

View File

@ -514,8 +514,8 @@ class jit_compiler final
atomic_t<usz> m_disk_space = umax;
public:
jit_compiler(const std::unordered_map<std::string, u64>& _link, const std::string& _cpu, u32 flags = 0);
~jit_compiler();
jit_compiler(const std::unordered_map<std::string, u64>& _link, const std::string& _cpu, u32 flags = 0, std::function<u64(const std::string&)> symbols_cement = {}) noexcept;
~jit_compiler() noexcept;
// Get LLVM context
auto& get_context()

View File

@ -77,8 +77,7 @@ static u64 make_null_function(const std::string& name)
if (res.ec == std::errc() && res.ptr == name.c_str() + name.size() && addr < 0x8000'0000)
{
// Point the garbage to reserved, non-executable memory
return reinterpret_cast<u64>(vm::g_sudo_addr + addr);
fmt::throw_exception("Unhandled symbols cementing! (name='%s'", name);
}
}
@ -174,18 +173,34 @@ struct JITAnnouncer : llvm::JITEventListener
struct MemoryManager1 : llvm::RTDyldMemoryManager
{
// 256 MiB for code or data
static constexpr u64 c_max_size = 0x20000000 / 2;
static constexpr u64 c_max_size = 0x1000'0000;
// Allocation unit (2M)
static constexpr u64 c_page_size = 2 * 1024 * 1024;
// Reserve 512 MiB
u8* const ptr = static_cast<u8*>(utils::memory_reserve(c_max_size * 2));
// Reserve 256 MiB blocks
void* m_code_mems = nullptr;
void* m_data_ro_mems = nullptr;
void* m_data_rw_mems = nullptr;
u64 code_ptr = 0;
u64 data_ptr = c_max_size;
u64 data_ro_ptr = 0;
u64 data_rw_ptr = 0;
MemoryManager1() = default;
// First fallback for non-existing symbols
// May be a memory container internally
std::function<u64(const std::string&)> m_symbols_cement;
MemoryManager1(std::function<u64(const std::string&)> symbols_cement = {}) noexcept
: m_symbols_cement(std::move(symbols_cement))
{
auto ptr = reinterpret_cast<u8*>(utils::memory_reserve(c_max_size * 3));
m_code_mems = ptr;
// ptr += c_max_size;
// m_data_ro_mems = ptr;
ptr += c_max_size;
m_data_rw_mems = ptr;
}
MemoryManager1(const MemoryManager1&) = delete;
@ -194,13 +209,22 @@ struct MemoryManager1 : llvm::RTDyldMemoryManager
~MemoryManager1() override
{
// Hack: don't release to prevent reuse of address space, see jit_announce
utils::memory_decommit(ptr, c_max_size * 2);
// constexpr auto how_much = [](u64 pos) { return utils::align(pos, pos < c_page_size ? c_page_size / 4 : c_page_size); };
// utils::memory_decommit(m_code_mems, how_much(code_ptr));
// utils::memory_decommit(m_data_ro_mems, how_much(data_ro_ptr));
// utils::memory_decommit(m_data_rw_mems, how_much(data_rw_ptr));
utils::memory_decommit(m_code_mems, c_max_size * 3);
}
llvm::JITSymbol findSymbol(const std::string& name) override
{
u64 addr = RTDyldMemoryManager::getSymbolAddress(name);
if (!addr && m_symbols_cement)
{
addr = m_symbols_cement(name);
}
if (!addr)
{
addr = make_null_function(name);
@ -214,45 +238,79 @@ struct MemoryManager1 : llvm::RTDyldMemoryManager
return {addr, llvm::JITSymbolFlags::Exported};
}
u8* allocate(u64& oldp, uptr size, uint align, utils::protection prot)
u8* allocate(u64& alloc_pos, void* block, uptr size, u64 align, utils::protection prot)
{
if (align > c_page_size)
align = align ? align : 16;
const u64 sizea = utils::align(size, align);
if (!size || align > c_page_size || sizea > c_max_size || sizea < size)
{
jit_log.fatal("Unsupported alignment (size=0x%x, align=0x%x)", size, align);
jit_log.fatal("Unsupported size/alignment (size=0x%x, align=0x%x)", size, align);
return nullptr;
}
const u64 olda = utils::align(oldp, align);
const u64 newp = utils::align(olda + size, align);
u64 oldp = alloc_pos;
if ((newp - 1) / c_max_size != oldp / c_max_size)
u64 olda = utils::align(oldp, align);
ensure(olda >= oldp);
ensure(olda < ~sizea);
u64 newp = olda + sizea;
if ((newp - 1) / c_max_size != (oldp - 1) / c_max_size)
{
jit_log.fatal("Out of memory (size=0x%x, align=0x%x)", size, align);
return nullptr;
constexpr usz num_of_allocations = 1;
if ((newp - 1) / c_max_size > num_of_allocations)
{
// Allocating more than one region does not work for relocations; this needs a more robust solution
fmt::throw_exception("Out of memory (size=0x%x, align=0x%x)", size, align);
}
}
if ((oldp - 1) / c_page_size != (newp - 1) / c_page_size)
// Update allocation counter
alloc_pos = newp;
constexpr usz page_quarter = c_page_size / 4;
// Optimization: commit the first 2 MiB page in 512 KiB steps for single-module compilers
if (oldp < c_page_size && align < page_quarter && (std::min(newp, c_page_size) - 1) / page_quarter != (oldp - 1) / page_quarter)
{
const u64 pagea = utils::align(oldp, page_quarter);
const u64 psize = utils::align(std::min(newp, c_page_size) - pagea, page_quarter);
utils::memory_commit(reinterpret_cast<u8*>(block) + (pagea % c_max_size), psize, prot);
// Advance
oldp = pagea + psize;
}
if ((newp - 1) / c_page_size != (oldp - 1) / c_page_size)
{
// Allocate pages on demand
const u64 pagea = utils::align(oldp, c_page_size);
const u64 psize = utils::align(newp - pagea, c_page_size);
utils::memory_commit(this->ptr + pagea, psize, prot);
utils::memory_commit(reinterpret_cast<u8*>(block) + (pagea % c_max_size), psize, prot);
}
// Update allocation counter
oldp = newp;
return this->ptr + olda;
return reinterpret_cast<u8*>(block) + (olda % c_max_size);
}
u8* allocateCodeSection(uptr size, uint align, uint /*sec_id*/, llvm::StringRef /*sec_name*/) override
{
return allocate(code_ptr, size, align, utils::protection::wx);
return allocate(code_ptr, m_code_mems, size, align, utils::protection::wx);
}
u8* allocateDataSection(uptr size, uint align, uint /*sec_id*/, llvm::StringRef /*sec_name*/, bool /*is_ro*/) override
u8* allocateDataSection(uptr size, uint align, uint /*sec_id*/, llvm::StringRef /*sec_name*/, bool is_ro) override
{
return allocate(data_ptr, size, align, utils::protection::rw);
if (is_ro)
{
// Disabled
//return allocate(data_ro_ptr, m_data_ro_mems, size, align, utils::protection::rw);
}
return allocate(data_rw_ptr, m_data_rw_mems, size, align, utils::protection::rw);
}
bool finalizeMemory(std::string* = nullptr) override
@ -272,7 +330,14 @@ struct MemoryManager1 : llvm::RTDyldMemoryManager
// Simple memory manager
struct MemoryManager2 : llvm::RTDyldMemoryManager
{
MemoryManager2() = default;
// First fallback for non-existing symbols
// May be a memory container internally
std::function<u64(const std::string&)> m_symbols_cement;
MemoryManager2(std::function<u64(const std::string&)> symbols_cement = {}) noexcept
: m_symbols_cement(std::move(symbols_cement))
{
}
~MemoryManager2() override
{
@ -282,6 +347,11 @@ struct MemoryManager2 : llvm::RTDyldMemoryManager
{
u64 addr = RTDyldMemoryManager::getSymbolAddress(name);
if (!addr && m_symbols_cement)
{
addr = m_symbols_cement(name);
}
if (!addr)
{
addr = make_null_function(name);
@ -561,7 +631,7 @@ bool jit_compiler::add_sub_disk_space(ssz space)
}).second;
}
jit_compiler::jit_compiler(const std::unordered_map<std::string, u64>& _link, const std::string& _cpu, u32 flags)
jit_compiler::jit_compiler(const std::unordered_map<std::string, u64>& _link, const std::string& _cpu, u32 flags, std::function<u64(const std::string&)> symbols_cement) noexcept
: m_context(new llvm::LLVMContext)
, m_cpu(cpu(_cpu))
{
@ -589,17 +659,17 @@ jit_compiler::jit_compiler(const std::unordered_map<std::string, u64>& _link, co
// Auxiliary JIT (does not use custom memory manager, only writes the objects)
if (flags & 0x1)
{
mem = std::make_unique<MemoryManager1>();
mem = std::make_unique<MemoryManager1>(std::move(symbols_cement));
}
else
{
mem = std::make_unique<MemoryManager2>();
mem = std::make_unique<MemoryManager2>(std::move(symbols_cement));
null_mod->setTargetTriple(jit_compiler::triple2());
}
}
else
{
mem = std::make_unique<MemoryManager1>();
mem = std::make_unique<MemoryManager1>(std::move(symbols_cement));
}
{
@ -648,7 +718,7 @@ jit_compiler::jit_compiler(const std::unordered_map<std::string, u64>& _link, co
}
}
jit_compiler::~jit_compiler()
jit_compiler::~jit_compiler() noexcept
{
}
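For context: MemoryManager1 now slices one 768 MiB reservation into separate 256 MiB blocks for code and read-write data (the read-only block is disabled for now), committing 2 MiB pages on demand as each bump pointer advances. Below is a minimal sketch of that reserve-then-commit pattern, using POSIX mmap/mprotect in place of the utils wrappers; the names and structure are illustrative, not the RPCS3 API:

#include <sys/mman.h>
#include <cstddef>
#include <cstdint>

struct bump_region
{
    static constexpr std::size_t max_size  = 0x1000'0000;     // 256 MiB block
    static constexpr std::size_t page_size = 2 * 1024 * 1024; // 2 MiB commit unit

    std::uint8_t* base = nullptr; // reserved but inaccessible
    std::size_t pos = 0;          // bump pointer
    std::size_t committed = 0;    // bytes made accessible so far

    bool reserve()
    {
        // PROT_NONE: claim address space without physical backing
        void* p = ::mmap(nullptr, max_size, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED)
            return false;
        base = static_cast<std::uint8_t*>(p);
        return true;
    }

    std::uint8_t* allocate(std::size_t size, std::size_t align, int prot)
    {
        // align must be a power of two (overflow checks omitted in this sketch)
        const std::size_t olda = (pos + align - 1) & ~(align - 1);
        const std::size_t newp = olda + size;
        if (!size || newp > max_size)
            return nullptr; // out of region

        // Commit whole pages lazily, the analogue of utils::memory_commit
        if (newp > committed)
        {
            const std::size_t upto = (newp + page_size - 1) & ~(page_size - 1);
            if (::mprotect(base + committed, upto - committed, prot) != 0)
                return nullptr;
            committed = upto;
        }

        pos = newp;
        return base + olda;
    }
};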

View File

@ -769,7 +769,7 @@ public:
}
// Move the context (if movable)
new (static_cast<void*>(m_threads + m_count - 1)) Thread(std::string(name) + std::to_string(m_count - 1), std::forward<Context>(f));
new (static_cast<void*>(m_threads + m_count - 1)) Thread(std::string(name) + std::to_string(m_count), std::forward<Context>(f));
}
// Constructor with a function performed before adding more threads

View File

@ -4,6 +4,7 @@
#include <map>
#include <set>
#include <deque>
#include <span>
#include "util/types.hpp"
#include "util/endian.hpp"
#include "util/asm.hpp"
@ -38,7 +39,51 @@ struct ppu_function
std::map<u32, u32> blocks{}; // Basic blocks: addr -> size
std::set<u32> calls{}; // Set of called functions
std::set<u32> callers{};
std::string name{}; // Function name
mutable std::string name{}; // Function name
struct iterator
{
const ppu_function* _this;
typename std::map<u32, u32>::const_iterator it;
usz index = 0;
std::pair<const u32, u32> operator*() const
{
return _this->blocks.empty() ? std::pair<const u32, u32>(_this->addr, _this->size) : *it;
}
iterator& operator++()
{
index++;
if (it != _this->blocks.end())
{
it++;
}
return *this;
}
bool operator==(const iterator& rhs) const noexcept
{
return it == rhs.it || (rhs.index == index && _this->blocks.empty());
}
bool operator!=(const iterator& rhs) const noexcept
{
return !operator==(rhs);
}
};
iterator begin() const
{
return iterator{this, blocks.begin()};
}
iterator end() const
{
return iterator{this, blocks.end(), 1};
}
};
// PPU Relocation Information
@ -87,18 +132,56 @@ struct ppu_module : public Type
ppu_module& operator=(ppu_module&&) noexcept = default;
uchar sha1[20]{};
std::string name{};
std::string path{};
uchar sha1[20]{}; // Hash
std::string name{}; // Filename
std::string path{}; // Filepath
s64 offset = 0; // Offset of file
std::string cache{};
std::vector<ppu_reloc> relocs{};
std::vector<ppu_segment> segs{};
std::vector<ppu_segment> secs{};
std::vector<ppu_function> funcs{};
std::vector<u32> applied_patches;
std::deque<std::shared_ptr<void>> allocations;
std::map<u32, u32> addr_to_seg_index;
mutable bs_t<ppu_attr> attr{}; // Shared module attributes
std::string cache{}; // Cache file path
std::vector<ppu_reloc> relocs{}; // Relocations
std::vector<ppu_segment> segs{}; // Segments
std::vector<ppu_segment> secs{}; // Segment sections
std::vector<ppu_function> funcs{}; // Function list
std::vector<u32> applied_patches; // Patch addresses
std::deque<std::shared_ptr<void>> allocations; // Segment memory allocations
std::map<u32, u32> addr_to_seg_index; // Ordered address -> segment index lookup map
ppu_module* parent = nullptr;
std::pair<u32, u32> local_bounds{0, u32{umax}}; // Module address range
std::shared_ptr<std::pair<u32, u32>> jit_bounds; // Address range of all modules in the JIT instance
template <typename T>
auto as_span(T&& arg, bool bound_local, bool bound_jit) const
{
using unref = std::remove_reference_t<T>;
using type = std::conditional_t<std::is_const_v<unref>, std::add_const_t<typename unref::value_type>, typename unref::value_type>;
if (bound_local || bound_jit)
{
// Return span bound to specified bounds
const auto [min_addr, max_addr] = bound_jit ? *jit_bounds : local_bounds;
constexpr auto compare = [](const type& a, u32 addr) { return a.addr < addr; };
const auto end = arg.data() + arg.size();
const auto start = std::lower_bound(arg.data(), end, min_addr, compare);
return std::span<type>{ start, std::lower_bound(start, end, max_addr, compare) };
}
return std::span<type>(arg.data(), arg.size());
}
auto get_funcs(bool bound_local = true, bool bound_jit = false)
{
return as_span(parent ? parent->funcs : funcs, bound_local, bound_jit);
}
auto get_funcs(bool bound_local = true, bool bound_jit = false) const
{
return as_span(parent ? parent->funcs : funcs, bound_local, bound_jit);
}
auto get_relocs(bool bound_local = false) const
{
return as_span(parent ? parent->relocs : relocs, bound_local, false);
}
// Copy info without functions
void copy_part(const ppu_module& info)
@ -106,11 +189,12 @@ struct ppu_module : public Type
std::memcpy(sha1, info.sha1, sizeof(sha1));
name = info.name;
path = info.path;
relocs = info.relocs;
segs = info.segs;
secs = info.secs;
allocations = info.allocations;
addr_to_seg_index = info.addr_to_seg_index;
parent = const_cast<ppu_module*>(&info);
attr = info.attr;
local_bounds = {u32{umax}, 0}; // Initially empty range
}
bool analyse(u32 lib_toc, u32 entry, u32 end, const std::vector<u32>& applied, const std::vector<u32>& exported_funcs = std::vector<u32>{}, std::function<bool()> check_aborted = {});

View File

@ -66,6 +66,7 @@
#include <cctype>
#include <span>
#include <optional>
#include <charconv>
#include "util/asm.hpp"
#include "util/vm.hpp"
@ -176,7 +177,7 @@ bool serialize<ppu_thread::cr_bits>(utils::serial& ar, typename ppu_thread::cr_b
extern void ppu_initialize();
extern void ppu_finalize(const ppu_module<lv2_obj>& info, bool force_mem_release = false);
extern bool ppu_initialize(const ppu_module<lv2_obj>& info, bool check_only = false, u64 file_size = 0);
static void ppu_initialize2(class jit_compiler& jit, const ppu_module<lv2_obj>& module_part, const std::string& cache_path, const std::string& obj_name, const ppu_module<lv2_obj>& whole_module);
static void ppu_initialize2(class jit_compiler& jit, const ppu_module<lv2_obj>& module_part, const std::string& cache_path, const std::string& obj_name);
extern bool ppu_load_exec(const ppu_exec_object&, bool virtual_load, const std::string&, utils::serial* = nullptr);
extern std::pair<shared_ptr<lv2_overlay>, CellError> ppu_load_overlay(const ppu_exec_object&, bool virtual_load, const std::string& path, s64 file_offset, utils::serial* = nullptr);
extern void ppu_unload_prx(const lv2_prx&);
@ -342,11 +343,10 @@ const auto ppu_gateway = build_function_asm<void(*)(ppu_thread*)>("ppu_gateway",
// Load offset value
c.mov(cia_addr_reg, Imm(static_cast<u64>(::offset32(&ppu_thread::cia))));
// Load cia
c.ldr(a64::w15, arm::Mem(ppu_t_base, cia_addr_reg));
c.ldr(pc.w(), arm::Mem(ppu_t_base, cia_addr_reg));
// Multiply by 2 to index into ptr table
const arm::GpX index_shift = a64::x12;
c.mov(index_shift, Imm(2));
c.mul(pc, pc, index_shift);
c.add(pc, pc, pc);
// Load call target
const arm::GpX call_target = a64::x13;
@ -355,7 +355,7 @@ const auto ppu_gateway = build_function_asm<void(*)(ppu_thread*)>("ppu_gateway",
const arm::GpX reg_hp = a64::x21;
c.mov(reg_hp, call_target);
c.lsr(reg_hp, reg_hp, 48);
c.lsl(a64::w21, a64::w21, 13);
c.lsl(reg_hp.w(), reg_hp.w(), 13);
// Zero top 16 bits of call target
c.lsl(call_target, call_target, Imm(16));
@ -3665,6 +3665,9 @@ struct jit_core_allocator
// Initialize global semaphore with the max number of threads
::semaphore<0x7fff> sem{std::max<s16>(thread_count, 1)};
// Mutex for special extra-large modules to compile alone
shared_mutex shared_mtx;
static s16 limit()
{
return static_cast<s16>(std::min<s32>(0x7fff, utils::get_thread_count()));
@ -3677,8 +3680,8 @@ namespace
// Compiled PPU module info
struct jit_module
{
void(*symbol_resolver)(u8*, u64) = nullptr;
std::shared_ptr<jit_compiler> pjit;
std::vector<void(*)(u8*, u64)> symbol_resolvers;
std::vector<std::shared_ptr<jit_compiler>> pjit;
bool init = false;
};
@ -3729,6 +3732,7 @@ namespace
}
to_destroy.pjit = std::move(found->second.pjit);
to_destroy.symbol_resolvers = std::move(found->second.symbol_resolvers);
bucket.map.erase(found);
}
@ -4445,7 +4449,7 @@ extern void ppu_initialize()
idm::select<lv2_obj, lv2_prx>([&](u32, lv2_prx& _module)
{
if (_module.funcs.empty())
if (_module.get_funcs().empty())
{
return;
}
@ -4556,7 +4560,7 @@ bool ppu_initialize(const ppu_module<lv2_obj>& info, bool check_only, u64 file_s
auto& ppu_toc = toc_manager.toc_map;
for (const auto& func : info.funcs)
for (const auto& func : info.get_funcs())
{
if (func.size && func.blocks.empty())
{
@ -4659,11 +4663,14 @@ bool ppu_initialize(const ppu_module<lv2_obj>& info, bool check_only, u64 file_s
jit_module& jit_mod = g_fxo->get<jit_module_manager>().get(cache_path + "_" + std::to_string(std::bit_cast<usz>(info.segs[0].ptr)));
// Compiler instance (deferred initialization)
std::shared_ptr<jit_compiler>& jit = jit_mod.pjit;
std::vector<std::shared_ptr<jit_compiler>>& jits = jit_mod.pjit;
// Split module into fragments <= 1 MiB
usz fpos = 0;
// Modules counted so far
usz module_counter = 0;
// Difference between function name and current location
const u32 reloc = info.relocs.empty() ? 0 : ::at32(info.segs, 0).addr;
@ -4684,14 +4691,14 @@ bool ppu_initialize(const ppu_module<lv2_obj>& info, bool check_only, u64 file_s
const cpu_thread* cpu = cpu_thread::get_current();
for (auto& func : info.funcs)
for (auto& func : info.get_funcs())
{
if (func.size == 0)
{
continue;
}
for (const auto& [addr, size] : func.blocks)
for (const auto [addr, size] : func)
{
if (size == 0)
{
@ -4724,26 +4731,138 @@ bool ppu_initialize(const ppu_module<lv2_obj>& info, bool check_only, u64 file_s
u32 total_compile = 0;
while (!jit_mod.init && fpos < info.funcs.size())
// Limit how many modules go into each JIT instance
// Advantage of lowering the limit:
// 1. Lower contiguous memory requirements for allocations
// Its disadvantage:
// 1. A B instruction can only branch up to 16 MiB relative to its address,
// so each additional JIT instance split downgrades roughly (100% / (N-1)) - (100% / N) percent of instructions,
// where N is the total number of JIT instances
// Subject to change
constexpr u32 c_modules_per_jit = 100;
std::shared_ptr<std::pair<u32, u32>> local_jit_bounds = std::make_shared<std::pair<u32, u32>>(u32{umax}, 0);
const auto shared_runtime = make_shared<jit_runtime>();
const auto shared_map = make_shared<std::unordered_map<u32, u64>>();
const auto shared_mtx = make_shared<shared_mutex>();
auto symbols_cement = [runtime = shared_runtime, reloc, bound = info.segs[0].addr + info.segs[0].size - reloc, func_map = shared_map, shared_mtx](const std::string& name) -> u64
{
// Initialize compiler instance
if (!jit && is_being_used_in_emulation)
u32 func_addr = umax;
if (name.starts_with("__0x"))
{
jit = std::make_shared<jit_compiler>(s_link_table, g_cfg.core.llvm_cpu);
u32 addr = umax;
auto res = std::from_chars(name.c_str() + 4, name.c_str() + name.size(), addr, 16);
if (res.ec == std::errc() && res.ptr == name.c_str() + name.size() && addr < bound)
{
func_addr = addr + reloc;
}
}
// Copy module information (TODO: optimize)
if (func_addr == umax)
{
return {};
}
reader_lock rlock(*shared_mtx);
if (auto it = func_map->find(func_addr); it != func_map->end())
{
return it->second;
}
rlock.upgrade();
u64& code_ptr = (*func_map)[func_addr];
if (code_ptr)
{
return +code_ptr;
}
using namespace asmjit;
auto func = build_function_asm<u8*(*)(ppu_thread&, u64, u8*, u64, u64, u64)>(name, [&](native_asm& c, auto& args)
{
#if defined(ARCH_X64)
c.mov(x86::rax, x86::qword_ptr(reinterpret_cast<u64>(&vm::g_exec_addr)));
c.mov(x86::edx, func_addr); // Load PC
c.mov(x86::dword_ptr(x86::rbp, ::offset32(&ppu_thread::cia)), x86::edx);
c.mov(x86::rax, x86::qword_ptr(x86::rax, x86::edx, 1, 0)); // Load call target
c.mov(x86::rdx, x86::rax);
c.shl(x86::rax, 16);
c.shr(x86::rax, 16);
c.shr(x86::rdx, 48);
c.shl(x86::edx, 13);
c.mov(x86::r12d, x86::edx); // Load relocation base
c.jmp(x86::rax);
#else
// Load REG_Base - use absolute jump target to bypass rel jmp range limits
// X19 contains vm::g_exec_addr
const arm::GpX exec_addr = a64::x19;
// X20 contains ppu_thread*
const arm::GpX ppu_t_base = a64::x20;
// Load PC
const arm::GpX pc = a64::x15;
const arm::GpX cia_addr_reg = a64::x11;
// Load offset value
c.mov(cia_addr_reg, static_cast<u64>(::offset32(&ppu_thread::cia)));
// Update CIA
c.mov(pc.w(), func_addr);
c.str(pc.w(), arm::Mem(ppu_t_base, cia_addr_reg));
// Multiply by 2 to index into ptr table
c.add(pc, pc, pc);
// Load call target
const arm::GpX call_target = a64::x13;
c.ldr(call_target, arm::Mem(exec_addr, pc));
// Compute REG_Hp
const arm::GpX reg_hp = a64::x21;
c.mov(reg_hp, call_target);
c.lsr(reg_hp, reg_hp, 48);
c.lsl(reg_hp.w(), reg_hp.w(), 13);
// Zero top 16 bits of call target
c.lsl(call_target, call_target, 16);
c.lsr(call_target, call_target, 16);
// Execute LLE call
c.br(call_target);
#endif
}, runtime.get());
code_ptr = reinterpret_cast<u64>(func);
return code_ptr;
};
if (has_mfvscr && g_cfg.core.ppu_set_sat_bit)
{
info.attr += ppu_attr::has_mfvscr;
}
while (!jit_mod.init && fpos < info.get_funcs().size())
{
// Copy module information
ppu_module<lv2_obj> part;
part.copy_part(info);
part.funcs.reserve(16000);
// Overall block size in bytes
usz bsize = 0;
usz bcount = 0;
while (fpos < info.funcs.size())
while (fpos < info.get_funcs().size())
{
auto& func = info.funcs[fpos];
auto& func = info.get_funcs()[fpos];
if (!func.size)
{
@ -4767,9 +4886,9 @@ bool ppu_initialize(const ppu_module<lv2_obj>& info, bool check_only, u64 file_s
{
auto far_jump = ensure(g_fxo->get<ppu_far_jumps_t>().gen_jump(source));
if (source == func.addr && jit)
if (source == func.addr)
{
jit->update_global_mapping(fmt::format("__0x%x", func.addr - reloc), reinterpret_cast<u64>(far_jump));
(*shared_map)[func.addr - reloc] = reinterpret_cast<u64>(far_jump);
}
ppu_register_function_at(source, 4, far_jump);
@ -4783,22 +4902,14 @@ bool ppu_initialize(const ppu_module<lv2_obj>& info, bool check_only, u64 file_s
}
}
// Copy block or function entry
ppu_function& entry = part.funcs.emplace_back(func);
local_jit_bounds->first = std::min<u32>(local_jit_bounds->first, func.addr);
local_jit_bounds->second = std::max<u32>(local_jit_bounds->second, func.addr + func.size);
part.local_bounds.first = std::min<u32>(part.local_bounds.first, func.addr);
part.local_bounds.second = std::max<u32>(part.local_bounds.second, func.addr + func.size);
// Fixup some information
entry.name = fmt::format("__0x%x", entry.addr - reloc);
if (has_mfvscr && g_cfg.core.ppu_set_sat_bit)
{
// TODO
entry.attr += ppu_attr::has_mfvscr;
}
if (entry.blocks.empty())
{
entry.blocks.emplace(func.addr, func.size);
}
func.name = fmt::format("__0x%x", func.addr - reloc);
bsize += func.size;
@ -4815,7 +4926,7 @@ bool ppu_initialize(const ppu_module<lv2_obj>& info, bool check_only, u64 file_s
int has_dcbz = !!g_cfg.core.accurate_cache_line_stores;
for (const auto& func : part.funcs)
for (const auto& func : part.get_funcs())
{
if (func.size == 0)
{
@ -4827,7 +4938,7 @@ bool ppu_initialize(const ppu_module<lv2_obj>& info, bool check_only, u64 file_s
sha1_update(&ctx, reinterpret_cast<const u8*>(&addr), sizeof(addr));
sha1_update(&ctx, reinterpret_cast<const u8*>(&size), sizeof(size));
for (const auto& block : func.blocks)
for (const auto block : func)
{
if (block.second == 0 || reloc)
{
@ -4898,7 +5009,7 @@ bool ppu_initialize(const ppu_module<lv2_obj>& info, bool check_only, u64 file_s
sha1_update(&ctx, ensure(info.get_ptr<const u8>(func.addr)), func.size);
}
if (!workload.empty() && fpos >= info.funcs.size())
if (fpos >= info.get_funcs().size() || module_counter % c_modules_per_jit == c_modules_per_jit - 1)
{
// Hash the entire group's function addresses for the integrity of the symbol resolver function
// Changes here can potentially occur during patches
@ -4906,7 +5017,13 @@ bool ppu_initialize(const ppu_module<lv2_obj>& info, bool check_only, u64 file_s
std::vector<be_t<u32>> addrs;
for (const ppu_function& func : info.funcs)
constexpr auto compare = [](const ppu_function& a, u32 addr) { return a.addr < addr; };
const auto start = std::lower_bound(info.funcs.begin(), info.funcs.end(), local_jit_bounds->first, compare);
std::span<const ppu_function> span_range{ start, std::lower_bound(start, info.funcs.end(), local_jit_bounds->second, compare) };
for (const ppu_function& func : span_range)
{
if (func.size == 0)
{
@ -4919,7 +5036,13 @@ bool ppu_initialize(const ppu_module<lv2_obj>& info, bool check_only, u64 file_s
// Hash its size too
addrs.emplace_back(::size32(addrs));
sha1_update(&ctx, reinterpret_cast<const u8*>(addrs.data()), addrs.size() * sizeof(be_t<u32>));
if (module_counter != 0)
{
sha1_update(&ctx, reinterpret_cast<const u8*>(addrs.data()), addrs.size() * sizeof(be_t<u32>));
}
part.jit_bounds = std::move(local_jit_bounds);
local_jit_bounds = std::make_shared<std::pair<u32, u32>>(u32{umax}, 0);
}
if (false)
@ -4974,7 +5097,7 @@ bool ppu_initialize(const ppu_module<lv2_obj>& info, bool check_only, u64 file_s
settings += ppu_settings::accurate_vnan, settings -= ppu_settings::fixup_vnan, fmt::throw_exception("VNAN Not implemented");
if (g_cfg.core.ppu_use_nj_bit)
settings += ppu_settings::accurate_nj_mode, settings -= ppu_settings::fixup_nj_denormals, fmt::throw_exception("NJ Not implemented");
if (fpos >= info.funcs.size())
if (fpos >= info.get_funcs().size() || module_counter % c_modules_per_jit == c_modules_per_jit - 1)
settings += ppu_settings::contains_symbol_resolver; // Avoid invalidating all modules for this purpose
// Write version, hash, CPU, settings
@ -4986,6 +5109,8 @@ bool ppu_initialize(const ppu_module<lv2_obj>& info, bool check_only, u64 file_s
break;
}
module_counter++;
if (!check_only)
{
total_compile++;
@ -4996,13 +5121,14 @@ bool ppu_initialize(const ppu_module<lv2_obj>& info, bool check_only, u64 file_s
// Check object file
if (jit_compiler::check(cache_path + obj_name))
{
if (!jit && !check_only)
if (!is_being_used_in_emulation && !check_only)
{
ppu_log.success("LLVM: Module exists: %s", obj_name);
// Already compiled, so revert the total count increase
// Avoid incrementing "pdone" instead, because that would give a false sense of progress to both the progress dialog and the user
total_compile--;
link_workload.pop_back();
}
continue;
@ -5113,11 +5239,26 @@ bool ppu_initialize(const ppu_module<lv2_obj>& info, bool check_only, u64 file_s
// Keep allocating workload
const auto& [obj_name, part] = std::as_const(workload)[i];
std::shared_lock rlock(g_fxo->get<jit_core_allocator>().shared_mtx, std::defer_lock);
std::unique_lock lock(g_fxo->get<jit_core_allocator>().shared_mtx, std::defer_lock);
if (part.jit_bounds && part.parent->funcs.size() >= 0x8000)
{
// Make a large symbol-resolving function compile alone because it has massive memory requirements
lock.lock();
}
else
{
rlock.lock();
}
ppu_log.warning("LLVM: Compiling module %s%s", cache_path, obj_name);
// Use another JIT instance
jit_compiler jit2({}, g_cfg.core.llvm_cpu, 0x1);
ppu_initialize2(jit2, part, cache_path, obj_name, i == workload.size() - 1 ? main_module : part);
{
// Use another JIT instance
jit_compiler jit2({}, g_cfg.core.llvm_cpu, 0x1);
ppu_initialize2(jit2, part, cache_path, obj_name);
}
ppu_log.success("LLVM: Compiled module %s", obj_name);
}
@ -5145,6 +5286,17 @@ bool ppu_initialize(const ppu_module<lv2_obj>& info, bool check_only, u64 file_s
g_watchdog_hold_ctr--;
}
// Initialize compiler instance
while (jits.size() < utils::aligned_div<u64>(module_counter, c_modules_per_jit) && is_being_used_in_emulation)
{
jits.emplace_back(std::make_shared<jit_compiler>(s_link_table, g_cfg.core.llvm_cpu, 0, symbols_cement));
}
if (jit_mod.symbol_resolvers.empty() && is_being_used_in_emulation)
{
jit_mod.symbol_resolvers.resize(jits.size());
}
bool failed_to_load = false;
{
if (!is_being_used_in_emulation || (cpu ? cpu->state.all_of(cpu_flag::exit) : Emu.IsStopped()))
@ -5158,14 +5310,18 @@ bool ppu_initialize(const ppu_module<lv2_obj>& info, bool check_only, u64 file_s
*progress_dialog = get_localized_string(localized_string_id::PROGRESS_DIALOG_LINKING_PPU_MODULES);
}
usz mod_index = umax;
for (const auto& [obj_name, is_compiled] : link_workload)
{
mod_index++;
if (cpu ? cpu->state.all_of(cpu_flag::exit) : Emu.IsStopped())
{
break;
}
if (!failed_to_load && !jit->add(cache_path + obj_name))
if (!failed_to_load && !jits[mod_index / c_modules_per_jit]->add(cache_path + obj_name))
{
ppu_log.error("LLVM: Failed to load module %s", obj_name);
failed_to_load = true;
@ -5205,10 +5361,10 @@ bool ppu_initialize(const ppu_module<lv2_obj>& info, bool check_only, u64 file_s
progress_dialog = get_localized_string(localized_string_id::PROGRESS_DIALOG_APPLYING_PPU_CODE);
if (!jit)
if (jits.empty())
{
// No functions - nothing to do
ensure(info.funcs.empty());
ensure(info.get_funcs().empty());
return compiled_new;
}
@ -5216,25 +5372,27 @@ bool ppu_initialize(const ppu_module<lv2_obj>& info, bool check_only, u64 file_s
if (is_first)
{
jit->fin();
}
if (is_first)
{
jit_mod.symbol_resolver = reinterpret_cast<void(*)(u8*, u64)>(jit->get("__resolve_symbols"));
ensure(jit_mod.symbol_resolver);
}
else
{
ensure(jit_mod.symbol_resolver);
for (auto& jit : jits)
{
jit->fin();
}
}
#ifdef __APPLE__
// Symbol resolver is in JIT mem, so we must enable execution
pthread_jit_write_protect_np(true);
#endif
{
usz index = umax;
jit_mod.symbol_resolver(vm::g_exec_addr, info.segs[0].addr);
for (auto& sim : jit_mod.symbol_resolvers)
{
index++;
sim = ensure(!is_first ? sim : reinterpret_cast<void(*)(u8*, u64)>(jits[index]->get("__resolve_symbols")));
sim(vm::g_exec_addr, info.segs[0].addr);
}
}
#ifdef __APPLE__
// Symbol resolver is in JIT mem, so we must enable execution
@ -5242,7 +5400,7 @@ bool ppu_initialize(const ppu_module<lv2_obj>& info, bool check_only, u64 file_s
#endif
// Find a BLR-only function in order to copy it to all BLRs (some games need it)
for (const auto& func : info.funcs)
for (const auto& func : info.get_funcs())
{
if (func.size == 4 && *info.get_ptr<u32>(func.addr) == ppu_instructions::BLR())
{
@ -5281,7 +5439,7 @@ bool ppu_initialize(const ppu_module<lv2_obj>& info, bool check_only, u64 file_s
#endif
}
static void ppu_initialize2(jit_compiler& jit, const ppu_module<lv2_obj>& module_part, const std::string& cache_path, const std::string& obj_name, const ppu_module<lv2_obj>& whole_module)
static void ppu_initialize2(jit_compiler& jit, const ppu_module<lv2_obj>& module_part, const std::string& cache_path, const std::string& obj_name)
{
#ifdef LLVM_AVAILABLE
using namespace llvm;
@ -5307,8 +5465,11 @@ static void ppu_initialize2(jit_compiler& jit, const ppu_module<lv2_obj>& module
translator.get_type<u64>(), // r2
}, false);
// Difference between function name and current location
const u32 reloc = module_part.get_relocs().empty() ? 0 : ::at32(module_part.segs, 0).addr;
// Initialize function list
for (const auto& func : module_part.funcs)
for (const auto& func : module_part.get_funcs())
{
if (func.size)
{
@ -5374,8 +5535,14 @@ static void ppu_initialize2(jit_compiler& jit, const ppu_module<lv2_obj>& module
fpm.addPass(EarlyCSEPass());
#endif
u32 guest_code_size = 0;
u32 min_addr = umax;
u32 max_addr = 0;
u32 num_func = 0;
// Translate functions
for (usz fi = 0, fmax = module_part.funcs.size(); fi < fmax; fi++)
// Start from the module's lowest bound; the function list is sorted
for (const auto& mod_func : module_part.get_funcs())
{
if (Emu.IsStopped())
{
@ -5383,10 +5550,15 @@ static void ppu_initialize2(jit_compiler& jit, const ppu_module<lv2_obj>& module
return;
}
if (module_part.funcs[fi].size)
if (mod_func.size)
{
num_func++;
guest_code_size += mod_func.size;
max_addr = std::max<u32>(max_addr, mod_func.addr + mod_func.size);
min_addr = std::min<u32>(min_addr, mod_func.addr);
// Translate
if (const auto func = translator.Translate(module_part.funcs[fi]))
if (const auto func = translator.Translate(mod_func))
{
#ifdef ARCH_X64 // TODO
// Run optimization passes
@ -5405,10 +5577,10 @@ static void ppu_initialize2(jit_compiler& jit, const ppu_module<lv2_obj>& module
}
}
// Run this only in one module for all functions
if (&whole_module != &module_part)
// Run this in only one module, covering all compiled functions
if (module_part.jit_bounds)
{
if (const auto func = translator.GetSymbolResolver(whole_module))
if (const auto func = translator.GetSymbolResolver(module_part))
{
#ifdef ARCH_X64 // TODO
// Run optimization passes
@ -5452,7 +5624,7 @@ static void ppu_initialize2(jit_compiler& jit, const ppu_module<lv2_obj>& module
return;
}
ppu_log.notice("LLVM: %zu functions generated", _module->getFunctionList().size());
ppu_log.notice("LLVM: %zu functions generated (code_size=0x%x, num_func=%d, max_addr(-)min_addr=0x%x)", _module->getFunctionList().size(), guest_code_size, num_func, max_addr - min_addr);
}
// Load or compile module
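To make the grouping above concrete: with c_modules_per_jit == 100, object files 0..99 land in jits[0], 100..199 in jits[1], and so on, and the module that closes each group carries jit_bounds so exactly one symbol resolver is emitted per JIT instance. A small sketch of that arithmetic, inferred from the code above rather than any public API:

#include <cstddef>

constexpr std::size_t c_modules_per_jit = 100;

// Which JIT instance a given object file belongs to
constexpr std::size_t jit_index_for(std::size_t module_index)
{
    return module_index / c_modules_per_jit;
}

// Mirrors the group-closing condition in ppu_initialize: the group is
// full, or this is the final module of the image
constexpr bool closes_group(std::size_t module_counter, bool is_last)
{
    return is_last || module_counter % c_modules_per_jit == c_modules_per_jit - 1;
}

static_assert(jit_index_for(99) == 0 && jit_index_for(100) == 1);
static_assert(closes_group(99, false) && !closes_group(100, false));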

View File

@ -114,7 +114,7 @@ PPUTranslator::PPUTranslator(LLVMContext& context, Module* _module, const ppu_mo
const auto caddr = m_info.segs[0].addr;
const auto cend = caddr + m_info.segs[0].size;
for (const auto& rel : m_info.relocs)
for (const auto& rel : m_info.get_relocs())
{
if (rel.addr >= caddr && rel.addr < cend)
{
@ -162,7 +162,7 @@ PPUTranslator::PPUTranslator(LLVMContext& context, Module* _module, const ppu_mo
}
}
if (!m_info.relocs.empty())
if (!m_info.get_relocs().empty())
{
m_reloc = &m_info.segs[0];
}
@ -196,7 +196,7 @@ Function* PPUTranslator::Translate(const ppu_function& info)
// Instruction address is (m_addr + base)
const u64 base = m_reloc ? m_reloc->addr : 0;
m_addr = info.addr - base;
m_attr = info.attr;
m_attr = m_info.attr + info.attr;
// Don't emit check in small blocks without terminator
bool need_check = info.size >= 16;
@ -325,6 +325,9 @@ Function* PPUTranslator::Translate(const ppu_function& info)
Function* PPUTranslator::GetSymbolResolver(const ppu_module<lv2_obj>& info)
{
ensure(m_module->getFunction("__resolve_symbols") == nullptr);
ensure(info.jit_bounds);
m_function = cast<Function>(m_module->getOrInsertFunction("__resolve_symbols", FunctionType::get(get_type<void>(), { get_type<u8*>(), get_type<u64>() }, false)).getCallee());
IRBuilder<> irb(BasicBlock::Create(m_context, "__entry", m_function));
@ -351,12 +354,13 @@ Function* PPUTranslator::GetSymbolResolver(const ppu_module<lv2_obj>& info)
// This is done in a loop instead of inlined because inlining took a tremendous amount of time to compile.
std::vector<u32> vec_addrs;
vec_addrs.reserve(info.funcs.size());
// Create an array of function pointers
std::vector<llvm::Constant*> functions;
for (const auto& f : info.funcs)
const auto [min_addr, max_addr] = *ensure(info.jit_bounds);
for (const auto& f : info.get_funcs(false, true))
{
if (!f.size)
{
@ -379,7 +383,7 @@ Function* PPUTranslator::GetSymbolResolver(const ppu_module<lv2_obj>& info)
const auto addr_array = new GlobalVariable(*m_module, addr_array_type, false, GlobalValue::PrivateLinkage, ConstantDataArray::get(m_context, vec_addrs));
// Create an array of function pointers
const auto func_table_type = ArrayType::get(ftype->getPointerTo(), info.funcs.size());
const auto func_table_type = ArrayType::get(ftype->getPointerTo(), functions.size());
const auto init_func_table = ConstantArray::get(func_table_type, functions);
const auto func_table = new GlobalVariable(*m_module, func_table_type, false, GlobalVariable::PrivateLinkage, init_func_table);
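Conceptually, the generated __resolve_symbols walks these two parallel arrays (guest addresses and compiled entry points) and patches the executable-pointer table at load time. A plain C++ model of that loop follows, assuming segment-relative addresses and the table layout implied by the gateway's multiply-by-2 indexing; this is a sketch, not the emitted IR:

#include <cstddef>
#include <cstdint>
#include <cstring>

using ppu_entry = void (*)(); // stand-in for the real function type

void resolve_symbols_model(std::uint8_t* exec_addr, std::uint64_t seg0_addr,
                           const std::uint32_t* addrs, const ppu_entry* funcs,
                           std::size_t count)
{
    for (std::size_t i = 0; i < count; i++)
    {
        // One 8-byte slot per guest address: the gateway indexes the
        // table with cia * 2, so a 4-byte-aligned guest address maps
        // to an 8-byte-aligned slot
        const std::uint64_t slot = (seg0_addr + addrs[i]) * 2;
        const auto value = reinterpret_cast<std::uintptr_t>(funcs[i]);
        std::memcpy(exec_addr + slot, &value, sizeof(value));
    }
}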

View File

@ -310,6 +310,11 @@ namespace utils
void memory_commit(void* pointer, usz size, protection prot)
{
if (!size)
{
return;
}
#ifdef _WIN32
ensure(::VirtualAlloc(pointer, size, MEM_COMMIT, +prot));
#else
@ -329,6 +334,11 @@ namespace utils
void memory_decommit(void* pointer, usz size)
{
if (!size)
{
return;
}
#ifdef _WIN32
ensure(::VirtualFree(pointer, size, MEM_DECOMMIT));
#else
@ -357,6 +367,11 @@ namespace utils
void memory_reset(void* pointer, usz size, protection prot)
{
if (!size)
{
return;
}
#ifdef _WIN32
memory_decommit(pointer, size);
memory_commit(pointer, size, prot);
@ -390,6 +405,11 @@ namespace utils
void memory_release(void* pointer, usz size)
{
if (!size)
{
return;
}
#ifdef _WIN32
unmap_mappping_memory(reinterpret_cast<u64>(pointer), size);
ensure(::VirtualFree(pointer, 0, MEM_RELEASE));
@ -400,6 +420,11 @@ namespace utils
void memory_protect(void* pointer, usz size, protection prot)
{
if (!size)
{
return;
}
#ifdef _WIN32
DWORD old;
@ -429,6 +454,11 @@ namespace utils
bool memory_lock(void* pointer, usz size)
{
if (!size)
{
return true;
}
#ifdef _WIN32
return ::VirtualLock(pointer, size);
#else