diff --git a/rpcs3/Emu/CPU/CPUTranslator.h b/rpcs3/Emu/CPU/CPUTranslator.h index 18798793b3..2f4762e7fa 100644 --- a/rpcs3/Emu/CPU/CPUTranslator.h +++ b/rpcs3/Emu/CPU/CPUTranslator.h @@ -3703,13 +3703,18 @@ public: return result; } + llvm::Value* load_const(llvm::GlobalVariable* g, llvm::Value* i, llvm::Type* type = nullptr) + { + return m_ir->CreateLoad(type ? type : g->getValueType(), m_ir->CreateGEP(g->getValueType(), g, {m_ir->getInt64(0), m_ir->CreateZExtOrTrunc(i, get_type())})); + } + template llvm::Value* load_const(llvm::GlobalVariable* g, llvm::Value* i) { - return m_ir->CreateLoad(get_type(), m_ir->CreateGEP(g->getValueType(), g, {m_ir->getInt64(0), m_ir->CreateZExtOrTrunc(i, get_type())})); + return load_const(g, i, get_type()); } - template + template requires requires () { std::declval().eval(std::declval*>()); } value_t load_const(llvm::GlobalVariable* g, I i) { value_t result; @@ -3717,6 +3722,12 @@ public: return result; } + template + llvm::GlobalVariable* make_local_variable(T initializing_value) + { + return new llvm::GlobalVariable(*m_module, get_type(), false, llvm::GlobalVariable::PrivateLinkage, llvm::ConstantInt::get(get_type(), initializing_value)); + } + template std::pair get_const_vector(llvm::Value*, u32 pos, u32 = __builtin_LINE()); diff --git a/rpcs3/Emu/Cell/PPUThread.cpp b/rpcs3/Emu/Cell/PPUThread.cpp index 956da1d966..35ab2ad4ef 100644 --- a/rpcs3/Emu/Cell/PPUThread.cpp +++ b/rpcs3/Emu/Cell/PPUThread.cpp @@ -175,7 +175,7 @@ bool serialize(utils::serial& ar, typename ppu_thread::cr_b extern void ppu_initialize(); extern void ppu_finalize(const ppu_module& info); extern bool ppu_initialize(const ppu_module& info, bool check_only = false, u64 file_size = 0); -static void ppu_initialize2(class jit_compiler& jit, const ppu_module& module_part, const std::string& cache_path, const std::string& obj_name); +static void ppu_initialize2(class jit_compiler& jit, const ppu_module& module_part, const std::string& cache_path, const std::string& obj_name, const ppu_module& whole_module); extern bool ppu_load_exec(const ppu_exec_object&, bool virtual_load, const std::string&, utils::serial* = nullptr); extern std::pair, CellError> ppu_load_overlay(const ppu_exec_object&, bool virtual_load, const std::string& path, s64 file_offset, utils::serial* = nullptr); extern void ppu_unload_prx(const lv2_prx&); @@ -3460,7 +3460,7 @@ namespace // Compiled PPU module info struct jit_module { - std::vector funcs; + void(*symbol_resolver)(u8*, u64) = nullptr; std::shared_ptr pjit; bool init = false; }; @@ -3502,7 +3502,6 @@ namespace return; } - to_destroy.funcs = std::move(found->second.funcs); to_destroy.pjit = std::move(found->second.pjit); bucket.map.erase(found); @@ -4611,6 +4610,7 @@ bool ppu_initialize(const ppu_module& info, bool check_only, u64 file_size) accurate_fpcc, accurate_vnan, accurate_nj_mode, + contains_symbol_resolver, __bitset_enum_max }; @@ -4640,6 +4640,8 @@ bool ppu_initialize(const ppu_module& info, bool check_only, u64 file_size) settings += ppu_settings::accurate_vnan, settings -= ppu_settings::fixup_vnan, fmt::throw_exception("VNAN Not implemented"); if (g_cfg.core.ppu_use_nj_bit) settings += ppu_settings::accurate_nj_mode, settings -= ppu_settings::fixup_nj_denormals, fmt::throw_exception("NJ Not implemented"); + if (fpos >= info.funcs.size()) + settings += ppu_settings::contains_symbol_resolver; // Avoid invalidating all modules for this purpose // Write version, hash, CPU, settings fmt::append(obj_name, "v6-kusa-%s-%s-%s.obj", fmt::base57(output, 16), fmt::base57(settings), jit_compiler::cpu(g_cfg.core.llvm_cpu)); @@ -4724,16 +4726,18 @@ bool ppu_initialize(const ppu_module& info, bool check_only, u64 file_size) { atomic_t& work_cv; std::vector>& workload; + const ppu_module& main_module; const std::string& cache_path; const cpu_thread* cpu; std::unique_lock core_lock; thread_op(atomic_t& work_cv, std::vector>& workload - , const cpu_thread* cpu, const std::string& cache_path, decltype(jit_core_allocator::sem)& sem) noexcept + , const cpu_thread* cpu, const ppu_module& main_module, const std::string& cache_path, decltype(jit_core_allocator::sem)& sem) noexcept : work_cv(work_cv) , workload(workload) + , main_module(main_module) , cache_path(cache_path) , cpu(cpu) { @@ -4744,6 +4748,7 @@ bool ppu_initialize(const ppu_module& info, bool check_only, u64 file_size) thread_op(const thread_op& other) noexcept : work_cv(other.work_cv) , workload(other.workload) + , main_module(other.main_module) , cache_path(other.cache_path) , cpu(other.cpu) { @@ -4778,7 +4783,7 @@ bool ppu_initialize(const ppu_module& info, bool check_only, u64 file_size) // Use another JIT instance jit_compiler jit2({}, g_cfg.core.llvm_cpu, 0x1); - ppu_initialize2(jit2, part, cache_path, obj_name); + ppu_initialize2(jit2, part, cache_path, obj_name, i == workload.size() - 1 ? main_module : part); ppu_log.success("LLVM: Compiled module %s", obj_name); } @@ -4791,7 +4796,7 @@ bool ppu_initialize(const ppu_module& info, bool check_only, u64 file_size) g_watchdog_hold_ctr++; named_thread_group threads(fmt::format("PPUW.%u.", ++g_fxo->get().index), thread_count - , thread_op(work_cv, workload, cpu, cache_path, g_fxo->get().sem) + , thread_op(work_cv, workload, cpu, info, cache_path, g_fxo->get().sem) , [&](u32 /*thread_index*/, thread_op& op) { // Allocate "core" @@ -4835,8 +4840,6 @@ bool ppu_initialize(const ppu_module& info, bool check_only, u64 file_size) } } - progr.reset(); - if (!is_being_used_in_emulation || (cpu ? cpu->state.all_of(cpu_flag::exit) : Emu.IsStopped())) { return compiled_new; @@ -4851,83 +4854,39 @@ bool ppu_initialize(const ppu_module& info, bool check_only, u64 file_size) const bool is_first = jit && !jit_mod.init; + const bool showing_only_apply_stage = !g_progr.load() && !g_progr_ptotal && !g_progr_ftotal && g_progr_ptotal.compare_and_swap_test(0, 1); + + progr.emplace("Applying PPU Code..."); + if (is_first) { jit->fin(); } - u32 index = 0; - u32 max_count = 0; - - for (const auto& func : info.funcs) + if (is_first) { - if (func.size) - { - max_count++; - } + jit_mod.symbol_resolver = reinterpret_cast(jit->get("__resolve_symbols")); + } + else + { + ensure(jit_mod.symbol_resolver); } - u32 pending_progress = umax; + jit_mod.symbol_resolver(vm::g_exec_addr, info.segs[0].addr); - bool early_exit = false; - - // Get and install function addresses + // Find a BLR-only function in order to copy it to all BLRs (some games need it) for (const auto& func : info.funcs) { - if (!func.size) + if (func.size == 4 && *info.get_ptr(func.addr) == ppu_instructions::BLR()) { - continue; - } + const auto name = fmt::format("__0x%x", func.addr - reloc); - if (cpu ? cpu->state.all_of(cpu_flag::exit) : Emu.IsStopped()) - { - // Revert partially commited changes - jit_mod.funcs.clear(); - BLR_func = nullptr; - early_exit = true; + BLR_func = reinterpret_cast(jit->get(name)); break; } - - const auto name = fmt::format("__0x%x", func.addr - reloc); - - // Try to locate existing function if it is not the first time - const auto addr = is_first ? ensure(reinterpret_cast(jit->get(name))) - : reinterpret_cast(ensure(jit_mod.funcs[index])); - - jit_mod.funcs.emplace_back(addr); - - if (func.size == 4 && !BLR_func && *info.get_ptr(func.addr) == ppu_instructions::BLR()) - { - BLR_func = addr; - } - - ppu_register_function_at(func.addr, 4, addr); - - if (g_cfg.core.ppu_debug) - ppu_log.trace("Installing function %s at 0x%x: %p (reloc = 0x%x)", name, func.addr, ppu_ref(func.addr), reloc); - - index++; - - if (pending_progress != umax) - { - pending_progress++; - - if (pending_progress == 1024) - { - pending_progress = 0; - g_progr_pdone++; - } - } - else if (!g_progr.load() && !g_progr_ptotal && !g_progr_ftotal) - { - g_progr_pdone += index / 1024; - g_progr_ptotal += max_count / 1024; - pending_progress = index % 1024; - progr.emplace("Applying PPU Code..."); - } } - if (is_first && !early_exit) + if (is_first) { jit_mod.init = true; } @@ -4945,13 +4904,19 @@ bool ppu_initialize(const ppu_module& info, bool check_only, u64 file_size) } } + if (showing_only_apply_stage) + { + // Done + g_progr_pdone++; + } + return compiled_new; #else fmt::throw_exception("LLVM is not available in this build."); #endif } -static void ppu_initialize2(jit_compiler& jit, const ppu_module& module_part, const std::string& cache_path, const std::string& obj_name) +static void ppu_initialize2(jit_compiler& jit, const ppu_module& module_part, const std::string& cache_path, const std::string& obj_name, const ppu_module& whole_module) { #ifdef LLVM_AVAILABLE using namespace llvm; @@ -5042,6 +5007,21 @@ static void ppu_initialize2(jit_compiler& jit, const ppu_module& module_part, co } } + // Run this only in one module for all functions + if (&whole_module != &module_part) + { + if (const auto func = translator.GetSymbolResolver(whole_module)) + { + // Run optimization passes + pm.run(*func); + } + else + { + Emu.Pause(); + return; + } + } + //legacy::PassManager mpm; // Remove unused functions, structs, global variables, etc diff --git a/rpcs3/Emu/Cell/PPUTranslator.cpp b/rpcs3/Emu/Cell/PPUTranslator.cpp index 6441349a19..0517a35530 100644 --- a/rpcs3/Emu/Cell/PPUTranslator.cpp +++ b/rpcs3/Emu/Cell/PPUTranslator.cpp @@ -274,6 +274,125 @@ Function* PPUTranslator::Translate(const ppu_function& info) return m_function; } +Function* PPUTranslator::GetSymbolResolver(const ppu_module& info) +{ + m_function = cast(m_module->getOrInsertFunction("__resolve_symbols", FunctionType::get(get_type(), { get_type(), get_type() }, false)).getCallee()); + + IRBuilder<> irb(BasicBlock::Create(m_context, "__entry", m_function)); + m_ir = &irb; + + // Instruction address is (m_addr + base) + const u64 base = m_reloc ? m_reloc->addr : 0; + + m_exec = m_function->getArg(0); + m_seg0 = m_function->getArg(1); + + const auto ftype = FunctionType::get(get_type(), { + get_type(), // Exec base + GetContextType()->getPointerTo(), // PPU context + get_type(), // Segment address (for PRX) + get_type(), // Memory base + get_type(), // r0 + get_type(), // r1 + get_type(), // r2 + }, false); + + // Store function addresses in PPU jumptable using internal resolving instead of patching it externally. + // Because, LLVM processed it extremely slow. (regression) + // This is made in loop instead of inlined because it took tremendous amount of time to compile. + + std::vector vec_addrs; + vec_addrs.reserve(info.funcs.size()); + + // Create an array of function pointers + std::vector functions; + + for (const auto& f : info.funcs) + { + if (!f.size) + { + continue; + } + + vec_addrs.push_back(f.addr - base); + functions.push_back(cast(m_module->getOrInsertFunction(fmt::format("__0x%x", f.addr - base), ftype).getCallee())); + } + + if (vec_addrs.empty()) + { + // Possible special case for no functions (allowing the do-while optimization) + m_ir->CreateRetVoid(); + replace_intrinsics(*m_function); + return m_function; + } + + const auto addr_array_type = ArrayType::get(get_type(), vec_addrs.size()); + const auto addr_array = new GlobalVariable(*m_module, addr_array_type, false, GlobalValue::PrivateLinkage, ConstantDataArray::get(m_context, vec_addrs)); + + // Initialize the function table with the function pointers + std::vector init_vals; + + for (llvm::Function* func : functions) + { + llvm::Constant* func_ptr = llvm::ConstantExpr::getBitCast(func, ftype->getPointerTo()); + init_vals.push_back(func); + } + + // Create an array of function pointers + const auto func_table_type = ArrayType::get(ftype->getPointerTo(), info.funcs.size()); + const auto init_func_table = ConstantArray::get(func_table_type, init_vals); + const auto func_table = new GlobalVariable(*m_module, func_table_type, false, GlobalVariable::PrivateLinkage, init_func_table); + + const auto loop_block = BasicBlock::Create(m_context, "__loop", m_function); + const auto after_loop = BasicBlock::Create(m_context, "__after_loop", m_function); + + m_ir->CreateBr(loop_block); + m_ir->SetInsertPoint(loop_block); + + const auto init_index_value = m_ir->getInt64(0); + + // Loop body + const auto body_block = BasicBlock::Create(m_context, "__body", m_function); + + m_ir->CreateBr(body_block); // As do-while because vec_addrs is known to be more than 0 + m_ir->SetInsertPoint(body_block); + + const auto index_value = m_ir->CreatePHI(get_type(), 2); + index_value->addIncoming(init_index_value, loop_block); + + auto ptr_inst = dyn_cast(m_ir->CreateGEP(addr_array->getValueType(), addr_array, {m_ir->getInt64(0), index_value})); + assert(ptr_inst->getResultElementType() == get_type()); + + const auto func_pc = ZExt(m_ir->CreateLoad(ptr_inst->getResultElementType(), ptr_inst), get_type()); + + ptr_inst = dyn_cast(m_ir->CreateGEP(func_table->getValueType(), func_table, {m_ir->getInt64(0), index_value})); + assert(ptr_inst->getResultElementType() == ftype->getPointerTo()); + + const auto faddr = m_ir->CreateLoad(ptr_inst->getResultElementType(), ptr_inst); + const auto faddr_int = m_ir->CreatePtrToInt(faddr, get_type()); + const auto fval = m_ir->CreateOr(m_ir->CreateShl(m_seg0, 32 + 3), faddr_int); + const auto pos = m_ir->CreateShl(m_reloc ? m_ir->CreateAdd(func_pc, m_seg0) : func_pc, 1); + const auto ptr = dyn_cast(m_ir->CreateGEP(get_type(), m_exec, pos)); + + // Store to jumptable + m_ir->CreateStore(fval, ptr); + + // Increment index and branch back to loop + const auto post_add = m_ir->CreateAdd(index_value, m_ir->getInt64(1)); + index_value->addIncoming(post_add, body_block); + + Value* index_check = m_ir->CreateICmpULT(post_add, m_ir->getInt64(vec_addrs.size())); + m_ir->CreateCondBr(index_check, body_block, after_loop); + + // Set insertion point to afterloop_block + m_ir->SetInsertPoint(after_loop); + + m_ir->CreateRetVoid(); + + replace_intrinsics(*m_function); + return m_function; +} + Value* PPUTranslator::VecHandleNan(Value* val) { const auto is_nan = m_ir->CreateFCmpUNO(val, val); diff --git a/rpcs3/Emu/Cell/PPUTranslator.h b/rpcs3/Emu/Cell/PPUTranslator.h index d72dc1de20..d3c8d877a6 100644 --- a/rpcs3/Emu/Cell/PPUTranslator.h +++ b/rpcs3/Emu/Cell/PPUTranslator.h @@ -336,6 +336,7 @@ public: // Parses PPU opcodes and translate them into LLVM IR llvm::Function* Translate(const ppu_function& info); + llvm::Function* GetSymbolResolver(const ppu_module& info); void MFVSCR(ppu_opcode_t op); void MTVSCR(ppu_opcode_t op);