PPU LLVM: improve analyser

Compile possibly executable holes between detected functions.
Add unused "PPU LLVM Greedy Mode" option (for future updates).
Add "nounwind" attribute to compiled functions (reduces size).
This commit is contained in:
Nekotekina 2021-01-19 20:40:15 +03:00
parent e71c2df39d
commit f9ee8978ff
6 changed files with 256 additions and 68 deletions

View File

@ -7,6 +7,7 @@
#include "mutex.h"
#include "util/vm.hpp"
#include "util/asm.hpp"
#include <charconv>
#include <immintrin.h>
#include <zlib.h>
@ -310,12 +311,29 @@ const bool jit_initialize = []() -> bool
fmt::throw_exception("Null function: %s", name);
}
namespace vm
{
extern u8* const g_sudo_addr;
}
static shared_mutex null_mtx;
static std::unordered_map<std::string, u64> null_funcs;
static u64 make_null_function(const std::string& name)
{
if (name.starts_with("__0x"))
{
u32 addr = -1;
auto res = std::from_chars(name.c_str() + 4, name.c_str() + name.size(), addr, 16);
if (res.ec == std::errc() && res.ptr == name.c_str() + name.size() && addr < 0x8000'0000)
{
// Point the garbage to reserved, non-executable memory
return reinterpret_cast<u64>(vm::g_sudo_addr + addr);
}
}
std::lock_guard lock(null_mtx);
if (u64& func_ptr = null_funcs[name]) [[likely]]
@ -376,8 +394,12 @@ struct MemoryManager1 : llvm::RTDyldMemoryManager
if (!addr)
{
jit_log.error("Function '%s' linked but not found.", name);
addr = make_null_function(name);
if (!addr)
{
fmt::throw_exception("Failed to link '%s'", name);
}
}
return {addr, llvm::JITSymbolFlags::Exported};
@ -453,8 +475,12 @@ struct MemoryManager2 : llvm::RTDyldMemoryManager
if (!addr)
{
jit_log.error("Function '%s' linked but not found.", name);
addr = make_null_function(name);
if (!addr)
{
fmt::throw_exception("Failed to link '%s' (MM2)", name);
}
}
return {addr, llvm::JITSymbolFlags::Exported};
@ -730,7 +756,7 @@ jit_compiler::jit_compiler(const std::unordered_map<std::string, u64>& _link, co
for (auto&& [name, addr] : _link)
{
m_engine->addGlobalMapping(name, addr);
m_engine->updateGlobalMapping(name, addr);
}
}

View File

@ -3,6 +3,7 @@
#include "PPUOpcodes.h"
#include "PPUModule.h"
#include "Emu/system_config.h"
#include <unordered_set>
#include "util/yaml.hpp"
@ -577,7 +578,6 @@ void ppu_module::analyse(u32 lib_toc, u32 entry)
func_queue.emplace_back(func);
func.addr = addr;
func.toc = toc;
func.name = fmt::format("__0x%x", func.addr);
ppu_log.trace("Function 0x%x added (toc=0x%x)", addr, toc);
return func;
};
@ -1432,7 +1432,7 @@ void ppu_module::analyse(u32 lib_toc, u32 entry)
// Just ensure that functions don't overlap
if (func.addr + func.size > next)
{
ppu_log.warning("Function overlap: [0x%x] 0x%x -> 0x%x", func.addr, func.size, next - func.addr);
ppu_log.trace("Function overlap: [0x%x] 0x%x -> 0x%x", func.addr, func.size, next - func.addr);
continue; //func.size = next - func.addr;
// Also invalidate blocks
@ -1502,7 +1502,7 @@ void ppu_module::analyse(u32 lib_toc, u32 entry)
if (_ptr.addr() >= next)
{
ppu_log.warning("Function gap: [0x%x] 0x%x bytes at 0x%x", func.addr, next - start, start);
ppu_log.trace("Function gap: [0x%x] 0x%x bytes at 0x%x", func.addr, next - start, start);
break;
}
}
@ -1522,15 +1522,200 @@ void ppu_module::analyse(u32 lib_toc, u32 entry)
}
}
// Convert map to vector (destructive)
for (auto&& pair : fmap)
ppu_log.notice("Function analysis: %zu functions (%zu enqueued)", fmap.size(), func_queue.size());
// Decompose functions to basic blocks
for (auto&& [_, func] : as_rvalue(std::move(fmap)))
{
auto& func = pair.second;
ppu_log.trace("Function %s (size=0x%x, toc=0x%x, attr %#x)", func.name, func.size, func.toc, func.attr);
funcs.emplace_back(std::move(func));
for (auto [addr, size] : func.blocks)
{
if (!size)
{
continue;
}
ppu_log.notice("Function analysis: %zu functions (%zu enqueued)", funcs.size(), func_queue.size());
auto& block = fmap[addr];
if (block.addr || block.size)
{
ppu_log.trace("Block __0x%x exists (size=0x%x)", block.addr, block.size);
continue;
}
block.addr = addr;
block.size = size;
block.toc = func.toc;
ppu_log.trace("Block __0x%x added (func=0x%x, size=0x%x, toc=0x%x)", block.addr, _, block.size, block.toc);
}
}
// Simple callable block analysis
std::vector<std::pair<u32, u32>> block_queue;
block_queue.reserve(128000);
std::unordered_set<u32> block_set;
u32 exp = start;
u32 lim = end;
// Start with full scan
block_queue.emplace_back(exp, lim);
// block_queue may grow
for (usz i = 0; i < block_queue.size(); i++)
{
std::tie(exp, lim) = block_queue[i];
if (lim == 0)
{
// Find next function
const auto found = fmap.upper_bound(exp);
if (found != fmap.cend())
{
lim = found->first;
}
ppu_log.trace("Block rescan: addr=0x%x, lim=0x%x", exp, lim);
}
while (exp < lim)
{
u32 i_pos = exp;
bool is_good = true;
for (; i_pos < lim; i_pos += 4)
{
const u32 opc = vm::_ref<u32>(i_pos);
switch (auto type = s_ppu_itype.decode(opc))
{
case ppu_itype::UNK:
case ppu_itype::ECIWX:
case ppu_itype::ECOWX:
{
// Seemingly bad instruction, skip this block
is_good = false;
break;
}
case ppu_itype::TD:
case ppu_itype::TDI:
case ppu_itype::TW:
case ppu_itype::TWI:
case ppu_itype::B:
case ppu_itype::BC:
{
if (type == ppu_itype::B || type == ppu_itype::BC)
{
if (entry == 0 && ppu_opcode_t{opc}.aa)
{
// Ignore absolute branches in PIC (PRX)
is_good = false;
break;
}
const u32 target = (opc & 2 ? 0 : i_pos) + (type == ppu_itype::B ? +ppu_opcode_t{opc}.bt24 : +ppu_opcode_t{opc}.bt14);
if (target < start || target >= end)
{
// Sanity check
is_good = false;
break;
}
const auto found = fmap.find(target);
if (target != i_pos && found == fmap.cend())
{
if (block_set.count(target) == 0)
{
ppu_log.trace("Block target found: 0x%x (i_pos=0x%x)", target, i_pos);
block_queue.emplace_back(target, 0);
block_set.emplace(target);
}
}
}
[[fallthrough]];
}
case ppu_itype::BCCTR:
case ppu_itype::BCLR:
case ppu_itype::SC:
{
if (type == ppu_itype::SC && opc != ppu_instructions::SC(0))
{
// Strict garbage filter
is_good = false;
break;
}
if (type == ppu_itype::BCCTR && opc & 0xe000)
{
// Garbage filter
is_good = false;
break;
}
if (type == ppu_itype::BCLR && opc & 0xe000)
{
// Garbage filter
is_good = false;
break;
}
// Good block terminator found, add single block
break;
}
default:
{
// Normal instruction: keep scanning
continue;
}
}
break;
}
if (i_pos < lim)
{
i_pos += 4;
}
if (is_good)
{
auto& block = fmap[exp];
if (!block.addr)
{
block.addr = exp;
block.size = i_pos - exp;
ppu_log.trace("Block __0x%x added (size=0x%x)", block.addr, block.size);
}
}
exp = i_pos;
}
}
// Remove overlaps in blocks
for (auto it = fmap.begin(), end = fmap.end(); it != fmap.end(); it++)
{
const auto next = std::next(it);
if (next != end && next->first < it->first + it->second.size)
{
it->second.size = next->first - it->first;
}
}
// Convert map to vector (destructive)
for (auto&& pair : as_rvalue(std::move(fmap)))
{
funcs.emplace_back(std::move(pair.second));
}
ppu_log.notice("Block analysis: %zu blocks (%zu enqueued)", funcs.size(), block_queue.size());
}
void ppu_acontext::UNK(ppu_opcode_t op)

View File

@ -467,7 +467,7 @@ static auto ppu_load_exports(ppu_linkage_info* link, u32 exports_start, u32 expo
if (i < lib.num_func)
{
ppu_loader.notice("** Special: [%s] at 0x%x", ppu_get_function_name({}, nid), addr);
ppu_loader.notice("** Special: [%s] at 0x%x [0x%x, 0x%x]", ppu_get_function_name({}, nid), addr, vm::_ref<u32>(addr), vm::_ref<u32>(addr + 4));
}
else
{

View File

@ -2595,30 +2595,41 @@ bool ppu_initialize(const ppu_module& info, bool check_only)
// Overall block size in bytes
usz bsize = 0;
usz bcount = 0;
while (fpos < info.funcs.size())
{
auto& func = info.funcs[fpos];
if (!func.size)
{
fpos++;
continue;
}
if (bsize + func.size > 100 * 1024 && bsize)
{
if (bcount >= 1000)
{
break;
}
for (auto&& block : func.blocks)
{
bsize += block.second;
// Also split functions blocks into functions (TODO)
ppu_function entry;
entry.addr = block.first;
entry.size = block.second;
entry.toc = func.toc;
fmt::append(entry.name, "__0x%x", block.first - reloc);
part.funcs.emplace_back(std::move(entry));
}
// Copy block or function entry
ppu_function& entry = part.funcs.emplace_back(func);
// Fixup some information
entry.name = fmt::format("__0x%x", entry.addr - reloc);
if (entry.blocks.empty())
{
entry.blocks.emplace(func.addr, func.size);
}
bsize += func.size;
fpos++;
bcount++;
}
// Compute module hash to generate (hopefully) unique object name
@ -2726,6 +2737,7 @@ bool ppu_initialize(const ppu_module& info, bool check_only)
java_mode_handling,
accurate_cache_line_stores,
reservations_128_byte,
greedy_mode,
__bitset_enum_max
};
@ -2736,25 +2748,17 @@ bool ppu_initialize(const ppu_module& info, bool check_only)
settings += ppu_settings::non_win32;
#endif
if (g_cfg.core.llvm_accurate_dfma)
{
settings += ppu_settings::accurate_fma;
}
if (g_cfg.core.llvm_ppu_accurate_vector_nan)
{
settings += ppu_settings::accurate_ppu_vector_nan;
}
if (g_cfg.core.llvm_ppu_jm_handling)
{
settings += ppu_settings::java_mode_handling;
}
if (has_dcbz == 2)
{
settings += ppu_settings::accurate_cache_line_stores;
}
if (g_cfg.core.ppu_128_reservations_loop_max_length)
{
settings += ppu_settings::reservations_128_byte;
}
if (g_cfg.core.ppu_llvm_greedy_mode)
settings += ppu_settings::greedy_mode;
// Write version, hash, CPU, settings
fmt::append(obj_name, "v3-kusa-%s-%s-%s.obj", fmt::base57(output, 16), fmt::base57(settings), jit_compiler::cpu(g_cfg.core.llvm_cpu));
@ -2899,21 +2903,15 @@ bool ppu_initialize(const ppu_module& info, bool check_only)
{
if (!func.size) continue;
for (const auto& block : func.blocks)
{
if (block.second)
{
const u64 addr = jit->get(fmt::format("__0x%x", block.first - reloc));
const u64 addr = ensure(jit->get(fmt::format("__0x%x", func.addr - reloc)));
jit_mod.funcs.emplace_back(reinterpret_cast<ppu_function_t>(addr));
ppu_ref(block.first) = addr;
}
}
ppu_ref(func.addr) = addr;
}
// Initialize global variables
for (auto& var : globals)
{
const u64 addr = jit->get(var.first);
const u64 addr = ensure(jit->get(var.first));
jit_mod.vars.emplace_back(reinterpret_cast<u64*>(addr));
@ -2932,13 +2930,7 @@ bool ppu_initialize(const ppu_module& info, bool check_only)
{
if (!func.size) continue;
for (const auto& block : func.blocks)
{
if (block.second)
{
ppu_ref(block.first) = reinterpret_cast<uptr>(jit_mod.funcs[index++]);
}
}
ppu_ref(func.addr) = ensure(reinterpret_cast<uptr>(jit_mod.funcs[index++]));
}
index = 0;
@ -2989,6 +2981,7 @@ static void ppu_initialize2(jit_compiler& jit, const ppu_module& module_part, co
{
const auto f = cast<Function>(_module->getOrInsertFunction(func.name, _func).getCallee());
f->addAttribute(1, Attribute::NoAlias);
f->addFnAttr(Attribute::NoUnwind);
}
}

View File

@ -1975,7 +1975,6 @@ void PPUTranslator::SC(ppu_opcode_t op)
if (index < 1024)
{
// Call the syscall directly
Call(GetType<void>(), fmt::format("%s", ppu_syscall_code(index)), m_thread)->setTailCallKind(llvm::CallInst::TCK_Tail);
m_ir->CreateRetVoid();
return;
@ -2491,7 +2490,6 @@ void PPUTranslator::MFOCRF(ppu_opcode_t op)
if (pos >= 8 || 0x80u >> pos != op.crm)
{
CompilationError("MFOCRF: Undefined behaviour");
SetGpr(op.rd, UndefValue::get(GetType<u64>()));
return;
}
@ -2771,7 +2769,6 @@ void PPUTranslator::MTOCRF(ppu_opcode_t op)
if (pos >= 8 || 0x80u >> pos != op.crm)
{
CompilationError("MTOCRF: Undefined behaviour");
return;
}
}
@ -3220,7 +3217,6 @@ void PPUTranslator::LDBRX(ppu_opcode_t op)
void PPUTranslator::LSWX(ppu_opcode_t op)
{
CompilationError("Unsupported instruction LSWX. Please report.");
Call(GetType<void>(), "__lswx_not_supported", m_ir->getInt32(op.rd), RegLoad(m_cnt), op.ra ? m_ir->CreateAdd(GetGpr(op.ra), GetGpr(op.rb)) : GetGpr(op.rb));
}
@ -3338,7 +3334,6 @@ void PPUTranslator::STDBRX(ppu_opcode_t op)
void PPUTranslator::STSWX(ppu_opcode_t op)
{
CompilationError("Unsupported instruction STSWX. Please report.");
Call(GetType<void>(), "__stswx_not_supported", m_ir->getInt32(op.rs), RegLoad(m_cnt), op.ra ? m_ir->CreateAdd(GetGpr(op.ra), GetGpr(op.rb)) : GetGpr(op.rb));
}
@ -4154,8 +4149,6 @@ void PPUTranslator::FNMADDS(ppu_opcode_t op)
void PPUTranslator::MTFSB1(ppu_opcode_t op)
{
CompilationError("MTFSB1");
SetFPSCRBit(op.crbd, m_ir->getTrue(), true);
if (op.rc) SetCrFieldFPCC(1);
@ -4163,8 +4156,6 @@ void PPUTranslator::MTFSB1(ppu_opcode_t op)
void PPUTranslator::MCRFS(ppu_opcode_t op)
{
CompilationError("MCRFS");
const auto lt = GetFPSCRBit(op.crfs * 4 + 0);
const auto gt = GetFPSCRBit(op.crfs * 4 + 1);
const auto eq = GetFPSCRBit(op.crfs * 4 + 2);
@ -4174,8 +4165,6 @@ void PPUTranslator::MCRFS(ppu_opcode_t op)
void PPUTranslator::MTFSB0(ppu_opcode_t op)
{
CompilationError("MTFSB0");
SetFPSCRBit(op.crbd, m_ir->getFalse(), false);
if (op.rc) SetCrFieldFPCC(1);
@ -4183,8 +4172,6 @@ void PPUTranslator::MTFSB0(ppu_opcode_t op)
void PPUTranslator::MTFSFI(ppu_opcode_t op)
{
CompilationError("MTFSFI");
SetFPSCRBit(op.crfd * 4 + 0, m_ir->getInt1((op.i & 8) != 0), false);
if (op.crfd != 0) SetFPSCRBit(op.crfd * 4 + 1, m_ir->getInt1((op.i & 4) != 0), false);
if (op.crfd != 0) SetFPSCRBit(op.crfd * 4 + 2, m_ir->getInt1((op.i & 2) != 0), false);
@ -4195,8 +4182,6 @@ void PPUTranslator::MTFSFI(ppu_opcode_t op)
void PPUTranslator::MFFS(ppu_opcode_t op)
{
ppu_log.warning("LLVM: [0x%08x] Warning: MFFS", m_addr + (m_reloc ? m_reloc->addr : 0));
Value* result = m_ir->getInt64(0);
for (u32 i = 16; i < 20; i++)
@ -4211,8 +4196,6 @@ void PPUTranslator::MFFS(ppu_opcode_t op)
void PPUTranslator::MTFSF(ppu_opcode_t op)
{
ppu_log.warning("LLVM: [0x%08x] Warning: MTFSF", m_addr + (m_reloc ? m_reloc->addr : 0));
const auto value = GetFpr(op.frb, 32, true);
for (u32 i = 16; i < 20; i++)

View File

@ -32,6 +32,7 @@ struct cfg_root : cfg::node
cfg::_bool llvm_logs{ this, "Save LLVM logs" };
cfg::string llvm_cpu{ this, "Use LLVM CPU" };
cfg::_int<0, INT32_MAX> llvm_threads{ this, "Max LLVM Compile Threads", 0 };
cfg::_bool ppu_llvm_greedy_mode{ this, "PPU LLVM Greedy Mode", false, false };
cfg::_bool thread_scheduler_enabled{ this, "Enable thread scheduler", thread_scheduler_enabled_def };
cfg::_bool set_daz_and_ftz{ this, "Set DAZ and FTZ", false };
cfg::_enum<spu_decoder_type> spu_decoder{ this, "SPU Decoder", spu_decoder_type::llvm };