SPU: optimize function dispatch in trampolines

Add a top-level hashtable: replace the single dispatcher entry with a 2^20-entry table of per-bucket trampolines, indexed by the top 20 bits of the first instruction word at the SPU program counter.
Nekotekina 2019-10-03 19:57:32 +03:00
parent 8031180373
commit 239f53568c
2 changed files with 83 additions and 43 deletions

SPURecompiler.cpp

@@ -91,11 +91,60 @@ DECLARE(spu_runtime::tr_interpreter) = []
 DECLARE(spu_runtime::g_dispatcher) = []
 {
-	const auto ptr = reinterpret_cast<decltype(spu_runtime::g_dispatcher)>(jit_runtime::alloc(sizeof(spu_function_t), 8, false));
-	ptr->raw() = tr_dispatch;
+	// Allocate 2^20 positions in data area
+	const auto ptr = reinterpret_cast<decltype(g_dispatcher)>(jit_runtime::alloc(sizeof(*g_dispatcher), 64, false));
+
+	for (auto& x : *ptr)
+	{
+		x.raw() = tr_dispatch;
+	}
+
 	return ptr;
 }();
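For orientation, a minimal sketch of how a slot in this table is picked (the helper name is hypothetical; the shift comes from tr_all below): a 32-bit instruction word shifted right by 12 leaves exactly a 20-bit index, so every possible first opcode maps to one of the 2^20 entries.

// Hypothetical helper: select the dispatcher slot for a function whose
// first instruction word is first_inst. No bounds check is needed since
// (u32 >> 12) < (1 << 20) always holds.
static atomic_t<spu_function_t>& dispatch_slot(u32 first_inst)
{
	return (*spu_runtime::g_dispatcher)[first_inst >> 12];
}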
+
+DECLARE(spu_runtime::tr_all) = []
+{
+	u8* const trptr = jit_runtime::alloc(32, 16);
+	u8* raw = trptr;
+
+	// Load PC: mov eax, [r13 + spu_thread::pc]
+	*raw++ = 0x41;
+	*raw++ = 0x8b;
+	*raw++ = 0x45;
+	*raw++ = ::narrow<s8>(::offset32(&spu_thread::pc));
+
+	// Get LS address starting from PC: lea rcx, [rbp + rax]
+	*raw++ = 0x48;
+	*raw++ = 0x8d;
+	*raw++ = 0x4c;
+	*raw++ = 0x05;
+	*raw++ = 0x00;
+
+	// mov eax, [rcx]
+	*raw++ = 0x8b;
+	*raw++ = 0x01;
+
+	// shr eax, (32 - 20)
+	*raw++ = 0xc1;
+	*raw++ = 0xe8;
+	*raw++ = 0x0c;
+
+	// Load g_dispatcher to rdx
+	*raw++ = 0x48;
+	*raw++ = 0x8d;
+	*raw++ = 0x15;
+	const s32 r32 = ::narrow<s32>(reinterpret_cast<u64>(g_dispatcher) - reinterpret_cast<u64>(raw) - 4, HERE);
+	std::memcpy(raw, &r32, 4);
+	raw += 4;
+
+	// jmp [rdx + rax * 8]
+	*raw++ = 0xff;
+	*raw++ = 0x24;
+	*raw++ = 0xc2;
+
+	return reinterpret_cast<spu_function_t>(trptr);
+}();
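In C++ terms, the 24 bytes of machine code emitted above behave roughly as follows (a sketch with stand-in types; the real spu_function_t takes its arguments in GHC-convention registers, with r13 holding the spu_thread and rbp the local storage base):

#include <array>
#include <atomic>
#include <cstdint>
#include <cstring>

// Stand-ins for illustration; the real types live in the SPU recompiler.
struct thread_view { std::uint32_t pc; };
using handler = void(*)(thread_view&, std::uint8_t* ls);
extern std::array<std::atomic<handler>, 1 << 20>* dispatch_table; // like g_dispatcher

void tr_all_sketch(thread_view& spu, std::uint8_t* ls_base)
{
	std::uint32_t inst;                                  // lea rcx, [rbp + rax]; mov eax, [rcx]
	std::memcpy(&inst, ls_base + spu.pc, sizeof(inst));
	const std::uint32_t idx = inst >> 12;                // shr eax, 12
	(*dispatch_table)[idx].load()(spu, ls_base);         // jmp [rdx + rax*8]
}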
 DECLARE(spu_runtime::g_gateway) = build_function_asm<spu_function_t>([](asmjit::X86Assembler& c, auto& args)
 {
 	// Gateway for SPU dispatcher, converts from native to GHC calling convention, also saves RSP value for spu_escape
@@ -131,9 +180,8 @@ DECLARE(spu_runtime::g_gateway) = build_function_asm<spu_function_t>([](asmjit::X86Assembler& c, auto& args)
 	c.push(x86::rax);
 #endif
 
-	// Load g_dispatcher pointer to call g_dispatcher[0]
-	c.mov(x86::rax, asmjit::imm_ptr(spu_runtime::g_dispatcher));
-	c.mov(x86::rax, x86::qword_ptr(x86::rax));
+	// Load tr_all function pointer to call actual compiled function
+	c.mov(x86::rax, asmjit::imm_ptr(spu_runtime::tr_all));
 
 	// Save native stack pointer for longjmp emulation
 	c.mov(x86::qword_ptr(args[0], ::offset32(&spu_thread::saved_native_sp)), x86::rsp);
@@ -300,7 +348,10 @@ void spu_cache::initialize()
 	if (g_cfg.core.spu_decoder == spu_decoder_type::precise || g_cfg.core.spu_decoder == spu_decoder_type::fast)
 	{
-		*spu_runtime::g_dispatcher = spu_runtime::tr_interpreter;
+		for (auto& x : *spu_runtime::g_dispatcher)
+		{
+			x.raw() = spu_runtime::tr_interpreter;
+		}
 	}
 
 	const std::string ppu_cache = Emu.PPUCache();
@@ -463,9 +514,6 @@ void spu_cache::initialize()
 	if (compilers.size() && !func_list.empty())
 	{
 		LOG_NOTICE(SPU, "SPU Runtime: Building trampoline...");
-		spu_runtime::g_dispatcher[0] = compilers[0]->get_runtime().rebuild_ubertrampoline();
-
 		LOG_SUCCESS(SPU, "SPU Runtime: Built %u functions.", func_list.size());
 	}
@@ -568,12 +616,12 @@ bool spu_runtime::add(u64 last_reset_count, void* _where, spu_function_t compiled)
 	// Register function in PIC map
 	m_pic_map[{func.data() + _off, func.size() - _off}] = compiled;
 
-	if (g_fxo->get<spu_cache>())
+	if (func.size() > 1)
 	{
 		// Rebuild trampolines if necessary
-		if (const auto new_tr = rebuild_ubertrampoline())
+		if (const auto new_tr = rebuild_ubertrampoline(func[1]))
 		{
-			g_dispatcher[0] = new_tr;
+			g_dispatcher->at(func[1] >> 12) = new_tr;
 		}
 		else
 		{
@@ -586,11 +634,17 @@ bool spu_runtime::add(u64 last_reset_count, void* _where, spu_function_t compiled)
 	return true;
 }
 
-spu_function_t spu_runtime::rebuild_ubertrampoline()
+spu_function_t spu_runtime::rebuild_ubertrampoline(u32 id_inst)
 {
 	// Prepare sorted list
 	m_flat_list.clear();
-	m_flat_list.assign(m_pic_map.cbegin(), m_pic_map.cend());
+
+	{
+		// Select required subrange (fixed 20 bits for single pos in g_dispatcher table)
+		const u32 id_lower = id_inst & ~0xfff;
+		const u32 id_upper = id_inst | 0xfff;
+		m_flat_list.assign(m_pic_map.lower_bound({&id_lower, 1}), m_pic_map.upper_bound({&id_upper, 1}));
+	}
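Since m_pic_map is ordered (as the lower_bound/upper_bound calls imply, keyed by spans of instruction words), the two bounds above bracket exactly the functions whose first instruction shares the top 20 bits of id_inst, i.e. the same dispatcher bucket that add() writes with func[1] >> 12. A self-contained check of that invariant:

#include <cassert>
#include <cstdint>

// Every opcode in [id_lower, id_upper] selects the same dispatcher slot.
void check_bucket(std::uint32_t id_inst)
{
	const std::uint32_t id_lower = id_inst & ~0xfffu; // smallest opcode in the bucket
	const std::uint32_t id_upper = id_inst | 0xfffu;  // largest opcode in the bucket
	assert(id_lower >> 12 == id_inst >> 12);
	assert(id_upper >> 12 == id_inst >> 12);
	assert(id_upper - id_lower == 0xfff);             // 4096 opcodes per bucket
}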
 	struct work
 	{
@@ -661,18 +715,7 @@ spu_function_t spu_runtime::rebuild_ubertrampoline()
 	workload.back().beg = beg;
 	workload.back().end = _end;
 
-	// Load PC: mov eax, [r13 + spu_thread::pc]
-	*raw++ = 0x41;
-	*raw++ = 0x8b;
-	*raw++ = 0x45;
-	*raw++ = ::narrow<s8>(::offset32(&spu_thread::pc));
-
-	// Get LS address starting from PC: lea rcx, [rbp + rax]
-	*raw++ = 0x48;
-	*raw++ = 0x8d;
-	*raw++ = 0x4c;
-	*raw++ = 0x05;
-	*raw++ = 0x00;
+	// LS address starting from PC is already loaded into rcx (see spu_runtime::tr_all)
 
 	for (std::size_t i = 0; i < workload.size(); i++)
 	{
@@ -1098,7 +1141,7 @@ void spu_recompiler_base::dispatch(spu_thread& spu, void*, u8* rip)
 	// If code verification failed from a patched patchpoint, clear it with a dispatcher jump
 	if (rip)
 	{
-		const s64 rel = reinterpret_cast<u64>(spu_runtime::g_dispatcher) - reinterpret_cast<u64>(rip - 8) - 6;
+		const s64 rel = reinterpret_cast<u64>(spu_runtime::tr_all) - reinterpret_cast<u64>(rip - 8) - 5;
 
 		union
 		{
@@ -1106,9 +1149,9 @@ void spu_recompiler_base::dispatch(spu_thread& spu, void*, u8* rip)
 			u8 bytes[8];
 			u64 result;
 		};
 
-		bytes[0] = 0xff; // jmp [rip + 0x...]
-		bytes[1] = 0x25;
-		std::memcpy(bytes + 2, &rel, 4);
+		bytes[0] = 0xe9; // jmp rel32
+		std::memcpy(bytes + 1, &rel, 4);
+		bytes[5] = 0x90;
 		bytes[6] = 0x90;
 		bytes[7] = 0x90;
@@ -1116,7 +1159,7 @@ void spu_recompiler_base::dispatch(spu_thread& spu, void*, u8* rip)
 	}
 
 	// Second attempt (recover from the recursion after repeated unsuccessful trampoline call)
-	if (spu.block_counter != spu.block_recover && &dispatch != spu_runtime::g_dispatcher[0])
+	if (spu.block_counter != spu.block_recover && &dispatch != spu_runtime::g_dispatcher->at(spu._ref<nse_t<u32>>(spu.pc) >> 12))
 	{
 		spu.block_recover = spu.block_counter;
 		return;
@@ -4388,13 +4431,8 @@ public:
 	const auto entry_call = m_ir->CreateCall(entry_chunk->chunk, {m_thread, m_lsptr, m_base_pc});
 	entry_call->setCallingConv(entry_chunk->chunk->getCallingConv());
 
-#ifdef _WIN32
-	// TODO: fix this mess
-	const auto dispatcher = m_ir->CreateIntToPtr(m_ir->getInt64((u64)+spu_runtime::g_dispatcher), get_type<u8**>());
-#else
-	const auto dispatcher = new llvm::GlobalVariable(*m_module, get_type<u8*>(), true, GlobalValue::ExternalLinkage, nullptr, "spu_dispatcher");
-	m_engine->addGlobalMapping("spu_dispatcher", (u64)+spu_runtime::g_dispatcher);
-#endif
+	const auto dispatcher = llvm::cast<llvm::Function>(m_module->getOrInsertFunction("spu_dispatcher", main_func->getType()).getCallee());
+	m_engine->addGlobalMapping("spu_dispatcher", reinterpret_cast<u64>(spu_runtime::tr_all));
 
 	// Proceed to the next code
 	if (entry_chunk->chunk->getReturnType() != get_type<void>())
@@ -4436,15 +4474,14 @@ public:
 	if (entry_chunk->chunk->getReturnType() == get_type<void>())
 	{
-		const auto next_func = m_ir->CreateLoad(dispatcher);
-		const auto next_call = m_ir->CreateCall(m_ir->CreateBitCast(next_func, main_func->getType()), {m_thread, m_lsptr, m_ir->getInt64(0)});
+		const auto next_call = m_ir->CreateCall(m_ir->CreateBitCast(dispatcher, main_func->getType()), {m_thread, m_lsptr, m_ir->getInt64(0)});
 		next_call->setCallingConv(main_func->getCallingConv());
 		next_call->setTailCall();
 		m_ir->CreateRetVoid();
 	}
 	else
 	{
-		m_ir->CreateRet(m_ir->CreateLoad(dispatcher));
+		m_ir->CreateRet(m_ir->CreateBitCast(dispatcher, get_type<u8*>()));
 	}
 
 	// Function that executes check_state and escapes if necessary
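Conceptually, the generated chunk epilogue above changes from "load the current trampoline out of g_dispatcher[0], then tail-call it" to a direct tail call into tr_all, which re-reads the PC and re-dispatches through the new table itself. A stand-in sketch of the control flow the IR now expresses (names hypothetical; the real code emits LLVM IR, not C++):

struct thread_view;
using handler = void(*)(thread_view&, void* ls, unsigned long long pc);
extern const handler tr_all_fn; // bound to spu_runtime::tr_all via addGlobalMapping

void chunk_epilogue(thread_view& spu, void* ls)
{
	// Old: handler next = *g_dispatcher; next(spu, ls, 0);
	tr_all_fn(spu, ls, 0); // new: emitted as a GHC-convention tail call
}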

SPURecompiler.h

@@ -77,6 +77,9 @@ public:
 	// Trampoline to legacy interpreter
 	static const spu_function_t tr_interpreter;
 
+	// Detect and call any recompiled function
+	static const spu_function_t tr_all;
+
 public:
 	spu_runtime();
@@ -93,7 +96,7 @@ public:
 	bool add(u64 last_reset_count, void* where, spu_function_t compiled);
 
 private:
-	spu_function_t rebuild_ubertrampoline();
+	spu_function_t rebuild_ubertrampoline(u32 id_inst);
 
 	friend class spu_cache;
 public:
@@ -120,7 +123,7 @@ public:
 	void handle_return(spu_thread* _spu);
 
 	// All dispatchers (array allocated in jit memory)
-	static atomic_t<spu_function_t>* const g_dispatcher;
+	static std::array<atomic_t<spu_function_t>, (1 << 20)>* const g_dispatcher;
 
 	// Recompiler entry point
 	static const spu_function_t g_gateway;
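For scale, the table declared above costs 2^20 slots of one function pointer each, i.e. 8 MiB of jit data on x86-64; a quick self-contained check:

#include <array>
#include <atomic>
#include <cstdio>

using spu_function_t = void(*)(); // stand-in; only the pointer size matters
using dispatch_table = std::array<std::atomic<spu_function_t>, 1 << 20>;

int main()
{
	// Prints 8388608 bytes (8 MiB) on typical 64-bit targets.
	std::printf("%zu bytes (%zu MiB)\n", sizeof(dispatch_table), sizeof(dispatch_table) >> 20);
}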