SPU: implement recompiler gateway function in assembly

Use GHC calling convention directly for SPU object entry points.
This may address performance degradation after #5923.
Nekotekina 2019-05-11 19:21:07 +03:00
parent a74fd27e3d
commit f33b81545e
3 changed files with 176 additions and 67 deletions
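
After this change, compiled SPU code, the trampolines and the dispatcher table all run under LLVM's GHC calling convention, while ordinary C++ keeps the native ABI and crosses over only through g_gateway. The entry-point type itself is not part of this diff; the sketch below is an assumption inferred from call sites such as g_gateway(*this, vm::_ptr<u8>(offset), nullptr) at the bottom of the commit.

// Assumed shape of spu_function_t (illustrative only): a plain function
// pointer, so C++ can call g_gateway directly, while GHC-convention
// functions share the type but not the actual convention.
using spu_function_t = void(*)(spu_thread&, void* ls, u8* rip);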

rpcs3/Emu/Cell/SPURecompiler.cpp

@@ -4,6 +4,8 @@
#include "Emu/Memory/vm.h"
#include "Crypto/sha1.h"
#include "Utilities/StrUtil.h"
#include "Utilities/JIT.h"
#include "Utilities/sysinfo.h"
#include "SPUThread.h"
#include "SPUAnalyser.h"
@@ -24,39 +26,154 @@ const spu_decoder<spu_iflag> s_spu_iflag;
extern u64 get_timebased_time();
// Move 4 args for calling native function from a GHC calling convention function
static u8* move_args_ghc_to_native(u8* raw)
{
#ifdef _WIN32
// mov rcx, r13
// mov rdx, rbp
// mov r8, r12
// mov r9, rbx
std::memcpy(raw, "\x4C\x89\xE9\x48\x89\xEA\x4D\x89\xE0\x49\x89\xD9", 12);
#else
// mov rdi, r13
// mov rsi, rbp
// mov rdx, r12
// mov rcx, rbx
std::memcpy(raw, "\x4C\x89\xEF\x48\x89\xEE\x4C\x89\xE2\x48\x89\xD9", 12);
#endif
return raw + 12;
}
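
The register choice above is not arbitrary: in LLVM's x86-64 lowering of the GHC convention, the first four integer arguments arrive in r13, rbp, r12 and rbx, so twelve bytes of mov instructions are enough to forward them to a native callee. A reference table (a sketch, not part of the commit):

// The four GHC argument registers next to the native argument registers
// the 12-byte stub above copies them into.
struct reg_map { const char* ghc; const char* win64; const char* sysv; };
constexpr reg_map ghc_args[] =
{
    {"r13", "rcx", "rdi"}, // arg 0: spu_thread*
    {"rbp", "rdx", "rsi"}, // arg 1: local storage base
    {"r12", "r8",  "rdx"}, // arg 2
    {"rbx", "r9",  "rcx"}, // arg 3
};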
DECLARE(spu_runtime::tr_dispatch) = []
{
// Generate a special trampoline to spu_recompiler_base::dispatch with pause instruction
u8* const trptr = jit_runtime::alloc(16, 16);
trptr[0] = 0xf3; // pause
trptr[1] = 0x90;
trptr[2] = 0xff; // jmp [rip]
trptr[3] = 0x25;
std::memset(trptr + 4, 0, 4);
u8* const trptr = jit_runtime::alloc(32, 16);
u8* raw = move_args_ghc_to_native(trptr);
*raw++ = 0xf3; // pause
*raw++ = 0x90;
*raw++ = 0xff; // jmp [rip]
*raw++ = 0x25;
std::memset(raw, 0, 4);
const u64 target = reinterpret_cast<u64>(&spu_recompiler_base::dispatch);
std::memcpy(trptr + 8, &target, 8);
std::memcpy(raw + 4, &target, 8);
return reinterpret_cast<spu_function_t>(trptr);
}();
DECLARE(spu_runtime::tr_branch) = []
{
// Generate a trampoline to spu_recompiler_base::branch
u8* const trptr = jit_runtime::alloc(16, 16);
trptr[0] = 0xff; // jmp [rip]
trptr[1] = 0x25;
std::memset(trptr + 2, 0, 4);
u8* const trptr = jit_runtime::alloc(32, 16);
u8* raw = move_args_ghc_to_native(trptr);
*raw++ = 0xff; // jmp [rip]
*raw++ = 0x25;
std::memset(raw, 0, 4);
const u64 target = reinterpret_cast<u64>(&spu_recompiler_base::branch);
std::memcpy(trptr + 6, &target, 8);
std::memcpy(raw + 4, &target, 8);
return reinterpret_cast<spu_function_t>(trptr);
}();
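
Both trampolines end in the same position-independent absolute jump: ff 25 with a zero displacement encodes jmp qword ptr [rip+0], which loads its 8-byte target from the bytes immediately after the instruction. A minimal sketch of that tail, written against a plain buffer instead of jit_runtime memory:

#include <cstdint>
#include <cstring>

// Emit "jmp qword ptr [rip+0]" followed by the 8-byte absolute target the
// instruction reads; returns the advanced write pointer.
static std::uint8_t* emit_abs_jmp(std::uint8_t* raw, const void* target)
{
    *raw++ = 0xff; // jmp [rip+disp32]
    *raw++ = 0x25;
    std::memset(raw, 0, 4); // disp32 = 0: target lies right after the jmp
    raw += 4;
    const std::uint64_t addr = reinterpret_cast<std::uint64_t>(target);
    std::memcpy(raw, &addr, 8);
    return raw + 8;
}

The same reasoning explains the g_dispatcher change below: the table is entered from GHC-convention code, so its slot must point at tr_dispatch (which moves the arguments first) rather than at spu_recompiler_base::dispatch itself.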
DECLARE(spu_runtime::g_dispatcher) = []
{
const auto ptr = reinterpret_cast<decltype(spu_runtime::g_dispatcher)>(jit_runtime::alloc(sizeof(spu_function_t), 8, false));
ptr->raw() = &spu_recompiler_base::dispatch;
ptr->raw() = tr_dispatch;
return ptr;
}();
DECLARE(spu_runtime::g_gateway) = build_function_asm<spu_function_t>([](asmjit::X86Assembler& c, auto& args)
{
// Gateway for SPU dispatcher, converts from native to GHC calling convention, also saves RSP value for spu_escape
using namespace asmjit;
#ifdef _WIN32
c.push(x86::r15);
c.push(x86::r14);
c.push(x86::r13);
c.push(x86::r12);
c.push(x86::rsi);
c.push(x86::rdi);
c.push(x86::rbp);
c.push(x86::rbx);
c.sub(x86::rsp, 0xa8);
c.movaps(x86::oword_ptr(x86::rsp, 0x90), x86::xmm15);
c.movaps(x86::oword_ptr(x86::rsp, 0x80), x86::xmm14);
c.movaps(x86::oword_ptr(x86::rsp, 0x70), x86::xmm13);
c.movaps(x86::oword_ptr(x86::rsp, 0x60), x86::xmm12);
c.movaps(x86::oword_ptr(x86::rsp, 0x50), x86::xmm11);
c.movaps(x86::oword_ptr(x86::rsp, 0x40), x86::xmm10);
c.movaps(x86::oword_ptr(x86::rsp, 0x30), x86::xmm9);
c.movaps(x86::oword_ptr(x86::rsp, 0x20), x86::xmm8);
c.movaps(x86::oword_ptr(x86::rsp, 0x10), x86::xmm7);
c.movaps(x86::oword_ptr(x86::rsp, 0), x86::xmm6);
#else
c.push(x86::rbp);
c.push(x86::r15);
c.push(x86::r14);
c.push(x86::r13);
c.push(x86::r12);
c.push(x86::rbx);
c.push(x86::rax);
#endif
// Load g_dispatcher pointer to call g_dispatcher[0]
c.mov(x86::rax, asmjit::imm_ptr(spu_runtime::g_dispatcher));
c.mov(x86::rax, x86::qword_ptr(x86::rax));
// Save native stack pointer for longjmp emulation
c.mov(x86::qword_ptr(args[0], ::offset32(&spu_thread::saved_native_sp)), x86::rsp);
// Move 4 args (despite spu_function_t def)
c.mov(x86::r13, args[0]);
c.mov(x86::rbp, args[1]);
c.mov(x86::r12, args[2]);
c.mov(x86::rbx, args[3]);
if (utils::has_avx())
{
c.vzeroupper();
}
c.call(x86::rax);
if (utils::has_avx())
{
c.vzeroupper();
}
#ifdef _WIN32
c.movaps(x86::xmm6, x86::oword_ptr(x86::rsp, 0));
c.movaps(x86::xmm7, x86::oword_ptr(x86::rsp, 0x10));
c.movaps(x86::xmm8, x86::oword_ptr(x86::rsp, 0x20));
c.movaps(x86::xmm9, x86::oword_ptr(x86::rsp, 0x30));
c.movaps(x86::xmm10, x86::oword_ptr(x86::rsp, 0x40));
c.movaps(x86::xmm11, x86::oword_ptr(x86::rsp, 0x50));
c.movaps(x86::xmm12, x86::oword_ptr(x86::rsp, 0x60));
c.movaps(x86::xmm13, x86::oword_ptr(x86::rsp, 0x70));
c.movaps(x86::xmm14, x86::oword_ptr(x86::rsp, 0x80));
c.movaps(x86::xmm15, x86::oword_ptr(x86::rsp, 0x90));
c.add(x86::rsp, 0xa8);
c.pop(x86::rbx);
c.pop(x86::rbp);
c.pop(x86::rdi);
c.pop(x86::rsi);
c.pop(x86::r12);
c.pop(x86::r13);
c.pop(x86::r14);
c.pop(x86::r15);
#else
c.add(x86::rsp, +8);
c.pop(x86::rbx);
c.pop(x86::r12);
c.pop(x86::r13);
c.pop(x86::r14);
c.pop(x86::r15);
c.pop(x86::rbp);
#endif
c.ret();
});
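
g_gateway is an ordinary native function, so it must preserve every callee-saved register of the host ABI (including xmm6-xmm15 on Win64) before handing its four arguments over in GHC registers, and it records rsp in spu_thread::saved_native_sp so spu_escape can later unwind straight back into its epilogue. The vzeroupper on both sides of the call presumably avoids AVX-to-SSE transition penalties around the movaps spills. The frame sizes also keep rsp 16-byte aligned; a sketch of that arithmetic, assuming the usual rsp % 16 == 8 at function entry:

// Alignment check for the two prologues above (illustrative).
// Win64: 8 pushes (64 bytes) + sub rsp, 0xa8 (168 bytes).
static_assert((8 + 64 + 168) % 16 == 0, "xmm spill area is 16-byte aligned");
// SysV: 6 register pushes plus the "push rax" filler (56 bytes in total);
// the epilogue's "add rsp, +8" discards the filler before the pops.
static_assert((8 + 56) % 16 == 0, "rsp is 16-byte aligned at the call");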
DECLARE(spu_runtime::g_interpreter) = nullptr;
spu_cache::spu_cache(const std::string& loc)
@@ -347,7 +464,7 @@ bool spu_runtime::func_compare::operator()(const std::vector<u32>& lhs, const st
spu_runtime::spu_runtime()
{
// Initialize "empty" block
m_map[std::vector<u32>()] = &spu_recompiler_base::dispatch;
m_map[std::vector<u32>()] = tr_dispatch;
// Clear LLVM output
m_cache_path = Emu.PPUCache();
@@ -412,7 +529,7 @@ bool spu_runtime::add(u64 last_reset_count, void* _where, spu_function_t compile
else
{
// Allocate some writable executable memory
u8* const wxptr = jit_runtime::alloc(size0 * 22 + 11, 16);
u8* const wxptr = jit_runtime::alloc(size0 * 22 + 14, 16);
if (!wxptr)
{
@@ -425,7 +542,7 @@ bool spu_runtime::add(u64 last_reset_count, void* _where, spu_function_t compile
// Write jump instruction with rel32 immediate
auto make_jump = [&](u8 op, auto target)
{
verify("Asm overflow" HERE), raw + 8 <= wxptr + size0 * 22;
verify("Asm overflow" HERE), raw + 8 <= wxptr + size0 * 22 + 16;
// Fallback to dispatch if no target
const u64 taddr = target ? reinterpret_cast<u64>(target) : reinterpret_cast<u64>(tr_dispatch);
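
The body of make_jump lies mostly outside this hunk; the sketch below shows the usual shape of such an emitter (an assumption, not the commit's exact code). The rel32 field is relative to the first byte after the displacement:

#include <cstdint>
#include <cstring>

// Hypothetical rel32 jump emitter in the spirit of make_jump above.
static std::uint8_t* emit_jump_rel32(std::uint8_t* raw, std::uint8_t op, const void* target)
{
    *raw++ = op; // e.g. 0xe9 for jmp rel32
    const auto* t = static_cast<const std::uint8_t*>(target);
    const std::int32_t rel32 = static_cast<std::int32_t>(t - (raw + 4));
    std::memcpy(raw, &rel32, 4);
    return raw + 4;
}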
@@ -460,26 +577,18 @@ bool spu_runtime::add(u64 last_reset_count, void* _where, spu_function_t compile
workload.back().beg = beg;
workload.back().end = _end;
// mov eax, [spu_thread::pc]
// Load PC: mov eax, [r13 + spu_thread::pc]
*raw++ = 0x41;
*raw++ = 0x8b;
#ifdef _WIN32
*raw++ = 0x81;
#else
*raw++ = 0x87;
#endif
const u32 pc_off = ::offset32(&spu_thread::pc);
std::memcpy(raw, &pc_off, 4);
raw += 4;
*raw++ = 0x45;
*raw++ = ::narrow<s8>(::offset32(&spu_thread::pc));
// lea r9, [ls + rax]
*raw++ = 0x4c;
// Get LS address starting from PC: lea rcx, [rbp + rax]
*raw++ = 0x48;
*raw++ = 0x8d;
*raw++ = 0x0c;
#ifdef _WIN32
*raw++ = 0x02;
#else
*raw++ = 0x06;
#endif
*raw++ = 0x4c;
*raw++ = 0x05;
*raw++ = 0x00;
for (std::size_t i = 0; i < workload.size(); i++)
{
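
Assembled, the dispatcher prologue emitted in the hunk above reads as follows. Because the thread and LS pointers already sit in GHC argument registers (r13 and rbp), the per-ABI #ifdef blocks of the old version disappear; the disp8 shown as 0x00 stands for offset32(&spu_thread::pc), which must fit in a signed byte:

// Byte-for-byte form of the prologue (illustrative):
//   41 8b 45 <disp8>   mov eax, [r13 + spu_thread::pc]  ; r13 = thread
//   48 8d 4c 05 00     lea rcx, [rbp + rax]             ; rbp = LS base
constexpr unsigned char dispatch_prologue[] =
{
    0x41, 0x8b, 0x45, 0x00,
    0x48, 0x8d, 0x4c, 0x05, 0x00,
};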
@@ -580,17 +689,26 @@ bool spu_runtime::add(u64 last_reset_count, void* _where, spu_function_t compile
}
// Emit 32-bit comparison
verify("Asm overflow" HERE), raw + 12 <= wxptr + size0 * 22;
verify("Asm overflow" HERE), raw + 12 <= wxptr + size0 * 22 + 16;
if (w.from != w.level)
{
// If necessary (level has advanced), emit load: mov eax, [r9 + addr]
*raw++ = 0x41;
*raw++ = 0x8b;
*raw++ = 0x81;
// If necessary (level has advanced), emit load: mov eax, [rcx + addr]
const u32 cmp_lsa = w.level * 4u;
std::memcpy(raw, &cmp_lsa, 4);
raw += 4;
if (cmp_lsa < 0x80)
{
*raw++ = 0x8b;
*raw++ = 0x41;
*raw++ = ::narrow<s8>(cmp_lsa);
}
else
{
*raw++ = 0x8b;
*raw++ = 0x81;
std::memcpy(raw, &cmp_lsa, 4);
raw += 4;
}
}
// Emit comparison: cmp eax, imm32
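
The branch above is the standard ModRM displacement optimization: mod=01 takes a sign-extended 8-bit displacement and mod=10 a full 32-bit one, so loads below offset 0x80 are three bytes shorter. The same choice as a standalone sketch:

#include <cstdint>
#include <cstring>

// mov eax, [rcx + off], picking the shortest displacement encoding.
static std::uint8_t* emit_load_eax(std::uint8_t* raw, std::uint32_t off)
{
    *raw++ = 0x8b;
    if (off < 0x80)
    {
        *raw++ = 0x41; // ModRM mod=01 (disp8), reg=eax, rm=rcx
        *raw++ = static_cast<std::uint8_t>(off);
    }
    else
    {
        *raw++ = 0x81; // ModRM mod=10 (disp32), reg=eax, rm=rcx
        std::memcpy(raw, &off, 4);
        raw += 4;
    }
    return raw;
}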
@@ -807,20 +925,15 @@ spu_function_t spu_runtime::make_branch_patchpoint(u32 target) const
return nullptr;
}
// Save address of the following jmp
#ifdef _WIN32
raw[0] = 0x4c; // lea r8, [rip+1]
// Save address of the following jmp (GHC CC 3rd argument)
raw[0] = 0x4c; // lea r12, [rip+1]
raw[1] = 0x8d;
raw[2] = 0x05;
#else
raw[0] = 0x48; // lea rdx, [rip+1]
raw[1] = 0x8d;
raw[2] = 0x15;
#endif
raw[2] = 0x25;
raw[3] = 0x01;
raw[4] = 0x00;
raw[5] = 0x00;
raw[6] = 0x00;
raw[7] = 0x90; // nop
// Jump to spu_recompiler_base::branch
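
The lea/nop pair makes the patchpoint self-describing: rip after the 7-byte lea points at the nop, so rip+1 is the address of the jmp that follows, and r12 (the third GHC argument register) hands the callee its own patch address, which spu_recompiler_base::branch can overwrite with a direct jump once the target is compiled. A sketch of the emission (the jmp itself sits outside this hunk):

#include <cstdint>
#include <cstring>

// Emit "lea r12, [rip+1]" plus a nop, leaving r12 = address of the next
// instruction (the patchable jmp).
static std::uint8_t* emit_patchpoint_lea(std::uint8_t* raw)
{
    const std::uint8_t lea[] = {0x4c, 0x8d, 0x25, 0x01, 0x00, 0x00, 0x00};
    std::memcpy(raw, lea, sizeof(lea));
    raw[7] = 0x90; // nop: skipped over by the +1 displacement
    return raw + 8;
}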
@@ -4003,6 +4116,7 @@ public:
// Add entry function (contains only state/code check)
const auto main_func = llvm::cast<llvm::Function>(m_module->getOrInsertFunction(hash, get_ftype<void, u8*, u8*, u8*>()).getCallee());
const auto main_arg2 = &*(main_func->arg_begin() + 2);
main_func->setCallingConv(CallingConv::GHC);
set_function(main_func);
// Start compilation
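
On the LLVM side the convention has to be set in two places, on the function and on every call site targeting it; a mismatch is undefined behavior. A minimal sketch with hypothetical names (ftype, module, builder and the argument values are assumed to exist):

// Only the two setCallingConv calls matter here.
llvm::Function* fn = llvm::Function::Create(
    ftype, llvm::GlobalValue::InternalLinkage, "spu_chunk", module);
fn->setCallingConv(llvm::CallingConv::GHC);

llvm::CallInst* ci = builder.CreateCall(fn, {thread, lsptr, base_pc});
ci->setCallingConv(fn->getCallingConv()); // must match the callee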
@@ -4130,17 +4244,10 @@ public:
const auto pbcount = spu_ptr<u64>(&spu_thread::block_counter);
m_ir->CreateStore(m_ir->CreateAdd(m_ir->CreateLoad(pbcount), m_ir->getInt64(check_iterations)), pbcount);
const auto gateway = llvm::cast<Function>(m_module->getOrInsertFunction("spu_chunk_gateway", get_ftype<void, u8*, u8*, u32>()).getCallee());
gateway->setLinkage(GlobalValue::InternalLinkage);
gateway->setCallingConv(CallingConv::GHC);
// Call the entry function chunk
const auto entry_chunk = add_function(m_pos);
tail_chunk(entry_chunk->chunk);
// Save host thread's stack pointer
const auto native_sp = spu_ptr<u64>(&spu_thread::saved_native_sp);
const auto rsp_name = MetadataAsValue::get(m_context, MDNode::get(m_context, {MDString::get(m_context, "rsp")}));
m_ir->CreateStore(m_ir->CreateCall(get_intrinsic<u64>(Intrinsic::read_register), {rsp_name}), native_sp);
m_ir->CreateCall(gateway, {m_thread, m_lsptr, m_base_pc})->setCallingConv(gateway->getCallingConv());
m_ir->CreateRetVoid();
m_ir->SetInsertPoint(label_stop);
m_ir->CreateRetVoid();
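
Because GHC lowering treats every register as caller-saved, LLVM can turn chunk-to-chunk transfers into genuine tail calls, which is what tail_chunk relies on; the spu_dispatch call in the next hunk gets the same treatment. A sketch of the pattern (helper names assumed):

// Tail-dispatch pattern: under GHC CC a tail call lowers to a plain jmp,
// so chained SPU chunks do not grow the host stack.
llvm::CallInst* ci = builder.CreateCall(chunk_fn, {thread, lsptr, pc});
ci->setCallingConv(llvm::CallingConv::GHC);
ci->setTailCall();
builder.CreateRetVoid();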
@@ -4150,7 +4257,9 @@ public:
{
const auto pbfail = spu_ptr<u64>(&spu_thread::block_failure);
m_ir->CreateStore(m_ir->CreateAdd(m_ir->CreateLoad(pbfail), m_ir->getInt64(1)), pbfail);
call("spu_dispatch", &spu_recompiler_base::dispatch, m_thread, m_ir->getInt32(0), main_arg2)->setTailCall();
const auto dispci = call("spu_dispatch", spu_runtime::tr_dispatch, m_thread, m_ir->getInt32(0), main_arg2);
dispci->setCallingConv(CallingConv::GHC);
dispci->setTailCall();
m_ir->CreateRetVoid();
}
else
@@ -4158,17 +4267,12 @@ public:
m_ir->CreateUnreachable();
}
set_function(gateway);
// Call the entry function chunk
const auto entry_chunk = add_function(m_pos);
tail_chunk(entry_chunk->chunk);
// Longjmp analogue (restore saved host thread's stack pointer)
const auto escape = llvm::cast<llvm::Function>(m_module->getOrInsertFunction("spu_escape", get_ftype<void, u8*>()).getCallee());
escape->setLinkage(GlobalValue::InternalLinkage);
m_ir->SetInsertPoint(BasicBlock::Create(m_context, "", escape));
const auto load_sp = m_ir->CreateLoad(_ptr<u64>(&*escape->arg_begin(), ::offset32(&spu_thread::saved_native_sp)));
const auto rsp_name = MetadataAsValue::get(m_context, MDNode::get(m_context, {MDString::get(m_context, "rsp")}));
m_ir->CreateCall(get_intrinsic<u64>(Intrinsic::write_register), {rsp_name, m_ir->CreateSub(load_sp, m_ir->getInt64(8))});
m_ir->CreateRetVoid();
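
spu_escape is the longjmp half of the pair: the gateway stored rsp immediately before its call instruction, so restoring saved_native_sp minus 8 recreates the stack exactly as that call left it, and the final ret pops the gateway's return address and resumes in its epilogue. A runnable analogy (illustrative only; the real code never leaves assembly and IR):

#include <csetjmp>

static std::jmp_buf gateway_env; // stands in for spu_thread::saved_native_sp

void spu_escape_analogue() { std::longjmp(gateway_env, 1); }

void gateway_analogue()
{
    if (setjmp(gateway_env) == 0) // ~ "mov [saved_native_sp], rsp"
    {
        // ... run GHC-convention SPU code; it leaves only via the escape
        spu_escape_analogue();
    }
    // ~ where "ret" with rsp = saved_native_sp - 8 resumes
}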

rpcs3/Emu/Cell/SPURecompiler.h

@@ -59,6 +59,8 @@ class spu_runtime
// Debug module output location
std::string m_cache_path;
public:
// Trampoline to spu_recompiler_base::dispatch
static const spu_function_t tr_dispatch;
@@ -100,6 +102,9 @@ public:
// All dispatchers (array allocated in jit memory)
static atomic_t<spu_function_t>* const g_dispatcher;
// Recompiler entry point
static const spu_function_t g_gateway;
// Interpreter entry point
static spu_function_t g_interpreter;

rpcs3/Emu/Cell/SPUThread.cpp

@@ -832,7 +832,7 @@ void spu_thread::cpu_task()
}
}
spu_runtime::g_dispatcher[0](*this, vm::_ptr<u8>(offset), nullptr);
spu_runtime::g_gateway(*this, vm::_ptr<u8>(offset), nullptr);
}
// Print some stats