diff --git a/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp b/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp index 2217518cce..d1515849a4 100644 --- a/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp +++ b/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp @@ -67,24 +67,11 @@ spu_function_t spu_recompiler::compile(u64 last_reset_count, const std::vector(func.data() + 1); - - if (g_cfg.core.spu_block_size != spu_block_size_type::giga) - { - dis_asm.offset -= func[0]; - } - StringLogger logger; logger.addOptions(Logger::kOptionBinaryForm); std::string log; - if (g_cfg.core.spu_debug) - { - fmt::append(log, "========== SPU BLOCK 0x%05x (size %u) ==========\n\n", func[0], func.size() - 1); - } - CodeHolder code; code.init(m_asmrt.getCodeInfo()); code._globalHints = asmjit::CodeEmitter::kHintOptimizedAlign; @@ -94,29 +81,35 @@ spu_function_t spu_recompiler::compile(u64 last_reset_count, const std::vectordump(log); + fs::file(m_spurt->get_cache_path() + "spu.log", fs::write + fs::append).write(log); + // Set logger code.setLogger(&logger); } - // Initialize variables -#ifdef _WIN32 - this->cpu = &x86::rcx; - this->ls = &x86::rdx; -#else - this->cpu = &x86::rdi; - this->ls = &x86::rsi; -#endif + // Initialize args + this->cpu = &x86::r13; + this->ls = &x86::rbp; + this->rip = &x86::r12; + this->pc0 = &x86::r15; this->addr = &x86::eax; + #ifdef _WIN32 + this->arg0 = &x86::rcx; + this->arg1 = &x86::rdx; this->qw0 = &x86::r8; this->qw1 = &x86::r9; #else + this->arg0 = &x86::rdi; + this->arg1 = &x86::rsi; this->qw0 = &x86::rdx; this->qw1 = &x86::rcx; #endif - const std::array vec_vars + const std::array vec_vars { &x86::xmm0, &x86::xmm1, @@ -124,6 +117,16 @@ spu_function_t spu_recompiler::compile(u64 last_reset_count, const std::vectornewLabel(); } } - // Set PC and check status - c->mov(SPU_OFF_32(pc), m_pos); + // Load actual PC and check status + c->push(x86::rax); + c->mov(pc0->r32(), SPU_OFF_32(pc)); c->cmp(SPU_OFF_32(state), 0); c->jnz(label_stop); @@ -183,6 +188,36 @@ spu_function_t spu_recompiler::compile(u64 last_reset_count, const std::vectorlea(x86::rax, get_pc(starta)); + c->and_(x86::eax, 0x3fffc); + return x86::qword_ptr(*ls, x86::rax); + } + else + { + return x86::qword_ptr(*ls, *pc0); + } + }; + if (!g_cfg.core.spu_verification) { // Disable check (unsafe) @@ -191,9 +226,10 @@ spu_function_t spu_recompiler::compile(u64 last_reset_count, const std::vectorvzeroupper(); } } - else if (m_size == 4) + else if (m_size == 8 && (g_cfg.core.spu_block_size != spu_block_size_type::giga || func[0] != 4)) { - c->cmp(x86::dword_ptr(*ls, start), func[1]); + c->mov(x86::rax, static_cast(func[2]) << 32 | func[1]); + c->cmp(x86::rax, x86::qword_ptr(*ls, *pc0)); c->jnz(label_diff); if (utils::has_avx()) @@ -201,10 +237,9 @@ spu_function_t spu_recompiler::compile(u64 last_reset_count, const std::vectorvzeroupper(); } } - else if (m_size == 8) + else if (m_size == 8 || m_size == 4) { - c->mov(*qw1, static_cast(func[2]) << 32 | func[1]); - c->cmp(*qw1, x86::qword_ptr(*ls, start)); + c->cmp(x86::dword_ptr(*ls, *pc0), +func.back()); c->jnz(label_diff); if (utils::has_avx()) @@ -293,7 +328,7 @@ spu_function_t spu_recompiler::compile(u64 last_reset_count, const std::vectorjnz(label_diff); c->vzeroupper(); } - else if (utils::has_512()) + else if (0 && utils::has_512()) { // AVX-512 optimized check using 256-bit registers words_align = 32; @@ -415,7 +450,7 @@ spu_function_t spu_recompiler::compile(u64 last_reset_count, const std::vectorvzeroupper(); } - else if (utils::has_avx()) + else if (0 && utils::has_avx()) { // Mainstream AVX words_align = 32; @@ -566,20 +601,15 @@ spu_function_t spu_recompiler::compile(u64 last_reset_count, const std::vectoradd(*ls, starta); + c->lea(x86::rcx, get_pc_ptr()); c->lea(x86::rax, x86::qword_ptr(label_code)); u32 code_off = 0; u32 ls_off = starta; u32 order0 = 0; u32 order1 = 0; - for (u32 j = starta; j < enda; j += 16) + for (u32 j = starta; j < end; j += 16) { const u32 cmask = get_code_mask(j, j + 16); @@ -596,12 +626,12 @@ spu_function_t spu_recompiler::compile(u64 last_reset_count, const std::vector= 256) { - c->add(*ls, j - ls_off); + c->add(x86::rcx, j - ls_off); ls_off = j; } else if (j - ls_off >= 128) { - c->sub(*ls, -128); + c->sub(x86::rcx, -128); ls_off += 128; } @@ -643,14 +673,22 @@ spu_function_t spu_recompiler::compile(u64 last_reset_count, const std::vectorpshufd(dest, x86::dqword_ptr(*ls, j - ls_off), s_pshufd_imm[cmask]); + if (utils::has_avx()) + { + c->vpshufd(dest, x86::dqword_ptr(x86::rcx, j - ls_off), s_pshufd_imm[cmask]); + } + else + { + c->movups(dest, x86::dqword_ptr(x86::rcx, j - ls_off)); + c->pshufd(dest, dest, s_pshufd_imm[cmask]); + } } else { - c->movaps(dest, x86::dqword_ptr(*ls, j - ls_off)); + c->movups(dest, x86::dqword_ptr(x86::rcx, j - ls_off)); } // Perform bitwise comparison and accumulate @@ -669,8 +707,6 @@ spu_function_t spu_recompiler::compile(u64 last_reset_count, const std::vectororps(x86::xmm0, x86::xmm3); } - c->sub(*ls, ls_off); - if (utils::has_sse41()) { c->ptest(x86::xmm0, x86::xmm0); @@ -700,28 +736,10 @@ spu_function_t spu_recompiler::compile(u64 last_reset_count, const std::vector::swap(func[i]); - if (g_cfg.core.spu_debug) - { - // Disasm - dis_asm.dump_pc = pos; - dis_asm.disasm(pos); - - if (op) - { - log += '>'; - log += dis_asm.last_opcode; - log += '\n'; - } - else - { - fmt::append(log, ">[%08x] xx xx xx xx: \n", pos); - } - } - if (!op) { // Ignore hole - if (m_pos != -1) + if (m_pos + 1) { LOG_ERROR(SPU, "Unexpected fallthrough to 0x%x", pos); branch_fixed(spu_branch_target(pos)); @@ -749,10 +767,13 @@ spu_function_t spu_recompiler::compile(u64 last_reset_count, const std::vectorlea(x86::r14, get_pc(m_pos)); + // Execute recompiler function (this->*s_spu_decoder.decode(op))({op}); @@ -763,14 +784,8 @@ spu_function_t spu_recompiler::compile(u64 last_reset_count, const std::vectordump(log); - } - // Make fallthrough if necessary - if (m_pos != -1) + if (m_pos + 1) { branch_fixed(spu_branch_target(end)); } @@ -778,6 +793,7 @@ spu_function_t spu_recompiler::compile(u64 last_reset_count, const std::vectoralign(kAlignCode, 16); c->bind(label_stop); + c->pop(x86::rax); c->ret(); if (g_cfg.core.spu_verification) @@ -786,7 +802,8 @@ spu_function_t spu_recompiler::compile(u64 last_reset_count, const std::vectoralign(kAlignCode, 16); c->bind(label_diff); c->inc(SPU_OFF_64(block_failure)); - c->jmp(imm_ptr(&spu_recompiler_base::dispatch)); + c->pop(x86::rax); + c->jmp(imm_ptr(spu_runtime::tr_dispatch)); } for (auto&& work : decltype(after)(std::move(after))) @@ -860,7 +877,7 @@ spu_function_t spu_recompiler::compile(u64 last_reset_count, const std::vectorget_cache_path() + "spu.log", fs::write + fs::append).write(log); + fs::file(m_spurt->get_cache_path() + "spu-ir.log", fs::write + fs::append).write(log); } return fn; @@ -922,19 +939,17 @@ inline asmjit::X86Mem spu_recompiler::XmmConst(__m128i data) return XmmConst(v128::fromV(data)); } -static void check_state_ret(spu_thread& _spu, void*, u8*) +inline asmjit::X86Mem spu_recompiler::get_pc(u32 addr) { - // MSVC workaround (TCO) + return asmjit::x86::qword_ptr(*pc0, addr - m_base); } -static void check_state(spu_thread* _spu, spu_function_t _ret) +static void check_state(spu_thread* _spu) { if (_spu->state && _spu->check_state()) { - _ret = &check_state_ret; + spu_runtime::g_escape(_spu); } - - _ret(*_spu, _spu->_ptr(0), nullptr); } void spu_recompiler::branch_fixed(u32 target) @@ -948,26 +963,30 @@ void spu_recompiler::branch_fixed(u32 target) { c->cmp(SPU_OFF_32(state), 0); c->jz(local->second); - c->mov(SPU_OFF_32(pc), target); - c->lea(*ls, x86::qword_ptr(local->second)); - c->jmp(imm_ptr(&check_state)); + c->lea(addr->r64(), get_pc(target)); + c->mov(SPU_OFF_32(pc), *addr); + c->mov(*arg0, *cpu); + c->call(imm_ptr(&check_state)); + c->jmp(local->second); return; } - const auto ppptr = m_spurt->make_branch_patchpoint(); + const auto ppptr = g_cfg.core.spu_block_size == spu_block_size_type::giga || !g_cfg.core.spu_verification ? nullptr : m_spurt->make_branch_patchpoint(); - c->mov(SPU_OFF_32(pc), target); - c->xor_(qw0->r32(), qw0->r32()); + c->lea(addr->r64(), get_pc(target)); + c->mov(SPU_OFF_32(pc), *addr); + c->xor_(rip->r32(), rip->r32()); c->cmp(SPU_OFF_32(state), 0); c->jnz(label_stop); if (ppptr) { + c->pop(x86::rax); c->jmp(imm_ptr(ppptr)); } else { - c->ret(); + c->jmp(label_stop); } } @@ -976,7 +995,7 @@ void spu_recompiler::branch_indirect(spu_opcode_t op, bool jt, bool ret) using namespace asmjit; // Initialize third arg to zero - c->xor_(qw0->r32(), qw0->r32()); + c->xor_(rip->r32(), rip->r32()); if (op.d) { @@ -986,7 +1005,9 @@ void spu_recompiler::branch_indirect(spu_opcode_t op, bool jt, bool ret) { auto _throw = [](spu_thread* _spu) { - fmt::throw_exception("SPU Interrupts not implemented (mask=0x%x)" HERE, +_spu->ch_event_mask); + _spu->state += cpu_flag::dbg_pause; + LOG_FATAL(SPU, "SPU Interrupts not implemented (mask=0x%x)", +_spu->ch_event_mask); + spu_runtime::g_escape(_spu); }; Label no_intr = c->newLabel(); @@ -1003,6 +1024,8 @@ void spu_recompiler::branch_indirect(spu_opcode_t op, bool jt, bool ret) c->jmp(no_intr); c->bind(fail); c->mov(SPU_OFF_32(pc), *addr); + c->mov(*arg0, *cpu); + c->pop(x86::rax); c->jmp(imm_ptr(_throw)); // Save addr in srr0 and disable interrupts @@ -1016,19 +1039,17 @@ void spu_recompiler::branch_indirect(spu_opcode_t op, bool jt, bool ret) c->xor_(*addr, 0x30); c->bswap(*addr); c->test(*addr, 0xff80007f); - c->cmovnz(*addr, qw0->r32()); + c->cmovnz(*addr, rip->r32()); c->shr(*addr, 5); c->align(kAlignCode, 16); c->bind(no_intr); } - if (!jt && g_cfg.core.spu_block_size != spu_block_size_type::giga) - { - // Simply external call (return or indirect call) - c->mov(x86::r10, imm_ptr(spu_runtime::g_dispatcher)); - c->mov(x86::r10, x86::qword_ptr(x86::r10)); - } - else + c->mov(SPU_OFF_32(pc), *addr); + c->cmp(SPU_OFF_32(state), 0); + c->jnz(label_stop); + + if (jt || g_cfg.core.spu_block_size == spu_block_size_type::giga) { if (!instr_table.isValid()) { @@ -1040,34 +1061,44 @@ void spu_recompiler::branch_indirect(spu_opcode_t op, bool jt, bool ret) const u32 start = instr_labels.begin()->first; const u32 end = instr_labels.rbegin()->first + 4; - // Load indirect jump address, choose between local and external - c->lea(*qw1, x86::qword_ptr(addr->r64(), 0 - start)); - c->lea(x86::r10, x86::qword_ptr(instr_table)); + // Load local indirect jump address, check local bounds + Label fail = c->newLabel(); + c->lea(*qw1, get_pc(start)); + c->neg(*qw1); + c->lea(*qw1, x86::qword_ptr(addr->r64(), *qw1)); + c->lea(addr->r64(), x86::qword_ptr(instr_table)); c->cmp(qw1->r32(), end - start); - c->lea(x86::r10, x86::qword_ptr(x86::r10, *qw1, 1, 0)); - c->mov(*qw1, imm_ptr(spu_runtime::g_dispatcher)); - c->cmovae(x86::r10, *qw1); - c->mov(x86::r10, x86::qword_ptr(x86::r10)); + c->jae(fail); + c->jmp(x86::qword_ptr(addr->r64(), *qw1, 1, 0)); + c->bind(fail); } if (g_cfg.core.spu_block_size != spu_block_size_type::safe && ret) { // Get stack pointer, try to use native return address (check SPU return address) + Label fail = c->newLabel(); c->mov(qw1->r32(), SPU_OFF_32(gpr, 1, &v128::_u32, 3)); c->and_(qw1->r32(), 0x3fff0); c->lea(*qw1, x86::qword_ptr(*cpu, *qw1, 0, ::offset32(&spu_thread::stack_mirror))); c->cmp(x86::dword_ptr(*qw1, 8), *addr); - c->cmove(x86::r10, x86::qword_ptr(*qw1)); + c->jne(fail); + c->mov(pc0->r32(), x86::dword_ptr(*qw1, 12)); + c->jmp(x86::qword_ptr(*qw1)); + c->bind(fail); } - Label label_check = c->newLabel(); - c->mov(SPU_OFF_32(pc), *addr); - c->cmp(SPU_OFF_32(state), 0); - c->jnz(label_check); - c->jmp(x86::r10); - c->bind(label_check); - c->mov(*ls, x86::r10); - c->jmp(imm_ptr(&check_state)); + // Simply external call (return or indirect call) + const auto ppptr = g_cfg.core.spu_block_size == spu_block_size_type::giga || !g_cfg.core.spu_verification ? nullptr : m_spurt->make_branch_patchpoint(); + + if (ppptr) + { + c->pop(x86::rax); + c->jmp(imm_ptr(ppptr)); + } + else + { + c->jmp(label_stop); + } } void spu_recompiler::branch_set_link(u32 target) @@ -1089,7 +1120,9 @@ void spu_recompiler::branch_set_link(u32 target) c->lea(*qw1, x86::qword_ptr(*cpu, *qw1, 0, ::offset32(&spu_thread::stack_mirror))); c->lea(x86::r10, x86::qword_ptr(ret)); c->mov(x86::qword_ptr(*qw1, 0), x86::r10); - c->mov(x86::qword_ptr(*qw1, 8), target); + c->lea(x86::r10, get_pc(target)); + c->mov(x86::dword_ptr(*qw1, 8), x86::r10d); + c->mov(x86::dword_ptr(*qw1, 12), pc0->r32()); after.emplace_back([=, target = local->second] { @@ -1108,42 +1141,22 @@ void spu_recompiler::branch_set_link(u32 target) void spu_recompiler::fall(spu_opcode_t op) { - auto gate = [](spu_thread* _spu, u32 opcode, spu_inter_func_t _func, spu_function_t _ret) + auto gate = [](spu_thread* _spu, u32 opcode, spu_inter_func_t _func) { if (!_func(*_spu, {opcode})) { - // Workaround for MSVC (TCO) - fmt::raw_error("spu_recompiler::fall(): unexpected interpreter call"); + _spu->state += cpu_flag::dbg_pause; + LOG_FATAL(SPU, "spu_recompiler::fall(): unexpected interpreter call (op=0x%08x)", opcode); + spu_runtime::g_escape(_spu); } - - // Restore arguments and return to the next instruction - _ret(*_spu, _spu->_ptr(0), nullptr); }; - asmjit::Label next = c->newLabel(); - c->mov(SPU_OFF_32(pc), m_pos); - c->mov(*ls, op.opcode); + c->lea(addr->r64(), get_pc(m_pos)); + c->mov(SPU_OFF_32(pc), *addr); + c->mov(arg1->r32(), op.opcode); c->mov(*qw0, asmjit::imm_ptr(asmjit::Internal::ptr_cast(g_spu_interpreter_fast.decode(op.opcode)))); - c->lea(*qw1, asmjit::x86::qword_ptr(next)); - c->jmp(asmjit::imm_ptr(gate)); - c->align(asmjit::kAlignCode, 16); - c->bind(next); -} - -void spu_recompiler::save_rcx() -{ -#ifdef _WIN32 - c->mov(asmjit::x86::r11, *cpu); - cpu = &asmjit::x86::r11; -#endif -} - -void spu_recompiler::load_rcx() -{ -#ifdef _WIN32 - cpu = &asmjit::x86::rcx; - c->mov(*cpu, asmjit::x86::r11); -#endif + c->mov(*arg0, *cpu); + c->call(asmjit::imm_ptr(gate)); } void spu_recompiler::get_events() @@ -1230,20 +1243,18 @@ void spu_recompiler::get_events() // Check decrementer event (unlikely) after.emplace_back([=] { - auto sub = [](spu_thread* _spu, spu_function_t _ret) + auto sub = [](spu_thread* _spu) { if ((_spu->ch_dec_value - (get_timebased_time() - _spu->ch_dec_start_timestamp)) >> 31) { _spu->ch_event_stat |= SPU_EVENT_TM; } - - // Restore args and return - _ret(*_spu, _spu->_ptr(0), nullptr); }; c->bind(tcheck); - c->lea(*ls, x86::qword_ptr(label2)); - c->jmp(imm_ptr(sub)); + c->mov(*arg0, *cpu); + c->call(imm_ptr(sub)); + c->jmp(label2); }); // Check whether SPU_EVENT_TM is already set @@ -1264,10 +1275,14 @@ void spu_recompiler::get_events() { auto _throw = [](spu_thread* _spu) { - fmt::throw_exception("SPU Events not implemented (mask=0x%x)" HERE, +_spu->ch_event_mask); + _spu->state += cpu_flag::dbg_pause; + LOG_FATAL(SPU, "SPU Events not implemented (mask=0x%x).", +_spu->ch_event_mask); + spu_runtime::g_escape(_spu); }; c->bind(fail); + c->mov(*arg0, *cpu); + c->pop(x86::rax); c->jmp(imm_ptr(_throw)); }); @@ -1284,23 +1299,26 @@ void spu_recompiler::UNK(spu_opcode_t op) { auto gate = [](spu_thread* _spu, u32 op) { - fmt::throw_exception("Unknown/Illegal instruction (0x%08x)" HERE, op); + _spu->state += cpu_flag::dbg_pause; + LOG_FATAL(SPU, "Unknown/Illegal instruction (0x%08x)" HERE, op); + spu_runtime::g_escape(_spu); }; - c->mov(SPU_OFF_32(pc), m_pos); - c->mov(*ls, op.opcode); + c->lea(addr->r64(), get_pc(m_pos)); + c->mov(SPU_OFF_32(pc), *addr); + c->mov(arg1->r32(), op.opcode); + c->mov(*arg0, *cpu); + c->pop(asmjit::x86::rax); c->jmp(asmjit::imm_ptr(gate)); m_pos = -1; } -void spu_stop(spu_thread* _spu, u32 code, spu_function_t _ret) +void spu_stop(spu_thread* _spu, u32 code) { if (!_spu->stop_and_signal(code)) { - _ret = check_state_ret; + spu_runtime::g_escape(_spu); } - - _ret(*_spu, _spu->_ptr(0), nullptr); } void spu_recompiler::STOP(spu_opcode_t op) @@ -1308,31 +1326,28 @@ void spu_recompiler::STOP(spu_opcode_t op) using namespace asmjit; Label ret = c->newLabel(); - c->mov(SPU_OFF_32(pc), m_pos); - c->mov(*ls, op.opcode & 0x3fff); - c->lea(*qw0, x86::qword_ptr(ret)); - c->jmp(imm_ptr(spu_stop)); + c->lea(addr->r64(), get_pc(m_pos)); + c->mov(SPU_OFF_32(pc), *addr); + c->mov(arg1->r32(), op.opcode & 0x3fff); + c->mov(*arg0, *cpu); + c->call(imm_ptr(spu_stop)); c->align(kAlignCode, 16); c->bind(ret); - c->mov(SPU_OFF_32(pc), m_pos + 4); + c->add(SPU_OFF_32(pc), 4); if (g_cfg.core.spu_block_size == spu_block_size_type::safe) { - c->ret(); + c->jmp(label_stop); m_pos = -1; } else { Label label_next = c->newLabel(); - Label label_check = c->newLabel(); c->cmp(SPU_OFF_32(state), 0); - c->jnz(label_check); - c->jmp(label_next); - c->bind(label_check); - c->lea(*ls, x86::qword_ptr(label_next)); - c->jmp(imm_ptr(&check_state)); - c->align(kAlignCode, 16); + c->jz(label_next); + c->mov(*arg0, *cpu); + c->call(imm_ptr(&check_state)); c->bind(label_next); } } @@ -1348,8 +1363,9 @@ void spu_recompiler::SYNC(spu_opcode_t op) if (g_cfg.core.spu_block_size == spu_block_size_type::safe) { - c->mov(SPU_OFF_32(pc), m_pos + 4); - c->ret(); + c->lea(addr->r64(), get_pc(m_pos + 4)); + c->mov(SPU_OFF_32(pc), *addr); + c->jmp(label_stop); m_pos = -1; } } @@ -1368,22 +1384,16 @@ void spu_recompiler::MFSPR(spu_opcode_t op) c->movdqa(SPU_OFF_128(gpr, op.rt), vr); } -static void spu_rdch_ret(spu_thread& spu, void*, u32) -{ - // MSVC workaround (TCO) -} - -static void spu_rdch(spu_thread* _spu, u32 ch, void(*_ret)(spu_thread&, void*, u32)) +static s64 spu_rdch(spu_thread* _spu, u32 ch) { const s64 result = _spu->get_ch_value(ch); if (result < 0) { - _ret = &spu_rdch_ret; + spu_runtime::g_escape(_spu); } - // Return channel value in the third argument - _ret(*_spu, _spu->_ptr(0), static_cast(result)); + return result; } void spu_recompiler::RDCH(spu_opcode_t op) @@ -1405,10 +1415,12 @@ void spu_recompiler::RDCH(spu_opcode_t op) after.emplace_back([=, pos = m_pos] { c->bind(wait); - c->mov(SPU_OFF_32(pc), pos); - c->mov(ls->r32(), op.ra); - c->lea(*qw0, x86::qword_ptr(ret)); - c->jmp(imm_ptr(spu_rdch)); + c->lea(addr->r64(), get_pc(pos)); + c->mov(SPU_OFF_32(pc), *addr); + c->mov(arg1->r32(), op.ra); + c->mov(*arg0, *cpu); + c->call(imm_ptr(spu_rdch)); + c->jmp(ret); }); if (sync) @@ -1423,9 +1435,8 @@ void spu_recompiler::RDCH(spu_opcode_t op) c->mov(channel_ptr, *qw0); } - c->mov(qw0->r32(), *addr); c->bind(ret); - c->movd(x86::xmm0, qw0->r32()); + c->movd(x86::xmm0, *addr); c->pslldq(x86::xmm0, 12); c->movdqa(SPU_OFF_128(gpr, op.rt), x86::xmm0); }; @@ -1482,7 +1493,7 @@ void spu_recompiler::RDCH(spu_opcode_t op) { LOG_WARNING(SPU, "[0x%x] RDCH: RdDec", m_pos); - auto sub1 = [](spu_thread* _spu, v128* _res, spu_function_t _ret) + auto sub1 = [](spu_thread* _spu, v128* _res) { const u32 out = _spu->ch_dec_value - static_cast(get_timebased_time() - _spu->ch_dec_start_timestamp); @@ -1490,26 +1501,21 @@ void spu_recompiler::RDCH(spu_opcode_t op) std::this_thread::yield(); *_res = v128::from32r(out); - _ret(*_spu, _spu->_ptr(0), nullptr); }; - auto sub2 = [](spu_thread* _spu, v128* _res, spu_function_t _ret) + auto sub2 = [](spu_thread* _spu, v128* _res) { const u32 out = _spu->ch_dec_value - static_cast(get_timebased_time() - _spu->ch_dec_start_timestamp); *_res = v128::from32r(out); - _ret(*_spu, _spu->_ptr(0), nullptr); }; - using ftype = void (*)(spu_thread*, v128*, spu_function_t); - - asmjit::Label next = c->newLabel(); - c->mov(SPU_OFF_32(pc), m_pos); - c->lea(*ls, SPU_OFF_128(gpr, op.rt)); - c->lea(*qw0, asmjit::x86::qword_ptr(next)); - c->jmp(g_cfg.core.spu_loop_detection ? asmjit::imm_ptr(sub1) : asmjit::imm_ptr(sub2)); - c->align(asmjit::kAlignCode, 16); - c->bind(next); + using ftype = void (*)(spu_thread*, v128*); + c->lea(addr->r64(), get_pc(m_pos)); + c->mov(SPU_OFF_32(pc), *addr); + c->lea(*arg1, SPU_OFF_128(gpr, op.rt)); + c->mov(*arg0, *cpu); + c->call(g_cfg.core.spu_loop_detection ? asmjit::imm_ptr(sub1) : asmjit::imm_ptr(sub2)); return; } case SPU_RdEventMask: @@ -1531,15 +1537,16 @@ void spu_recompiler::RDCH(spu_opcode_t op) after.emplace_back([=, pos = m_pos] { c->bind(wait); - c->mov(SPU_OFF_32(pc), pos); - c->mov(ls->r32(), op.ra); - c->lea(*qw0, x86::qword_ptr(ret)); - c->jmp(imm_ptr(spu_rdch)); + c->lea(addr->r64(), get_pc(pos)); + c->mov(SPU_OFF_32(pc), *addr); + c->mov(arg1->r32(), op.ra); + c->mov(*arg0, *cpu); + c->call(imm_ptr(spu_rdch)); + c->jmp(ret); }); - c->mov(qw0->r32(), *addr); c->bind(ret); - c->movd(x86::xmm0, qw0->r32()); + c->movd(x86::xmm0, *addr); c->pslldq(x86::xmm0, 12); c->movdqa(SPU_OFF_128(gpr, op.rt), x86::xmm0); return; @@ -1555,22 +1562,19 @@ void spu_recompiler::RDCH(spu_opcode_t op) } } - Label ret = c->newLabel(); - c->mov(SPU_OFF_32(pc), m_pos); - c->mov(ls->r32(), op.ra); - c->lea(*qw0, x86::qword_ptr(ret)); - c->jmp(imm_ptr(spu_rdch)); - c->bind(ret); - c->movd(x86::xmm0, qw0->r32()); + c->lea(addr->r64(), get_pc(m_pos)); + c->mov(SPU_OFF_32(pc), *addr); + c->mov(arg1->r32(), op.ra); + c->mov(*arg0, *cpu); + c->call(imm_ptr(spu_rdch)); + c->movd(x86::xmm0, *addr); c->pslldq(x86::xmm0, 12); c->movdqa(SPU_OFF_128(gpr, op.rt), x86::xmm0); } -static void spu_rchcnt(spu_thread* _spu, u32 ch, void(*_ret)(spu_thread&, void*, u32 res)) +static u32 spu_rchcnt(spu_thread* _spu, u32 ch) { - // Put result into the third argument - const u32 res = _spu->get_ch_count(ch); - _ret(*_spu, _spu->_ptr(0), res); + return _spu->get_ch_count(ch); } void spu_recompiler::RCHCNT(spu_opcode_t op) @@ -1637,24 +1641,23 @@ void spu_recompiler::RCHCNT(spu_opcode_t op) { LOG_WARNING(SPU, "[0x%x] RCHCNT: RdEventStat", m_pos); get_events(); - c->setnz(qw0->r8()); - c->movzx(qw0->r32(), qw0->r8()); + c->setnz(addr->r8()); + c->movzx(*addr, addr->r8()); break; } default: { - Label ret = c->newLabel(); - c->mov(SPU_OFF_32(pc), m_pos); - c->mov(*ls, op.ra); - c->lea(*qw0, x86::qword_ptr(ret)); - c->jmp(imm_ptr(spu_rchcnt)); - c->bind(ret); + c->lea(addr->r64(), get_pc(m_pos)); + c->mov(SPU_OFF_32(pc), *addr); + c->mov(arg1->r32(), op.ra); + c->mov(*arg0, *cpu); + c->call(imm_ptr(spu_rchcnt)); break; } } // Use result from the third argument - c->movd(x86::xmm0, qw0->r32()); + c->movd(x86::xmm0, *addr); c->pslldq(x86::xmm0, 12); c->movdqa(SPU_OFF_128(gpr, op.rt), x86::xmm0); } @@ -1773,8 +1776,6 @@ void spu_recompiler::ROT(spu_opcode_t op) return; } - save_rcx(); - for (u32 i = 0; i < 4; i++) // unrolled loop { c->mov(qw0->r32(), SPU_OFF_32(gpr, op.ra, &v128::_u32, i)); @@ -1782,8 +1783,6 @@ void spu_recompiler::ROT(spu_opcode_t op) c->rol(qw0->r32(), asmjit::x86::cl); c->mov(SPU_OFF_32(gpr, op.rt, &v128::_u32, i), qw0->r32()); } - - load_rcx(); } void spu_recompiler::ROTM(spu_opcode_t op) @@ -1816,8 +1815,6 @@ void spu_recompiler::ROTM(spu_opcode_t op) return; } - save_rcx(); - for (u32 i = 0; i < 4; i++) // unrolled loop { c->mov(qw0->r32(), SPU_OFF_32(gpr, op.ra, &v128::_u32, i)); @@ -1826,8 +1823,6 @@ void spu_recompiler::ROTM(spu_opcode_t op) c->shr(*qw0, asmjit::x86::cl); c->mov(SPU_OFF_32(gpr, op.rt, &v128::_u32, i), qw0->r32()); } - - load_rcx(); } void spu_recompiler::ROTMA(spu_opcode_t op) @@ -1859,8 +1854,6 @@ void spu_recompiler::ROTMA(spu_opcode_t op) return; } - save_rcx(); - for (u32 i = 0; i < 4; i++) // unrolled loop { c->movsxd(*qw0, SPU_OFF_32(gpr, op.ra, &v128::_u32, i)); @@ -1869,8 +1862,6 @@ void spu_recompiler::ROTMA(spu_opcode_t op) c->sar(*qw0, asmjit::x86::cl); c->mov(SPU_OFF_32(gpr, op.rt, &v128::_u32, i), qw0->r32()); } - - load_rcx(); } void spu_recompiler::SHL(spu_opcode_t op) @@ -1899,8 +1890,6 @@ void spu_recompiler::SHL(spu_opcode_t op) return; } - save_rcx(); - for (u32 i = 0; i < 4; i++) // unrolled loop { c->mov(qw0->r32(), SPU_OFF_32(gpr, op.ra, &v128::_u32, i)); @@ -1908,8 +1897,6 @@ void spu_recompiler::SHL(spu_opcode_t op) c->shl(*qw0, asmjit::x86::cl); c->mov(SPU_OFF_32(gpr, op.rt, &v128::_u32, i), qw0->r32()); } - - load_rcx(); } void spu_recompiler::ROTH(spu_opcode_t op) //nf @@ -1942,8 +1929,6 @@ void spu_recompiler::ROTH(spu_opcode_t op) //nf return; } - save_rcx(); - for (u32 i = 0; i < 8; i++) // unrolled loop { c->movzx(qw0->r32(), SPU_OFF_16(gpr, op.ra, &v128::_u16, i)); @@ -1951,8 +1936,6 @@ void spu_recompiler::ROTH(spu_opcode_t op) //nf c->rol(qw0->r16(), asmjit::x86::cl); c->mov(SPU_OFF_16(gpr, op.rt, &v128::_u16, i), qw0->r16()); } - - load_rcx(); } void spu_recompiler::ROTHM(spu_opcode_t op) @@ -2005,8 +1988,6 @@ void spu_recompiler::ROTHM(spu_opcode_t op) return; } - save_rcx(); - for (u32 i = 0; i < 8; i++) // unrolled loop { c->movzx(qw0->r32(), SPU_OFF_16(gpr, op.ra, &v128::_u16, i)); @@ -2015,8 +1996,6 @@ void spu_recompiler::ROTHM(spu_opcode_t op) c->shr(qw0->r32(), asmjit::x86::cl); c->mov(SPU_OFF_16(gpr, op.rt, &v128::_u16, i), qw0->r16()); } - - load_rcx(); } void spu_recompiler::ROTMAH(spu_opcode_t op) @@ -2070,8 +2049,6 @@ void spu_recompiler::ROTMAH(spu_opcode_t op) return; } - save_rcx(); - for (u32 i = 0; i < 8; i++) // unrolled loop { c->movsx(qw0->r32(), SPU_OFF_16(gpr, op.ra, &v128::_u16, i)); @@ -2080,8 +2057,6 @@ void spu_recompiler::ROTMAH(spu_opcode_t op) c->sar(qw0->r32(), asmjit::x86::cl); c->mov(SPU_OFF_16(gpr, op.rt, &v128::_u16, i), qw0->r16()); } - - load_rcx(); } void spu_recompiler::SHLH(spu_opcode_t op) @@ -2129,8 +2104,6 @@ void spu_recompiler::SHLH(spu_opcode_t op) return; } - save_rcx(); - for (u32 i = 0; i < 8; i++) // unrolled loop { c->movzx(qw0->r32(), SPU_OFF_16(gpr, op.ra, &v128::_u16, i)); @@ -2138,8 +2111,6 @@ void spu_recompiler::SHLH(spu_opcode_t op) c->shl(qw0->r32(), asmjit::x86::cl); c->mov(SPU_OFF_16(gpr, op.rt, &v128::_u16, i), qw0->r16()); } - - load_rcx(); } void spu_recompiler::ROTI(spu_opcode_t op) @@ -2175,7 +2146,7 @@ void spu_recompiler::ROTI(spu_opcode_t op) void spu_recompiler::ROTMI(spu_opcode_t op) { // shift right logical - const int s = 0-op.i7 & 0x3f; + const int s = (0 - op.i7) & 0x3f; const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->psrld(va, s); c->movdqa(SPU_OFF_128(gpr, op.rt), va); @@ -2184,7 +2155,7 @@ void spu_recompiler::ROTMI(spu_opcode_t op) void spu_recompiler::ROTMAI(spu_opcode_t op) { // shift right arithmetical - const int s = 0-op.i7 & 0x3f; + const int s = (0 - op.i7) & 0x3f; const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->psrad(va, s); c->movdqa(SPU_OFF_128(gpr, op.rt), va); @@ -2215,7 +2186,7 @@ void spu_recompiler::ROTHI(spu_opcode_t op) void spu_recompiler::ROTHMI(spu_opcode_t op) { // shift right logical - const int s = 0-op.i7 & 0x1f; + const int s = (0 - op.i7) & 0x1f; const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->psrlw(va, s); c->movdqa(SPU_OFF_128(gpr, op.rt), va); @@ -2224,7 +2195,7 @@ void spu_recompiler::ROTHMI(spu_opcode_t op) void spu_recompiler::ROTMAHI(spu_opcode_t op) { // shift right arithmetical (halfword) - const int s = 0-op.i7 & 0x1f; + const int s = (0 - op.i7) & 0x1f; const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->psraw(va, s); c->movdqa(SPU_OFF_128(gpr, op.rt), va); @@ -2314,29 +2285,20 @@ void spu_recompiler::MTSPR(spu_opcode_t op) // Check SPUInterpreter for notes. } -static void spu_wrch_ret(spu_thread& _spu, void*, u8*) -{ - // MSVC workaround (TCO) -} - -static void spu_wrch(spu_thread* _spu, u32 ch, u32 value, spu_function_t _ret) +static void spu_wrch(spu_thread* _spu, u32 ch, u32 value) { if (!_spu->set_ch_value(ch, value)) { - _ret = &spu_wrch_ret; + spu_runtime::g_escape(_spu); } - - _ret(*_spu, _spu->_ptr(0), nullptr); } static void spu_wrch_mfc(spu_thread* _spu, spu_function_t _ret) { if (!_spu->process_mfc_cmd()) { - _ret = &spu_wrch_ret; + spu_runtime::g_escape(_spu); } - - _ret(*_spu, _spu->_ptr(0), nullptr); } void spu_recompiler::WRCH(spu_opcode_t op) @@ -2372,10 +2334,12 @@ void spu_recompiler::WRCH(spu_opcode_t op) after.emplace_back([=, pos = m_pos] { c->bind(wait); - c->mov(SPU_OFF_32(pc), pos); - c->mov(ls->r32(), op.ra); - c->lea(*qw1, x86::qword_ptr(ret)); - c->jmp(imm_ptr(spu_wrch)); + c->lea(addr->r64(), get_pc(pos)); + c->mov(SPU_OFF_32(pc), *addr); + c->mov(arg1->r32(), op.ra); + c->mov(*arg0, *cpu); + c->call(imm_ptr(spu_wrch)); + c->jmp(ret); }); c->bts(*qw0, spu_channel::off_count); @@ -2396,10 +2360,12 @@ void spu_recompiler::WRCH(spu_opcode_t op) after.emplace_back([=, pos = m_pos] { c->bind(upd); - c->mov(SPU_OFF_32(pc), pos); - c->lea(ls->r32(), MFC_WrTagMask); - c->lea(*qw1, x86::qword_ptr(ret)); - c->jmp(imm_ptr(spu_wrch)); + c->lea(addr->r64(), get_pc(pos)); + c->mov(SPU_OFF_32(pc), *addr); + c->lea(arg1->r32(), MFC_WrTagMask); + c->mov(*arg0, *cpu); + c->call(imm_ptr(spu_wrch)); + c->jmp(ret); }); c->bind(ret); @@ -2417,10 +2383,12 @@ void spu_recompiler::WRCH(spu_opcode_t op) after.emplace_back([=, pos = m_pos] { c->bind(fail); - c->mov(SPU_OFF_32(pc), pos); - c->mov(ls->r32(), op.ra); - c->lea(*qw1, x86::qword_ptr(ret)); - c->jmp(imm_ptr(spu_wrch)); + c->lea(addr->r64(), get_pc(pos)); + c->mov(SPU_OFF_32(pc), *addr); + c->mov(arg1->r32(), op.ra); + c->mov(*arg0, *cpu); + c->call(imm_ptr(spu_wrch)); + c->jmp(ret); c->bind(zero); c->mov(SPU_OFF_32(ch_tag_upd), qw0->r32()); @@ -2480,20 +2448,17 @@ void spu_recompiler::WRCH(spu_opcode_t op) } case MFC_Cmd: { - // TODO - Label ret = c->newLabel(); c->mov(*addr, SPU_OFF_32(gpr, op.rt, &v128::_u32, 3)); c->mov(SPU_OFF_8(ch_mfc_cmd, &spu_mfc_cmd::cmd), addr->r8()); - c->mov(SPU_OFF_32(pc), m_pos); - c->lea(*ls, x86::qword_ptr(ret)); - c->jmp(imm_ptr(spu_wrch_mfc)); - c->align(kAlignCode, 16); - c->bind(ret); + c->lea(addr->r64(), get_pc(m_pos)); + c->mov(SPU_OFF_32(pc), *addr); + c->mov(*arg0, *cpu); + c->call(imm_ptr(spu_wrch_mfc)); return; } case MFC_WrListStallAck: { - auto sub = [](spu_thread* _spu, spu_function_t _ret, u32 tag) + auto sub = [](spu_thread* _spu, u32 tag) { for (u32 i = 0; i < _spu->mfc_size; i++) { @@ -2505,33 +2470,27 @@ void spu_recompiler::WRCH(spu_opcode_t op) } _spu->do_mfc(true); - _ret(*_spu, _spu->_ptr(0), nullptr); }; Label ret = c->newLabel(); - c->mov(qw0->r32(), SPU_OFF_32(gpr, op.rt, &v128::_u32, 3)); - c->and_(qw0->r32(), 0x1f); - c->btr(SPU_OFF_32(ch_stall_mask), qw0->r32()); + c->mov(arg1->r32(), SPU_OFF_32(gpr, op.rt, &v128::_u32, 3)); + c->and_(arg1->r32(), 0x1f); + c->btr(SPU_OFF_32(ch_stall_mask), arg1->r32()); c->jnc(ret); - c->lea(*ls, x86::qword_ptr(ret)); - c->jmp(imm_ptr(sub)); - c->align(kAlignCode, 16); + c->mov(*arg0, *cpu); + c->call(imm_ptr(sub)); c->bind(ret); return; } case SPU_WrDec: { - auto sub = [](spu_thread* _spu, spu_function_t _ret) + auto sub = [](spu_thread* _spu) { _spu->ch_dec_start_timestamp = get_timebased_time(); - _ret(*_spu, _spu->_ptr(0), nullptr); }; - Label ret = c->newLabel(); - c->lea(*ls, x86::qword_ptr(ret)); - c->jmp(imm_ptr(sub)); - c->align(kAlignCode, 16); - c->bind(ret); + c->mov(*arg0, *cpu); + c->call(imm_ptr(sub)); c->mov(qw0->r32(), SPU_OFF_32(gpr, op.rt, &v128::_u32, 3)); c->mov(SPU_OFF_32(ch_dec_value), qw0->r32()); return; @@ -2555,13 +2514,12 @@ void spu_recompiler::WRCH(spu_opcode_t op) } } - Label ret = c->newLabel(); - c->mov(SPU_OFF_32(pc), m_pos); - c->mov(ls->r32(), op.ra); + c->lea(addr->r64(), get_pc(m_pos)); + c->mov(SPU_OFF_32(pc), *addr); + c->mov(arg1->r32(), op.ra); c->mov(qw0->r32(), SPU_OFF_32(gpr, op.rt, &v128::_u32, 3)); - c->lea(*qw1, x86::qword_ptr(ret)); - c->jmp(imm_ptr(spu_wrch)); - c->bind(ret); + c->mov(*arg0, *cpu); + c->call(imm_ptr(spu_wrch)); } void spu_recompiler::BIZ(spu_opcode_t op) @@ -2677,7 +2635,9 @@ void spu_recompiler::BISL(spu_opcode_t op) c->mov(*addr, SPU_OFF_32(gpr, op.ra, &v128::_u32, 3)); c->and_(*addr, 0x3fffc); const XmmLink& vr = XmmAlloc(); - c->movdqa(vr, XmmConst(_mm_set_epi32(spu_branch_target(m_pos + 4), 0, 0, 0))); + c->lea(*qw0, get_pc(m_pos + 4)); + c->movd(vr, qw0->r32()); + c->pslldq(vr, 12); c->movdqa(SPU_OFF_128(gpr, op.rt), vr); branch_set_link(m_pos + 4); branch_indirect(op, true, false); @@ -2698,7 +2658,9 @@ void spu_recompiler::BISLED(spu_opcode_t op) c->and_(*addr, 0x3fffc); const XmmLink& vr = XmmAlloc(); - c->movdqa(vr, XmmConst(_mm_set_epi32(spu_branch_target(m_pos + 4), 0, 0, 0))); + c->lea(*qw0, get_pc(m_pos + 4)); + c->movd(vr, qw0->r32()); + c->pslldq(vr, 12); c->movdqa(SPU_OFF_128(gpr, op.rt), vr); asmjit::Label branch_label = c->newLabel(); @@ -3207,7 +3169,7 @@ void spu_recompiler::ROTQBYI(spu_opcode_t op) void spu_recompiler::ROTQMBYI(spu_opcode_t op) { - const int s = 0-op.i7 & 0x1f; + const int s = (0 - op.i7) & 0x1f; const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->psrldq(va, s); c->movdqa(SPU_OFF_128(gpr, op.rt), va); @@ -3308,7 +3270,8 @@ void spu_recompiler::HGT(spu_opcode_t op) after.emplace_back([=, pos = m_pos] { c->bind(label); - c->mov(SPU_OFF_32(pc), pos); + c->lea(addr->r64(), get_pc(pos)); + c->mov(SPU_OFF_32(pc), *addr); c->lock().bts(SPU_OFF_32(status), 2); c->mov(addr->r64(), reinterpret_cast(vm::base(0xffdead00))); c->mov(asmjit::x86::dword_ptr(addr->r64()), "HALT"_u32); @@ -3561,7 +3524,6 @@ void spu_recompiler::FCMGT(spu_opcode_t op) // reverted less-than // since comparison is absoulte, a > b if a is extended and b is not extended // flush denormals to zero to make zero == zero work - const auto last_exp_bit = XmmConst(_mm_set1_epi32(0x00800000)); const auto all_exp_bits = XmmConst(_mm_set1_epi32(0x7f800000)); const auto remove_sign_bits = XmmConst(_mm_set1_epi32(0x7fffffff)); @@ -3649,7 +3611,8 @@ void spu_recompiler::HLGT(spu_opcode_t op) after.emplace_back([=, pos = m_pos] { c->bind(label); - c->mov(SPU_OFF_32(pc), pos); + c->lea(addr->r64(), get_pc(pos)); + c->mov(SPU_OFF_32(pc), *addr); c->lock().bts(SPU_OFF_32(status), 2); c->mov(addr->r64(), reinterpret_cast(vm::base(0xffdead00))); c->mov(asmjit::x86::dword_ptr(addr->r64()), "HALT"_u32); @@ -3941,7 +3904,8 @@ void spu_recompiler::HEQ(spu_opcode_t op) after.emplace_back([=, pos = m_pos] { c->bind(label); - c->mov(SPU_OFF_32(pc), pos); + c->lea(addr->r64(), get_pc(pos)); + c->mov(SPU_OFF_32(pc), *addr); c->lock().bts(SPU_OFF_32(status), 2); c->mov(addr->r64(), reinterpret_cast(vm::base(0xffdead00))); c->mov(asmjit::x86::dword_ptr(addr->r64()), "HALT"_u32); @@ -4131,11 +4095,14 @@ void spu_recompiler::BRHNZ(spu_opcode_t op) void spu_recompiler::STQR(spu_opcode_t op) { + c->lea(addr->r64(), get_pc(spu_ls_target(m_pos, op.i16))); + c->and_(*addr, 0x3fff0); + if (utils::has_ssse3()) { const XmmLink& vt = XmmGet(op.rt, XmmType::Int); c->pshufb(vt, XmmConst(_mm_set_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f))); - c->movdqa(asmjit::x86::oword_ptr(*ls, spu_ls_target(m_pos, op.i16)), vt); + c->movdqa(asmjit::x86::oword_ptr(*ls, addr->r64()), vt); } else { @@ -4143,8 +4110,8 @@ void spu_recompiler::STQR(spu_opcode_t op) c->mov(*qw1, SPU_OFF_64(gpr, op.rt, &v128::_u64, 1)); c->bswap(*qw0); c->bswap(*qw1); - c->mov(asmjit::x86::qword_ptr(*ls, spu_ls_target(m_pos, op.i16) + 0), *qw1); - c->mov(asmjit::x86::qword_ptr(*ls, spu_ls_target(m_pos, op.i16) + 8), *qw0); + c->mov(asmjit::x86::qword_ptr(*ls, addr->r64(), 0, 0), *qw1); + c->mov(asmjit::x86::qword_ptr(*ls, addr->r64(), 0, 8), *qw0); } } @@ -4184,7 +4151,9 @@ void spu_recompiler::BRASL(spu_opcode_t op) const u32 target = spu_branch_target(0, op.i16); const XmmLink& vr = XmmAlloc(); - c->movdqa(vr, XmmConst(_mm_set_epi32(spu_branch_target(m_pos + 4), 0, 0, 0))); + c->lea(addr->r64(), get_pc(m_pos + 4)); + c->movd(vr, *addr); + c->pslldq(vr, 12); c->movdqa(SPU_OFF_128(gpr, op.rt), vr); if (target != m_pos + 4) @@ -4222,7 +4191,9 @@ void spu_recompiler::BRSL(spu_opcode_t op) const u32 target = spu_branch_target(m_pos, op.i16); const XmmLink& vr = XmmAlloc(); - c->movdqa(vr, XmmConst(_mm_set_epi32(spu_branch_target(m_pos + 4), 0, 0, 0))); + c->lea(addr->r64(), get_pc(m_pos + 4)); + c->movd(vr, *addr); + c->pslldq(vr, 12); c->movdqa(SPU_OFF_128(gpr, op.rt), vr); if (target != m_pos + 4) @@ -4235,17 +4206,20 @@ void spu_recompiler::BRSL(spu_opcode_t op) void spu_recompiler::LQR(spu_opcode_t op) { + c->lea(addr->r64(), get_pc(spu_ls_target(m_pos, op.i16))); + c->and_(*addr, 0x3fff0); + if (utils::has_ssse3()) { const XmmLink& vt = XmmAlloc(); - c->movdqa(vt, asmjit::x86::oword_ptr(*ls, spu_ls_target(m_pos, op.i16))); + c->movdqa(vt, asmjit::x86::oword_ptr(*ls, addr->r64())); c->pshufb(vt, XmmConst(_mm_set_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f))); c->movdqa(SPU_OFF_128(gpr, op.rt), vt); } else { - c->mov(*qw0, asmjit::x86::qword_ptr(*ls, spu_ls_target(m_pos, op.i16) + 0)); - c->mov(*qw1, asmjit::x86::qword_ptr(*ls, spu_ls_target(m_pos, op.i16) + 8)); + c->mov(*qw0, asmjit::x86::qword_ptr(*ls, addr->r64(), 0, 0)); + c->mov(*qw1, asmjit::x86::qword_ptr(*ls, addr->r64(), 0, 8)); c->bswap(*qw0); c->bswap(*qw1); c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 0), *qw1); @@ -4455,7 +4429,8 @@ void spu_recompiler::HGTI(spu_opcode_t op) after.emplace_back([=, pos = m_pos] { c->bind(label); - c->mov(SPU_OFF_32(pc), pos); + c->lea(addr->r64(), get_pc(pos)); + c->mov(SPU_OFF_32(pc), *addr); c->lock().bts(SPU_OFF_32(status), 2); c->mov(addr->r64(), reinterpret_cast(vm::base(0xffdead00))); c->mov(asmjit::x86::dword_ptr(addr->r64()), "HALT"_u32); @@ -4498,7 +4473,8 @@ void spu_recompiler::HLGTI(spu_opcode_t op) after.emplace_back([=, pos = m_pos] { c->bind(label); - c->mov(SPU_OFF_32(pc), pos); + c->lea(addr->r64(), get_pc(pos)); + c->mov(SPU_OFF_32(pc), *addr); c->lock().bts(SPU_OFF_32(status), 2); c->mov(addr->r64(), reinterpret_cast(vm::base(0xffdead00))); c->mov(asmjit::x86::dword_ptr(addr->r64()), "HALT"_u32); @@ -4559,7 +4535,8 @@ void spu_recompiler::HEQI(spu_opcode_t op) after.emplace_back([=, pos = m_pos] { c->bind(label); - c->mov(SPU_OFF_32(pc), pos); + c->lea(addr->r64(), get_pc(pos)); + c->mov(SPU_OFF_32(pc), *addr); c->lock().bts(SPU_OFF_32(status), 2); c->mov(addr->r64(), reinterpret_cast(vm::base(0xffdead00))); c->mov(asmjit::x86::dword_ptr(addr->r64()), "HALT"_u32); diff --git a/rpcs3/Emu/Cell/SPUASMJITRecompiler.h b/rpcs3/Emu/Cell/SPUASMJITRecompiler.h index 6d8a906b13..f69b68e56f 100644 --- a/rpcs3/Emu/Cell/SPUASMJITRecompiler.h +++ b/rpcs3/Emu/Cell/SPUASMJITRecompiler.h @@ -19,18 +19,26 @@ private: // ASMJIT runtime ::jit_runtime m_asmrt; + u32 m_base; + // emitter: asmjit::X86Assembler* c; // arguments: const asmjit::X86Gp* cpu; const asmjit::X86Gp* ls; + const asmjit::X86Gp* rip; + const asmjit::X86Gp* pc0; + + // Native args or temp variables: + const asmjit::X86Gp* arg0; + const asmjit::X86Gp* arg1; const asmjit::X86Gp* qw0; const asmjit::X86Gp* qw1; // temporary: const asmjit::X86Gp* addr; - std::array vec; + std::array vec; // workload for the end of function: std::vector> after; @@ -81,12 +89,11 @@ private: asmjit::X86Mem XmmConst(__m128 data); asmjit::X86Mem XmmConst(__m128i data); + asmjit::X86Mem get_pc(u32 addr); void branch_fixed(u32 target); void branch_indirect(spu_opcode_t op, bool jt = false, bool ret = true); void branch_set_link(u32 target); void fall(spu_opcode_t op); - void save_rcx(); - void load_rcx(); void get_events(); diff --git a/rpcs3/Emu/Cell/SPURecompiler.cpp b/rpcs3/Emu/Cell/SPURecompiler.cpp index 30df68ef8b..474a485a12 100644 --- a/rpcs3/Emu/Cell/SPURecompiler.cpp +++ b/rpcs3/Emu/Cell/SPURecompiler.cpp @@ -174,6 +174,16 @@ DECLARE(spu_runtime::g_gateway) = build_function_asm([](asmjit:: c.ret(); }); +DECLARE(spu_runtime::g_escape) = build_function_asm([](asmjit::X86Assembler& c, auto& args) +{ + using namespace asmjit; + + // Restore native stack pointer (longjmp emulation) + c.mov(x86::rsp, x86::qword_ptr(args[0], ::offset32(&spu_thread::saved_native_sp))); + c.sub(x86::rsp, 8); + c.ret(); +}); + DECLARE(spu_runtime::g_interpreter) = nullptr; spu_cache::spu_cache(const std::string& loc) @@ -2062,17 +2072,6 @@ const std::vector& spu_recompiler_base::analyse(const be_t* ls, u32 en } } - // Skip some steps for asmjit - if (g_cfg.core.spu_decoder == spu_decoder_type::asmjit) - { - if (result.size() == 1) - { - result.clear(); - } - - return result; - } - // Fill block info for (auto& pred : m_preds) { @@ -4331,15 +4330,6 @@ public: m_ir->CreateRet(m_ir->CreateLoad(dispatcher)); } - // Longjmp analogue (restore saved host thread's stack pointer) - const auto escape = llvm::cast(m_module->getOrInsertFunction("spu_escape", get_ftype()).getCallee()); - escape->setLinkage(GlobalValue::InternalLinkage); - m_ir->SetInsertPoint(BasicBlock::Create(m_context, "", escape)); - const auto load_sp = m_ir->CreateLoad(_ptr(&*escape->arg_begin(), ::offset32(&spu_thread::saved_native_sp))); - const auto rsp_name = MetadataAsValue::get(m_context, MDNode::get(m_context, {MDString::get(m_context, "rsp")})); - m_ir->CreateCall(get_intrinsic(Intrinsic::write_register), {rsp_name, m_ir->CreateSub(load_sp, m_ir->getInt64(8))}); - m_ir->CreateRetVoid(); - // Function that executes check_state and escapes if necessary m_test_state = llvm::cast(m_module->getOrInsertFunction("spu_test_state", get_ftype()).getCallee()); m_test_state->setLinkage(GlobalValue::InternalLinkage); @@ -4349,7 +4339,7 @@ public: const auto escape_no = BasicBlock::Create(m_context, "", m_test_state); m_ir->CreateCondBr(call("spu_exec_check_state", &exec_check_state, &*m_test_state->arg_begin()), escape_yes, escape_no); m_ir->SetInsertPoint(escape_yes); - m_ir->CreateCall(escape, {&*m_test_state->arg_begin()}); + call("spu_escape", spu_runtime::g_escape, &*m_test_state->arg_begin()); m_ir->CreateRetVoid(); m_ir->SetInsertPoint(escape_no); m_ir->CreateRetVoid(); diff --git a/rpcs3/Emu/Cell/SPURecompiler.h b/rpcs3/Emu/Cell/SPURecompiler.h index b9f488d6cf..f14e1c167f 100644 --- a/rpcs3/Emu/Cell/SPURecompiler.h +++ b/rpcs3/Emu/Cell/SPURecompiler.h @@ -105,6 +105,9 @@ public: // Recompiler entry point static const spu_function_t g_gateway; + // Longjmp to the end of the gateway function (native CC) + static void(*const g_escape)(spu_thread*); + // Interpreter entry point static spu_function_t g_interpreter;