mirror of
https://github.com/RPCS3/rpcs3.git
synced 2025-03-29 22:20:48 +00:00
Remove XABORT in PPU/SPU transactions.
It's expensive for an unknown reason. A plain XEND is usually much cheaper. Add some minor improvements. Use g_sudo_addr.
This commit is contained in:
parent
182a998cb6
commit
dc8252bb9f
@ -272,33 +272,13 @@ asmjit::Label asmjit::build_transaction_enter(asmjit::X86Assembler& c, asmjit::L
|
|||||||
Label begin = c.newLabel();
|
Label begin = c.newLabel();
|
||||||
c.jmp(begin);
|
c.jmp(begin);
|
||||||
c.bind(fall);
|
c.bind(fall);
|
||||||
|
c.add(ctr, 1);
|
||||||
|
|
||||||
if (less_than < 65)
|
// Don't repeat on zero status (may indicate syscall or interrupt)
|
||||||
{
|
c.test(x86::eax, x86::eax);
|
||||||
c.add(ctr, 1);
|
c.jz(fallback);
|
||||||
c.test(x86::eax, _XABORT_RETRY);
|
|
||||||
c.jz(fallback);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
// Don't repeat on explicit XABORT instruction (workaround)
|
|
||||||
c.test(x86::eax, _XABORT_EXPLICIT);
|
|
||||||
c.jnz(fallback);
|
|
||||||
|
|
||||||
// Don't repeat on weird zero status
|
|
||||||
c.test(x86::eax, x86::eax);
|
|
||||||
c.jz(fallback);
|
|
||||||
|
|
||||||
// Count an attempt without RETRY flag as 65 normal attempts and continue
|
|
||||||
c.push(x86::rax);
|
|
||||||
c.not_(x86::eax);
|
|
||||||
c.and_(x86::eax, _XABORT_RETRY);
|
|
||||||
c.shl(x86::eax, 5);
|
|
||||||
c.add(x86::eax, 1); // eax = RETRY ? 1 : 65
|
|
||||||
c.add(ctr, x86::rax);
|
|
||||||
c.pop(x86::rax);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
// Other bad statuses are ignored regardless of repeat flag (TODO)
|
||||||
c.cmp(ctr, less_than);
|
c.cmp(ctr, less_than);
|
||||||
c.jae(fallback);
|
c.jae(fallback);
|
||||||
c.align(kAlignCode, 16);
|
c.align(kAlignCode, 16);
|
||||||
@ -309,13 +289,6 @@ asmjit::Label asmjit::build_transaction_enter(asmjit::X86Assembler& c, asmjit::L
|
|||||||
//c.xbegin(fall);
|
//c.xbegin(fall);
|
||||||
}
|
}
|
||||||
|
|
||||||
void asmjit::build_transaction_abort(asmjit::X86Assembler& c, unsigned char code)
|
|
||||||
{
|
|
||||||
c.db(0xc6);
|
|
||||||
c.db(0xf8);
|
|
||||||
c.db(code);
|
|
||||||
}
|
|
||||||
|
|
||||||
#ifdef LLVM_AVAILABLE
|
#ifdef LLVM_AVAILABLE
|
||||||
|
|
||||||
#include <unordered_map>
|
#include <unordered_map>
|
||||||
|
@ -55,11 +55,8 @@ namespace asmjit
|
|||||||
// Should only be used to build global functions
|
// Should only be used to build global functions
|
||||||
asmjit::Runtime& get_global_runtime();
|
asmjit::Runtime& get_global_runtime();
|
||||||
|
|
||||||
// Emit xbegin and adjacent loop, return label at xbegin
|
// Emit xbegin and adjacent loop, return label at xbegin (don't use xabort please)
|
||||||
[[nodiscard]] asmjit::Label build_transaction_enter(X86Assembler& c, Label fallback, const X86Gp& ctr, uint less_than);
|
[[nodiscard]] asmjit::Label build_transaction_enter(X86Assembler& c, Label fallback, const X86Gp& ctr, uint less_than);
|
||||||
|
|
||||||
// Emit xabort
|
|
||||||
void build_transaction_abort(X86Assembler& c, unsigned char code);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Build runtime function with asmjit::X86Assembler
|
// Build runtime function with asmjit::X86Assembler
|
||||||
|
@ -1340,7 +1340,7 @@ const auto ppu_stcx_accurate_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime
|
|||||||
|
|
||||||
// Prepare registers
|
// Prepare registers
|
||||||
c.mov(x86::rbx, imm_ptr(+vm::g_reservations));
|
c.mov(x86::rbx, imm_ptr(+vm::g_reservations));
|
||||||
c.mov(x86::rax, imm_ptr(&vm::g_base_addr));
|
c.mov(x86::rax, imm_ptr(&vm::g_sudo_addr));
|
||||||
c.mov(x86::rbp, x86::qword_ptr(x86::rax));
|
c.mov(x86::rbp, x86::qword_ptr(x86::rax));
|
||||||
c.lea(x86::rbp, x86::qword_ptr(x86::rbp, args[0]));
|
c.lea(x86::rbp, x86::qword_ptr(x86::rbp, args[0]));
|
||||||
c.and_(x86::rbp, -128);
|
c.and_(x86::rbp, -128);
|
||||||
@ -1350,8 +1350,9 @@ const auto ppu_stcx_accurate_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime
|
|||||||
c.shr(args[0].r32(), 1);
|
c.shr(args[0].r32(), 1);
|
||||||
c.lea(x86::rbx, x86::qword_ptr(x86::rbx, args[0]));
|
c.lea(x86::rbx, x86::qword_ptr(x86::rbx, args[0]));
|
||||||
c.and_(x86::rbx, -128 / 2);
|
c.and_(x86::rbx, -128 / 2);
|
||||||
|
c.prefetchw(x86::byte_ptr(x86::rbx));
|
||||||
c.and_(args[0].r32(), 63);
|
c.and_(args[0].r32(), 63);
|
||||||
c.xor_(x86::r12d, x86::r12d);
|
c.mov(x86::r12d, 1);
|
||||||
c.mov(x86::r13, args[1]);
|
c.mov(x86::r13, args[1]);
|
||||||
c.bswap(args[3]);
|
c.bswap(args[3]);
|
||||||
|
|
||||||
@ -1376,7 +1377,7 @@ const auto ppu_stcx_accurate_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Begin transaction
|
// Begin transaction
|
||||||
Label tx0 = build_transaction_enter(c, fall, x86::r12, 4);
|
Label tx0 = build_transaction_enter(c, fall, x86::r12d, 4);
|
||||||
c.xbegin(tx0);
|
c.xbegin(tx0);
|
||||||
c.mov(x86::rax, x86::qword_ptr(x86::rbx));
|
c.mov(x86::rax, x86::qword_ptr(x86::rbx));
|
||||||
c.test(x86::eax, 127);
|
c.test(x86::eax, 127);
|
||||||
@ -1423,22 +1424,46 @@ const auto ppu_stcx_accurate_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime
|
|||||||
// Update reservation
|
// Update reservation
|
||||||
c.sub(x86::qword_ptr(x86::rbx), -128);
|
c.sub(x86::qword_ptr(x86::rbx), -128);
|
||||||
c.xend();
|
c.xend();
|
||||||
c.mov(x86::eax, 1);
|
c.mov(x86::eax, x86::r12d);
|
||||||
|
c.jmp(_ret);
|
||||||
|
|
||||||
|
// XABORT is expensive so finish with xend instead
|
||||||
|
c.bind(fail);
|
||||||
|
|
||||||
|
// Load old data (unused)
|
||||||
|
if (s_tsx_avx)
|
||||||
|
{
|
||||||
|
c.vmovaps(x86::ymm0, x86::yword_ptr(x86::rbp, 0));
|
||||||
|
c.vmovaps(x86::ymm1, x86::yword_ptr(x86::rbp, 32));
|
||||||
|
c.vmovaps(x86::ymm2, x86::yword_ptr(x86::rbp, 64));
|
||||||
|
c.vmovaps(x86::ymm3, x86::yword_ptr(x86::rbp, 96));
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
c.movaps(x86::xmm0, x86::oword_ptr(x86::rbp, 0));
|
||||||
|
c.movaps(x86::xmm1, x86::oword_ptr(x86::rbp, 16));
|
||||||
|
c.movaps(x86::xmm2, x86::oword_ptr(x86::rbp, 32));
|
||||||
|
c.movaps(x86::xmm3, x86::oword_ptr(x86::rbp, 48));
|
||||||
|
c.movaps(x86::xmm4, x86::oword_ptr(x86::rbp, 64));
|
||||||
|
c.movaps(x86::xmm5, x86::oword_ptr(x86::rbp, 80));
|
||||||
|
c.movaps(x86::xmm6, x86::oword_ptr(x86::rbp, 96));
|
||||||
|
c.movaps(x86::xmm7, x86::oword_ptr(x86::rbp, 112));
|
||||||
|
}
|
||||||
|
|
||||||
|
c.xend();
|
||||||
|
c.xor_(x86::eax, x86::eax);
|
||||||
c.jmp(_ret);
|
c.jmp(_ret);
|
||||||
|
|
||||||
c.bind(skip);
|
c.bind(skip);
|
||||||
c.xor_(x86::eax, x86::eax);
|
c.xend();
|
||||||
c.xor_(x86::r12d, x86::r12d);
|
c.mov(x86::eax, _XABORT_EXPLICIT);
|
||||||
build_transaction_abort(c, 0);
|
|
||||||
//c.jmp(fall);
|
//c.jmp(fall);
|
||||||
|
|
||||||
c.bind(fall);
|
c.bind(fall);
|
||||||
c.sar(x86::eax, 24);
|
|
||||||
c.js(fail);
|
|
||||||
|
|
||||||
// Touch memory if transaction failed without RETRY flag on the first attempt
|
// Touch memory if transaction failed with status 0
|
||||||
c.cmp(x86::r12, 1);
|
c.test(x86::eax, x86::eax);
|
||||||
c.jne(next);
|
c.jnz(next);
|
||||||
c.xor_(x86::rbp, 0xf80);
|
c.xor_(x86::rbp, 0xf80);
|
||||||
c.lock().add(x86::dword_ptr(x86::rbp), 0);
|
c.lock().add(x86::dword_ptr(x86::rbp), 0);
|
||||||
c.xor_(x86::rbp, 0xf80);
|
c.xor_(x86::rbp, 0xf80);
|
||||||
@ -1454,19 +1479,19 @@ const auto ppu_stcx_accurate_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime
|
|||||||
c.mov(x86::eax, 1);
|
c.mov(x86::eax, 1);
|
||||||
c.lock().xadd(x86::qword_ptr(x86::rbx), x86::rax);
|
c.lock().xadd(x86::qword_ptr(x86::rbx), x86::rax);
|
||||||
c.test(x86::eax, vm::rsrv_unique_lock);
|
c.test(x86::eax, vm::rsrv_unique_lock);
|
||||||
c.jnz(fail3);
|
c.jnz(fall2);
|
||||||
|
|
||||||
// Allow only first shared lock to proceed
|
// Allow only first shared lock to proceed
|
||||||
c.cmp(x86::rax, x86::r13);
|
c.cmp(x86::rax, x86::r13);
|
||||||
c.jne(fail2);
|
c.jne(fail2);
|
||||||
|
|
||||||
Label tx1 = build_transaction_enter(c, fall2, x86::r12, 666);
|
Label tx1 = build_transaction_enter(c, fall2, x86::r12d, 666);
|
||||||
c.prefetchw(x86::byte_ptr(x86::rbp, 0));
|
c.prefetchw(x86::byte_ptr(x86::rbp, 0));
|
||||||
c.prefetchw(x86::byte_ptr(x86::rbp, 64));
|
c.prefetchw(x86::byte_ptr(x86::rbp, 64));
|
||||||
|
|
||||||
// Check pause flag
|
// Check pause flag
|
||||||
c.bt(x86::dword_ptr(args[2], ::offset32(&ppu_thread::state) - ::offset32(&ppu_thread::rdata)), static_cast<u32>(cpu_flag::pause));
|
c.bt(x86::dword_ptr(args[2], ::offset32(&ppu_thread::state) - ::offset32(&ppu_thread::rdata)), static_cast<u32>(cpu_flag::pause));
|
||||||
c.jc(fail3);
|
c.jc(fall2);
|
||||||
c.mov(x86::rax, x86::qword_ptr(x86::rbx));
|
c.mov(x86::rax, x86::qword_ptr(x86::rbx));
|
||||||
c.and_(x86::rax, -128);
|
c.and_(x86::rax, -128);
|
||||||
c.cmp(x86::rax, x86::r13);
|
c.cmp(x86::rax, x86::r13);
|
||||||
@ -1504,30 +1529,47 @@ const auto ppu_stcx_accurate_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime
|
|||||||
c.ptest(x86::xmm0, x86::xmm0);
|
c.ptest(x86::xmm0, x86::xmm0);
|
||||||
}
|
}
|
||||||
|
|
||||||
c.jnz(fail2);
|
c.jnz(fail3);
|
||||||
|
|
||||||
// Store 8 bytes
|
// Store 8 bytes
|
||||||
c.mov(x86::qword_ptr(x86::rbp, args[0], 1, 0), args[3]);
|
c.mov(x86::qword_ptr(x86::rbp, args[0], 1, 0), args[3]);
|
||||||
|
|
||||||
c.xend();
|
c.xend();
|
||||||
c.lock().add(x86::qword_ptr(x86::rbx), 127);
|
c.lock().add(x86::qword_ptr(x86::rbx), 127);
|
||||||
c.lea(x86::rax, x86::qword_ptr(x86::r12, 1));
|
c.mov(x86::eax, x86::r12d);
|
||||||
c.jmp(_ret);
|
c.jmp(_ret);
|
||||||
|
|
||||||
c.bind(fall2);
|
// XABORT is expensive so try to finish with xend instead
|
||||||
c.sar(x86::eax, 24);
|
|
||||||
c.js(fail2);
|
|
||||||
c.bind(fail3);
|
c.bind(fail3);
|
||||||
|
|
||||||
|
// Load old data (unused)
|
||||||
|
if (s_tsx_avx)
|
||||||
|
{
|
||||||
|
c.vmovaps(x86::ymm0, x86::yword_ptr(x86::rbp, 0));
|
||||||
|
c.vmovaps(x86::ymm1, x86::yword_ptr(x86::rbp, 32));
|
||||||
|
c.vmovaps(x86::ymm2, x86::yword_ptr(x86::rbp, 64));
|
||||||
|
c.vmovaps(x86::ymm3, x86::yword_ptr(x86::rbp, 96));
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
c.movaps(x86::xmm0, x86::oword_ptr(x86::rbp, 0));
|
||||||
|
c.movaps(x86::xmm1, x86::oword_ptr(x86::rbp, 16));
|
||||||
|
c.movaps(x86::xmm2, x86::oword_ptr(x86::rbp, 32));
|
||||||
|
c.movaps(x86::xmm3, x86::oword_ptr(x86::rbp, 48));
|
||||||
|
c.movaps(x86::xmm4, x86::oword_ptr(x86::rbp, 64));
|
||||||
|
c.movaps(x86::xmm5, x86::oword_ptr(x86::rbp, 80));
|
||||||
|
c.movaps(x86::xmm6, x86::oword_ptr(x86::rbp, 96));
|
||||||
|
c.movaps(x86::xmm7, x86::oword_ptr(x86::rbp, 112));
|
||||||
|
}
|
||||||
|
|
||||||
|
c.xend();
|
||||||
|
c.jmp(fail2);
|
||||||
|
|
||||||
|
c.bind(fall2);
|
||||||
c.mov(x86::eax, -1);
|
c.mov(x86::eax, -1);
|
||||||
c.jmp(_ret);
|
c.jmp(_ret);
|
||||||
|
|
||||||
c.bind(fail);
|
|
||||||
build_transaction_abort(c, 0xff);
|
|
||||||
c.xor_(x86::eax, x86::eax);
|
|
||||||
c.jmp(_ret);
|
|
||||||
|
|
||||||
c.bind(fail2);
|
c.bind(fail2);
|
||||||
build_transaction_abort(c, 0xff);
|
|
||||||
c.lock().sub(x86::qword_ptr(x86::rbx), 1);
|
c.lock().sub(x86::qword_ptr(x86::rbx), 1);
|
||||||
c.xor_(x86::eax, x86::eax);
|
c.xor_(x86::eax, x86::eax);
|
||||||
//c.jmp(_ret);
|
//c.jmp(_ret);
|
||||||
|
@ -308,7 +308,7 @@ namespace spu
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const auto spu_putllc_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime, const void* _old, const void* _new)>([](asmjit::X86Assembler& c, auto& args)
|
const auto spu_putllc_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime, void* _old, const void* _new)>([](asmjit::X86Assembler& c, auto& args)
|
||||||
{
|
{
|
||||||
using namespace asmjit;
|
using namespace asmjit;
|
||||||
|
|
||||||
@ -352,7 +352,7 @@ const auto spu_putllc_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime, const
|
|||||||
|
|
||||||
// Prepare registers
|
// Prepare registers
|
||||||
c.mov(x86::rbx, imm_ptr(+vm::g_reservations));
|
c.mov(x86::rbx, imm_ptr(+vm::g_reservations));
|
||||||
c.mov(x86::rax, imm_ptr(&vm::g_base_addr));
|
c.mov(x86::rax, imm_ptr(&vm::g_sudo_addr));
|
||||||
c.mov(x86::rbp, x86::qword_ptr(x86::rax));
|
c.mov(x86::rbp, x86::qword_ptr(x86::rax));
|
||||||
c.lea(x86::rbp, x86::qword_ptr(x86::rbp, args[0]));
|
c.lea(x86::rbp, x86::qword_ptr(x86::rbp, args[0]));
|
||||||
c.prefetchw(x86::byte_ptr(x86::rbp, 0));
|
c.prefetchw(x86::byte_ptr(x86::rbp, 0));
|
||||||
@ -360,7 +360,8 @@ const auto spu_putllc_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime, const
|
|||||||
c.and_(args[0].r32(), 0xff80);
|
c.and_(args[0].r32(), 0xff80);
|
||||||
c.shr(args[0].r32(), 1);
|
c.shr(args[0].r32(), 1);
|
||||||
c.lea(x86::rbx, x86::qword_ptr(x86::rbx, args[0]));
|
c.lea(x86::rbx, x86::qword_ptr(x86::rbx, args[0]));
|
||||||
c.xor_(x86::r12d, x86::r12d);
|
c.prefetchw(x86::byte_ptr(x86::rbx));
|
||||||
|
c.mov(x86::r12d, 1);
|
||||||
c.mov(x86::r13, args[1]);
|
c.mov(x86::r13, args[1]);
|
||||||
|
|
||||||
// Prepare data
|
// Prepare data
|
||||||
@ -396,7 +397,7 @@ const auto spu_putllc_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime, const
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Begin transaction
|
// Begin transaction
|
||||||
Label tx0 = build_transaction_enter(c, fall, x86::r12, 4);
|
Label tx0 = build_transaction_enter(c, fall, x86::r12d, 4);
|
||||||
c.xbegin(tx0);
|
c.xbegin(tx0);
|
||||||
c.mov(x86::rax, x86::qword_ptr(x86::rbx));
|
c.mov(x86::rax, x86::qword_ptr(x86::rbx));
|
||||||
c.test(x86::eax, 127);
|
c.test(x86::eax, 127);
|
||||||
@ -458,22 +459,46 @@ const auto spu_putllc_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime, const
|
|||||||
|
|
||||||
c.sub(x86::qword_ptr(x86::rbx), -128);
|
c.sub(x86::qword_ptr(x86::rbx), -128);
|
||||||
c.xend();
|
c.xend();
|
||||||
c.mov(x86::eax, 1);
|
c.mov(x86::eax, x86::r12d);
|
||||||
|
c.jmp(_ret);
|
||||||
|
|
||||||
|
// XABORT is expensive so finish with xend instead
|
||||||
|
c.bind(fail);
|
||||||
|
|
||||||
|
// Load old data (unused)
|
||||||
|
if (s_tsx_avx)
|
||||||
|
{
|
||||||
|
c.vmovaps(x86::ymm0, x86::yword_ptr(x86::rbp, 0));
|
||||||
|
c.vmovaps(x86::ymm1, x86::yword_ptr(x86::rbp, 32));
|
||||||
|
c.vmovaps(x86::ymm2, x86::yword_ptr(x86::rbp, 64));
|
||||||
|
c.vmovaps(x86::ymm3, x86::yword_ptr(x86::rbp, 96));
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
c.movaps(x86::xmm0, x86::oword_ptr(x86::rbp, 0));
|
||||||
|
c.movaps(x86::xmm1, x86::oword_ptr(x86::rbp, 16));
|
||||||
|
c.movaps(x86::xmm2, x86::oword_ptr(x86::rbp, 32));
|
||||||
|
c.movaps(x86::xmm3, x86::oword_ptr(x86::rbp, 48));
|
||||||
|
c.movaps(x86::xmm4, x86::oword_ptr(x86::rbp, 64));
|
||||||
|
c.movaps(x86::xmm5, x86::oword_ptr(x86::rbp, 80));
|
||||||
|
c.movaps(x86::xmm6, x86::oword_ptr(x86::rbp, 96));
|
||||||
|
c.movaps(x86::xmm7, x86::oword_ptr(x86::rbp, 112));
|
||||||
|
}
|
||||||
|
|
||||||
|
c.xend();
|
||||||
|
c.xor_(x86::eax, x86::eax);
|
||||||
c.jmp(_ret);
|
c.jmp(_ret);
|
||||||
|
|
||||||
c.bind(skip);
|
c.bind(skip);
|
||||||
c.xor_(x86::eax, x86::eax);
|
c.xend();
|
||||||
c.xor_(x86::r12d, x86::r12d);
|
c.mov(x86::eax, _XABORT_EXPLICIT);
|
||||||
build_transaction_abort(c, 0);
|
|
||||||
//c.jmp(fall);
|
//c.jmp(fall);
|
||||||
|
|
||||||
c.bind(fall);
|
c.bind(fall);
|
||||||
c.sar(x86::eax, 24);
|
|
||||||
c.js(fail);
|
|
||||||
|
|
||||||
// Touch memory if transaction failed without RETRY flag on the first attempt
|
// Touch memory if transaction failed with status 0
|
||||||
c.cmp(x86::r12, 1);
|
c.test(x86::eax, x86::eax);
|
||||||
c.jne(next);
|
c.jnz(next);
|
||||||
c.xor_(x86::rbp, 0xf80);
|
c.xor_(x86::rbp, 0xf80);
|
||||||
c.lock().add(x86::dword_ptr(x86::rbp), 0);
|
c.lock().add(x86::dword_ptr(x86::rbp), 0);
|
||||||
c.xor_(x86::rbp, 0xf80);
|
c.xor_(x86::rbp, 0xf80);
|
||||||
@ -495,13 +520,13 @@ const auto spu_putllc_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime, const
|
|||||||
c.cmp(x86::rax, x86::r13);
|
c.cmp(x86::rax, x86::r13);
|
||||||
c.jne(fail2);
|
c.jne(fail2);
|
||||||
|
|
||||||
Label tx1 = build_transaction_enter(c, fall2, x86::r12, 666);
|
Label tx1 = build_transaction_enter(c, fall2, x86::r12d, 666);
|
||||||
c.prefetchw(x86::byte_ptr(x86::rbp, 0));
|
c.prefetchw(x86::byte_ptr(x86::rbp, 0));
|
||||||
c.prefetchw(x86::byte_ptr(x86::rbp, 64));
|
c.prefetchw(x86::byte_ptr(x86::rbp, 64));
|
||||||
|
|
||||||
// Check pause flag
|
// Check pause flag
|
||||||
c.bt(x86::dword_ptr(args[2], ::offset32(&spu_thread::state) - ::offset32(&spu_thread::rdata)), static_cast<u32>(cpu_flag::pause));
|
c.bt(x86::dword_ptr(args[2], ::offset32(&spu_thread::state) - ::offset32(&spu_thread::rdata)), static_cast<u32>(cpu_flag::pause));
|
||||||
c.jc(fail3);
|
c.jc(fall2);
|
||||||
c.mov(x86::rax, x86::qword_ptr(x86::rbx));
|
c.mov(x86::rax, x86::qword_ptr(x86::rbx));
|
||||||
c.and_(x86::rax, -128);
|
c.and_(x86::rax, -128);
|
||||||
c.cmp(x86::rax, x86::r13);
|
c.cmp(x86::rax, x86::r13);
|
||||||
@ -539,7 +564,7 @@ const auto spu_putllc_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime, const
|
|||||||
c.ptest(x86::xmm0, x86::xmm0);
|
c.ptest(x86::xmm0, x86::xmm0);
|
||||||
}
|
}
|
||||||
|
|
||||||
c.jnz(fail2);
|
c.jnz(fail3);
|
||||||
|
|
||||||
if (s_tsx_avx)
|
if (s_tsx_avx)
|
||||||
{
|
{
|
||||||
@ -562,23 +587,40 @@ const auto spu_putllc_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime, const
|
|||||||
|
|
||||||
c.xend();
|
c.xend();
|
||||||
c.lock().add(x86::qword_ptr(x86::rbx), 127);
|
c.lock().add(x86::qword_ptr(x86::rbx), 127);
|
||||||
c.lea(x86::rax, x86::qword_ptr(x86::r12, 1));
|
c.mov(x86::eax, x86::r12d);
|
||||||
c.jmp(_ret);
|
c.jmp(_ret);
|
||||||
|
|
||||||
c.bind(fall2);
|
// XABORT is expensive so try to finish with xend instead
|
||||||
c.sar(x86::eax, 24);
|
|
||||||
c.js(fail2);
|
|
||||||
c.bind(fail3);
|
c.bind(fail3);
|
||||||
|
|
||||||
|
// Load old data (unused)
|
||||||
|
if (s_tsx_avx)
|
||||||
|
{
|
||||||
|
c.vmovaps(x86::ymm0, x86::yword_ptr(x86::rbp, 0));
|
||||||
|
c.vmovaps(x86::ymm1, x86::yword_ptr(x86::rbp, 32));
|
||||||
|
c.vmovaps(x86::ymm2, x86::yword_ptr(x86::rbp, 64));
|
||||||
|
c.vmovaps(x86::ymm3, x86::yword_ptr(x86::rbp, 96));
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
c.movaps(x86::xmm0, x86::oword_ptr(x86::rbp, 0));
|
||||||
|
c.movaps(x86::xmm1, x86::oword_ptr(x86::rbp, 16));
|
||||||
|
c.movaps(x86::xmm2, x86::oword_ptr(x86::rbp, 32));
|
||||||
|
c.movaps(x86::xmm3, x86::oword_ptr(x86::rbp, 48));
|
||||||
|
c.movaps(x86::xmm4, x86::oword_ptr(x86::rbp, 64));
|
||||||
|
c.movaps(x86::xmm5, x86::oword_ptr(x86::rbp, 80));
|
||||||
|
c.movaps(x86::xmm6, x86::oword_ptr(x86::rbp, 96));
|
||||||
|
c.movaps(x86::xmm7, x86::oword_ptr(x86::rbp, 112));
|
||||||
|
}
|
||||||
|
|
||||||
|
c.xend();
|
||||||
|
c.jmp(fail2);
|
||||||
|
|
||||||
|
c.bind(fall2);
|
||||||
c.mov(x86::eax, -1);
|
c.mov(x86::eax, -1);
|
||||||
c.jmp(_ret);
|
c.jmp(_ret);
|
||||||
|
|
||||||
c.bind(fail);
|
|
||||||
build_transaction_abort(c, 0xff);
|
|
||||||
c.xor_(x86::eax, x86::eax);
|
|
||||||
c.jmp(_ret);
|
|
||||||
|
|
||||||
c.bind(fail2);
|
c.bind(fail2);
|
||||||
build_transaction_abort(c, 0xff);
|
|
||||||
c.lock().sub(x86::qword_ptr(x86::rbx), 1);
|
c.lock().sub(x86::qword_ptr(x86::rbx), 1);
|
||||||
c.xor_(x86::eax, x86::eax);
|
c.xor_(x86::eax, x86::eax);
|
||||||
//c.jmp(_ret);
|
//c.jmp(_ret);
|
||||||
@ -649,7 +691,7 @@ const auto spu_putlluc_tx = build_function_asm<u32(*)(u32 raddr, const void* rda
|
|||||||
|
|
||||||
// Prepare registers
|
// Prepare registers
|
||||||
c.mov(x86::rbx, imm_ptr(+vm::g_reservations));
|
c.mov(x86::rbx, imm_ptr(+vm::g_reservations));
|
||||||
c.mov(x86::rax, imm_ptr(&vm::g_base_addr));
|
c.mov(x86::rax, imm_ptr(&vm::g_sudo_addr));
|
||||||
c.mov(x86::rbp, x86::qword_ptr(x86::rax));
|
c.mov(x86::rbp, x86::qword_ptr(x86::rax));
|
||||||
c.lea(x86::rbp, x86::qword_ptr(x86::rbp, args[0]));
|
c.lea(x86::rbp, x86::qword_ptr(x86::rbp, args[0]));
|
||||||
c.prefetchw(x86::byte_ptr(x86::rbp, 0));
|
c.prefetchw(x86::byte_ptr(x86::rbp, 0));
|
||||||
@ -657,7 +699,8 @@ const auto spu_putlluc_tx = build_function_asm<u32(*)(u32 raddr, const void* rda
|
|||||||
c.and_(args[0].r32(), 0xff80);
|
c.and_(args[0].r32(), 0xff80);
|
||||||
c.shr(args[0].r32(), 1);
|
c.shr(args[0].r32(), 1);
|
||||||
c.lea(x86::rbx, x86::qword_ptr(x86::rbx, args[0]));
|
c.lea(x86::rbx, x86::qword_ptr(x86::rbx, args[0]));
|
||||||
c.xor_(x86::r12d, x86::r12d);
|
c.prefetchw(x86::byte_ptr(x86::rbx));
|
||||||
|
c.mov(x86::r12d, 1);
|
||||||
c.mov(x86::r13, args[1]);
|
c.mov(x86::r13, args[1]);
|
||||||
|
|
||||||
// Prepare data
|
// Prepare data
|
||||||
@ -681,7 +724,7 @@ const auto spu_putlluc_tx = build_function_asm<u32(*)(u32 raddr, const void* rda
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Begin transaction
|
// Begin transaction
|
||||||
Label tx0 = build_transaction_enter(c, fall, x86::r12, 8);
|
Label tx0 = build_transaction_enter(c, fall, x86::r12d, 8);
|
||||||
c.xbegin(tx0);
|
c.xbegin(tx0);
|
||||||
c.test(x86::qword_ptr(x86::rbx), vm::rsrv_unique_lock);
|
c.test(x86::qword_ptr(x86::rbx), vm::rsrv_unique_lock);
|
||||||
c.jnz(skip);
|
c.jnz(skip);
|
||||||
@ -711,16 +754,14 @@ const auto spu_putlluc_tx = build_function_asm<u32(*)(u32 raddr, const void* rda
|
|||||||
c.jmp(_ret);
|
c.jmp(_ret);
|
||||||
|
|
||||||
c.bind(skip);
|
c.bind(skip);
|
||||||
c.xor_(x86::eax, x86::eax);
|
c.xend();
|
||||||
c.xor_(x86::r12d, x86::r12d);
|
|
||||||
build_transaction_abort(c, 0);
|
|
||||||
//c.jmp(fall);
|
//c.jmp(fall);
|
||||||
|
|
||||||
c.bind(fall);
|
c.bind(fall);
|
||||||
|
|
||||||
// Touch memory if transaction failed without RETRY flag on the first attempt
|
// Touch memory if transaction failed with status 0
|
||||||
c.cmp(x86::r12, 1);
|
c.test(x86::eax, x86::eax);
|
||||||
c.jne(next);
|
c.jnz(next);
|
||||||
c.xor_(x86::rbp, 0xf80);
|
c.xor_(x86::rbp, 0xf80);
|
||||||
c.lock().add(x86::dword_ptr(x86::rbp), 0);
|
c.lock().add(x86::dword_ptr(x86::rbp), 0);
|
||||||
c.xor_(x86::rbp, 0xf80);
|
c.xor_(x86::rbp, 0xf80);
|
||||||
@ -736,7 +777,7 @@ const auto spu_putlluc_tx = build_function_asm<u32(*)(u32 raddr, const void* rda
|
|||||||
c.test(x86::eax, vm::rsrv_unique_lock);
|
c.test(x86::eax, vm::rsrv_unique_lock);
|
||||||
c.jnz(fall2);
|
c.jnz(fall2);
|
||||||
|
|
||||||
Label tx1 = build_transaction_enter(c, fall2, x86::r12, 666);
|
Label tx1 = build_transaction_enter(c, fall2, x86::r12d, 666);
|
||||||
c.prefetchw(x86::byte_ptr(x86::rbp, 0));
|
c.prefetchw(x86::byte_ptr(x86::rbp, 0));
|
||||||
c.prefetchw(x86::byte_ptr(x86::rbp, 64));
|
c.prefetchw(x86::byte_ptr(x86::rbp, 64));
|
||||||
|
|
||||||
@ -766,7 +807,7 @@ const auto spu_putlluc_tx = build_function_asm<u32(*)(u32 raddr, const void* rda
|
|||||||
|
|
||||||
c.xend();
|
c.xend();
|
||||||
c.lock().add(x86::qword_ptr(x86::rbx), 127);
|
c.lock().add(x86::qword_ptr(x86::rbx), 127);
|
||||||
c.lea(x86::rax, x86::qword_ptr(x86::r12, 1));
|
c.mov(x86::eax, x86::r12d);
|
||||||
c.jmp(_ret);
|
c.jmp(_ret);
|
||||||
|
|
||||||
c.bind(fall2);
|
c.bind(fall2);
|
||||||
@ -824,17 +865,17 @@ const extern auto spu_getllar_tx = build_function_asm<u32(*)(u32 raddr, void* rd
|
|||||||
|
|
||||||
// Prepare registers
|
// Prepare registers
|
||||||
c.mov(x86::rbx, imm_ptr(+vm::g_reservations));
|
c.mov(x86::rbx, imm_ptr(+vm::g_reservations));
|
||||||
c.mov(x86::rax, imm_ptr(&vm::g_base_addr));
|
c.mov(x86::rax, imm_ptr(&vm::g_sudo_addr));
|
||||||
c.mov(x86::rbp, x86::qword_ptr(x86::rax));
|
c.mov(x86::rbp, x86::qword_ptr(x86::rax));
|
||||||
c.lea(x86::rbp, x86::qword_ptr(x86::rbp, args[0]));
|
c.lea(x86::rbp, x86::qword_ptr(x86::rbp, args[0]));
|
||||||
c.and_(args[0].r32(), 0xff80);
|
c.and_(args[0].r32(), 0xff80);
|
||||||
c.shr(args[0].r32(), 1);
|
c.shr(args[0].r32(), 1);
|
||||||
c.lea(x86::rbx, x86::qword_ptr(x86::rbx, args[0]));
|
c.lea(x86::rbx, x86::qword_ptr(x86::rbx, args[0]));
|
||||||
c.xor_(x86::r12d, x86::r12d);
|
c.mov(x86::r12d, 1);
|
||||||
c.mov(x86::r13, args[1]);
|
c.mov(x86::r13, args[1]);
|
||||||
|
|
||||||
// Begin transaction
|
// Begin transaction
|
||||||
Label tx0 = build_transaction_enter(c, fall, x86::r12, 8);
|
Label tx0 = build_transaction_enter(c, fall, x86::r12d, 8);
|
||||||
|
|
||||||
// Check pause flag
|
// Check pause flag
|
||||||
c.bt(x86::dword_ptr(args[2], ::offset32(&cpu_thread::state)), static_cast<u32>(cpu_flag::pause));
|
c.bt(x86::dword_ptr(args[2], ::offset32(&cpu_thread::state)), static_cast<u32>(cpu_flag::pause));
|
||||||
|
@ -211,11 +211,15 @@ static const auto commit_tx = build_function_asm<s32(*)(const stx::multi_cas_ite
|
|||||||
|
|
||||||
// Transaction abort
|
// Transaction abort
|
||||||
c.bind(stop);
|
c.bind(stop);
|
||||||
build_transaction_abort(c, 0xff);
|
c.xend();
|
||||||
|
c.xor_(x86::eax, x86::eax);
|
||||||
|
c.jmp(fall);
|
||||||
|
|
||||||
// Abort when there is still a chance of success
|
// Abort when there is still a chance of success
|
||||||
c.bind(wait);
|
c.bind(wait);
|
||||||
build_transaction_abort(c, 0x00);
|
c.xend();
|
||||||
|
c.mov(x86::eax, 0xffu << 24);
|
||||||
|
c.jmp(fall);
|
||||||
|
|
||||||
// Transaction fallback: return zero
|
// Transaction fallback: return zero
|
||||||
c.bind(fall);
|
c.bind(fall);
|
||||||
|
Loading…
x
Reference in New Issue
Block a user