From b68bdafadc6b3d0d942c9ca4576937c75c5d31e6 Mon Sep 17 00:00:00 2001
From: Nekotekina
Date: Sun, 8 Nov 2020 00:14:53 +0300
Subject: [PATCH] vm: refactor vm::range_lock again

Move the flag bits to the highest bits, in RWX order.
Use only one reserved value (W = locked).
Assume lock size 128 for range_locked.
Add a new "Size" template argument that replaces the normal size argument.
---
 rpcs3/Emu/Cell/SPUThread.cpp  |  8 ++---
 rpcs3/Emu/Memory/vm.cpp       | 68 ++++++++++++++++++++++-------------
 rpcs3/Emu/Memory/vm_locking.h | 40 +++++++++++++--------
 3 files changed, 72 insertions(+), 44 deletions(-)
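With this layout, g_range_lock packs the lock address into the low 32 bits, the size into bits 32..60, and the three permission flags into the top three bits (R, W, X from bit 63 down); the reserved range_locked value (the W bit alone) carries no explicit size and implies a fixed 128-byte window. A minimal stand-alone sketch of that decoding, mirroring the constants from the vm_locking.h hunk below (the helper and struct names are illustrative, not part of the patch):

// Minimal sketch of the new g_range_lock layout (constants mirror vm_locking.h below).
#include <cstdint>

using u32 = std::uint32_t;
using u64 = std::uint64_t;

constexpr u64 range_pos       = 61;
constexpr u64 range_bits      = 3;
constexpr u64 range_full_mask = 7ull << range_pos; // R, W, X occupy bits 63..61
constexpr u64 range_locked    = 2ull << range_pos; // W alone = exclusively locked, size is implicit

struct decoded_range_lock
{
	u32 addr;  // low 32 bits
	u32 size;  // bits 32..60, or 128 when range_locked
	u64 flags; // top 3 bits
};

// Hypothetical helper (not part of the patch): shows how the fields are unpacked.
constexpr decoded_range_lock decode_range_lock(u64 lock_val)
{
	decoded_range_lock r{};
	r.addr  = static_cast<u32>(lock_val);
	r.size  = static_cast<u32>(lock_val << range_bits >> (range_bits + 32));
	r.flags = lock_val & range_full_mask;

	if ((lock_val & range_full_mask) == range_locked)
	{
		r.size = 128; // reserved value: size extends addr by a fixed 128 bytes
	}

	return r;
}

Because range_locked no longer stores a size, the reservation path in vm.cpp below only has to publish the address: g_range_lock = addr | range_locked;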
diff --git a/rpcs3/Emu/Cell/SPUThread.cpp b/rpcs3/Emu/Cell/SPUThread.cpp
index 67fe63d850..0d67a46411 100644
--- a/rpcs3/Emu/Cell/SPUThread.cpp
+++ b/rpcs3/Emu/Cell/SPUThread.cpp
@@ -2167,28 +2167,28 @@ void spu_thread::do_dma_transfer(spu_thread* _this, const spu_mfc_cmd& args, u8*
 	{
 	case 1:
 	{
-		vm::range_lock(range_lock, eal, 1);
+		vm::range_lock<true, 1>(range_lock, eal, 1);
 		*reinterpret_cast<u8*>(dst) = *reinterpret_cast<const u8*>(src);
 		range_lock->release(0);
 		break;
 	}
 	case 2:
 	{
-		vm::range_lock(range_lock, eal, 2);
+		vm::range_lock<true, 2>(range_lock, eal, 2);
 		*reinterpret_cast<u16*>(dst) = *reinterpret_cast<const u16*>(src);
 		range_lock->release(0);
 		break;
 	}
 	case 4:
 	{
-		vm::range_lock(range_lock, eal, 4);
+		vm::range_lock<true, 4>(range_lock, eal, 4);
 		*reinterpret_cast<u32*>(dst) = *reinterpret_cast<const u32*>(src);
 		range_lock->release(0);
 		break;
 	}
 	case 8:
 	{
-		vm::range_lock(range_lock, eal, 8);
+		vm::range_lock<true, 8>(range_lock, eal, 8);
 		*reinterpret_cast<u64*>(dst) = *reinterpret_cast<const u64*>(src);
 		range_lock->release(0);
 		break;
diff --git a/rpcs3/Emu/Memory/vm.cpp b/rpcs3/Emu/Memory/vm.cpp
index 25babbf44c..b373fe66b8 100644
--- a/rpcs3/Emu/Memory/vm.cpp
+++ b/rpcs3/Emu/Memory/vm.cpp
@@ -167,17 +167,24 @@ namespace vm
 	{
 		const u64 lock_val = g_range_lock.load();
 		const u64 is_shared = g_shareable[begin >> 16].load();
-		const u64 lock_addr = static_cast<u32>(lock_val); // -> u64
-		const u32 lock_size = static_cast<u32>(lock_val >> 35);
+
+		u64 lock_addr = static_cast<u32>(lock_val); // -> u64
+		u32 lock_size = static_cast<u32>(lock_val << range_bits >> (range_bits + 32));
 
 		u64 addr = begin;
 
-		if (is_shared)
+		if ((lock_val & range_full_mask) == range_locked) [[likely]]
 		{
-			addr = addr & 0xffff;
+			lock_size = 128;
+
+			if (is_shared)
+			{
+				addr = addr & 0xffff;
+				lock_addr = lock_val << 3 >> 3;
+			}
 		}
 
-		if ((lock_val & range_full_mask) != range_locked || addr + size <= lock_addr || addr >= lock_addr + lock_size) [[likely]]
+		if (addr + size <= lock_addr || addr >= lock_addr + lock_size) [[likely]]
 		{
 			range_lock->store(begin | (u64{size} << 32));
 
@@ -249,11 +256,11 @@ namespace vm
 	}
 
 	template <typename F>
-	FORCE_INLINE static u64 for_all_range_locks(F func)
+	FORCE_INLINE static u64 for_all_range_locks(u64 input, F func)
 	{
-		u64 result = 0;
+		u64 result = input;
 
-		for (u64 bits = g_range_lock_bits.load(); bits; bits &= bits - 1)
+		for (u64 bits = input; bits; bits &= bits - 1)
 		{
 			const u32 id = std::countr_zero(bits);
 
@@ -263,8 +270,13 @@ namespace vm
 			{
 				const u32 addr = static_cast<u32>(lock_val);
 
-				result += func(addr, size);
+				if (func(addr, size)) [[unlikely]]
+				{
+					continue;
+				}
 			}
+
+			result &= ~(1ull << id);
 		}
 
 		return result;
 	}
@@ -287,20 +299,20 @@ namespace vm
 		}
 
 		// Block or signal new range locks
-		g_range_lock = addr | u64{size} << 35 | flags;
+		g_range_lock = addr | u64{size} << 32 | flags;
+
+		_mm_prefetch(g_range_lock_set + 0, _MM_HINT_T0);
+		_mm_prefetch(g_range_lock_set + 2, _MM_HINT_T0);
+		_mm_prefetch(g_range_lock_set + 4, _MM_HINT_T0);
 
 		const auto range = utils::address_range::start_length(addr, size);
 
-		while (true)
-		{
-			const u64 bads = for_all_range_locks([&](u32 addr2, u32 size2)
-			{
-				// TODO (currently not possible): handle 2 64K pages (inverse range), or more pages
-				if (g_shareable[addr2 >> 16])
-				{
-					addr2 &= 0xffff;
-				}
+		u64 to_clear = g_range_lock_bits.load();
 
+		while (to_clear)
+		{
+			to_clear = for_all_range_locks(to_clear, [&](u32 addr2, u32 size2)
+			{
 				ASSUME(size2);
 
 				if (range.overlaps(utils::address_range::start_length(addr2, size2))) [[unlikely]]
@@ -311,7 +323,7 @@ namespace vm
 				return 0;
 			});
 
-			if (!bads) [[likely]]
+			if (!to_clear) [[likely]]
 			{
 				break;
 			}
@@ -477,22 +489,28 @@ namespace vm
 			}
 		}
 
-		if (g_shareable[addr >> 16])
+		if (g_shareable[addr >> 16]) [[unlikely]]
 		{
 			// Reservation address in shareable memory range
 			addr = addr & 0xffff;
 		}
 
-		g_range_lock = addr | (u64{128} << 35) | range_locked;
+		g_range_lock = addr | range_locked;
+
+		_mm_prefetch(g_range_lock_set + 0, _MM_HINT_T0);
+		_mm_prefetch(g_range_lock_set + 2, _MM_HINT_T0);
+		_mm_prefetch(g_range_lock_set + 4, _MM_HINT_T0);
 
 		const auto range = utils::address_range::start_length(addr, 128);
 
+		u64 to_clear = g_range_lock_bits.load();
+
 		while (true)
 		{
-			const u64 bads = for_all_range_locks([&](u32 addr2, u32 size2)
+			to_clear = for_all_range_locks(to_clear, [&](u32 addr2, u32 size2)
 			{
 				// TODO (currently not possible): handle 2 64K pages (inverse range), or more pages
-				if (g_shareable[addr2 >> 16])
+				if (g_shareable[addr2 >> 16]) [[unlikely]]
 				{
 					addr2 &= 0xffff;
 				}
@@ -507,7 +525,7 @@ namespace vm
 				return 0;
 			});
 
-			if (!bads) [[likely]]
+			if (!to_clear) [[likely]]
 			{
 				break;
 			}
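In vm.cpp, for_all_range_locks now takes the bitmask of per-thread range-lock slots that still need checking and returns the subset that is still conflicting, so the caller only rescans slots it has not yet cleared. A simplified, self-contained model of that contract and of the caller's retry loop (the slot count, the dummy lock array and the helper names are illustrative, not the actual globals):

// Simplified model of the new for_all_range_locks() contract (not the actual vm.cpp code).
#include <atomic>
#include <bit>
#include <cstdint>

using u32 = std::uint32_t;
using u64 = std::uint64_t;

constexpr u32 slot_count = 64;         // assumption: one bit per range-lock slot
std::atomic<u64> lock_set[slot_count]; // per slot: addr in low 32 bits, size in high 32 bits

// Returns the subset of 'input' whose slots could not be cleared (func(addr, size) != 0).
template <typename F>
u64 for_all_range_locks(u64 input, F func)
{
	u64 result = input;

	for (u64 bits = input; bits; bits &= bits - 1)
	{
		const u32 id = std::countr_zero(bits);
		const u64 lock_val = lock_set[id].load();

		if (const u32 size = static_cast<u32>(lock_val >> 32))
		{
			const u32 addr = static_cast<u32>(lock_val);

			if (func(addr, size))
			{
				continue; // still conflicting: keep this bit set in the result
			}
		}

		result &= ~(1ull << id);
	}

	return result;
}

// Caller pattern from the vm.cpp hunks above: rescan only the slots that still conflict.
// u64 to_clear = g_range_lock_bits.load();
// while (to_clear)
// {
//     to_clear = for_all_range_locks(to_clear, [&](u32 addr2, u32 size2) { return conflicts(addr2, size2); });
//     if (!to_clear) break;
//     // brief pause before retrying
// }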
diff --git a/rpcs3/Emu/Memory/vm_locking.h b/rpcs3/Emu/Memory/vm_locking.h
index d0d1e7ab5a..204822da58 100644
--- a/rpcs3/Emu/Memory/vm_locking.h
+++ b/rpcs3/Emu/Memory/vm_locking.h
@@ -13,18 +13,20 @@ namespace vm
 
 	enum range_lock_flags : u64
 	{
-		/* flags (3 bits) */
+		/* flags (3 bits, RWX) */
 
-		range_readable = 1ull << 32,
-		range_writable = 2ull << 32,
-		range_executable = 4ull << 32,
-		range_full_mask = 7ull << 32,
+		range_readable = 4ull << 61,
+		range_writable = 2ull << 61,
+		range_executable = 1ull << 61,
+		range_full_mask = 7ull << 61,
 
 		/* flag combinations with special meaning */
 
-		range_normal = 3ull << 32, // R+W, testing as mask for zero can check no access
-		range_locked = 2ull << 32, // R+W as well, the only range flag that should block by address
+		range_locked = 2ull << 61, // R+W as well, but being exclusively accessed (size extends addr)
 		range_allocation = 0, // Allocation, no safe access, g_shareable may change at ANY location
+
+		range_pos = 61,
+		range_bits = 3,
 	};
 
 	extern atomic_t<u64> g_range_lock;
@@ -40,27 +42,35 @@ namespace vm
 	void range_lock_internal(atomic_t<u64>* range_lock, u32 begin, u32 size);
 
 	// Lock memory range
-	template <bool TouchMem = true>
-	FORCE_INLINE void range_lock(atomic_t<u64>* range_lock, u32 begin, u32 size)
+	template <bool TouchMem = true, uint Size = 0>
+	FORCE_INLINE void range_lock(atomic_t<u64>* range_lock, u32 begin, u32 _size)
 	{
+		const u32 size = Size ? Size : _size;
 		const u64 lock_val = g_range_lock.load();
 #ifndef _MSC_VER
 		__asm__(""); // Tiny barrier
 #endif
 		const u64 is_shared = g_shareable[begin >> 16].load();
-		const u64 lock_addr = static_cast<u32>(lock_val); // -> u64
-		const u32 lock_size = static_cast<u32>(lock_val >> 35);
+
+		u64 lock_addr = static_cast<u32>(lock_val); // -> u64
+		u32 lock_size = static_cast<u32>(lock_val << range_bits >> (32 + range_bits));
 
 		u64 addr = begin;
 
 		// Optimization: if range_locked is not used, the addr check will always pass
 		// Otherwise, g_shareable is unchanged and its value is reliable to read
-		if (is_shared)
+		if ((lock_val >> range_pos) == (range_locked >> range_pos)) [[likely]]
 		{
-			addr = addr & 0xffff;
+			lock_size = 128;
+
+			if (TouchMem && is_shared) [[unlikely]]
+			{
+				addr = addr & 0xffff;
+				lock_addr = lock_val << range_bits >> range_bits;
+			}
 		}
 
-		if (addr + size <= lock_addr || addr >= lock_addr + lock_size || (TouchMem && ((lock_val >> 32) ^ (range_locked >> 32)) & (range_full_mask >> 32))) [[likely]]
+		if (addr + size <= lock_addr || addr >= lock_addr + lock_size) [[likely]]
 		{
 			// Optimistic locking.
 			// Note that we store the range we will be accessing, without any clamping.
@@ -77,7 +87,7 @@ namespace vm
 				range_lock->release(0);
 			}
 		}
-
+
 		return;
 	}
 
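The new Size template parameter exists so that callers which know the access size at compile time (the 1/2/4/8-byte SPU DMA stores above) get a constant folded into the fast-path range checks, while other callers keep passing the size at runtime. A stand-alone sketch of that dispatch pattern (the stub below only mirrors the signature; it is not the real locking code):

// Stand-alone illustration of the "Size replaces the normal argument" pattern.
#include <cstdint>
#include <cstdio>

using u32 = std::uint32_t;

template <bool TouchMem = true, u32 Size = 0>
inline void range_lock_stub(u32 begin, u32 _size)
{
	// A non-zero Size replaces the runtime argument, so the fast-path
	// checks in the real vm::range_lock see a compile-time constant.
	const u32 size = Size ? Size : _size;

	std::printf("lock [0x%x, 0x%x)\n", begin, begin + size);
}

int main()
{
	range_lock_stub<true, 8>(0x10000, 8); // fixed-size call, as in the SPUThread.cpp hunk
	range_lock_stub(0x10000, 256);        // runtime size, Size defaults to 0
}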