From c491b73f3a7bc5dfd17ed8c96b44f39ae11e72e0 Mon Sep 17 00:00:00 2001
From: Nekotekina
Date: Tue, 27 Oct 2020 21:34:08 +0300
Subject: [PATCH] SPU: improve accurate DMA

Remove vm::reservation_lock from accurate DMA.
Use lock bits to prevent memory clobbering in GETLLAR.
Improve u128 for MSVC since it's used for bitlocking.
Improve 128-bit atomics for the same reason.
Improve vm::reservation_op and friends.
---
 Utilities/types.h                 |  27 ++++++
 rpcs3/Emu/CMakeLists.txt          |   4 -
 rpcs3/Emu/Cell/PPUThread.cpp      |   2 +-
 rpcs3/Emu/Cell/SPUThread.cpp      | 132 ++++++++++++++++++++++++++++--
 rpcs3/Emu/Memory/vm.cpp           |  13 ++-
 rpcs3/Emu/Memory/vm_reservation.h |  36 ++++----
 rpcs3/util/atomic.hpp             |  83 +++++++++++++++++--
 7 files changed, 264 insertions(+), 33 deletions(-)

diff --git a/Utilities/types.h b/Utilities/types.h
index a040aa400c..cd99e4cb1a 100644
--- a/Utilities/types.h
+++ b/Utilities/types.h
@@ -304,6 +304,11 @@ struct alignas(16) u128
 	{
 	}
 
+	constexpr explicit operator bool() const noexcept
+	{
+		return !!(lo | hi);
+	}
+
 	friend u128 operator+(const u128& l, const u128& r)
 	{
 		u128 value;
@@ -384,6 +389,28 @@ struct alignas(16) u128
 		return value;
 	}
 
+	u128 operator<<(u128 shift_value)
+	{
+		const u64 v0 = lo << (shift_value.lo & 63);
+		const u64 v1 = __shiftleft128(lo, hi, shift_value.lo);
+
+		u128 value;
+		value.lo = (shift_value.lo & 64) ? 0 : v0;
+		value.hi = (shift_value.lo & 64) ? v0 : v1;
+		return value;
+	}
+
+	u128 operator>>(u128 shift_value)
+	{
+		const u64 v0 = hi >> (shift_value.lo & 63);
+		const u64 v1 = __shiftright128(lo, hi, shift_value.lo);
+
+		u128 value;
+		value.lo = (shift_value.lo & 64) ? v0 : v1;
+		value.hi = (shift_value.lo & 64) ? 0 : v0;
+		return value;
+	}
+
 	u128 operator~() const
 	{
 		u128 value;
diff --git a/rpcs3/Emu/CMakeLists.txt b/rpcs3/Emu/CMakeLists.txt
index 8c77f9aaa4..d6d089b949 100644
--- a/rpcs3/Emu/CMakeLists.txt
+++ b/rpcs3/Emu/CMakeLists.txt
@@ -339,10 +339,6 @@ target_link_libraries(rpcs3_emu
 	PRIVATE
 		3rdparty::stblib 3rdparty::libpng)
 
-if(CMAKE_COMPILER_IS_GNUCXX)
-	target_link_libraries(rpcs3_emu PUBLIC -latomic)
-endif()
-
 # CPU
 target_sources(rpcs3_emu
 	PRIVATE
diff --git a/rpcs3/Emu/Cell/PPUThread.cpp b/rpcs3/Emu/Cell/PPUThread.cpp
index 51d759f02b..7a9ed9e56c 100644
--- a/rpcs3/Emu/Cell/PPUThread.cpp
+++ b/rpcs3/Emu/Cell/PPUThread.cpp
@@ -1199,7 +1199,7 @@ static T ppu_load_acquire_reservation(ppu_thread& ppu, u32 addr)
 		verify(HERE), cpu_thread::if_suspended<-1>(&ppu, [&]()
 		{
 			// Guaranteed success
-			ppu.rtime = vm::reservation_acquire(addr, sizeof(T)) & -128;
+			ppu.rtime = vm::reservation_acquire(addr, sizeof(T));
 			mov_rdata(ppu.rdata, *vm::get_super_ptr<spu_rdata_t>(addr & -128));
 		});
 
diff --git a/rpcs3/Emu/Cell/SPUThread.cpp b/rpcs3/Emu/Cell/SPUThread.cpp
index 95753f9f2e..823fa8cdea 100644
--- a/rpcs3/Emu/Cell/SPUThread.cpp
+++ b/rpcs3/Emu/Cell/SPUThread.cpp
@@ -1573,7 +1573,10 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
 	}
 
 	// Keep src point to const
-	auto [dst, src] = [&]() -> std::pair<u8*, const u8*>
+	u8* dst = nullptr;
+	const u8* src = nullptr;
+
+	std::tie(dst, src) = [&]() -> std::pair<u8*, const u8*>
 	{
 		u8* dst = vm::_ptr<u8>(eal);
 		u8* src = _ptr<u8>(lsa);
@@ -1596,6 +1599,15 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
 
 	if ((!g_use_rtm && !is_get) || g_cfg.core.spu_accurate_dma) [[unlikely]]
 	{
+		perf_meter<"ADMA_GET"_u64> perf_get;
+		perf_meter<"ADMA_PUT"_u64> perf_put = perf_get;
+
+		if (!g_cfg.core.spu_accurate_dma) [[likely]]
+		{
+			perf_put.reset();
+			perf_get.reset();
+		}
+
 		for (u32 size = args.size, size0; is_get;
 			size -= size0, dst += size0, src += size0, eal += size0)
 		{
@@ -1613,7 +1625,9 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
 				}
 				else
 				{
+					state += cpu_flag::wait + cpu_flag::temp;
 					std::this_thread::yield();
+					check_state();
 				}
 			}())
 			{
@@ -1728,8 +1742,102 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
 				continue;
 			}
 
-			// Lock each cache line execlusively
-			auto [res, time0] = vm::reservation_lock(eal);
+			// Lock each cache line
+			auto& res = vm::reservation_acquire(eal, size0);
+
+			// Lock each bit corresponding to a byte being written, using some free space in reservation memory
+			auto* bits = reinterpret_cast<atomic_t<u128>*>(vm::g_reservations + (eal & 0xff80) / 2 + 16);
+
+			// Get writing mask
+			const u128 wmask = (~u128{} << (eal & 127)) & (~u128{} >> (127 - ((eal + size0 - 1) & 127)));
+			//const u64 start = (eal & 127) / 2;
+			//const u64 _end_ = ((eal + size0 - 1) & 127) / 2;
+			//const u64 wmask = (UINT64_MAX << start) & (UINT64_MAX >> (63 - _end_));
+
+			u128 old = 0;
+
+			for (u64 i = 0; i != umax; [&]()
+			{
+				if (state & cpu_flag::pause)
+				{
+					cpu_thread::if_suspended(this, [&]
+					{
+						std::memcpy(dst, src, size0);
+						res += 128;
+					});
+
+					// Exit loop and function
+					i = -1;
+					bits = nullptr;
+					return;
+				}
+				else if (++i < 10)
+				{
+					busy_wait(500);
+				}
+				else
+				{
+					// Wait
+					state += cpu_flag::wait + cpu_flag::temp;
+					bits->wait(old, wmask);
+					check_state();
+				}
+			}())
+			{
+				// Completed in suspend_all()
+				if (!bits)
+				{
+					break;
+				}
+
+				bool ok = false;
+
+				std::tie(old, ok) = bits->fetch_op([&](auto& v)
+				{
+					if (v & wmask)
+					{
+						return false;
+					}
+
+					v |= wmask;
+					return true;
+				});
+
+				if (ok) [[likely]]
+				{
+					break;
+				}
+			}
+
+			if (!bits)
+			{
+				if (size == size0)
+				{
+					break;
+				}
+
+				continue;
+			}
+
+			// Lock reservation (shared)
+			auto [_oldd, _ok] = res.fetch_op([&](u64& r)
+			{
+				if (r & vm::rsrv_unique_lock)
+				{
+					return false;
+				}
+
+				r += 1;
+				return true;
+			});
+
+			if (!_ok)
+			{
+				vm::reservation_shared_lock_internal(res);
+			}
+
+			// Obtain range lock as normal store
+			vm::range_lock(range_lock, eal, size0);
 
 			switch (size0)
 			{
@@ -1777,7 +1885,17 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
 				}
 			}
 
-			res += 64;
+			range_lock->release(0);
+
+			res += 127;
+
+			// Release bits and notify
+			bits->atomic_op([&](auto& v)
+			{
+				v &= ~wmask;
+			});
+
+			bits->notify_all();
 
 			if (size == size0)
 			{
@@ -1785,10 +1903,12 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
 			}
 		}
 
-		std::atomic_thread_fence(std::memory_order_seq_cst);
+		//std::atomic_thread_fence(std::memory_order_seq_cst);
 		return;
 	}
 
+	perf_meter<"DMA_PUT"_u64> perf2;
+
 	switch (u32 size = args.size)
 	{
 	case 1:
@@ -2587,7 +2707,7 @@ bool spu_thread::process_mfc_cmd()
 			continue;
 		}
 
-		if (g_use_rtm && i >= 15 && g_cfg.core.perf_report) [[unlikely]]
+		if (i >= 15 && g_cfg.core.perf_report) [[unlikely]]
 		{
 			perf_log.warning("GETLLAR: took too long: %u", i);
 		}
diff --git a/rpcs3/Emu/Memory/vm.cpp b/rpcs3/Emu/Memory/vm.cpp
index a8f3c93e7e..dacaad4456 100644
--- a/rpcs3/Emu/Memory/vm.cpp
+++ b/rpcs3/Emu/Memory/vm.cpp
@@ -510,7 +510,18 @@ namespace vm
 	{
 		for (u64 i = 0;; i++)
 		{
-			if (!(res & rsrv_unique_lock)) [[likely]]
+			auto [_oldd, _ok] = res.fetch_op([&](u64& r)
+			{
+				if (r & rsrv_unique_lock)
+				{
+					return false;
+				}
+
+				r += 1;
+				return true;
+			});
+
+			if (_ok) [[likely]]
 			{
 				return;
 			}
diff --git a/rpcs3/Emu/Memory/vm_reservation.h b/rpcs3/Emu/Memory/vm_reservation.h
index 4c28571f3f..ff1b6579a5 100644
--- a/rpcs3/Emu/Memory/vm_reservation.h
+++ b/rpcs3/Emu/Memory/vm_reservation.h
@@ -87,11 +87,11 @@ namespace vm
 		// Use 128-byte aligned addr
 		const u32 addr = static_cast<u32>(ptr.addr()) & -128;
 
+		auto& res = vm::reservation_acquire(addr, 128);
+		_m_prefetchw(&res);
+
 		if (g_use_rtm)
 		{
-			auto& res = vm::reservation_acquire(addr, 128);
-			_m_prefetchw(&res);
-
 			// Stage 1: single optimistic transaction attempt
 			unsigned status = _XBEGIN_STARTED;
 			unsigned count = 0;
@@ -267,15 +267,15 @@ namespace vm
 			}
 		}
 
-		// Perform heavyweight lock
-		auto [res, rtime] = vm::reservation_lock(addr);
+		// Lock reservation and perform heavyweight lock
+		reservation_shared_lock_internal(res);
 
 		if constexpr (std::is_void_v<std::invoke_result_t<F, T&>>)
 		{
 			{
 				vm::writer_lock lock(addr);
 				std::invoke(op, *sptr);
-				res += 64;
+				res += 127;
 			}
 
 			if constexpr (Ack)
@@ -290,11 +290,11 @@ namespace vm
 
 			if ((result = std::invoke(op, *sptr)))
 			{
-				res += 64;
+				res += 127;
 			}
 			else
 			{
-				res -= 64;
+				res -= 1;
 			}
 		}
 
@@ -314,9 +314,6 @@ namespace vm
 		// Atomic operation will be performed on aligned 128 bytes of data, so the data size and alignment must comply
 		static_assert(sizeof(T) <= 128 && alignof(T) == sizeof(T), "vm::reservation_peek: unsupported type");
 
-		// Use "super" pointer to prevent access violation handling during atomic op
-		const auto sptr = vm::get_super_ptr<T>(static_cast<u32>(ptr.addr()));
-
 		// Use 128-byte aligned addr
 		const u32 addr = static_cast<u32>(ptr.addr()) & -128;
 
@@ -340,7 +337,7 @@ namespace vm
 			// Observe data non-atomically and make sure no reservation updates were made
 			if constexpr (std::is_void_v<std::invoke_result_t<F, const T&>>)
 			{
-				std::invoke(op, *sptr);
+				std::invoke(op, *ptr);
 
 				if (rtime == vm::reservation_acquire(addr, 128))
 				{
@@ -349,7 +346,7 @@ namespace vm
 			}
 			else
 			{
-				auto res = std::invoke(op, *sptr);
+				auto res = std::invoke(op, *ptr);
 
 				if (rtime == vm::reservation_acquire(addr, 128))
 				{
@@ -371,7 +368,18 @@ namespace vm
 
 		// "Lock" reservation
 		auto& res = vm::reservation_acquire(addr, 128);
-		if (res.fetch_add(1) & vm::rsrv_unique_lock) [[unlikely]]
+		auto [_old, _ok] = res.fetch_op([&](u64& r)
+		{
+			if (r & vm::rsrv_unique_lock)
+			{
+				return false;
+			}
+
+			r += 1;
+			return true;
+		});
+
+		if (!_ok) [[unlikely]]
 		{
 			vm::reservation_shared_lock_internal(res);
 		}
diff --git a/rpcs3/util/atomic.hpp b/rpcs3/util/atomic.hpp
index a134716f15..f885b1e190 100644
--- a/rpcs3/util/atomic.hpp
+++ b/rpcs3/util/atomic.hpp
@@ -639,6 +639,13 @@ template <typename T>
 struct atomic_storage<T, 16> : atomic_storage<T, 0>
 {
 #ifdef _MSC_VER
+	static inline T load(const T& dest)
+	{
+		__m128i val = _mm_load_si128(reinterpret_cast<const __m128i*>(&dest));
+		std::atomic_thread_fence(std::memory_order_acquire);
+		return std::bit_cast<T>(val);
+	}
+
 	static inline bool compare_exchange(T& dest, T& comp, T exch)
 	{
 		struct alignas(16) llong2 { llong ll[2]; };
@@ -646,13 +653,6 @@ struct atomic_storage<T, 16> : atomic_storage<T, 0>
 		return _InterlockedCompareExchange128(reinterpret_cast<volatile llong*>(&dest), _exch.ll[1], _exch.ll[0], reinterpret_cast<llong*>(&comp)) != 0;
 	}
 
-	static inline T load(const T& dest)
-	{
-		struct alignas(16) llong2 { llong ll[2]; } result{};
-		_InterlockedCompareExchange128(reinterpret_cast<volatile llong*>(&const_cast<T&>(dest)), result.ll[1], result.ll[0], result.ll);
-		return std::bit_cast<T>(result);
-	}
-
 	static inline T exchange(T& dest, T value)
 	{
 		struct alignas(16) llong2 { llong ll[2]; };
@@ -670,9 +670,78 @@ struct atomic_storage<T, 16> : atomic_storage<T, 0>
 	}
 
 	static inline void release(T& dest, T value)
+	{
+		std::atomic_thread_fence(std::memory_order_release);
+		_mm_store_si128(reinterpret_cast<__m128i*>(&dest), std::bit_cast<__m128i>(value));
+	}
+#else
+	static inline T load(const T& dest)
+	{
+		__m128i val = _mm_load_si128(reinterpret_cast<const __m128i*>(&dest));
+		__atomic_thread_fence(__ATOMIC_ACQUIRE);
+		return std::bit_cast<T>(val);
+	}
+
+	static inline bool compare_exchange(T& dest, T& comp, T exch)
+	{
+		bool result;
+		ullong cmp_lo = 0;
+		ullong cmp_hi = 0;
+		ullong exc_lo = 0;
+		ullong exc_hi = 0;
+
+		if constexpr (std::is_same_v<T, u128> || std::is_same_v<T, s128>)
+		{
+			cmp_lo = comp;
+			cmp_hi = comp >> 64;
+			exc_lo = exch;
+			exc_hi = exch >> 64;
+		}
+		else
+		{
+			std::memcpy(&cmp_lo, reinterpret_cast<char*>(&comp) + 0, 8);
+			std::memcpy(&cmp_hi, reinterpret_cast<char*>(&comp) + 8, 8);
+			std::memcpy(&exc_lo, reinterpret_cast<char*>(&exch) + 0, 8);
+			std::memcpy(&exc_hi, reinterpret_cast<char*>(&exch) + 8, 8);
+		}
+
+		__asm__ volatile("lock cmpxchg16b %1;"
+			: "=@ccz" (result)
+			, "+m" (dest)
+			, "+d" (cmp_hi)
+			, "+a" (cmp_lo)
+			: "c" (exc_hi)
+			, "b" (exc_lo)
+			: "cc");
+
+		if constexpr (std::is_same_v<T, u128> || std::is_same_v<T, s128>)
+		{
+			comp = T{cmp_hi} << 64 | cmp_lo;
+		}
+		else
+		{
+			std::memcpy(reinterpret_cast<char*>(&comp) + 0, &cmp_lo, 8);
+			std::memcpy(reinterpret_cast<char*>(&comp) + 8, &cmp_hi, 8);
+		}
+
+		return result;
+	}
+
+	static inline T exchange(T& dest, T value)
+	{
+		return std::bit_cast<T>(__sync_lock_test_and_set(reinterpret_cast<u128*>(&dest), std::bit_cast<u128>(value)));
+	}
+
+	static inline void store(T& dest, T value)
 	{
 		exchange(dest, value);
 	}
+
+	static inline void release(T& dest, T value)
+	{
+		__atomic_thread_fence(__ATOMIC_RELEASE);
+		_mm_store_si128(reinterpret_cast<__m128i*>(&dest), std::bit_cast<__m128i>(value));
+	}
 #endif
 
 	// TODO
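
For reference, a minimal standalone sketch of the per-byte write mask computed by the accurate-DMA PUT path above (the wmask expression in SPUThread.cpp): one bit per byte of the 128-byte reservation line, covering the bytes the transfer is about to store. This is an illustration only; it substitutes GCC/Clang unsigned __int128 for rpcs3's u128, and the helper name byte_mask is made up for the example.

#include <cassert>
#include <cstdint>

// One bit per byte of a 128-byte reservation line.
using mask128 = unsigned __int128;

// Bits [eal & 127, (eal + size - 1) & 127] set, everything else clear.
// Mirrors: (~u128{} << (eal & 127)) & (~u128{} >> (127 - ((eal + size - 1) & 127)))
// Precondition (as in the DMA loop): 1 <= size and the range does not cross a 128-byte line.
static mask128 byte_mask(uint32_t eal, uint32_t size)
{
    const mask128 all = ~mask128{0};
    const uint32_t first = eal & 127;
    const uint32_t last  = (eal + size - 1) & 127;
    return (all << first) & (all >> (127 - last));
}

int main()
{
    // A 16-byte transfer at offset 32 within the line marks bytes 32..47.
    const mask128 m = byte_mask(0x10020, 16);

    for (uint32_t i = 0; i < 128; i++)
    {
        const bool set = (m >> i) & 1;
        assert(set == (i >= 32 && i < 48));
    }

    return 0;
}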
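The same optimistic shared-lock pattern appears several times above (SPUThread.cpp, vm.cpp, vm_reservation.h): increment the reservation word only while the unique-lock bit is clear, otherwise fall back to a slow path. Below is a minimal sketch of that idea, assuming a 64-bit reservation word whose bit 6 acts as the unique-lock flag; the names try_shared_lock and unique_lock_bit and the std::atomic/yield fallback are illustrative, not the rpcs3 API (which uses atomic_t<u64>::fetch_op and vm::reservation_shared_lock_internal).

#include <atomic>
#include <cstdint>
#include <thread>

// Assumed layout for this sketch: one "unique lock" flag bit,
// lower bits used as a shared-lock counter.
constexpr uint64_t unique_lock_bit = 1ull << 6;

// Try to take the reservation in shared mode: increment the counter
// unless a unique (exclusive) lock is currently held.
bool try_shared_lock(std::atomic<uint64_t>& res)
{
    uint64_t old = res.load(std::memory_order_relaxed);

    while (!(old & unique_lock_bit))
    {
        // Equivalent of atomic_t::fetch_op returning {old, true} on success.
        if (res.compare_exchange_weak(old, old + 1, std::memory_order_acquire))
        {
            return true;
        }
    }

    return false; // caller falls back to a slow path while the unique lock is held
}

void shared_lock(std::atomic<uint64_t>& res)
{
    while (!try_shared_lock(res))
    {
        std::this_thread::yield(); // stand-in for the waiting done in the slow path
    }
}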
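The non-MSVC compare_exchange added above hand-codes lock cmpxchg16b. For comparison, here is a sketch of the same 16-byte CAS written with the GCC/Clang __atomic builtin; the function name cas128 is invented for the example. GCC typically routes 16-byte __atomic operations through libatomic rather than inlining the instruction, which is presumably why the patch uses inline assembly and can drop the -latomic link flag in CMakeLists.txt.

#include <cstdint>

using u128_t = unsigned __int128;

// 16-byte compare-and-swap via the __atomic builtin (requires x86-64;
// may call into libatomic depending on compiler and -mcx16).
bool cas128(u128_t& dest, u128_t& expected, u128_t desired)
{
    return __atomic_compare_exchange_n(&dest, &expected, desired,
                                       false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
}

int main()
{
    u128_t value = 1;
    u128_t expected = 1;

    // Succeeds: value becomes 2, expected is left unchanged.
    const bool ok = cas128(value, expected, 2);

    // Fails: expected is updated to the current value (2).
    const bool ok2 = cas128(value, expected, 3);

    return (ok && !ok2 && expected == 2) ? 0 : 1;
}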