From b16cc618b524f35fa60bf496fe01892f76c238b6 Mon Sep 17 00:00:00 2001
From: Nekotekina
Date: Sun, 6 Dec 2020 12:10:00 +0300
Subject: [PATCH] atomic.hpp: add some features and optimizations

Add atomic_t<>::observe() (relaxed load)
Add atomic_fence_XXX() (barrier functions)
Get rid of the MFENCE instruction, replace it with a no-op LOCK OR on the stack.
Remove dependency on stdafx.h and related headers.
---
 rpcs3/Emu/Cell/Modules/cellSpurs.cpp    |   2 +-
 rpcs3/Emu/Cell/Modules/cellSync.cpp     |  14 +--
 rpcs3/Emu/Cell/Modules/sys_lwmutex_.cpp |   4 +-
 rpcs3/Emu/Cell/PPUInterpreter.cpp       |   6 +-
 rpcs3/Emu/Cell/PPUThread.cpp            |   4 +-
 rpcs3/Emu/Cell/RawSPUThread.cpp         |   2 +-
 rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp  |   4 +-
 rpcs3/Emu/Cell/SPUInterpreter.cpp       |   4 +-
 rpcs3/Emu/Cell/SPUThread.cpp            |  10 +-
 rpcs3/Emu/RSX/Capture/rsx_replay.cpp    |   2 +-
 rpcs3/util/atomic.cpp                   |   2 +-
 rpcs3/util/atomic.hpp                   | 157 ++++++++++++++++++++----
 rpcs3/util/logs.cpp                     |  12 +-
 rpcs3/util/logs.hpp                     |   8 +-
 14 files changed, 171 insertions(+), 60 deletions(-)

diff --git a/rpcs3/Emu/Cell/Modules/cellSpurs.cpp b/rpcs3/Emu/Cell/Modules/cellSpurs.cpp
index d94e893919..ec76e0a477 100644
--- a/rpcs3/Emu/Cell/Modules/cellSpurs.cpp
+++ b/rpcs3/Emu/Cell/Modules/cellSpurs.cpp
@@ -3029,7 +3029,7 @@ s32 _cellSpursWorkloadFlagReceiver(ppu_thread& ppu, vm::ptr<CellSpurs> spurs, u3
 		return CELL_SPURS_POLICY_MODULE_ERROR_STAT;
 	}
 
-	std::atomic_thread_fence(std::memory_order_acq_rel);
+	atomic_fence_acq_rel();
 
 	struct alignas(128) wklFlagOp
 	{
diff --git a/rpcs3/Emu/Cell/Modules/cellSync.cpp b/rpcs3/Emu/Cell/Modules/cellSync.cpp
index 04ea5418e7..132e473e7b 100644
--- a/rpcs3/Emu/Cell/Modules/cellSync.cpp
+++ b/rpcs3/Emu/Cell/Modules/cellSync.cpp
@@ -85,7 +85,7 @@ error_code cellSyncMutexLock(ppu_thread& ppu, vm::ptr<CellSyncMutex> mutex)
 		}
 	}
 
-	std::atomic_thread_fence(std::memory_order_acq_rel);
+	atomic_fence_acq_rel();
 
 	return CELL_OK;
 }
@@ -194,7 +194,7 @@ error_code cellSyncBarrierTryNotify(vm::ptr<CellSyncBarrier> barrier)
 		return CELL_SYNC_ERROR_ALIGN;
 	}
 
-	std::atomic_thread_fence(std::memory_order_acq_rel);
+	atomic_fence_acq_rel();
 
 	if (!barrier->ctrl.atomic_op(&CellSyncBarrier::try_notify))
 	{
@@ -218,7 +218,7 @@ error_code cellSyncBarrierWait(ppu_thread& ppu, vm::ptr<CellSyncBarrier> barrier
 		return CELL_SYNC_ERROR_ALIGN;
 	}
 
-	std::atomic_thread_fence(std::memory_order_acq_rel);
+	atomic_fence_acq_rel();
 
 	while (!barrier->ctrl.atomic_op(&CellSyncBarrier::try_wait))
 	{
@@ -245,7 +245,7 @@ error_code cellSyncBarrierTryWait(vm::ptr<CellSyncBarrier> barrier)
 		return CELL_SYNC_ERROR_ALIGN;
 	}
 
-	std::atomic_thread_fence(std::memory_order_acq_rel);
+	atomic_fence_acq_rel();
 
 	if (!barrier->ctrl.atomic_op(&CellSyncBarrier::try_wait))
 	{
@@ -279,7 +279,7 @@ error_code cellSyncRwmInitialize(vm::ptr<CellSyncRwm> rwm, vm::ptr<void> buffer,
 	rwm->size = buffer_size;
 	rwm->buffer = buffer;
 
-	std::atomic_thread_fence(std::memory_order_acq_rel);
+	atomic_fence_acq_rel();
 
 	return CELL_OK;
 }
@@ -451,7 +451,7 @@ error_code cellSyncQueueInitialize(vm::ptr<CellSyncQueue> queue, vm::ptr buf
 	queue->depth = depth;
 	queue->buffer = buffer;
 
-	std::atomic_thread_fence(std::memory_order_acq_rel);
+	atomic_fence_acq_rel();
 
 	return CELL_OK;
 }
@@ -863,7 +863,7 @@ error_code cellSyncLFQueueInitialize(vm::ptr<CellSyncLFQueue> queue, vm::cptr lwmutex, u64
 	// recursive locking succeeded
 	lwmutex->recursive_count++;
 
-	std::atomic_thread_fence(std::memory_order_acq_rel);
+	atomic_fence_acq_rel();
 
 	return CELL_OK;
 }
@@ -288,7 +288,7 @@ error_code sys_lwmutex_trylock(ppu_thread& ppu, vm::ptr<sys_lwmutex_t> lwmutex)
 	// recursive locking succeeded
 	lwmutex->recursive_count++;
 
-	std::atomic_thread_fence(std::memory_order_acq_rel);
+	atomic_fence_acq_rel();
 
 	return CELL_OK;
 }
diff --git a/rpcs3/Emu/Cell/PPUInterpreter.cpp b/rpcs3/Emu/Cell/PPUInterpreter.cpp
index 0e33f2370d..72ef7d48e4 100644
--- a/rpcs3/Emu/Cell/PPUInterpreter.cpp
+++ b/rpcs3/Emu/Cell/PPUInterpreter.cpp
@@ -3156,7 +3156,7 @@ bool ppu_interpreter::CRANDC(ppu_thread& ppu, ppu_opcode_t op)
 
 bool ppu_interpreter::ISYNC(ppu_thread& ppu, ppu_opcode_t op)
 {
-	std::atomic_thread_fence(std::memory_order_acquire);
+	atomic_fence_acquire();
 	return true;
 }
 
@@ -4222,7 +4222,7 @@ bool ppu_interpreter::LFSUX(ppu_thread& ppu, ppu_opcode_t op)
 
 bool ppu_interpreter::SYNC(ppu_thread& ppu, ppu_opcode_t op)
 {
-	std::atomic_thread_fence(std::memory_order_seq_cst);
+	atomic_fence_seq_cst();
 	return true;
 }
 
@@ -4432,7 +4432,7 @@ bool ppu_interpreter::SRADI(ppu_thread& ppu, ppu_opcode_t op)
 
 bool ppu_interpreter::EIEIO(ppu_thread& ppu, ppu_opcode_t op)
 {
-	std::atomic_thread_fence(std::memory_order_seq_cst);
+	atomic_fence_seq_cst();
 	return true;
 }
 
diff --git a/rpcs3/Emu/Cell/PPUThread.cpp b/rpcs3/Emu/Cell/PPUThread.cpp
index cec21f4fc8..9889ffdff0 100644
--- a/rpcs3/Emu/Cell/PPUThread.cpp
+++ b/rpcs3/Emu/Cell/PPUThread.cpp
@@ -1208,7 +1208,7 @@ static T ppu_load_acquire_reservation(ppu_thread& ppu, u32 addr)
 	perf_meter<"LARX"_u32> perf0;
 
 	// Do not allow stores accessed from the same cache line to move past the reservation load
-	std::atomic_thread_fence(std::memory_order_seq_cst);
+	atomic_fence_seq_cst();
 
 	if (addr % sizeof(T))
 	{
@@ -1322,7 +1322,7 @@ static T ppu_load_acquire_reservation(ppu_thread& ppu, u32 addr)
 	else
 	{
 		mov_rdata(ppu.rdata, vm::_ref(addr & -128));
-		std::atomic_thread_fence(std::memory_order_acquire);
+		atomic_fence_acquire();
 
 		// Load relevant 64 bits of reservation data
 		std::memcpy(&rdata, &ppu.rdata[addr & 0x78], 8);
diff --git a/rpcs3/Emu/Cell/RawSPUThread.cpp b/rpcs3/Emu/Cell/RawSPUThread.cpp
index c5d3594a3f..148e022477 100644
--- a/rpcs3/Emu/Cell/RawSPUThread.cpp
+++ b/rpcs3/Emu/Cell/RawSPUThread.cpp
@@ -105,7 +105,7 @@ bool spu_thread::read_reg(const u32 addr, u32& value)
 	case MFC_EIEIO_CMD:
 	case MFC_SYNC_CMD:
 	{
-		std::atomic_thread_fence(std::memory_order_seq_cst);
+		atomic_fence_seq_cst();
 		value = MFC_PPU_DMA_CMD_ENQUEUE_SUCCESSFUL;
 		return true;
 	}
diff --git a/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp b/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp
index a47d996b9b..973170e1a4 100644
--- a/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp
+++ b/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp
@@ -1310,7 +1310,7 @@ void spu_recompiler::LNOP(spu_opcode_t op)
 void spu_recompiler::SYNC(spu_opcode_t op)
 {
 	// This instruction must be used following a store instruction that modifies the instruction stream.
-	c->mfence();
+	c->lock().or_(asmjit::x86::dword_ptr(asmjit::x86::rsp), 0);
 
 	if (g_cfg.core.spu_block_size == spu_block_size_type::safe)
 	{
@@ -1325,7 +1325,7 @@ void spu_recompiler::SYNC(spu_opcode_t op)
 void spu_recompiler::DSYNC(spu_opcode_t op)
 {
 	// This instruction forces all earlier load, store, and channel instructions to complete before proceeding.
-	c->mfence();
+	c->lock().or_(asmjit::x86::dword_ptr(asmjit::x86::rsp), 0);
 }
 
 void spu_recompiler::MFSPR(spu_opcode_t op)
diff --git a/rpcs3/Emu/Cell/SPUInterpreter.cpp b/rpcs3/Emu/Cell/SPUInterpreter.cpp
index 455288f39a..e86b711bce 100644
--- a/rpcs3/Emu/Cell/SPUInterpreter.cpp
+++ b/rpcs3/Emu/Cell/SPUInterpreter.cpp
@@ -141,14 +141,14 @@ bool spu_interpreter::LNOP(spu_thread& spu, spu_opcode_t op)
 
 // This instruction must be used following a store instruction that modifies the instruction stream.
 bool spu_interpreter::SYNC(spu_thread& spu, spu_opcode_t op)
 {
-	std::atomic_thread_fence(std::memory_order_seq_cst);
+	atomic_fence_seq_cst();
 	return true;
 }
 
 // This instruction forces all earlier load, store, and channel instructions to complete before proceeding.
 bool spu_interpreter::DSYNC(spu_thread& spu, spu_opcode_t op)
 {
-	std::atomic_thread_fence(std::memory_order_seq_cst);
+	atomic_fence_seq_cst();
 	return true;
 }
 
diff --git a/rpcs3/Emu/Cell/SPUThread.cpp b/rpcs3/Emu/Cell/SPUThread.cpp
index 37da1d2297..f98adf7d46 100644
--- a/rpcs3/Emu/Cell/SPUThread.cpp
+++ b/rpcs3/Emu/Cell/SPUThread.cpp
@@ -2277,7 +2277,7 @@ void spu_thread::do_dma_transfer(spu_thread* _this, const spu_mfc_cmd& args, u8*
 			}
 		}
 
-		//std::atomic_thread_fence(std::memory_order_seq_cst);
+		//atomic_fence_seq_cst();
 		return;
 	}
 	else
@@ -2904,7 +2904,7 @@ void spu_thread::do_mfc(bool wait)
 		if (&args - mfc_queue <= removed)
 		{
 			// Remove barrier-class command if it's the first in the queue
-			std::atomic_thread_fence(std::memory_order_seq_cst);
+			atomic_fence_seq_cst();
 			removed++;
 			return true;
 		}
@@ -3130,7 +3130,7 @@ bool spu_thread::process_mfc_cmd()
 				// Exit loop
 				if (ok && (ntime & 127) == 0)
 				{
-					_mm_mfence();
+					atomic_fence_seq_cst();
 					i = -1;
 					return;
 				}
@@ -3352,7 +3352,7 @@ bool spu_thread::process_mfc_cmd()
 	{
 		if (mfc_size == 0)
 		{
-			std::atomic_thread_fence(std::memory_order_seq_cst);
+			atomic_fence_seq_cst();
 		}
 		else
 		{
@@ -4420,7 +4420,7 @@ bool spu_thread::stop_and_signal(u32 code)
 			fmt::throw_exception("STOP code 0x100: Out_MBox is not empty" HERE);
 		}
 
-		std::atomic_thread_fence(std::memory_order_seq_cst);
+		atomic_fence_seq_cst();
 		return true;
 	}
 
diff --git a/rpcs3/Emu/RSX/Capture/rsx_replay.cpp b/rpcs3/Emu/RSX/Capture/rsx_replay.cpp
index a5816cc4d5..e9fa572a6e 100644
--- a/rpcs3/Emu/RSX/Capture/rsx_replay.cpp
+++ b/rpcs3/Emu/RSX/Capture/rsx_replay.cpp
@@ -178,7 +178,7 @@ namespace rsx
 	{
 		// Load registers while the RSX is still idle
 		method_registers = frame->reg_state;
-		std::atomic_thread_fence(std::memory_order_seq_cst);
+		atomic_fence_seq_cst();
 
 		// start up fifo buffer by dumping the put ptr to first stop
 		sys_rsx_context_attribute(context_id, 0x001, 0x10000000, fifo_stops[0], 0, 0);
diff --git a/rpcs3/util/atomic.cpp b/rpcs3/util/atomic.cpp
index d8cf83424a..f3f545299a 100644
--- a/rpcs3/util/atomic.cpp
+++ b/rpcs3/util/atomic.cpp
@@ -1146,7 +1146,7 @@ atomic_wait_engine::wait(const void* data, u32 size, __m128i old_value, u64 time
 	std::unique_lock lock(*cond->mtx.get());
#else
 	if (ext_size)
-		_mm_mfence();
+		atomic_fence_seq_cst();
#endif
 
 	// Can skip unqueue process if true
diff --git a/rpcs3/util/atomic.hpp b/rpcs3/util/atomic.hpp
index 4b5b5b83a4..d412f60498 100644
--- a/rpcs3/util/atomic.hpp
+++ b/rpcs3/util/atomic.hpp
@@ -5,7 +5,59 @@
 #include 
 
 #ifdef _MSC_VER
-#include 
+#pragma warning(push)
+#pragma warning(disable: 4996)
+#endif
+
+FORCE_INLINE void atomic_fence_consume()
+{
+#ifdef _MSC_VER
+	_ReadWriteBarrier();
+#else
+	__atomic_thread_fence(__ATOMIC_CONSUME);
+#endif
+}
+
+FORCE_INLINE void atomic_fence_acquire()
+{
+#ifdef _MSC_VER
+	_ReadWriteBarrier();
+#else
+	__atomic_thread_fence(__ATOMIC_ACQUIRE);
+#endif
+}
+
+FORCE_INLINE void atomic_fence_release()
+{
+#ifdef _MSC_VER
+	_ReadWriteBarrier();
+#else
+	__atomic_thread_fence(__ATOMIC_RELEASE);
+#endif
+}
+
+FORCE_INLINE void atomic_fence_acq_rel()
+{
+#ifdef _MSC_VER
+	_ReadWriteBarrier();
+#else
+	__atomic_thread_fence(__ATOMIC_ACQ_REL);
+#endif
+}
+
+FORCE_INLINE void atomic_fence_seq_cst()
+{
+#ifdef _MSC_VER
+	_ReadWriteBarrier();
+	_InterlockedOr(static_cast<long*>(_AddressOfReturnAddress()), 0);
+	_ReadWriteBarrier();
+#else
+	__asm__ volatile ("lock orl $0, 0(%%rsp);" ::: "cc", "memory");
+#endif
+}
+
+#ifdef _MSC_VER
+#pragma warning(pop)
 #endif
 
 // Wait timeout extension (in nanoseconds)
@@ -286,6 +338,13 @@ struct atomic_storage
 		return result;
 	}
 
+	static inline T observe(const T& dest)
+	{
+		T result;
+		__atomic_load(reinterpret_cast(&dest), reinterpret_cast(&result), __ATOMIC_RELAXED);
+		return result;
+	}
+
 	static inline void store(T& dest, T value)
 	{
 		static_cast<void>(exchange(dest, value));
@@ -506,17 +565,23 @@ struct atomic_storage<T, 1> : atomic_storage<T, 0>
 	static inline T load(const T& dest)
 	{
-		std::atomic_thread_fence(std::memory_order_acquire);
+		atomic_fence_acquire();
+		const char value = *reinterpret_cast<const volatile char*>(&dest);
+		atomic_fence_acquire();
+		return std::bit_cast<T>(value);
+	}
+
+	static inline T observe(const T& dest)
+	{
 		const char value = *reinterpret_cast<const volatile char*>(&dest);
-		std::atomic_thread_fence(std::memory_order_acquire);
 		return std::bit_cast<T>(value);
 	}
 
 	static inline void release(T& dest, T value)
 	{
-		std::atomic_thread_fence(std::memory_order_release);
+		atomic_fence_release();
 		*reinterpret_cast<volatile char*>(&dest) = std::bit_cast<char>(value);
-		std::atomic_thread_fence(std::memory_order_release);
+		atomic_fence_release();
 	}
 
 	static inline T exchange(T& dest, T value)
@@ -570,17 +635,23 @@ struct atomic_storage<T, 2> : atomic_storage<T, 0>
 	static inline T load(const T& dest)
 	{
-		std::atomic_thread_fence(std::memory_order_acquire);
+		atomic_fence_acquire();
+		const short value = *reinterpret_cast<const volatile short*>(&dest);
+		atomic_fence_acquire();
+		return std::bit_cast<T>(value);
+	}
+
+	static inline T observe(const T& dest)
+	{
 		const short value = *reinterpret_cast<const volatile short*>(&dest);
-		std::atomic_thread_fence(std::memory_order_acquire);
 		return std::bit_cast<T>(value);
 	}
 
 	static inline void release(T& dest, T value)
 	{
-		std::atomic_thread_fence(std::memory_order_release);
+		atomic_fence_release();
 		*reinterpret_cast<volatile short*>(&dest) = std::bit_cast<short>(value);
-		std::atomic_thread_fence(std::memory_order_release);
+		atomic_fence_release();
 	}
 
 	static inline T exchange(T& dest, T value)
@@ -654,17 +725,23 @@ struct atomic_storage<T, 4> : atomic_storage<T, 0>
 	static inline T load(const T& dest)
 	{
-		std::atomic_thread_fence(std::memory_order_acquire);
+		atomic_fence_acquire();
+		const long value = *reinterpret_cast<const volatile long*>(&dest);
+		atomic_fence_acquire();
+		return std::bit_cast<T>(value);
+	}
+
+	static inline T observe(const T& dest)
+	{
 		const long value = *reinterpret_cast<const volatile long*>(&dest);
-		std::atomic_thread_fence(std::memory_order_acquire);
 		return std::bit_cast<T>(value);
 	}
 
 	static inline void release(T& dest, T value)
 	{
-		std::atomic_thread_fence(std::memory_order_release);
+		atomic_fence_release();
 		*reinterpret_cast<volatile long*>(&dest) = std::bit_cast<long>(value);
-		std::atomic_thread_fence(std::memory_order_release);
+		atomic_fence_release();
 	}
 
 	static inline T exchange(T& dest, T value)
@@ -744,17 +821,23 @@ struct atomic_storage<T, 8> : atomic_storage<T, 0>
 	static inline T load(const T& dest)
 	{
-		std::atomic_thread_fence(std::memory_order_acquire);
+		atomic_fence_acquire();
+		const llong value = *reinterpret_cast<const volatile llong*>(&dest);
+		atomic_fence_acquire();
+		return std::bit_cast<T>(value);
+	}
+
+	static inline T observe(const T& dest)
+	{
 		const llong value = *reinterpret_cast<const volatile llong*>(&dest);
-		std::atomic_thread_fence(std::memory_order_acquire);
 		return std::bit_cast<T>(value);
 	}
 
 	static inline void release(T& dest, T value)
 	{
-		std::atomic_thread_fence(std::memory_order_release);
+		atomic_fence_release();
 		*reinterpret_cast<volatile llong*>(&dest) = std::bit_cast<llong>(value);
-		std::atomic_thread_fence(std::memory_order_release);
+		atomic_fence_release();
 	}
 
 	static inline T exchange(T& dest, T value)
 	{
@@ -818,9 +901,18 @@ struct atomic_storage<T, 16> : atomic_storage<T, 0>
 #ifdef _MSC_VER
 	static inline T load(const T& dest)
 	{
-		std::atomic_thread_fence(std::memory_order_acquire);
+		atomic_fence_acquire();
 		__m128i val = _mm_load_si128(reinterpret_cast<const __m128i*>(&dest));
-		std::atomic_thread_fence(std::memory_order_acquire);
+		atomic_fence_acquire();
+		return std::bit_cast<T>(val);
+	}
+
+	static inline T observe(const T& dest)
+	{
+		// Barriers are kept intentionally
+		atomic_fence_acquire();
+		__m128i val = _mm_load_si128(reinterpret_cast<const __m128i*>(&dest));
+		atomic_fence_acquire();
 		return std::bit_cast<T>(val);
 	}
 
@@ -844,14 +936,16 @@ struct atomic_storage<T, 16> : atomic_storage<T, 0>
 	static inline void store(T& dest, T value)
 	{
-		exchange(dest, value);
+		atomic_fence_acq_rel();
+		_mm_store_si128(reinterpret_cast<__m128i*>(&dest), std::bit_cast<__m128i>(value));
+		atomic_fence_seq_cst();
 	}
 
 	static inline void release(T& dest, T value)
 	{
-		std::atomic_thread_fence(std::memory_order_release);
+		atomic_fence_release();
 		_mm_store_si128(reinterpret_cast<__m128i*>(&dest), std::bit_cast<__m128i>(value));
-		std::atomic_thread_fence(std::memory_order_release);
+		atomic_fence_release();
 	}
 #else
 	static inline T load(const T& dest)
 	{
@@ -862,6 +956,15 @@ struct atomic_storage<T, 16> : atomic_storage<T, 0>
 		return std::bit_cast<T>(val);
 	}
 
+	static inline T observe(const T& dest)
+	{
+		// Barriers are kept intentionally
+		__atomic_thread_fence(__ATOMIC_ACQUIRE);
+		__m128i val = _mm_load_si128(reinterpret_cast<const __m128i*>(&dest));
+		__atomic_thread_fence(__ATOMIC_ACQUIRE);
+		return std::bit_cast<T>(val);
+	}
+
 	static inline bool compare_exchange(T& dest, T& comp, T exch)
 	{
 		bool result;
@@ -915,7 +1018,9 @@ struct atomic_storage<T, 16> : atomic_storage<T, 0>
 	static inline void store(T& dest, T value)
 	{
-		exchange(dest, value);
+		__atomic_thread_fence(__ATOMIC_ACQ_REL);
+		_mm_store_si128(reinterpret_cast<__m128i*>(&dest), std::bit_cast<__m128i>(value));
+		atomic_fence_seq_cst();
 	}
 
 	static inline void release(T& dest, T value)
 	{
@@ -1075,6 +1180,12 @@ public:
 		return atomic_storage<type>::load(m_data);
 	}
 
+	// Relaxed load
+	type observe() const
+	{
+		return atomic_storage<type>::observe(m_data);
+	}
+
 	// Atomically write data
 	void store(const type& rhs)
 	{
diff --git a/rpcs3/util/logs.cpp b/rpcs3/util/logs.cpp
index f46c432b45..2173776ead 100644
--- a/rpcs3/util/logs.cpp
+++ b/rpcs3/util/logs.cpp
@@ -168,7 +168,7 @@ namespace logs
 		for (auto&& pair : get_logger()->channels)
 		{
-			pair.second->enabled.store(level::notice, std::memory_order_relaxed);
+			pair.second->enabled.release(level::notice);
 		}
 	}
 
@@ -178,7 +178,7 @@ namespace logs
 		for (auto&& pair : get_logger()->channels)
 		{
-			pair.second->enabled.store(level::always, std::memory_order_relaxed);
+			pair.second->enabled.release(level::always);
 		}
 	}
 
@@ -190,7 +190,7 @@ namespace logs
 		while (found.first != found.second)
 		{
-			found.first->second->enabled.store(value, std::memory_order_relaxed);
+			found.first->second->enabled.release(value);
 			found.first++;
 		}
 	}
@@ -203,7 +203,7 @@ namespace logs
 		if (found.first != found.second)
 		{
-			return found.first->second->enabled.load(std::memory_order_relaxed);
+			return found.first->second->enabled.observe();
 		}
 		else
 		{
@@ -275,7 +275,7 @@ logs::listener::~listener()
 		for (auto&& pair : logger->channels)
 		{
-			pair.second->enabled.store(level::always, std::memory_order_relaxed);
+			pair.second->enabled.release(level::always);
 		}
 	}
 }
@@ -290,7 +290,7 @@ void logs::listener::add(logs::listener* _new)
 	// Install new listener at the end of linked list
 	listener* null = nullptr;
 
-	while (lis->m_next || !lis->m_next.compare_exchange_strong(null, _new))
+	while (lis->m_next || !lis->m_next.compare_exchange(null, _new))
 	{
 		lis = lis->m_next;
 		null = nullptr;
 	}
diff --git a/rpcs3/util/logs.hpp b/rpcs3/util/logs.hpp
index cd6891aa59..e62f792fac 100644
--- a/rpcs3/util/logs.hpp
+++ b/rpcs3/util/logs.hpp
@@ -1,12 +1,12 @@
 #pragma once
 // No BOM and only basic ASCII in this header, or a neko will die
 #include 
-#include <atomic>
 #include 
 #include 
 #include 
 #include 
 #include 
+#include "util/atomic.hpp"
 #include "Utilities/StrFmt.h"
 
 namespace logs
@@ -51,7 +51,7 @@ namespace logs
 	class listener
 	{
 		// Next listener (linked list)
-		std::atomic<listener*> m_next{};
+		atomic_t<listener*> m_next{};
 
 		friend struct message;
 
@@ -76,7 +76,7 @@ namespace logs
 		const char* const name;
 
 		// The lowest logging level enabled for this channel (used for early filtering)
-		std::atomic<level> enabled;
+		atomic_t<level> enabled;
 
 		// Initialize channel
 		constexpr channel(const char* name) noexcept
@@ -90,7 +90,7 @@ namespace logs
 	template \
 	void _sev(const CharT(&fmt)[N], const Args&... args)\
 	{\
-		if (level::_sev <= enabled.load(std::memory_order_relaxed)) [[unlikely]]\
+		if (level::_sev <= enabled.observe()) [[unlikely]]\
 		{\
 			if constexpr (sizeof...(Args) > 0)\
 			{\