atomic.hpp: add some features and optimizations

Add atomic_t<>::observe() (relaxed load)
Add atomic_fence_XXX() (barrier functions)
Get rid of the MFENCE instruction, replacing it with a no-op LOCK OR on the stack.
Remove the <atomic> dependency from stdafx.h and relevant headers.
Nekotekina 2020-12-06 12:10:00 +03:00
parent 77aa9e58f2
commit b16cc618b5
14 changed files with 171 additions and 60 deletions
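
A minimal usage sketch of the new API (the names come from this commit; the flag-passing scenario is illustrative only and assumes atomic_t<bool> behaves like the integral specializations shown further down). The atomic_fence_consume/acquire/release/acq_rel/seq_cst() helpers replace std::atomic_thread_fence at the call sites below:

#include "util/atomic.hpp"

atomic_t<bool> g_ready{false};

void producer()
{
	// ... write data ...
	g_ready.release(true);        // release store: publishes the data written above
}

void consumer()
{
	while (!g_ready.observe())    // relaxed load, no ordering implied
	{
		// spin
	}
	atomic_fence_acquire();       // order the polling loop before the data reads
	// ... read data ...
}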

View File

@ -3029,7 +3029,7 @@ s32 _cellSpursWorkloadFlagReceiver(ppu_thread& ppu, vm::ptr<CellSpurs> spurs, u3
return CELL_SPURS_POLICY_MODULE_ERROR_STAT;
}
std::atomic_thread_fence(std::memory_order_acq_rel);
atomic_fence_acq_rel();
struct alignas(128) wklFlagOp
{

View File

@ -85,7 +85,7 @@ error_code cellSyncMutexLock(ppu_thread& ppu, vm::ptr<CellSyncMutex> mutex)
}
}
std::atomic_thread_fence(std::memory_order_acq_rel);
atomic_fence_acq_rel();
return CELL_OK;
}
@ -194,7 +194,7 @@ error_code cellSyncBarrierTryNotify(vm::ptr<CellSyncBarrier> barrier)
return CELL_SYNC_ERROR_ALIGN;
}
std::atomic_thread_fence(std::memory_order_acq_rel);
atomic_fence_acq_rel();
if (!barrier->ctrl.atomic_op(&CellSyncBarrier::try_notify))
{
@ -218,7 +218,7 @@ error_code cellSyncBarrierWait(ppu_thread& ppu, vm::ptr<CellSyncBarrier> barrier
return CELL_SYNC_ERROR_ALIGN;
}
std::atomic_thread_fence(std::memory_order_acq_rel);
atomic_fence_acq_rel();
while (!barrier->ctrl.atomic_op(&CellSyncBarrier::try_wait))
{
@ -245,7 +245,7 @@ error_code cellSyncBarrierTryWait(vm::ptr<CellSyncBarrier> barrier)
return CELL_SYNC_ERROR_ALIGN;
}
std::atomic_thread_fence(std::memory_order_acq_rel);
atomic_fence_acq_rel();
if (!barrier->ctrl.atomic_op(&CellSyncBarrier::try_wait))
{
@ -279,7 +279,7 @@ error_code cellSyncRwmInitialize(vm::ptr<CellSyncRwm> rwm, vm::ptr<void> buffer,
rwm->size = buffer_size;
rwm->buffer = buffer;
std::atomic_thread_fence(std::memory_order_acq_rel);
atomic_fence_acq_rel();
return CELL_OK;
}
@ -451,7 +451,7 @@ error_code cellSyncQueueInitialize(vm::ptr<CellSyncQueue> queue, vm::ptr<u8> buf
queue->depth = depth;
queue->buffer = buffer;
std::atomic_thread_fence(std::memory_order_acq_rel);
atomic_fence_acq_rel();
return CELL_OK;
}
@ -863,7 +863,7 @@ error_code cellSyncLFQueueInitialize(vm::ptr<CellSyncLFQueue> queue, vm::cptr<vo
}
}
std::atomic_thread_fence(std::memory_order_acq_rel);
atomic_fence_acq_rel();
}
else
{

View File

@ -128,7 +128,7 @@ error_code sys_lwmutex_lock(ppu_thread& ppu, vm::ptr<sys_lwmutex_t> lwmutex, u64
// recursive locking succeeded
lwmutex->recursive_count++;
std::atomic_thread_fence(std::memory_order_acq_rel);
atomic_fence_acq_rel();
return CELL_OK;
}
@ -288,7 +288,7 @@ error_code sys_lwmutex_trylock(ppu_thread& ppu, vm::ptr<sys_lwmutex_t> lwmutex)
// recursive locking succeeded
lwmutex->recursive_count++;
std::atomic_thread_fence(std::memory_order_acq_rel);
atomic_fence_acq_rel();
return CELL_OK;
}

View File

@ -3156,7 +3156,7 @@ bool ppu_interpreter::CRANDC(ppu_thread& ppu, ppu_opcode_t op)
bool ppu_interpreter::ISYNC(ppu_thread& ppu, ppu_opcode_t op)
{
std::atomic_thread_fence(std::memory_order_acquire);
atomic_fence_acquire();
return true;
}
@ -4222,7 +4222,7 @@ bool ppu_interpreter::LFSUX(ppu_thread& ppu, ppu_opcode_t op)
bool ppu_interpreter::SYNC(ppu_thread& ppu, ppu_opcode_t op)
{
std::atomic_thread_fence(std::memory_order_seq_cst);
atomic_fence_seq_cst();
return true;
}
@ -4432,7 +4432,7 @@ bool ppu_interpreter::SRADI(ppu_thread& ppu, ppu_opcode_t op)
bool ppu_interpreter::EIEIO(ppu_thread& ppu, ppu_opcode_t op)
{
std::atomic_thread_fence(std::memory_order_seq_cst);
atomic_fence_seq_cst();
return true;
}
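
For reference, the interpreter's mapping of PPU ordering instructions onto the new helpers, written out as stand-alone functions (the helper names are hypothetical; the mapping itself is taken from the hunks above):

#include "util/atomic.hpp"

inline void ppu_isync_fence() { atomic_fence_acquire(); } // ISYNC: modeled as an acquire fence
inline void ppu_sync_fence()  { atomic_fence_seq_cst(); } // SYNC (hwsync): full barrier
inline void ppu_eieio_fence() { atomic_fence_seq_cst(); } // EIEIO: conservatively also a full barrier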

View File

@ -1208,7 +1208,7 @@ static T ppu_load_acquire_reservation(ppu_thread& ppu, u32 addr)
perf_meter<"LARX"_u32> perf0;
// Do not allow stores accessed from the same cache line to pass the reservation load
std::atomic_thread_fence(std::memory_order_seq_cst);
atomic_fence_seq_cst();
if (addr % sizeof(T))
{
@ -1322,7 +1322,7 @@ static T ppu_load_acquire_reservation(ppu_thread& ppu, u32 addr)
else
{
mov_rdata(ppu.rdata, vm::_ref<spu_rdata_t>(addr & -128));
std::atomic_thread_fence(std::memory_order_acquire);
atomic_fence_acquire();
// Load relevant 64 bits of reservation data
std::memcpy(&rdata, &ppu.rdata[addr & 0x78], 8);

View File

@ -105,7 +105,7 @@ bool spu_thread::read_reg(const u32 addr, u32& value)
case MFC_EIEIO_CMD:
case MFC_SYNC_CMD:
{
std::atomic_thread_fence(std::memory_order_seq_cst);
atomic_fence_seq_cst();
value = MFC_PPU_DMA_CMD_ENQUEUE_SUCCESSFUL;
return true;
}

View File

@ -1310,7 +1310,7 @@ void spu_recompiler::LNOP(spu_opcode_t op)
void spu_recompiler::SYNC(spu_opcode_t op)
{
// This instruction must be used following a store instruction that modifies the instruction stream.
c->mfence();
c->lock().or_(asmjit::x86::dword_ptr(asmjit::x86::rsp), 0);
if (g_cfg.core.spu_block_size == spu_block_size_type::safe)
{
@ -1325,7 +1325,7 @@ void spu_recompiler::SYNC(spu_opcode_t op)
void spu_recompiler::DSYNC(spu_opcode_t op)
{
// This instruction forces all earlier load, store, and channel instructions to complete before proceeding.
c->mfence();
c->lock().or_(asmjit::x86::dword_ptr(asmjit::x86::rsp), 0);
}
void spu_recompiler::MFSPR(spu_opcode_t op)
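
The MFENCE replacement in SYNC and DSYNC above relies on the fact that any LOCK-prefixed read-modify-write is a full memory barrier on x86, and a LOCK OR of zero into an already-cached stack dword is typically cheaper than MFENCE. A stand-alone sketch of the same idiom (illustrative only; the commit's own helper in util/atomic.hpp pokes the return-address slot instead of a local):

#ifdef _MSC_VER
#include <intrin.h>
#endif

inline void full_fence_via_lock_or()
{
#ifdef _MSC_VER
	long dummy = 0;
	_InterlockedOr(&dummy, 0); // locked RMW: full barrier, value left unchanged
#else
	__asm__ volatile("lock orl $0, 0(%%rsp)" ::: "cc", "memory"); // same trick on the stack top
#endif
}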

View File

@ -141,14 +141,14 @@ bool spu_interpreter::LNOP(spu_thread& spu, spu_opcode_t op)
// This instruction must be used following a store instruction that modifies the instruction stream.
bool spu_interpreter::SYNC(spu_thread& spu, spu_opcode_t op)
{
std::atomic_thread_fence(std::memory_order_seq_cst);
atomic_fence_seq_cst();
return true;
}
// This instruction forces all earlier load, store, and channel instructions to complete before proceeding.
bool spu_interpreter::DSYNC(spu_thread& spu, spu_opcode_t op)
{
std::atomic_thread_fence(std::memory_order_seq_cst);
atomic_fence_seq_cst();
return true;
}

View File

@ -2277,7 +2277,7 @@ void spu_thread::do_dma_transfer(spu_thread* _this, const spu_mfc_cmd& args, u8*
}
}
//std::atomic_thread_fence(std::memory_order_seq_cst);
//atomic_fence_seq_cst();
return;
}
else
@ -2904,7 +2904,7 @@ void spu_thread::do_mfc(bool wait)
if (&args - mfc_queue <= removed)
{
// Remove barrier-class command if it's the first in the queue
std::atomic_thread_fence(std::memory_order_seq_cst);
atomic_fence_seq_cst();
removed++;
return true;
}
@ -3130,7 +3130,7 @@ bool spu_thread::process_mfc_cmd()
// Exit loop
if (ok && (ntime & 127) == 0)
{
_mm_mfence();
atomic_fence_seq_cst();
i = -1;
return;
}
@ -3352,7 +3352,7 @@ bool spu_thread::process_mfc_cmd()
{
if (mfc_size == 0)
{
std::atomic_thread_fence(std::memory_order_seq_cst);
atomic_fence_seq_cst();
}
else
{
@ -4420,7 +4420,7 @@ bool spu_thread::stop_and_signal(u32 code)
fmt::throw_exception("STOP code 0x100: Out_MBox is not empty" HERE);
}
std::atomic_thread_fence(std::memory_order_seq_cst);
atomic_fence_seq_cst();
return true;
}

View File

@ -178,7 +178,7 @@ namespace rsx
{
// Load registers while the RSX is still idle
method_registers = frame->reg_state;
std::atomic_thread_fence(std::memory_order_seq_cst);
atomic_fence_seq_cst();
// start up fifo buffer by dumping the put ptr to first stop
sys_rsx_context_attribute(context_id, 0x001, 0x10000000, fifo_stops[0], 0, 0);

View File

@ -1146,7 +1146,7 @@ atomic_wait_engine::wait(const void* data, u32 size, __m128i old_value, u64 time
std::unique_lock lock(*cond->mtx.get());
#else
if (ext_size)
_mm_mfence();
atomic_fence_seq_cst();
#endif
// Can skip unqueue process if true

View File

@ -5,7 +5,59 @@
#include <mutex>
#ifdef _MSC_VER
#include <atomic>
#pragma warning(push)
#pragma warning(disable: 4996)
#endif
FORCE_INLINE void atomic_fence_consume()
{
#ifdef _MSC_VER
_ReadWriteBarrier();
#else
__atomic_thread_fence(__ATOMIC_CONSUME);
#endif
}
FORCE_INLINE void atomic_fence_acquire()
{
#ifdef _MSC_VER
_ReadWriteBarrier();
#else
__atomic_thread_fence(__ATOMIC_ACQUIRE);
#endif
}
FORCE_INLINE void atomic_fence_release()
{
#ifdef _MSC_VER
_ReadWriteBarrier();
#else
__atomic_thread_fence(__ATOMIC_RELEASE);
#endif
}
FORCE_INLINE void atomic_fence_acq_rel()
{
#ifdef _MSC_VER
_ReadWriteBarrier();
#else
__atomic_thread_fence(__ATOMIC_ACQ_REL);
#endif
}
FORCE_INLINE void atomic_fence_seq_cst()
{
#ifdef _MSC_VER
_ReadWriteBarrier();
_InterlockedOr(static_cast<long*>(_AddressOfReturnAddress()), 0);
_ReadWriteBarrier();
#else
__asm__ volatile ("lock orl $0, 0(%%rsp);" ::: "cc", "memory");
#endif
}
#ifdef _MSC_VER
#pragma warning(pop)
#endif
// Wait timeout extension (in nanoseconds)
@ -286,6 +338,13 @@ struct atomic_storage
return result;
}
static inline T observe(const T& dest)
{
T result;
__atomic_load(reinterpret_cast<const type*>(&dest), reinterpret_cast<type*>(&result), __ATOMIC_RELAXED);
return result;
}
static inline void store(T& dest, T value)
{
static_cast<void>(exchange(dest, value));
@ -506,17 +565,23 @@ struct atomic_storage<T, 1> : atomic_storage<T, 0>
static inline T load(const T& dest)
{
std::atomic_thread_fence(std::memory_order_acquire);
atomic_fence_acquire();
const char value = *reinterpret_cast<const volatile char*>(&dest);
atomic_fence_acquire();
return std::bit_cast<T>(value);
}
static inline T observe(const T& dest)
{
const char value = *reinterpret_cast<const volatile char*>(&dest);
std::atomic_thread_fence(std::memory_order_acquire);
return std::bit_cast<T>(value);
}
static inline void release(T& dest, T value)
{
std::atomic_thread_fence(std::memory_order_release);
atomic_fence_release();
*reinterpret_cast<volatile char*>(&dest) = std::bit_cast<char>(value);
std::atomic_thread_fence(std::memory_order_release);
atomic_fence_release();
}
static inline T exchange(T& dest, T value)
@ -570,17 +635,23 @@ struct atomic_storage<T, 2> : atomic_storage<T, 0>
static inline T load(const T& dest)
{
std::atomic_thread_fence(std::memory_order_acquire);
atomic_fence_acquire();
const short value = *reinterpret_cast<const volatile short*>(&dest);
atomic_fence_acquire();
return std::bit_cast<T>(value);
}
static inline T observe(const T& dest)
{
const short value = *reinterpret_cast<const volatile short*>(&dest);
std::atomic_thread_fence(std::memory_order_acquire);
return std::bit_cast<T>(value);
}
static inline void release(T& dest, T value)
{
std::atomic_thread_fence(std::memory_order_release);
atomic_fence_release();
*reinterpret_cast<volatile short*>(&dest) = std::bit_cast<short>(value);
std::atomic_thread_fence(std::memory_order_release);
atomic_fence_release();
}
static inline T exchange(T& dest, T value)
@ -654,17 +725,23 @@ struct atomic_storage<T, 4> : atomic_storage<T, 0>
static inline T load(const T& dest)
{
std::atomic_thread_fence(std::memory_order_acquire);
atomic_fence_acquire();
const long value = *reinterpret_cast<const volatile long*>(&dest);
atomic_fence_acquire();
return std::bit_cast<T>(value);
}
static inline T observe(const T& dest)
{
const long value = *reinterpret_cast<const volatile long*>(&dest);
std::atomic_thread_fence(std::memory_order_acquire);
return std::bit_cast<T>(value);
}
static inline void release(T& dest, T value)
{
std::atomic_thread_fence(std::memory_order_release);
atomic_fence_release();
*reinterpret_cast<volatile long*>(&dest) = std::bit_cast<long>(value);
std::atomic_thread_fence(std::memory_order_release);
atomic_fence_release();
}
static inline T exchange(T& dest, T value)
@ -744,17 +821,23 @@ struct atomic_storage<T, 8> : atomic_storage<T, 0>
static inline T load(const T& dest)
{
std::atomic_thread_fence(std::memory_order_acquire);
atomic_fence_acquire();
const llong value = *reinterpret_cast<const volatile llong*>(&dest);
atomic_fence_acquire();
return std::bit_cast<T>(value);
}
static inline T observe(const T& dest)
{
const llong value = *reinterpret_cast<const volatile llong*>(&dest);
std::atomic_thread_fence(std::memory_order_acquire);
return std::bit_cast<T>(value);
}
static inline void release(T& dest, T value)
{
std::atomic_thread_fence(std::memory_order_release);
atomic_fence_release();
*reinterpret_cast<volatile llong*>(&dest) = std::bit_cast<llong>(value);
std::atomic_thread_fence(std::memory_order_release);
atomic_fence_release();
}
static inline T exchange(T& dest, T value)
@ -818,9 +901,18 @@ struct atomic_storage<T, 16> : atomic_storage<T, 0>
#ifdef _MSC_VER
static inline T load(const T& dest)
{
std::atomic_thread_fence(std::memory_order_acquire);
atomic_fence_acquire();
__m128i val = _mm_load_si128(reinterpret_cast<const __m128i*>(&dest));
std::atomic_thread_fence(std::memory_order_acquire);
atomic_fence_acquire();
return std::bit_cast<T>(val);
}
static inline T observe(const T& dest)
{
// Barriers are kept intentionally
atomic_fence_acquire();
__m128i val = _mm_load_si128(reinterpret_cast<const __m128i*>(&dest));
atomic_fence_acquire();
return std::bit_cast<T>(val);
}
@ -844,14 +936,16 @@ struct atomic_storage<T, 16> : atomic_storage<T, 0>
static inline void store(T& dest, T value)
{
exchange(dest, value);
atomic_fence_acq_rel();
_mm_store_si128(reinterpret_cast<__m128i*>(&dest), std::bit_cast<__m128i>(value));
atomic_fence_seq_cst();
}
static inline void release(T& dest, T value)
{
std::atomic_thread_fence(std::memory_order_release);
atomic_fence_release();
_mm_store_si128(reinterpret_cast<__m128i*>(&dest), std::bit_cast<__m128i>(value));
std::atomic_thread_fence(std::memory_order_release);
atomic_fence_release();
}
#else
static inline T load(const T& dest)
@ -862,6 +956,15 @@ struct atomic_storage<T, 16> : atomic_storage<T, 0>
return std::bit_cast<T>(val);
}
static inline T observe(const T& dest)
{
// Barriers are kept intentionally
__atomic_thread_fence(__ATOMIC_ACQUIRE);
__m128i val = _mm_load_si128(reinterpret_cast<const __m128i*>(&dest));
__atomic_thread_fence(__ATOMIC_ACQUIRE);
return std::bit_cast<T>(val);
}
static inline bool compare_exchange(T& dest, T& comp, T exch)
{
bool result;
@ -915,7 +1018,9 @@ struct atomic_storage<T, 16> : atomic_storage<T, 0>
static inline void store(T& dest, T value)
{
exchange(dest, value);
__atomic_thread_fence(__ATOMIC_ACQ_REL);
_mm_store_si128(reinterpret_cast<__m128i*>(&dest), std::bit_cast<__m128i>(value));
atomic_fence_seq_cst();
}
static inline void release(T& dest, T value)
@ -1075,6 +1180,12 @@ public:
return atomic_storage<type>::load(m_data);
}
// Relaxed load
type observe() const
{
return atomic_storage<type>::observe(m_data);
}
// Atomically write data
void store(const type& rhs)
{
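
In short, load() keeps its acquire fencing while the new observe() is a plain relaxed read, which is what the call-site changes in the logging code below rely on. A small comparison (hypothetical variable, assuming the atomic_t interface shown above):

atomic_t<unsigned> g_counter{0};

unsigned polled   = g_counter.observe(); // relaxed: cheapest possible read, no ordering
unsigned acquired = g_counter.load();    // acquire semantics, as before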

View File

@ -168,7 +168,7 @@ namespace logs
for (auto&& pair : get_logger()->channels)
{
pair.second->enabled.store(level::notice, std::memory_order_relaxed);
pair.second->enabled.release(level::notice);
}
}
@ -178,7 +178,7 @@ namespace logs
for (auto&& pair : get_logger()->channels)
{
pair.second->enabled.store(level::always, std::memory_order_relaxed);
pair.second->enabled.release(level::always);
}
}
@ -190,7 +190,7 @@ namespace logs
while (found.first != found.second)
{
found.first->second->enabled.store(value, std::memory_order_relaxed);
found.first->second->enabled.release(value);
found.first++;
}
}
@ -203,7 +203,7 @@ namespace logs
if (found.first != found.second)
{
return found.first->second->enabled.load(std::memory_order_relaxed);
return found.first->second->enabled.observe();
}
else
{
@ -275,7 +275,7 @@ logs::listener::~listener()
for (auto&& pair : logger->channels)
{
pair.second->enabled.store(level::always, std::memory_order_relaxed);
pair.second->enabled.release(level::always);
}
}
}
@ -290,7 +290,7 @@ void logs::listener::add(logs::listener* _new)
// Install new listener at the end of linked list
listener* null = nullptr;
while (lis->m_next || !lis->m_next.compare_exchange_strong(null, _new))
while (lis->m_next || !lis->m_next.compare_exchange(null, _new))
{
lis = lis->m_next;
null = nullptr;
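
The log-level call sites above map the old std::atomic operations onto atomic_t roughly as follows; note that release() is stronger than the relaxed store it replaces, and compare_exchange_strong() becomes atomic_t<>::compare_exchange(), as in listener::add. A sketch of the correspondence (hypothetical variable):

atomic_t<int> lvl{0};

lvl.release(3);          // was: lvl.store(3, std::memory_order_relaxed)
int cur = lvl.observe(); // was: lvl.load(std::memory_order_relaxed)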

View File

@ -1,12 +1,12 @@
#pragma once // No BOM and only basic ASCII in this header, or a neko will die
#include <cstdint>
#include <atomic>
#include <map>
#include <memory>
#include <string>
#include <vector>
#include <initializer_list>
#include "util/atomic.hpp"
#include "Utilities/StrFmt.h"
namespace logs
@ -51,7 +51,7 @@ namespace logs
class listener
{
// Next listener (linked list)
std::atomic<listener*> m_next{};
atomic_t<listener*> m_next{};
friend struct message;
@ -76,7 +76,7 @@ namespace logs
const char* const name;
// The lowest logging level enabled for this channel (used for early filtering)
std::atomic<level> enabled;
atomic_t<level> enabled;
// Initialize channel
constexpr channel(const char* name) noexcept
@ -90,7 +90,7 @@ namespace logs
template <typename CharT, std::size_t N, typename... Args>\
void _sev(const CharT(&fmt)[N], const Args&... args)\
{\
if (level::_sev <= enabled.load(std::memory_order_relaxed)) [[unlikely]]\
if (level::_sev <= enabled.observe()) [[unlikely]]\
{\
if constexpr (sizeof...(Args) > 0)\
{\