atomic.hpp: add some features and optimizations

Add atomic_t<>::observe() (relaxed load)
Add atomic_fence_XXX() (barrier functions)
Get rid of the MFENCE instruction, replacing it with a no-op LOCK OR on the stack.
Remove the <atomic> dependency from stdafx.h and relevant headers.
Nekotekina 2020-12-06 12:10:00 +03:00
parent 77aa9e58f2
commit b16cc618b5
14 changed files with 171 additions and 60 deletions
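
A minimal usage sketch of the new API (the names come from this commit; the flag-passing scenario is illustrative only and assumes atomic_t<bool> behaves like the integral specializations shown further down). The atomic_fence_consume/acquire/release/acq_rel/seq_cst() helpers replace std::atomic_thread_fence at the call sites below:

#include "util/atomic.hpp"

atomic_t<bool> g_ready{false};

void producer()
{
	// ... write data ...
	g_ready.release(true);        // release store: publishes the data written above
}

void consumer()
{
	while (!g_ready.observe())    // relaxed load, no ordering implied
	{
		// spin
	}
	atomic_fence_acquire();       // order the polling loop before the data reads
	// ... read data ...
}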

View File

@ -3029,7 +3029,7 @@ s32 _cellSpursWorkloadFlagReceiver(ppu_thread& ppu, vm::ptr<CellSpurs> spurs, u3
return CELL_SPURS_POLICY_MODULE_ERROR_STAT;
}
std::atomic_thread_fence(std::memory_order_acq_rel);
atomic_fence_acq_rel();
struct alignas(128) wklFlagOp
{

View File

@ -85,7 +85,7 @@ error_code cellSyncMutexLock(ppu_thread& ppu, vm::ptr<CellSyncMutex> mutex)
}
}
std::atomic_thread_fence(std::memory_order_acq_rel);
atomic_fence_acq_rel();
return CELL_OK;
}
@ -194,7 +194,7 @@ error_code cellSyncBarrierTryNotify(vm::ptr<CellSyncBarrier> barrier)
return CELL_SYNC_ERROR_ALIGN;
}
std::atomic_thread_fence(std::memory_order_acq_rel);
atomic_fence_acq_rel();
if (!barrier->ctrl.atomic_op(&CellSyncBarrier::try_notify))
{
@ -218,7 +218,7 @@ error_code cellSyncBarrierWait(ppu_thread& ppu, vm::ptr<CellSyncBarrier> barrier
return CELL_SYNC_ERROR_ALIGN;
}
std::atomic_thread_fence(std::memory_order_acq_rel);
atomic_fence_acq_rel();
while (!barrier->ctrl.atomic_op(&CellSyncBarrier::try_wait))
{
@ -245,7 +245,7 @@ error_code cellSyncBarrierTryWait(vm::ptr<CellSyncBarrier> barrier)
return CELL_SYNC_ERROR_ALIGN;
}
std::atomic_thread_fence(std::memory_order_acq_rel);
atomic_fence_acq_rel();
if (!barrier->ctrl.atomic_op(&CellSyncBarrier::try_wait))
{
@ -279,7 +279,7 @@ error_code cellSyncRwmInitialize(vm::ptr<CellSyncRwm> rwm, vm::ptr<void> buffer,
rwm->size = buffer_size;
rwm->buffer = buffer;
std::atomic_thread_fence(std::memory_order_acq_rel);
atomic_fence_acq_rel();
return CELL_OK;
}
@ -451,7 +451,7 @@ error_code cellSyncQueueInitialize(vm::ptr<CellSyncQueue> queue, vm::ptr<u8> buf
queue->depth = depth;
queue->buffer = buffer;
std::atomic_thread_fence(std::memory_order_acq_rel);
atomic_fence_acq_rel();
return CELL_OK;
}
@ -863,7 +863,7 @@ error_code cellSyncLFQueueInitialize(vm::ptr<CellSyncLFQueue> queue, vm::cptr<vo
}
}
std::atomic_thread_fence(std::memory_order_acq_rel);
atomic_fence_acq_rel();
}
else
{

View File

@ -128,7 +128,7 @@ error_code sys_lwmutex_lock(ppu_thread& ppu, vm::ptr<sys_lwmutex_t> lwmutex, u64
// recursive locking succeeded
lwmutex->recursive_count++;
std::atomic_thread_fence(std::memory_order_acq_rel);
atomic_fence_acq_rel();
return CELL_OK;
}
@ -288,7 +288,7 @@ error_code sys_lwmutex_trylock(ppu_thread& ppu, vm::ptr<sys_lwmutex_t> lwmutex)
// recursive locking succeeded
lwmutex->recursive_count++;
std::atomic_thread_fence(std::memory_order_acq_rel);
atomic_fence_acq_rel();
return CELL_OK;
}

View File

@ -3156,7 +3156,7 @@ bool ppu_interpreter::CRANDC(ppu_thread& ppu, ppu_opcode_t op)
bool ppu_interpreter::ISYNC(ppu_thread& ppu, ppu_opcode_t op)
{
std::atomic_thread_fence(std::memory_order_acquire);
atomic_fence_acquire();
return true;
}
@ -4222,7 +4222,7 @@ bool ppu_interpreter::LFSUX(ppu_thread& ppu, ppu_opcode_t op)
bool ppu_interpreter::SYNC(ppu_thread& ppu, ppu_opcode_t op)
{
std::atomic_thread_fence(std::memory_order_seq_cst);
atomic_fence_seq_cst();
return true;
}
@ -4432,7 +4432,7 @@ bool ppu_interpreter::SRADI(ppu_thread& ppu, ppu_opcode_t op)
bool ppu_interpreter::EIEIO(ppu_thread& ppu, ppu_opcode_t op)
{
std::atomic_thread_fence(std::memory_order_seq_cst);
atomic_fence_seq_cst();
return true;
}
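
For reference, the interpreter's mapping of PPU ordering instructions onto the new helpers, written out as stand-alone functions (the helper names are hypothetical; the mapping itself is taken from the hunks above):

#include "util/atomic.hpp"

inline void ppu_isync_fence() { atomic_fence_acquire(); } // ISYNC: modeled as an acquire fence
inline void ppu_sync_fence()  { atomic_fence_seq_cst(); } // SYNC (hwsync): full barrier
inline void ppu_eieio_fence() { atomic_fence_seq_cst(); } // EIEIO: conservatively also a full barrier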

View File

@ -1208,7 +1208,7 @@ static T ppu_load_acquire_reservation(ppu_thread& ppu, u32 addr)
perf_meter<"LARX"_u32> perf0;
// Do not allow stores accessed from the same cache line to pass the reservation load
std::atomic_thread_fence(std::memory_order_seq_cst);
atomic_fence_seq_cst();
if (addr % sizeof(T))
{
@ -1322,7 +1322,7 @@ static T ppu_load_acquire_reservation(ppu_thread& ppu, u32 addr)
else
{
mov_rdata(ppu.rdata, vm::_ref<spu_rdata_t>(addr & -128));
std::atomic_thread_fence(std::memory_order_acquire);
atomic_fence_acquire();
// Load relevant 64 bits of reservation data
std::memcpy(&rdata, &ppu.rdata[addr & 0x78], 8);

View File

@ -105,7 +105,7 @@ bool spu_thread::read_reg(const u32 addr, u32& value)
case MFC_EIEIO_CMD:
case MFC_SYNC_CMD:
{
std::atomic_thread_fence(std::memory_order_seq_cst);
atomic_fence_seq_cst();
value = MFC_PPU_DMA_CMD_ENQUEUE_SUCCESSFUL;
return true;
}

View File

@ -1310,7 +1310,7 @@ void spu_recompiler::LNOP(spu_opcode_t op)
void spu_recompiler::SYNC(spu_opcode_t op)
{
// This instruction must be used following a store instruction that modifies the instruction stream.
c->mfence();
c->lock().or_(asmjit::x86::dword_ptr(asmjit::x86::rsp), 0);
if (g_cfg.core.spu_block_size == spu_block_size_type::safe)
{
@ -1325,7 +1325,7 @@ void spu_recompiler::SYNC(spu_opcode_t op)
void spu_recompiler::DSYNC(spu_opcode_t op)
{
// This instruction forces all earlier load, store, and channel instructions to complete before proceeding.
c->mfence();
c->lock().or_(asmjit::x86::dword_ptr(asmjit::x86::rsp), 0);
}
void spu_recompiler::MFSPR(spu_opcode_t op)
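
The MFENCE replacement in SYNC and DSYNC above relies on the fact that any LOCK-prefixed read-modify-write is a full memory barrier on x86, and a LOCK OR of zero into an already-cached stack dword is typically cheaper than MFENCE. A stand-alone sketch of the same idiom (illustrative only; the commit's own helper in util/atomic.hpp pokes the return-address slot instead of a local):

#ifdef _MSC_VER
#include <intrin.h>
#endif

inline void full_fence_via_lock_or()
{
#ifdef _MSC_VER
	long dummy = 0;
	_InterlockedOr(&dummy, 0); // locked RMW: full barrier, value left unchanged
#else
	__asm__ volatile("lock orl $0, 0(%%rsp)" ::: "cc", "memory"); // same trick on the stack top
#endif
}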

View File

@ -141,14 +141,14 @@ bool spu_interpreter::LNOP(spu_thread& spu, spu_opcode_t op)
// This instruction must be used following a store instruction that modifies the instruction stream.
bool spu_interpreter::SYNC(spu_thread& spu, spu_opcode_t op)
{
std::atomic_thread_fence(std::memory_order_seq_cst);
atomic_fence_seq_cst();
return true;
}
// This instruction forces all earlier load, store, and channel instructions to complete before proceeding.
bool spu_interpreter::DSYNC(spu_thread& spu, spu_opcode_t op)
{
std::atomic_thread_fence(std::memory_order_seq_cst);
atomic_fence_seq_cst();
return true;
}

View File

@ -2277,7 +2277,7 @@ void spu_thread::do_dma_transfer(spu_thread* _this, const spu_mfc_cmd& args, u8*
}
}
//std::atomic_thread_fence(std::memory_order_seq_cst);
//atomic_fence_seq_cst();
return;
}
else
@ -2904,7 +2904,7 @@ void spu_thread::do_mfc(bool wait)
if (&args - mfc_queue <= removed)
{
// Remove barrier-class command if it's the first in the queue
std::atomic_thread_fence(std::memory_order_seq_cst);
atomic_fence_seq_cst();
removed++;
return true;
}
@ -3130,7 +3130,7 @@ bool spu_thread::process_mfc_cmd()
// Exit loop
if (ok && (ntime & 127) == 0)
{
_mm_mfence();
atomic_fence_seq_cst();
i = -1;
return;
}
@ -3352,7 +3352,7 @@ bool spu_thread::process_mfc_cmd()
{
if (mfc_size == 0)
{
std::atomic_thread_fence(std::memory_order_seq_cst);
atomic_fence_seq_cst();
}
else
{
@ -4420,7 +4420,7 @@ bool spu_thread::stop_and_signal(u32 code)
fmt::throw_exception("STOP code 0x100: Out_MBox is not empty" HERE);
}
std::atomic_thread_fence(std::memory_order_seq_cst);
atomic_fence_seq_cst();
return true;
}

View File

@ -178,7 +178,7 @@ namespace rsx
{
// Load registers while the RSX is still idle
method_registers = frame->reg_state;
std::atomic_thread_fence(std::memory_order_seq_cst);
atomic_fence_seq_cst();
// start up fifo buffer by dumping the put ptr to first stop
sys_rsx_context_attribute(context_id, 0x001, 0x10000000, fifo_stops[0], 0, 0);

View File

@ -1146,7 +1146,7 @@ atomic_wait_engine::wait(const void* data, u32 size, __m128i old_value, u64 time
std::unique_lock lock(*cond->mtx.get());
#else
if (ext_size)
_mm_mfence();
atomic_fence_seq_cst();
#endif
// Can skip unqueue process if true

View File

@ -5,7 +5,59 @@
#include <mutex>
#ifdef _MSC_VER
#include <atomic>
#pragma warning(push)
#pragma warning(disable: 4996)
#endif
FORCE_INLINE void atomic_fence_consume()
{
#ifdef _MSC_VER
_ReadWriteBarrier();
#else
__atomic_thread_fence(__ATOMIC_CONSUME);
#endif
}
FORCE_INLINE void atomic_fence_acquire()
{
#ifdef _MSC_VER
_ReadWriteBarrier();
#else
__atomic_thread_fence(__ATOMIC_ACQUIRE);
#endif
}
FORCE_INLINE void atomic_fence_release()
{
#ifdef _MSC_VER
_ReadWriteBarrier();
#else
__atomic_thread_fence(__ATOMIC_RELEASE);
#endif
}
FORCE_INLINE void atomic_fence_acq_rel()
{
#ifdef _MSC_VER
_ReadWriteBarrier();
#else
__atomic_thread_fence(__ATOMIC_ACQ_REL);
#endif
}
FORCE_INLINE void atomic_fence_seq_cst()
{
#ifdef _MSC_VER
_ReadWriteBarrier();
_InterlockedOr(static_cast<long*>(_AddressOfReturnAddress()), 0);
_ReadWriteBarrier();
#else
__asm__ volatile ("lock orl $0, 0(%%rsp);" ::: "cc", "memory");
#endif
}
#ifdef _MSC_VER
#pragma warning(pop)
#endif
// Wait timeout extension (in nanoseconds)
@ -286,6 +338,13 @@ struct atomic_storage
return result;
}
static inline T observe(const T& dest)
{
T result;
__atomic_load(reinterpret_cast<const type*>(&dest), reinterpret_cast<type*>(&result), __ATOMIC_RELAXED);
return result;
}
static inline void store(T& dest, T value)
{
static_cast<void>(exchange(dest, value));
@ -506,17 +565,23 @@ struct atomic_storage<T, 1> : atomic_storage<T, 0>
static inline T load(const T& dest)
{
std::atomic_thread_fence(std::memory_order_acquire);
atomic_fence_acquire();
const char value = *reinterpret_cast<const volatile char*>(&dest);
atomic_fence_acquire();
return std::bit_cast<T>(value);
}
static inline T observe(const T& dest)
{
const char value = *reinterpret_cast<const volatile char*>(&dest);
std::atomic_thread_fence(std::memory_order_acquire);
return std::bit_cast<T>(value);
}
static inline void release(T& dest, T value)
{
std::atomic_thread_fence(std::memory_order_release);
atomic_fence_release();
*reinterpret_cast<volatile char*>(&dest) = std::bit_cast<char>(value);
std::atomic_thread_fence(std::memory_order_release);
atomic_fence_release();
}
static inline T exchange(T& dest, T value)
@ -570,17 +635,23 @@ struct atomic_storage<T, 2> : atomic_storage<T, 0>
static inline T load(const T& dest)
{
std::atomic_thread_fence(std::memory_order_acquire);
atomic_fence_acquire();
const short value = *reinterpret_cast<const volatile short*>(&dest);
atomic_fence_acquire();
return std::bit_cast<T>(value);
}
static inline T observe(const T& dest)
{
const short value = *reinterpret_cast<const volatile short*>(&dest);
std::atomic_thread_fence(std::memory_order_acquire);
return std::bit_cast<T>(value);
}
static inline void release(T& dest, T value)
{
std::atomic_thread_fence(std::memory_order_release);
atomic_fence_release();
*reinterpret_cast<volatile short*>(&dest) = std::bit_cast<short>(value);
std::atomic_thread_fence(std::memory_order_release);
atomic_fence_release();
}
static inline T exchange(T& dest, T value)
@ -654,17 +725,23 @@ struct atomic_storage<T, 4> : atomic_storage<T, 0>
static inline T load(const T& dest)
{
std::atomic_thread_fence(std::memory_order_acquire);
atomic_fence_acquire();
const long value = *reinterpret_cast<const volatile long*>(&dest);
atomic_fence_acquire();
return std::bit_cast<T>(value);
}
static inline T observe(const T& dest)
{
const long value = *reinterpret_cast<const volatile long*>(&dest);
std::atomic_thread_fence(std::memory_order_acquire);
return std::bit_cast<T>(value);
}
static inline void release(T& dest, T value)
{
std::atomic_thread_fence(std::memory_order_release);
atomic_fence_release();
*reinterpret_cast<volatile long*>(&dest) = std::bit_cast<long>(value);
std::atomic_thread_fence(std::memory_order_release);
atomic_fence_release();
}
static inline T exchange(T& dest, T value)
@ -744,17 +821,23 @@ struct atomic_storage<T, 8> : atomic_storage<T, 0>
static inline T load(const T& dest)
{
std::atomic_thread_fence(std::memory_order_acquire);
atomic_fence_acquire();
const llong value = *reinterpret_cast<const volatile llong*>(&dest);
atomic_fence_acquire();
return std::bit_cast<T>(value);
}
static inline T observe(const T& dest)
{
const llong value = *reinterpret_cast<const volatile llong*>(&dest);
std::atomic_thread_fence(std::memory_order_acquire);
return std::bit_cast<T>(value);
}
static inline void release(T& dest, T value)
{
std::atomic_thread_fence(std::memory_order_release);
atomic_fence_release();
*reinterpret_cast<volatile llong*>(&dest) = std::bit_cast<llong>(value);
std::atomic_thread_fence(std::memory_order_release);
atomic_fence_release();
}
static inline T exchange(T& dest, T value)
@ -818,9 +901,18 @@ struct atomic_storage<T, 16> : atomic_storage<T, 0>
#ifdef _MSC_VER
static inline T load(const T& dest)
{
std::atomic_thread_fence(std::memory_order_acquire);
atomic_fence_acquire();
__m128i val = _mm_load_si128(reinterpret_cast<const __m128i*>(&dest));
std::atomic_thread_fence(std::memory_order_acquire);
atomic_fence_acquire();
return std::bit_cast<T>(val);
}
static inline T observe(const T& dest)
{
// Barriers are kept intentionally
atomic_fence_acquire();
__m128i val = _mm_load_si128(reinterpret_cast<const __m128i*>(&dest));
atomic_fence_acquire();
return std::bit_cast<T>(val);
}
@ -844,14 +936,16 @@ struct atomic_storage<T, 16> : atomic_storage<T, 0>
static inline void store(T& dest, T value)
{
exchange(dest, value);
atomic_fence_acq_rel();
_mm_store_si128(reinterpret_cast<__m128i*>(&dest), std::bit_cast<__m128i>(value));
atomic_fence_seq_cst();
}
static inline void release(T& dest, T value)
{
std::atomic_thread_fence(std::memory_order_release);
atomic_fence_release();
_mm_store_si128(reinterpret_cast<__m128i*>(&dest), std::bit_cast<__m128i>(value));
std::atomic_thread_fence(std::memory_order_release);
atomic_fence_release();
}
#else
static inline T load(const T& dest)
@ -862,6 +956,15 @@ struct atomic_storage<T, 16> : atomic_storage<T, 0>
return std::bit_cast<T>(val);
}
static inline T observe(const T& dest)
{
// Barriers are kept intentionally
__atomic_thread_fence(__ATOMIC_ACQUIRE);
__m128i val = _mm_load_si128(reinterpret_cast<const __m128i*>(&dest));
__atomic_thread_fence(__ATOMIC_ACQUIRE);
return std::bit_cast<T>(val);
}
static inline bool compare_exchange(T& dest, T& comp, T exch)
{
bool result;
@ -915,7 +1018,9 @@ struct atomic_storage<T, 16> : atomic_storage<T, 0>
static inline void store(T& dest, T value)
{
exchange(dest, value);
__atomic_thread_fence(__ATOMIC_ACQ_REL);
_mm_store_si128(reinterpret_cast<__m128i*>(&dest), std::bit_cast<__m128i>(value));
atomic_fence_seq_cst();
}
static inline void release(T& dest, T value)
@ -1075,6 +1180,12 @@ public:
return atomic_storage<type>::load(m_data);
}
// Relaxed load
type observe() const
{
return atomic_storage<type>::observe(m_data);
}
// Atomically write data
void store(const type& rhs)
{
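
In short, load() keeps its acquire fencing while the new observe() is a plain relaxed read, which is what the call-site changes in the logging code below rely on. A small comparison (hypothetical variable, assuming the atomic_t interface shown above):

atomic_t<unsigned> g_counter{0};

unsigned polled   = g_counter.observe(); // relaxed: cheapest possible read, no ordering
unsigned acquired = g_counter.load();    // acquire semantics, as before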

View File

@ -168,7 +168,7 @@ namespace logs
for (auto&& pair : get_logger()->channels)
{
pair.second->enabled.store(level::notice, std::memory_order_relaxed);
pair.second->enabled.release(level::notice);
}
}
@ -178,7 +178,7 @@ namespace logs
for (auto&& pair : get_logger()->channels)
{
pair.second->enabled.store(level::always, std::memory_order_relaxed);
pair.second->enabled.release(level::always);
}
}
@ -190,7 +190,7 @@ namespace logs
while (found.first != found.second)
{
found.first->second->enabled.store(value, std::memory_order_relaxed);
found.first->second->enabled.release(value);
found.first++;
}
}
@ -203,7 +203,7 @@ namespace logs
if (found.first != found.second)
{
return found.first->second->enabled.load(std::memory_order_relaxed);
return found.first->second->enabled.observe();
}
else
{
@ -275,7 +275,7 @@ logs::listener::~listener()
for (auto&& pair : logger->channels)
{
pair.second->enabled.store(level::always, std::memory_order_relaxed);
pair.second->enabled.release(level::always);
}
}
}
@ -290,7 +290,7 @@ void logs::listener::add(logs::listener* _new)
// Install new listener at the end of linked list
listener* null = nullptr;
while (lis->m_next || !lis->m_next.compare_exchange_strong(null, _new))
while (lis->m_next || !lis->m_next.compare_exchange(null, _new))
{
lis = lis->m_next;
null = nullptr;
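
The log-level call sites above map the old std::atomic operations onto atomic_t roughly as follows; note that release() is stronger than the relaxed store it replaces, and compare_exchange_strong() becomes atomic_t<>::compare_exchange(), as in listener::add. A sketch of the correspondence (hypothetical variable):

atomic_t<int> lvl{0};

lvl.release(3);          // was: lvl.store(3, std::memory_order_relaxed)
int cur = lvl.observe(); // was: lvl.load(std::memory_order_relaxed)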

View File

@ -1,12 +1,12 @@
#pragma once // No BOM and only basic ASCII in this header, or a neko will die
#include <cstdint>
#include <atomic>
#include <map>
#include <memory>
#include <string>
#include <vector>
#include <initializer_list>
#include "util/atomic.hpp"
#include "Utilities/StrFmt.h"
namespace logs
@ -51,7 +51,7 @@ namespace logs
class listener
{
// Next listener (linked list)
std::atomic<listener*> m_next{};
atomic_t<listener*> m_next{};
friend struct message;
@ -76,7 +76,7 @@ namespace logs
const char* const name;
// The lowest logging level enabled for this channel (used for early filtering)
std::atomic<level> enabled;
atomic_t<level> enabled;
// Initialize channel
constexpr channel(const char* name) noexcept
@ -90,7 +90,7 @@ namespace logs
template <typename CharT, std::size_t N, typename... Args>\
void _sev(const CharT(&fmt)[N], const Args&... args)\
{\
if (level::_sev <= enabled.load(std::memory_order_relaxed)) [[unlikely]]\
if (level::_sev <= enabled.observe()) [[unlikely]]\
{\
if constexpr (sizeof...(Args) > 0)\
{\