Mirror of https://github.com/RPCS3/rpcs3.git (synced 2025-03-11 19:14:54 +00:00)
vm::spu max address was overflowing, resulting in issues, so cast to u64 where needed. Fixes #6145.
Use vm::get_addr instead of manually subtracting vm::base(0) from pointers in texture cache code.
Prefer std::atomic_thread_fence over _mm_?fence() and adjust usage to be more correct.
Used sequentially consistent ordering in semaphore_release for the TSX path as well.
Improved memory ordering for sys_rsx_context_iounmap/iomap.
Fixed sync bugs in HLE gcm caused by not using atomic instructions.
Use a release memory barrier for lwsync in PPU LLVM; according to the Xbox 360 programming guide, lwsync is a hardware release memory barrier.
Also use a release barrier where lwsync was originally used in liblv2 sys_lwmutex and cellSync.
Use an acquire barrier for the isync instruction, see https://devblogs.microsoft.com/oldnewthing/20180814-00/?p=99485
parent 1ee7b91646
commit 43f919c04b
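For context, here is a minimal illustrative sketch of how the PowerPC barriers discussed above map onto C++ fences in this commit; the helper names below are hypothetical and not part of the patch:

#include <atomic>

// Hypothetical helpers, for illustration only.
void emulate_lwsync()
{
	// lwsync orders load/load, load/store and store/store accesses (not store/load);
	// the commit treats it as a hardware release barrier, hence a release fence.
	std::atomic_thread_fence(std::memory_order_release);
}

void emulate_isync()
{
	// isync, as used after lwarx plus a conditional branch, acts as an acquire device,
	// so it becomes an acquire fence (see the linked Old New Thing article).
	std::atomic_thread_fence(std::memory_order_acquire);
}

void emulate_sync()
{
	// sync (hwsync) and eieio remain full barriers: a sequentially consistent fence.
	std::atomic_thread_fence(std::memory_order_seq_cst);
}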
@@ -12,6 +12,7 @@
#include "sysPrxForUser.h"

#include <thread>
+#include <atomic>

LOG_CHANNEL(cellGcmSys);

@@ -40,10 +41,11 @@ struct CellGcmSysConfig {
};

u64 system_mode = 0;
-u32 reserved_size = 0;
u32 local_size = 0;
u32 local_addr = 0;
+atomic_t<u32> reserved_size = 0;

// Auxiliary functions

/*
@@ -72,7 +74,7 @@ u32 gcmGetLocalMemorySize(u32 sdk_version)
}

CellGcmOffsetTable offsetTable;
-u16 IoMapTable[0xC00];
+atomic_t<u16> IoMapTable[0xC00]{};

void InitOffsetTable()
{
@@ -1013,8 +1015,6 @@ s32 gcmMapEaIoAddress(u32 ea, u32 io, u32 size, bool is_strict)

ea >>= 20, io >>= 20, size >>= 20;

-IoMapTable[ea] = size;

// Fill the offset table
for (u32 i = 0; i < size; i++)
{
@@ -1022,6 +1022,7 @@ s32 gcmMapEaIoAddress(u32 ea, u32 io, u32 size, bool is_strict)
offsetTable.eaAddress[io + i] = ea + i;
}

+IoMapTable[ea] = size;
return CELL_OK;
}
@@ -1079,8 +1080,6 @@ s32 cellGcmMapMainMemory(u32 ea, u32 size, vm::ptr<u32> offset)

ea >>= 20, size >>= 20;

-IoMapTable[ea] = size;

// Fill the offset table
for (u32 i = 0; i < size; i++)
{
@@ -1088,6 +1087,8 @@ s32 cellGcmMapMainMemory(u32 ea, u32 size, vm::ptr<u32> offset)
offsetTable.eaAddress[io + i] = ea + i;
}

+IoMapTable[ea] = size;

*offset = io << 20;
return CELL_OK;
}
@@ -1127,15 +1128,17 @@ s32 cellGcmUnmapEaIoAddress(u32 ea)
{
cellGcmSys.trace("cellGcmUnmapEaIoAddress(ea=0x%x)", ea);

-if (const u32 size = std::exchange(IoMapTable[ea >>= 20], 0))
+if (const u32 size = IoMapTable[ea >>= 20].exchange(0))
{
const u32 io = offsetTable.ioAddress[ea];

for (u32 i = 0; i < size; i++)
{
-RSXIOMem.io[ea + i].release(offsetTable.ioAddress[ea + i] = 0xFFFF);
-RSXIOMem.ea[io + i].release(offsetTable.eaAddress[io + i] = 0xFFFF);
+RSXIOMem.io[ea + i].raw() = offsetTable.ioAddress[ea + i] = 0xFFFF;
+RSXIOMem.ea[io + i].raw() = offsetTable.eaAddress[io + i] = 0xFFFF;
}

+std::atomic_thread_fence(std::memory_order_seq_cst);
}
else
{
@@ -1150,15 +1153,17 @@ s32 cellGcmUnmapIoAddress(u32 io)
{
cellGcmSys.trace("cellGcmUnmapIoAddress(io=0x%x)", io);

-if (u32 size = std::exchange(IoMapTable[RSXIOMem.ea[io >>= 20]], 0))
+if (u32 size = IoMapTable[RSXIOMem.ea[io >>= 20]].exchange(0))
{
const u32 ea = offsetTable.eaAddress[io];

for (u32 i = 0; i < size; i++)
{
-RSXIOMem.io[ea + i].release(offsetTable.ioAddress[ea + i] = 0xFFFF);
-RSXIOMem.ea[io + i].release(offsetTable.eaAddress[io + i] = 0xFFFF);
+RSXIOMem.io[ea + i].raw() = offsetTable.ioAddress[ea + i] = 0xFFFF;
+RSXIOMem.ea[io + i].raw() = offsetTable.eaAddress[io + i] = 0xFFFF;
}

+std::atomic_thread_fence(std::memory_order_seq_cst);
}
else
{
@@ -15,6 +15,8 @@
#include "sysPrxForUser.h"
#include "cellSpurs.h"

+#include <atomic>

LOG_CHANNEL(cellSpurs);

error_code sys_spu_image_close(vm::ptr<sys_spu_image> img);
@@ -2575,7 +2577,7 @@ s32 _cellSpursWorkloadFlagReceiver(vm::ptr<CellSpurs> spurs, u32 wid, u32 is_set
return CELL_SPURS_POLICY_MODULE_ERROR_STAT;
}

-_mm_mfence();
+std::atomic_thread_fence(std::memory_order_seq_cst);

if (s32 res = spurs->wklFlag.flag.atomic_op([spurs, wid, is_set](be_t<u32>& flag) -> s32
{
@@ -6,6 +6,8 @@
#include "Emu/Cell/lv2/sys_process.h"
#include "cellSync.h"

+#include <atomic>

LOG_CHANNEL(cellSync);

template<>
@@ -85,8 +87,7 @@ error_code cellSyncMutexLock(ppu_thread& ppu, vm::ptr<CellSyncMutex> mutex)
}
}

-_mm_mfence();
+std::atomic_thread_fence(std::memory_order_release);
return CELL_OK;
}
@@ -195,7 +196,7 @@ error_code cellSyncBarrierTryNotify(vm::ptr<CellSyncBarrier> barrier)
return CELL_SYNC_ERROR_ALIGN;
}

-_mm_mfence();
+std::atomic_thread_fence(std::memory_order_release);

if (!barrier->ctrl.atomic_op<&CellSyncBarrier::try_notify>())
{
@@ -219,7 +220,7 @@ error_code cellSyncBarrierWait(ppu_thread& ppu, vm::ptr<CellSyncBarrier> barrier
return CELL_SYNC_ERROR_ALIGN;
}

-_mm_mfence();
+std::atomic_thread_fence(std::memory_order_release);

while (!barrier->ctrl.atomic_op<&CellSyncBarrier::try_wait>())
{
@@ -246,7 +247,7 @@ error_code cellSyncBarrierTryWait(vm::ptr<CellSyncBarrier> barrier)
return CELL_SYNC_ERROR_ALIGN;
}

-_mm_mfence();
+std::atomic_thread_fence(std::memory_order_release);

if (!barrier->ctrl.atomic_op<&CellSyncBarrier::try_wait>())
{
@@ -280,7 +281,7 @@ error_code cellSyncRwmInitialize(vm::ptr<CellSyncRwm> rwm, vm::ptr<void> buffer,
rwm->size = buffer_size;
rwm->buffer = buffer;

-_mm_mfence();
+std::atomic_thread_fence(std::memory_order_release);

return CELL_OK;
}
@@ -452,7 +453,7 @@ error_code cellSyncQueueInitialize(vm::ptr<CellSyncQueue> queue, vm::ptr<u8> buf
queue->depth = depth;
queue->buffer = buffer;

-_mm_mfence();
+std::atomic_thread_fence(std::memory_order_release);

return CELL_OK;
}
@@ -865,7 +866,7 @@ error_code cellSyncLFQueueInitialize(vm::ptr<CellSyncLFQueue> queue, vm::cptr<vo
}
}

-_mm_mfence();
+std::atomic_thread_fence(std::memory_order_release);
}
else
{
@@ -894,7 +895,7 @@ error_code _cellSyncLFQueueGetPushPointer(ppu_thread& ppu, vm::ptr<CellSyncLFQue
{
while (true)
{
-const auto old = queue->push1.load(); _mm_lfence();
+const auto old = queue->push1.load();
auto push = old;

if (var1)
@@ -989,9 +990,10 @@ error_code _cellSyncLFQueueCompletePushPointer(ppu_thread& ppu, vm::ptr<CellSync

while (true)
{
-const auto old = queue->push2.load(); _mm_lfence();
+const auto old = queue->push2.load();
auto push2 = old;

+// Loads must be in this order
const auto old2 = queue->push3.load();
auto push3 = old2;

@@ -1192,7 +1194,7 @@ error_code _cellSyncLFQueueGetPopPointer(ppu_thread& ppu, vm::ptr<CellSyncLFQueu
{
while (true)
{
-const auto old = queue->pop1.load(); _mm_lfence();
+const auto old = queue->pop1.load();
auto pop = old;

if (var1)
@@ -1288,9 +1290,10 @@ error_code _cellSyncLFQueueCompletePopPointer(ppu_thread& ppu, vm::ptr<CellSyncL

while (true)
{
-const auto old = queue->pop2.load(); _mm_lfence();
+const auto old = queue->pop2.load();
auto pop2 = old;

+// Loads must be in this order
const auto old2 = queue->pop3.load();
auto pop3 = old2;

@@ -1489,9 +1492,10 @@ error_code cellSyncLFQueueClear(vm::ptr<CellSyncLFQueue> queue)

while (true)
{
-const auto old = queue->pop1.load(); _mm_lfence();
+const auto old = queue->pop1.load();
auto pop = old;

+// Loads must be in this order
const auto push = queue->push1.load();

s32 var1, var2;
@@ -1540,8 +1544,9 @@ error_code cellSyncLFQueueSize(vm::ptr<CellSyncLFQueue> queue, vm::ptr<u32> size

while (true)
{
-const auto old = queue->pop3.load(); _mm_lfence();
+const auto old = queue->pop3.load();

+// Loads must be in this order
u32 var1 = (u16)queue->pop1.load().m_h1;
u32 var2 = (u16)queue->push1.load().m_h5;

@@ -8,6 +8,8 @@
#include "Emu/Cell/lv2/sys_mutex.h"
#include "sysPrxForUser.h"

+#include <atomic>

extern logs::channel sysPrxForUser;

error_code sys_lwmutex_create(ppu_thread& ppu, vm::ptr<sys_lwmutex_t> lwmutex, vm::ptr<sys_lwmutex_attribute_t> attr)
@@ -128,7 +130,7 @@ error_code sys_lwmutex_lock(ppu_thread& ppu, vm::ptr<sys_lwmutex_t> lwmutex, u64

// recursive locking succeeded
lwmutex->recursive_count++;
-_mm_mfence();
+std::atomic_thread_fence(std::memory_order_release);

return CELL_OK;
}
@@ -288,7 +290,7 @@ error_code sys_lwmutex_trylock(ppu_thread& ppu, vm::ptr<sys_lwmutex_t> lwmutex)

// recursive locking succeeded
lwmutex->recursive_count++;
-_mm_mfence();
+std::atomic_thread_fence(std::memory_order_release);

return CELL_OK;
}
@@ -8,6 +8,7 @@
#include "Emu/Cell/Common.h"

#include <cmath>
+#include <atomic>

#if !defined(_MSC_VER) && !defined(__SSSE3__)
#define _mm_shuffle_epi8(opa, opb) opb
@@ -2966,7 +2967,7 @@ bool ppu_interpreter::CRANDC(ppu_thread& ppu, ppu_opcode_t op)

bool ppu_interpreter::ISYNC(ppu_thread& ppu, ppu_opcode_t op)
{
-_mm_mfence();
+std::atomic_thread_fence(std::memory_order_acquire);
return true;
}

@@ -4046,7 +4047,7 @@ bool ppu_interpreter::LFSUX(ppu_thread& ppu, ppu_opcode_t op)

bool ppu_interpreter::SYNC(ppu_thread& ppu, ppu_opcode_t op)
{
-_mm_mfence();
+std::atomic_thread_fence(std::memory_order_seq_cst);
return true;
}

@@ -4280,7 +4281,7 @@ bool ppu_interpreter::SRADI(ppu_thread& ppu, ppu_opcode_t op)

bool ppu_interpreter::EIEIO(ppu_thread& ppu, ppu_opcode_t op)
{
-_mm_mfence();
+std::atomic_thread_fence(std::memory_order_seq_cst);
return true;
}

@@ -1848,7 +1848,7 @@ void PPUTranslator::CRANDC(ppu_opcode_t op)

void PPUTranslator::ISYNC(ppu_opcode_t op)
{
-m_ir->CreateFence(AtomicOrdering::SequentiallyConsistent);
+m_ir->CreateFence(AtomicOrdering::Acquire);
}

void PPUTranslator::CRXOR(ppu_opcode_t op)
@@ -3105,7 +3105,9 @@ void PPUTranslator::LFSUX(ppu_opcode_t op)

void PPUTranslator::SYNC(ppu_opcode_t op)
{
-m_ir->CreateFence(AtomicOrdering::SequentiallyConsistent);
+// sync: Full seq cst barrier
+// lwsync: Release barrier
+m_ir->CreateFence(op.l10 ? AtomicOrdering::Release : AtomicOrdering::SequentiallyConsistent);
}

void PPUTranslator::LFDX(ppu_opcode_t op)
@@ -6,6 +6,8 @@

#include "Emu/Cell/RawSPUThread.h"

+#include <atomic>

// Originally, SPU MFC registers are accessed externally in a concurrent manner (don't mix with channels, SPU MFC channels are isolated)
thread_local spu_mfc_cmd g_tls_mfc[8] = {};

@@ -173,7 +175,7 @@ bool spu_thread::write_reg(const u32 addr, const u32 value)
case MFC_SYNC_CMD:
{
g_tls_mfc[index] = {};
-_mm_mfence();
+std::atomic_thread_fence(std::memory_order_seq_cst);
return true;
}
}
@@ -140,14 +140,14 @@ bool spu_interpreter::LNOP(spu_thread& spu, spu_opcode_t op)
// This instruction must be used following a store instruction that modifies the instruction stream.
bool spu_interpreter::SYNC(spu_thread& spu, spu_opcode_t op)
{
-_mm_mfence();
+std::atomic_thread_fence(std::memory_order_seq_cst);
return true;
}

// This instruction forces all earlier load, store, and channel instructions to complete before proceeding.
bool spu_interpreter::DSYNC(spu_thread& spu, spu_opcode_t op)
{
-_mm_mfence();
+std::atomic_thread_fence(std::memory_order_seq_cst);
return true;
}

@@ -1662,7 +1662,7 @@ void spu_thread::do_mfc(bool wait)
if (&args - mfc_queue <= removed)
{
// Remove barrier-class command if it's the first in the queue
-_mm_mfence();
+std::atomic_thread_fence(std::memory_order_seq_cst);
removed++;
return true;
}
@@ -2086,7 +2086,7 @@ bool spu_thread::process_mfc_cmd()
{
if (mfc_size == 0)
{
-_mm_mfence();
+std::atomic_thread_fence(std::memory_order_seq_cst);
}
else
{
@@ -3025,12 +3025,13 @@ bool spu_thread::stop_and_signal(u32 code)

case 0x100:
{
// SPU thread group yield (TODO)
if (ch_out_mbox.get_count())
{
fmt::throw_exception("STOP code 0x100: Out_MBox is not empty" HERE);
}

-_mm_mfence();
+std::atomic_thread_fence(std::memory_order_seq_cst);
return true;
}

@@ -284,7 +284,7 @@ public:
// push unconditionally (overwriting latest value), returns true if needs signaling
void push(cpu_thread& spu, u32 value)
{
-value3 = value; _mm_sfence();
+value3.store(value);

if (values.atomic_op([=](sync_var_t& data) -> bool
{
@@ -325,7 +325,6 @@ public:

data.value0 = data.value1;
data.value1 = data.value2;
-_mm_lfence();
data.value2 = this->value3;
}
else
@@ -1,6 +1,7 @@
#include "stdafx.h"
#include "sys_rsx.h"

+#include <atomic>
#include "Emu/System.h"
#include "Emu/Cell/PPUModule.h"
#include "Emu/RSX/GSRender.h"
@@ -175,7 +176,7 @@ error_code sys_rsx_context_iomap(u32 context_id, u32 io, u32 ea, u32 size, u64 f
{
sys_rsx.warning("sys_rsx_context_iomap(context_id=0x%x, io=0x%x, ea=0x%x, size=0x%x, flags=0x%llx)", context_id, io, ea, size, flags);

-if (!size || io & 0xFFFFF || ea + u64{size} >= rsx::constants::local_mem_base || ea & 0xFFFFF || size & 0xFFFFF ||
+if (!size || io & 0xFFFFF || ea + u64{size} > rsx::constants::local_mem_base || ea & 0xFFFFF || size & 0xFFFFF ||
rsx::get_current_renderer()->main_mem_size < io + u64{size})
{
return CELL_EINVAL;
@@ -195,8 +196,8 @@ error_code sys_rsx_context_iomap(u32 context_id, u32 io, u32 ea, u32 size, u64 f

for (u32 i = 0; i < size; i++)
{
-RSXIOMem.io[ea + i].release(io + i);
-RSXIOMem.ea[io + i].release(ea + i);
+RSXIOMem.io[ea + i].raw() = io + i;
+RSXIOMem.ea[io + i].raw() = ea + i;
}

return CELL_OK;
@@ -220,10 +221,11 @@ error_code sys_rsx_context_iounmap(u32 context_id, u32 io, u32 size)
const u32 end = (io >>= 20) + (size >>= 20);
for (u32 ea = RSXIOMem.ea[io]; io < end;)
{
-RSXIOMem.io[ea++].release(0xFFFF);
-RSXIOMem.ea[io++].release(0xFFFF);
+RSXIOMem.io[ea++].raw() = 0xFFFF;
+RSXIOMem.ea[io++].raw() = 0xFFFF;
}

+std::atomic_thread_fence(std::memory_order_seq_cst);
return CELL_OK;
}

@@ -750,7 +750,7 @@ namespace vm
const u32 size = ::align(orig_size, min_page_size);

// return if addr or size is invalid
-if (!size || addr < this->addr || addr + u64{size} > this->addr + this->size || flags & 0x10)
+if (!size || addr < this->addr || addr + u64{size} > this->addr + u64{this->size} || flags & 0x10)
{
return 0;
}
@@ -823,7 +823,7 @@ namespace vm

std::pair<u32, std::shared_ptr<utils::shm>> block_t::get(u32 addr, u32 size)
{
-if (addr < this->addr || addr + u64{size} > this->addr + this->size)
+if (addr < this->addr || addr + u64{size} > this->addr + u64{this->size})
{
return {addr, nullptr};
}
@@ -852,7 +852,7 @@ namespace vm
}

// Range check
-if (std::max<u32>(size, addr - found->first + size) > found->second.second->size())
+if (addr + u64{size} > found->first + u64{found->second.second->size()})
{
return {addr, nullptr};
}
@@ -8,6 +8,7 @@
#include "Emu/RSX/GSRender.h"

#include <map>
+#include <atomic>
#include <exception>

namespace rsx
@@ -179,7 +180,7 @@ namespace rsx
{
// Load registers while the RSX is still idle
method_registers = frame->reg_state;
-_mm_mfence();
+std::atomic_thread_fence(std::memory_order_seq_cst);

// start up fifo buffer by dumping the put ptr to first stop
sys_rsx_context_attribute(context_id, 0x001, 0x10000000, fifo_stops[0], 0, 0);
@@ -2279,8 +2279,8 @@
image_resource_type vram_texture = 0;
image_resource_type dest_texture = 0;

-const u32 dst_address = (u32)((u64)dst.pixels - (u64)vm::base(0));
-u32 src_address = (u32)((u64)src.pixels - (u64)vm::base(0));
+const u32 dst_address = vm::get_addr(dst.pixels);
+u32 src_address = vm::get_addr(src.pixels);

const f32 scale_x = fabsf(dst.scale_x);
const f32 scale_y = fabsf(dst.scale_y);
@@ -46,7 +46,6 @@ struct work_item
{
while (!processed)
{
-_mm_lfence();
std::this_thread::yield();
}

@@ -491,7 +491,7 @@ void GLGSRender::read_buffers()
continue;

rsx::tiled_region color_buffer = get_tiled_address(offset, location & 0xf);
-u32 texaddr = (u32)((u64)color_buffer.ptr - (u64)vm::base(0));
+u32 texaddr = vm::get_addr(color_buffer.ptr);

const utils::address_range range = utils::address_range::start_length(texaddr, pitch * height);
bool success = m_gl_texture_cache.load_memory_from_cache(range, std::get<1>(m_rtts.m_bound_render_targets[i]));
@@ -123,7 +123,7 @@ namespace rsx
}

while (m_enqueued_count.load() != m_processed_count)
-_mm_lfence();
+_mm_pause();
}

void dma_manager::join()
@@ -1,4 +1,4 @@
#include "stdafx.h"
#include "Emu/Memory/vm.h"
#include "Emu/System.h"
#include "Emu/IdManager.h"
@@ -275,7 +275,6 @@ struct flush_request_task
{
while (num_waiters.load() != 0)
{
-_mm_lfence();
_mm_pause();
}
}
@@ -284,7 +283,6 @@
{
while (pending_state.load())
{
-_mm_lfence();
std::this_thread::yield();
}
}
|
||||
#include "stdafx.h"
|
||||
#include "stdafx.h"
|
||||
#include "rsx_methods.h"
|
||||
#include "RSXThread.h"
|
||||
#include "Emu/Memory/vm_reservation.h"
|
||||
@ -10,6 +10,7 @@
|
||||
#include "Capture/rsx_capture.h"
|
||||
|
||||
#include <thread>
|
||||
#include <atomic>
|
||||
|
||||
template <>
|
||||
void fmt_class_string<frame_limit_type>::format(std::string& out, u64 arg)
|
||||
@ -66,13 +67,13 @@ namespace rsx
|
||||
|
||||
// Get raw BE value
|
||||
arg = be_t<u32>{arg}.raw();
|
||||
const auto& sema = vm::_ref<nse_t<u32>>(addr);
|
||||
const auto& sema = vm::_ref<atomic_t<nse_t<u32>>>(addr);
|
||||
|
||||
// TODO: Remove vblank semaphore hack
|
||||
if (sema == arg || addr == rsx->ctxt_addr + 0x30) return;
|
||||
if (sema.load() == arg || addr == rsx->ctxt_addr + 0x30) return;
|
||||
|
||||
u64 start = get_system_time();
|
||||
while (sema != arg)
|
||||
while (sema.load() != arg)
|
||||
{
|
||||
if (Emu.IsStopped())
|
||||
return;
|
||||
@ -107,7 +108,7 @@ namespace rsx
|
||||
rsx->performance_counters.idle_time += (get_system_time() - start);
|
||||
}
|
||||
|
||||
void semaphore_release(thread* rsx, u32 _reg, u32 arg)
|
||||
void semaphore_release(thread* rsx, u32 /*_reg*/, u32 arg)
|
||||
{
|
||||
rsx->sync();
|
||||
rsx->sync_point_request = true;
|
||||
@ -115,7 +116,7 @@ namespace rsx
|
||||
|
||||
if (LIKELY(g_use_rtm))
|
||||
{
|
||||
vm::write32(addr, arg);
|
||||
vm::_ref<atomic_t<u32>>(addr) = arg;
|
||||
}
|
||||
else
|
||||
{
|