LV2: Optimizations and fixes
Fix and optimize sys_ppu_thread_yield.
Fix LV2 syscalls with timeout bug (use ppu_thread::cancel_sleep instead).
Move timeout notification out of mutex scope.
Allow g_waiting timeouts to be awoken in scope.
This commit is contained in:
parent cba4c3cdc4
commit c4cc0154be
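The timeout fix hinges on the new ppu_thread::cancel_sleep handshake: a syscall arms the flag (1) before trying to take ownership, awake_unlocked() promotes it to 2 when the wake-up arrives while the thread is still scheduled (before it actually slept), and the next lv2_obj::sleep call consumes it and degrades into a short yield instead of a real sleep. A minimal sketch of that three-state handshake follows; the names (sleep_token, arm_sleep, on_awake, consume_sleep) are illustrative, not the RPCS3 API, and std::atomic stands in for the plain s32 field that the scheduler mutex protects in the real code.

#include <atomic>
#include <cassert>

// Illustrative three-state flag modelled after ppu_thread::cancel_sleep:
// 0 = idle, 1 = armed before a blocking syscall, 2 = cancelled by an early wake-up.
struct sleep_token
{
    std::atomic<int> cancel_sleep{0};
};

void arm_sleep(sleep_token& t) // syscall path, before trying to own the object
{
    t.cancel_sleep.store(1, std::memory_order_relaxed);
}

void on_awake(sleep_token& t) // scheduler path: the thread is awoken before it actually slept
{
    int expected = 1;
    t.cancel_sleep.compare_exchange_strong(expected, 2, std::memory_order_relaxed);
}

bool consume_sleep(sleep_token& t) // sleep path: false means skip the real sleep and just yield
{
    return t.cancel_sleep.exchange(0, std::memory_order_relaxed) != 2;
}

int main()
{
    sleep_token t;
    arm_sleep(t);
    on_awake(t);               // the wake-up races ahead of the sleep call
    assert(!consume_sleep(t)); // the sleep is cancelled and becomes a short yield
}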
@@ -700,6 +700,12 @@ bool cpu_thread::check_state() noexcept
store = true;
}

if (flags & cpu_flag::notify)
{
flags -= cpu_flag::notify;
store = true;
}

// Can't process dbg_step if we only paused temporarily
if (cpu_can_stop && flags & cpu_flag::dbg_step)
{
@@ -779,6 +785,8 @@ bool cpu_thread::check_state() noexcept
if ((state1 ^ state) - pending_and_temp)
{
// Work could have changed flags
// Reset internal flags as if check_state() has just been called
cpu_sleep_called = false;
continue;
}
}
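check_state() now also consumes cpu_flag::notify, a flag set solely to force a wake-up (for example an expired LV2 timeout) without changing any other thread-state bits. A small self-contained sketch of consuming one bit atomically while leaving the rest untouched; std::atomic<uint32_t> and the flag constants are stand-ins, not the RPCS3 types.

#include <atomic>
#include <cassert>
#include <cstdint>

constexpr std::uint32_t flag_notify = 1u << 0; // stand-in for cpu_flag::notify
constexpr std::uint32_t flag_signal = 1u << 1; // any other flag that must survive

bool consume_notify(std::atomic<std::uint32_t>& state)
{
    std::uint32_t old = state.load();

    while (old & flag_notify)
    {
        if (state.compare_exchange_weak(old, old & ~flag_notify))
        {
            return true; // notify was set and has been cleared; other bits are untouched
        }
    }

    return false;
}

int main()
{
    std::atomic<std::uint32_t> state{flag_notify | flag_signal};
    assert(consume_notify(state));
    assert(state.load() == flag_signal); // signal survives, notify is gone
}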
@@ -24,6 +24,7 @@ enum class cpu_flag : u32
memory, // Thread must unlock memory mutex
pending, // Thread has postponed work
pending_recheck, // Thread needs to recheck if there is pending work before ::pending removal
notify, // Flag meant solely to allow atomic notification on state without changing other flags

dbg_global_pause, // Emulation paused
dbg_pause, // Thread paused
@@ -174,7 +175,7 @@ public:
virtual void cpu_sleep() {}

// Callback for cpu_flag::pending
virtual void cpu_work() {}
virtual void cpu_work() { state -= cpu_flag::pending + cpu_flag::pending_recheck; }

// Callback for cpu_flag::ret
virtual void cpu_return() {}
@@ -270,6 +270,8 @@ public:

alignas(64) const ppu_func_opd_t entry_func;
u64 start_time{0}; // Sleep start timepoint
u64 end_time{umax}; // Sleep end timepoint
s32 cancel_sleep{0}; // Flag to cancel the next lv2_obj::sleep call (when equals 2)
u64 syscall_args[8]{0}; // Last syscall arguments stored
const char* current_function{}; // Current function name for diagnosis, optimized for speed.
const char* last_function{}; // Sticky copy of current_function, is not cleared on function return
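The end_time member pairs with the clamped deadline computed in sleep_unlocked(): start_time + std::min<u64>(timeout, ~start_time), which saturates at the u64 maximum instead of wrapping, so umax stays reserved as the "no timeout" sentinel. A tiny worked example of that clamp, using plain uint64_t instead of the RPCS3 typedefs:

#include <algorithm>
#include <cassert>
#include <cstdint>

// ~start equals UINT64_MAX - start, so adding at most ~start can never overflow.
std::uint64_t deadline(std::uint64_t start, std::uint64_t timeout)
{
    return timeout ? start + std::min<std::uint64_t>(timeout, ~start) : UINT64_MAX;
}

int main()
{
    assert(deadline(100, 50) == 150);                      // normal case
    assert(deadline(UINT64_MAX - 10, 1000) == UINT64_MAX); // clamped instead of wrapping
    assert(deadline(123, 0) == UINT64_MAX);                // zero timeout means "infinite"
}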
@@ -1206,24 +1206,27 @@ namespace cpu_counter
void remove(cpu_thread*) noexcept;
}

void lv2_obj::sleep(cpu_thread& cpu, const u64 timeout)
bool lv2_obj::sleep(cpu_thread& cpu, const u64 timeout)
{
// Should already be performed when using this flag
if (!g_postpone_notify_barrier)
{
prepare_for_sleep(cpu);
}

bool result = false;
const u64 current_time = get_guest_system_time();
{
std::lock_guard lock{g_mutex};
sleep_unlocked(cpu, timeout);

result = sleep_unlocked(cpu, timeout, current_time);

if (!g_to_awake.empty())
{
// Schedule pending entries
awake_unlocked({});
}

schedule_all();
schedule_all(current_time);
}

if (!g_postpone_notify_barrier)
@@ -1232,6 +1235,7 @@ void lv2_obj::sleep(cpu_thread& cpu, const u64 timeout)
}

g_to_awake.clear();
return result;
}
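As restructured above, lv2_obj::sleep() samples the guest clock once, does all queue manipulation under g_mutex, and leaves the actual wake-ups to be flushed outside the lock (right after it, or later through the notify barrier). A generic sketch of that defer-then-notify pattern, under assumed names (queue_mutex, to_notify, wake_ready) rather than the RPCS3 implementation:

#include <array>
#include <atomic>
#include <cstddef>
#include <mutex>

// Decide whom to wake while holding the lock, but notify only after releasing it,
// so the woken threads never immediately contend on the same mutex.
std::mutex queue_mutex;
std::array<std::atomic<int>*, 4> to_notify{}; // fixed slots, similar in spirit to g_to_notify

void wake_ready(std::atomic<int>* a, std::atomic<int>* b)
{
    std::size_t count = 0;
    {
        std::lock_guard lock{queue_mutex};
        // ... scheduling decisions would happen here ...
        to_notify[count++] = a;
        to_notify[count++] = b;
    }

    // Outside the critical section: publish the wake-up and notify.
    for (std::size_t i = 0; i < count; i++)
    {
        to_notify[i]->store(1);
        to_notify[i]->notify_one();
    }
}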
bool lv2_obj::awake(cpu_thread* thread, s32 prio)
@@ -1261,19 +1265,23 @@ bool lv2_obj::awake(cpu_thread* thread, s32 prio)

bool lv2_obj::yield(cpu_thread& thread)
{
vm::temporary_unlock(thread);

if (auto ppu = thread.try_get<ppu_thread>())
{
ppu->raddr = 0; // Clear reservation

if (!atomic_storage<ppu_thread*>::load(ppu->next_ppu))
{
// Nothing to do
return false;
}
}

return awake(&thread, yield_cmd);
}

void lv2_obj::sleep_unlocked(cpu_thread& thread, u64 timeout)
bool lv2_obj::sleep_unlocked(cpu_thread& thread, u64 timeout, u64 current_time)
{
const u64 start_time = get_guest_system_time();
const u64 start_time = current_time;

auto on_to_sleep_update = [&]()
{
@@ -1299,15 +1307,32 @@ void lv2_obj::sleep_unlocked(cpu_thread& thread, u64 timeout)
}
};

bool return_val = true;

if (auto ppu = thread.try_get<ppu_thread>())
{
ppu_log.trace("sleep() - waiting (%zu)", g_pending);

const auto [_ ,ok] = ppu->state.fetch_op([&](bs_t<cpu_flag>& val)
if (ppu->ack_suspend)
{
ppu->ack_suspend = false;
g_pending--;
}

if (std::exchange(ppu->cancel_sleep, 0) == 2)
{
// Signal that the underlying LV2 operation has been cancelled and replaced with a short yield
return_val = false;
}

const auto [_, ok] = ppu->state.fetch_op([&](bs_t<cpu_flag>& val)
{
if (!(val & cpu_flag::signal))
{
val += cpu_flag::suspend;

// Flag used for forced timeout notification
ensure(!timeout || !(val & cpu_flag::notify));
return true;
}

@@ -1316,8 +1341,8 @@ void lv2_obj::sleep_unlocked(cpu_thread& thread, u64 timeout)

if (!ok)
{
ppu_log.trace("sleep() failed (signaled) (%s)", ppu->current_function);
return;
ppu_log.fatal("sleep() failed (signaled) (%s)", ppu->current_function);
return false;
}

// Find and remove the thread
@@ -1328,20 +1353,17 @@ void lv2_obj::sleep_unlocked(cpu_thread& thread, u64 timeout)
g_to_sleep.erase(it);
ppu->start_time = start_time;
on_to_sleep_update();
return true;
}

// Already sleeping
ppu_log.trace("sleep(): called on already sleeping thread.");
return;
}

if (std::exchange(ppu->ack_suspend, false))
{
g_pending--;
return false;
}

ppu->raddr = 0; // Clear reservation
ppu->start_time = start_time;
ppu->end_time = timeout ? start_time + std::min<u64>(timeout, ~start_time) : u64{umax};
}
else if (auto spu = thread.try_get<spu_thread>())
{
@@ -1349,14 +1371,15 @@ void lv2_obj::sleep_unlocked(cpu_thread& thread, u64 timeout)
{
g_to_sleep.erase(it);
on_to_sleep_update();
return true;
}

return;
return false;
}

if (timeout)
{
const u64 wait_until = start_time + timeout;
const u64 wait_until = start_time + std::min<u64>(timeout, ~start_time);

// Register timeout if necessary
for (auto it = g_waiting.cbegin(), end = g_waiting.cend();; it++)
@@ -1368,6 +1391,8 @@ void lv2_obj::sleep_unlocked(cpu_thread& thread, u64 timeout)
}
}
}

return return_val;
}
bool lv2_obj::awake_unlocked(cpu_thread* cpu, s32 prio)
@@ -1403,59 +1428,37 @@ bool lv2_obj::awake_unlocked(cpu_thread* cpu, s32 prio)

if (ppu == cpu)
{
auto ppu2_next = &ppu->next_ppu;
auto ppu2 = ppu->next_ppu;

if (auto next = +*ppu2_next; !next || next->prio != ppu->prio)
{
return false;
}

for (;; i++)
{
const auto next = +*ppu2_next;

if (auto next2 = +next->next_ppu; !next2 || next2->prio != ppu->prio)
{
break;
}

ppu2_next = &next->next_ppu;
}

if (ppu2_next == &ppu->next_ppu)
if (!ppu2 || ppu2->prio != ppu->prio)
{
// Empty 'same prio' threads list
return false;
}

auto ppu2 = +*ppu2_next;
for (i++;; i++)
{
const auto next = ppu2->next_ppu;

if (!next || next->prio != ppu->prio)
{
break;
}

ppu2 = next;
}

// Rotate current thread to the last position of the 'same prio' threads list
*ppu_next = ppu2;

// Exchange forward pointers
if (ppu->next_ppu != ppu2)
{
auto ppu2_val = +ppu2->next_ppu;
ppu2->next_ppu = +ppu->next_ppu;
ppu->next_ppu = ppu2_val;
*ppu2_next = ppu;
}
else
{
auto ppu2_val = +ppu2->next_ppu;
ppu2->next_ppu = ppu;
ppu->next_ppu = ppu2_val;
}
*ppu_next = std::exchange(ppu->next_ppu, std::exchange(ppu2->next_ppu, ppu));

if (i <= g_cfg.core.ppu_threads + 0u)
if (i < g_cfg.core.ppu_threads + 0u)
{
// Threads were rotated, but no context switch was made
return false;
}

ppu->start_time = get_guest_system_time();
cpu = nullptr; // Disable current thread enqueing, also enable threads list enqueing
break;
}
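The two pointer-swap branches above collapse into one statement, *ppu_next = std::exchange(ppu->next_ppu, std::exchange(ppu2->next_ppu, ppu)), which moves the current thread behind the last same-priority entry in a single pass. A standalone sketch of the same rotation on a toy singly linked list (node and rotate are illustrative names), covering both the adjacent and the whole-run case:

#include <cassert>
#include <utility>

struct node { node* next = nullptr; };

// Rotate 'first' to the position after 'last'; 'slot' is the pointer that currently
// refers to 'first' (mirrors *ppu_next / ppu / ppu2 in the diff above).
void rotate(node*& slot, node* first, node* last)
{
    slot = std::exchange(first->next, std::exchange(last->next, first));
}

int main()
{
    node a, b, c;           // list: a -> b -> c
    a.next = &b; b.next = &c;
    node* head = &a;
    rotate(head, &a, &b);   // adjacent case: becomes b -> a -> c
    assert(head == &b && b.next == &a && a.next == &c);

    node d, e, f;           // list: d -> e -> f
    d.next = &e; e.next = &f;
    node* head2 = &d;
    rotate(head2, &d, &f);  // whole-run case: becomes e -> f -> d
    assert(head2 == &e && f.next == &d && d.next == nullptr);
}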
@@ -1479,6 +1482,13 @@ bool lv2_obj::awake_unlocked(cpu_thread* cpu, s32 prio)
if (next == cpu)
{
ppu_log.trace("sleep() - suspended (p=%zu)", g_pending);

if (static_cast<ppu_thread*>(cpu)->cancel_sleep == 1)
{
// The next sleep call of the thread is cancelled
static_cast<ppu_thread*>(cpu)->cancel_sleep = 2;
}

return false;
}

@@ -1510,19 +1520,10 @@ bool lv2_obj::awake_unlocked(cpu_thread* cpu, s32 prio)
// Yield changed the queue before
bool changed_queue = prio == yield_cmd;

if (cpu)
if (cpu && prio != yield_cmd)
{
// Emplace current thread
if (!emplace_thread(cpu))
{
if (g_postpone_notify_barrier)
{
// This flag includes common optimizations regarding syscalls
// one of which is to allow a lock-free version of syscalls with awake behave as semaphore post: always notifies the thread, even if it hasn't slept yet
cpu->state += cpu_flag::signal;
}
}
else
if (emplace_thread(cpu))
{
changed_queue = true;
}
@@ -1530,35 +1531,16 @@ bool lv2_obj::awake_unlocked(cpu_thread* cpu, s32 prio)
else for (const auto _cpu : g_to_awake)
{
// Emplace threads from list
if (!emplace_thread(_cpu))
{
if (g_postpone_notify_barrier)
{
_cpu->state += cpu_flag::signal;
}
}
else
if (emplace_thread(_cpu))
{
changed_queue = true;
}
}

// Remove pending if necessary
if (g_pending && ((cpu && cpu == get_current_cpu_thread()) || prio == yield_cmd))
{
if (auto cur = cpu_thread::get_current<ppu_thread>())
{
if (std::exchange(cur->ack_suspend, false))
{
g_pending--;
}
}
}

auto target = +g_ppu;

// Suspend threads if necessary
for (usz i = 0, thread_count = g_cfg.core.ppu_threads; changed_queue && target; target = target->next_ppu, i++)
for (usz i = 0, thread_count = g_cfg.core.ppu_threads; target; target = target->next_ppu, i++)
{
if (i >= thread_count && cpu_flag::suspend - target->state)
{
@@ -1574,6 +1556,17 @@ bool lv2_obj::awake_unlocked(cpu_thread* cpu, s32 prio)
}
}

const auto current_ppu = cpu_thread::get_current<ppu_thread>();

// Remove pending if necessary
if (current_ppu)
{
if (std::exchange(current_ppu->ack_suspend, false))
{
ensure(g_pending)--;
}
}

return changed_queue;
}
@@ -1585,12 +1578,12 @@ void lv2_obj::cleanup()
g_pending = 0;
}

void lv2_obj::schedule_all()
void lv2_obj::schedule_all(u64 current_time)
{
usz notify_later_idx = 0;

if (!g_pending && g_to_sleep.empty())
{
usz notify_later_idx = 0;

auto target = +g_ppu;

// Wake up threads
@@ -1602,8 +1595,9 @@ void lv2_obj::schedule_all()
target->state ^= (cpu_flag::signal + cpu_flag::suspend);
target->start_time = 0;

if (notify_later_idx >= std::size(g_to_notify) - 1)
if (notify_later_idx == std::size(g_to_notify))
{
// Out of notification slots, notify locally (resizable container is not worth it)
target->state.notify_one(cpu_flag::signal + cpu_flag::suspend);
}
else
@@ -1612,19 +1606,39 @@ void lv2_obj::schedule_all()
}
}
}

g_to_notify[notify_later_idx] = nullptr;
}

// Check registered timeouts
while (!g_waiting.empty())
{
auto& pair = g_waiting.front();
const auto pair = &g_waiting.front();

if (pair.first <= get_guest_system_time())
if (!current_time)
{
pair.second->notify();
current_time = get_guest_system_time();
}

if (pair->first <= current_time)
{
const auto target = pair->second;
g_waiting.pop_front();

if (target != cpu_thread::get_current())
{
// Change cpu_thread::state for the lightweight notification to work
ensure(!target->state.test_and_set(cpu_flag::notify));

// Otherwise notify it to wake itself
if (notify_later_idx == std::size(g_to_notify))
{
// Out of notification slots, notify locally (resizable container is not worth it)
target->state.notify_one(cpu_flag::notify);
}
else
{
g_to_notify[notify_later_idx++] = &target->state;
}
}
}
else
{
@@ -1632,6 +1646,12 @@ void lv2_obj::schedule_all()
break;
}
}

if (notify_later_idx - 1 < std::size(g_to_notify) - 1)
{
// Null-terminate the list if it ends before last slot
g_to_notify[notify_later_idx] = nullptr;
}
}
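schedule_all() now wakes expired g_waiting entries in scope: each expired waiter gets cpu_flag::notify set while the scheduler lock is held and its state is queued into g_to_notify (or notified on the spot once the slots run out), instead of calling notify() per entry as before. A condensed sketch of that drain loop under assumed names (waiting, to_notify, drain_expired), with std::atomic<int> standing in for the thread state:

#include <array>
#include <atomic>
#include <cstddef>
#include <cstdint>
#include <deque>
#include <utility>

std::deque<std::pair<std::uint64_t, std::atomic<int>*>> waiting; // sorted by deadline
std::array<std::atomic<int>*, 4> to_notify{};                    // deferred wake-ups

void drain_expired(std::uint64_t now)
{
    std::size_t idx = 0;

    while (!waiting.empty() && waiting.front().first <= now)
    {
        std::atomic<int>* target = waiting.front().second;
        waiting.pop_front();

        target->fetch_or(1); // mark the forced timeout (cpu_flag::notify analogue)

        if (idx == to_notify.size())
        {
            target->notify_one();      // out of slots: notify immediately
        }
        else
        {
            to_notify[idx++] = target; // otherwise defer until the lock is released
        }
    }

    if (idx < to_notify.size())
    {
        to_notify[idx] = nullptr;      // null-terminate the deferred list
    }
}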
ppu_thread_status lv2_obj::ppu_state(ppu_thread* ppu, bool lock_idm, bool lock_lv2)
@@ -1715,3 +1735,109 @@ bool lv2_obj::has_ppus_in_running_state()

return false;
}

bool lv2_obj::wait_timeout(u64 usec, ppu_thread* cpu, bool scale, bool is_usleep)
{
static_assert(u64{umax} / max_timeout >= 100, "max timeout is not valid for scaling");

const u64 start_time = get_system_time();

if (cpu)
{
if (u64 end_time = cpu->end_time; end_time != umax)
{
const u64 guest_start = get_guest_system_time(start_time);

if (end_time <= guest_start)
{
return true;
}

usec = end_time - guest_start;
scale = true;
}
}

if (scale)
{
// Scale time
usec = std::min<u64>(usec, u64{umax} / 100) * 100 / g_cfg.core.clocks_scale;
}

// Clamp
usec = std::min<u64>(usec, max_timeout);

u64 passed = 0;

atomic_bs_t<cpu_flag> dummy{};
const auto& state = cpu ? cpu->state : dummy;
auto old_state = +state;

auto wait_for = [&](u64 timeout)
{
thread_ctrl::wait_on(state, old_state, timeout);
};

for (;; old_state = state)
{
if (old_state & cpu_flag::notify)
{
// Timeout notification has been forced
break;
}

if (old_state & cpu_flag::signal)
{
return false;
}

if (::is_stopped(old_state) || thread_ctrl::state() == thread_state::aborting)
{
return passed >= usec;
}

if (passed >= usec)
{
break;
}

u64 remaining = usec - passed;
#ifdef __linux__
// NOTE: Assumption that timer initialization has succeeded
u64 host_min_quantum = is_usleep && remaining <= 1000 ? 10 : 50;
#else
// Host scheduler quantum for windows (worst case)
// NOTE: On ps3 this function has very high accuracy
constexpr u64 host_min_quantum = 500;
#endif
// TODO: Tune for other non windows operating sytems

if (g_cfg.core.sleep_timers_accuracy < (is_usleep ? sleep_timers_accuracy_level::_usleep : sleep_timers_accuracy_level::_all_timers))
{
wait_for(remaining);
}
else
{
if (remaining > host_min_quantum)
{
#ifdef __linux__
// Do not wait for the last quantum to avoid loss of accuracy
wait_for(remaining - ((remaining % host_min_quantum) + host_min_quantum));
#else
// Wait on multiple of min quantum for large durations to avoid overloading low thread cpus
wait_for(remaining - (remaining % host_min_quantum));
#endif
}
// TODO: Determine best value for yield delay
else
{
// Try yielding. May cause long wake latency but helps weaker CPUs a lot by alleviating resource pressure
std::this_thread::yield();
}
}

passed = get_system_time() - start_time;
}

return true;
}
@@ -162,6 +162,8 @@ error_code _sys_lwmutex_lock(ppu_thread& ppu, u32 lwmutex_id, u64 timeout)

lv2_obj::prepare_for_sleep(ppu);

ppu.cancel_sleep = 1;

if (s32 signal = mutex.try_own(&ppu))
{
if (signal == smin)
@@ -169,12 +171,13 @@ error_code _sys_lwmutex_lock(ppu_thread& ppu, u32 lwmutex_id, u64 timeout)
ppu.gpr[3] = CELL_EBUSY;
}

ppu.cancel_sleep = 0;
return true;
}

mutex.sleep(ppu, timeout);
const bool finished = !mutex.sleep(ppu, timeout);
notify.cleanup();
return false;
return finished;
});

if (!mutex)
@@ -135,7 +135,7 @@ struct lv2_lwmutex final : lv2_obj
control_data_t store = old;
store.signaled |= (unlock2 ? s32{smin} : 1);

if (lv2_control.compare_and_swap_test(old, store))
if (lv2_control.compare_exchange(old, store))
{
return true;
}
@@ -162,15 +162,19 @@ error_code sys_mutex_lock(ppu_thread& ppu, u32 mutex_id, u64 timeout)
{
lv2_obj::prepare_for_sleep(ppu);

if (mutex.try_own(ppu))
ppu.cancel_sleep = 1;

if (mutex.try_own(ppu) || !mutex.sleep(ppu, timeout))
{
result = {};
}
else

if (ppu.cancel_sleep != 1)
{
mutex.sleep(ppu, timeout);
notify.cleanup();
}

ppu.cancel_sleep = 0;
}

return result;
@@ -225,7 +225,7 @@ public:

private:
// Remove the current thread from the scheduling queue, register timeout
static void sleep_unlocked(cpu_thread&, u64 timeout);
static bool sleep_unlocked(cpu_thread&, u64 timeout, u64 current_time);

// Schedule the thread
static bool awake_unlocked(cpu_thread*, s32 prio = enqueue_cmd);
@@ -233,7 +233,7 @@ private:
public:
static constexpr u64 max_timeout = u64{umax} / 1000;

static void sleep(cpu_thread& cpu, const u64 timeout = 0);
static bool sleep(cpu_thread& cpu, const u64 timeout = 0);

static bool awake(cpu_thread* thread, s32 prio = enqueue_cmd);

@@ -406,95 +406,7 @@ public:
return make;
}

template <bool IsUsleep = false, bool Scale = true>
static bool wait_timeout(u64 usec, cpu_thread* const cpu = {})
{
static_assert(u64{umax} / max_timeout >= 100, "max timeout is not valid for scaling");

if constexpr (Scale)
{
// Scale time
usec = std::min<u64>(usec, u64{umax} / 100) * 100 / g_cfg.core.clocks_scale;
}

// Clamp
usec = std::min<u64>(usec, max_timeout);

u64 passed = 0;

const u64 start_time = get_system_time();

auto wait_for = [cpu](u64 timeout)
{
atomic_bs_t<cpu_flag> dummy{};
auto& state = cpu ? cpu->state : dummy;
const auto old = +state;

if (old & cpu_flag::signal)
{
return true;
}

thread_ctrl::wait_on(state, old, timeout);
return false;
};

while (usec >= passed)
{
u64 remaining = usec - passed;
#ifdef __linux__
// NOTE: Assumption that timer initialization has succeeded
u64 host_min_quantum = IsUsleep && remaining <= 1000 ? 10 : 50;
#else
// Host scheduler quantum for windows (worst case)
// NOTE: On ps3 this function has very high accuracy
constexpr u64 host_min_quantum = 500;
#endif
// TODO: Tune for other non windows operating sytems
bool escape = false;
if (g_cfg.core.sleep_timers_accuracy < (IsUsleep ? sleep_timers_accuracy_level::_usleep : sleep_timers_accuracy_level::_all_timers))
{
escape = wait_for(remaining);
}
else
{
if (remaining > host_min_quantum)
{
#ifdef __linux__
// Do not wait for the last quantum to avoid loss of accuracy
escape = wait_for(remaining - ((remaining % host_min_quantum) + host_min_quantum));
#else
// Wait on multiple of min quantum for large durations to avoid overloading low thread cpus
escape = wait_for(remaining - (remaining % host_min_quantum));
#endif
}
else
{
// Try yielding. May cause long wake latency but helps weaker CPUs a lot by alleviating resource pressure
std::this_thread::yield();
}
}

if (auto cpu0 = get_current_cpu_thread(); cpu0 && cpu0->is_stopped())
{
return false;
}

if (thread_ctrl::state() == thread_state::aborting)
{
return false;
}

if (escape)
{
return false;
}

passed = get_system_time() - start_time;
}

return true;
}
static bool wait_timeout(u64 usec, ppu_thread* cpu = {}, bool scale = true, bool is_usleep = false);

static inline void notify_all()
{
@@ -502,9 +414,7 @@ public:
{
if (!cpu)
{
g_to_notify[0] = nullptr;
g_postpone_notify_barrier = false;
return;
break;
}

if (cpu != &g_to_notify)
@@ -514,6 +424,9 @@ public:
atomic_wait_engine::notify_one(cpu, 4, atomic_wait::default_mask<atomic_bs_t<cpu_flag>>);
}
}

g_to_notify[0] = nullptr;
g_postpone_notify_barrier = false;
}

// Can be called before the actual sleep call in order to move it out of mutex scope
@@ -542,7 +455,8 @@ public:
}

// While IDM mutex is still locked (this function assumes so) check if the notification is still needed
if (cpu != &g_to_notify && !static_cast<const decltype(cpu_thread::state)*>(cpu)->all_of(cpu_flag::signal + cpu_flag::wait))
// Pending flag is meant for forced notification (if the CPU really has pending work it can restore the flag in theory)
if (cpu != &g_to_notify && static_cast<const decltype(cpu_thread::state)*>(cpu)->none_of(cpu_flag::signal + cpu_flag::pending))
{
// Omit it (this is a void pointer, it can hold anything)
cpu = &g_to_notify;
@@ -575,5 +489,5 @@ private:
// If a notify_all_t object exists locally, postpone notifications to the destructor of it (not recursive, notifies on the first destructor for safety)
static thread_local bool g_postpone_notify_barrier;

static void schedule_all();
static void schedule_all(u64 current_time = 0);
};
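For the call sites below, the old template arguments map onto the new trailing runtime parameters: wait_timeout<IsUsleep, Scale>(usec, cpu) becomes wait_timeout(usec, cpu, scale, is_usleep). Note that sys_timer_usleep now also passes &ppu, so the new end_time/cpu_flag::notify path applies to it, while the RSX wait keeps passing nullptr.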
@@ -409,7 +409,7 @@ error_code sys_timer_usleep(ppu_thread& ppu, u64 sleep_time)
{
lv2_obj::sleep(ppu, g_cfg.core.sleep_timers_accuracy < sleep_timers_accuracy_level::_usleep ? sleep_time : 0);

if (!lv2_obj::wait_timeout<true>(sleep_time))
if (!lv2_obj::wait_timeout(sleep_time, &ppu, true, true))
{
ppu.state += cpu_flag::again;
}
@@ -3404,7 +3404,7 @@ namespace rsx
if (target_rsx_flip_time > time + 1000)
{
const auto delay_us = target_rsx_flip_time - time;
lv2_obj::wait_timeout<false, false>(delay_us);
lv2_obj::wait_timeout(delay_us, nullptr, false);
performance_counters.idle_time += delay_us;
}
}