
SPU: Operating system LR memory signals

Elad 2024-12-25 11:35:03 +02:00
parent e135012acd
commit a8bfa6a16c
5 changed files with 366 additions and 36 deletions

rpcs3/Emu/Cell/SPUThread.cpp
@@ -341,6 +341,21 @@ extern void mov_rdata(spu_rdata_t& _dst, const spu_rdata_t& _src)
#endif
}
#ifdef _MSC_VER
__forceinline
#endif
extern u32 compute_rdata_hash32(const spu_rdata_t& _src)
{
const auto rhs = reinterpret_cast<const v128*>(_src);
const v128 a = gv_add32(rhs[0], rhs[1]);
const v128 c = gv_add32(rhs[4], rhs[5]);
const v128 b = gv_add32(rhs[2], rhs[3]);
const v128 d = gv_add32(rhs[6], rhs[7]);
const v128 r = gv_add32(gv_add32(a, b), gv_add32(c, d));
const v128 r1 = gv_add32(r, gv_shuffle32<1, 0, 3, 2>(r));
return r1._u32[0] + r1._u32[2];
}
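// The helper above folds the 128-byte reservation line into a single u32: the
// eight 16-byte vectors are summed pairwise in 32-bit lanes, then the four
// surviving lanes are folded horizontally. Minimal usage sketch (hypothetical
// locals, not part of this commit):
//
//   const u32 old_hash = compute_rdata_hash32(rdata);
//   // ... another thread may store to the line ...
//   const bool changed = compute_rdata_hash32(*vm::get_super_ptr<spu_rdata_t>(addr)) != old_hash;
//
// A mismatch reliably signals a change; a match is only probabilistic (lane sums
// can collide), which is tolerable here since all waits below carry timeouts.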
#if defined(ARCH_X64)
static FORCE_INLINE void mov_rdata_nt_avx(__m256i* dst, const __m256i* src)
{
@@ -4718,6 +4733,12 @@ bool spu_thread::process_mfc_cmd()
busy_wait(300);
}
if (getllar_spin_count == 3)
{
// Check other reservations in other threads
lv2_obj::notify_all();
}
// Reset perf
perf0.restart();
}
@@ -4729,12 +4750,17 @@ bool spu_thread::process_mfc_cmd()
// Spinning, might as well yield cpu resources
state += cpu_flag::wait;
usz cache_line_waiter_index = umax;
if (auto wait_var = vm::reservation_notifier_begin_wait(addr, rtime))
{
cache_line_waiter_index = register_cache_line_waiter(addr);
utils::bless<atomic_t<u32>>(&wait_var->raw().wait_flag)->wait(1, atomic_wait_timeout{100'000});
vm::reservation_notifier_end_wait(*wait_var);
}
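// register_cache_line_waiter may return umax when all slots are occupied;
// deregister_cache_line_waiter treats umax as a no-op, so the pairing is safe either way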
deregister_cache_line_waiter(cache_line_waiter_index);
static_cast<void>(test_stopped());
// Quick check if there were reservation changes
@@ -5372,6 +5398,140 @@ bool spu_thread::reservation_check(u32 addr, const decltype(rdata)& data) const
return !res;
}
bool spu_thread::reservation_check(u32 addr, u32 hash, atomic_t<u64, 64>* range_lock)
{
if ((addr >> 28) < 2 || (addr >> 28) == 0xd)
{
// Always-allocated memory does not need strict checking (vm::main or vm::stack)
return compute_rdata_hash32(*vm::get_super_ptr<decltype(rdata)>(addr)) != hash;
}
// Ensure data is allocated (HACK: would raise LR event if not)
// Set range_lock first optimistically
range_lock->store(u64{128} << 32 | addr);
u64 lock_val = *std::prev(std::end(vm::g_range_lock_set));
u64 old_lock = 0;
while (lock_val != old_lock)
{
// Since we want to read data, let's check readability first
if (!(lock_val & vm::range_readable))
{
// Only one abnormal operation is "unreadable"
if ((lock_val >> vm::range_pos) == (vm::range_locked >> vm::range_pos))
{
// All page flags are untouched and can be read safely
if (!vm::check_addr(addr))
{
// Assume our memory is being (de)allocated
range_lock->release(0);
break;
}
// g_shmem values are unchanged too
const u64 is_shmem = vm::g_shmem[addr >> 16];
const u64 test_addr = is_shmem ? (is_shmem | static_cast<u16>(addr)) / 128 : u64{addr} / 128;
const u64 lock_addr = lock_val / 128;
if (test_addr == lock_addr)
{
// Our reservation is locked
range_lock->release(0);
break;
}
break;
}
}
// Fallback to normal range check
const u64 lock_addr = static_cast<u32>(lock_val);
const u32 lock_size = static_cast<u32>(lock_val << 3 >> 35);
if (lock_addr + lock_size <= addr || lock_addr >= addr + 128)
{
// We are outside locked range, so page flags are unaffected
if (!vm::check_addr(addr))
{
range_lock->release(0);
break;
}
}
else if (!(lock_val & vm::range_readable))
{
range_lock->release(0);
break;
}
old_lock = std::exchange(lock_val, *std::prev(std::end(vm::g_range_lock_set)));
}
if (!range_lock->load()) [[unlikely]]
{
return true;
}
const bool res = compute_rdata_hash32(*vm::get_super_ptr<decltype(rdata)>(addr)) == hash;
range_lock->release(0);
return !res;
}
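// Range lock value layout relied on above: bits [31:0] hold the base address,
// bits [60:32] the byte length (hence "lock_val << 3 >> 35"), and the top bits
// the mode flags tested via vm::range_pos / vm::range_readable; storing
// u64{128} << 32 | addr thus advertises a 128-byte read over the reservation line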
usz spu_thread::register_cache_line_waiter(u32 addr)
{
const u64 value = u64{compute_rdata_hash32(rdata)} << 32 | addr;
for (usz i = 0; i < std::size(g_spu_waiters_by_value); i++)
{
auto [old, ok] = g_spu_waiters_by_value[i].fetch_op([value](u64& x)
{
if (x == 0)
{
x = value + 1;
return true;
}
if ((x & -128) == value)
{
x++;
return true;
}
return false;
});
if (ok)
{
return i;
}
}
return umax;
}
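// Slot encoding used above (one atomic u64 per slot):
//   bits [63:32] = compute_rdata_hash32 of the observed line data
//   bits [31:7]  = reservation address (128-byte aligned, low 7 bits are free)
//   bits [6:0]   = waiter count ("value + 1" seeds it, "x++" adds a waiter)
// Unpacking sketch for a loaded slot value v:
//   const u32 addr  = static_cast<u32>(v) & -128;
//   const u32 hash  = static_cast<u32>(v >> 32);
//   const u64 count = v & 127;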
void spu_thread::deregister_cache_line_waiter(usz index)
{
if (index == umax)
{
return;
}
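// Drop one waiter; once the count in bits [6:0] reaches zero, clear the entire slot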
g_spu_waiters_by_value[index].fetch_op([](u64& x)
{
x--;
if ((x & 127) == 0)
{
x = 0;
}
return true;
});
}
std::pair<u32, u32> spu_thread::read_dec() const
{
const u64 res = ch_dec_value - (is_dec_frozen ? 0 : (get_timebased_time() - ch_dec_start_timestamp));
@@ -5739,6 +5899,24 @@ s64 spu_thread::get_ch_value(u32 ch)
#else
const bool reservation_busy_waiting = (seed + ((raddr == spurs_addr) ? 50u : 0u)) < g_cfg.core.spu_reservation_busy_waiting_percentage;
#endif
usz cache_line_waiter_index = umax;
auto check_cache_line_waiter = [&]()
{
if (cache_line_waiter_index == umax)
{
return true;
}
if ((g_spu_waiters_by_value[cache_line_waiter_index] & -128) == 0)
{
deregister_cache_line_waiter(cache_line_waiter_index);
cache_line_waiter_index = umax;
return false;
}
return true;
};
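// check_cache_line_waiter() flips to false once lv2_obj::notify_all has cleared
// the address/hash part of the slot (x &= 127): the watched line was seen to
// change, so the long reservation waits below are skipped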
for (; !events.count; events = get_events(mask1 & ~SPU_EVENT_LR, true, true))
{
@@ -5746,12 +5924,22 @@ s64 spu_thread::get_ch_value(u32 ch)
if (is_stopped(old))
{
if (cache_line_waiter_index != umax)
{
g_spu_waiters_by_value[cache_line_waiter_index].release(0);
}
return -1;
}
// Optimized check
if (raddr)
if (raddr && (mask1 & SPU_EVENT_LR))
{
if (cache_line_waiter_index == umax)
{
cache_line_waiter_index = register_cache_line_waiter(raddr);
}
bool set_lr = false;
if (!vm::check_addr(raddr) || rtime != vm::reservation_acquire(raddr))
@@ -5819,13 +6007,20 @@ s64 spu_thread::get_ch_value(u32 ch)
}
}
// Check other reservations in other threads
lv2_obj::notify_all();
if (raddr - spurs_addr <= 0x80 && !g_cfg.core.spu_accurate_reservations && mask1 == SPU_EVENT_LR)
{
// Wait with an extended timeout: in this situation nearly all writes are notified, which makes the longer wait viable
// Abort notifications are handled specially for performance reasons
if (auto wait_var = vm::reservation_notifier_begin_wait(raddr, rtime))
{
if (check_cache_line_waiter())
{
utils::bless<atomic_t<u32>>(&wait_var->raw().wait_flag)->wait(1, atomic_wait_timeout{300'000});
}
vm::reservation_notifier_end_wait(*wait_var);
}
@@ -5834,9 +6029,14 @@ s64 spu_thread::get_ch_value(u32 ch)
const u32 _raddr = this->raddr;
#ifdef __linux__
if (auto wait_var = vm::reservation_notifier_begin_wait(_raddr, rtime))
{
if (check_cache_line_waiter())
{
utils::bless<atomic_t<u32>>(&wait_var->raw().wait_flag)->wait(1, atomic_wait_timeout{50'000});
}
vm::reservation_notifier_end_wait(*wait_var);
}
#else
@@ -5891,13 +6091,20 @@ s64 spu_thread::get_ch_value(u32 ch)
return false;
}
// Check other reservations in other threads
lv2_obj::notify_all();
return true;
};
if (auto wait_var = vm::reservation_notifier_begin_wait(_raddr, rtime))
{
if (check_cache_line_waiter())
{
atomic_wait_engine::set_one_time_use_wait_callback(wait_cb);
utils::bless<atomic_t<u32>>(&wait_var->raw().wait_flag)->wait(1, atomic_wait_timeout{80'000});
}
vm::reservation_notifier_end_wait(*wait_var);
}
@@ -5918,6 +6125,8 @@ s64 spu_thread::get_ch_value(u32 ch)
thread_ctrl::wait_on(state, old, 100);
}
deregister_cache_line_waiter(cache_line_waiter_index);
wakeup_delay();
if (is_paused(state - cpu_flag::suspend))
@@ -6617,6 +6826,8 @@ bool spu_thread::stop_and_signal(u32 code)
}
}
lv2_obj::notify_all();
while (auto old = +state)
{
if (old & cpu_flag::signal && state.test_and_reset(cpu_flag::signal))
@@ -7185,6 +7396,8 @@ s64 spu_channel::pop_wait(cpu_thread& spu, bool pop)
}
}
lv2_obj::notify_all();
const u32 wait_on_val = static_cast<u32>(((pop ? bit_occupy : 0) | bit_wait) >> 32);
while (true)
@@ -7470,3 +7683,4 @@ void fmt_class_string<spu_channel_4_t>::format(std::string& out, u64 arg)
DECLARE(spu_thread::g_raw_spu_ctr){};
DECLARE(spu_thread::g_raw_spu_id){};
DECLARE(spu_thread::g_spu_work_count){};
DECLARE(spu_thread::g_spu_waiters_by_value){};

rpcs3/Emu/Cell/SPUThread.h
@@ -888,6 +888,9 @@ public:
// Returns true if reservation existed but was just discovered to be lost
// It is safe to use on any address, even if not directly accessed by SPU (so it's slower)
bool reservation_check(u32 addr, const decltype(rdata)& data) const;
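// Hash-based variant used from lv2_obj::notify_all; returns true if the 128-byte line at addr no longer matches hash (or was deallocated mid-check)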
static bool reservation_check(u32 addr, u32 hash, atomic_t<u64, 64>* range_lock);
usz register_cache_line_waiter(u32 addr);
void deregister_cache_line_waiter(usz index);
bool read_reg(const u32 addr, u32& value);
bool write_reg(const u32 addr, const u32 value);
@@ -897,6 +900,8 @@ public:
static atomic_t<u32> g_raw_spu_id[5];
static atomic_t<u32> g_spu_work_count;
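// One slot per concurrently watched cache line; each u64 packs a 32-bit data hash, a 128-byte-aligned address and a 7-bit waiter count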
static atomic_t<u64> g_spu_waiters_by_value[6];
static u32 find_raw_spu(u32 id)
{
if (id < std::size(g_raw_spu_id)) [[likely]]

rpcs3/Emu/Cell/lv2/lv2.cpp
@@ -6,6 +6,8 @@
#include "Emu/Memory/vm_locking.h"
#include "Emu/Cell/PPUFunction.h"
#include "Emu/Cell/PPUThread.h"
#include "Emu/Cell/SPUThread.h"
#include "Emu/Cell/ErrorCodes.h"
#include "Emu/Cell/MFC.h"
#include "sys_sync.h"
@@ -54,6 +56,7 @@
#include <algorithm>
#include <optional>
#include <deque>
#include <shared_mutex>
#include "util/tsc.hpp"
#include "util/sysinfo.hpp"
#include "util/init_mutex.hpp"
@@ -75,6 +78,9 @@ namespace rsx
void set_rsx_yield_flag() noexcept;
}
using spu_rdata_t = decltype(spu_thread::rdata);
extern u32 compute_rdata_hash32(const spu_rdata_t& _src);
template <>
void fmt_class_string<ppu_syscall_code>::format(std::string& out, u64 arg)
{
@@ -2215,3 +2221,129 @@ void lv2_obj::prepare_for_sleep(cpu_thread& cpu)
vm::temporary_unlock(cpu);
cpu_counter::remove(&cpu);
}
void lv2_obj::notify_all() noexcept
{
for (auto cpu : g_to_notify)
{
if (!cpu)
{
break;
}
if (cpu != &g_to_notify)
{
const auto res_start = vm::reservation_notifier(0).second;
const auto res_end = vm::reservation_notifier(umax).second;
if (cpu >= res_start && cpu <= res_end)
{
atomic_wait_engine::notify_all(cpu);
}
else
{
// Note: by the time of notification the thread could have been deallocated which is why the direct function is used
atomic_wait_engine::notify_one(cpu);
}
}
}
g_to_notify[0] = nullptr;
g_postpone_notify_barrier = false;
const auto cpu = cpu_thread::get_current();
if (!cpu)
{
return;
}
if (cpu->get_class() != thread_class::spu && cpu->state.none_of(cpu_flag::suspend))
{
return;
}
std::optional<vm::writer_lock> lock;
constexpr usz total_waiters = std::size(spu_thread::g_spu_waiters_by_value);
u32 notifies[total_waiters]{};
// There may be 6 waiters, but checking them all can be expensive
// Instead, check at most 3, and use the CPU ID to choose the starting index so the work is distributed across all threads
atomic_t<u64, 64>* range_lock = nullptr;
for (usz i = 0, checked = 0; checked < 3 && i < total_waiters; i++)
{
auto& waiter = spu_thread::g_spu_waiters_by_value[(i + cpu->id) % total_waiters];
const u64 value = waiter.load();
u32 raddr = static_cast<u32>(value) & -128;
if (vm::check_addr(raddr))
{
if (((raddr >> 28) < 2 || (raddr >> 28) == 0xd))
{
checked++;
if (compute_rdata_hash32(*vm::get_super_ptr<spu_rdata_t>(raddr)) != static_cast<u32>(value >> 32))
{
// Clear address to avoid a race, keep waiter counter
if (waiter.fetch_op([&](u64& x)
{
if ((x & -128) == (value & -128))
{
x &= 127;
return true;
}
return false;
}).second)
{
notifies[i] = raddr;
}
}
continue;
}
if (!range_lock)
{
range_lock = vm::alloc_range_lock();
}
checked++;
if (spu_thread::reservation_check(raddr, static_cast<u32>(value >> 32), range_lock))
{
// Clear address to avoid a race, keep waiter counter
if (waiter.fetch_op([&](u64& x)
{
if ((x & -128) == (value & -128))
{
x &= 127;
return true;
}
return false;
}).second)
{
notifies[i] = raddr;
}
}
}
}
if (range_lock)
{
vm::free_range_lock(range_lock);
}
for (u32 addr : notifies)
{
if (addr)
{
vm::reservation_notifier_notify(addr);
}
}
}
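// Distribution sketch for the loop above: starting the scan at cpu->id spreads
// the (at most three) hash checks across calling threads, e.g. a thread with
// id 3 probes slots 3, 4, 5, 0, ... while id 4 probes 4, 5, 0, 1, ... until
// three allocated entries have been verified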

rpcs3/Emu/Cell/lv2/sys_ppu_thread.cpp
@@ -166,8 +166,16 @@ s32 sys_ppu_thread_yield(ppu_thread& ppu)
sys_ppu_thread.trace("sys_ppu_thread_yield()");
const s32 success = lv2_obj::yield(ppu) ? CELL_OK : CELL_CANCEL;
if (success == CELL_CANCEL)
{
// Do other work in the meantime
lv2_obj::notify_all();
}
// Return 0 on successful context switch, 1 otherwise
return +!lv2_obj::yield(ppu);
return success;
}
error_code sys_ppu_thread_join(ppu_thread& ppu, u32 thread_id, vm::ptr<u64> vptr)

rpcs3/Emu/Cell/lv2/sys_sync.h
@@ -454,36 +454,7 @@ public:
static bool wait_timeout(u64 usec, ppu_thread* cpu = {}, bool scale = true, bool is_usleep = false);
static inline void notify_all()
{
for (auto cpu : g_to_notify)
{
if (!cpu)
{
break;
}
if (cpu != &g_to_notify)
{
const auto res_start = vm::reservation_notifier(0).second;
const auto res_end = vm::reservation_notifier(umax).second;
if (cpu >= res_start && cpu <= res_end)
{
// Notify SPU reservation
atomic_wait_engine::notify_all(cpu);
}
else
{
// Note: by the time of notification the thread could have been deallocated which is why the direct function is used
atomic_wait_engine::notify_one(cpu);
}
}
}
g_to_notify[0] = nullptr;
g_postpone_notify_barrier = false;
}
static void notify_all() noexcept;
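// Now defined out of line in lv2.cpp: the new body also probes spu_thread::g_spu_waiters_by_value, which this header does not include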
// Can be called before the actual sleep call in order to move it out of mutex scope
static void prepare_for_sleep(cpu_thread& cpu);