From 77e8f9a8ab551ebea5ceb511e257b519f2c0c229 Mon Sep 17 00:00:00 2001 From: Elad Ashkenazi <18193363+elad335@users.noreply.github.com> Date: Sat, 13 Jul 2024 10:48:41 +0300 Subject: [PATCH] SPU: Utilize Operating System sleep in detected RCHCNT loop --- rpcs3/Emu/Cell/RawSPUThread.cpp | 9 +- rpcs3/Emu/Cell/SPUCommonRecompiler.cpp | 11 +- rpcs3/Emu/Cell/SPULLVMRecompiler.cpp | 65 +++++++++- rpcs3/Emu/Cell/SPURecompiler.h | 1 + rpcs3/Emu/Cell/SPUThread.cpp | 157 +++++++++++++++++++------ rpcs3/Emu/Cell/SPUThread.h | 110 ++++++++--------- rpcs3/Emu/Cell/lv2/sys_spu.cpp | 18 ++- 7 files changed, 271 insertions(+), 100 deletions(-) diff --git a/rpcs3/Emu/Cell/RawSPUThread.cpp b/rpcs3/Emu/Cell/RawSPUThread.cpp index ec571a1505..4d05fab50f 100644 --- a/rpcs3/Emu/Cell/RawSPUThread.cpp +++ b/rpcs3/Emu/Cell/RawSPUThread.cpp @@ -244,7 +244,14 @@ bool spu_thread::write_reg(const u32 addr, const u32 value) case SPU_In_MBox_offs: { - ch_in_mbox.push(value); + if (!ch_in_mbox.push(value).op_done) + { + if (auto cpu = cpu_thread::get_current()) + { + cpu->state += cpu_flag::again; + } + } + return true; } diff --git a/rpcs3/Emu/Cell/SPUCommonRecompiler.cpp b/rpcs3/Emu/Cell/SPUCommonRecompiler.cpp index 46c50be7cd..274aac74e3 100644 --- a/rpcs3/Emu/Cell/SPUCommonRecompiler.cpp +++ b/rpcs3/Emu/Cell/SPUCommonRecompiler.cpp @@ -6253,10 +6253,15 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s rchcnt_loop.ch_state = vregs[op.rt]; invalidate = false; } - else if (rchcnt_loop.active && it != rchcnt_loop_all.end()) + else if (rchcnt_loop.active) { // Success - it->second.active = false; + rchcnt_loop.active = false; + + if (it == rchcnt_loop_all.end()) + { + rchcnt_loop_all.emplace(pos, rchcnt_loop); + } } break; @@ -7167,7 +7172,7 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s if (inst_attr attr = m_inst_attrs[(read_pc - entry_point) / 4]; attr == inst_attr::none) { - //add_pattern(false, inst_attr::ch_lop, get_pc - result.entry_point); + add_pattern(false, inst_attr::rchcnt_loop, read_pc - result.entry_point); spu_log.error("Channel Loop Pattern Detected! Report to developers! (read_pc=0x%x, branch_pc=0x%x, branch_target=0x%x, 0x%x-%s)", read_pc, pattern.branch_pc, pattern.branch_target, entry_point, func_hash); } diff --git a/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp b/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp index 9772899d8a..a74b635e30 100644 --- a/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp +++ b/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp @@ -3487,9 +3487,28 @@ public: return m_ir->CreateTrunc(m_ir->CreateXor(shv, u64{inv}), get_type()); } + llvm::Value* wait_rchcnt(u32 off, u64 inv = 0) + { + auto wait_on_channel = [](spu_thread* _spu, spu_channel* ch, u32 is_read) -> u32 + { + if (is_read) + { + ch->pop_wait(*_spu, false); + } + else + { + ch->push_wait(*_spu, 0, false); + } + + return ch->get_count(); + }; + + return m_ir->CreateXor(call("wait_on_spu_channel", +wait_on_channel, m_thread, _ptr(m_thread, off), m_ir->getInt32(!inv)), m_ir->getInt32(inv)); + } + void RCHCNT(spu_opcode_t op) // { - value_t res; + value_t res{}; if (m_interp_magn) { @@ -3532,6 +3551,50 @@ public: } } + if (m_inst_attrs[(m_pos - m_base) / 4] == inst_attr::rchcnt_loop) + { + switch (op.ra) + { + case SPU_WrOutMbox: + { + res.value = wait_rchcnt(::offset32(&spu_thread::ch_out_mbox), true); + break; + } + case SPU_WrOutIntrMbox: + { + res.value = wait_rchcnt(::offset32(&spu_thread::ch_out_intr_mbox), true); + break; + } + case SPU_RdSigNotify1: + { + res.value = wait_rchcnt(::offset32(&spu_thread::ch_snr1)); + break; + } + case SPU_RdSigNotify2: + { + res.value = wait_rchcnt(::offset32(&spu_thread::ch_snr2)); + break; + } + case SPU_RdInMbox: + { + auto wait_inbox = [](spu_thread* _spu, spu_channel_4_t* ch) -> u32 + { + return ch->pop_wait(*_spu, false), ch->get_count(); + }; + + res.value = call("wait_spu_inbox", +wait_inbox, m_thread, spu_ptr(&spu_thread::ch_in_mbox)); + break; + } + default: break; + } + + if (res.value) + { + set_vr(op.rt, insert(splat(0), 3, res)); + return; + } + } + switch (op.ra) { case SPU_WrOutMbox: diff --git a/rpcs3/Emu/Cell/SPURecompiler.h b/rpcs3/Emu/Cell/SPURecompiler.h index e72d1f604c..968fab287c 100644 --- a/rpcs3/Emu/Cell/SPURecompiler.h +++ b/rpcs3/Emu/Cell/SPURecompiler.h @@ -390,6 +390,7 @@ protected: omit, putllc16, putllc0, + rchcnt_loop, }; std::vector m_inst_attrs; diff --git a/rpcs3/Emu/Cell/SPUThread.cpp b/rpcs3/Emu/Cell/SPUThread.cpp index aca27ca70a..47dae02454 100644 --- a/rpcs3/Emu/Cell/SPUThread.cpp +++ b/rpcs3/Emu/Cell/SPUThread.cpp @@ -2235,7 +2235,7 @@ void spu_thread::push_snr(u32 number, u32 value) const bool bitor_bit = !!((snr_config >> number) & 1); // Redundant, g_use_rtm is checked inside tx_start now. - if (g_use_rtm) + if (g_use_rtm && false) { bool channel_notify = false; bool thread_notify = false; @@ -2295,8 +2295,21 @@ void spu_thread::push_snr(u32 number, u32 value) }); // Check corresponding SNR register settings - if (channel->push(value, bitor_bit)) + auto push_state = channel->push(value, bitor_bit); + + if (push_state.old_count < push_state.count) + { set_events(event_bit); + } + else if (!push_state.op_done) + { + ensure(is_stopped()); + + if (auto cpu = cpu_thread::get_current()) + { + cpu->state += cpu_flag::again; + } + } ch_events.atomic_op([](ch_events_t& ev) { @@ -6846,8 +6859,13 @@ s64 spu_channel::pop_wait(cpu_thread& spu, bool pop) return false; } - data = bit_wait; - jostling_value.release(bit_wait); + data = (pop ? bit_occupy : 0) | bit_wait; + + if (pop) + { + jostling_value.release(bit_occupy); + } + return true; }).first; @@ -6862,29 +6880,39 @@ s64 spu_channel::pop_wait(cpu_thread& spu, bool pop) if (!(data & bit_wait)) { - return static_cast(jostling_value); + return static_cast(pop ? jostling_value.exchange(0) : +data); } } + const u32 wait_on_val = static_cast(((pop ? bit_occupy : 0) | bit_wait) >> 32); + while (true) { - thread_ctrl::wait_on(utils::bless>(&data)[1], u32{bit_wait >> 32}); + thread_ctrl::wait_on(utils::bless>(&data)[1], wait_on_val); old = data; if (!(old & bit_wait)) { - return static_cast(jostling_value); + return static_cast(pop ? jostling_value.exchange(0) : +data); } if (spu.is_stopped()) { // Abort waiting and test if a value has been received - if (u64 v = jostling_value.exchange(0); !(v & bit_wait)) + if (pop) { - return static_cast(v); + if (u64 v = jostling_value.exchange(0); !(v & bit_occupy)) + { + return static_cast(v); + } + + ensure(data.fetch_and(~(bit_wait | bit_occupy)) & bit_wait); + } + else + { + data.bit_test_reset(off_wait); } - ensure(data.bit_test_reset(off_wait)); return -1; } } @@ -6898,8 +6926,8 @@ bool spu_channel::push_wait(cpu_thread& spu, u32 value, bool push) { if (data & bit_count) [[unlikely]] { - jostling_value.release(push ? value : static_cast(data)); - data |= bit_wait; + jostling_value.release(push ? (bit_occupy | value) : static_cast(data)); + data |= (push ? bit_occupy : 0) | bit_wait; } else if (push) { @@ -6919,11 +6947,6 @@ bool spu_channel::push_wait(cpu_thread& spu, u32 value, bool push) { if (!(state & bit_wait)) { - if (!push) - { - data &= ~bit_count; - } - return true; } @@ -6935,18 +6958,12 @@ bool spu_channel::push_wait(cpu_thread& spu, u32 value, bool push) { if (!(state & bit_wait)) { - if (!push) - { - data &= ~bit_count; - } - return true; } if (spu.is_stopped()) { - data &= ~bit_wait; - return false; + return !data.bit_test_reset(off_wait); } thread_ctrl::wait_on(utils::bless>(&data)[1], u32(state >> 32)); @@ -6954,12 +6971,17 @@ bool spu_channel::push_wait(cpu_thread& spu, u32 value, bool push) } } -std::pair spu_channel_4_t::pop_wait(cpu_thread& spu) +std::pair spu_channel_4_t::pop_wait(cpu_thread& spu, bool pop_value) { auto old = values.fetch_op([&](sync_var_t& data) { if (data.count != 0) { + if (!pop_value) + { + return; + } + data.waiting = 0; data.count--; @@ -6969,8 +6991,8 @@ std::pair spu_channel_4_t::pop_wait(cpu_thread& spu) } else { - data.waiting = 1; - jostling_value.release(bit_wait); + data.waiting = (pop_value ? bit_occupy : 0) | bit_wait; + jostling_value.release(pop_value ? jostling_flag : 0); } }); @@ -6979,7 +7001,7 @@ std::pair spu_channel_4_t::pop_wait(cpu_thread& spu) return {old.count, old.value0}; } - old.waiting = 1; + old.waiting = (pop_value ? bit_occupy : 0) | bit_wait; for (int i = 0; i < 10; i++) { @@ -6987,7 +7009,7 @@ std::pair spu_channel_4_t::pop_wait(cpu_thread& spu) if (!atomic_storage::load(values.raw().waiting)) { - return {1, static_cast(jostling_value)}; + return {1, static_cast(pop_value ? jostling_value.exchange(0) : 0)}; } } @@ -6996,26 +7018,91 @@ std::pair spu_channel_4_t::pop_wait(cpu_thread& spu) thread_ctrl::wait_on(utils::bless>(&values)[0], u32(u64(std::bit_cast(old)))); old = values; - if (!old.waiting) + if (~old.waiting & bit_wait) { // Count of 1 because a value has been inserted and popped in the same step. - return {1, static_cast(jostling_value)}; + return {1, static_cast(pop_value ? jostling_value.exchange(0) : 0)}; } if (spu.is_stopped()) { - // Abort waiting and test if a value has been received - if (u64 v = jostling_value.exchange(0); !(v & bit_wait)) + if (pop_value) { - return {1, static_cast(v)}; + // Abort waiting and test if a value has been received + if (u64 v = jostling_value.exchange(0); !(v & jostling_flag)) + { + return {1, static_cast(v)}; + } + } + + if (~atomic_storage::exchange(values.raw().waiting, 0) & bit_wait) + { + // Count of 1 because a value has been inserted and popped in the same step. + return {1, static_cast(pop_value ? jostling_value.exchange(0) : 0)}; } - ensure(atomic_storage::exchange(values.raw().waiting, 0)); return {}; } } } +spu_channel_op_state spu_channel_4_t::push(u32 value, bool postpone_notify) +{ + while (true) + { + value3.release(value); + const auto [old, pushed_to_data] = values.fetch_op([&](sync_var_t& data) + { + if (data.waiting & bit_occupy) + { + return false; + } + + switch (data.count++) + { + case 0: data.value0 = value; break; + case 1: data.value1 = value; break; + case 2: data.value2 = value; break; + default: + { + data.count = 4; + data.value3_inval++; // Ensure the SPU reads the most recent value3 write in try_pop by re-loading + break; + } + } + + return true; + }); + + if (!pushed_to_data) + { + // Insert the pending value in special storage for waiting SPUs, leave no time in which the channel has data + if (!jostling_value.compare_and_swap_test(jostling_flag, value)) + { + // Other thread has inserted a value through jostling_value, retry + continue; + } + } + + if (old.waiting & bit_wait) + { + // Turn off waiting bit manually (must succeed because waiting bit can only be resetted by the thread pushing to jostling_value) + if (~atomic_storage::exchange(values.raw().waiting, 0) & bit_wait) + { + // Could be fatal or at emulation stopping, to be checked by the caller + return { old.count, old.count, false, false }; + } + + if (!postpone_notify) + { + utils::bless>(&values)[0].notify_one(); + } + } + + return { old.count, std::min(static_cast(old.count + 1), 4), !!(old.waiting & bit_wait), true }; + } +} + template <> void fmt_class_string::format(std::string& out, u64 arg) { diff --git a/rpcs3/Emu/Cell/SPUThread.h b/rpcs3/Emu/Cell/SPUThread.h index e1163a859d..ea7066a78e 100644 --- a/rpcs3/Emu/Cell/SPUThread.h +++ b/rpcs3/Emu/Cell/SPUThread.h @@ -176,6 +176,14 @@ enum : u32 SPU_FAKE_BASE_ADDR = 0xE8000000, }; +struct spu_channel_op_state +{ + u8 old_count; + u8 count; + bool notify; + bool op_done; +}; + struct alignas(16) spu_channel { // Low 32 bits contain value @@ -186,8 +194,10 @@ struct alignas(16) spu_channel public: static constexpr u32 off_wait = 32; + static constexpr u32 off_occupy = 32; static constexpr u32 off_count = 63; static constexpr u64 bit_wait = 1ull << off_wait; + static constexpr u64 bit_occupy = 1ull << off_occupy; static constexpr u64 bit_count = 1ull << off_count; // Returns true on success @@ -207,20 +217,21 @@ public: // Push unconditionally, may require notification // Performing bitwise OR with previous value if specified, otherwise overwiting it - bool push(u32 value, bool to_or = false) + // Returns old count and new count + spu_channel_op_state push(u32 value, bool to_or = false, bool postpone_notify = false) { while (true) { const auto [old, pushed_to_data] = data.fetch_op([&](u64& data) { - if (data == bit_wait) + if (data & bit_occupy) { return false; } if (to_or) { - data |= bit_count | value; + data = bit_count | (static_cast(data) | value); } else { @@ -233,26 +244,42 @@ public: if (!pushed_to_data) { // Insert the pending value in special storage for waiting SPUs, leave no time in which the channel has data - if (!jostling_value.compare_and_swap_test(bit_wait, value)) + if (!jostling_value.compare_and_swap_test(bit_occupy, value)) { // Other thread has inserted a value through jostling_value, retry continue; } + } + if (old & bit_wait) + { // Turn off waiting bit manually (must succeed because waiting bit can only be resetted by the thread pushed to jostling_value) - ensure(this->data.bit_test_reset(off_wait)); - utils::bless>(&data)[1].notify_one(); + if (!this->data.bit_test_reset(off_wait)) + { + // Could be fatal or at emulation stopping, to be checked by the caller + return { (old & bit_count) == 0, 0, false, false }; + } + + if (!postpone_notify) + { + utils::bless>(&data)[1].notify_one(); + } } // Return true if count has changed from 0 to 1, this condition is considered satisfied even if we pushed a value directly to the special storage for waiting SPUs - return !pushed_to_data || (old & bit_count) == 0; + return { (old & bit_count) == 0, 1, (old & bit_wait) != 0, true }; } } + void notify() + { + utils::bless>(&data)[1].notify_one(); + } + // Returns true on success bool try_pop(u32& out) { - return data.fetch_op([&](u64& data) + return data.fetch_op([&out](u64& data) { if (data & bit_count) [[likely]] { @@ -284,7 +311,7 @@ public: u32 pop() { // Value is not cleared and may be read again - constexpr u64 mask = bit_count | bit_wait; + constexpr u64 mask = bit_count | bit_occupy; const u64 old = data.fetch_op([&](u64& data) { @@ -295,10 +322,10 @@ public: return; } - data &= ~mask; + data &= ~(mask | bit_wait); }); - if ((old & mask) == mask) + if (old & bit_wait) { utils::bless>(&data)[1].notify_one(); } @@ -324,7 +351,7 @@ public: u32 get_count() const { - return static_cast(data >> off_count); + return (data & bit_count) ? 1 : 0; } }; @@ -344,59 +371,26 @@ struct spu_channel_4_t atomic_t jostling_value; atomic_t value3; - static constexpr u32 off_wait = 32; + static constexpr u32 off_wait = 0; + static constexpr u32 off_occupy = 7; static constexpr u64 bit_wait = 1ull << off_wait; + static constexpr u64 bit_occupy = 1ull << off_occupy; + static constexpr u64 jostling_flag = 1ull << 63; void clear() { values.release({}); + jostling_value.release(0); + value3.release(0); } // push unconditionally (overwriting latest value), returns true if needs signaling - void push(u32 value) + // returning if could be aborted (operation failed unexpectedly) + spu_channel_op_state push(u32 value, bool postpone_notify = false); + + void notify() { - while (true) - { - value3.release(value); - const auto [old, pushed_to_data] = values.fetch_op([&](sync_var_t& data) - { - if (data.waiting) - { - return false; - } - - switch (data.count++) - { - case 0: data.value0 = value; break; - case 1: data.value1 = value; break; - case 2: data.value2 = value; break; - default: - { - data.count = 4; - data.value3_inval++; // Ensure the SPU reads the most recent value3 write in try_pop by re-loading - break; - } - } - - return true; - }); - - if (!pushed_to_data) - { - // Insert the pending value in special storage for waiting SPUs, leave no time in which the channel has data - if (!jostling_value.compare_and_swap_test(bit_wait, value)) - { - // Other thread has inserted a value through jostling_value, retry - continue; - } - - // Turn off waiting bit manually (must succeed because waiting bit can only be resetted by the thread pushing to jostling_value) - ensure(atomic_storage::exchange(values.raw().waiting, 0)); - utils::bless>(&values)[0].notify_one(); - } - - return; - } + utils::bless>(&values)[0].notify_one(); } // returns non-zero value on success: queue size before removal @@ -422,7 +416,7 @@ struct spu_channel_4_t } // Returns [previous count, value] (if aborted 0 count is returned) - std::pair pop_wait(cpu_thread& spu); + std::pair pop_wait(cpu_thread& spu, bool pop_value = true); // returns current queue size without modification uint try_read(u32 (&out)[4]) const @@ -443,7 +437,7 @@ struct spu_channel_4_t u32 get_count() const { - return std::as_const(values).raw().count; + return atomic_storage::load(std::as_const(values).raw().count); } void set_values(u32 count, u32 value0, u32 value1 = 0, u32 value2 = 0, u32 value3 = 0) diff --git a/rpcs3/Emu/Cell/lv2/sys_spu.cpp b/rpcs3/Emu/Cell/lv2/sys_spu.cpp index 79de82f106..b8f71d85cd 100644 --- a/rpcs3/Emu/Cell/lv2/sys_spu.cpp +++ b/rpcs3/Emu/Cell/lv2/sys_spu.cpp @@ -1749,9 +1749,23 @@ error_code sys_spu_thread_write_spu_mb(ppu_thread& ppu, u32 id, u32 value) return CELL_ESRCH; } - std::lock_guard lock(group->mutex); + spu_channel_op_state state{}; + { + std::lock_guard lock(group->mutex); - thread->ch_in_mbox.push(value); + state = thread->ch_in_mbox.push(value, true); + } + + if (!state.op_done) + { + ppu.state += cpu_flag::again; + return {}; + } + + if (state.notify) + { + thread->ch_in_mbox.notify(); + } return CELL_OK; }