From 9bb7e8d614398c7a5db2a6fb6497a9807c8c4958 Mon Sep 17 00:00:00 2001 From: Elad Ashkenazi Date: Sat, 4 Jun 2022 15:35:06 +0300 Subject: [PATCH] rsx: Implement atomic FIFO fetching (stability improvement) (non-default setting) (#12107) --- Utilities/Config.h | 2 +- rpcs3/Emu/Cell/SPURecompiler.cpp | 2 +- rpcs3/Emu/Cell/SPUThread.cpp | 2 + rpcs3/Emu/Cell/lv2/sys_rsx.cpp | 1 + rpcs3/Emu/RSX/RSXFIFO.cpp | 281 +++++++++++++++++++++++++----- rpcs3/Emu/RSX/RSXFIFO.h | 12 +- rpcs3/Emu/RSX/RSXThread.cpp | 16 +- rpcs3/Emu/RSX/RSXThread.h | 66 +++---- rpcs3/Emu/RSX/rsx_methods.cpp | 113 +++++++++--- rpcs3/Emu/system_config.h | 13 ++ rpcs3/Emu/system_config_types.cpp | 17 ++ rpcs3/Emu/system_config_types.h | 8 + rpcs3/rpcs3qt/emu_settings.cpp | 9 + rpcs3/rpcs3qt/emu_settings_type.h | 2 + rpcs3/rpcs3qt/settings_dialog.cpp | 6 + rpcs3/rpcs3qt/settings_dialog.ui | 104 ++++++----- rpcs3/rpcs3qt/tooltips.h | 1 + 17 files changed, 503 insertions(+), 152 deletions(-) diff --git a/Utilities/Config.h b/Utilities/Config.h index c8cc8b1a75..4fbe3f9524 100644 --- a/Utilities/Config.h +++ b/Utilities/Config.h @@ -180,7 +180,7 @@ namespace cfg // Value node with fixed set of possible values, each maps to an enum value of type T. template - class _enum final : public _base + class _enum : public _base { atomic_t m_value; diff --git a/rpcs3/Emu/Cell/SPURecompiler.cpp b/rpcs3/Emu/Cell/SPURecompiler.cpp index 912b62b4b6..9beda9cd2a 100644 --- a/rpcs3/Emu/Cell/SPURecompiler.cpp +++ b/rpcs3/Emu/Cell/SPURecompiler.cpp @@ -6024,7 +6024,7 @@ public: break; } - if (u64 cmdh = ci->getZExtValue() & ~(MFC_BARRIER_MASK | MFC_FENCE_MASK | MFC_RESULT_MASK); !g_use_rtm) + if (u64 cmdh = ci->getZExtValue() & ~(MFC_BARRIER_MASK | MFC_FENCE_MASK | MFC_RESULT_MASK); g_cfg.core.rsx_fifo_accuracy || !g_use_rtm) { // TODO: don't require TSX (current implementation is TSX-only) if (cmdh == MFC_PUT_CMD || cmdh == MFC_SNDSIG_CMD) diff --git a/rpcs3/Emu/Cell/SPUThread.cpp b/rpcs3/Emu/Cell/SPUThread.cpp index 1a9a7fd797..e85719d969 100644 --- a/rpcs3/Emu/Cell/SPUThread.cpp +++ b/rpcs3/Emu/Cell/SPUThread.cpp @@ -1875,6 +1875,8 @@ void spu_thread::do_dma_transfer(spu_thread* _this, const spu_mfc_cmd& args, u8* src = zero_buf; } + rsx::reservation_lock rsx_lock(eal, args.size, !is_get && g_cfg.core.rsx_fifo_accuracy && !g_cfg.core.spu_accurate_dma); + if ((!g_use_rtm && !is_get) || g_cfg.core.spu_accurate_dma) [[unlikely]] { perf_meter<"ADMA_GET"_u64> perf_get = perf_; diff --git a/rpcs3/Emu/Cell/lv2/sys_rsx.cpp b/rpcs3/Emu/Cell/lv2/sys_rsx.cpp index 0c40ed1418..cb3e7226ce 100644 --- a/rpcs3/Emu/Cell/lv2/sys_rsx.cpp +++ b/rpcs3/Emu/Cell/lv2/sys_rsx.cpp @@ -432,6 +432,7 @@ error_code sys_rsx_context_attribute(u32 context_id, u32 package_id, u64 a3, u64 const u64 get = static_cast(a3); const u64 put = static_cast(a4); vm::_ref>(render->dma_address + ::offset32(&RsxDmaControl::put)).release(put << 32 | get); + render->fifo_ctrl->set_get(static_cast(get)); render->last_known_code_start = get; render->sync_point_request.release(true); break; diff --git a/rpcs3/Emu/RSX/RSXFIFO.cpp b/rpcs3/Emu/RSX/RSXFIFO.cpp index 150eaffc75..a8f5c130c6 100644 --- a/rpcs3/Emu/RSX/RSXFIFO.cpp +++ b/rpcs3/Emu/RSX/RSXFIFO.cpp @@ -4,10 +4,17 @@ #include "RSXThread.h" #include "Capture/rsx_capture.h" #include "Common/time.hpp" +#include "Emu/Memory/vm_reservation.h" #include "Emu/Cell/lv2/sys_rsx.h" +#include "util/asm.hpp" #include +using spu_rdata_t = std::byte[128]; + +extern void mov_rdata(spu_rdata_t& _dst, const spu_rdata_t& _src); +extern bool cmp_rdata(const spu_rdata_t& _lhs, const spu_rdata_t& _rhs); + namespace rsx { namespace FIFO @@ -32,10 +39,11 @@ namespace rsx // NOTE: Only supposed to be invoked to wait for a single arg on command[0] (4 bytes) // Wait for put to allow us to procceed execution sync_get(); + invalidate_cache(); while (read_put() == m_internal_get && !Emu.IsStopped()) { - std::this_thread::yield(); + get_current_renderer()->cpu_wait({}); } } } @@ -58,16 +66,120 @@ namespace rsx } } - void FIFO_control::set_get(u32 get, bool check_spin) + std::pair FIFO_control::fetch_u32(u32 addr) { - if (check_spin && m_ctrl->get == get) + if (addr - m_cache_addr >= m_cache_size) { - if (const u32 addr = m_iotable->get_addr(m_memwatch_addr); addr + 1) + const u32 put = read_put(); + + if (put == addr) { - m_memwatch_addr = get; - m_memwatch_cmp = vm::read32(addr); + return {false, FIFO_EMPTY}; } + m_cache_addr = addr & -128; + + const u32 addr1 = m_iotable->get_addr(m_cache_addr); + + if (addr1 == umax) + { + m_cache_size = 0; + return {false, FIFO_ERROR}; + } + + m_cache_size = std::min((put | 0x7f) - m_cache_addr, u32{sizeof(m_cache)} - 1) + 1; + + if (0x100000 - (m_cache_addr & 0xfffff) < m_cache_size) + { + // Check if memory layout changes in the next 1MB page boundary + if ((addr1 >> 20) + 1 != (m_iotable->get_addr(m_cache_addr + 0x100000) >> 20)) + { + // Trim cache as needed if memory layout changes + m_cache_size = 0x100000 - (m_cache_addr & 0xfffff); + } + } + + // Make mask of cache lines to fetch + u8 to_fetch = static_cast((1u << (m_cache_size / 128)) - 1); + + if (addr < put && put < m_cache_addr + m_cache_size) + { + // Adjust to knownly-prepared FIFO buffer bounds + m_cache_size = put - m_cache_addr; + } + + rsx::reservation_lock rsx_lock(addr1, m_cache_size, true); + + const auto src = vm::_ptr(addr1); + + // Find the next set bit after every iteration + for (u32 i = 0, start_time = 0;; i = (std::countr_zero(utils::rol8(to_fetch, 0 - i - 1)) + i + 1) % 8) + { + // If a reservation is being updated, try to load another + const auto& res = vm::reservation_acquire(addr1 + i * 128); + const u64 time0 = res; + + if (!(time0 & 127)) + { + mov_rdata(m_cache[i], src[i]); + + if (time0 == res && cmp_rdata(m_cache[i], src[i])) + { + // The fetch of the cache line content has been successful, unset its bit + to_fetch &= ~(1u << i); + + if (!to_fetch) + { + break; + } + + continue; + } + } + + if (!start_time) + { + start_time = rsx::uclock(); + } + + if (rsx::uclock() - start_time >= 50u) + { + const auto rsx = get_current_renderer(); + + if (rsx->is_stopped()) + { + return {}; + } + + rsx->cpu_wait({}); + + // Add idle time in reverse: after exchnage start_time becomes uclock(), use substruction because of the reversed order of parameters + const u64 _start = std::exchange(start_time, rsx::uclock()); + rsx->performance_counters.idle_time -= _start - start_time; + } + + busy_wait(200); + + if (g_cfg.core.rsx_fifo_accuracy >= rsx_fifo_mode::atomic_ordered) + { + i = (i - 1) % 8; + } + } + } + + be_t ret; + std::memcpy(&ret, reinterpret_cast(&m_cache) + (addr - m_cache_addr), sizeof(u32)); + return {true, ret}; + } + + void FIFO_control::set_get(u32 get, u32 spin_cmd) + { + invalidate_cache(); + + if (spin_cmd && m_ctrl->get == get) + { + m_memwatch_addr = get; + m_memwatch_cmp = spin_cmd; return; } @@ -76,21 +188,64 @@ namespace rsx m_remaining_commands = 0; } + std::span FIFO_control::get_current_arg_ptr() const + { + if (g_cfg.core.rsx_fifo_accuracy) + { + // Return a pointer to the cache storage with confined access + return {reinterpret_cast(&m_cache) + (m_internal_get - m_cache_addr) / 4, (m_cache_size - (m_internal_get - m_cache_addr)) / 4}; + } + else + { + // Return a raw pointer with no limited access + return {static_cast(vm::base(m_iotable->get_addr(m_internal_get))), 0x10000}; + } + } + bool FIFO_control::read_unsafe(register_pair& data) { // Fast read with no processing, only safe inside a PACKET_BEGIN+count block - if (m_remaining_commands && - m_internal_get != read_put()) + if (m_remaining_commands) { - m_command_reg += m_command_inc; - m_args_ptr += 4; - m_remaining_commands--; + bool ok{}; + u32 arg = 0; + + if (g_cfg.core.rsx_fifo_accuracy) + { + std::tie(ok, arg) = fetch_u32(m_internal_get + 4); + + if (!ok) + { + if (arg == FIFO_ERROR) + { + get_current_renderer()->recover_fifo(); + } + + return false; + } + } + else + { + if (m_internal_get + 4 == read_put()) + { + return false; + } + + m_args_ptr += 4; + arg = vm::read32(m_args_ptr); + } + m_internal_get += 4; - data.set(m_command_reg, vm::read32(m_args_ptr)); + m_command_reg += m_command_inc; + + --m_remaining_commands; + + data.set(m_command_reg, arg); return true; } + m_internal_get += 4; return false; } @@ -101,10 +256,8 @@ namespace rsx if (m_remaining_commands > count) { m_command_reg += m_command_inc * count; - m_args_ptr += 4 * count; m_remaining_commands -= count; m_internal_get += 4 * count; - return true; } @@ -120,19 +273,10 @@ namespace rsx void FIFO_control::read(register_pair& data) { - const u32 put = read_put(); - m_internal_get = m_ctrl->get; - - if (put == m_internal_get) - { - // Nothing to do - data.reg = FIFO_EMPTY; - return; - } - - if (m_remaining_commands && read_unsafe(data)) + if (m_remaining_commands) { // Previous block aborted to wait for PUT pointer + read_unsafe(data); return; } @@ -155,15 +299,38 @@ namespace rsx m_memwatch_cmp = 0; } - if (const u32 addr = m_iotable->get_addr(m_internal_get); addr + 1) + if (!g_cfg.core.rsx_fifo_accuracy) { - m_cmd = vm::read32(addr); + const u32 put = read_put(); + + if (put == m_internal_get) + { + // Nothing to do + data.reg = FIFO_EMPTY; + return; + } + + if (const u32 addr = m_iotable->get_addr(m_internal_get); addr + 1) + { + m_cmd = vm::read32(addr); + } + else + { + data.reg = FIFO_ERROR; + return; + } } else { - // TODO: Optional recovery - data.reg = FIFO_ERROR; - return; + if (auto [ok, arg] = fetch_u32(m_internal_get); ok) + { + m_cmd = arg; + } + else + { + data.reg = arg; + return; + } } if (m_cmd & RSX_METHOD_NON_METHOD_CMD_MASK) [[unlikely]] @@ -188,20 +355,11 @@ namespace rsx if (!count) { - m_ctrl->get.release(m_internal_get + 4); + m_ctrl->get.release(m_internal_get += 4); data.reg = FIFO_NOP; return; } - // Validate the args ptr if the command attempts to read from it - m_args_ptr = m_iotable->get_addr(m_internal_get + 4); - if (m_args_ptr == umax) [[unlikely]] - { - // Optional recovery - data.reg = FIFO_ERROR; - return; - } - if (count > 1) { // Set up readback parameters @@ -210,8 +368,43 @@ namespace rsx m_remaining_commands = count - 1; } + if (g_cfg.core.rsx_fifo_accuracy) + { + m_internal_get += 4; + + auto [ok, arg] = fetch_u32(m_internal_get); + + if (!ok) + { + // Optional recovery + if (arg == FIFO_ERROR) + { + data.reg = FIFO_ERROR; + } + else + { + data.reg = FIFO_EMPTY; + m_command_reg = m_cmd & 0xfffc; + m_remaining_commands++; + } + + return; + } + + data.set(m_cmd & 0xfffc, arg); + return; + } + inc_get(true); // Wait for data block to become available - m_internal_get += 4; + + // Validate the args ptr if the command attempts to read from it + m_args_ptr = m_iotable->get_addr(m_internal_get); + if (m_args_ptr == umax) [[unlikely]] + { + // Optional recovery + data.reg = FIFO_ERROR; + return; + } data.set(m_cmd & 0xfffc, vm::read32(m_args_ptr)); } @@ -452,7 +645,7 @@ namespace rsx } //rsx_log.warning("rsx jump(0x%x) #addr=0x%x, cmd=0x%x, get=0x%x, put=0x%x", offs, m_ioAddress + get, cmd, get, put); - fifo_ctrl->set_get(offs); + fifo_ctrl->set_get(offs, cmd); return; } if ((cmd & RSX_METHOD_CALL_CMD_MASK) == RSX_METHOD_CALL_CMD) @@ -552,9 +745,9 @@ namespace rsx commands.back().rsx_command.first = (fifo_ctrl->last_cmd() & RSX_METHOD_NON_INCREMENT_CMD_MASK) | (reg << 2) | (remaining << 18); - for (u32 i = 1; i < remaining && fifo_ctrl->get_pos() + (i - 1) * 4 != (ctrl->put & ~3); i++) + for (u32 i = 1; i < remaining && fifo_ctrl->get_pos() + i * 4 != (ctrl->put & ~3); i++) { - replay_cmd.rsx_command = std::make_pair(0, vm::read32(fifo_ctrl->get_current_arg_ptr() + (i * 4))); + replay_cmd.rsx_command = std::make_pair(0, vm::read32(iomap_table.get_addr(fifo_ctrl->get_pos()) + (i * 4))); commands.push_back(replay_cmd); } diff --git a/rpcs3/Emu/RSX/RSXFIFO.h b/rpcs3/Emu/RSX/RSXFIFO.h index 2e35fe450d..4f4588fefb 100644 --- a/rpcs3/Emu/RSX/RSXFIFO.h +++ b/rpcs3/Emu/RSX/RSXFIFO.h @@ -3,6 +3,8 @@ #include "util/types.hpp" #include "Emu/RSX/gcm_enums.h" +#include + struct RsxDmaControl; namespace rsx @@ -124,18 +126,24 @@ namespace rsx u32 m_args_ptr = 0; u32 m_cmd = ~0u; + u32 m_cache_addr = 0; + u32 m_cache_size = 0; + alignas(64) std::byte m_cache[8][128]; public: FIFO_control(rsx::thread* pctrl); ~FIFO_control() = default; + std::pair fetch_u32(u32 addr); + void invalidate_cache() { m_cache_size = 0; } + u32 get_pos() const { return m_internal_get; } u32 last_cmd() const { return m_cmd; } void sync_get() const; - u32 get_current_arg_ptr() const { return m_args_ptr; } + std::span get_current_arg_ptr() const; u32 get_remaining_args_count() const { return m_remaining_commands; } void inc_get(bool wait); - void set_get(u32 get, bool check_spin = false); + void set_get(u32 get, u32 spin_cmd = 0); void abort(); template diff --git a/rpcs3/Emu/RSX/RSXThread.cpp b/rpcs3/Emu/RSX/RSXThread.cpp index f4f965d6e1..b8ee7af39d 100644 --- a/rpcs3/Emu/RSX/RSXThread.cpp +++ b/rpcs3/Emu/RSX/RSXThread.cpp @@ -77,6 +77,12 @@ namespace rsx { std::function g_access_violation_handler; + rsx_iomap_table::rsx_iomap_table() noexcept + : ea(fill_array(-1)) + , io(fill_array(-1)) + { + } + u32 get_address(u32 offset, u32 location, u32 size_to_check, u32 line, u32 col, const char* file, const char* func) { const auto render = get_current_renderer(); @@ -2649,6 +2655,7 @@ namespace rsx { // Make sure GET value is exposed before sync points fifo_ctrl->sync_get(); + fifo_ctrl->invalidate_cache(); } std::pair thread::try_get_pc_of_x_cmds_backwards(u32 count, u32 get) const @@ -2710,6 +2717,8 @@ namespace rsx void thread::recover_fifo(u32 line, u32 col, const char* file, const char* func) { + bool kill_itself = g_cfg.core.rsx_fifo_accuracy == rsx_fifo_mode::as_ps3; + const u64 current_time = rsx::uclock(); if (recovered_fifo_cmds_history.size() == 20u) @@ -2721,13 +2730,18 @@ namespace rsx if (current_time - cmd_info.timestamp < 2'000'000u - std::min(g_cfg.video.driver_wakeup_delay * 700, 1'400'000)) { // Probably hopeless - fmt::throw_exception("Dead FIFO commands queue state has been detected!\nTry increasing \"Driver Wake-Up Delay\" setting in Advanced settings. Called from %s", src_loc{line, col, file, func}); + kill_itself = true; } // Erase the last command from history, keep the size of the queue the same recovered_fifo_cmds_history.pop(); } + if (kill_itself) + { + fmt::throw_exception("Dead FIFO commands queue state has been detected!\nTry increasing \"Driver Wake-Up Delay\" setting in Advanced settings. Called from %s", src_loc{line, col, file, func}); + } + // Error. Should reset the queue fifo_ctrl->set_get(restore_point); fifo_ret_addr = saved_fifo_ret; diff --git a/rpcs3/Emu/RSX/RSXThread.h b/rpcs3/Emu/RSX/RSXThread.h index faa7cb56b2..504e222828 100644 --- a/rpcs3/Emu/RSX/RSXThread.h +++ b/rpcs3/Emu/RSX/RSXThread.h @@ -44,13 +44,9 @@ namespace rsx { std::array, 4096> ea; std::array, 4096> io; - std::array rs{}; + std::array rs; - rsx_iomap_table() noexcept - : ea(fill_array(-1)) - , io(fill_array(-1)) - { - } + rsx_iomap_table() noexcept; // Try to get the real address given a mapped address // Returns -1 on failure @@ -59,39 +55,32 @@ namespace rsx return this->ea[offs >> 20] | (offs & 0xFFFFF); } - template + template bool lock(u32 addr, u32 len, cpu_thread* self = nullptr) noexcept { if (len <= 1) return false; const u32 end = addr + len - 1; - for (u32 block = (addr >> 20); block <= (end >> 20); ++block) + bool added_wait = false; + + for (u32 block = addr / 8192; block <= (end / 8192); block += Stride) { auto& mutex_ = rs[block]; - if constexpr (IsFullLock) + if (IsFullLock ? !mutex_.try_lock() : !mutex_.try_lock_shared()) [[ unlikely ]] { - if (self) [[ likely ]] + if (self) { - while (!mutex_.try_lock()) - { - self->cpu_wait({}); - } + added_wait |= !self->state.test_and_set(cpu_flag::wait); + } + + if (!self || self->id_type() != 0x55u) + { + IsFullLock ? mutex_.lock() : mutex_.lock_shared(); } else { - mutex_.lock(); - } - } - else - { - if (!self) [[ likely ]] - { - mutex_.lock_shared(); - } - else - { - while (!mutex_.try_lock_shared()) + while (IsFullLock ? !mutex_.try_lock() : !mutex_.try_lock_shared()) { self->cpu_wait({}); } @@ -99,16 +88,21 @@ namespace rsx } } + if (added_wait) + { + self->check_state(); + } + return true; } - template + template void unlock(u32 addr, u32 len) noexcept { ensure(len >= 1); const u32 end = addr + len - 1; - for (u32 block = (addr >> 20); block <= (end >> 20); ++block) + for (u32 block = (addr / 8192); block <= (end / 8192); block += Stride) { if constexpr (IsFullLock) { @@ -847,7 +841,7 @@ namespace rsx return g_fxo->try_get(); } - template + template class reservation_lock { u32 addr = 0, length = 0; @@ -858,9 +852,7 @@ namespace rsx this->addr = addr; this->length = length; - auto renderer = get_current_renderer(); - cpu_thread* lock_owner = renderer->is_current_thread() ? renderer : nullptr; - this->locked = renderer->iomap_table.lock(addr, length, lock_owner); + this->locked = get_current_renderer()->iomap_table.lock(addr, length, get_current_cpu_thread()); } public: @@ -873,6 +865,14 @@ namespace rsx } } + reservation_lock(u32 addr, u32 length, bool setting) + { + if (setting && addr < constants::local_mem_base) + { + lock_range(addr, length); + } + } + // Multi-range lock. If ranges overlap, the combined range will be acquired. // If ranges do not overlap, the first range that is in main memory will be acquired. reservation_lock(u32 dst_addr, u32 dst_length, u32 src_addr, u32 src_length) @@ -904,7 +904,7 @@ namespace rsx { if (locked) { - get_current_renderer()->iomap_table.unlock(addr, length); + get_current_renderer()->iomap_table.unlock(addr, length); } } }; diff --git a/rpcs3/Emu/RSX/rsx_methods.cpp b/rpcs3/Emu/RSX/rsx_methods.cpp index 2806203374..367bea7040 100644 --- a/rpcs3/Emu/RSX/rsx_methods.cpp +++ b/rpcs3/Emu/RSX/rsx_methods.cpp @@ -21,7 +21,11 @@ namespace rsx const u32 cmd = rsx->get_fifo_cmd(); rsx_log.error("Invalid RSX method 0x%x (arg=0x%x, start=0x%x, count=0x%x, non-inc=%s)", reg << 2, arg, cmd & 0xfffc, (cmd >> 18) & 0x7ff, !!(cmd & RSX_METHOD_NON_INCREMENT_CMD)); - rsx->recover_fifo(); + + if (g_cfg.core.rsx_fifo_accuracy != rsx_fifo_mode::as_ps3) + { + rsx->recover_fifo(); + } } static void trace_method(thread* /*rsx*/, u32 reg, u32 arg) @@ -181,7 +185,9 @@ namespace rsx // TODO: Check if possible to write on reservations if (rsx->label_addr >> 28 != addr >> 28) { - rsx_log.fatal("NV406E semaphore unexpected address. Please report to the developers. (offset=0x%x, addr=0x%x)", offset, addr); + rsx_log.error("NV406E semaphore unexpected address. Please report to the developers. (offset=0x%x, addr=0x%x)", offset, addr); + rsx->recover_fifo(); + return; } write_gcm_label(rsx, addr, arg); @@ -260,13 +266,20 @@ namespace rsx return; } + const u32 addr = get_address(offset, method_registers.semaphore_context_dma_4097()); + + if (rsx->label_addr >> 28 != addr >> 28) + { + rsx_log.error("NV4097 semaphore unexpected address. Please report to the developers. (offset=0x%x, addr=0x%x)", offset, addr); + } + if (g_cfg.video.strict_rendering_mode) [[ unlikely ]] { - write_gcm_label(rsx, get_address(offset, method_registers.semaphore_context_dma_4097()), arg); + write_gcm_label(rsx, addr, arg); } else { - write_gcm_label(rsx, get_address(offset, method_registers.semaphore_context_dma_4097()), arg); + write_gcm_label(rsx, addr, arg); } } @@ -283,8 +296,15 @@ namespace rsx return; } + const u32 addr = get_address(offset, method_registers.semaphore_context_dma_4097()); + + if (rsx->label_addr >> 28 != addr >> 28) + { + rsx_log.error("NV4097 semaphore unexpected address. Please report to the developers. (offset=0x%x, addr=0x%x)", offset, addr); + } + const u32 val = (arg & 0xff00ff00) | ((arg & 0xff) << 16) | ((arg >> 16) & 0xff); - write_gcm_label(rsx, get_address(offset, method_registers.semaphore_context_dma_4097()), val); + write_gcm_label(rsx, addr, val); } /** @@ -456,9 +476,16 @@ namespace rsx const u32 reg = index / 4; const u8 subreg = index % 4; - // Get real args count - const u32 count = std::min({rsx->fifo_ctrl->get_remaining_args_count() + 1, - static_cast(((rsx->ctrl->put & ~3ull) - (rsx->fifo_ctrl->get_pos() - 4)) / 4), 32 - index}); + // FIFO args count including this one + const u32 fifo_args_cnt = rsx->fifo_ctrl->get_remaining_args_count() + 1; + + // The range of methods this function resposible to + const u32 method_range = 32 - index; + + // Get limit imposed by FIFO PUT (if put is behind get it will result in a number ignored by min) + const u32 fifo_read_limit = static_cast(((rsx->ctrl->put & ~3ull) - (rsx->fifo_ctrl->get_pos())) / 4); + + const u32 count = std::min({fifo_args_cnt, fifo_read_limit, method_range}); const u32 load = rsx::method_registers.transform_constant_load(); @@ -476,21 +503,28 @@ namespace rsx const auto values = &rsx::method_registers.transform_constants[load + reg][subreg]; + const auto fifo_span = rsx->fifo_ctrl->get_current_arg_ptr(); + + if (fifo_span.size() < rcount) + { + rcount = fifo_span.size(); + } + if (rsx->m_graphics_state & rsx::pipeline_state::transform_constants_dirty) { // Minor optimization: don't compare values if we already know we need invalidation - copy_data_swap_u32(values, static_cast(vm::base(rsx->fifo_ctrl->get_current_arg_ptr())), rcount); + copy_data_swap_u32(values, fifo_span.data(), rcount); } else { - if (copy_data_swap_u32_cmp(values, static_cast(vm::base(rsx->fifo_ctrl->get_current_arg_ptr())), rcount)) + if (copy_data_swap_u32_cmp(values, fifo_span.data(), rcount)) { // Transform constants invalidation is expensive (~8k bytes per update) rsx->m_graphics_state |= rsx::pipeline_state::transform_constants_dirty; } } - rsx->fifo_ctrl->skip_methods(count - 1); + rsx->fifo_ctrl->skip_methods(rcount - 1); } }; @@ -500,9 +534,16 @@ namespace rsx { const u32 index = reg - NV4097_SET_TRANSFORM_PROGRAM; - // Get real args count - const u32 count = std::min({rsx->fifo_ctrl->get_remaining_args_count() + 1, - static_cast(((rsx->ctrl->put & ~3ull) - (rsx->fifo_ctrl->get_pos() - 4)) / 4), 32 - index}); + // FIFO args count including this one + const u32 fifo_args_cnt = rsx->fifo_ctrl->get_remaining_args_count() + 1; + + // The range of methods this function resposible to + const u32 method_range = 32 - index; + + // Get limit imposed by FIFO PUT (if put is behind get it will result in a number ignored by min) + const u32 fifo_read_limit = static_cast(((rsx->ctrl->put & ~3ull) - (rsx->fifo_ctrl->get_pos())) / 4); + + const u32 count = std::min({fifo_args_cnt, fifo_read_limit, method_range}); const u32 load_pos = rsx::method_registers.transform_program_load(); @@ -515,11 +556,18 @@ namespace rsx rcount -= max - (max_vertex_program_instructions * 4); } - copy_data_swap_u32(&rsx::method_registers.transform_program[load_pos * 4 + index % 4], static_cast(vm::base(rsx->fifo_ctrl->get_current_arg_ptr())), rcount); + const auto fifo_span = rsx->fifo_ctrl->get_current_arg_ptr(); + + if (fifo_span.size() < rcount) + { + rcount = fifo_span.size(); + } + + copy_data_swap_u32(&rsx::method_registers.transform_program[load_pos * 4 + index % 4], fifo_span.data(), rcount); rsx->m_graphics_state |= rsx::pipeline_state::vertex_program_ucode_dirty; rsx::method_registers.transform_program_load_set(load_pos + ((rcount + index % 4) / 4)); - rsx->fifo_ctrl->skip_methods(count - 1); + rsx->fifo_ctrl->skip_methods(rcount - 1); } }; @@ -953,11 +1001,18 @@ namespace rsx } // Get position of the current command arg - const u32 src_offset = rsx->fifo_ctrl->get_pos() - 4; + const u32 src_offset = rsx->fifo_ctrl->get_pos(); - // Get real args count (starting from NV3089_COLOR) - const u32 count = std::min({rsx->fifo_ctrl->get_remaining_args_count() + 1, - static_cast(((rsx->ctrl->put & ~3ull) - src_offset) / 4), 0x700 - index, out_x_max - index}); + // FIFO args count including this one + const u32 fifo_args_cnt = rsx->fifo_ctrl->get_remaining_args_count() + 1; + + // The range of methods this function resposible to + const u32 method_range = std::min(0x700 - index, out_x_max - index); + + // Get limit imposed by FIFO PUT (if put is behind get it will result in a number ignored by min) + const u32 fifo_read_limit = static_cast(((rsx->ctrl->put & ~3ull) - (rsx->fifo_ctrl->get_pos())) / 4); + + u32 count = std::min({fifo_args_cnt, fifo_read_limit, method_range}); const u32 dst_dma = method_registers.blit_engine_output_location_nv3062(); const u32 dst_offset = method_registers.blit_engine_output_offset_nv3062(); @@ -966,6 +1021,13 @@ namespace rsx const u32 x = method_registers.nv308a_x() + index; const u32 y = method_registers.nv308a_y(); + const auto fifo_span = rsx->fifo_ctrl->get_current_arg_ptr(); + + if (fifo_span.size() < count) + { + count = fifo_span.size(); + } + // Skip "handled methods" rsx->fifo_ctrl->skip_methods(count - 1); @@ -986,12 +1048,10 @@ namespace rsx return; } - const auto src_address = get_address(src_offset, CELL_GCM_LOCATION_MAIN); - const auto dst = vm::_ptr(dst_address); - const auto src = vm::_ptr(src_address); + const auto src = reinterpret_cast(fifo_span.data()); - auto res = rsx::reservation_lock(dst_address, data_length, src_address, data_length); + rsx::reservation_lock rsx_lock(dst_address, data_length); if (rsx->fifo_ctrl->last_cmd() & RSX_METHOD_NON_INCREMENT_CMD_MASK) [[unlikely]] { @@ -1022,9 +1082,8 @@ namespace rsx const auto data_length = count * 2; const auto dst_address = get_address(dst_offset + (x * 2) + (y * out_pitch), dst_dma, data_length); - const auto src_address = get_address(src_offset, CELL_GCM_LOCATION_MAIN); const auto dst = vm::_ptr(dst_address); - const auto src = vm::_ptr(src_address); + const auto src = reinterpret_cast*>(fifo_span.data()); if (!dst_address) { @@ -1032,7 +1091,7 @@ namespace rsx return; } - auto res = rsx::reservation_lock(dst_address, data_length, src_address, data_length); + rsx::reservation_lock rsx_lock(dst_address, data_length); auto convert = [](u32 input) -> u16 { diff --git a/rpcs3/Emu/system_config.h b/rpcs3/Emu/system_config.h index 671df8422b..14595f4454 100644 --- a/rpcs3/Emu/system_config.h +++ b/rpcs3/Emu/system_config.h @@ -43,6 +43,19 @@ struct cfg_root : cfg::node cfg::_bool spu_accurate_dma{ this, "Accurate SPU DMA", false }; cfg::_bool accurate_cache_line_stores{ this, "Accurate Cache Line Stores", false }; cfg::_bool rsx_accurate_res_access{this, "Accurate RSX reservation access", false, true}; + + struct fifo_setting : public cfg::_enum + { + using _enum = cfg::_enum; + using _enum::_enum; + + explicit operator bool() const + { + return get() != rsx_fifo_mode::fast; + } + }; + + fifo_setting rsx_fifo_accuracy{this, "RSX FIFO Accuracy", rsx_fifo_mode::fast }; cfg::_bool spu_verification{ this, "SPU Verification", true }; // Should be enabled cfg::_bool spu_cache{ this, "SPU Cache", true }; cfg::_bool spu_prof{ this, "SPU Profiler", false }; diff --git a/rpcs3/Emu/system_config_types.cpp b/rpcs3/Emu/system_config_types.cpp index f3880c0456..be1d04c902 100644 --- a/rpcs3/Emu/system_config_types.cpp +++ b/rpcs3/Emu/system_config_types.cpp @@ -187,6 +187,23 @@ void fmt_class_string::format(std::string& out, u64 arg) }); } +template <> +void fmt_class_string::format(std::string& out, u64 arg) +{ + format_enum(out, arg, [](rsx_fifo_mode value) + { + switch (value) + { + case rsx_fifo_mode::fast: return "Fast"; + case rsx_fifo_mode::atomic: return "Atomic"; + case rsx_fifo_mode::atomic_ordered: return "Ordered & Atomic"; + case rsx_fifo_mode::as_ps3: return "PS3"; + } + + return unknown; + }); +} + template <> void fmt_class_string::format(std::string& out, u64 arg) { diff --git a/rpcs3/Emu/system_config_types.h b/rpcs3/Emu/system_config_types.h index 8ad3b1017d..ab1f52a388 100644 --- a/rpcs3/Emu/system_config_types.h +++ b/rpcs3/Emu/system_config_types.h @@ -218,6 +218,14 @@ enum class screen_quadrant bottom_right }; +enum class rsx_fifo_mode +{ + fast, + atomic, + atomic_ordered, + as_ps3, +}; + enum class tsx_usage { disabled, diff --git a/rpcs3/rpcs3qt/emu_settings.cpp b/rpcs3/rpcs3qt/emu_settings.cpp index 6fdfb349b6..0d6210afcf 100644 --- a/rpcs3/rpcs3qt/emu_settings.cpp +++ b/rpcs3/rpcs3qt/emu_settings.cpp @@ -1097,6 +1097,15 @@ QString emu_settings::GetLocalizedSetting(const QString& original, emu_settings_ case sleep_timers_accuracy_level::_all_timers: return tr("All Timers", "Sleep timers accuracy"); } break; + case emu_settings_type::FIFOAccuracy: + switch (static_cast(index)) + { + case rsx_fifo_mode::fast: return tr("Fast", "RSX FIFO Accuracy"); + case rsx_fifo_mode::atomic: return tr("Atomic", "RSX FIFO Accuracy"); + case rsx_fifo_mode::atomic_ordered: return tr("Ordered & Atomic", "RSX FIFO Accuracy"); + case rsx_fifo_mode::as_ps3: return tr("PS3", "RSX FIFO Accuracy"); + } + break; case emu_settings_type::PerfOverlayDetailLevel: switch (static_cast(index)) { diff --git a/rpcs3/rpcs3qt/emu_settings_type.h b/rpcs3/rpcs3qt/emu_settings_type.h index 959af3bbf2..acd445fb69 100644 --- a/rpcs3/rpcs3qt/emu_settings_type.h +++ b/rpcs3/rpcs3qt/emu_settings_type.h @@ -24,6 +24,7 @@ enum class emu_settings_type AccurateSpuDMA, AccurateClineStores, AccurateRSXAccess, + FIFOAccuracy, AccurateXFloat, ApproximateXFloat, AccuratePPU128Loop, @@ -194,6 +195,7 @@ inline static const QMap settings_location = { emu_settings_type::AccurateSpuDMA, { "Core", "Accurate SPU DMA"}}, { emu_settings_type::AccurateClineStores, { "Core", "Accurate Cache Line Stores"}}, { emu_settings_type::AccurateRSXAccess, { "Core", "Accurate RSX reservation access"}}, + { emu_settings_type::FIFOAccuracy, { "Core", "RSX FIFO Accuracy"}}, { emu_settings_type::AccurateXFloat, { "Core", "Accurate xfloat"}}, { emu_settings_type::ApproximateXFloat, { "Core", "Approximate xfloat"}}, { emu_settings_type::MFCCommandsShuffling, { "Core", "MFC Commands Shuffling Limit"}}, diff --git a/rpcs3/rpcs3qt/settings_dialog.cpp b/rpcs3/rpcs3qt/settings_dialog.cpp index 1c8848d527..5ce41372d2 100644 --- a/rpcs3/rpcs3qt/settings_dialog.cpp +++ b/rpcs3/rpcs3qt/settings_dialog.cpp @@ -1341,6 +1341,12 @@ settings_dialog::settings_dialog(std::shared_ptr gui_settings, std m_emu_settings->EnhanceComboBox(ui->sleepTimersAccuracy, emu_settings_type::SleepTimersAccuracy); SubscribeTooltip(ui->gb_sleep_timers_accuracy, tooltips.settings.sleep_timers_accuracy); + m_emu_settings->EnhanceComboBox(ui->FIFOAccuracy, emu_settings_type::FIFOAccuracy); + SubscribeTooltip(ui->gb_rsx_fifo_accuracy, tooltips.settings.rsx_fifo_accuracy); + + // Hide a developers' setting + ui->FIFOAccuracy->removeItem(static_cast(rsx_fifo_mode::as_ps3)); + m_emu_settings->EnhanceComboBox(ui->vulkansched, emu_settings_type::VulkanAsyncSchedulerDriver); SubscribeTooltip(ui->gb_vulkansched, tooltips.settings.vulkan_async_scheduler); diff --git a/rpcs3/rpcs3qt/settings_dialog.ui b/rpcs3/rpcs3qt/settings_dialog.ui index 1c9b0b1c6a..fb362729a2 100644 --- a/rpcs3/rpcs3qt/settings_dialog.ui +++ b/rpcs3/rpcs3qt/settings_dialog.ui @@ -2284,6 +2284,49 @@ + + + + + 0 + 0 + + + + Clocks Scale + + + + + + Qt::Horizontal + + + + + + + + + 100% + + + Qt::AlignCenter + + + + + + + Reset + + + + + + + + @@ -2459,6 +2502,24 @@ + + + + + 0 + 0 + + + + RSX FIFO Accuracy + + + + + + + + @@ -2552,49 +2613,6 @@ - - - - - 0 - 0 - - - - Clocks Scale - - - - - - Qt::Horizontal - - - - - - - - - 100% - - - Qt::AlignCenter - - - - - - - Reset - - - - - - - - diff --git a/rpcs3/rpcs3qt/tooltips.h b/rpcs3/rpcs3qt/tooltips.h index 12bf0b23a8..6ff0e43b9d 100644 --- a/rpcs3/rpcs3qt/tooltips.h +++ b/rpcs3/rpcs3qt/tooltips.h @@ -33,6 +33,7 @@ public: const QString zcull_operation_mode = tr("Changes ZCULL report synchronization behaviour. Experiment to find the best option for your game. Approximate mode is recommended for most games.\n· Precise is the most accurate to PS3 behaviour. Required for accurate visuals in some titles such as Demon's Souls and The Darkness.\n· Approximate is a much faster way to generate occlusion data which may not always match what the PS3 would generate. Works well with most PS3 games.\n· Relaxed changes the synchronization method completely and can greatly improve performance in some games or completely break others."); const QString max_spurs_threads = tr("Limits the maximum number of SPURS threads in each thread group.\nMay improve performance in some cases, especially on systems with limited number of hardware threads.\nLimiting the number of threads is likely to cause crashes; it's recommended to keep this at the default value."); const QString sleep_timers_accuracy = tr("Changes the sleep period accuracy.\n'As Host' uses default accuracy of the underlying operating system, while 'All Timers' attempts to improve it.\n'Usleep Only' limits the adjustments to usleep syscall only.\nCan affect performance in unexpected ways."); + const QString rsx_fifo_accuracy = tr("\"Fast\" is the least accurate setting, RSX does not emulate atomic FIFO buffer.\n\"Atomic & Ordered\" is the most accurate but it is the slowest and without much stability benefit in games over \"Atomic\" which benefits stability greatly in many games with little performance penalty."); const QString vblank_rate = tr("Adjusts the frequency of vertical blanking signals that the emulator sends.\nAffects timing of events which rely on these signals."); const QString vblank_ntsc_fixup = tr("Multiplies the rate of VBLANK by 1000/1001 for values like 59.94Hz.\nKnown to fix the rhythm game Space Channel 5 Part 2"); const QString clocks_scale = tr("Changes the scale of emulated system time.\nAffects software which uses system time to calculate things such as dynamic timesteps.");