From 6a36967e6f15790fe5dfb09bc43ca861aebc779a Mon Sep 17 00:00:00 2001
From: Eladash
Date: Wed, 5 Oct 2022 18:30:12 +0300
Subject: [PATCH] SPU: Inline and batch MFC list transfers

---
 rpcs3/Emu/Cell/SPUThread.cpp  | 525 ++++++++++++++++++++++++++++++++--
 rpcs3/Emu/Memory/vm.cpp       |  35 ++-
 rpcs3/Emu/Memory/vm_locking.h |  32 +--
 rpcs3/Emu/perf_meter.hpp      |  10 +
 4 files changed, 535 insertions(+), 67 deletions(-)

diff --git a/rpcs3/Emu/Cell/SPUThread.cpp b/rpcs3/Emu/Cell/SPUThread.cpp
index 9de26568af..e1e48139b5 100644
--- a/rpcs3/Emu/Cell/SPUThread.cpp
+++ b/rpcs3/Emu/Cell/SPUThread.cpp
@@ -2688,79 +2688,540 @@ bool spu_thread::do_dma_check(const spu_mfc_cmd& args)
 
 bool spu_thread::do_list_transfer(spu_mfc_cmd& args)
 {
+	perf_meter<"MFC_LIST"_u64> perf0;
+
 	// Amount of elements to fetch in one go
 	constexpr u32 fetch_size = 6;
 
 	struct alignas(8) list_element
 	{
-		be_t<u16> sb; // Stall-and-Notify bit (0x8000)
+		u8 sb; // Stall-and-Notify bit (0x80)
+		u8 pad;
 		be_t<u16> ts; // List Transfer Size
 		be_t<u32> ea; // External Address Low
 	};
 
-	union
-	{
-		list_element items[fetch_size];
-		alignas(v128) char bufitems[sizeof(items)];
-	};
+	alignas(16) list_element items[fetch_size];
+	static_assert(sizeof(v128) % sizeof(list_element) == 0);
 
 	spu_mfc_cmd transfer;
 	transfer.eah = 0;
 	transfer.tag = args.tag;
-	transfer.cmd = MFC(args.cmd & ~MFC_LIST_MASK);
-
-	args.lsa &= 0x3fff0;
-	args.eal &= 0x3fff8;
+	transfer.cmd = MFC{static_cast<u8>(args.cmd & ~0xf)};
 
 	u32 index = fetch_size;
 
+	auto item_ptr = _ptr<const list_element>(args.eal & 0x3fff8);
+
+	u32 arg_lsa = args.lsa & 0x3fff0;
+	u32 arg_size = args.size;
+
+	u8 optimization_compatible = transfer.cmd & (MFC_GET_CMD | MFC_PUT_CMD);
+
+	if (spu_log.trace || g_cfg.core.spu_accurate_dma || g_cfg.core.mfc_debug)
+	{
+		optimization_compatible = 0;
+	}
+	else if (optimization_compatible == MFC_PUT_CMD && (g_cfg.video.strict_rendering_mode || g_cfg.core.rsx_fifo_accuracy))
+	{
+		optimization_compatible &= ~MFC_PUT_CMD;
+	}
+
+	constexpr u32 ts_mask = 0x7fff;
+
+	// Assume called with size greater than 0
 	while (true)
 	{
 		// Check if fetching is needed
 		if (index == fetch_size)
 		{
+			const v128 data0 = v128::loadu(item_ptr, 0);
+			const v128 data1 = v128::loadu(item_ptr, 1);
+			const v128 data2 = v128::loadu(item_ptr, 2);
+
+			// In a perfect world this would not be needed until after the if, but relying on the compiler to keep the elements in SSE registers through it all is unrealistic
+			std::memcpy(&items[sizeof(v128) / sizeof(list_element) * 0], &data0, sizeof(v128));
+			std::memcpy(&items[sizeof(v128) / sizeof(list_element) * 1], &data1, sizeof(v128));
+			std::memcpy(&items[sizeof(v128) / sizeof(list_element) * 2], &data2, sizeof(v128));
+
+			u32 s_size = data0._u32[0];
+
+			// We need to verify matching between odd and even elements (the vector test is position-independent)
+			// 0-5 is the most unlikely pair to match for many reasons, so it skips the entire check very efficiently in most cases
+			// Assumes padding bits should match
+			if (optimization_compatible == MFC_GET_CMD && s_size == data0._u32[2] && arg_size >= fetch_size * 8)
+			{
+				const v128 ored = (data0 | data1 | data2) & v128::from64p(std::bit_cast<be_t<u64>>(1ull << 63 | (u64{ts_mask} << 32) | 0xe000'0000));
+				const v128 anded = (data0 & data1 & data2) & v128::from64p(std::bit_cast<be_t<u64>>(0xe000'0000 | (u64{ts_mask} << 32)));
+
+				// Tests:
+				// 1. Unset stall-and-notify bit on all 6 elements
+				// 2. Equality of transfer size across all 6 elements
+				// 3.
Be in the same 512mb region, this is because this case is not expected to be broken usually and we need to ensure MMIO is not involved in any of the transfers (assumes MMIO to be so rare that this is the last check) + if (ored == anded && items[0].ea < RAW_SPU_BASE_ADDR && items[1].ea < RAW_SPU_BASE_ADDR) + { + // Execute the postponed byteswapping and masking + s_size = std::bit_cast>(s_size) & ts_mask; + + u8* src = vm::_ptr(0); + u8* dst = this->ls + arg_lsa; + + // Assume success, prepare the next elements + arg_lsa += fetch_size * utils::align(s_size, 16); + item_ptr += fetch_size; + arg_size -= fetch_size * 8; + + // Type which is friendly for fused address calculations + constexpr usz _128 = 128; + + // This whole function relies on many constraints to be met (crashes real MFC), we can a have minor optimization assuming EA alignment to be +16 with +16 byte transfers +#define MOV_T(type, index, _ea) { const usz ea = _ea; *reinterpret_cast(dst + index * utils::align(sizeof(type), 16) + ea % (sizeof(type) < 16 ? 16 : 1)) = *reinterpret_cast(src + ea); } void() +#define MOV_128(index, ea) mov_rdata(*reinterpret_cast(dst + index * _128), *reinterpret_cast(src + (ea))) + + switch (s_size) + { + case 0: + { + if (!arg_size) + { + return true; + } + + continue; + } + case 1: + { + MOV_T(u8, 0, items[0].ea); + MOV_T(u8, 1, items[1].ea); + MOV_T(u8, 2, items[2].ea); + MOV_T(u8, 3, items[3].ea); + MOV_T(u8, 4, items[4].ea); + MOV_T(u8, 5, items[5].ea); + + if (!arg_size) + { + return true; + } + + continue; + } + case 2: + { + MOV_T(u16, 0, items[0].ea); + MOV_T(u16, 1, items[1].ea); + MOV_T(u16, 2, items[2].ea); + MOV_T(u16, 3, items[3].ea); + MOV_T(u16, 4, items[4].ea); + MOV_T(u16, 5, items[5].ea); + + if (!arg_size) + { + return true; + } + + continue; + } + case 4: + { + MOV_T(u32, 0, items[0].ea); + MOV_T(u32, 1, items[1].ea); + MOV_T(u32, 2, items[2].ea); + MOV_T(u32, 3, items[3].ea); + MOV_T(u32, 4, items[4].ea); + MOV_T(u32, 5, items[5].ea); + + if (!arg_size) + { + return true; + } + + continue; + } + case 8: + { + MOV_T(u64, 0, items[0].ea); + MOV_T(u64, 1, items[1].ea); + MOV_T(u64, 2, items[2].ea); + MOV_T(u64, 3, items[3].ea); + MOV_T(u64, 4, items[4].ea); + MOV_T(u64, 5, items[5].ea); + + if (!arg_size) + { + return true; + } + + continue; + } + case 16: + { + MOV_T(v128, 0, items[0].ea); + MOV_T(v128, 1, items[1].ea); + MOV_T(v128, 2, items[2].ea); + MOV_T(v128, 3, items[3].ea); + MOV_T(v128, 4, items[4].ea); + MOV_T(v128, 5, items[5].ea); + + if (!arg_size) + { + return true; + } + + continue; + } + case 32: + { + struct mem + { + v128 a[2]; + }; + + MOV_T(mem, 0, items[0].ea); + MOV_T(mem, 1, items[1].ea); + MOV_T(mem, 2, items[2].ea); + MOV_T(mem, 3, items[3].ea); + MOV_T(mem, 4, items[4].ea); + MOV_T(mem, 5, items[5].ea); + + if (!arg_size) + { + return true; + } + + continue; + } + case 48: + { + struct mem + { + v128 a[3]; + }; + + MOV_T(mem, 0, items[0].ea); + MOV_T(mem, 1, items[1].ea); + MOV_T(mem, 2, items[2].ea); + MOV_T(mem, 3, items[3].ea); + MOV_T(mem, 4, items[4].ea); + MOV_T(mem, 5, items[5].ea); + + if (!arg_size) + { + return true; + } + + continue; + } + case 64: + { + struct mem + { + v128 a[4]; + }; + + // TODO: Optimize (4 16-bytes movings is bad) + MOV_T(mem, 0, items[0].ea); + MOV_T(mem, 1, items[1].ea); + MOV_T(mem, 2, items[2].ea); + MOV_T(mem, 3, items[3].ea); + MOV_T(mem, 4, items[4].ea); + MOV_T(mem, 5, items[5].ea); + + if (!arg_size) + { + return true; + } + + continue; + } + case 128: + { + MOV_128(0, items[0].ea); + MOV_128(1, items[1].ea); + 
MOV_128(2, items[2].ea); + MOV_128(3, items[3].ea); + MOV_128(4, items[4].ea); + MOV_128(5, items[5].ea); + + if (!arg_size) + { + return true; + } + + continue; + } + case 256: + { + const usz ea0 = items[0].ea; + MOV_128(0, ea0 + 0); + MOV_128(1, ea0 + _128); + const usz ea1 = items[1].ea; + MOV_128(2, ea1 + 0); + MOV_128(3, ea1 + _128); + const usz ea2 = items[2].ea; + MOV_128(4, ea2 + 0); + MOV_128(5, ea2 + _128); + const usz ea3 = items[3].ea; + MOV_128(6, ea3 + 0); + MOV_128(7, ea3 + _128); + const usz ea4 = items[4].ea; + MOV_128(8, ea4 + 0); + MOV_128(9, ea4 + _128); + const usz ea5 = items[5].ea; + MOV_128(10, ea5 + 0); + MOV_128(11, ea5 + _128); + + if (!arg_size) + { + return true; + } + + continue; + } + case 512: + { + const usz ea0 = items[0].ea; + MOV_128(0 , ea0 + _128 * 0); + MOV_128(1 , ea0 + _128 * 1); + MOV_128(2 , ea0 + _128 * 2); + MOV_128(3 , ea0 + _128 * 3); + const usz ea1 = items[1].ea; + MOV_128(4 , ea1 + _128 * 0); + MOV_128(5 , ea1 + _128 * 1); + MOV_128(6 , ea1 + _128 * 2); + MOV_128(7 , ea1 + _128 * 3); + const usz ea2 = items[2].ea; + MOV_128(8 , ea2 + _128 * 0); + MOV_128(9 , ea2 + _128 * 1); + MOV_128(10, ea2 + _128 * 2); + MOV_128(11, ea2 + _128 * 3); + const usz ea3 = items[3].ea; + MOV_128(12, ea3 + _128 * 0); + MOV_128(13, ea3 + _128 * 1); + MOV_128(14, ea3 + _128 * 2); + MOV_128(15, ea3 + _128 * 3); + const usz ea4 = items[4].ea; + MOV_128(16, ea4 + _128 * 0); + MOV_128(17, ea4 + _128 * 1); + MOV_128(18, ea4 + _128 * 2); + MOV_128(19, ea4 + _128 * 3); + const usz ea5 = items[5].ea; + MOV_128(20, ea5 + _128 * 0); + MOV_128(21, ea5 + _128 * 1); + MOV_128(22, ea5 + _128 * 2); + MOV_128(23, ea5 + _128 * 3); + + if (!arg_size) + { + return true; + } + + continue; + } + default: + { + // TODO: Are more cases common enough? 
(in the range of less than 512 bytes because for more than that the optimization is doubtful) + break; + } + } +#undef MOV_T +#undef MOV_128 + // Optimization miss, revert changes + arg_lsa -= fetch_size * utils::align(s_size, 16); + item_ptr -= fetch_size; + arg_size += fetch_size * 8; + } + } + // Reset to elements array head index = 0; - - const auto src = _ptr(args.eal); - const v128 data0 = v128::loadu(src, 0); - const v128 data1 = v128::loadu(src, 1); - const v128 data2 = v128::loadu(src, 2); - - reinterpret_cast(bufitems)[0] = data0; - reinterpret_cast(bufitems)[1] = data1; - reinterpret_cast(bufitems)[2] = data2; } - const u32 size = items[index].ts & 0x7fff; + const u32 size = items[index].ts & ts_mask; const u32 addr = items[index].ea; - spu_log.trace("LIST: item=0x%016x, lsa=0x%05x", std::bit_cast>(items[index]), args.lsa | (addr & 0xf)); - - if (size) + auto check_carry_16 = [](u16 addr, u16 size) { +#ifdef _MSC_VER + u16 out; + return _addcarry_u16(0, addr, size - 1, &out); +#else + return ((addr + size - 1) >> 16) != 0; +#endif + }; + + // Try to inline the transfer + if (addr < RAW_SPU_BASE_ADDR && size && optimization_compatible == MFC_GET_CMD) + { + const u8* src = vm::_ptr(addr); + u8* dst = this->ls + arg_lsa + (addr & 0xf); + + switch (u32 _size = size) + { + case 1: + { + *reinterpret_cast(dst) = *reinterpret_cast(src); + break; + } + case 2: + { + *reinterpret_cast(dst) = *reinterpret_cast(src); + break; + } + case 4: + { + *reinterpret_cast(dst) = *reinterpret_cast(src); + break; + } + case 8: + { + *reinterpret_cast(dst) = *reinterpret_cast(src); + break; + } + default: + { + if (_size > s_rep_movsb_threshold) + { + __movsb(dst, src, _size); + } + else + { + // Avoid unaligned stores in mov_rdata_avx + if (reinterpret_cast(dst) & 0x10) + { + *reinterpret_cast(dst) = *reinterpret_cast(src); + + dst += 16; + src += 16; + _size -= 16; + } + + while (_size >= 128) + { + mov_rdata(*reinterpret_cast(dst), *reinterpret_cast(src)); + + dst += 128; + src += 128; + _size -= 128; + } + + while (_size) + { + *reinterpret_cast(dst) = *reinterpret_cast(src); + + dst += 16; + src += 16; + _size -= 16; + } + } + + break; + } + } + + arg_lsa += utils::align(size, 16); + } + // Avoid inlining huge transfers because it intentionally drops range lock unlock + else if (addr < RAW_SPU_BASE_ADDR && size - 1 <= 0x400 - 1 && optimization_compatible == MFC_PUT_CMD && !check_carry_16(static_cast(addr), static_cast(size))) + { + if (!g_use_rtm) + { + vm::range_lock(range_lock, addr & -128, utils::align(addr + size, 128) - (addr & -128)); + } + + u8* dst = vm::_ptr(addr); + const u8* src = this->ls + arg_lsa + (addr & 0xf); + + switch (u32 _size = size) + { + case 1: + { + *reinterpret_cast(dst) = *reinterpret_cast(src); + break; + } + case 2: + { + *reinterpret_cast(dst) = *reinterpret_cast(src); + break; + } + case 4: + { + *reinterpret_cast(dst) = *reinterpret_cast(src); + break; + } + case 8: + { + *reinterpret_cast(dst) = *reinterpret_cast(src); + break; + } + default: + { + if (_size > s_rep_movsb_threshold) + { + __movsb(dst, src, _size); + } + else + { + // Avoid unaligned stores in mov_rdata_avx + if (reinterpret_cast(dst) & 0x10) + { + *reinterpret_cast(dst) = *reinterpret_cast(src); + + dst += 16; + src += 16; + _size -= 16; + } + + while (_size >= 128) + { + mov_rdata(*reinterpret_cast(dst), *reinterpret_cast(src)); + + dst += 128; + src += 128; + _size -= 128; + } + + while (_size) + { + *reinterpret_cast(dst) = *reinterpret_cast(src); + + dst += 16; + src += 16; + _size -= 16; + } 
+ } + + break; + } + } + + arg_lsa += utils::align(size, 16); + } + else if (size) + { + range_lock->release(0); + spu_log.trace("LIST: item=0x%016x, lsa=0x%05x", std::bit_cast>(items[index]), arg_lsa | (addr & 0xf)); + transfer.eal = addr; - transfer.lsa = args.lsa | (addr & 0xf); + transfer.lsa = arg_lsa | (addr & 0xf); transfer.size = size; + arg_lsa += utils::align(size, 16); do_dma_transfer(this, transfer, ls); - const u32 add_size = std::max(size, 16); - args.lsa += add_size; } - args.size -= 8; + arg_size -= 8; - if (!args.size) + if (!arg_size) { // No more elements break; } - args.eal += 8; + item_ptr++; - if (items[index].sb & 0x8000) [[unlikely]] + if (items[index].sb & 0x80) [[unlikely]] { + range_lock->release(0); + ch_stall_mask |= utils::rol32(1, args.tag); if (!ch_stall_stat.get_count()) @@ -2771,12 +3232,16 @@ bool spu_thread::do_list_transfer(spu_mfc_cmd& args) ch_stall_stat.set_value(utils::rol32(1, args.tag) | ch_stall_stat.get_value()); args.tag |= 0x80; // Set stalled status + args.eal = reinterpret_cast(item_ptr) - this->ls; + args.lsa = arg_lsa; + args.size = arg_size; return false; } index++; } + range_lock->release(0); return true; } diff --git a/rpcs3/Emu/Memory/vm.cpp b/rpcs3/Emu/Memory/vm.cpp index fd1fc8aa54..929fceaf72 100644 --- a/rpcs3/Emu/Memory/vm.cpp +++ b/rpcs3/Emu/Memory/vm.cpp @@ -159,19 +159,17 @@ namespace vm void range_lock_internal(atomic_t* range_lock, u32 begin, u32 size) { - perf_meter<"RHW_LOCK"_u64> perf0; + perf_meter<"RHW_LOCK"_u64> perf0(0); - auto _cpu = get_current_cpu_thread(); + cpu_thread* _cpu = nullptr; - if (_cpu) + if (u64 to_store = begin | (u64{size} << 32); *range_lock != to_store) { - _cpu->state += cpu_flag::wait + cpu_flag::temp; + range_lock->store(to_store); } for (u64 i = 0;; i++) { - range_lock->store(begin | (u64{size} << 32)); - const u64 lock_val = g_range_lock.load(); const u64 is_share = g_shmem[begin >> 16].load(); @@ -215,6 +213,11 @@ namespace vm { range_lock->release(0); + if (!perf0) + { + perf0.restart(); + } + // Try triggering a page fault (write) // TODO: Read memory if needed vm::_ref>(test / 4096 == begin / 4096 ? 
begin : test) += 0; @@ -223,8 +226,26 @@ namespace vm } // Wait a bit before accessing global lock - range_lock->store(0); + range_lock->release(0); + + if (!perf0) + { + perf0.restart(); + } + busy_wait(200); + + if (i >= 2 && !_cpu) + { + _cpu = cpu_thread::get_current(); + + if (_cpu) + { + _cpu->state += cpu_flag::wait + cpu_flag::temp; + } + } + + range_lock->store(begin | (u64{size} << 32)); } if (_cpu) diff --git a/rpcs3/Emu/Memory/vm_locking.h b/rpcs3/Emu/Memory/vm_locking.h index 229e5df789..a09694a78e 100644 --- a/rpcs3/Emu/Memory/vm_locking.h +++ b/rpcs3/Emu/Memory/vm_locking.h @@ -57,43 +57,15 @@ namespace vm return; } - const u64 lock_val = g_range_lock.load(); - const u64 is_share = g_shmem[begin >> 16].load(); - #ifndef _MSC_VER __asm__(""); // Tiny barrier #endif - u64 lock_addr = static_cast(lock_val); // -> u64 - u32 lock_size = static_cast(lock_val << range_bits >> (32 + range_bits)); - - u64 addr = begin; - - // Optimization: if range_locked is not used, the addr check will always pass - // Otherwise, g_shmem is unchanged and its value is reliable to read - if ((lock_val >> range_pos) == (range_locked >> range_pos)) + if (!g_range_lock) { - lock_size = 128; - - if (is_share) [[unlikely]] - { - addr = static_cast(begin) | is_share; - lock_addr = lock_val; - } + return; } - if (addr + size <= lock_addr || addr >= lock_addr + lock_size) [[likely]] - { - const u64 new_lock_val = g_range_lock.load(); - - if (!new_lock_val || new_lock_val == lock_val) [[likely]] - { - return; - } - } - - range_lock->release(0); - // Fallback to slow path range_lock_internal(range_lock, begin, size); } diff --git a/rpcs3/Emu/perf_meter.hpp b/rpcs3/Emu/perf_meter.hpp index d439adf81f..7adfcbbafd 100644 --- a/rpcs3/Emu/perf_meter.hpp +++ b/rpcs3/Emu/perf_meter.hpp @@ -93,6 +93,16 @@ public: restart(); } + FORCE_INLINE SAFE_BUFFERS() perf_meter(int) noexcept + { + std::fill(std::begin(m_timestamps), std::end(m_timestamps), 0); + } + + FORCE_INLINE SAFE_BUFFERS(operator bool) () const noexcept + { + return m_timestamps[0] != 0; + } + // Copy first timestamp template FORCE_INLINE SAFE_BUFFERS() perf_meter(const perf_meter& r) noexcept
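
For readers of the SPUThread.cpp hunk above: the batched fast path fetches six 8-byte list elements at once (48 bytes, three 16-byte loads) and uses a single OR/AND vector comparison to decide whether all six may be copied inline. The code below is a minimal scalar model of that eligibility test, not the patch's code: the element layout, the 0x80 stall bit, the 0x7fff size mask and the 0xE0000000 (RAW_SPU_BASE_ADDR) bound come from the hunk, while the names list_element_model and batch_eligible are invented for illustration.

// Standalone model of the 6-element batch eligibility test (hypothetical names).
#include <cstdint>

struct list_element_model
{
    uint8_t  sb;   // stall-and-notify bit (0x80)
    uint8_t  pad;
    uint16_t ts;   // transfer size (big-endian be_t<u16> in the real structure)
    uint32_t ea;   // effective address low (big-endian be_t<u32> in the real structure)
};

// True when all six elements may be handled by the unrolled fast path:
// no stall bits set, identical transfer sizes, every EA below the RawSPU MMIO base.
inline bool batch_eligible(const list_element_model (&items)[6])
{
    const uint16_t size0 = items[0].ts & 0x7fff;

    for (const auto& e : items)
    {
        if (e.sb & 0x80)
            return false;              // stall-and-notify must be handled element by element
        if ((e.ts & 0x7fff) != size0)
            return false;              // unequal sizes fall back to the generic path
        if (e.ea >= 0xE0000000u)
            return false;              // possible MMIO / RawSPU range, do not inline
    }
    return true;
}

The patch performs the same checks with two masked v128 reductions (ored == anded) plus an explicit bound check on two representative elements, which is why the default switch case can simply revert the prepared pointers on a miss.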
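
Both the inlined GET branch (main memory to LS) and the bounded PUT branch (LS to main memory, at most 0x400 bytes) share one copy pattern: rep movsb via __movsb above s_rep_movsb_threshold, otherwise an optional 16-byte head block to reach 32-byte alignment for mov_rdata_avx, 128-byte mov_rdata blocks, and 16-byte tail stores. A simplified, portable sketch of that chunking, with std::memcpy standing in for both __movsb and mov_rdata, might look like this:

// Simplified model of the inline copy loop; only the chunking logic is illustrated.
// Assumes size >= 16 and a multiple of 16 (sizes 1/2/4/8 are handled separately in the patch)
// and that dst is at least 16-byte aligned.
#include <cstdint>
#include <cstring>

inline void copy_list_element(uint8_t* dst, const uint8_t* src, uint32_t size,
                              uint32_t rep_movsb_threshold /* tunable, like s_rep_movsb_threshold */)
{
    if (size > rep_movsb_threshold)
    {
        std::memcpy(dst, src, size); // stands in for __movsb (rep movsb)
        return;
    }

    // Peel one 16-byte block when dst is 16- but not 32-byte aligned,
    // so the 128-byte loop below avoids unaligned 32-byte stores
    if (reinterpret_cast<std::uintptr_t>(dst) & 0x10)
    {
        std::memcpy(dst, src, 16);
        dst += 16; src += 16; size -= 16;
    }

    // Bulk 128-byte blocks (mov_rdata in the patch)
    while (size >= 128)
    {
        std::memcpy(dst, src, 128);
        dst += 128; src += 128; size -= 128;
    }

    // Remaining 16-byte tail blocks
    while (size)
    {
        std::memcpy(dst, src, 16);
        dst += 16; src += 16; size -= 16;
    }
}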
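
The vm.cpp and vm_locking.h changes rework the per-thread range lock handshake: the thread publishes begin | (size << 32) into its range_lock slot, validates against the global lock word, and on conflict unpublishes, busy-waits (busy_wait(200)), republishes and retries, only taking cpu_flag::wait after a couple of failed iterations. The toy model below captures just that publish/validate/back-off shape under the simplifying assumption that a zero global word means "no conflicting writer"; the real code additionally classifies the global value (shared memory, locked range, page flags) and may deliberately fault on locked pages. Names are illustrative only.

// Toy model of the range lock handshake (not RPCS3's actual API).
#include <atomic>
#include <cstdint>
#include <thread>

inline void acquire_range(std::atomic<uint64_t>& thread_range_lock,
                          const std::atomic<uint64_t>& global_range_lock,
                          uint32_t begin, uint32_t size)
{
    const uint64_t to_store = begin | (uint64_t{size} << 32);

    // Publish the intended range (range_lock_internal skips re-storing an identical value)
    if (thread_range_lock.load(std::memory_order_relaxed) != to_store)
        thread_range_lock.store(to_store);

    while (true)
    {
        // No conflicting writer holds the global lock: the published range is now protected
        if (global_range_lock.load() == 0)
            return;

        // Conflict: unpublish, wait a bit, then republish and re-check
        thread_range_lock.store(0, std::memory_order_release);
        std::this_thread::yield(); // stands in for busy_wait(200)
        thread_range_lock.store(to_store);
    }
}

The caller is responsible for releasing the published range once the copy is done, which is what the repeated range_lock->release(0) calls in do_list_transfer correspond to.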
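
The perf_meter.hpp addition introduces a "deferred start" form: perf_meter(int) leaves all timestamps zeroed, the new operator bool reports whether the first timestamp was taken, and restart() arms the meter, so an RHW_LOCK sample is only recorded once the slow path is actually entered. A minimal stand-alone equivalent of that pattern (hypothetical deferred_timer type, not RPCS3's perf_meter) could be:

// Deferred-start timer: construct unarmed, start timing only when work really begins.
#include <chrono>

class deferred_timer
{
    std::chrono::steady_clock::time_point m_start{}; // zero == not started

public:
    deferred_timer() = default;              // like perf_meter(int): no timestamp taken

    explicit operator bool() const noexcept  // like the added operator bool
    {
        return m_start.time_since_epoch().count() != 0;
    }

    void restart() noexcept                  // arm / re-arm the timer
    {
        m_start = std::chrono::steady_clock::now();
    }
};

// Usage mirrors range_lock_internal(): only start measuring on actual contention.
inline void slow_path_example(bool contended)
{
    deferred_timer perf0;      // fast path exits without ever recording a sample

    if (contended && !perf0)
        perf0.restart();
}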