SPU: Inline and batch MFC list transfers

Author: Eladash, 2022-10-05 18:30:12 +03:00 (committed by Ivan)
parent d6d7ade6e3
commit 6a36967e6f
4 changed files with 535 additions and 67 deletions


@@ -2688,79 +2688,540 @@ bool spu_thread::do_dma_check(const spu_mfc_cmd& args)
bool spu_thread::do_list_transfer(spu_mfc_cmd& args)
{
perf_meter<"MFC_LIST"_u64> perf0;
// Number of elements to fetch in one go
constexpr u32 fetch_size = 6;
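// Six 8-byte elements are 48 bytes, i.e. exactly the three 16-byte vector loads performed below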
struct alignas(8) list_element
{
- be_t<u16> sb; // Stall-and-Notify bit (0x8000)
+ u8 sb; // Stall-and-Notify bit (0x80)
+ u8 pad;
be_t<u16> ts; // List Transfer Size
be_t<u32> ea; // External Address Low
};
- union
- {
- list_element items[fetch_size];
- alignas(v128) char bufitems[sizeof(items)];
- };
+ alignas(16) list_element items[fetch_size];
static_assert(sizeof(v128) % sizeof(list_element) == 0);
spu_mfc_cmd transfer;
transfer.eah = 0;
transfer.tag = args.tag;
- transfer.cmd = MFC(args.cmd & ~MFC_LIST_MASK);
- args.lsa &= 0x3fff0;
- args.eal &= 0x3fff8;
+ transfer.cmd = MFC{static_cast<u8>(args.cmd & ~0xf)};
u32 index = fetch_size;
auto item_ptr = _ptr<const list_element>(args.eal & 0x3fff8);
u32 arg_lsa = args.lsa & 0x3fff0;
u32 arg_size = args.size;
u8 optimization_compatible = transfer.cmd & (MFC_GET_CMD | MFC_PUT_CMD);
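// Holds MFC_GET_CMD or MFC_PUT_CMD when the matching inline fast path may be used; the checks below clear these bits when debug or accuracy settings require the generic DMA path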
if (spu_log.trace || g_cfg.core.spu_accurate_dma || g_cfg.core.mfc_debug)
{
optimization_compatible = 0;
}
else if (optimization_compatible == MFC_PUT_CMD && (g_cfg.video.strict_rendering_mode || g_cfg.core.rsx_fifo_accuracy))
{
optimization_compatible &= ~MFC_PUT_CMD;
}
constexpr u32 ts_mask = 0x7fff;
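// List transfer sizes go up to 16 KB, so 15 bits suffice; the top bit of the halfword is not part of the size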
// Assume called with size greater than 0
while (true)
{
// Check if fetching is needed
if (index == fetch_size)
{
const v128 data0 = v128::loadu(item_ptr, 0);
const v128 data1 = v128::loadu(item_ptr, 1);
const v128 data2 = v128::loadu(item_ptr, 2);
// Ideally this would not be needed until after the if below, but relying on the compiler to keep the elements in SSE registers through it all is unrealistic
std::memcpy(&items[sizeof(v128) / sizeof(list_element) * 0], &data0, sizeof(v128));
std::memcpy(&items[sizeof(v128) / sizeof(list_element) * 1], &data1, sizeof(v128));
std::memcpy(&items[sizeof(v128) / sizeof(list_element) * 2], &data2, sizeof(v128));
u32 s_size = data0._u32[0];
// We still need to verify matching between odd and even elements (the vector test below is position-independent: it only compares even elements with even and odd with odd)
// The even/odd pair compared here is the least likely to match by accident, so in most cases it skips the entire check very efficiently
// Assumes the padding bits also match
if (optimization_compatible == MFC_GET_CMD && s_size == data0._u32[2] && arg_size >= fetch_size * 8)
{
const v128 ored = (data0 | data1 | data2) & v128::from64p(std::bit_cast<be_t<u64>>(1ull << 63 | (u64{ts_mask} << 32) | 0xe000'0000));
const v128 anded = (data0 & data1 & data2) & v128::from64p(std::bit_cast<be_t<u64>>(0xe000'0000 | (u64{ts_mask} << 32)));
// Tests:
// 1. Unset stall-and-notify bit on all 6 elements
// 2. Equality of transfer size across all 6 elements
// 3. All EAs lie in the same 512MB region: this is rarely violated, and it ensures MMIO is not involved in any of the transfers (MMIO is assumed rare enough that this is the last check)
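// OR and AND of the masked lanes are equal only if every masked bit is identical across the compared elements, and the stall bit (masked only in the OR) is clear in all of them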
if (ored == anded && items[0].ea < RAW_SPU_BASE_ADDR && items[1].ea < RAW_SPU_BASE_ADDR)
{
// Execute the postponed byteswapping and masking
s_size = std::bit_cast<be_t<u32>>(s_size) & ts_mask;
u8* src = vm::_ptr<u8>(0);
u8* dst = this->ls + arg_lsa;
// Assume success, prepare the next elements
arg_lsa += fetch_size * utils::align<u32>(s_size, 16);
item_ptr += fetch_size;
arg_size -= fetch_size * 8;
// Type which is friendly for fused address calculations
constexpr usz _128 = 128;
// This whole path relies on many constraints being met (violating them crashes the real MFC); as a minor optimization, EA is assumed 16-byte aligned for transfers of 16 bytes or more
#define MOV_T(type, index, _ea) { const usz ea = _ea; *reinterpret_cast<type*>(dst + index * utils::align<u32>(sizeof(type), 16) + ea % (sizeof(type) < 16 ? 16 : 1)) = *reinterpret_cast<const type*>(src + ea); } void()
#define MOV_128(index, ea) mov_rdata(*reinterpret_cast<decltype(rdata)*>(dst + index * _128), *reinterpret_cast<const decltype(rdata)*>(src + (ea)))
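// MOV_T copies one element per list entry into LS slots spaced utils::align(sizeof(type), 16) bytes apart, adding ea % 16 for sub-16-byte types to mirror MFC LS addressing
// MOV_128 copies one 128-byte block per slot using mov_rdata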
switch (s_size)
{
case 0:
{
if (!arg_size)
{
return true;
}
continue;
}
case 1:
{
MOV_T(u8, 0, items[0].ea);
MOV_T(u8, 1, items[1].ea);
MOV_T(u8, 2, items[2].ea);
MOV_T(u8, 3, items[3].ea);
MOV_T(u8, 4, items[4].ea);
MOV_T(u8, 5, items[5].ea);
if (!arg_size)
{
return true;
}
continue;
}
case 2:
{
MOV_T(u16, 0, items[0].ea);
MOV_T(u16, 1, items[1].ea);
MOV_T(u16, 2, items[2].ea);
MOV_T(u16, 3, items[3].ea);
MOV_T(u16, 4, items[4].ea);
MOV_T(u16, 5, items[5].ea);
if (!arg_size)
{
return true;
}
continue;
}
case 4:
{
MOV_T(u32, 0, items[0].ea);
MOV_T(u32, 1, items[1].ea);
MOV_T(u32, 2, items[2].ea);
MOV_T(u32, 3, items[3].ea);
MOV_T(u32, 4, items[4].ea);
MOV_T(u32, 5, items[5].ea);
if (!arg_size)
{
return true;
}
continue;
}
case 8:
{
MOV_T(u64, 0, items[0].ea);
MOV_T(u64, 1, items[1].ea);
MOV_T(u64, 2, items[2].ea);
MOV_T(u64, 3, items[3].ea);
MOV_T(u64, 4, items[4].ea);
MOV_T(u64, 5, items[5].ea);
if (!arg_size)
{
return true;
}
continue;
}
case 16:
{
MOV_T(v128, 0, items[0].ea);
MOV_T(v128, 1, items[1].ea);
MOV_T(v128, 2, items[2].ea);
MOV_T(v128, 3, items[3].ea);
MOV_T(v128, 4, items[4].ea);
MOV_T(v128, 5, items[5].ea);
if (!arg_size)
{
return true;
}
continue;
}
case 32:
{
struct mem
{
v128 a[2];
};
MOV_T(mem, 0, items[0].ea);
MOV_T(mem, 1, items[1].ea);
MOV_T(mem, 2, items[2].ea);
MOV_T(mem, 3, items[3].ea);
MOV_T(mem, 4, items[4].ea);
MOV_T(mem, 5, items[5].ea);
if (!arg_size)
{
return true;
}
continue;
}
case 48:
{
struct mem
{
v128 a[3];
};
MOV_T(mem, 0, items[0].ea);
MOV_T(mem, 1, items[1].ea);
MOV_T(mem, 2, items[2].ea);
MOV_T(mem, 3, items[3].ea);
MOV_T(mem, 4, items[4].ea);
MOV_T(mem, 5, items[5].ea);
if (!arg_size)
{
return true;
}
continue;
}
case 64:
{
struct mem
{
v128 a[4];
};
// TODO: Optimize (four separate 16-byte copies are inefficient)
MOV_T(mem, 0, items[0].ea);
MOV_T(mem, 1, items[1].ea);
MOV_T(mem, 2, items[2].ea);
MOV_T(mem, 3, items[3].ea);
MOV_T(mem, 4, items[4].ea);
MOV_T(mem, 5, items[5].ea);
if (!arg_size)
{
return true;
}
continue;
}
case 128:
{
MOV_128(0, items[0].ea);
MOV_128(1, items[1].ea);
MOV_128(2, items[2].ea);
MOV_128(3, items[3].ea);
MOV_128(4, items[4].ea);
MOV_128(5, items[5].ea);
if (!arg_size)
{
return true;
}
continue;
}
case 256:
{
const usz ea0 = items[0].ea;
MOV_128(0, ea0 + 0);
MOV_128(1, ea0 + _128);
const usz ea1 = items[1].ea;
MOV_128(2, ea1 + 0);
MOV_128(3, ea1 + _128);
const usz ea2 = items[2].ea;
MOV_128(4, ea2 + 0);
MOV_128(5, ea2 + _128);
const usz ea3 = items[3].ea;
MOV_128(6, ea3 + 0);
MOV_128(7, ea3 + _128);
const usz ea4 = items[4].ea;
MOV_128(8, ea4 + 0);
MOV_128(9, ea4 + _128);
const usz ea5 = items[5].ea;
MOV_128(10, ea5 + 0);
MOV_128(11, ea5 + _128);
if (!arg_size)
{
return true;
}
continue;
}
case 512:
{
const usz ea0 = items[0].ea;
MOV_128(0 , ea0 + _128 * 0);
MOV_128(1 , ea0 + _128 * 1);
MOV_128(2 , ea0 + _128 * 2);
MOV_128(3 , ea0 + _128 * 3);
const usz ea1 = items[1].ea;
MOV_128(4 , ea1 + _128 * 0);
MOV_128(5 , ea1 + _128 * 1);
MOV_128(6 , ea1 + _128 * 2);
MOV_128(7 , ea1 + _128 * 3);
const usz ea2 = items[2].ea;
MOV_128(8 , ea2 + _128 * 0);
MOV_128(9 , ea2 + _128 * 1);
MOV_128(10, ea2 + _128 * 2);
MOV_128(11, ea2 + _128 * 3);
const usz ea3 = items[3].ea;
MOV_128(12, ea3 + _128 * 0);
MOV_128(13, ea3 + _128 * 1);
MOV_128(14, ea3 + _128 * 2);
MOV_128(15, ea3 + _128 * 3);
const usz ea4 = items[4].ea;
MOV_128(16, ea4 + _128 * 0);
MOV_128(17, ea4 + _128 * 1);
MOV_128(18, ea4 + _128 * 2);
MOV_128(19, ea4 + _128 * 3);
const usz ea5 = items[5].ea;
MOV_128(20, ea5 + _128 * 0);
MOV_128(21, ea5 + _128 * 1);
MOV_128(22, ea5 + _128 * 2);
MOV_128(23, ea5 + _128 * 3);
if (!arg_size)
{
return true;
}
continue;
}
default:
{
// TODO: Are more cases common enough? (only sizes below 512 bytes are worth considering; above that the benefit of this optimization is doubtful)
break;
}
}
#undef MOV_T
#undef MOV_128
// Optimization miss, revert changes
arg_lsa -= fetch_size * utils::align<u32>(s_size, 16);
item_ptr -= fetch_size;
arg_size += fetch_size * 8;
}
}
// Reset to elements array head
index = 0;
- const auto src = _ptr<const void>(args.eal);
- const v128 data0 = v128::loadu(src, 0);
- const v128 data1 = v128::loadu(src, 1);
- const v128 data2 = v128::loadu(src, 2);
- reinterpret_cast<v128*>(bufitems)[0] = data0;
- reinterpret_cast<v128*>(bufitems)[1] = data1;
- reinterpret_cast<v128*>(bufitems)[2] = data2;
}
- const u32 size = items[index].ts & 0x7fff;
+ const u32 size = items[index].ts & ts_mask;
const u32 addr = items[index].ea;
spu_log.trace("LIST: item=0x%016x, lsa=0x%05x", std::bit_cast<be_t<u64>>(items[index]), args.lsa | (addr & 0xf));
if (size)
auto check_carry_16 = [](u16 addr, u16 size)
{
#ifdef _MSC_VER
u16 out;
return _addcarry_u16(0, addr, size - 1, &out);
#else
return ((addr + size - 1) >> 16) != 0;
#endif
};
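// True when the low 16 bits of addr plus size carry out, i.e. the transfer crosses a 64KB boundary; such PUTs are not inlined below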
// Try to inline the transfer
if (addr < RAW_SPU_BASE_ADDR && size && optimization_compatible == MFC_GET_CMD)
{
const u8* src = vm::_ptr<u8>(addr);
u8* dst = this->ls + arg_lsa + (addr & 0xf);
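// MFC list transfers are 1, 2, 4, 8 bytes or a multiple of 16 bytes (up to 16 KB); the fixed-size cases handle the former, the default case the latter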
switch (u32 _size = size)
{
case 1:
{
*reinterpret_cast<u8*>(dst) = *reinterpret_cast<const u8*>(src);
break;
}
case 2:
{
*reinterpret_cast<u16*>(dst) = *reinterpret_cast<const u16*>(src);
break;
}
case 4:
{
*reinterpret_cast<u32*>(dst) = *reinterpret_cast<const u32*>(src);
break;
}
case 8:
{
*reinterpret_cast<u64*>(dst) = *reinterpret_cast<const u64*>(src);
break;
}
default:
{
if (_size > s_rep_movsb_threshold)
{
__movsb(dst, src, _size);
}
else
{
// Avoid unaligned stores in mov_rdata_avx
if (reinterpret_cast<u64>(dst) & 0x10)
{
*reinterpret_cast<v128*>(dst) = *reinterpret_cast<const v128*>(src);
dst += 16;
src += 16;
_size -= 16;
}
while (_size >= 128)
{
mov_rdata(*reinterpret_cast<spu_rdata_t*>(dst), *reinterpret_cast<const spu_rdata_t*>(src));
dst += 128;
src += 128;
_size -= 128;
}
while (_size)
{
*reinterpret_cast<v128*>(dst) = *reinterpret_cast<const v128*>(src);
dst += 16;
src += 16;
_size -= 16;
}
}
break;
}
}
arg_lsa += utils::align<u32>(size, 16);
}
// Avoid inlining huge transfers because the inline path intentionally defers releasing the range lock
else if (addr < RAW_SPU_BASE_ADDR && size - 1 <= 0x400 - 1 && optimization_compatible == MFC_PUT_CMD && !check_carry_16(static_cast<u16>(addr), static_cast<u16>(size)))
{
if (!g_use_rtm)
{
vm::range_lock(range_lock, addr & -128, utils::align<u32>(addr + size, 128) - (addr & -128));
}
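// The range lock stays held across inlined PUT elements; it is released before any non-inlined transfer, on stall-and-notify, and at the end of the function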
u8* dst = vm::_ptr<u8>(addr);
const u8* src = this->ls + arg_lsa + (addr & 0xf);
switch (u32 _size = size)
{
case 1:
{
*reinterpret_cast<u8*>(dst) = *reinterpret_cast<const u8*>(src);
break;
}
case 2:
{
*reinterpret_cast<u16*>(dst) = *reinterpret_cast<const u16*>(src);
break;
}
case 4:
{
*reinterpret_cast<u32*>(dst) = *reinterpret_cast<const u32*>(src);
break;
}
case 8:
{
*reinterpret_cast<u64*>(dst) = *reinterpret_cast<const u64*>(src);
break;
}
default:
{
if (_size > s_rep_movsb_threshold)
{
__movsb(dst, src, _size);
}
else
{
// Avoid unaligned stores in mov_rdata_avx
if (reinterpret_cast<u64>(dst) & 0x10)
{
*reinterpret_cast<v128*>(dst) = *reinterpret_cast<const v128*>(src);
dst += 16;
src += 16;
_size -= 16;
}
while (_size >= 128)
{
mov_rdata(*reinterpret_cast<spu_rdata_t*>(dst), *reinterpret_cast<const spu_rdata_t*>(src));
dst += 128;
src += 128;
_size -= 128;
}
while (_size)
{
*reinterpret_cast<v128*>(dst) = *reinterpret_cast<const v128*>(src);
dst += 16;
src += 16;
_size -= 16;
}
}
break;
}
}
arg_lsa += utils::align<u32>(size, 16);
}
else if (size)
{
range_lock->release(0);
spu_log.trace("LIST: item=0x%016x, lsa=0x%05x", std::bit_cast<be_t<u64>>(items[index]), arg_lsa | (addr & 0xf));
transfer.eal = addr;
- transfer.lsa = args.lsa | (addr & 0xf);
+ transfer.lsa = arg_lsa | (addr & 0xf);
transfer.size = size;
arg_lsa += utils::align<u32>(size, 16);
do_dma_transfer(this, transfer, ls);
- const u32 add_size = std::max<u32>(size, 16);
- args.lsa += add_size;
}
- args.size -= 8;
+ arg_size -= 8;
- if (!args.size)
+ if (!arg_size)
{
// No more elements
break;
}
- args.eal += 8;
+ item_ptr++;
- if (items[index].sb & 0x8000) [[unlikely]]
+ if (items[index].sb & 0x80) [[unlikely]]
{
range_lock->release(0);
ch_stall_mask |= utils::rol32(1, args.tag);
if (!ch_stall_stat.get_count())
@@ -2771,12 +3232,16 @@ bool spu_thread::do_list_transfer(spu_mfc_cmd& args)
ch_stall_stat.set_value(utils::rol32(1, args.tag) | ch_stall_stat.get_value());
args.tag |= 0x80; // Set stalled status
args.eal = reinterpret_cast<const u8*>(item_ptr) - this->ls;
args.lsa = arg_lsa;
args.size = arg_size;
return false;
}
index++;
}
range_lock->release(0);
return true;
}
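As an aside on the batching check above: OR and AND over a set of values coincide on a bit position exactly when all the values agree there; the batch check applies this per 64-bit lane across the three loaded vectors. Below is a minimal self-contained sketch of that property (helper name and test values are made up for illustration, plain uint64_t instead of v128):

#include <cstddef>
#include <cstdint>
#include <cstdio>

// OR collects bits set in any value, AND keeps bits set in every value;
// they match on the masked bits only if all values are identical there.
static bool all_equal_masked(const std::uint64_t* vals, std::size_t n, std::uint64_t mask)
{
	std::uint64_t ored = 0;
	std::uint64_t anded = mask;

	for (std::size_t i = 0; i < n; i++)
	{
		ored |= vals[i] & mask;
		anded &= vals[i] & mask;
	}

	return ored == anded;
}

int main()
{
	// Transfer-size field packed like the list elements above: 15 bits starting at bit 32
	constexpr std::uint64_t ts_mask = 0x7fffull << 32;

	const std::uint64_t same[3]  = { 0x0080'0000'0000ull, 0x0080'0000'1000ull, 0x0080'0000'2000ull };
	const std::uint64_t mixed[3] = { 0x0080'0000'0000ull, 0x0040'0000'0000ull, 0x0080'0000'0000ull };

	std::printf("%d %d\n", all_equal_masked(same, 3, ts_mask), all_equal_masked(mixed, 3, ts_mask)); // prints: 1 0
}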


@@ -159,19 +159,17 @@ namespace vm
void range_lock_internal(atomic_t<u64, 64>* range_lock, u32 begin, u32 size)
{
- perf_meter<"RHW_LOCK"_u64> perf0;
+ perf_meter<"RHW_LOCK"_u64> perf0(0);
- auto _cpu = get_current_cpu_thread();
+ cpu_thread* _cpu = nullptr;
- if (_cpu)
+ if (u64 to_store = begin | (u64{size} << 32); *range_lock != to_store)
{
- _cpu->state += cpu_flag::wait + cpu_flag::temp;
+ range_lock->store(to_store);
}
for (u64 i = 0;; i++)
{
- range_lock->store(begin | (u64{size} << 32));
const u64 lock_val = g_range_lock.load();
const u64 is_share = g_shmem[begin >> 16].load();
@@ -215,6 +213,11 @@ namespace vm
{
range_lock->release(0);
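// Start the RHW_LOCK timer only once contention is actually encountered, so uncontended calls don't pollute the stats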
if (!perf0)
{
perf0.restart();
}
// Try triggering a page fault (write)
// TODO: Read memory if needed
vm::_ref<atomic_t<u8>>(test / 4096 == begin / 4096 ? begin : test) += 0;
@@ -223,8 +226,26 @@ namespace vm
}
// Wait a bit before accessing global lock
- range_lock->store(0);
+ range_lock->release(0);
if (!perf0)
{
perf0.restart();
}
busy_wait(200);
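// Fetch the current thread and flag it as waiting only after a couple of failed iterations, keeping the initial spins cheap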
if (i >= 2 && !_cpu)
{
_cpu = cpu_thread::get_current();
if (_cpu)
{
_cpu->state += cpu_flag::wait + cpu_flag::temp;
}
}
range_lock->store(begin | (u64{size} << 32));
}
if (_cpu)


@@ -57,43 +57,15 @@ namespace vm
return;
}
- const u64 lock_val = g_range_lock.load();
- const u64 is_share = g_shmem[begin >> 16].load();
- #ifndef _MSC_VER
- __asm__(""); // Tiny barrier
- #endif
- u64 lock_addr = static_cast<u32>(lock_val); // -> u64
- u32 lock_size = static_cast<u32>(lock_val << range_bits >> (32 + range_bits));
- u64 addr = begin;
// Optimization: if range_locked is not used, the addr check will always pass
// Otherwise, g_shmem is unchanged and its value is reliable to read
- if ((lock_val >> range_pos) == (range_locked >> range_pos))
+ if (!g_range_lock)
{
- lock_size = 128;
- if (is_share) [[unlikely]]
- {
- addr = static_cast<u16>(begin) | is_share;
- lock_addr = lock_val;
- }
+ return;
}
- if (addr + size <= lock_addr || addr >= lock_addr + lock_size) [[likely]]
- {
- const u64 new_lock_val = g_range_lock.load();
- if (!new_lock_val || new_lock_val == lock_val) [[likely]]
- {
- return;
- }
- }
range_lock->release(0);
// Fallback to slow path
range_lock_internal(range_lock, begin, size);
}


@@ -93,6 +93,16 @@ public:
restart();
}
FORCE_INLINE SAFE_BUFFERS() perf_meter(int) noexcept
{
std::fill(std::begin(m_timestamps), std::end(m_timestamps), 0);
}
FORCE_INLINE SAFE_BUFFERS(operator bool) () const noexcept
{
return m_timestamps[0] != 0;
}
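// Constructed with the int tag, the meter starts with zeroed timestamps (not yet running); operator bool reports whether restart() has been called, letting callers such as range_lock_internal begin timing only on contention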
// Copy first timestamp
template <auto SN, auto... S>
FORCE_INLINE SAFE_BUFFERS() perf_meter(const perf_meter<SN, S...>& r) noexcept