SPU LLVM: LS Memory Mirrors (Optimize loads/stores)

Eladash 2020-07-15 21:57:39 +03:00 committed by Ivan
parent c1a80b8146
commit af1ceb1151
5 changed files with 104 additions and 31 deletions
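The core of the change: each spu_thread now reserves a host region of 3 * SPU_LS_SIZE bytes and maps the same Local Storage shared-memory block into all three slots, keeping the new ls pointer on the middle mapping. Accesses slightly below LS address 0 or slightly past SPU_LS_SIZE land on the low or high mirror and alias the correct LS bytes, which lets the LLVM recompiler relax address masking on loads and stores (see the STQX/LQX/STQD/LQD/STQR/LQR hunks below). A rough conceptual sketch of the layout, reusing the utils::memory_reserve / shm->map_critical helpers that appear in the constructor hunk (not a verbatim excerpt of the commit):

// base                 base + 0x40000        base + 0x80000
// [ low mirror ]       [ main LS view ]      [ high mirror ]
//                       ^ ls points at the middle mapping
//
// ls[-0x10]    aliases LS address 0x3fff0
// ls[0x40000]  aliases LS address 0x0
u8* base = static_cast<u8*>(utils::memory_reserve(SPU_LS_SIZE * 3));
for (u32 i = 0; i < 3; i++)
{
	// Map the same 256 KiB shared-memory block three times, back to back
	shm->map_critical(base + i * SPU_LS_SIZE);
}
u8* ls = base + SPU_LS_SIZE; // all LS accesses go through the middle mirror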

View File

@ -173,7 +173,7 @@ bool spu_thread::write_reg(const u32 addr, const u32 value)
{
case MFC_LSA_offs:
{
if (value >= 0x40000)
if (value >= SPU_LS_SIZE)
{
break;
}
@ -321,7 +321,7 @@ bool spu_thread::write_reg(const u32 addr, const u32 value)
void spu_load_exec(const spu_exec_object& elf)
{
auto ls0 = vm::cast(vm::falloc(RAW_SPU_BASE_ADDR, 0x80000, vm::spu));
auto ls0 = vm::cast(vm::falloc(RAW_SPU_BASE_ADDR, SPU_LS_SIZE, vm::spu));
auto spu = idm::make_ptr<named_thread<spu_thread>>("TEST_SPU", ls0, nullptr, 0, "", 0);
spu_thread::g_raw_spu_ctr++;
@ -331,7 +331,7 @@ void spu_load_exec(const spu_exec_object& elf)
{
if (prog.p_type == 0x1u /* LOAD */ && prog.p_memsz)
{
std::memcpy(vm::base(spu->offset + prog.p_vaddr), prog.bin.data(), prog.p_filesz);
std::memcpy(spu->_ptr<void>(prog.p_vaddr), prog.bin.data(), prog.p_filesz);
}
}

View File

@ -272,8 +272,7 @@ DECLARE(spu_runtime::g_tail_escape) = build_function_asm<void(*)(spu_thread*, sp
// Tail call, GHC CC (second arg)
c.mov(x86::r13, args[0]);
c.mov(x86::ebp, x86::dword_ptr(args[0], ::offset32(&spu_thread::offset)));
c.add(x86::rbp, x86::qword_ptr(args[0], ::offset32(&spu_thread::memory_base_addr)));
c.mov(x86::rbp, x86::qword_ptr(args[0], ::offset32(&spu_thread::ls)));
c.mov(x86::r12, args[2]);
c.xor_(x86::ebx, x86::ebx);
c.jmp(args[1]);
@ -1138,7 +1137,7 @@ void spu_recompiler_base::branch(spu_thread& spu, void*, u8* rip)
}
// Find function
const auto func = spu.jit->get_runtime().find(static_cast<u32*>(vm::base(spu.offset)), spu.pc);
const auto func = spu.jit->get_runtime().find(static_cast<u32*>(spu._ptr<void>(0)), spu.pc);
if (!func)
{
@ -7902,13 +7901,51 @@ public:
void STQX(spu_opcode_t op)
{
value_t<u64> addr = eval(zext<u64>((extract(get_vr(op.ra), 3) + extract(get_vr(op.rb), 3)) & 0x3fff0));
const auto a = get_vr(op.ra);
const auto b = get_vr(op.rb);
for (auto pair : std::initializer_list<std::pair<value_t<u32[4]>, value_t<u32[4]>>>{{a, b}, {b, a}})
{
if (auto cv = llvm::dyn_cast<llvm::Constant>(pair.first.value))
{
v128 data = get_const_vector(cv, m_pos, 10000);
data._u32[3] %= SPU_LS_SIZE;
if (data._u32[3] % 0x10 == 0)
{
value_t<u64> addr = eval(splat<u64>(data._u32[3]) + zext<u64>(extract(pair.second, 3) & 0x3fff0));
make_store_ls(addr, get_vr<u8[16]>(op.rt));
return;
}
}
}
value_t<u64> addr = eval(zext<u64>((extract(a, 3) + extract(b, 3)) & 0x3fff0));
make_store_ls(addr, get_vr<u8[16]>(op.rt));
}
void LQX(spu_opcode_t op)
{
value_t<u64> addr = eval(zext<u64>((extract(get_vr(op.ra), 3) + extract(get_vr(op.rb), 3)) & 0x3fff0));
const auto a = get_vr(op.ra);
const auto b = get_vr(op.rb);
for (auto pair : std::initializer_list<std::pair<value_t<u32[4]>, value_t<u32[4]>>>{{a, b}, {b, a}})
{
if (auto cv = llvm::dyn_cast<llvm::Constant>(pair.first.value))
{
v128 data = get_const_vector(cv, m_pos, 10000);
data._u32[3] %= SPU_LS_SIZE;
if (data._u32[3] % 0x10 == 0)
{
value_t<u64> addr = eval(splat<u64>(data._u32[3]) + zext<u64>(extract(pair.second, 3) & 0x3fff0));
set_vr(op.rt, make_load_ls(addr));
return;
}
}
}
value_t<u64> addr = eval(zext<u64>((extract(a, 3) + extract(b, 3)) & 0x3fff0));
set_vr(op.rt, make_load_ls(addr));
}
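The constant-operand path above relies on the mirrors for correctness: the constant's preferred word is wrapped into LS and, if it is 16-byte aligned after wrapping, added outside the 0x3fff0 mask, so the emitted address can exceed SPU_LS_SIZE by up to one LS size and simply falls into the high mirror. A small self-contained check of that address arithmetic with hypothetical operand values (not part of the commit):

#include <cassert>
#include <cstdint>

int main()
{
	constexpr uint32_t SPU_LS_SIZE = 0x40000;

	const uint32_t cnst = 0x50010; // hypothetical compile-time constant operand
	const uint32_t reg  = 0x3ff28; // hypothetical run-time operand

	// What the SPU actually addresses: wrap into LS, align down to 16 bytes.
	const uint32_t hw_addr = (cnst + reg) & 0x3fff0;

	// What the optimized STQX/LQX path emits: folded constant + masked register.
	const uint32_t folded  = cnst % SPU_LS_SIZE;       // 0x10010, 16-byte aligned
	const uint32_t emitted = folded + (reg & 0x3fff0); // may exceed SPU_LS_SIZE

	// The excess lands in the high mirror, which aliases the same LS bytes.
	assert(emitted % SPU_LS_SIZE == hw_addr);
	return 0;
}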
@ -7928,7 +7965,7 @@ public:
{
value_t<u64> addr;
addr.value = m_ir->CreateZExt(m_interp_magn ? m_interp_pc : get_pc(m_pos), get_type<u64>());
addr = eval(((get_imm<u64>(op.i16, false) << 2) + addr) & 0x3fff0);
addr = eval(((get_imm<u64>(op.i16, false) << 2) + addr) & (m_interp_magn ? 0x3fff0 : ~0xf));
make_store_ls(addr, get_vr<u8[16]>(op.rt));
}
@ -7936,7 +7973,7 @@ public:
{
value_t<u64> addr;
addr.value = m_ir->CreateZExt(m_interp_magn ? m_interp_pc : get_pc(m_pos), get_type<u64>());
addr = eval(((get_imm<u64>(op.i16, false) << 2) + addr) & 0x3fff0);
addr = eval(((get_imm<u64>(op.i16, false) << 2) + addr) & (m_interp_magn ? 0x3fff0 : ~0xf));
set_vr(op.rt, make_load_ls(addr));
}
@ -7953,13 +7990,13 @@ public:
}
}
value_t<u64> addr = eval(zext<u64>((extract(get_vr(op.ra), 3) + (get_imm<u32>(op.si10) << 4)) & 0x3fff0));
value_t<u64> addr = eval(zext<u64>(extract(get_vr(op.ra), 3) & 0x3fff0) + (get_imm<u64>(op.si10) << 4));
make_store_ls(addr, get_vr<u8[16]>(op.rt));
}
void LQD(spu_opcode_t op)
{
value_t<u64> addr = eval(zext<u64>((extract(get_vr(op.ra), 3) + (get_imm<u32>(op.si10) << 4)) & 0x3fff0));
value_t<u64> addr = eval(zext<u64>(extract(get_vr(op.ra), 3) & 0x3fff0) + (get_imm<u64>(op.si10) << 4));
set_vr(op.rt, make_load_ls(addr));
}
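Similarly, the D-form (STQD/LQD) hunks stop wrapping the full sum into LS: only the ra word keeps the 0x3fff0 mask and the signed, scaled si10 displacement is added afterwards, so the final address can dip below the middle mirror or run past it, which the low and high mirrors absorb. The R-form (STQR/LQR) hunks keep the 0x3fff0 wrap only on the interpreter path, where the PC is not a compile-time constant, and otherwise just align the address. A small check of the D-form case with hypothetical operands (not part of the commit):

#include <cassert>
#include <cstdint>

int main()
{
	constexpr int64_t SPU_LS_SIZE = 0x40000;

	const uint32_t ra   = 0x8;   // hypothetical preferred word of ra
	const int64_t  disp = -0x10; // si10 = -1, scaled by 16

	// Old emitted address: wrap the whole sum into LS.
	const uint32_t old_addr = static_cast<uint32_t>((ra + disp) & 0x3fff0); // 0x3fff0

	// New emitted offset: only ra is masked, the displacement stays signed,
	// so the access lands 16 bytes below the middle mirror (in the low mirror).
	const int64_t new_off = int64_t{ra & 0x3fff0} + disp; // -0x10

	// The low mirror aliases the same LS bytes the old mask produced.
	assert((new_off + SPU_LS_SIZE) % SPU_LS_SIZE == old_addr);
	return 0;
}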

View File

@ -2,6 +2,7 @@
#include "Utilities/JIT.h"
#include "Utilities/asm.h"
#include "Utilities/sysinfo.h"
#include "Emu/Memory/vm.h"
#include "Emu/Memory/vm_ptr.h"
#include "Emu/Memory/vm_reservation.h"
@ -1111,7 +1112,7 @@ void spu_thread::cpu_task()
continue;
}
spu_runtime::g_gateway(*this, vm::_ptr<u8>(offset), nullptr);
spu_runtime::g_gateway(*this, _ptr<u8>(0), nullptr);
}
// Print some stats
@ -1129,7 +1130,7 @@ void spu_thread::cpu_task()
break;
}
spu_runtime::g_interpreter(*this, vm::_ptr<u8>(offset), nullptr);
spu_runtime::g_interpreter(*this, _ptr<u8>(0), nullptr);
}
}
@ -1148,8 +1149,21 @@ void spu_thread::cpu_unmem()
spu_thread::~spu_thread()
{
// Deallocate Local Storage
vm::dealloc_verbose_nothrow(offset);
{
const auto [_, shm] = vm::get(vm::any, offset)->get(offset);
for (s32 i = -1; i < 2; i++)
{
// Unmap LS mirrors
shm->unmap_critical(ls + (i * SPU_LS_SIZE));
}
// Deallocate Local Storage
vm::dealloc_verbose_nothrow(offset);
}
// Release LS mirrors area
utils::memory_release(ls - SPU_LS_SIZE, SPU_LS_SIZE * 3);
// Deallocate RawSPU ID
if (!group && offset >= RAW_SPU_BASE_ADDR)
@ -1159,11 +1173,26 @@ spu_thread::~spu_thread()
}
}
spu_thread::spu_thread(vm::addr_t ls, lv2_spu_group* group, u32 index, std::string_view name, u32 lv2_id, bool is_isolated)
spu_thread::spu_thread(vm::addr_t _ls, lv2_spu_group* group, u32 index, std::string_view name, u32 lv2_id, bool is_isolated)
: cpu_thread(idm::last_id())
, is_isolated(is_isolated)
, index(index)
, offset(ls)
, offset(_ls)
, ls([&]()
{
const auto [_, shm] = vm::get(vm::any, _ls)->get(_ls);
const auto addr = static_cast<u8*>(utils::memory_reserve(SPU_LS_SIZE * 3));
for (u32 i = 0; i < 3; i++)
{
// Map LS mirrors
const auto ptr = addr + (i * SPU_LS_SIZE);
verify(HERE), shm->map_critical(ptr) == ptr;
}
// Use the middle mirror
return addr + SPU_LS_SIZE;
}())
, group(group)
, lv2_id(lv2_id)
, spu_tname(stx::shared_cptr<std::string>::make(name))
@ -1233,7 +1262,7 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
}
u32 value;
if ((eal - RAW_SPU_BASE_ADDR) % RAW_SPU_OFFSET + args.size - 1 < 0x40000) // LS access
if ((eal - RAW_SPU_BASE_ADDR) % RAW_SPU_OFFSET + args.size - 1 < SPU_LS_SIZE) // LS access
{
}
else if (args.size == 4 && is_get && thread->read_reg(eal, value))
@ -1258,7 +1287,7 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
{
auto& spu = static_cast<spu_thread&>(*group->threads[group->threads_map[index]]);
if (offset + args.size - 1 < 0x40000) // LS access
if (offset + args.size - 1 < SPU_LS_SIZE) // LS access
{
eal = spu.offset + offset; // redirect access
}
@ -1282,7 +1311,7 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
auto [dst, src] = [&]() -> std::pair<u8*, const u8*>
{
u8* dst = vm::_ptr<u8>(eal);
u8* src = vm::_ptr<u8>(offset + lsa);
u8* src = _ptr<u8>(lsa);
if (is_get)
{
@ -1638,6 +1667,7 @@ bool spu_thread::do_list_transfer(spu_mfc_cmd& args)
transfer.cmd = MFC(args.cmd & ~MFC_LIST_MASK);
args.lsa &= 0x3fff0;
args.eal &= 0x3fff8;
u32 index = fetch_size;
@ -1650,7 +1680,7 @@ bool spu_thread::do_list_transfer(spu_mfc_cmd& args)
// Reset to elements array head
index = 0;
const auto src = _ptr<const void>(args.eal & 0x3fff8);
const auto src = _ptr<const void>(args.eal);
const v128 data0 = v128::loadu(src, 0);
const v128 data1 = v128::loadu(src, 1);
const v128 data2 = v128::loadu(src, 2);
@ -2947,7 +2977,7 @@ bool spu_thread::stop_and_signal(u32 code)
spu_log.warning("STOP 0x0");
// HACK: find an ILA instruction
for (u32 addr = pc; addr < 0x40000; addr += 4)
for (u32 addr = pc; addr < SPU_LS_SIZE; addr += 4)
{
const u32 instr = _ref<u32>(addr);

View File

@ -118,6 +118,11 @@ enum : u32
SPU_STATUS_IS_ISOLATED = 0x80,
};
enum : s32
{
SPU_LS_SIZE = 0x40000,
};
enum : u32
{
SYS_SPU_THREAD_BASE_LOW = 0xf0000000,
@ -636,6 +641,7 @@ public:
const u32 index; // SPU index
const u32 offset; // SPU LS offset
const std::add_pointer_t<u8> ls; // SPU LS pointer
private:
lv2_spu_group* const group; // SPU Thread Group (only safe to access in the spu thread itself)
public:
@ -682,7 +688,7 @@ public:
template<typename T>
inline to_be_t<T>* _ptr(u32 lsa)
{
return static_cast<to_be_t<T>*>(vm::base(offset + lsa));
return reinterpret_cast<to_be_t<T>*>(ls + lsa);
}
// Convert specified SPU LS address to a reference of specified (possibly converted to BE) type
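With the new ls member, _ptr (and the _ref helper built on it) resolves an LS address to a plain host pointer into the middle mirror instead of going through vm::base(offset + lsa). A minimal usage sketch, not from the commit; spu, buffer and lsa are hypothetical names:

// Read a big-endian u32 at LS address 0x100 through the mirror mapping.
const u32 word = spu._ref<u32>(0x100);

// Copy a quadword out of LS; no vm address translation is involved.
std::memcpy(buffer, spu._ptr<void>(lsa & 0x3fff0), 16);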

View File

@ -397,7 +397,7 @@ error_code sys_spu_thread_initialize(ppu_thread& ppu, vm::ptr<u32> thread, u32 g
sys_spu.warning("Unimplemented SPU Thread options (0x%x)", option);
}
const vm::addr_t ls_addr{verify("SPU LS" HERE, vm::alloc(0x80000, vm::main))};
const vm::addr_t ls_addr{verify("SPU LS" HERE, vm::alloc(SPU_LS_SIZE, vm::main))};
const u32 inited = group->init;
@ -579,7 +579,7 @@ error_code sys_spu_thread_group_create(ppu_thread& ppu, vm::ptr<u32> id, u32 num
if (type & SYS_SPU_THREAD_GROUP_TYPE_COOPERATE_WITH_SYSTEM)
{
// Constant size, unknown what it means but it's definitely not for each spu thread alone
mem_size = 0x40000;
mem_size = SPU_LS_SIZE;
use_scheduler = false;
}
else if (type & SYS_SPU_THREAD_GROUP_TYPE_NON_CONTEXT)
@ -591,7 +591,7 @@ error_code sys_spu_thread_group_create(ppu_thread& ppu, vm::ptr<u32> id, u32 num
else
{
// 256kb for each spu thread, probably for saving and restoring SPU LS (used by scheduler?)
mem_size = 0x40000 * num;
mem_size = SPU_LS_SIZE * num;
}
if (num < min_threads || num > max_threads ||
@ -1225,7 +1225,7 @@ error_code sys_spu_thread_write_ls(ppu_thread& ppu, u32 id, u32 lsa, u64 value,
sys_spu.trace("sys_spu_thread_write_ls(id=0x%x, lsa=0x%05x, value=0x%llx, type=%d)", id, lsa, value, type);
if (lsa >= 0x40000 || type > 8 || !type || (type | lsa) & (type - 1)) // check range and alignment
if (lsa >= SPU_LS_SIZE || type > 8 || !type || (type | lsa) & (type - 1)) // check range and alignment
{
return CELL_EINVAL;
}
@ -1268,7 +1268,7 @@ error_code sys_spu_thread_read_ls(ppu_thread& ppu, u32 id, u32 lsa, vm::ptr<u64>
sys_spu.trace("sys_spu_thread_read_ls(id=0x%x, lsa=0x%05x, value=*0x%x, type=%d)", id, lsa, value, type);
if (lsa >= 0x40000 || type > 8 || !type || (type | lsa) & (type - 1)) // check range and alignment
if (lsa >= SPU_LS_SIZE || type > 8 || !type || (type | lsa) & (type - 1)) // check range and alignment
{
return CELL_EINVAL;
}
@ -1831,7 +1831,7 @@ error_code sys_raw_spu_create(ppu_thread& ppu, vm::ptr<u32> id, vm::ptr<void> at
index = 0;
}
const vm::addr_t ls_addr{verify(HERE, vm::falloc(RAW_SPU_BASE_ADDR + RAW_SPU_OFFSET * index, 0x40000, vm::spu))};
const vm::addr_t ls_addr{verify(HERE, vm::falloc(RAW_SPU_BASE_ADDR + RAW_SPU_OFFSET * index, SPU_LS_SIZE, vm::spu))};
const u32 tid = idm::make<named_thread<spu_thread>>(fmt::format("RawSPU[0x%x] ", index), ls_addr, nullptr, index, "", index);
@ -1879,7 +1879,7 @@ error_code sys_isolated_spu_create(ppu_thread& ppu, vm::ptr<u32> id, vm::ptr<voi
index = 0;
}
const vm::addr_t ls_addr{verify(HERE, vm::falloc(RAW_SPU_BASE_ADDR + RAW_SPU_OFFSET * index, 0x40000, vm::spu))};
const vm::addr_t ls_addr{verify(HERE, vm::falloc(RAW_SPU_BASE_ADDR + RAW_SPU_OFFSET * index, SPU_LS_SIZE, vm::spu))};
const auto thread = idm::make_ptr<named_thread<spu_thread>>(fmt::format("IsoSPU[0x%x] ", index), ls_addr, nullptr, index, "", index, true);