SPU Re: more precise jt generation

Improve analyser, set v1 Fix branch indirect conditional
2025-01-27 21:35:19 +00:00 · 2018-05-10 19:38:07 +03:00 · 2018-05-10 19:38:07 +03:00 · be5c18cc85
commit be5c18cc85
parent 737db90058
4 changed files with 190 additions and 68 deletions
--- a/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp
+++ b/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp
@ -1156,27 +1156,36 @@ void spu_recompiler::branch_fixed(u32 target)
 	c->jmp(x86::rax);
 }

-void spu_recompiler::branch_indirect(spu_opcode_t op)
+void spu_recompiler::branch_indirect(spu_opcode_t op, bool jt)
 {
 	using namespace asmjit;

-	if (!instr_table.isValid())
+	if (g_cfg.core.spu_block_size == spu_block_size_type::safe && !jt)
 	{
-		// Request instruction table
-		instr_table = c->newLabel();
+		// Simply external call (return or indirect call)
+		c->mov(x86::r10, x86::qword_ptr(*cpu, addr->r64(), 1, offset32(&SPUThread::jit_dispatcher)));
+		c->xor_(qw0->r32(), qw0->r32());
 	}
+	else
+	{
+		if (!instr_table.isValid())
+		{
+			// Request instruction table
+			instr_table = c->newLabel();
+		}

-	const u32 start = instr_labels.begin()->first;
-	const u32 end = instr_labels.rbegin()->first + 4;
+		const u32 start = instr_labels.begin()->first;
+		const u32 end = instr_labels.rbegin()->first + 4;

-	// Load indirect jump address, choose between local and external
-	c->lea(x86::r10, x86::qword_ptr(instr_table));
-	c->lea(*qw1, x86::qword_ptr(*addr, 0 - start));
-	c->xor_(qw0->r32(), qw0->r32());
-	c->cmp(qw1->r32(), end - start);
-	c->cmovae(qw1->r32(), qw0->r32());
-	c->cmovb(x86::r10, x86::qword_ptr(x86::r10, *qw1, 1, 0));
-	c->cmovae(x86::r10, x86::qword_ptr(*cpu, addr->r64(), 1, offset32(&SPUThread::jit_dispatcher)));
+		// Load indirect jump address, choose between local and external
+		c->lea(x86::r10, x86::qword_ptr(instr_table));
+		c->lea(*qw1, x86::qword_ptr(*addr, 0 - start));
+		c->xor_(qw0->r32(), qw0->r32());
+		c->cmp(qw1->r32(), end - start);
+		c->cmovae(qw1->r32(), qw0->r32());
+		c->cmovb(x86::r10, x86::qword_ptr(x86::r10, *qw1, 1, 0));
+		c->cmovae(x86::r10, x86::qword_ptr(*cpu, addr->r64(), 1, offset32(&SPUThread::jit_dispatcher)));
+	}

 	if (op.d)
 	{
@ -2741,7 +2750,7 @@ void spu_recompiler::BI(spu_opcode_t op)
 {
 	c->mov(*addr, SPU_OFF_32(gpr, op.ra, &v128::_u32, 3));
 	c->and_(*addr, 0x3fffc);
-	branch_indirect(op);
+	branch_indirect(op, verify(HERE, m_targets[m_pos].size()) > 2);
 	m_pos = -1;
 }

--- a/rpcs3/Emu/Cell/SPUASMJITRecompiler.h
+++ b/rpcs3/Emu/Cell/SPUASMJITRecompiler.h
@ -107,7 +107,7 @@ private:
 	asmjit::X86Mem XmmConst(__m128i data);

 	void branch_fixed(u32 target);
-	void branch_indirect(spu_opcode_t op);
+	void branch_indirect(spu_opcode_t op, bool jt = false);
 	void fall(spu_opcode_t op);
 	void save_rcx();
 	void load_rcx();
--- a/rpcs3/Emu/Cell/SPURecompiler.cpp
+++ b/rpcs3/Emu/Cell/SPURecompiler.cpp
@ -89,7 +89,7 @@ void spu_cache::initialize()
 	}

 	// SPU cache file (version + block size type)
-	const std::string loc = _main->cache + u8"spu-§" + fmt::to_lower(g_cfg.core.spu_block_size.to_string()) + "-v0.dat";
+	const std::string loc = _main->cache + u8"spu-§" + fmt::to_lower(g_cfg.core.spu_block_size.to_string()) + "-v1.dat";

 	auto cache = std::make_shared<spu_cache>(loc);

@ -272,14 +272,16 @@ std::vector<u32> spu_recompiler_base::block(const be_t<u32>* ls, u32 lsa)
 	result.push_back(lsa);

 	// Initialize block entries
-	std::bitset<0x10000>& blocks = m_block_info;
-	blocks.reset();
-	blocks.set(lsa / 4);
+	m_block_info.reset();
+	m_block_info.set(lsa / 4);

 	// Simple block entry workload list
 	std::vector<u32> wl;
 	wl.push_back(lsa);

+	m_regmod.fill(0xff);
+	m_targets.clear();
+
 	// Value flags (TODO)
 	enum class vf : u32
 	{
@ -310,9 +312,9 @@ std::vector<u32> spu_recompiler_base::block(const be_t<u32>* ls, u32 lsa)
 			if (target > lsa)
 			{
 				// Check for redundancy
-				if (!blocks[target / 4])
+				if (!m_block_info[target / 4])
 				{
-					blocks[target / 4] = true;
+					m_block_info[target / 4] = true;
 					wl.push_back(target);
 					return;
 				}
@ -325,6 +327,8 @@ std::vector<u32> spu_recompiler_base::block(const be_t<u32>* ls, u32 lsa)

 		wl[wi] += 4;

+		m_targets.erase(pos);
+
 		// Analyse instruction
 		switch (const auto type = s_spu_itype.decode(data))
 		{
@ -336,7 +340,8 @@ std::vector<u32> spu_recompiler_base::block(const be_t<u32>* ls, u32 lsa)
 		case spu_itype::DFTSV:
 		{
 			// Stop before invalid instructions (TODO)
-			blocks[pos / 4] = true;
+			m_targets[pos].push_back(-1);
+			m_block_info[pos / 4] = true;
 			next_block();
 			continue;
 		}
@ -349,7 +354,8 @@ std::vector<u32> spu_recompiler_base::block(const be_t<u32>* ls, u32 lsa)
 			if (data == 0 || data == 3)
 			{
 				// Stop before null data
-				blocks[pos / 4] = true;
+				m_targets[pos].push_back(-1);
+				m_block_info[pos / 4] = true;
 				next_block();
 				continue;
 			}
@ -357,6 +363,7 @@ std::vector<u32> spu_recompiler_base::block(const be_t<u32>* ls, u32 lsa)
 			if (g_cfg.core.spu_block_size != spu_block_size_type::giga)
 			{
 				// Stop on special instructions (TODO)
+				m_targets[pos].push_back(-1);
 				next_block();
 				break;
 			}
@ -366,6 +373,7 @@ std::vector<u32> spu_recompiler_base::block(const be_t<u32>* ls, u32 lsa)

 		case spu_itype::IRET:
 		{
+			m_targets[pos].push_back(-1);
 			next_block();
 			break;
 		}
@ -382,6 +390,7 @@ std::vector<u32> spu_recompiler_base::block(const be_t<u32>* ls, u32 lsa)

 			if (type == spu_itype::BISL)
 			{
+				m_regmod[pos / 4] = op.rt;
 				vflags[op.rt] = +vf::is_const;
 				values[op.rt] = pos + 4;
 			}
@ -389,23 +398,24 @@ std::vector<u32> spu_recompiler_base::block(const be_t<u32>* ls, u32 lsa)
 			if (test(af, vf::is_const))
 			{
 				const u32 target = spu_branch_target(av);
+				LOG_WARNING(SPU, "[0x%x] At 0x%x: indirect branch to 0x%x", lsa, pos, target);

 				if (target == pos + 4)
 				{
 					// Nop (unless BISL)
-					break;
+					LOG_WARNING(SPU, "[0x%x] At 0x%x: indirect branch to next!", lsa, pos);
 				}

+				m_targets[pos].push_back(target);
+
 				if (type != spu_itype::BISL || g_cfg.core.spu_block_size == spu_block_size_type::giga)
 				{
-					LOG_WARNING(SPU, "[0x%x] At 0x%x: indirect branch to 0x%x", lsa, pos, target);
 					add_block(target);
 				}

-				if (type == spu_itype::BISL && target < lsa)
+				if (type == spu_itype::BISL && target >= lsa && g_cfg.core.spu_block_size == spu_block_size_type::giga)
 				{
-					next_block();
-					break;
+					add_block(pos + 4);
 				}
 			}
 			else if (type == spu_itype::BI && !op.d && !op.e)
@ -488,6 +498,8 @@ std::vector<u32> spu_recompiler_base::block(const be_t<u32>* ls, u32 lsa)
 							add_block(jt_abs[i]);
 							result[(start - lsa) / 4 + 1 + i] = se_storage<u32>::swap(jt_abs[i]);
 						}
+
+						m_targets.emplace(pos, std::move(jt_abs));
 					}

 					if (jt_rel.size() >= jt_abs.size())
@ -504,19 +516,33 @@ std::vector<u32> spu_recompiler_base::block(const be_t<u32>* ls, u32 lsa)
 							add_block(jt_rel[i]);
 							result[(start - lsa) / 4 + 1 + i] = se_storage<u32>::swap(jt_rel[i] - start);
 						}
+
+						m_targets.emplace(pos, std::move(jt_rel));
 					}
 				}
 			}

-			if (type == spu_itype::BI || type == spu_itype::BISL || g_cfg.core.spu_block_size == spu_block_size_type::safe)
+			if (type == spu_itype::BI || type == spu_itype::BISL)
 			{
 				if (type == spu_itype::BI || g_cfg.core.spu_block_size != spu_block_size_type::giga)
 				{
-					next_block();
-					break;
+					if (m_targets[pos].empty())
+					{
+						m_targets[pos].push_back(-1);
+					}
+				}
+				else
+				{
+					add_block(pos + 4);
 				}
 			}
+			else
+			{
+				m_targets[pos].push_back(pos + 4);
+				add_block(pos + 4);
+			}

+			next_block();
 			break;
 		}

@ -525,6 +551,7 @@ std::vector<u32> spu_recompiler_base::block(const be_t<u32>* ls, u32 lsa)
 		{
 			const u32 target = spu_branch_target(type == spu_itype::BRASL ? 0 : pos, op.i16);

+			m_regmod[pos / 4] = op.rt;
 			vflags[op.rt] = +vf::is_const;
 			values[op.rt] = pos + 4;

@ -534,11 +561,11 @@ std::vector<u32> spu_recompiler_base::block(const be_t<u32>* ls, u32 lsa)
 				break;
 			}

-			if (target < lsa || g_cfg.core.spu_block_size != spu_block_size_type::giga)
+			m_targets[pos].push_back(target);
+
+			if (target >= lsa && g_cfg.core.spu_block_size == spu_block_size_type::giga)
 			{
-				// Stop on direct calls
-				next_block();
-				break;
+				add_block(pos + 4);
 			}

 			if (g_cfg.core.spu_block_size == spu_block_size_type::giga)
@ -546,6 +573,7 @@ std::vector<u32> spu_recompiler_base::block(const be_t<u32>* ls, u32 lsa)
 				add_block(target);
 			}

+			next_block();
 			break;
 		}

@ -564,15 +592,16 @@ std::vector<u32> spu_recompiler_base::block(const be_t<u32>* ls, u32 lsa)
 				break;
 			}

+			m_targets[pos].push_back(target);
 			add_block(target);

-			if (type == spu_itype::BR || type == spu_itype::BRA)
+			if (type != spu_itype::BR && type != spu_itype::BRA)
 			{
-				// Stop on direct branches
-				next_block();
-				break;
+				m_targets[pos].push_back(pos + 4);
+				add_block(pos + 4);
 			}

+			next_block();
 			break;
 		}

@ -601,61 +630,131 @@ std::vector<u32> spu_recompiler_base::block(const be_t<u32>* ls, u32 lsa)

 		case spu_itype::IL:
 		{
+			m_regmod[pos / 4] = op.rt;
 			vflags[op.rt] = +vf::is_const;
 			values[op.rt] = op.si16;
 			break;
 		}
 		case spu_itype::ILA:
 		{
+			m_regmod[pos / 4] = op.rt;
 			vflags[op.rt] = +vf::is_const;
 			values[op.rt] = op.i18;
 			break;
 		}
 		case spu_itype::ILH:
 		{
+			m_regmod[pos / 4] = op.rt;
 			vflags[op.rt] = +vf::is_const;
 			values[op.rt] = op.i16 << 16 | op.i16;
 			break;
 		}
 		case spu_itype::ILHU:
 		{
+			m_regmod[pos / 4] = op.rt;
 			vflags[op.rt] = +vf::is_const;
 			values[op.rt] = op.i16 << 16;
 			break;
 		}
 		case spu_itype::IOHL:
 		{
+			m_regmod[pos / 4] = op.rt;
 			values[op.rt] = values[op.rt] | op.i16;
 			break;
 		}
 		case spu_itype::ORI:
 		{
+			m_regmod[pos / 4] = op.rt;
 			vflags[op.rt] = vflags[op.ra] & vf::is_const;
 			values[op.rt] = values[op.ra] | op.si10;
 			break;
 		}
 		case spu_itype::OR:
 		{
+			m_regmod[pos / 4] = op.rt;
 			vflags[op.rt] = vflags[op.ra] & vflags[op.rb] & vf::is_const;
 			values[op.rt] = values[op.ra] | values[op.rb];
 			break;
 		}
+		case spu_itype::ANDI:
+		{
+			m_regmod[pos / 4] = op.rt;
+			vflags[op.rt] = vflags[op.ra] & vf::is_const;
+			values[op.rt] = values[op.ra] & op.si10;
+			break;
+		}
+		case spu_itype::AND:
+		{
+			m_regmod[pos / 4] = op.rt;
+			vflags[op.rt] = vflags[op.ra] & vflags[op.rb] & vf::is_const;
+			values[op.rt] = values[op.ra] & values[op.rb];
+			break;
+		}
 		case spu_itype::AI:
 		{
+			m_regmod[pos / 4] = op.rt;
 			vflags[op.rt] = vflags[op.ra] & vf::is_const;
 			values[op.rt] = values[op.ra] + op.si10;
 			break;
 		}
 		case spu_itype::A:
 		{
+			m_regmod[pos / 4] = op.rt;
 			vflags[op.rt] = vflags[op.ra] & vflags[op.rb] & vf::is_const;
 			values[op.rt] = values[op.ra] + values[op.rb];
 			break;
 		}
+		case spu_itype::SFI:
+		{
+			m_regmod[pos / 4] = op.rt;
+			vflags[op.rt] = vflags[op.ra] & vf::is_const;
+			values[op.rt] = op.si10 - values[op.ra];
+			break;
+		}
+		case spu_itype::SF:
+		{
+			m_regmod[pos / 4] = op.rt;
+			vflags[op.rt] = vflags[op.ra] & vflags[op.rb] & vf::is_const;
+			values[op.rt] = values[op.rb] - values[op.ra];
+			break;
+		}
+		case spu_itype::ROTMI:
+		{
+			m_regmod[pos / 4] = op.rt;
+
+			if (-op.i7 & 0x20)
+			{
+				vflags[op.rt] = +vf::is_const;
+				values[op.rt] = 0;
+				break;
+			}
+
+			vflags[op.rt] = vflags[op.ra] & vf::is_const;
+			values[op.rt] = values[op.ra] >> (-op.i7 & 0x1f);
+			break;
+		}
+		case spu_itype::SHLI:
+		{
+			m_regmod[pos / 4] = op.rt;
+
+			if (op.i7 & 0x20)
+			{
+				vflags[op.rt] = +vf::is_const;
+				values[op.rt] = 0;
+				break;
+			}
+
+			vflags[op.rt] = vflags[op.ra] & vf::is_const;
+			values[op.rt] = values[op.ra] << (op.i7 & 0x1f);
+			break;
+		}
+
 		default:
 		{
 			// Unconst
-			vflags[type & spu_itype::_quadrop ? +op.rt4 : +op.rt] = {};
+			const u32 op_rt = type & spu_itype::_quadrop ? +op.rt4 : +op.rt;
+			m_regmod[pos / 4] = op_rt;
+			vflags[op_rt] = {};
 			break;
 		}
 		}
@ -783,7 +882,6 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator
 	llvm::Value* m_lsptr;

 	llvm::BasicBlock* m_stop;
-	llvm::GlobalVariable* m_jt;

 	std::array<std::pair<llvm::Value*, llvm::Value*>, 128> m_gpr;
 	std::array<llvm::Instruction*, 128> m_flush_gpr;
@ -1047,27 +1145,15 @@ public:

 		m_stop = BasicBlock::Create(m_context, "", m_function);

-		const auto jtt = ArrayType::get(GetType<u8*>(), m_size / 4);
-		std::vector<llvm::Constant*> jt;
-		jt.reserve(m_size / 4);
-
 		// Create instruction blocks
 		for (u32 i = 1, pos = start; i < func.size(); i++, pos += 4)
 		{
 			if (func[i] && m_block_info[pos / 4])
 			{
-				const auto b = BasicBlock::Create(m_context, "", m_function);
-				jt.push_back(llvm::BlockAddress::get(b));
-				m_instr_map.emplace(pos, b);
-			}
-			else
-			{
-				jt.push_back(llvm::BlockAddress::get(m_stop));
+				m_instr_map.emplace(pos, BasicBlock::Create(m_context, "", m_function));
 			}
 		}

-		m_jt = new GlobalVariable(*module, jtt, true, GlobalValue::PrivateLinkage, llvm::ConstantArray::get(jtt, jt), "jt");
-
 		update_pc();

 		const auto label_test = BasicBlock::Create(m_context, "", m_function);
@ -2764,24 +2850,43 @@ public:
 			addr.value = call(&exec_check_interrupts, m_thread, addr.value);
 		}

-		if (llvm::isa<llvm::ConstantInt>(addr.value))
+		if (const auto _int = llvm::dyn_cast<llvm::ConstantInt>(addr.value))
 		{
-			return branch_fixed(llvm::cast<llvm::ConstantInt>(addr.value)->getZExtValue());
+			LOG_WARNING(SPU, "[0x%x] Fixed branch to 0x%x", m_pos, _int->getZExtValue());
+			return branch_fixed(_int->getZExtValue());
 		}

 		m_ir->CreateStore(addr.value, spu_ptr<u32>(&SPUThread::pc));

-		const u32 start = m_instr_map.begin()->first;
-		const auto local = llvm::BasicBlock::Create(m_context, "", m_function);
-		const auto exter = llvm::BasicBlock::Create(m_context, "", m_function);
-		const auto off = m_ir->CreateSub(addr.value, m_ir->getInt32(start));
-		m_ir->CreateCondBr(m_ir->CreateICmpULT(off, m_ir->getInt32(m_size)), local, exter);
-		m_ir->SetInsertPoint(local);
-		const auto table = m_ir->CreateIndirectBr(m_ir->CreateLoad(m_ir->CreateGEP(m_jt, {(llvm::Value*)m_ir->getInt32(0), m_ir->CreateLShr(off, 2)})), m_instr_map.size() + 1);
-		for (const auto& pair : m_instr_map)
-			table->addDestination(pair.second);
-		table->addDestination(m_stop);
-		m_ir->SetInsertPoint(exter);
+		const auto tfound = m_targets.find(m_pos);
+
+		if (tfound != m_targets.end() && tfound->second.size() >= 3)
+		{
+			const u32 start = m_instr_map.begin()->first;
+
+			const std::set<u32> targets(tfound->second.begin(), tfound->second.end());
+
+			const auto exter = llvm::BasicBlock::Create(m_context, "", m_function);
+
+			const auto sw = m_ir->CreateSwitch(m_ir->CreateLShr(addr.value, 2, "", true), exter, m_size / 4);
+
+			for (u32 pos = start; pos < start + m_size; pos += 4)
+			{
+				const auto found = m_instr_map.find(pos);
+
+				if (found != m_instr_map.end() && targets.count(pos))
+				{
+					sw->addCase(m_ir->getInt32(pos / 4), found->second);
+				}
+				else
+				{
+					sw->addCase(m_ir->getInt32(pos / 4), m_stop);
+				}
+			}
+
+			m_ir->SetInsertPoint(exter);
+		}
+
 		const auto disp = m_ir->CreateAdd(m_thread, m_ir->getInt64(::offset32(&SPUThread::jit_dispatcher)));
 		const auto type = llvm::FunctionType::get(get_type<void>(), {get_type<u64>(), get_type<u64>(), get_type<u32>()}, false)->getPointerTo()->getPointerTo();
 		tail(m_ir->CreateLoad(m_ir->CreateIntToPtr(m_ir->CreateAdd(disp, zext<u64>(addr << 1).value), type)));
--- a/rpcs3/Emu/Cell/SPURecompiler.h
+++ b/rpcs3/Emu/Cell/SPURecompiler.h
@ -5,6 +5,7 @@
 #include <vector>
 #include <bitset>
 #include <memory>
+#include <string>

 // Helper class
 class spu_cache
@ -35,8 +36,15 @@ protected:
 	u32 m_pos;
 	u32 m_size;

+	// Bit indicating start of the block
 	std::bitset<0x10000> m_block_info;

+	// GPR modified by the instruction (-1 = not set)
+	std::array<u8, 0x10000> m_regmod;
+
+	// List of possible targets for the instruction ({} = next instruction, {-1} = no targets)
+	std::unordered_map<u32, std::basic_string<u32>, value_hash<u32, 2>> m_targets;
+
 	std::shared_ptr<spu_cache> m_cache;

 public: