
SPU LLVM: add SPU profiling to compilation thread

Run another thread to collect profile data from SPU threads.
Use this data to prioritize compilation of hot-spot SPU blocks.
Implement stx::init_mutex::wait_for_initialized() helper.
Author: Nekotekina
Date:   2019-11-11 01:10:23 +03:00
Parent: a21d7def33
Commit: abbf3c4d16

2 changed files with 128 additions and 26 deletions
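
In short, the commit adds a sampling profiler: a helper thread periodically reads each running SPU thread's current block hash and bumps a per-hash counter, and the compiler thread then picks the pending block with the highest count instead of compiling in queue order. A minimal standalone sketch of that selection policy follows, using plain std containers and hypothetical names (block_item, sample, pick_hottest) in place of RPCS3's lf_queue/atomic_t types; in the real code below, insertions into the maps are synchronized with the profiler via stx::init_mutex.

	#include <atomic>
	#include <cstdint>
	#include <unordered_map>

	struct block_item {}; // hypothetical stand-in for spu_item

	// Pending blocks (hash -> item) and sample counters (hash -> hits)
	std::unordered_multimap<std::uint64_t, block_item*> enqueued;
	std::unordered_map<std::uint64_t, std::atomic<std::uint64_t>> samples;

	// Profiler side: bump the counter for the block a thread is executing.
	void sample(std::uint64_t current_block_hash)
	{
		const auto found = samples.find(current_block_hash);

		if (found != samples.end())
		{
			found->second.fetch_add(1, std::memory_order_relaxed);
		}
	}

	// Compiler side: pick the most-sampled pending block (linear scan,
	// mirroring the loop in the diff below).
	auto pick_hottest()
	{
		auto found_it = enqueued.begin();
		std::uint64_t sample_max = 0;

		for (auto it = enqueued.begin(); it != enqueued.end(); ++it)
		{
			const std::uint64_t cur = samples.at(it->first).load(std::memory_order_relaxed);

			if (cur > sample_max)
			{
				sample_max = cur;
				found_it = it;
			}
		}

		return found_it; // caller compiles this item, then erases it
	}

Note that blocks with zero samples still get compiled eventually: every pass removes the selected item, so the queue always drains.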

@@ -7,6 +7,7 @@
 #include "Utilities/StrUtil.h"
 #include "Utilities/JIT.h"
 #include "Utilities/sysinfo.h"
+#include "util/init_mutex.hpp"
 #include "SPUThread.h"
 #include "SPUAnalyser.h"
@@ -8266,7 +8267,7 @@ std::unique_ptr<spu_recompiler_base> spu_recompiler_base::make_llvm_recompiler(u
 struct spu_llvm
 {
 	// Workload
-	lf_queue<spu_item*> registered;
+	lf_queue<std::pair<const u64, spu_item*>> registered;
 
 	void operator()()
 	{
@@ -8277,22 +8278,96 @@ struct spu_llvm
 		// Fake LS
 		std::vector<be_t<u32>> ls(0x10000);
 
-		for (auto* parg : registered)
+		// To compile (hash -> item)
+		std::unordered_multimap<u64, spu_item*, value_hash<u64>> enqueued;
+
+		// Mini-profiler (hash -> number of occurrences)
+		std::unordered_map<u64, atomic_t<u64>, value_hash<u64>> samples;
+
+		// For synchronization with profiler thread
+		stx::init_mutex prof_mutex;
+
+		named_thread profiler("SPU LLVM Profiler"sv, [&]()
 		{
-			if (thread_ctrl::state() == thread_state::aborting)
+			while (thread_ctrl::state() != thread_state::aborting)
 			{
-				break;
+				{
+					// Lock if enabled
+					const auto lock = prof_mutex.access();
+
+					if (!lock)
+					{
+						// Wait when the profiler is disabled
+						prof_mutex.wait_for_initialized();
+						continue;
+					}
+
+					// Collect profiling samples
+					idm::select<named_thread<spu_thread>>([&](u32 id, spu_thread& spu)
+					{
+						const u64 name = atomic_storage<u64>::load(spu.block_hash);
+
+						if (!(spu.state.load() & (cpu_flag::wait + cpu_flag::stop + cpu_flag::dbg_global_pause)))
+						{
+							const auto found = std::as_const(samples).find(name);
+
+							if (found != std::as_const(samples).end())
+							{
+								const_cast<atomic_t<u64>&>(found->second)++;
+							}
+						}
+					});
+				}
+
+				// Sleep for a short period if enabled
+				thread_ctrl::wait_for(20, false);
 			}
+		});
+
+		while (thread_ctrl::state() != thread_state::aborting)
+		{
+			for (const auto& pair : registered.pop_all())
+			{
+				enqueued.emplace(pair);
+
+				// Interrupt and kick profiler thread
+				const auto lock = prof_mutex.init_always([&]{});
+
+				// Register new blocks to collect samples
+				samples.emplace(pair.first, 0);
+			}
 
-			if (!parg)
+			if (enqueued.empty())
 			{
+				// Interrupt profiler thread and put it to sleep
+				static_cast<void>(prof_mutex.reset());
+
 				registered.wait();
 				continue;
 			}
 
-			const std::vector<u32>& func = (*parg)->data;
+			// Find the most used enqueued item
+			u64 sample_max = 0;
+			auto found_it = enqueued.begin();
+
+			for (auto it = enqueued.begin(), end = enqueued.end(); it != end; ++it)
+			{
+				const u64 cur = std::as_const(samples).at(it->first);
+
+				if (cur > sample_max)
+				{
+					sample_max = cur;
+					found_it = it;
+				}
+			}
+
+			// Start compiling
+			const std::vector<u32>& func = found_it->second->data;
 
 			// Old function pointer (pre-recompiled)
-			const spu_function_t _old = (*parg)->compiled;
+			const spu_function_t _old = found_it->second->compiled;
+
+			// Remove item from the queue
+			enqueued.erase(found_it);
 
 			// Get data start
 			const u32 start = func[0];
@@ -8381,7 +8456,7 @@ struct spu_fast : public spu_recompiler_base
 	}
 
 	// Allocate executable area with necessary size
-	const auto result = jit_runtime::alloc(16 + 1 + 9 + (::size32(func) - 1) * (16 + 16) + 36 + 47, 16);
+	const auto result = jit_runtime::alloc(22 + 1 + 9 + (::size32(func) - 1) * (16 + 16) + 36 + 47, 16);
 
 	if (!result)
 	{
@@ -8391,18 +8466,42 @@ struct spu_fast : public spu_recompiler_base
 	m_pos = func[0];
 	m_size = (::size32(func) - 1) * 4;
 
+	{
+		sha1_context ctx;
+		u8 output[20];
+
+		sha1_starts(&ctx);
+		sha1_update(&ctx, reinterpret_cast<const u8*>(func.data() + 1), func.size() * 4 - 4);
+		sha1_finish(&ctx, output);
+
+		be_t<u64> hash_start;
+		std::memcpy(&hash_start, output, sizeof(hash_start));
+		m_hash_start = hash_start;
+	}
+
 	u8* raw = result;
 
-	// 8-byte instruction for patching
-	// Update block_hash: mov [r13 + spu_thread::m_block_hash], 0xffff
+	// 8-byte instruction for patching (long NOP)
+	*raw++ = 0x0f;
+	*raw++ = 0x1f;
+	*raw++ = 0x84;
+	*raw++ = 0;
+	*raw++ = 0;
+	*raw++ = 0;
+	*raw++ = 0;
+	*raw++ = 0;
+
+	// mov rax, m_hash_start
+	*raw++ = 0x48;
+	*raw++ = 0xb8;
+	std::memcpy(raw, &m_hash_start, sizeof(m_hash_start));
+	raw += 8;
+
+	// Update block_hash: mov [r13 + spu_thread::m_block_hash], rax
 	*raw++ = 0x49;
-	*raw++ = 0xc7;
+	*raw++ = 0x89;
 	*raw++ = 0x45;
 	*raw++ = ::narrow<s8>(::offset32(&spu_thread::block_hash));
-	*raw++ = 0xff;
-	*raw++ = 0xff;
-	*raw++ = 0x00;
-	*raw++ = 0x00;
 
 	// Load PC: mov eax, [r13 + spu_thread::pc]
 	*raw++ = 0x41;
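
For reference, the three instructions of the new prologue decode as follows (standard x86-64 encodings; the annotation is mine, not part of the commit):

	0f 1f 84 00 00 00 00 00   nop dword [rax+rax*1+0]   ; 8-byte patchable NOP
	48 b8 <imm64>             mov rax, <m_hash_start>   ; 10 bytes
	49 89 45 <disp8>          mov [r13+<disp8>], rax    ; 4 bytes; r13 holds the spu_thread pointer

That totals 8 + 10 + 4 = 22 bytes, matching the constant bumped in the jit_runtime::alloc call above. The previous value of 16 appears to have covered the two 8-byte mov [r13+disp8], imm32 hash stores: the 0xffff store replaced here and the 0xfffe epilogue store deleted in the next hunk.
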
@@ -8445,16 +8544,6 @@ struct spu_fast : public spu_recompiler_base
 	// trap
 	//*raw++ = 0xcc;
 
-	// Update block_hash: mov [r13 + spu_thread::m_block_hash], 0xfffe
-	*raw++ = 0x49;
-	*raw++ = 0xc7;
-	*raw++ = 0x45;
-	*raw++ = ::narrow<s8>(::offset32(&spu_thread::block_hash));
-	*raw++ = 0xfe;
-	*raw++ = 0xff;
-	*raw++ = 0x00;
-	*raw++ = 0x00;
-
 	// Secondary prologue: sub rsp,0x28
 	*raw++ = 0x48;
 	*raw++ = 0x83;
@@ -8667,7 +8756,7 @@ struct spu_fast : public spu_recompiler_base
 	if (added)
 	{
 		// Send work to LLVM compiler thread
-		g_fxo->get<spu_llvm_thread>()->registered.push(add_loc);
+		g_fxo->get<spu_llvm_thread>()->registered.push(m_hash_start, add_loc);
 	}
 
 	// Rebuild trampoline if necessary

@@ -262,5 +262,18 @@ namespace stx
 		{
 			return (m_state & c_init_bit) != 0;
 		}
+
+		// Wait for access()
+		void wait_for_initialized() const noexcept
+		{
+			const u32 state = m_state;
+
+			if (state & c_init_bit)
+			{
+				return;
+			}
+
+			m_state.wait(state);
+		}
 	};
}
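
Taken together with access(), init_always() and reset() from the compiler thread above, wait_for_initialized() gives the profiler a cheap way to park while profiling is disabled. A condensed sketch of the handshake, using the names from the diff (loop bodies elided, not verbatim code):

	// Profiler (consumer): run only while prof_mutex is initialized.
	while (thread_ctrl::state() != thread_state::aborting)
	{
		const auto lock = prof_mutex.access();

		if (!lock)
		{
			// reset() was called: sleep until the next init_always()
			prof_mutex.wait_for_initialized();
			continue;
		}

		// ... collect samples under the access lock ...
	}

	// Compiler (producer): toggle profiling around its own work.
	const auto lock = prof_mutex.init_always([&]{}); // enable, wake waiters
	static_cast<void>(prof_mutex.reset());           // disable, waiters park again

Note the helper re-reads m_state and passes the observed value to m_state.wait(), so an initialization that lands between the load and the wait makes the wait return immediately rather than missing the wakeup.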