
SPU LLVM: add SPU profiling to compilation thread

Run another thread to collect profile data from SPU threads.
Use this data to prioritize compilation of hot-spot SPU blocks.
Implement stx::init_mutex::wait_for_initialized() helper.
Author: Nekotekina
Date:   2019-11-11 01:10:23 +03:00
Parent: a21d7def33
Commit: abbf3c4d16

2 changed files with 128 additions and 26 deletions
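
In short, the commit adds a sampling profiler: a helper thread periodically reads each running SPU thread's current block hash and bumps a per-hash counter, and the compiler thread then picks the pending block with the highest count instead of compiling in queue order. A minimal standalone sketch of that selection policy follows, using plain std containers and hypothetical names (block_item, sample, pick_hottest) in place of RPCS3's lf_queue/atomic_t types; in the real code below, insertions into the maps are synchronized with the profiler via stx::init_mutex.

	#include <atomic>
	#include <cstdint>
	#include <unordered_map>

	struct block_item {}; // hypothetical stand-in for spu_item

	// Pending blocks (hash -> item) and sample counters (hash -> hits)
	std::unordered_multimap<std::uint64_t, block_item*> enqueued;
	std::unordered_map<std::uint64_t, std::atomic<std::uint64_t>> samples;

	// Profiler side: bump the counter for the block a thread is executing.
	void sample(std::uint64_t current_block_hash)
	{
		const auto found = samples.find(current_block_hash);

		if (found != samples.end())
		{
			found->second.fetch_add(1, std::memory_order_relaxed);
		}
	}

	// Compiler side: pick the most-sampled pending block (linear scan,
	// mirroring the loop in the diff below).
	auto pick_hottest()
	{
		auto found_it = enqueued.begin();
		std::uint64_t sample_max = 0;

		for (auto it = enqueued.begin(); it != enqueued.end(); ++it)
		{
			const std::uint64_t cur = samples.at(it->first).load(std::memory_order_relaxed);

			if (cur > sample_max)
			{
				sample_max = cur;
				found_it = it;
			}
		}

		return found_it; // caller compiles this item, then erases it
	}

Note that blocks with zero samples still get compiled eventually: every pass removes the selected item, so the queue always drains.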

@@ -7,6 +7,7 @@
 #include "Utilities/StrUtil.h"
 #include "Utilities/JIT.h"
 #include "Utilities/sysinfo.h"
+#include "util/init_mutex.hpp"
 #include "SPUThread.h"
 #include "SPUAnalyser.h"
@@ -8266,7 +8267,7 @@ std::unique_ptr<spu_recompiler_base> spu_recompiler_base::make_llvm_recompiler(u
 struct spu_llvm
 {
 	// Workload
-	lf_queue<spu_item*> registered;
+	lf_queue<std::pair<const u64, spu_item*>> registered;
 
 	void operator()()
 	{
@@ -8277,22 +8278,96 @@ struct spu_llvm
 		// Fake LS
 		std::vector<be_t<u32>> ls(0x10000);
 
-		for (auto* parg : registered)
+		// To compile (hash -> item)
+		std::unordered_multimap<u64, spu_item*, value_hash<u64>> enqueued;
+
+		// Mini-profiler (hash -> number of occurrences)
+		std::unordered_map<u64, atomic_t<u64>, value_hash<u64>> samples;
+
+		// For synchronization with profiler thread
+		stx::init_mutex prof_mutex;
+
+		named_thread profiler("SPU LLVM Profiler"sv, [&]()
 		{
-			if (thread_ctrl::state() == thread_state::aborting)
+			while (thread_ctrl::state() != thread_state::aborting)
 			{
-				break;
+				{
+					// Lock if enabled
+					const auto lock = prof_mutex.access();
+
+					if (!lock)
+					{
+						// Wait when the profiler is disabled
+						prof_mutex.wait_for_initialized();
+						continue;
+					}
+
+					// Collect profiling samples
+					idm::select<named_thread<spu_thread>>([&](u32 id, spu_thread& spu)
+					{
+						const u64 name = atomic_storage<u64>::load(spu.block_hash);
+
+						if (!(spu.state.load() & (cpu_flag::wait + cpu_flag::stop + cpu_flag::dbg_global_pause)))
+						{
+							const auto found = std::as_const(samples).find(name);
+
+							if (found != std::as_const(samples).end())
+							{
+								const_cast<atomic_t<u64>&>(found->second)++;
+							}
+						}
+					});
+				}
+
+				// Sleep for a short period if enabled
+				thread_ctrl::wait_for(20, false);
 			}
+		});
+
+		while (thread_ctrl::state() != thread_state::aborting)
+		{
+			for (const auto& pair : registered.pop_all())
+			{
+				enqueued.emplace(pair);
+
+				// Interrupt and kick profiler thread
+				const auto lock = prof_mutex.init_always([&]{});
+
+				// Register new blocks to collect samples
+				samples.emplace(pair.first, 0);
+			}
 
-			if (!parg)
+			if (enqueued.empty())
 			{
+				// Interrupt profiler thread and put it to sleep
+				static_cast<void>(prof_mutex.reset());
+
 				registered.wait();
 				continue;
 			}
 
-			const std::vector<u32>& func = (*parg)->data;
+			// Find the most used enqueued item
+			u64 sample_max = 0;
+			auto found_it = enqueued.begin();
+
+			for (auto it = enqueued.begin(), end = enqueued.end(); it != end; ++it)
+			{
+				const u64 cur = std::as_const(samples).at(it->first);
+
+				if (cur > sample_max)
+				{
+					sample_max = cur;
+					found_it = it;
+				}
+			}
+
+			// Start compiling
+			const std::vector<u32>& func = found_it->second->data;
 
 			// Old function pointer (pre-recompiled)
-			const spu_function_t _old = (*parg)->compiled;
+			const spu_function_t _old = found_it->second->compiled;
+
+			// Remove item from the queue
+			enqueued.erase(found_it);
 
 			// Get data start
 			const u32 start = func[0];
@@ -8381,7 +8456,7 @@ struct spu_fast : public spu_recompiler_base
 	}
 
 	// Allocate executable area with necessary size
-	const auto result = jit_runtime::alloc(16 + 1 + 9 + (::size32(func) - 1) * (16 + 16) + 36 + 47, 16);
+	const auto result = jit_runtime::alloc(22 + 1 + 9 + (::size32(func) - 1) * (16 + 16) + 36 + 47, 16);
 
 	if (!result)
 	{
@@ -8391,18 +8466,42 @@ struct spu_fast : public spu_recompiler_base
 	m_pos = func[0];
 	m_size = (::size32(func) - 1) * 4;
 
+	{
+		sha1_context ctx;
+		u8 output[20];
+
+		sha1_starts(&ctx);
+		sha1_update(&ctx, reinterpret_cast<const u8*>(func.data() + 1), func.size() * 4 - 4);
+		sha1_finish(&ctx, output);
+
+		be_t<u64> hash_start;
+		std::memcpy(&hash_start, output, sizeof(hash_start));
+		m_hash_start = hash_start;
+	}
+
 	u8* raw = result;
 
-	// 8-byte instruction for patching
-	// Update block_hash: mov [r13 + spu_thread::m_block_hash], 0xffff
+	// 8-byte instruction for patching (long NOP)
+	*raw++ = 0x0f;
+	*raw++ = 0x1f;
+	*raw++ = 0x84;
+	*raw++ = 0;
+	*raw++ = 0;
+	*raw++ = 0;
+	*raw++ = 0;
+	*raw++ = 0;
+
+	// mov rax, m_hash_start
+	*raw++ = 0x48;
+	*raw++ = 0xb8;
+	std::memcpy(raw, &m_hash_start, sizeof(m_hash_start));
+	raw += 8;
+
+	// Update block_hash: mov [r13 + spu_thread::m_block_hash], rax
 	*raw++ = 0x49;
-	*raw++ = 0xc7;
+	*raw++ = 0x89;
 	*raw++ = 0x45;
 	*raw++ = ::narrow<s8>(::offset32(&spu_thread::block_hash));
-	*raw++ = 0xff;
-	*raw++ = 0xff;
-	*raw++ = 0x00;
-	*raw++ = 0x00;
 
 	// Load PC: mov eax, [r13 + spu_thread::pc]
 	*raw++ = 0x41;
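
For reference, the three instructions of the new prologue decode as follows (standard x86-64 encodings; the annotation is mine, not part of the commit):

	0f 1f 84 00 00 00 00 00   nop dword [rax+rax*1+0]   ; 8-byte patchable NOP
	48 b8 <imm64>             mov rax, <m_hash_start>   ; 10 bytes
	49 89 45 <disp8>          mov [r13+<disp8>], rax    ; 4 bytes; r13 holds the spu_thread pointer

That totals 8 + 10 + 4 = 22 bytes, matching the constant bumped in the jit_runtime::alloc call above. The previous value of 16 appears to have covered the two 8-byte mov [r13+disp8], imm32 hash stores: the 0xffff store replaced here and the 0xfffe epilogue store deleted in the next hunk.
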
@@ -8445,16 +8544,6 @@ struct spu_fast : public spu_recompiler_base
 	// trap
 	//*raw++ = 0xcc;
 
-	// Update block_hash: mov [r13 + spu_thread::m_block_hash], 0xfffe
-	*raw++ = 0x49;
-	*raw++ = 0xc7;
-	*raw++ = 0x45;
-	*raw++ = ::narrow<s8>(::offset32(&spu_thread::block_hash));
-	*raw++ = 0xfe;
-	*raw++ = 0xff;
-	*raw++ = 0x00;
-	*raw++ = 0x00;
-
 	// Secondary prologue: sub rsp,0x28
 	*raw++ = 0x48;
 	*raw++ = 0x83;
@@ -8667,7 +8756,7 @@ struct spu_fast : public spu_recompiler_base
 	if (added)
 	{
 		// Send work to LLVM compiler thread
-		g_fxo->get<spu_llvm_thread>()->registered.push(add_loc);
+		g_fxo->get<spu_llvm_thread>()->registered.push(m_hash_start, add_loc);
 	}
 
 	// Rebuild trampoline if necessary

@@ -262,5 +262,18 @@ namespace stx
 		{
 			return (m_state & c_init_bit) != 0;
 		}
+
+		// Wait for access()
+		void wait_for_initialized() const noexcept
+		{
+			const u32 state = m_state;
+
+			if (state & c_init_bit)
+			{
+				return;
+			}
+
+			m_state.wait(state);
+		}
 	};
}
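
Taken together with access(), init_always() and reset() from the compiler thread above, wait_for_initialized() gives the profiler a cheap way to park while profiling is disabled. A condensed sketch of the handshake, using the names from the diff (loop bodies elided, not verbatim code):

	// Profiler (consumer): run only while prof_mutex is initialized.
	while (thread_ctrl::state() != thread_state::aborting)
	{
		const auto lock = prof_mutex.access();

		if (!lock)
		{
			// reset() was called: sleep until the next init_always()
			prof_mutex.wait_for_initialized();
			continue;
		}

		// ... collect samples under the access lock ...
	}

	// Compiler (producer): toggle profiling around its own work.
	const auto lock = prof_mutex.init_always([&]{}); // enable, wake waiters
	static_cast<void>(prof_mutex.reset());           // disable, waiters park again

Note the helper re-reads m_state and passes the observed value to m_state.wait(), so an initialization that lands between the load and the wait makes the wait return immediately rather than missing the wakeup.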