mirror of
https://github.com/RPCS3/rpcs3.git
synced 2025-02-19 03:39:54 +00:00
SPU LLVM: add SPU profiling to compilation thread
Run another thread to collect profile data from SPU threads. Use this data to prioritize compiling hot spot SPU blocks. Implement stx::init_mutex::wait_for_initialized() helper.
This commit is contained in:
parent
a21d7def33
commit
abbf3c4d16
rpcs3
@ -7,6 +7,7 @@
|
||||
#include "Utilities/StrUtil.h"
|
||||
#include "Utilities/JIT.h"
|
||||
#include "Utilities/sysinfo.h"
|
||||
#include "util/init_mutex.hpp"
|
||||
|
||||
#include "SPUThread.h"
|
||||
#include "SPUAnalyser.h"
|
||||
@ -8266,7 +8267,7 @@ std::unique_ptr<spu_recompiler_base> spu_recompiler_base::make_llvm_recompiler(u
|
||||
struct spu_llvm
|
||||
{
|
||||
// Workload
|
||||
lf_queue<spu_item*> registered;
|
||||
lf_queue<std::pair<const u64, spu_item*>> registered;
|
||||
|
||||
void operator()()
|
||||
{
|
||||
@ -8277,22 +8278,96 @@ struct spu_llvm
|
||||
// Fake LS
|
||||
std::vector<be_t<u32>> ls(0x10000);
|
||||
|
||||
for (auto* parg : registered)
|
||||
// To compile (hash -> item)
|
||||
std::unordered_multimap<u64, spu_item*, value_hash<u64>> enqueued;
|
||||
|
||||
// Mini-profiler (hash -> number of occurrences)
|
||||
std::unordered_map<u64, atomic_t<u64>, value_hash<u64>> samples;
|
||||
|
||||
// For synchronization with profiler thread
|
||||
stx::init_mutex prof_mutex;
|
||||
|
||||
named_thread profiler("SPU LLVM Profiler"sv, [&]()
|
||||
{
|
||||
if (thread_ctrl::state() == thread_state::aborting)
|
||||
while (thread_ctrl::state() != thread_state::aborting)
|
||||
{
|
||||
break;
|
||||
{
|
||||
// Lock if enabled
|
||||
const auto lock = prof_mutex.access();
|
||||
|
||||
if (!lock)
|
||||
{
|
||||
// Wait when the profiler is disabled
|
||||
prof_mutex.wait_for_initialized();
|
||||
continue;
|
||||
}
|
||||
|
||||
// Collect profiling samples
|
||||
idm::select<named_thread<spu_thread>>([&](u32 id, spu_thread& spu)
|
||||
{
|
||||
const u64 name = atomic_storage<u64>::load(spu.block_hash);
|
||||
|
||||
if (!(spu.state.load() & (cpu_flag::wait + cpu_flag::stop + cpu_flag::dbg_global_pause)))
|
||||
{
|
||||
const auto found = std::as_const(samples).find(spu.block_hash);
|
||||
|
||||
if (found != std::as_const(samples).end())
|
||||
{
|
||||
const_cast<atomic_t<u64>&>(found->second)++;
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
// Sleep for a short period if enabled
|
||||
thread_ctrl::wait_for(20, false);
|
||||
}
|
||||
});
|
||||
|
||||
while (thread_ctrl::state() != thread_state::aborting)
|
||||
{
|
||||
for (const auto& pair : registered.pop_all())
|
||||
{
|
||||
enqueued.emplace(pair);
|
||||
|
||||
// Interrupt and kick profiler thread
|
||||
const auto lock = prof_mutex.init_always([&]{});
|
||||
|
||||
// Register new blocks to collect samples
|
||||
samples.emplace(pair.first, 0);
|
||||
}
|
||||
|
||||
if (!parg)
|
||||
if (enqueued.empty())
|
||||
{
|
||||
// Interrupt profiler thread and put it to sleep
|
||||
static_cast<void>(prof_mutex.reset());
|
||||
registered.wait();
|
||||
continue;
|
||||
}
|
||||
|
||||
const std::vector<u32>& func = (*parg)->data;
|
||||
// Find the most used enqueued item
|
||||
u64 sample_max = 0;
|
||||
auto found_it = enqueued.begin();
|
||||
|
||||
for (auto it = enqueued.begin(), end = enqueued.end(); it != end; ++it)
|
||||
{
|
||||
const u64 cur = std::as_const(samples).at(it->first);
|
||||
|
||||
if (cur > sample_max)
|
||||
{
|
||||
sample_max = cur;
|
||||
found_it = it;
|
||||
}
|
||||
}
|
||||
|
||||
// Start compiling
|
||||
const std::vector<u32>& func = found_it->second->data;
|
||||
|
||||
// Old function pointer (pre-recompiled)
|
||||
const spu_function_t _old = (*parg)->compiled;
|
||||
const spu_function_t _old = found_it->second->compiled;
|
||||
|
||||
// Remove item from the queue
|
||||
enqueued.erase(found_it);
|
||||
|
||||
// Get data start
|
||||
const u32 start = func[0];
|
||||
@ -8381,7 +8456,7 @@ struct spu_fast : public spu_recompiler_base
|
||||
}
|
||||
|
||||
// Allocate executable area with necessary size
|
||||
const auto result = jit_runtime::alloc(16 + 1 + 9 + (::size32(func) - 1) * (16 + 16) + 36 + 47, 16);
|
||||
const auto result = jit_runtime::alloc(22 + 1 + 9 + (::size32(func) - 1) * (16 + 16) + 36 + 47, 16);
|
||||
|
||||
if (!result)
|
||||
{
|
||||
@ -8391,18 +8466,42 @@ struct spu_fast : public spu_recompiler_base
|
||||
m_pos = func[0];
|
||||
m_size = (::size32(func) - 1) * 4;
|
||||
|
||||
{
|
||||
sha1_context ctx;
|
||||
u8 output[20];
|
||||
|
||||
sha1_starts(&ctx);
|
||||
sha1_update(&ctx, reinterpret_cast<const u8*>(func.data() + 1), func.size() * 4 - 4);
|
||||
sha1_finish(&ctx, output);
|
||||
|
||||
be_t<u64> hash_start;
|
||||
std::memcpy(&hash_start, output, sizeof(hash_start));
|
||||
m_hash_start = hash_start;
|
||||
}
|
||||
|
||||
u8* raw = result;
|
||||
|
||||
// 8-byte instruction for patching
|
||||
// Update block_hash: mov [r13 + spu_thread::m_block_hash], 0xffff
|
||||
// 8-byte instruction for patching (long NOP)
|
||||
*raw++ = 0x0f;
|
||||
*raw++ = 0x1f;
|
||||
*raw++ = 0x84;
|
||||
*raw++ = 0;
|
||||
*raw++ = 0;
|
||||
*raw++ = 0;
|
||||
*raw++ = 0;
|
||||
*raw++ = 0;
|
||||
|
||||
// mov rax, m_hash_start
|
||||
*raw++ = 0x48;
|
||||
*raw++ = 0xb8;
|
||||
std::memcpy(raw, &m_hash_start, sizeof(m_hash_start));
|
||||
raw += 8;
|
||||
|
||||
// Update block_hash: mov [r13 + spu_thread::m_block_hash], rax
|
||||
*raw++ = 0x49;
|
||||
*raw++ = 0xc7;
|
||||
*raw++ = 0x89;
|
||||
*raw++ = 0x45;
|
||||
*raw++ = ::narrow<s8>(::offset32(&spu_thread::block_hash));
|
||||
*raw++ = 0xff;
|
||||
*raw++ = 0xff;
|
||||
*raw++ = 0x00;
|
||||
*raw++ = 0x00;
|
||||
|
||||
// Load PC: mov eax, [r13 + spu_thread::pc]
|
||||
*raw++ = 0x41;
|
||||
@ -8445,16 +8544,6 @@ struct spu_fast : public spu_recompiler_base
|
||||
// trap
|
||||
//*raw++ = 0xcc;
|
||||
|
||||
// Update block_hash: mov [r13 + spu_thread::m_block_hash], 0xfffe
|
||||
*raw++ = 0x49;
|
||||
*raw++ = 0xc7;
|
||||
*raw++ = 0x45;
|
||||
*raw++ = ::narrow<s8>(::offset32(&spu_thread::block_hash));
|
||||
*raw++ = 0xfe;
|
||||
*raw++ = 0xff;
|
||||
*raw++ = 0x00;
|
||||
*raw++ = 0x00;
|
||||
|
||||
// Secondary prologue: sub rsp,0x28
|
||||
*raw++ = 0x48;
|
||||
*raw++ = 0x83;
|
||||
@ -8667,7 +8756,7 @@ struct spu_fast : public spu_recompiler_base
|
||||
if (added)
|
||||
{
|
||||
// Send work to LLVM compiler thread
|
||||
g_fxo->get<spu_llvm_thread>()->registered.push(add_loc);
|
||||
g_fxo->get<spu_llvm_thread>()->registered.push(m_hash_start, add_loc);
|
||||
}
|
||||
|
||||
// Rebuild trampoline if necessary
|
||||
|
@ -262,5 +262,18 @@ namespace stx
|
||||
{
|
||||
return (m_state & c_init_bit) != 0;
|
||||
}
|
||||
|
||||
// Wait until access() may succeed: returns at once if the init bit is already
// set in the observed state, otherwise performs a single wait on m_state.
// NOTE(review): only one wait is issued and the state is not re-checked
// afterwards, so callers should retry access() after this returns.
|
||||
void wait_for_initialized() const noexcept
|
||||
{
|
||||
// Snapshot the state once so the check and the wait use the same value
const u32 state = m_state;
|
||||
|
||||
if (state & c_init_bit)
|
||||
{
|
||||
// Init bit already set in the snapshot: nothing to wait for
return;
|
||||
}
|
||||
|
||||
// Presumably blocks while m_state still equals the snapshot -- TODO confirm against the atomic wait implementation
m_state.wait(state);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user