mirror of
https://github.com/RPCS3/rpcs3.git
synced 2025-02-21 00:39:53 +00:00
SPU LLVM: add SPU profiling to compilation thread
Run another thread to collect profile data from SPU threads. Use this data to prioritize compiling hot spot SPU blocks. Implement stx::init_mutex::wait_for_initialized() helper.
This commit is contained in:
parent
a21d7def33
commit
abbf3c4d16
@ -7,6 +7,7 @@
|
|||||||
#include "Utilities/StrUtil.h"
|
#include "Utilities/StrUtil.h"
|
||||||
#include "Utilities/JIT.h"
|
#include "Utilities/JIT.h"
|
||||||
#include "Utilities/sysinfo.h"
|
#include "Utilities/sysinfo.h"
|
||||||
|
#include "util/init_mutex.hpp"
|
||||||
|
|
||||||
#include "SPUThread.h"
|
#include "SPUThread.h"
|
||||||
#include "SPUAnalyser.h"
|
#include "SPUAnalyser.h"
|
||||||
@ -8266,7 +8267,7 @@ std::unique_ptr<spu_recompiler_base> spu_recompiler_base::make_llvm_recompiler(u
|
|||||||
struct spu_llvm
|
struct spu_llvm
|
||||||
{
|
{
|
||||||
// Workload
|
// Workload
|
||||||
lf_queue<spu_item*> registered;
|
lf_queue<std::pair<const u64, spu_item*>> registered;
|
||||||
|
|
||||||
void operator()()
|
void operator()()
|
||||||
{
|
{
|
||||||
@ -8277,22 +8278,96 @@ struct spu_llvm
|
|||||||
// Fake LS
|
// Fake LS
|
||||||
std::vector<be_t<u32>> ls(0x10000);
|
std::vector<be_t<u32>> ls(0x10000);
|
||||||
|
|
||||||
for (auto* parg : registered)
|
// To compile (hash -> item)
|
||||||
|
std::unordered_multimap<u64, spu_item*, value_hash<u64>> enqueued;
|
||||||
|
|
||||||
|
// Mini-profiler (hash -> number of occurrences)
|
||||||
|
std::unordered_map<u64, atomic_t<u64>, value_hash<u64>> samples;
|
||||||
|
|
||||||
|
// For synchronization with profiler thread
|
||||||
|
stx::init_mutex prof_mutex;
|
||||||
|
|
||||||
|
named_thread profiler("SPU LLVM Profiler"sv, [&]()
|
||||||
{
|
{
|
||||||
if (thread_ctrl::state() == thread_state::aborting)
|
while (thread_ctrl::state() != thread_state::aborting)
|
||||||
{
|
{
|
||||||
break;
|
{
|
||||||
|
// Lock if enabled
|
||||||
|
const auto lock = prof_mutex.access();
|
||||||
|
|
||||||
|
if (!lock)
|
||||||
|
{
|
||||||
|
// Wait when the profiler is disabled
|
||||||
|
prof_mutex.wait_for_initialized();
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Collect profiling samples
|
||||||
|
idm::select<named_thread<spu_thread>>([&](u32 id, spu_thread& spu)
|
||||||
|
{
|
||||||
|
const u64 name = atomic_storage<u64>::load(spu.block_hash);
|
||||||
|
|
||||||
|
if (!(spu.state.load() & (cpu_flag::wait + cpu_flag::stop + cpu_flag::dbg_global_pause)))
|
||||||
|
{
|
||||||
|
const auto found = std::as_const(samples).find(spu.block_hash);
|
||||||
|
|
||||||
|
if (found != std::as_const(samples).end())
|
||||||
|
{
|
||||||
|
const_cast<atomic_t<u64>&>(found->second)++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
// Sleep for a short period if enabled
|
||||||
|
thread_ctrl::wait_for(20, false);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
while (thread_ctrl::state() != thread_state::aborting)
|
||||||
|
{
|
||||||
|
for (const auto& pair : registered.pop_all())
|
||||||
|
{
|
||||||
|
enqueued.emplace(pair);
|
||||||
|
|
||||||
|
// Interrupt and kick profiler thread
|
||||||
|
const auto lock = prof_mutex.init_always([&]{});
|
||||||
|
|
||||||
|
// Register new blocks to collect samples
|
||||||
|
samples.emplace(pair.first, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!parg)
|
if (enqueued.empty())
|
||||||
{
|
{
|
||||||
|
// Interrupt profiler thread and put it to sleep
|
||||||
|
static_cast<void>(prof_mutex.reset());
|
||||||
|
registered.wait();
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
const std::vector<u32>& func = (*parg)->data;
|
// Find the most used enqueued item
|
||||||
|
u64 sample_max = 0;
|
||||||
|
auto found_it = enqueued.begin();
|
||||||
|
|
||||||
|
for (auto it = enqueued.begin(), end = enqueued.end(); it != end; ++it)
|
||||||
|
{
|
||||||
|
const u64 cur = std::as_const(samples).at(it->first);
|
||||||
|
|
||||||
|
if (cur > sample_max)
|
||||||
|
{
|
||||||
|
sample_max = cur;
|
||||||
|
found_it = it;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Start compiling
|
||||||
|
const std::vector<u32>& func = found_it->second->data;
|
||||||
|
|
||||||
// Old function pointer (pre-recompiled)
|
// Old function pointer (pre-recompiled)
|
||||||
const spu_function_t _old = (*parg)->compiled;
|
const spu_function_t _old = found_it->second->compiled;
|
||||||
|
|
||||||
|
// Remove item from the queue
|
||||||
|
enqueued.erase(found_it);
|
||||||
|
|
||||||
// Get data start
|
// Get data start
|
||||||
const u32 start = func[0];
|
const u32 start = func[0];
|
||||||
@ -8381,7 +8456,7 @@ struct spu_fast : public spu_recompiler_base
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Allocate executable area with necessary size
|
// Allocate executable area with necessary size
|
||||||
const auto result = jit_runtime::alloc(16 + 1 + 9 + (::size32(func) - 1) * (16 + 16) + 36 + 47, 16);
|
const auto result = jit_runtime::alloc(22 + 1 + 9 + (::size32(func) - 1) * (16 + 16) + 36 + 47, 16);
|
||||||
|
|
||||||
if (!result)
|
if (!result)
|
||||||
{
|
{
|
||||||
@ -8391,18 +8466,42 @@ struct spu_fast : public spu_recompiler_base
|
|||||||
m_pos = func[0];
|
m_pos = func[0];
|
||||||
m_size = (::size32(func) - 1) * 4;
|
m_size = (::size32(func) - 1) * 4;
|
||||||
|
|
||||||
|
{
|
||||||
|
sha1_context ctx;
|
||||||
|
u8 output[20];
|
||||||
|
|
||||||
|
sha1_starts(&ctx);
|
||||||
|
sha1_update(&ctx, reinterpret_cast<const u8*>(func.data() + 1), func.size() * 4 - 4);
|
||||||
|
sha1_finish(&ctx, output);
|
||||||
|
|
||||||
|
be_t<u64> hash_start;
|
||||||
|
std::memcpy(&hash_start, output, sizeof(hash_start));
|
||||||
|
m_hash_start = hash_start;
|
||||||
|
}
|
||||||
|
|
||||||
u8* raw = result;
|
u8* raw = result;
|
||||||
|
|
||||||
// 8-byte instruction for patching
|
// 8-byte instruction for patching (long NOP)
|
||||||
// Update block_hash: mov [r13 + spu_thread::m_block_hash], 0xffff
|
*raw++ = 0x0f;
|
||||||
|
*raw++ = 0x1f;
|
||||||
|
*raw++ = 0x84;
|
||||||
|
*raw++ = 0;
|
||||||
|
*raw++ = 0;
|
||||||
|
*raw++ = 0;
|
||||||
|
*raw++ = 0;
|
||||||
|
*raw++ = 0;
|
||||||
|
|
||||||
|
// mov rax, m_hash_start
|
||||||
|
*raw++ = 0x48;
|
||||||
|
*raw++ = 0xb8;
|
||||||
|
std::memcpy(raw, &m_hash_start, sizeof(m_hash_start));
|
||||||
|
raw += 8;
|
||||||
|
|
||||||
|
// Update block_hash: mov [r13 + spu_thread::block_hash], rax
|
||||||
*raw++ = 0x49;
|
*raw++ = 0x49;
|
||||||
*raw++ = 0xc7;
|
*raw++ = 0x89;
|
||||||
*raw++ = 0x45;
|
*raw++ = 0x45;
|
||||||
*raw++ = ::narrow<s8>(::offset32(&spu_thread::block_hash));
|
*raw++ = ::narrow<s8>(::offset32(&spu_thread::block_hash));
|
||||||
*raw++ = 0xff;
|
|
||||||
*raw++ = 0xff;
|
|
||||||
*raw++ = 0x00;
|
|
||||||
*raw++ = 0x00;
|
|
||||||
|
|
||||||
// Load PC: mov eax, [r13 + spu_thread::pc]
|
// Load PC: mov eax, [r13 + spu_thread::pc]
|
||||||
*raw++ = 0x41;
|
*raw++ = 0x41;
|
||||||
@ -8445,16 +8544,6 @@ struct spu_fast : public spu_recompiler_base
|
|||||||
// trap
|
// trap
|
||||||
//*raw++ = 0xcc;
|
//*raw++ = 0xcc;
|
||||||
|
|
||||||
// Update block_hash: mov [r13 + spu_thread::m_block_hash], 0xfffe
|
|
||||||
*raw++ = 0x49;
|
|
||||||
*raw++ = 0xc7;
|
|
||||||
*raw++ = 0x45;
|
|
||||||
*raw++ = ::narrow<s8>(::offset32(&spu_thread::block_hash));
|
|
||||||
*raw++ = 0xfe;
|
|
||||||
*raw++ = 0xff;
|
|
||||||
*raw++ = 0x00;
|
|
||||||
*raw++ = 0x00;
|
|
||||||
|
|
||||||
// Secondary prologue: sub rsp,0x28
|
// Secondary prologue: sub rsp,0x28
|
||||||
*raw++ = 0x48;
|
*raw++ = 0x48;
|
||||||
*raw++ = 0x83;
|
*raw++ = 0x83;
|
||||||
@ -8667,7 +8756,7 @@ struct spu_fast : public spu_recompiler_base
|
|||||||
if (added)
|
if (added)
|
||||||
{
|
{
|
||||||
// Send work to LLVM compiler thread
|
// Send work to LLVM compiler thread
|
||||||
g_fxo->get<spu_llvm_thread>()->registered.push(add_loc);
|
g_fxo->get<spu_llvm_thread>()->registered.push(m_hash_start, add_loc);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Rebuild trampoline if necessary
|
// Rebuild trampoline if necessary
|
||||||
|
@ -262,5 +262,18 @@ namespace stx
|
|||||||
{
|
{
|
||||||
return (m_state & c_init_bit) != 0;
|
return (m_state & c_init_bit) != 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Wait for access()
|
||||||
|
// Helper for callers that found the mutex uninitialized and want to sleep
// instead of spinning. It takes one snapshot of m_state: if the init bit is
// already set it returns immediately, otherwise it performs a single wait
// keyed on that snapshot. There is no retry loop here, so returning does not
// by itself guarantee that the init bit is set; callers are expected to
// re-check afterwards (for example by retrying access()).
void wait_for_initialized() const noexcept
|
||||||
|
{
|
||||||
|
// Snapshot the state once; the same value is used for the check and the wait
const u32 state = m_state;
|
||||||
|
|
||||||
|
// Already initialized: nothing to wait for
if (state & c_init_bit)
|
||||||
|
{
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// NOTE(review): presumably blocks while m_state still equals the snapshot and
// wakes on any change, not only on initialization - confirm against the
// atomic wait implementation used by m_state
m_state.wait(state);
|
||||||
|
}
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user