mirror of
https://github.com/RPCS3/rpcs3.git
synced 2025-03-16 07:20:59 +00:00
SPU Profiler preview
Add option "SPU Profiler" (disabled by default). Works only with SPU recompilers. Results are flushed on pausing.
This commit is contained in:
parent
1a9e06d3c6
commit
c69fe0f664
@ -9,6 +9,8 @@
|
|||||||
#include "Emu/Cell/SPUThread.h"
|
#include "Emu/Cell/SPUThread.h"
|
||||||
|
|
||||||
#include <thread>
|
#include <thread>
|
||||||
|
#include <unordered_map>
|
||||||
|
#include <map>
|
||||||
|
|
||||||
DECLARE(cpu_thread::g_threads_created){0};
|
DECLARE(cpu_thread::g_threads_created){0};
|
||||||
DECLARE(cpu_thread::g_threads_deleted){0};
|
DECLARE(cpu_thread::g_threads_deleted){0};
|
||||||
@ -46,6 +48,196 @@ void fmt_class_string<bs_t<cpu_flag>>::format(std::string& out, u64 arg)
|
|||||||
format_bitset(out, arg, "[", "|", "]", &fmt_class_string<cpu_flag>::format);
|
format_bitset(out, arg, "[", "|", "]", &fmt_class_string<cpu_flag>::format);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// CPU profiler thread
|
||||||
|
struct cpu_prof
|
||||||
|
{
|
||||||
|
// PPU/SPU id enqueued for registration
|
||||||
|
lf_queue<u32> registered;
|
||||||
|
|
||||||
|
struct sample_info
|
||||||
|
{
|
||||||
|
// Weak pointer to the thread
|
||||||
|
std::weak_ptr<cpu_thread> wptr;
|
||||||
|
|
||||||
|
// Block occurences: name -> sample_count
|
||||||
|
std::unordered_map<u64, u64, value_hash<u64>> freq;
|
||||||
|
|
||||||
|
// Total number of samples
|
||||||
|
u64 samples = 0, idle = 0;
|
||||||
|
|
||||||
|
sample_info(const std::shared_ptr<cpu_thread>& ptr)
|
||||||
|
: wptr(ptr)
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
|
void reset()
|
||||||
|
{
|
||||||
|
freq.clear();
|
||||||
|
samples = 0;
|
||||||
|
idle = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Print info
|
||||||
|
void print(u32 id) const
|
||||||
|
{
|
||||||
|
// Make reversed map: sample_count -> name
|
||||||
|
std::multimap<u64, u64> chart;
|
||||||
|
|
||||||
|
for (auto& [name, count] : freq)
|
||||||
|
{
|
||||||
|
// Inverse bits to sort in descending order
|
||||||
|
chart.emplace(~count, name);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Print results
|
||||||
|
std::string results;
|
||||||
|
results.reserve(5100);
|
||||||
|
|
||||||
|
// Fraction of non-idle samples
|
||||||
|
const f64 busy = 1. * (samples - idle) / samples;
|
||||||
|
|
||||||
|
for (auto& [rcount, name] : chart)
|
||||||
|
{
|
||||||
|
// Get correct count value
|
||||||
|
const u64 count = ~rcount;
|
||||||
|
const f64 _frac = count / busy / samples;
|
||||||
|
|
||||||
|
// Print only 7 hash characters out of 11 (which covers roughly 48 bits)
|
||||||
|
fmt::append(results, "\n\t[%s", fmt::base57(be_t<u64>{name}));
|
||||||
|
results.resize(results.size() - 4);
|
||||||
|
|
||||||
|
// Print chunk address from lowest 16 bits
|
||||||
|
fmt::append(results, "...chunk-0x%05x]: %.4f%% (%u)", (name & 0xffff) * 4, _frac * 100., count);
|
||||||
|
|
||||||
|
if (results.size() >= 5000)
|
||||||
|
{
|
||||||
|
// Stop printing after reaching some arbitrary limit in characters
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
LOG_NOTICE(GENERAL, "Thread [0x%08x]: %u samples (%.4f%% idle):%s", id, samples, 100. * idle / samples, results);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
void operator()()
|
||||||
|
{
|
||||||
|
std::unordered_map<u32, sample_info, value_hash<u64>> threads;
|
||||||
|
|
||||||
|
while (thread_ctrl::state() != thread_state::aborting)
|
||||||
|
{
|
||||||
|
bool flush = false;
|
||||||
|
|
||||||
|
// Handle registration channel
|
||||||
|
for (u32 id : registered.pop_all())
|
||||||
|
{
|
||||||
|
if (id == 0)
|
||||||
|
{
|
||||||
|
// Handle id zero as a command to flush results
|
||||||
|
flush = true;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::shared_ptr<cpu_thread> ptr;
|
||||||
|
|
||||||
|
if (id >> 24 == 1)
|
||||||
|
{
|
||||||
|
ptr = idm::get<named_thread<ppu_thread>>(id);
|
||||||
|
}
|
||||||
|
else if (id >> 24 == 2)
|
||||||
|
{
|
||||||
|
ptr = idm::get<named_thread<spu_thread>>(id);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
LOG_FATAL(GENERAL, "Invalid Thread ID: 0x%08x", id);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (ptr)
|
||||||
|
{
|
||||||
|
auto [found, add] = threads.try_emplace(id, ptr);
|
||||||
|
|
||||||
|
if (!add)
|
||||||
|
{
|
||||||
|
// Overwritten: print previous data
|
||||||
|
found->second.print(id);
|
||||||
|
found->second.reset();
|
||||||
|
found->second.wptr = ptr;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (threads.empty())
|
||||||
|
{
|
||||||
|
// Wait for messages if no work (don't waste CPU)
|
||||||
|
registered.wait();
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Sample active threads
|
||||||
|
for (auto& [id, info] : threads)
|
||||||
|
{
|
||||||
|
if (auto ptr = info.wptr.lock())
|
||||||
|
{
|
||||||
|
// Get short function hash
|
||||||
|
const u64 name = atomic_storage<u64>::load(ptr->block_hash);
|
||||||
|
|
||||||
|
// Append occurrence
|
||||||
|
info.samples++;
|
||||||
|
|
||||||
|
if (!(ptr->state.load() & (cpu_flag::wait + cpu_flag::stop + cpu_flag::dbg_global_pause)))
|
||||||
|
{
|
||||||
|
info.freq[name]++;
|
||||||
|
|
||||||
|
// Append verification time to fixed common name 0000000...chunk-0x3fffc
|
||||||
|
if ((name & 0xffff) == 0)
|
||||||
|
info.freq[0xffff]++;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
info.idle++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Cleanup and print results for deleted threads
|
||||||
|
for (auto it = threads.begin(), end = threads.end(); it != end;)
|
||||||
|
{
|
||||||
|
if (it->second.wptr.expired())
|
||||||
|
it->second.print(it->first), it = threads.erase(it);
|
||||||
|
else
|
||||||
|
it++;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (flush)
|
||||||
|
{
|
||||||
|
LOG_SUCCESS(GENERAL, "Flushing profiling results...");
|
||||||
|
|
||||||
|
// Print all results and cleanup
|
||||||
|
for (auto& [id, info] : threads)
|
||||||
|
{
|
||||||
|
info.print(id);
|
||||||
|
info.reset();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Wait, roughly for 20µs
|
||||||
|
thread_ctrl::wait_for(20, false);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Print all remaining results
|
||||||
|
for (auto& [id, info] : threads)
|
||||||
|
{
|
||||||
|
info.print(id);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static constexpr auto thread_name = "CPU Profiler"sv;
|
||||||
|
};
|
||||||
|
|
||||||
|
using cpu_profiler = named_thread<cpu_prof>;
|
||||||
|
|
||||||
thread_local cpu_thread* g_tls_current_cpu_thread = nullptr;
|
thread_local cpu_thread* g_tls_current_cpu_thread = nullptr;
|
||||||
|
|
||||||
// For synchronizing suspend_all operation
|
// For synchronizing suspend_all operation
|
||||||
@ -91,6 +283,16 @@ void cpu_thread::operator()()
|
|||||||
thread_ctrl::set_native_priority(-1);
|
thread_ctrl::set_native_priority(-1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (id_type() == 1 && false)
|
||||||
|
{
|
||||||
|
g_fxo->get<cpu_profiler>()->registered.push(id);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (id_type() == 2 && g_cfg.core.spu_prof)
|
||||||
|
{
|
||||||
|
g_fxo->get<cpu_profiler>()->registered.push(id);
|
||||||
|
}
|
||||||
|
|
||||||
// Register thread in g_cpu_array
|
// Register thread in g_cpu_array
|
||||||
if (!g_cpu_array_sema.try_inc(sizeof(g_cpu_array_bits) * 8))
|
if (!g_cpu_array_sema.try_inc(sizeof(g_cpu_array_bits) * 8))
|
||||||
{
|
{
|
||||||
@ -418,5 +620,22 @@ void cpu_thread::stop_all() noexcept
|
|||||||
std::this_thread::sleep_for(10ms);
|
std::this_thread::sleep_for(10ms);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Workaround for remaining threads (TODO)
|
||||||
|
std::this_thread::sleep_for(1300ms);
|
||||||
|
|
||||||
LOG_NOTICE(GENERAL, "All CPU threads have been stopped.");
|
LOG_NOTICE(GENERAL, "All CPU threads have been stopped.");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void cpu_thread::flush_profilers() noexcept
|
||||||
|
{
|
||||||
|
if (!g_fxo->get<cpu_profiler>())
|
||||||
|
{
|
||||||
|
LOG_FATAL(GENERAL, "cpu_thread::flush_profilers() has been called incorrectly." HERE);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (g_cfg.core.spu_prof || false)
|
||||||
|
{
|
||||||
|
g_fxo->get<cpu_profiler>()->registered.push(0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
@ -27,7 +27,10 @@ enum class cpu_flag : u32
|
|||||||
class cpu_thread
|
class cpu_thread
|
||||||
{
|
{
|
||||||
// PPU cache backward compatibility hack
|
// PPU cache backward compatibility hack
|
||||||
char dummy[sizeof(std::shared_ptr<void>)];
|
char dummy[sizeof(std::shared_ptr<void>) - 8];
|
||||||
|
|
||||||
|
public:
|
||||||
|
u64 block_hash = 0;
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
cpu_thread(u32 id);
|
cpu_thread(u32 id);
|
||||||
@ -119,6 +122,9 @@ public:
|
|||||||
|
|
||||||
// Stop all threads with cpu_flag::dbg_global_stop
|
// Stop all threads with cpu_flag::dbg_global_stop
|
||||||
static void stop_all() noexcept;
|
static void stop_all() noexcept;
|
||||||
|
|
||||||
|
// Send signal to the profiler(s) to flush results
|
||||||
|
static void flush_profilers() noexcept;
|
||||||
};
|
};
|
||||||
|
|
||||||
inline cpu_thread* get_current_cpu_thread() noexcept
|
inline cpu_thread* get_current_cpu_thread() noexcept
|
||||||
|
@ -10,6 +10,7 @@
|
|||||||
#include "Utilities/sysinfo.h"
|
#include "Utilities/sysinfo.h"
|
||||||
#include "Utilities/asm.h"
|
#include "Utilities/asm.h"
|
||||||
#include "PPUAnalyser.h"
|
#include "PPUAnalyser.h"
|
||||||
|
#include "Crypto/sha1.h"
|
||||||
|
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
#include <mutex>
|
#include <mutex>
|
||||||
@ -63,6 +64,19 @@ spu_function_t spu_recompiler::compile(u64 last_reset_count, const std::vector<u
|
|||||||
cache->add(func);
|
cache->add(func);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
sha1_context ctx;
|
||||||
|
u8 output[20];
|
||||||
|
|
||||||
|
sha1_starts(&ctx);
|
||||||
|
sha1_update(&ctx, reinterpret_cast<const u8*>(func.data() + 1), func.size() * 4 - 4);
|
||||||
|
sha1_finish(&ctx, output);
|
||||||
|
|
||||||
|
be_t<u64> hash_start;
|
||||||
|
std::memcpy(&hash_start, output, sizeof(hash_start));
|
||||||
|
m_hash_start = hash_start;
|
||||||
|
}
|
||||||
|
|
||||||
using namespace asmjit;
|
using namespace asmjit;
|
||||||
|
|
||||||
StringLogger logger;
|
StringLogger logger;
|
||||||
@ -160,6 +174,12 @@ spu_function_t spu_recompiler::compile(u64 last_reset_count, const std::vector<u
|
|||||||
c->cmp(SPU_OFF_32(state), 0);
|
c->cmp(SPU_OFF_32(state), 0);
|
||||||
c->jnz(label_stop);
|
c->jnz(label_stop);
|
||||||
|
|
||||||
|
if (g_cfg.core.spu_prof && g_cfg.core.spu_verification)
|
||||||
|
{
|
||||||
|
c->mov(x86::rax, m_hash_start & -0xffff);
|
||||||
|
c->mov(SPU_OFF_64(block_hash), x86::rax);
|
||||||
|
}
|
||||||
|
|
||||||
if (utils::has_avx())
|
if (utils::has_avx())
|
||||||
{
|
{
|
||||||
// How to check dirty AVX state
|
// How to check dirty AVX state
|
||||||
@ -722,6 +742,13 @@ spu_function_t spu_recompiler::compile(u64 last_reset_count, const std::vector<u
|
|||||||
// Acknowledge success and add statistics
|
// Acknowledge success and add statistics
|
||||||
c->add(SPU_OFF_64(block_counter), ::size32(words) / (words_align / 4));
|
c->add(SPU_OFF_64(block_counter), ::size32(words) / (words_align / 4));
|
||||||
|
|
||||||
|
// Set block hash for profiling (if enabled)
|
||||||
|
if (g_cfg.core.spu_prof)
|
||||||
|
{
|
||||||
|
c->mov(x86::rax, m_hash_start | 0xffff);
|
||||||
|
c->mov(SPU_OFF_64(block_hash), x86::rax);
|
||||||
|
}
|
||||||
|
|
||||||
if (m_pos != start)
|
if (m_pos != start)
|
||||||
{
|
{
|
||||||
// Jump to the entry point if necessary
|
// Jump to the entry point if necessary
|
||||||
@ -1159,6 +1186,14 @@ void spu_recompiler::branch_set_link(u32 target)
|
|||||||
c->and_(qw1->r32(), 0x3fff0);
|
c->and_(qw1->r32(), 0x3fff0);
|
||||||
c->pcmpeqd(x86::xmm0, x86::xmm0);
|
c->pcmpeqd(x86::xmm0, x86::xmm0);
|
||||||
c->movdqa(x86::dqword_ptr(*cpu, *qw1, 0, ::offset32(&spu_thread::stack_mirror)), x86::xmm0);
|
c->movdqa(x86::dqword_ptr(*cpu, *qw1, 0, ::offset32(&spu_thread::stack_mirror)), x86::xmm0);
|
||||||
|
|
||||||
|
// Set block hash for profiling (if enabled)
|
||||||
|
if (g_cfg.core.spu_prof)
|
||||||
|
{
|
||||||
|
c->mov(x86::rax, m_hash_start | 0xffff);
|
||||||
|
c->mov(SPU_OFF_64(block_hash), x86::rax);
|
||||||
|
}
|
||||||
|
|
||||||
c->jmp(target);
|
c->jmp(target);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
@ -137,6 +137,16 @@ DECLARE(spu_runtime::tr_all) = []
|
|||||||
std::memcpy(raw, &r32, 4);
|
std::memcpy(raw, &r32, 4);
|
||||||
raw += 4;
|
raw += 4;
|
||||||
|
|
||||||
|
// Update block_hash (set zero): mov [r13 + spu_thread::m_block_hash], 0
|
||||||
|
*raw++ = 0x49;
|
||||||
|
*raw++ = 0xc7;
|
||||||
|
*raw++ = 0x45;
|
||||||
|
*raw++ = ::narrow<s8>(::offset32(&spu_thread::block_hash));
|
||||||
|
*raw++ = 0x00;
|
||||||
|
*raw++ = 0x00;
|
||||||
|
*raw++ = 0x00;
|
||||||
|
*raw++ = 0x00;
|
||||||
|
|
||||||
// jmp [rdx + rax * 8]
|
// jmp [rdx + rax * 8]
|
||||||
*raw++ = 0xff;
|
*raw++ = 0xff;
|
||||||
*raw++ = 0x24;
|
*raw++ = 0x24;
|
||||||
@ -4257,6 +4267,10 @@ public:
|
|||||||
|
|
||||||
m_hash.clear();
|
m_hash.clear();
|
||||||
fmt::append(m_hash, "spu-0x%05x-%s", func[0], fmt::base57(output));
|
fmt::append(m_hash, "spu-0x%05x-%s", func[0], fmt::base57(output));
|
||||||
|
|
||||||
|
be_t<u64> hash_start;
|
||||||
|
std::memcpy(&hash_start, output, sizeof(hash_start));
|
||||||
|
m_hash_start = hash_start;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (g_fxo->get<spu_cache>())
|
if (g_fxo->get<spu_cache>())
|
||||||
@ -4318,6 +4332,10 @@ public:
|
|||||||
u32 check_iterations = 0;
|
u32 check_iterations = 0;
|
||||||
m_ir->SetInsertPoint(label_test);
|
m_ir->SetInsertPoint(label_test);
|
||||||
|
|
||||||
|
// Set block hash for profiling (if enabled)
|
||||||
|
if (g_cfg.core.spu_prof && g_cfg.core.spu_verification)
|
||||||
|
m_ir->CreateStore(m_ir->getInt64((m_hash_start & -65536)), spu_ptr<u64>(&spu_thread::block_hash), true);
|
||||||
|
|
||||||
if (!g_cfg.core.spu_verification)
|
if (!g_cfg.core.spu_verification)
|
||||||
{
|
{
|
||||||
// Disable check (unsafe)
|
// Disable check (unsafe)
|
||||||
@ -4507,6 +4525,11 @@ public:
|
|||||||
// Initialize function info
|
// Initialize function info
|
||||||
m_entry = m_function_queue[fi];
|
m_entry = m_function_queue[fi];
|
||||||
set_function(m_functions[m_entry].chunk);
|
set_function(m_functions[m_entry].chunk);
|
||||||
|
|
||||||
|
// Set block hash for profiling (if enabled)
|
||||||
|
if (g_cfg.core.spu_prof)
|
||||||
|
m_ir->CreateStore(m_ir->getInt64((m_hash_start & -65536) | (m_entry >> 2)), spu_ptr<u64>(&spu_thread::block_hash), true);
|
||||||
|
|
||||||
m_finfo = &m_functions[m_entry];
|
m_finfo = &m_functions[m_entry];
|
||||||
m_ir->CreateBr(add_block(m_entry));
|
m_ir->CreateBr(add_block(m_entry));
|
||||||
|
|
||||||
|
@ -241,6 +241,7 @@ protected:
|
|||||||
|
|
||||||
u32 m_pos;
|
u32 m_pos;
|
||||||
u32 m_size;
|
u32 m_size;
|
||||||
|
u64 m_hash_start;
|
||||||
|
|
||||||
// Bit indicating start of the block
|
// Bit indicating start of the block
|
||||||
std::bitset<0x10000> m_block_info;
|
std::bitset<0x10000> m_block_info;
|
||||||
|
@ -1677,6 +1677,9 @@ bool Emulator::Pause()
|
|||||||
return m_state.compare_and_swap_test(system_state::ready, system_state::paused);
|
return m_state.compare_and_swap_test(system_state::ready, system_state::paused);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Signal profilers to print results (if enabled)
|
||||||
|
cpu_thread::flush_profilers();
|
||||||
|
|
||||||
GetCallbacks().on_pause();
|
GetCallbacks().on_pause();
|
||||||
|
|
||||||
// Update pause start time
|
// Update pause start time
|
||||||
|
@ -408,6 +408,7 @@ struct cfg_root : cfg::node
|
|||||||
cfg::_bool spu_accurate_putlluc{this, "Accurate PUTLLUC", false};
|
cfg::_bool spu_accurate_putlluc{this, "Accurate PUTLLUC", false};
|
||||||
cfg::_bool spu_verification{this, "SPU Verification", true}; // Should be enabled
|
cfg::_bool spu_verification{this, "SPU Verification", true}; // Should be enabled
|
||||||
cfg::_bool spu_cache{this, "SPU Cache", true};
|
cfg::_bool spu_cache{this, "SPU Cache", true};
|
||||||
|
cfg::_bool spu_prof{this, "SPU Profiler", false};
|
||||||
cfg::_enum<tsx_usage> enable_TSX{this, "Enable TSX", tsx_usage::enabled}; // Enable TSX. Forcing this on Haswell/Broadwell CPUs should be used carefully
|
cfg::_enum<tsx_usage> enable_TSX{this, "Enable TSX", tsx_usage::enabled}; // Enable TSX. Forcing this on Haswell/Broadwell CPUs should be used carefully
|
||||||
cfg::_bool spu_accurate_xfloat{this, "Accurate xfloat", false};
|
cfg::_bool spu_accurate_xfloat{this, "Accurate xfloat", false};
|
||||||
cfg::_bool spu_approx_xfloat{this, "Approximate xfloat", true};
|
cfg::_bool spu_approx_xfloat{this, "Approximate xfloat", true};
|
||||||
|
Loading…
x
Reference in New Issue
Block a user