mirror of
https://github.com/RPCS3/rpcs3.git
synced 2025-02-07 03:40:07 +00:00
Implement perf stat counter for PPU/SPU reservation ops
Adds Emu/perf_meter.hpp header file. Uses RDTSC for speed. Prints stats at exit.
This commit is contained in:
parent
adf50b7c4b
commit
120849c734
@ -7,6 +7,7 @@
|
||||
VFS.cpp
|
||||
GDB.cpp
|
||||
title.cpp
|
||||
perf_meter.cpp
|
||||
)
|
||||
|
||||
# prevent WolfSSL from warning about not having harden options
|
||||
|
@ -8,6 +8,7 @@
|
||||
#include "Emu/GDB.h"
|
||||
#include "Emu/Cell/PPUThread.h"
|
||||
#include "Emu/Cell/SPUThread.h"
|
||||
#include "Emu/perf_meter.hpp"
|
||||
|
||||
#include <thread>
|
||||
#include <unordered_map>
|
||||
@ -775,6 +776,9 @@ bool cpu_thread::suspend_work::push(cpu_thread* _this, bool cancel_if_not_suspen
|
||||
|
||||
if (!next)
|
||||
{
|
||||
// Monitor the performance only of the actual suspend processing owner
|
||||
perf_meter<"SUSPEND"_u64> perf0;
|
||||
|
||||
// First thread to push the work to the workload list pauses all threads and processes it
|
||||
std::lock_guard lock(ctr->cpu_suspend_lock);
|
||||
|
||||
@ -911,13 +915,12 @@ void cpu_thread::stop_all() noexcept
|
||||
std::this_thread::sleep_for(10ms);
|
||||
}
|
||||
|
||||
sys_log.notice("All CPU threads have been stopped. [+: %u; suspends: %u]", +g_threads_created, +g_suspend_counter);
|
||||
sys_log.notice("All CPU threads have been stopped. [+: %u]", +g_threads_created);
|
||||
|
||||
std::lock_guard lock(g_fxo->get<cpu_counter>()->cpu_suspend_lock);
|
||||
|
||||
g_threads_deleted -= g_threads_created.load();
|
||||
g_threads_created = 0;
|
||||
g_suspend_counter = 0;
|
||||
}
|
||||
|
||||
void cpu_thread::flush_profilers() noexcept
|
||||
|
@ -3,6 +3,7 @@
|
||||
#include "Utilities/sysinfo.h"
|
||||
#include "Utilities/JIT.h"
|
||||
#include "Crypto/sha1.h"
|
||||
#include "Emu/perf_meter.hpp"
|
||||
#include "Emu/Memory/vm_reservation.h"
|
||||
#include "Emu/RSX/RSXThread.h"
|
||||
#include "Emu/VFS.h"
|
||||
@ -1282,9 +1283,9 @@ static T ppu_load_acquire_reservation(ppu_thread& ppu, u32 addr)
|
||||
std::memcpy(&rdata, &ppu.rdata[addr & 0x78], 8);
|
||||
}
|
||||
|
||||
if (count >= 15) [[unlikely]]
|
||||
if (count >= 15 && g_cfg.core.perf_report) [[unlikely]]
|
||||
{
|
||||
ppu_log.warning("%s took too long: %u", sizeof(T) == 4 ? "LWARX" : "LDARX", count);
|
||||
perf_log.warning("%s: took too long: %u", sizeof(T) == 4 ? "LWARX" : "LDARX", count);
|
||||
}
|
||||
|
||||
ppu.rtime &= ~vm::rsrv_shared_mask;
|
||||
@ -1553,6 +1554,8 @@ const auto ppu_stcx_accurate_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime
|
||||
template <typename T>
|
||||
static bool ppu_store_reservation(ppu_thread& ppu, u32 addr, u64 reg_value)
|
||||
{
|
||||
perf_meter<"STCX"_u32> perf0;
|
||||
|
||||
if (addr % sizeof(T))
|
||||
{
|
||||
fmt::throw_exception("PPU %s: Unaligned address: 0x%08x" HERE, sizeof(T) == 4 ? "STWCX" : "STDCX", addr);
|
||||
|
@ -10,6 +10,7 @@
|
||||
#include "Loader/ELF.h"
|
||||
#include "Emu/VFS.h"
|
||||
#include "Emu/IdManager.h"
|
||||
#include "Emu/perf_meter.hpp"
|
||||
#include "Emu/RSX/RSXThread.h"
|
||||
#include "Emu/Cell/PPUThread.h"
|
||||
#include "Emu/Cell/ErrorCodes.h"
|
||||
@ -2055,11 +2056,16 @@ bool spu_thread::do_list_transfer(spu_mfc_cmd& args)
|
||||
|
||||
bool spu_thread::do_putllc(const spu_mfc_cmd& args)
|
||||
{
|
||||
perf_meter<"PUTLLC-"_u64> perf0;
|
||||
perf_meter<"PUTLLC+"_u64> perf1 = perf0;
|
||||
|
||||
// Store conditionally
|
||||
const u32 addr = args.eal & -128;
|
||||
|
||||
if ([&]()
|
||||
{
|
||||
perf_meter<"PUTLLC."_u64> perf2 = perf0;
|
||||
|
||||
if (raddr != addr)
|
||||
{
|
||||
return false;
|
||||
@ -2163,6 +2169,7 @@ bool spu_thread::do_putllc(const spu_mfc_cmd& args)
|
||||
{
|
||||
vm::reservation_notifier(addr, 128).notify_all();
|
||||
raddr = 0;
|
||||
perf0.reset();
|
||||
return true;
|
||||
}
|
||||
else
|
||||
@ -2182,12 +2189,15 @@ bool spu_thread::do_putllc(const spu_mfc_cmd& args)
|
||||
}
|
||||
|
||||
raddr = 0;
|
||||
perf1.reset();
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
void do_cell_atomic_128_store(u32 addr, const void* to_write)
|
||||
{
|
||||
perf_meter<"STORE128"_u64> perf0;
|
||||
|
||||
const auto cpu = get_current_cpu_thread();
|
||||
|
||||
if (g_use_rtm) [[likely]]
|
||||
@ -2237,6 +2247,8 @@ void do_cell_atomic_128_store(u32 addr, const void* to_write)
|
||||
|
||||
void spu_thread::do_putlluc(const spu_mfc_cmd& args)
|
||||
{
|
||||
perf_meter<"PUTLLUC"_u64> perf0;
|
||||
|
||||
const u32 addr = args.eal & -128;
|
||||
|
||||
if (raddr && addr == raddr)
|
||||
@ -2419,6 +2431,8 @@ bool spu_thread::process_mfc_cmd()
|
||||
std::this_thread::yield();
|
||||
}
|
||||
|
||||
perf_meter<"GETLLAR"_u64> perf0;
|
||||
|
||||
alignas(64) spu_rdata_t temp;
|
||||
u64 ntime;
|
||||
|
||||
@ -2506,9 +2520,9 @@ bool spu_thread::process_mfc_cmd()
|
||||
continue;
|
||||
}
|
||||
|
||||
if (g_use_rtm && i >= 15) [[unlikely]]
|
||||
if (g_use_rtm && i >= 15 && g_cfg.core.perf_report) [[unlikely]]
|
||||
{
|
||||
spu_log.warning("GETLLAR took too long: %u", i);
|
||||
perf_log.warning("GETLLAR: took too long: %u", i);
|
||||
}
|
||||
|
||||
break;
|
||||
|
@ -14,7 +14,7 @@
|
||||
#include "Emu/Cell/lv2/sys_memory.h"
|
||||
#include "Emu/RSX/GSRender.h"
|
||||
#include "Emu/Cell/SPURecompiler.h"
|
||||
#include <atomic>
|
||||
#include "Emu/perf_meter.hpp"
|
||||
#include <thread>
|
||||
#include <deque>
|
||||
|
||||
@ -391,6 +391,8 @@ namespace vm
|
||||
|
||||
if (addr >= 0x10000)
|
||||
{
|
||||
perf_meter<"SUSPEND"_u64> perf0;
|
||||
|
||||
for (auto lock = g_locks.cbegin(), end = lock + g_cfg.core.ppu_threads; lock != end; lock++)
|
||||
{
|
||||
if (auto ptr = +*lock; ptr && !(ptr->state & cpu_flag::memory))
|
||||
|
58
rpcs3/Emu/perf_meter.cpp
Normal file
58
rpcs3/Emu/perf_meter.cpp
Normal file
@ -0,0 +1,58 @@
|
||||
#include "stdafx.h"
|
||||
#include "perf_meter.hpp"
|
||||
|
||||
void perf_stat_base::push(u64 ns[66]) noexcept
|
||||
{
|
||||
for (u32 i = 0; i < 66; i++)
|
||||
{
|
||||
m_log[i] += ns[i];
|
||||
}
|
||||
}
|
||||
|
||||
void perf_stat_base::print(const char* name) noexcept
|
||||
{
|
||||
if (u64 num_total = m_log[0].load())
|
||||
{
|
||||
perf_log.notice("Perf stats for %s: total events: %u (total time %.4fs)", name, num_total, m_log[65].load() / 1000'000'000.);
|
||||
|
||||
for (u32 i = 0; i < 13; i++)
|
||||
{
|
||||
if (u64 count = m_log[i + 1].load())
|
||||
{
|
||||
perf_log.notice("Perf stats for %s: events < %.3fµs: %u", name, std::pow(2., i) / 1000., count);
|
||||
}
|
||||
}
|
||||
|
||||
for (u32 i = 14; i < 23; i++)
|
||||
{
|
||||
if (u64 count = m_log[i + 1].load()) [[unlikely]]
|
||||
{
|
||||
perf_log.notice("Perf stats for %s: events < %.3fms: %u", name, std::pow(2., i) / 1000'000., count);
|
||||
}
|
||||
}
|
||||
|
||||
for (u32 i = 24; i < 33; i++)
|
||||
{
|
||||
if (u64 count = m_log[i + 1].load()) [[unlikely]]
|
||||
{
|
||||
perf_log.notice("Perf stats for %s: events < %.3fs: %u", name, std::pow(2., i) / 1000'000'000., count);
|
||||
}
|
||||
}
|
||||
|
||||
for (u32 i = 34; i < 43; i++)
|
||||
{
|
||||
if (u64 count = m_log[i + 1].load()) [[unlikely]]
|
||||
{
|
||||
perf_log.notice("Perf stats for %s: events < %.0f SEC: %u", name, std::pow(2., i) / 1000'000'000., count);
|
||||
}
|
||||
}
|
||||
|
||||
for (u32 i = 44; i < 63; i++)
|
||||
{
|
||||
if (u64 count = m_log[i + 1].load()) [[unlikely]]
|
||||
{
|
||||
perf_log.notice("Perf stats for %s: events < %.0f MIN: %u", name, std::pow(2., i) / 60'000'000'000., count);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
181
rpcs3/Emu/perf_meter.hpp
Normal file
181
rpcs3/Emu/perf_meter.hpp
Normal file
@ -0,0 +1,181 @@
|
||||
#pragma once
|
||||
|
||||
#include "Utilities/types.h"
|
||||
#include "util/logs.hpp"
|
||||
#include "Utilities/sysinfo.h"
|
||||
#include "system_config.h"
|
||||
#include "IdManager.h"
|
||||
#include <array>
|
||||
#include <cmath>
|
||||
#include <thread>
|
||||
#include <unordered_map>
|
||||
|
||||
LOG_CHANNEL(perf_log, "PERF");
|
||||
|
||||
// TODO: constexpr with the help of bitcast
|
||||
template <auto Name>
|
||||
inline const auto perf_name = []
|
||||
{
|
||||
constexpr auto short_name = Name;
|
||||
std::array<char, sizeof(Name) + 1> result{};
|
||||
std::memcpy(result.data(), &short_name, sizeof(Name));
|
||||
return result;
|
||||
}();
|
||||
|
||||
class perf_stat_base
|
||||
{
|
||||
atomic_t<u64> m_log[66]{};
|
||||
|
||||
protected:
|
||||
// Print accumulated values
|
||||
void print(const char* name) noexcept;
|
||||
|
||||
// Accumulate values from a thread
|
||||
void push(u64 ns[66]) noexcept;
|
||||
|
||||
public:
|
||||
perf_stat_base() noexcept = default;
|
||||
|
||||
perf_stat_base(const perf_stat_base&) = delete;
|
||||
|
||||
perf_stat_base& operator =(const perf_stat_base&) = delete;
|
||||
|
||||
~perf_stat_base() {}
|
||||
};
|
||||
|
||||
// Object that prints event length stats at the end
|
||||
template <auto ShortName>
|
||||
class perf_stat final : public perf_stat_base
|
||||
{
|
||||
static inline thread_local struct perf_stat_local
|
||||
{
|
||||
// Local non-atomic values for increments
|
||||
u64 m_log[66]{};
|
||||
|
||||
~perf_stat_local()
|
||||
{
|
||||
// Update on thread exit
|
||||
g_fxo->get<perf_stat>()->perf_stat_base::push(m_log);
|
||||
}
|
||||
} g_tls_perf_stat;
|
||||
|
||||
public:
|
||||
~perf_stat()
|
||||
{
|
||||
perf_stat_base::print(perf_name<ShortName>.data());
|
||||
}
|
||||
|
||||
void push(u64 ns) noexcept
|
||||
{
|
||||
auto& data = g_tls_perf_stat.m_log;
|
||||
data[0] += ns != 0;
|
||||
data[64 - std::countl_zero(ns)]++;
|
||||
data[65] += ns;
|
||||
}
|
||||
};
|
||||
|
||||
// Object that prints event length at the end
|
||||
template <auto ShortName, auto... SubEvents>
|
||||
class perf_meter
|
||||
{
|
||||
// Initialize array (possibly only 1 element) with timestamp
|
||||
u64 m_timestamps[1 + sizeof...(SubEvents)] = {__rdtsc()};
|
||||
|
||||
public:
|
||||
SAFE_BUFFERS perf_meter() noexcept
|
||||
{
|
||||
m_timestamps[0] = __rdtsc();
|
||||
std::memset(m_timestamps + 1, 0, sizeof(m_timestamps) - sizeof(u64));
|
||||
}
|
||||
|
||||
// Copy first timestamp
|
||||
template <auto SN, auto... S>
|
||||
SAFE_BUFFERS perf_meter(const perf_meter<SN, S...>& r) noexcept
|
||||
{
|
||||
m_timestamps[0] = r.get();
|
||||
std::memset(m_timestamps + 1, 0, sizeof(m_timestamps) - sizeof(u64));
|
||||
}
|
||||
|
||||
template <auto SN, auto... S>
|
||||
SAFE_BUFFERS perf_meter(perf_meter<SN, S...>&& r) noexcept
|
||||
{
|
||||
m_timestamps[0] = r.get();
|
||||
r.reset();
|
||||
}
|
||||
|
||||
// Copy first timestamp
|
||||
template <auto SN, auto... S>
|
||||
SAFE_BUFFERS perf_meter& operator =(const perf_meter<SN, S...>& r) noexcept
|
||||
{
|
||||
m_timestamps[0] = r.get();
|
||||
return *this;
|
||||
}
|
||||
|
||||
template <auto SN, auto... S>
|
||||
SAFE_BUFFERS perf_meter& operator =(perf_meter<SN, S...>& r) noexcept
|
||||
{
|
||||
m_timestamps[0] = r.get();
|
||||
r.reset();
|
||||
return *this;
|
||||
}
|
||||
|
||||
// Push subevent data in array
|
||||
template <auto Event, std::size_t Index = 0>
|
||||
SAFE_BUFFERS void push() noexcept
|
||||
{
|
||||
// TODO: should use more efficient search with type comparison, then value comparison, or pattern matching
|
||||
if constexpr (std::array<bool, sizeof...(SubEvents)>{(SubEvents == Event)...}[Index])
|
||||
{
|
||||
// Push actual timestamp into an array
|
||||
m_timestamps[Index + 1] = __rdtsc();
|
||||
}
|
||||
else if constexpr (Index < sizeof...(SubEvents))
|
||||
{
|
||||
// Proceed search recursively
|
||||
push<Event, Index + 1>();
|
||||
}
|
||||
}
|
||||
|
||||
// Obtain initial timestamp
|
||||
u64 get() const noexcept
|
||||
{
|
||||
return m_timestamps[0];
|
||||
}
|
||||
|
||||
// Disable this counter
|
||||
SAFE_BUFFERS void reset() noexcept
|
||||
{
|
||||
m_timestamps[0] = 0;
|
||||
}
|
||||
|
||||
SAFE_BUFFERS ~perf_meter()
|
||||
{
|
||||
// Disabled counter
|
||||
if (!m_timestamps[0]) [[unlikely]]
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
if (!g_cfg.core.perf_report) [[likely]]
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
// Event end
|
||||
const u64 end_time = __rdtsc();
|
||||
|
||||
// Compute difference in seconds
|
||||
const f64 diff = (end_time - m_timestamps[0]) * 1. / utils::get_tsc_freq();
|
||||
|
||||
// Register perf stat in nanoseconds
|
||||
g_fxo->get<perf_stat<ShortName>>()->push(static_cast<u64>(diff * 1000'000'000.));
|
||||
|
||||
// Print in microseconds
|
||||
if (static_cast<u64>(diff * 1000'000.) >= g_cfg.core.perf_report_threshold)
|
||||
{
|
||||
perf_log.notice("%s: %.3fµs", perf_name<ShortName>.data(), diff * 1000'000.);
|
||||
}
|
||||
|
||||
// TODO: handle push(), currently ignored
|
||||
}
|
||||
};
|
@ -53,7 +53,7 @@ struct cfg_root : cfg::node
|
||||
cfg::_bool spu_accurate_xfloat{ this, "Accurate xfloat", false };
|
||||
cfg::_bool spu_approx_xfloat{ this, "Approximate xfloat", true };
|
||||
cfg::_bool llvm_accurate_dfma{ this, "LLVM Accurate DFMA", true }; // Enable accurate double-precision FMA for CPUs which do not support it natively
|
||||
cfg::_bool llvm_ppu_jm_handling{ this, "PPU LLVM Java Mode Handling", false }; // Respect current Java Mode for alti-vec ops by PPU LLVM
|
||||
cfg::_bool llvm_ppu_jm_handling{ this, "PPU LLVM Java Mode Handling", false }; // Respect current Java Mode for alti-vec ops by PPU LLVM
|
||||
cfg::_int<-1, 14> ppu_128_reservations_loop_max_length{ this, "Accurate PPU 128-byte Reservation Op Max Length", 0, true }; // -1: Always accurate, 0: Never accurate, 1-14: max accurate loop length
|
||||
cfg::_bool llvm_ppu_accurate_vector_nan{ this, "PPU LLVM Accurate Vector NaN values", false };
|
||||
cfg::_int<-64, 64> stub_ppu_traps{ this, "Stub PPU Traps", 0, true }; // Hack, skip PPU traps for rare cases where the trap is continueable (specify relative instructions to skip)
|
||||
@ -73,6 +73,9 @@ struct cfg_root : cfg::node
|
||||
#else
|
||||
sleep_timers_accuracy_level::_usleep, true };
|
||||
#endif
|
||||
|
||||
cfg::uint64 perf_report_threshold{this, "Performance Report Threshold", 500, true}; // In µs, 0.5ms = default, 0 = everything
|
||||
cfg::_bool perf_report{this, "Enable Performance Report", false, true}; // Show certain perf-related logs
|
||||
} core{ this };
|
||||
|
||||
struct node_vfs : cfg::node
|
||||
|
@ -107,6 +107,7 @@
|
||||
<ClCompile Include="Emu\RSX\Overlays\Shaders\shader_loading_dialog.cpp" />
|
||||
<ClCompile Include="Emu\RSX\Overlays\Shaders\shader_loading_dialog_native.cpp" />
|
||||
<ClCompile Include="Emu\system_config_types.cpp" />
|
||||
<ClCompile Include="Emu\perf_meter.cpp" />
|
||||
<ClCompile Include="Emu\title.cpp" />
|
||||
<ClCompile Include="Emu\system_config.cpp" />
|
||||
<ClCompile Include="Emu\NP\fb_helpers.cpp" />
|
||||
@ -758,6 +759,7 @@
|
||||
<ClInclude Include="Emu\RSX\rsx_methods.h" />
|
||||
<ClInclude Include="Emu\RSX\rsx_utils.h" />
|
||||
<ClInclude Include="Emu\System.h" />
|
||||
<ClInclude Include="Emu\perf_meter.hpp" />
|
||||
<ClInclude Include="Emu\GDB.h" />
|
||||
<ClInclude Include="Loader\ELF.h" />
|
||||
<ClInclude Include="Loader\PSF.h" />
|
||||
|
@ -971,6 +971,9 @@
|
||||
<ClCompile Include="Emu\localized_string.cpp">
|
||||
<Filter>Emu</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="Emu\perf_meter.cpp">
|
||||
<Filter>Emu</Filter>
|
||||
</ClCompile>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<ClInclude Include="Crypto\aes.h">
|
||||
@ -1867,6 +1870,9 @@
|
||||
<ClInclude Include="Emu\localized_string_id.h">
|
||||
<Filter>Emu</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="Emu\perf_meter.hpp">
|
||||
<Filter>Emu</Filter>
|
||||
</ClInclude>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<None Include="Emu\RSX\Common\Interpreter\FragmentInterpreter.glsl">
|
||||
|
Loading…
x
Reference in New Issue
Block a user