Implement perf stat counter for PPU/SPU reservation ops

Adds Emu/perf_meter.hpp header file.
Uses RDTSC for speed.
Prints stats at exit.
This commit is contained in:
Nekotekina 2020-10-18 15:00:10 +03:00
parent adf50b7c4b
commit 120849c734
10 changed files with 281 additions and 8 deletions

View File

@ -7,6 +7,7 @@
VFS.cpp
GDB.cpp
title.cpp
perf_meter.cpp
)
# prevent WolfSSL from warning about not having harden options

View File

@ -8,6 +8,7 @@
#include "Emu/GDB.h"
#include "Emu/Cell/PPUThread.h"
#include "Emu/Cell/SPUThread.h"
#include "Emu/perf_meter.hpp"
#include <thread>
#include <unordered_map>
@ -775,6 +776,9 @@ bool cpu_thread::suspend_work::push(cpu_thread* _this, bool cancel_if_not_suspen
if (!next)
{
// Monitor the performance only of the actual suspend processing owner
perf_meter<"SUSPEND"_u64> perf0;
// First thread to push the work to the workload list pauses all threads and processes it
std::lock_guard lock(ctr->cpu_suspend_lock);
@ -911,13 +915,12 @@ void cpu_thread::stop_all() noexcept
std::this_thread::sleep_for(10ms);
}
sys_log.notice("All CPU threads have been stopped. [+: %u; suspends: %u]", +g_threads_created, +g_suspend_counter);
sys_log.notice("All CPU threads have been stopped. [+: %u]", +g_threads_created);
std::lock_guard lock(g_fxo->get<cpu_counter>()->cpu_suspend_lock);
g_threads_deleted -= g_threads_created.load();
g_threads_created = 0;
g_suspend_counter = 0;
}
void cpu_thread::flush_profilers() noexcept

View File

@ -3,6 +3,7 @@
#include "Utilities/sysinfo.h"
#include "Utilities/JIT.h"
#include "Crypto/sha1.h"
#include "Emu/perf_meter.hpp"
#include "Emu/Memory/vm_reservation.h"
#include "Emu/RSX/RSXThread.h"
#include "Emu/VFS.h"
@ -1282,9 +1283,9 @@ static T ppu_load_acquire_reservation(ppu_thread& ppu, u32 addr)
std::memcpy(&rdata, &ppu.rdata[addr & 0x78], 8);
}
if (count >= 15) [[unlikely]]
if (count >= 15 && g_cfg.core.perf_report) [[unlikely]]
{
ppu_log.warning("%s took too long: %u", sizeof(T) == 4 ? "LWARX" : "LDARX", count);
perf_log.warning("%s: took too long: %u", sizeof(T) == 4 ? "LWARX" : "LDARX", count);
}
ppu.rtime &= ~vm::rsrv_shared_mask;
@ -1553,6 +1554,8 @@ const auto ppu_stcx_accurate_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime
template <typename T>
static bool ppu_store_reservation(ppu_thread& ppu, u32 addr, u64 reg_value)
{
perf_meter<"STCX"_u32> perf0;
if (addr % sizeof(T))
{
fmt::throw_exception("PPU %s: Unaligned address: 0x%08x" HERE, sizeof(T) == 4 ? "STWCX" : "STDCX", addr);

View File

@ -10,6 +10,7 @@
#include "Loader/ELF.h"
#include "Emu/VFS.h"
#include "Emu/IdManager.h"
#include "Emu/perf_meter.hpp"
#include "Emu/RSX/RSXThread.h"
#include "Emu/Cell/PPUThread.h"
#include "Emu/Cell/ErrorCodes.h"
@ -2055,11 +2056,16 @@ bool spu_thread::do_list_transfer(spu_mfc_cmd& args)
bool spu_thread::do_putllc(const spu_mfc_cmd& args)
{
perf_meter<"PUTLLC-"_u64> perf0;
perf_meter<"PUTLLC+"_u64> perf1 = perf0;
// Store conditionally
const u32 addr = args.eal & -128;
if ([&]()
{
perf_meter<"PUTLLC."_u64> perf2 = perf0;
if (raddr != addr)
{
return false;
@ -2163,6 +2169,7 @@ bool spu_thread::do_putllc(const spu_mfc_cmd& args)
{
vm::reservation_notifier(addr, 128).notify_all();
raddr = 0;
perf0.reset();
return true;
}
else
@ -2182,12 +2189,15 @@ bool spu_thread::do_putllc(const spu_mfc_cmd& args)
}
raddr = 0;
perf1.reset();
return false;
}
}
void do_cell_atomic_128_store(u32 addr, const void* to_write)
{
perf_meter<"STORE128"_u64> perf0;
const auto cpu = get_current_cpu_thread();
if (g_use_rtm) [[likely]]
@ -2237,6 +2247,8 @@ void do_cell_atomic_128_store(u32 addr, const void* to_write)
void spu_thread::do_putlluc(const spu_mfc_cmd& args)
{
perf_meter<"PUTLLUC"_u64> perf0;
const u32 addr = args.eal & -128;
if (raddr && addr == raddr)
@ -2419,6 +2431,8 @@ bool spu_thread::process_mfc_cmd()
std::this_thread::yield();
}
perf_meter<"GETLLAR"_u64> perf0;
alignas(64) spu_rdata_t temp;
u64 ntime;
@ -2506,9 +2520,9 @@ bool spu_thread::process_mfc_cmd()
continue;
}
if (g_use_rtm && i >= 15) [[unlikely]]
if (g_use_rtm && i >= 15 && g_cfg.core.perf_report) [[unlikely]]
{
spu_log.warning("GETLLAR took too long: %u", i);
perf_log.warning("GETLLAR: took too long: %u", i);
}
break;

View File

@ -14,7 +14,7 @@
#include "Emu/Cell/lv2/sys_memory.h"
#include "Emu/RSX/GSRender.h"
#include "Emu/Cell/SPURecompiler.h"
#include <atomic>
#include "Emu/perf_meter.hpp"
#include <thread>
#include <deque>
@ -391,6 +391,8 @@ namespace vm
if (addr >= 0x10000)
{
perf_meter<"SUSPEND"_u64> perf0;
for (auto lock = g_locks.cbegin(), end = lock + g_cfg.core.ppu_threads; lock != end; lock++)
{
if (auto ptr = +*lock; ptr && !(ptr->state & cpu_flag::memory))

58
rpcs3/Emu/perf_meter.cpp Normal file
View File

@ -0,0 +1,58 @@
#include "stdafx.h"
#include "perf_meter.hpp"
void perf_stat_base::push(u64 ns[66]) noexcept
{
for (u32 i = 0; i < 66; i++)
{
m_log[i] += ns[i];
}
}
void perf_stat_base::print(const char* name) noexcept
{
if (u64 num_total = m_log[0].load())
{
perf_log.notice("Perf stats for %s: total events: %u (total time %.4fs)", name, num_total, m_log[65].load() / 1000'000'000.);
for (u32 i = 0; i < 13; i++)
{
if (u64 count = m_log[i + 1].load())
{
perf_log.notice("Perf stats for %s: events < %.3fµs: %u", name, std::pow(2., i) / 1000., count);
}
}
for (u32 i = 14; i < 23; i++)
{
if (u64 count = m_log[i + 1].load()) [[unlikely]]
{
perf_log.notice("Perf stats for %s: events < %.3fms: %u", name, std::pow(2., i) / 1000'000., count);
}
}
for (u32 i = 24; i < 33; i++)
{
if (u64 count = m_log[i + 1].load()) [[unlikely]]
{
perf_log.notice("Perf stats for %s: events < %.3fs: %u", name, std::pow(2., i) / 1000'000'000., count);
}
}
for (u32 i = 34; i < 43; i++)
{
if (u64 count = m_log[i + 1].load()) [[unlikely]]
{
perf_log.notice("Perf stats for %s: events < %.0f SEC: %u", name, std::pow(2., i) / 1000'000'000., count);
}
}
for (u32 i = 44; i < 63; i++)
{
if (u64 count = m_log[i + 1].load()) [[unlikely]]
{
perf_log.notice("Perf stats for %s: events < %.0f MIN: %u", name, std::pow(2., i) / 60'000'000'000., count);
}
}
}
}

181
rpcs3/Emu/perf_meter.hpp Normal file
View File

@ -0,0 +1,181 @@
#pragma once
#include "Utilities/types.h"
#include "util/logs.hpp"
#include "Utilities/sysinfo.h"
#include "system_config.h"
#include "IdManager.h"
#include <array>
#include <cmath>
#include <thread>
#include <unordered_map>
LOG_CHANNEL(perf_log, "PERF");
// TODO: constexpr with the help of bitcast
template <auto Name>
inline const auto perf_name = []
{
constexpr auto short_name = Name;
std::array<char, sizeof(Name) + 1> result{};
std::memcpy(result.data(), &short_name, sizeof(Name));
return result;
}();
class perf_stat_base
{
atomic_t<u64> m_log[66]{};
protected:
// Print accumulated values
void print(const char* name) noexcept;
// Accumulate values from a thread
void push(u64 ns[66]) noexcept;
public:
perf_stat_base() noexcept = default;
perf_stat_base(const perf_stat_base&) = delete;
perf_stat_base& operator =(const perf_stat_base&) = delete;
~perf_stat_base() {}
};
// Object that prints event length stats at the end
template <auto ShortName>
class perf_stat final : public perf_stat_base
{
static inline thread_local struct perf_stat_local
{
// Local non-atomic values for increments
u64 m_log[66]{};
~perf_stat_local()
{
// Update on thread exit
g_fxo->get<perf_stat>()->perf_stat_base::push(m_log);
}
} g_tls_perf_stat;
public:
~perf_stat()
{
perf_stat_base::print(perf_name<ShortName>.data());
}
void push(u64 ns) noexcept
{
auto& data = g_tls_perf_stat.m_log;
data[0] += ns != 0;
data[64 - std::countl_zero(ns)]++;
data[65] += ns;
}
};
// Object that prints event length at the end
template <auto ShortName, auto... SubEvents>
class perf_meter
{
// Initialize array (possibly only 1 element) with timestamp
u64 m_timestamps[1 + sizeof...(SubEvents)] = {__rdtsc()};
public:
SAFE_BUFFERS perf_meter() noexcept
{
m_timestamps[0] = __rdtsc();
std::memset(m_timestamps + 1, 0, sizeof(m_timestamps) - sizeof(u64));
}
// Copy first timestamp
template <auto SN, auto... S>
SAFE_BUFFERS perf_meter(const perf_meter<SN, S...>& r) noexcept
{
m_timestamps[0] = r.get();
std::memset(m_timestamps + 1, 0, sizeof(m_timestamps) - sizeof(u64));
}
template <auto SN, auto... S>
SAFE_BUFFERS perf_meter(perf_meter<SN, S...>&& r) noexcept
{
m_timestamps[0] = r.get();
r.reset();
}
// Copy first timestamp
template <auto SN, auto... S>
SAFE_BUFFERS perf_meter& operator =(const perf_meter<SN, S...>& r) noexcept
{
m_timestamps[0] = r.get();
return *this;
}
template <auto SN, auto... S>
SAFE_BUFFERS perf_meter& operator =(perf_meter<SN, S...>& r) noexcept
{
m_timestamps[0] = r.get();
r.reset();
return *this;
}
// Push subevent data in array
template <auto Event, std::size_t Index = 0>
SAFE_BUFFERS void push() noexcept
{
// TODO: should use more efficient search with type comparison, then value comparison, or pattern matching
if constexpr (std::array<bool, sizeof...(SubEvents)>{(SubEvents == Event)...}[Index])
{
// Push actual timestamp into an array
m_timestamps[Index + 1] = __rdtsc();
}
else if constexpr (Index < sizeof...(SubEvents))
{
// Proceed search recursively
push<Event, Index + 1>();
}
}
// Obtain initial timestamp
u64 get() const noexcept
{
return m_timestamps[0];
}
// Disable this counter
SAFE_BUFFERS void reset() noexcept
{
m_timestamps[0] = 0;
}
SAFE_BUFFERS ~perf_meter()
{
// Disabled counter
if (!m_timestamps[0]) [[unlikely]]
{
return;
}
if (!g_cfg.core.perf_report) [[likely]]
{
return;
}
// Event end
const u64 end_time = __rdtsc();
// Compute difference in seconds
const f64 diff = (end_time - m_timestamps[0]) * 1. / utils::get_tsc_freq();
// Register perf stat in nanoseconds
g_fxo->get<perf_stat<ShortName>>()->push(static_cast<u64>(diff * 1000'000'000.));
// Print in microseconds
if (static_cast<u64>(diff * 1000'000.) >= g_cfg.core.perf_report_threshold)
{
perf_log.notice("%s: %.3fµs", perf_name<ShortName>.data(), diff * 1000'000.);
}
// TODO: handle push(), currently ignored
}
};

View File

@ -53,7 +53,7 @@ struct cfg_root : cfg::node
cfg::_bool spu_accurate_xfloat{ this, "Accurate xfloat", false };
cfg::_bool spu_approx_xfloat{ this, "Approximate xfloat", true };
cfg::_bool llvm_accurate_dfma{ this, "LLVM Accurate DFMA", true }; // Enable accurate double-precision FMA for CPUs which do not support it natively
cfg::_bool llvm_ppu_jm_handling{ this, "PPU LLVM Java Mode Handling", false }; // Respect current Java Mode for alti-vec ops by PPU LLVM
cfg::_bool llvm_ppu_jm_handling{ this, "PPU LLVM Java Mode Handling", false }; // Respect current Java Mode for alti-vec ops by PPU LLVM
cfg::_int<-1, 14> ppu_128_reservations_loop_max_length{ this, "Accurate PPU 128-byte Reservation Op Max Length", 0, true }; // -1: Always accurate, 0: Never accurate, 1-14: max accurate loop length
cfg::_bool llvm_ppu_accurate_vector_nan{ this, "PPU LLVM Accurate Vector NaN values", false };
cfg::_int<-64, 64> stub_ppu_traps{ this, "Stub PPU Traps", 0, true }; // Hack, skip PPU traps for rare cases where the trap is continueable (specify relative instructions to skip)
@ -73,6 +73,9 @@ struct cfg_root : cfg::node
#else
sleep_timers_accuracy_level::_usleep, true };
#endif
cfg::uint64 perf_report_threshold{this, "Performance Report Threshold", 500, true}; // In µs, 0.5ms = default, 0 = everything
cfg::_bool perf_report{this, "Enable Performance Report", false, true}; // Show certain perf-related logs
} core{ this };
struct node_vfs : cfg::node

View File

@ -107,6 +107,7 @@
<ClCompile Include="Emu\RSX\Overlays\Shaders\shader_loading_dialog.cpp" />
<ClCompile Include="Emu\RSX\Overlays\Shaders\shader_loading_dialog_native.cpp" />
<ClCompile Include="Emu\system_config_types.cpp" />
<ClCompile Include="Emu\perf_meter.cpp" />
<ClCompile Include="Emu\title.cpp" />
<ClCompile Include="Emu\system_config.cpp" />
<ClCompile Include="Emu\NP\fb_helpers.cpp" />
@ -758,6 +759,7 @@
<ClInclude Include="Emu\RSX\rsx_methods.h" />
<ClInclude Include="Emu\RSX\rsx_utils.h" />
<ClInclude Include="Emu\System.h" />
<ClInclude Include="Emu\perf_meter.hpp" />
<ClInclude Include="Emu\GDB.h" />
<ClInclude Include="Loader\ELF.h" />
<ClInclude Include="Loader\PSF.h" />

View File

@ -971,6 +971,9 @@
<ClCompile Include="Emu\localized_string.cpp">
<Filter>Emu</Filter>
</ClCompile>
<ClCompile Include="Emu\perf_meter.cpp">
<Filter>Emu</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="Crypto\aes.h">
@ -1867,6 +1870,9 @@
<ClInclude Include="Emu\localized_string_id.h">
<Filter>Emu</Filter>
</ClInclude>
<ClInclude Include="Emu\perf_meter.hpp">
<Filter>Emu</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<None Include="Emu\RSX\Common\Interpreter\FragmentInterpreter.glsl">