Mirror of https://github.com/RPCS3/rpcs3.git
LV2: Improve sys_timer_usleep by using CPU usermode waiting
* Linux: set timerslack to the minimum value - Linux delays thread wakeups to save power; this behaviour isn't needed for this application
* Utils: Add detection for the waitpkg and monitorx extensions - these extensions provide the user mode wait instructions
* lv2: Use user mode wait instructions instead of yielding when appropriate
This commit is contained in:
Parent: aee97e414f
Commit: d4cf12bc17
@@ -53,6 +53,17 @@
#include <optional>
#include <deque>
#include "util/tsc.hpp"
#include "util/sysinfo.hpp"

#if defined(ARCH_X64)
#ifdef _MSC_VER
#include <intrin.h>
#include <immintrin.h>
#else
#include <x86intrin.h>
#endif
#endif

extern std::string ppu_get_syscall_name(u64 code);
@@ -1880,6 +1891,35 @@ void lv2_obj::set_yield_frequency(u64 freq, u64 max_allowed_tsc)
	g_lv2_preempts_taken.release(0);
}

#if defined(_MSC_VER)
#define mwaitx_func
#define waitpkg_func
#else
#define mwaitx_func __attribute__((__target__("mwaitx")))
#define waitpkg_func __attribute__((__target__("waitpkg")))
#endif

#if defined(ARCH_X64)
// Waits for a number of TSC clock cycles in power optimized state
// Cstate is represented in bits [7:4]+1 cstate. So C0 requires bits [7:4] to be set to 0xf, C1 requires bits [7:4] to be set to 0.
mwaitx_func static void __mwaitx(u32 cycles, u32 cstate)
{
	constexpr u32 timer_enable = 0x2;

	// monitorx will wake if the cache line is written to. We don't want this, so place the monitor value on its own cache line.
	alignas(64) u64 monitor_var{};
	_mm_monitorx(&monitor_var, 0, 0);
	_mm_mwaitx(timer_enable, cstate, cycles);
}

// First bit indicates cstate, 0x0 for C.02 state (lower power) or 0x1 for C.01 state (higher power)
waitpkg_func static void __tpause(u32 cycles, u32 cstate)
{
	const u64 tsc = utils::get_tsc() + cycles;
	_tpause(cstate, tsc);
}
#endif

bool lv2_obj::wait_timeout(u64 usec, ppu_thread* cpu, bool scale, bool is_usleep)
{
	static_assert(u64{umax} / max_timeout >= 100, "max timeout is not valid for scaling");
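For readers unfamiliar with MWAITX, the C-state hint encoding noted in the __mwaitx comment above (requested C-state equals bits [7:4] plus one, wrapping, so 0xf0 asks for C0 and 0x00 asks for C1) can be expressed as a tiny helper. This is an illustrative sketch, not part of the commit, and assumes exactly the encoding the comment describes:

	// Illustrative only: hints[7:4] = (desired_cstate - 1) & 0xf, so C0 -> 0xf0, C1 -> 0x00.
	#include <cstdint>

	constexpr std::uint32_t mwaitx_hint_for_cstate(std::uint32_t cstate)
	{
		return ((cstate - 1) & 0xf) << 4;
	}

	static_assert(mwaitx_hint_for_cstate(0) == 0xf0); // C0, the value passed to __mwaitx below
	static_assert(mwaitx_hint_for_cstate(1) == 0x00); // C1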
@@ -1965,6 +2005,7 @@ bool lv2_obj::wait_timeout(u64 usec, ppu_thread* cpu, bool scale, bool is_usleep)
	if (remaining > host_min_quantum)
	{
#ifdef __linux__
		// With timerslack set low, Linux is precise for all values above
		wait_for(remaining);
#else
		// Wait on multiple of min quantum for large durations to avoid overloading low thread cpus
@@ -1972,6 +2013,21 @@ bool lv2_obj::wait_timeout(u64 usec, ppu_thread* cpu, bool scale, bool is_usleep)
#endif
	}
	// TODO: Determine best value for yield delay
#if defined(ARCH_X64)
	else if (utils::has_appropriate_um_wait())
	{
		u32 us_in_tsc_clocks = remaining * (utils::get_tsc_freq() / 1000000);

		if (utils::has_waitpkg())
		{
			__tpause(us_in_tsc_clocks, 0x1);
		}
		else
		{
			__mwaitx(us_in_tsc_clocks, 0xf0);
		}
	}
#endif
	else
	{
		// Try yielding. May cause long wake latency but helps weaker CPUs a lot by alleviating resource pressure
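For scale, the microsecond-to-TSC conversion in the hunk above is plain frequency scaling. A small illustrative helper (not from the commit), assuming the TSC frequency is reported in Hz as utils::get_tsc_freq() does:

	// Illustrative only: mirrors `remaining * (utils::get_tsc_freq() / 1000000)` above.
	#include <cstdint>

	constexpr std::uint64_t usec_to_tsc_cycles(std::uint64_t usec, std::uint64_t tsc_freq_hz)
	{
		return usec * (tsc_freq_hz / 1'000'000); // e.g. 50 us at a 3 GHz TSC -> 150'000 cycles
	}

	static_assert(usec_to_tsc_cycles(50, 3'000'000'000ull) == 150'000);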
@@ -1049,6 +1049,11 @@ int main(int argc, char** argv)
		}
	}

	// Set timerslack value for Linux. The default value is 50,000ns. Change this to just 1 since we value precise timers.
#ifdef __linux__
	prctl(PR_SET_TIMERSLACK, 1, 0, 0, 0);
#endif

#ifdef _WIN32
	// Create dummy permanent low resolution timer to workaround messing with system timer resolution
	QTimer* dummy_timer = new QTimer(app.data());
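The prctl call above uses Linux's per-thread timer slack, which by default adds roughly 50 us to sleep deadlines so wakeups can be batched. A hypothetical standalone Linux-only demo (not part of the commit) that compares nanosleep overshoot before and after lowering the slack to 1 ns:

	// Hypothetical demo: measure how much a 100 us nanosleep overshoots
	// with the default timer slack vs. PR_SET_TIMERSLACK = 1 ns.
	#include <stdio.h>
	#include <time.h>
	#include <sys/prctl.h>

	static long long overshoot_ns(long request_ns)
	{
		timespec req{0, request_ns}, t0{}, t1{};
		clock_gettime(CLOCK_MONOTONIC, &t0);
		nanosleep(&req, nullptr);
		clock_gettime(CLOCK_MONOTONIC, &t1);
		const long long elapsed = (t1.tv_sec - t0.tv_sec) * 1000000000LL + (t1.tv_nsec - t0.tv_nsec);
		return elapsed - request_ns;
	}

	int main()
	{
		printf("default slack: overshoot %lld ns\n", overshoot_ns(100000));
		prctl(PR_SET_TIMERSLACK, 1, 0, 0, 0); // same call the hunk above adds to main()
		printf("1 ns slack:    overshoot %lld ns\n", overshoot_ns(100000));
	}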
@@ -298,7 +298,7 @@ bool utils::has_fma4()
bool utils::has_fast_vperm2b()
{
#if defined(ARCH_X64)
-	static const bool g_value = has_avx512() && (get_cpuid(7, 0)[2] & 0x2) == 0x2 && get_cpuid(0, 0)[0] >= 0x7 && (get_cpuid(0x80000001, 0)[2] & 0x20) == 0x20;
+	static const bool g_value = has_avx512() && (get_cpuid(7, 0)[2] & 0x2) == 0x2 && get_cpuid(0, 0)[0] >= 0x7 && (get_cpuid(0x80000001, 0)[2] & 0x40) == 0x40;
	return g_value;
#else
	return false;
@@ -325,6 +325,39 @@ bool utils::has_fsrm()
#endif
}

bool utils::has_waitx()
{
#if defined(ARCH_X64)
	static const bool g_value = get_cpuid(0, 0)[0] >= 0x7 && (get_cpuid(0x80000001, 0)[2] & 0x20000000) == 0x20000000;
	return g_value;
#else
	return false;
#endif
}

bool utils::has_waitpkg()
{
#if defined(ARCH_X64)
	static const bool g_value = get_cpuid(0, 0)[0] >= 0x7 && (get_cpuid(7, 0)[2] & 0x20) == 0x20;
	return g_value;
#else
	return false;
#endif
}

// User mode waits may be unfriendly to low thread CPUs
// Filter out systems with less than 8 threads for linux and less than 12 threads for other platforms
bool utils::has_appropriate_um_wait()
{
#ifdef __linux__
	static const bool g_value = (has_waitx() || has_waitpkg()) && (get_thread_count() >= 8) && get_tsc_freq();
	return g_value;
#else
	static const bool g_value = (has_waitx() || has_waitpkg()) && (get_thread_count() >= 12) && get_tsc_freq();
	return g_value;
#endif
}

u32 utils::get_rep_movsb_threshold()
{
	static const u32 g_value = []()
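The two detectors above read documented CPUID bits: MONITORX/MWAITX is CPUID.80000001H:ECX bit 29 (mask 0x20000000) and WAITPKG is CPUID.07H, subleaf 0, ECX bit 5 (mask 0x20), matching the masks in the diff. A hypothetical standalone probe using GCC/Clang's <cpuid.h> rather than RPCS3's get_cpuid wrapper:

	// Hypothetical GCC/Clang-only probe (not part of the commit), mirroring the bits tested above.
	#include <cpuid.h>
	#include <cstdio>

	int main()
	{
		unsigned a = 0, b = 0, c = 0, d = 0;

		const bool waitpkg = __get_cpuid_count(7, 0, &a, &b, &c, &d) && (c & 0x20);
		const bool mwaitx  = __get_cpuid(0x80000001, &a, &b, &c, &d) && (c & 0x20000000);

		std::printf("waitpkg (tpause/umwait): %d\nmonitorx/mwaitx:         %d\n", waitpkg, mwaitx);
	}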
@@ -53,6 +53,12 @@ namespace utils
	bool has_fsrm();

	bool has_waitx();

	bool has_waitpkg();

	bool has_appropriate_um_wait();

	std::string get_cpu_brand();

	std::string get_system_info();