From 43952e18e264abdeb15979dae419a8f0ecfc74e2 Mon Sep 17 00:00:00 2001
From: Nekotekina
Date: Tue, 24 Nov 2020 08:18:31 +0300
Subject: [PATCH] Implement prefetch_write() and prefetch_exec() wrappers

Do some refactoring to prefetch_read() in util/asm.hpp as well.
Make all these functions constexpr, since they are effectively no-ops
(they simply return during constant evaluation).
---
 Utilities/Thread.cpp         |  1 +
 rpcs3/Emu/CPU/CPUThread.cpp  |  6 ++---
 rpcs3/Emu/Cell/PPUThread.cpp |  5 ++--
 rpcs3/Emu/Cell/SPUThread.cpp |  6 ++---
 rpcs3/util/asm.hpp           | 52 ++++++++++++++++++++++++++++--------
 rpcs3/util/atomic.cpp        | 10 ++++---
 6 files changed, 57 insertions(+), 23 deletions(-)

diff --git a/Utilities/Thread.cpp b/Utilities/Thread.cpp
index 168d8a07d7..1e750f66d7 100644
--- a/Utilities/Thread.cpp
+++ b/Utilities/Thread.cpp
@@ -2139,6 +2139,7 @@ thread_base::native_entry thread_base::finalize(u64 _self) noexcept
 	}
 
 	// Return new entry point
+	utils::prefetch_exec((*tls)->entry_point);
 	return (*tls)->entry_point;
 }
 
diff --git a/rpcs3/Emu/CPU/CPUThread.cpp b/rpcs3/Emu/CPU/CPUThread.cpp
index 0567e367a6..41ecbdee0e 100644
--- a/rpcs3/Emu/CPU/CPUThread.cpp
+++ b/rpcs3/Emu/CPU/CPUThread.cpp
@@ -869,7 +869,7 @@ bool cpu_thread::suspend_work::push(cpu_thread* _this) noexcept
 		{
 			if (cpu != _this)
 			{
-				_m_prefetchw(&cpu->state);
+				utils::prefetch_write(&cpu->state);
 				return true;
 			}
 
@@ -946,13 +946,13 @@ bool cpu_thread::suspend_work::push(cpu_thread* _this) noexcept
 		{
 			for (u32 i = 0; i < work->prf_size; i++)
 			{
-				_m_prefetchw(work->prf_list[0]);
+				utils::prefetch_write(work->prf_list[0]);
 			}
 		}
 
 		cpu_counter::for_all_cpu(copy2, [&](cpu_thread* cpu)
 		{
-			_m_prefetchw(&cpu->state);
+			utils::prefetch_write(&cpu->state);
 			return true;
 		});
 
diff --git a/rpcs3/Emu/Cell/PPUThread.cpp b/rpcs3/Emu/Cell/PPUThread.cpp
index 986545512f..76425d2f58 100644
--- a/rpcs3/Emu/Cell/PPUThread.cpp
+++ b/rpcs3/Emu/Cell/PPUThread.cpp
@@ -64,6 +64,7 @@
 #include 
 #include 
 #include 
+#include "util/asm.hpp"
 #include "util/vm.hpp"
 
 const bool s_use_ssse3 = utils::has_ssse3();
@@ -1749,8 +1750,8 @@ static bool ppu_store_reservation(ppu_thread& ppu, u32 addr, u64 reg_value)
 			return false;
 		}
 
-		_m_prefetchw(ppu.rdata);
-		_m_prefetchw(ppu.rdata + 64);
+		utils::prefetch_read(ppu.rdata);
+		utils::prefetch_read(ppu.rdata + 64);
 		ppu.last_faddr = addr;
 		ppu.last_ftime = res.load() & -128;
 		ppu.last_ftsc = __rdtsc();
diff --git a/rpcs3/Emu/Cell/SPUThread.cpp b/rpcs3/Emu/Cell/SPUThread.cpp
index 5d28d56b55..0640515959 100644
--- a/rpcs3/Emu/Cell/SPUThread.cpp
+++ b/rpcs3/Emu/Cell/SPUThread.cpp
@@ -1932,7 +1932,7 @@ void spu_thread::do_dma_transfer(spu_thread* _this, const spu_mfc_cmd& args, u8*
 		range_lock = _this->range_lock;
 	}
 
-	_m_prefetchw(range_lock);
+	utils::prefetch_write(range_lock);
 
 	for (u32 size = args.size, size0; is_get; size -= size0, dst += size0, src += size0, eal += size0)
 	{
@@ -2667,8 +2667,8 @@ bool spu_thread::do_putllc(const spu_mfc_cmd& args)
 			return false;
 		}
 
-		_m_prefetchw(rdata);
-		_m_prefetchw(rdata + 64);
+		utils::prefetch_read(rdata);
+		utils::prefetch_read(rdata + 64);
 		last_faddr = addr;
 		last_ftime = res.load() & -128;
 		last_ftsc = __rdtsc();
diff --git a/rpcs3/util/asm.hpp b/rpcs3/util/asm.hpp
index a2291137ea..31fe5bbee9 100644
--- a/rpcs3/util/asm.hpp
+++ b/rpcs3/util/asm.hpp
@@ -67,17 +67,52 @@ namespace utils
 		}
 	};
 
-#if defined(__GNUG__)
-
-	inline void prefetch_read(const void* ptr)
+	// Try to prefetch to Level 2 cache since it's not split to data/code on most processors
+	template <typename T>
+	constexpr void prefetch_exec(T func)
 	{
-#if __has_builtin(__builtin_prefetch)
-		return __builtin_prefetch(ptr);
+		if (std::is_constant_evaluated())
+		{
+			return;
+		}
+
+		const u64 value = reinterpret_cast<u64>(func);
+		const void* ptr = reinterpret_cast<const void*>(value);
+
+#ifdef _MSC_VER
+		return _mm_prefetch(reinterpret_cast<const char*>(ptr), _MM_HINT_T1);
 #else
-		__asm__ volatile ("prefetcht0 0(%[ptr])" : : [ptr] "r" (ptr));
+		return __builtin_prefetch(ptr, 0, 2);
 #endif
 	}
 
+	// Try to prefetch to Level 1 cache
+	constexpr void prefetch_read(const void* ptr)
+	{
+		if (std::is_constant_evaluated())
+		{
+			return;
+		}
+
+#ifdef _MSC_VER
+		return _mm_prefetch(reinterpret_cast<const char*>(ptr), _MM_HINT_T0);
+#else
+		return __builtin_prefetch(ptr, 0, 3);
+#endif
+	}
+
+	constexpr void prefetch_write(void* ptr)
+	{
+		if (std::is_constant_evaluated())
+		{
+			return;
+		}
+
+		return _m_prefetchw(ptr);
+	}
+
+#if defined(__GNUG__)
+
 	inline u8 rol8(u8 x, u8 n)
 	{
 #if __has_builtin(__builtin_rotateleft8)
@@ -231,11 +266,6 @@ namespace utils
 	}
 
 #elif defined(_MSC_VER)
-	inline void prefetch_read(const void* ptr)
-	{
-		return _mm_prefetch(reinterpret_cast<const char*>(ptr), _MM_HINT_T0);
-	}
-
 	inline u8 rol8(u8 x, u8 n)
 	{
 		return _rotl8(x, n);
diff --git a/rpcs3/util/atomic.cpp b/rpcs3/util/atomic.cpp
index 62dbda0d58..8f90a7fa4a 100644
--- a/rpcs3/util/atomic.cpp
+++ b/rpcs3/util/atomic.cpp
@@ -680,15 +680,17 @@ static void cond_free(u32 cond_id, u32 tls_slot = -1)
 	}
 
 	// Call the destructor if necessary
-	cond->destroy();
+	utils::prefetch_write(s_cond_bits + cond_id / 64);
 
 	const u32 level3 = cond_id / 64 % 16;
 	const u32 level2 = cond_id / 1024 % 8;
 	const u32 level1 = cond_id / 8192 % 8;
 
-	_m_prefetchw(s_cond_sem3 + level2);
-	_m_prefetchw(s_cond_sem2 + level1);
-	_m_prefetchw(&s_cond_sem1);
+	utils::prefetch_write(s_cond_sem3 + level2);
+	utils::prefetch_write(s_cond_sem2 + level1);
+	utils::prefetch_write(&s_cond_sem1);
+
+	cond->destroy();
 
 	// Release the semaphore tree in the reverse order
 	s_cond_bits[cond_id / 64] &= ~(1ull << (cond_id % 64));
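
Usage sketch (not part of the patch): the standalone C++20 program below approximates the
prefetch_exec()/prefetch_read()/prefetch_write() wrappers introduced in util/asm.hpp so the
pattern can be tried outside the rpcs3 tree. The "demo" namespace, the hot_path() function,
the build commands in the comment and the use of __builtin_prefetch(ptr, 1, 3) as a portable
stand-in for _m_prefetchw on GCC/Clang are assumptions for illustration only; the patch itself
calls _m_prefetchw directly.

// prefetch_demo.cpp - minimal sketch of the wrapper pattern used above.
// Build (assumption): g++ -std=c++20 -O2 prefetch_demo.cpp  or  cl /std:c++20 /O2 prefetch_demo.cpp
#include <cstdint>
#include <cstdio>
#include <type_traits>

#ifdef _MSC_VER
#include <intrin.h> // _mm_prefetch, _m_prefetchw, _MM_HINT_T0 / _MM_HINT_T1
#endif

namespace demo // hypothetical namespace standing in for rpcs3's utils
{
	// Hint that the code at 'func' will be executed soon (L2 hint, since L1 is split into I/D).
	template <typename T>
	constexpr void prefetch_exec(T func)
	{
		if (std::is_constant_evaluated())
		{
			return; // no-op during constant evaluation
		}

		const std::uint64_t value = reinterpret_cast<std::uint64_t>(func);
		const void* ptr = reinterpret_cast<const void*>(value);

#ifdef _MSC_VER
		_mm_prefetch(static_cast<const char*>(ptr), _MM_HINT_T1);
#else
		__builtin_prefetch(ptr, 0, 2);
#endif
	}

	// Hint that 'ptr' will be read soon (L1 hint).
	constexpr void prefetch_read(const void* ptr)
	{
		if (std::is_constant_evaluated())
		{
			return;
		}

#ifdef _MSC_VER
		_mm_prefetch(static_cast<const char*>(ptr), _MM_HINT_T0);
#else
		__builtin_prefetch(ptr, 0, 3);
#endif
	}

	// Hint that 'ptr' will be written soon. The patch uses _m_prefetchw unconditionally;
	// this sketch substitutes __builtin_prefetch's write hint on GCC/Clang for portability.
	constexpr void prefetch_write(void* ptr)
	{
		if (std::is_constant_evaluated())
		{
			return;
		}

#ifdef _MSC_VER
		_m_prefetchw(ptr);
#else
		__builtin_prefetch(ptr, 1, 3);
#endif
	}
}

static void hot_path()
{
	std::puts("hot path");
}

int main()
{
	int state = 0;

	demo::prefetch_write(&state);   // about to modify 'state'
	demo::prefetch_exec(&hot_path); // about to jump into hot_path()

	state += 1;
	hot_path();

	demo::prefetch_read(&state);    // about to read 'state' again
	return state == 1 ? 0 : 1;
}

Prefetches are pure hints with no observable effect, which is why the wrappers can be constexpr
and simply return under std::is_constant_evaluated(): callers such as cond_free() or
thread_base::finalize() keep the hints in code that may also be reached during constant evaluation.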