Implement prefetch_write() and prefetch_exec() wrappers

Do some refactoring to prefetch_read() in util/asm.hpp as well.
Make all these function constexpr because they are no-ops.
This commit is contained in:
Nekotekina 2020-11-24 08:18:31 +03:00
parent 5076da8f77
commit 43952e18e2
6 changed files with 57 additions and 23 deletions

View File

@ -2139,6 +2139,7 @@ thread_base::native_entry thread_base::finalize(u64 _self) noexcept
}
// Return new entry point
utils::prefetch_exec((*tls)->entry_point);
return (*tls)->entry_point;
}

View File

@ -869,7 +869,7 @@ bool cpu_thread::suspend_work::push(cpu_thread* _this) noexcept
{
if (cpu != _this)
{
_m_prefetchw(&cpu->state);
utils::prefetch_write(&cpu->state);
return true;
}
@ -946,13 +946,13 @@ bool cpu_thread::suspend_work::push(cpu_thread* _this) noexcept
{
for (u32 i = 0; i < work->prf_size; i++)
{
_m_prefetchw(work->prf_list[0]);
utils::prefetch_write(work->prf_list[0]);
}
}
cpu_counter::for_all_cpu(copy2, [&](cpu_thread* cpu)
{
_m_prefetchw(&cpu->state);
utils::prefetch_write(&cpu->state);
return true;
});

View File

@ -64,6 +64,7 @@
#include <thread>
#include <cfenv>
#include <cctype>
#include "util/asm.hpp"
#include "util/vm.hpp"
const bool s_use_ssse3 = utils::has_ssse3();
@ -1749,8 +1750,8 @@ static bool ppu_store_reservation(ppu_thread& ppu, u32 addr, u64 reg_value)
return false;
}
_m_prefetchw(ppu.rdata);
_m_prefetchw(ppu.rdata + 64);
utils::prefetch_read(ppu.rdata);
utils::prefetch_read(ppu.rdata + 64);
ppu.last_faddr = addr;
ppu.last_ftime = res.load() & -128;
ppu.last_ftsc = __rdtsc();

View File

@ -1932,7 +1932,7 @@ void spu_thread::do_dma_transfer(spu_thread* _this, const spu_mfc_cmd& args, u8*
range_lock = _this->range_lock;
}
_m_prefetchw(range_lock);
utils::prefetch_write(range_lock);
for (u32 size = args.size, size0; is_get; size -= size0, dst += size0, src += size0, eal += size0)
{
@ -2667,8 +2667,8 @@ bool spu_thread::do_putllc(const spu_mfc_cmd& args)
return false;
}
_m_prefetchw(rdata);
_m_prefetchw(rdata + 64);
utils::prefetch_read(rdata);
utils::prefetch_read(rdata + 64);
last_faddr = addr;
last_ftime = res.load() & -128;
last_ftsc = __rdtsc();

View File

@ -67,17 +67,52 @@ namespace utils
}
};
#if defined(__GNUG__)
inline void prefetch_read(const void* ptr)
// Try to prefetch to Level 2 cache since it's not split to data/code on most processors
template <typename T>
constexpr void prefetch_exec(T func)
{
	// constexpr no-op: prefetching is meaningless during constant evaluation
	if (std::is_constant_evaluated())
	{
		return;
	}

	// Function pointers cannot be cast to void* directly; launder through an integer
	const u64 value = reinterpret_cast<u64>(func);
	const void* ptr = reinterpret_cast<const void*>(value);

#ifdef _MSC_VER
	return _mm_prefetch(reinterpret_cast<const char*>(ptr), _MM_HINT_T1);
#else
	// rw=0 (read), locality=2 (prefer L2) — matches _MM_HINT_T1 on the MSVC side
	return __builtin_prefetch(ptr, 0, 2);
#endif
}
// Try to prefetch to Level 1 cache
constexpr void prefetch_read(const void* ptr)
{
	// Only emit the hint at runtime; during constant evaluation this is a no-op.
	// __builtin_is_constant_evaluated() is the compiler primitive that
	// std::is_constant_evaluated() is specified to wrap.
	if (!__builtin_is_constant_evaluated())
	{
#ifdef _MSC_VER
		_mm_prefetch(reinterpret_cast<const char*>(ptr), _MM_HINT_T0);
#else
		// rw=0 (read), locality=3 (keep in all cache levels, i.e. L1)
		__builtin_prefetch(ptr, 0, 3);
#endif
	}
}
// Try to prefetch with intent to write
constexpr void prefetch_write(void* ptr)
{
	// constexpr no-op: prefetching is meaningless during constant evaluation
	if (std::is_constant_evaluated())
	{
		return;
	}

#ifdef _MSC_VER
	return _m_prefetchw(ptr);
#else
	// _m_prefetchw requires PRFCHW/3DNow support on GNU compilers;
	// __builtin_prefetch with rw=1 is always available and emits PREFETCHW
	// where the target supports it. Also matches the _MSC_VER/GNU split
	// used by prefetch_read() and prefetch_exec().
	return __builtin_prefetch(ptr, 1, 3);
#endif
}
#if defined(__GNUG__)
inline u8 rol8(u8 x, u8 n)
{
#if __has_builtin(__builtin_rotateleft8)
@ -231,11 +266,6 @@ namespace utils
}
#elif defined(_MSC_VER)
inline void prefetch_read(const void* ptr)
{
return _mm_prefetch(reinterpret_cast<const char*>(ptr), _MM_HINT_T0);
}
inline u8 rol8(u8 x, u8 n)
{
return _rotl8(x, n);

View File

@ -680,15 +680,17 @@ static void cond_free(u32 cond_id, u32 tls_slot = -1)
}
// Call the destructor if necessary
cond->destroy();
utils::prefetch_write(s_cond_bits + cond_id / 64);
const u32 level3 = cond_id / 64 % 16;
const u32 level2 = cond_id / 1024 % 8;
const u32 level1 = cond_id / 8192 % 8;
_m_prefetchw(s_cond_sem3 + level2);
_m_prefetchw(s_cond_sem2 + level1);
_m_prefetchw(&s_cond_sem1);
utils::prefetch_write(s_cond_sem3 + level2);
utils::prefetch_write(s_cond_sem2 + level1);
utils::prefetch_write(&s_cond_sem1);
cond->destroy();
// Release the semaphore tree in the reverse order
s_cond_bits[cond_id / 64] &= ~(1ull << (cond_id % 64));