Replace utils::cntlz{32,64} with std::countl_zero

This commit is contained in:
Nekotekina 2020-04-13 21:57:16 +03:00
parent d0c199d455
commit 032e7c0491
11 changed files with 22 additions and 47 deletions

View File

@ -4,30 +4,6 @@
namespace utils namespace utils
{ {
// Count the leading zero bits of a 32-bit value.
// nonzero: caller's promise that arg != 0, which lets the explicit zero check be skipped.
// Without that promise a zero argument yields 32 in the _BitScanReverse and __builtin_clz paths.
// The LZCNT path ignores the flag and forwards the argument straight to _lzcnt_u32.
inline u32 cntlz32(u32 arg, bool nonzero = false)
{
#if defined(_MSC_VER)
	ulong top_bit;
	const bool found = _BitScanReverse(&top_bit, arg) != 0;

	if (!found && !nonzero)
	{
		return 32;
	}

	// top_bit is the index of the highest set bit; XOR with 31 turns it into a leading-zero count
	return top_bit ^ 31;
#elif __LZCNT__
	return _lzcnt_u32(arg);
#else
	if (arg == 0 && !nonzero)
	{
		return 32;
	}

	return __builtin_clz(arg);
#endif
}
// Count the leading zero bits of a 64-bit value.
// nonzero: caller's promise that arg != 0, which lets the explicit zero check be skipped.
// Without that promise a zero argument yields 64 in the _BitScanReverse64 and __builtin_clzll paths.
// The LZCNT path ignores the flag and forwards the argument straight to _lzcnt_u64.
inline u64 cntlz64(u64 arg, bool nonzero = false)
{
#if defined(_MSC_VER)
	ulong top_bit;
	const bool found = _BitScanReverse64(&top_bit, arg) != 0;

	if (!found && !nonzero)
	{
		return 64;
	}

	// top_bit is the index of the highest set bit; XOR with 63 turns it into a leading-zero count
	return top_bit ^ 63;
#elif __LZCNT__
	return _lzcnt_u64(arg);
#else
	if (arg == 0 && !nonzero)
	{
		return 64;
	}

	return __builtin_clzll(arg);
#endif
}
inline u8 popcnt32(u32 arg) inline u8 popcnt32(u32 arg)
{ {
#ifdef _MSC_VER #ifdef _MSC_VER

View File

@ -59,7 +59,7 @@ std::size_t cfmt_append(Dst& out, const Char* fmt, Src&& src)
const auto write_octal = [&](u64 value, u64 min_num) const auto write_octal = [&](u64 value, u64 min_num)
{ {
out.resize(out.size() + std::max<u64>(min_num, 66 / 3 - (utils::cntlz64(value | 1, true) + 2) / 3), '0'); out.resize(out.size() + std::max<u64>(min_num, 66 / 3 - (std::countl_zero<u64>(value | 1) + 2) / 3), '0');
// Write in reversed order // Write in reversed order
for (auto i = out.rbegin(); value; i++, value /= 8) for (auto i = out.rbegin(); value; i++, value /= 8)
@ -70,7 +70,7 @@ std::size_t cfmt_append(Dst& out, const Char* fmt, Src&& src)
const auto write_hex = [&](u64 value, bool upper, u64 min_num) const auto write_hex = [&](u64 value, bool upper, u64 min_num)
{ {
out.resize(out.size() + std::max<u64>(min_num, 64 / 4 - utils::cntlz64(value | 1, true) / 4), '0'); out.resize(out.size() + std::max<u64>(min_num, 64 / 4 - std::countl_zero<u64>(value | 1) / 4), '0');
// Write in reversed order // Write in reversed order
for (auto i = out.rbegin(); value; i++, value /= 16) for (auto i = out.rbegin(); value; i++, value /= 16)

View File

@ -2114,7 +2114,7 @@ s32 _spurs::add_workload(vm::ptr<CellSpurs> spurs, vm::ptr<u32> wid, vm::cptr<vo
const u32 wmax = spurs->flags1 & SF1_32_WORKLOADS ? 0x20u : 0x10u; // TODO: check if can be changed const u32 wmax = spurs->flags1 & SF1_32_WORKLOADS ? 0x20u : 0x10u; // TODO: check if can be changed
spurs->wklEnabled.atomic_op([spurs, wmax, &wnum](be_t<u32>& value) spurs->wklEnabled.atomic_op([spurs, wmax, &wnum](be_t<u32>& value)
{ {
wnum = utils::cntlz32(~value); // found empty position wnum = std::countl_one<u32>(value); // found empty position
if (wnum < wmax) if (wnum < wmax)
{ {
value |= (0x80000000 >> wnum); // set workload bit value |= (0x80000000 >> wnum); // set workload bit
@ -2237,7 +2237,7 @@ s32 _spurs::add_workload(vm::ptr<CellSpurs> spurs, vm::ptr<u32> wid, vm::cptr<vo
else else
{ {
k |= 0x80000000 >> current->uniqueId; k |= 0x80000000 >> current->uniqueId;
res_wkl = utils::cntlz32(~k); res_wkl = std::countl_one<u32>(k);
} }
} }
} }

View File

@ -1017,7 +1017,7 @@ error_code _cellSyncLFQueueCompletePushPointer(ppu_thread& ppu, vm::ptr<CellSync
{ {
var9_ = 1 << var9_; var9_ = 1 << var9_;
} }
s32 var9 = utils::cntlz32(static_cast<u16>(~(var9_ | push3.m_h6))) - 16; // count leading zeros in u16 s32 var9 = std::countl_zero<u32>(static_cast<u16>(~(var9_ | push3.m_h6))) - 16; // count leading zeros in u16
s32 var5 = push3.m_h6 | var9_; s32 var5 = push3.m_h6 | var9_;
if (var9 & 0x30) if (var9 & 0x30)
@ -1317,7 +1317,8 @@ error_code _cellSyncLFQueueCompletePopPointer(ppu_thread& ppu, vm::ptr<CellSyncL
{ {
var9_ = 1 << var9_; var9_ = 1 << var9_;
} }
s32 var9 = utils::cntlz32(static_cast<u16>(~(var9_ | pop3.m_h2))) - 16; // count leading zeros in u16
s32 var9 = std::countl_zero<u32>(static_cast<u16>(~(var9_ | pop3.m_h2))) - 16; // count leading zeros in u16
s32 var5 = pop3.m_h2 | var9_; s32 var5 = pop3.m_h2 | var9_;
if (var9 & 0x30) if (var9 & 0x30)

View File

@ -1238,7 +1238,7 @@ struct ppu_acontext
if (min < max) if (min < max)
{ {
// Inverted constant MSB mask // Inverted constant MSB mask
const u64 mix = ~0ull >> utils::cntlz64(min ^ max, true); const u64 mix = ~0ull >> std::countl_zero(min ^ max);
r.bmin |= min & ~mix; r.bmin |= min & ~mix;
r.bmax &= max | mix; r.bmax &= max | mix;

View File

@ -3250,7 +3250,7 @@ bool ppu_interpreter::MFOCRF(ppu_thread& ppu, ppu_opcode_t op)
if (op.l11) if (op.l11)
{ {
// MFOCRF // MFOCRF
const u32 n = utils::cntlz32(op.crm) & 7; const u32 n = std::countl_zero<u32>(op.crm) & 7;
const u32 p = n * 4; const u32 p = n * 4;
const u32 v = ppu.cr[p + 0] << 3 | ppu.cr[p + 1] << 2 | ppu.cr[p + 2] << 1 | ppu.cr[p + 3] << 0; const u32 v = ppu.cr[p + 0] << 3 | ppu.cr[p + 1] << 2 | ppu.cr[p + 2] << 1 | ppu.cr[p + 3] << 0;
@ -3299,7 +3299,7 @@ bool ppu_interpreter::SLW(ppu_thread& ppu, ppu_opcode_t op)
// CNTLZW handler: stores the leading-zero count of the low 32 bits of gpr[rs] into gpr[ra].
// When op.rc is set, the stored result is additionally passed to ppu_cr_set<s64> together with 0
// (presumably a compare-with-zero update of CR field 0 - confirm against ppu_cr_set).
// Always returns true.
bool ppu_interpreter::CNTLZW(ppu_thread& ppu, ppu_opcode_t op)
{
	// Only the low word of the source register takes part in the count
	const u32 low_word = static_cast<u32>(ppu.gpr[op.rs]);
	ppu.gpr[op.ra] = std::countl_zero(low_word);

	if (op.rc) [[unlikely]]
	{
		ppu_cr_set<s64>(ppu, 0, ppu.gpr[op.ra], 0);
	}

	return true;
}
@ -3377,7 +3377,7 @@ bool ppu_interpreter::LWZUX(ppu_thread& ppu, ppu_opcode_t op)
// CNTLZD handler: stores the leading-zero count of the whole gpr[rs] value into gpr[ra].
// When op.rc is set, the stored result is additionally passed to ppu_cr_set<s64> together with 0
// (presumably a compare-with-zero update of CR field 0 - confirm against ppu_cr_set).
// Always returns true.
bool ppu_interpreter::CNTLZD(ppu_thread& ppu, ppu_opcode_t op)
{
	const auto leading_zeros = std::countl_zero(ppu.gpr[op.rs]);
	ppu.gpr[op.ra] = leading_zeros;

	if (op.rc) [[unlikely]]
	{
		ppu_cr_set<s64>(ppu, 0, ppu.gpr[op.ra], 0);
	}

	return true;
}
@ -3537,7 +3537,7 @@ bool ppu_interpreter::MTOCRF(ppu_thread& ppu, ppu_opcode_t op)
{ {
// MTOCRF // MTOCRF
const u32 n = utils::cntlz32(op.crm) & 7; const u32 n = std::countl_zero<u32>(op.crm) & 7;
const u64 v = (s >> ((n * 4) ^ 0x1c)) & 0xf; const u64 v = (s >> ((n * 4) ^ 0x1c)) & 0xf;
ppu.cr.fields[n] = *reinterpret_cast<const u32*>(s_table + v); ppu.cr.fields[n] = *reinterpret_cast<const u32*>(s_table + v);
} }

View File

@ -675,7 +675,7 @@ spu_function_t spu_recompiler::compile(spu_program&& _func)
} }
// Determine which value will be duplicated at hole positions
// std::countl_zero returns int, and complementing an int gives a negative number, so "% 4" would
// produce a zero or negative offset. Converting the count to u32 first makes ~x % 4 equal to
// 3 - (x % 4), which is what this expression computed when the count came from the u32-returning
// utils::cntlz32. NOTE(review): assumes cmask is a 32-bit unsigned mask - confirm at its declaration.
const u32 w3 = func.data.at((j - start + ~static_cast<u32>(std::countl_zero(cmask)) % 4 * 4) / 4);
words.push_back(cmask & 1 ? func.data[(j - start + 0) / 4] : w3); words.push_back(cmask & 1 ? func.data[(j - start + 0) / 4] : w3);
words.push_back(cmask & 2 ? func.data[(j - start + 4) / 4] : w3); words.push_back(cmask & 2 ? func.data[(j - start + 4) / 4] : w3);
words.push_back(cmask & 4 ? func.data[(j - start + 8) / 4] : w3); words.push_back(cmask & 4 ? func.data[(j - start + 8) / 4] : w3);

View File

@ -903,7 +903,7 @@ bool spu_interpreter::CLZ(spu_thread& spu, spu_opcode_t op)
{ {
for (u32 i = 0; i < 4; i++) for (u32 i = 0; i < 4; i++)
{ {
spu.gpr[op.rt]._u32[i] = utils::cntlz32(spu.gpr[op.ra]._u32[i]); spu.gpr[op.rt]._u32[i] = std::countl_zero(spu.gpr[op.ra]._u32[i]);
} }
return true; return true;
} }

View File

@ -697,7 +697,7 @@ namespace vm
const u32 size = ::align(orig_size, min_page_size) + (flags & 0x10 ? 0x2000 : 0); const u32 size = ::align(orig_size, min_page_size) + (flags & 0x10 ? 0x2000 : 0);
// Check alignment (it's page allocation, so passing small values there is just silly) // Check alignment (it's page allocation, so passing small values there is just silly)
if (align < min_page_size || align != (0x80000000u >> utils::cntlz32(align, true))) if (align < min_page_size || align != (0x80000000u >> std::countl_zero(align)))
{ {
fmt::throw_exception("Invalid alignment (size=0x%x, align=0x%x)" HERE, size, align); fmt::throw_exception("Invalid alignment (size=0x%x, align=0x%x)" HERE, size, align);
} }
@ -992,7 +992,7 @@ namespace vm
const u32 size = ::align(orig_size, 0x10000); const u32 size = ::align(orig_size, 0x10000);
// Check alignment // Check alignment
if (align < 0x10000 || align != (0x80000000u >> utils::cntlz32(align, true))) if (align < 0x10000 || align != (0x80000000u >> std::countl_zero(align)))
{ {
fmt::throw_exception("Invalid alignment (size=0x%x, align=0x%x)" HERE, size, align); fmt::throw_exception("Invalid alignment (size=0x%x, align=0x%x)" HERE, size, align);
} }

View File

@ -3,7 +3,6 @@
#include "../system_config.h" #include "../system_config.h"
#include "Utilities/address_range.h" #include "Utilities/address_range.h"
#include "Utilities/geometry.h" #include "Utilities/geometry.h"
#include "Utilities/asm.h"
#include "gcm_enums.h" #include "gcm_enums.h"
#include <memory> #include <memory>
@ -239,19 +238,19 @@ namespace rsx
// //
// Integer floor(log2(value)). Inputs 0 and 1 both yield 0.
static inline u32 floor_log2(u32 value)
{
	if (value <= 1)
	{
		return 0;
	}

	// For a 32-bit value, XOR with 31 maps the leading-zero count to the index of the highest set bit
	return std::countl_zero(value) ^ 31;
}
// Integer ceil(log2(value)). Inputs 0 and 1 both yield 0.
// Returns 32 for value > 0x80000000, where the next power of two no longer fits in 32 bits.
static inline u32 ceil_log2(u32 value)
{
	if (value <= 1)
	{
		return 0;
	}

	// ceil(log2(value)) is the number of bits needed to represent (value - 1).
	// Measure (value - 1) directly rather than shifting it left by one first: the shift drops the
	// top bit once value exceeds 0x80000000 and the result is then wrong (e.g. 31 for 0xFFFFFFFF).
	return 32 - std::countl_zero(value - 1);
}
// Rounds x up to a power of two. Values 0, 1 and 2 are returned unchanged.
// For x > 0x80000000 the mathematical result (2^32) does not fit in u32 and the cast truncates it to 0.
static inline u32 next_pow2(u32 x)
{
	if (x > 2)
	{
		// Shifting 2^32 right by the leading-zero count of (x - 1) lands one bit above its top set bit
		const auto lead = std::countl_zero(x - 1);
		return static_cast<u32>((1ULL << 32) >> lead);
	}

	return x;
}
static inline bool fcmp(float a, float b, float epsilon = 0.000001f) static inline bool fcmp(float a, float b, float epsilon = 0.000001f)

View File

@ -5,7 +5,6 @@
#endif #endif
#include "Utilities/sync.h" #include "Utilities/sync.h"
#include "Utilities/asm.h"
#ifdef USE_POSIX #ifdef USE_POSIX
#include <semaphore.h> #include <semaphore.h>
@ -141,7 +140,7 @@ static sync_var* slot_get(std::uintptr_t iptr, sync_var* loc, u64 lv = 0)
} }
// Get the number of leading equal bits to determine subslot // Get the number of leading equal bits to determine subslot
const u64 eq_bits = utils::cntlz64((((iptr ^ value) & (s_pointer_mask >> lv)) | ~s_pointer_mask) << 16, true); const u64 eq_bits = std::countl_zero<u64>((((iptr ^ value) & (s_pointer_mask >> lv)) | ~s_pointer_mask) << 16);
// Proceed recursively, increment level // Proceed recursively, increment level
return slot_get(iptr, s_slot_list[(value & s_slot_mask) / one_v<s_slot_mask>].branch + eq_bits, eq_bits + 1); return slot_get(iptr, s_slot_list[(value & s_slot_mask) / one_v<s_slot_mask>].branch + eq_bits, eq_bits + 1);
@ -166,7 +165,7 @@ static void slot_free(std::uintptr_t iptr, sync_var* loc, u64 lv = 0)
} }
// Get the number of leading equal bits to determine subslot // Get the number of leading equal bits to determine subslot
const u64 eq_bits = utils::cntlz64((((iptr ^ value) & (s_pointer_mask >> lv)) | ~s_pointer_mask) << 16, true); const u64 eq_bits = std::countl_zero<u64>((((iptr ^ value) & (s_pointer_mask >> lv)) | ~s_pointer_mask) << 16);
// Proceed recursively, to deallocate deepest branch first // Proceed recursively, to deallocate deepest branch first
slot_free(iptr, s_slot_list[(value & s_slot_mask) / one_v<s_slot_mask>].branch + eq_bits, eq_bits + 1); slot_free(iptr, s_slot_list[(value & s_slot_mask) / one_v<s_slot_mask>].branch + eq_bits, eq_bits + 1);
@ -445,7 +444,7 @@ void atomic_storage_futex::wait(const void* data, std::size_t size, u64 old_valu
} }
// Get the number of leading equal bits (between iptr and slot owner) // Get the number of leading equal bits (between iptr and slot owner)
const u64 eq_bits = utils::cntlz64((((iptr ^ ok) & (s_pointer_mask >> lv)) | ~s_pointer_mask) << 16, true); const u64 eq_bits = std::countl_zero<u64>((((iptr ^ ok) & (s_pointer_mask >> lv)) | ~s_pointer_mask) << 16);
// Collision; need to go deeper // Collision; need to go deeper
ptr = s_slot_list[(ok & s_slot_mask) / one_v<s_slot_mask>].branch + eq_bits; ptr = s_slot_list[(ok & s_slot_mask) / one_v<s_slot_mask>].branch + eq_bits;