atomic.cpp: shrink and simplify main hashtable

Reduce collision detection to 1 or 0 for now.
I think it should be offloaded to notifiers.
Nekotekina 2020-11-26 07:35:25 +03:00
parent 8b6d615aa6
commit bd90e3e37f
2 changed files with 41 additions and 97 deletions


@@ -89,7 +89,7 @@ namespace
namespace atomic_wait
{
-extern void parse_hashtable(bool(*cb)(u64 id, u16 refs, u32 ptr, u32 stats));
+extern void parse_hashtable(bool(*cb)(u64 id, u32 refs, u64 ptr, u32 stats));
}
template<>
@@ -1919,19 +1919,13 @@ void Emulator::Stop(bool restart)
aw_colc = 0;
aw_used = 0;
-atomic_wait::parse_hashtable([](u64 id, u16 refs, u32 ptr, u32 stats) -> bool
+atomic_wait::parse_hashtable([](u64 id, u32 refs, u64 ptr, u32 maxc) -> bool
{
-aw_refs += refs;
+aw_refs += refs != 0;
aw_used += ptr != 0;
-stats = (stats & 0xaaaaaaaa) / 2 + (stats & 0x55555555);
-stats = (stats & 0xcccccccc) / 4 + (stats & 0x33333333);
-stats = (stats & 0xf0f0f0f0) / 16 + (stats & 0xf0f0f0f);
-stats = (stats & 0xff00ff00) / 256 + (stats & 0xff00ff);
-stats = (stats >> 16) + (stats & 0xffff);
-aw_colm = std::max<u64>(aw_colm, stats);
-aw_colc += stats != 0;
+aw_colm = std::max<u64>(aw_colm, maxc);
+aw_colc += maxc != 0;
return false;
});
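Aside: the removed stats lines above are a standard SWAR population count, where each step sums adjacent bit groups in place, so the final value is the number of set bits in the original word. A minimal standalone sketch for reference (the function name is illustrative; C++20 code would just call std::popcount):

#include <cstdint>

// SWAR popcount, same arithmetic as the removed lines: fold adjacent
// bit groups together (2-bit, 4-bit, 8-bit, then 16-bit partial sums).
constexpr std::uint32_t swar_popcount(std::uint32_t v)
{
    v = (v & 0xaaaaaaaa) / 2 + (v & 0x55555555);
    v = (v & 0xcccccccc) / 4 + (v & 0x0f0f0f0f & 0xffffffff ? (v & 0x33333333) : 0); // see note below
    return v;
}

// The straight-line version, without the garbling risk of clever folding:
constexpr std::uint32_t swar_popcount32(std::uint32_t v)
{
    v = (v & 0xaaaaaaaa) / 2 + (v & 0x55555555);
    v = (v & 0xcccccccc) / 4 + (v & 0x33333333);
    v = (v & 0xf0f0f0f0) / 16 + (v & 0x0f0f0f0f);
    v = (v & 0xff00ff00) / 256 + (v & 0x00ff00ff);
    return (v >> 16) + (v & 0xffff);
}

static_assert(swar_popcount32(0xffffffffu) == 32);
static_assert(swar_popcount32(0b1011u) == 3);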


@@ -21,7 +21,7 @@
#include "endian.hpp"
// Total number of entries, should be a power of 2.
-static constexpr std::size_t s_hashtable_size = 1u << 17;
+static constexpr std::size_t s_hashtable_size = 1u << 16;
// Reference counter combined with shifted pointer (which is assumed to be 47 bit)
static constexpr std::uintptr_t s_ref_mask = (1u << 17) - 1;
@@ -778,16 +778,19 @@
{
struct alignas(16) slot_allocator
{
-u64 ref : 16;
-u64 low : 48;
-u64 high;
+u64 ref : 16; // Ref counter
+u64 bits: 24; // Allocated bits
+u64 prio: 24; // Reserved
+u64 maxc: 17; // Collision counter
+u64 iptr: 47; // First pointer to use slot (to count used slots)
};
// Need to spare 16 bits for ref counter
-static constexpr u64 max_threads = 112;
+static constexpr u64 max_threads = 24;
-// (Arbitrary, not justified) Can only allow extended allocations go as far as this (about 585)
-static constexpr u64 max_distance = UINT16_MAX / max_threads;
+// (Arbitrary, not justified) Can only allow extended allocations go as far as this
+static constexpr u64 max_distance = 500;
// Thread list
struct alignas(64) root_info
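A quick way to sanity-check the new layout: the five slot_allocator bitfields sum to exactly 128 bits (16 + 24 + 24 + 17 + 47), so the allocator state is one 16-byte atomic value, and each root_info bucket shrinks to 16 + 24 * sizeof(u16) = 64 bytes, matching the static_assert changed further below. Combined with halving s_hashtable_size, the whole table drops from (1 << 17) * 256 B = 32 MiB to (1 << 16) * 64 B = 4 MiB. A minimal sketch of that arithmetic, using standard types in place of the codebase's aliases:

#include <cstdint>

// The bitfields pack into exactly two 64-bit units: 16+24+24 = 64
// and 17+47 = 64, so the whole struct is a single 16-byte value.
struct alignas(16) slot_allocator_sketch
{
    std::uint64_t ref : 16;
    std::uint64_t bits : 24; // one bit per slot, 24 == max_threads
    std::uint64_t prio : 24;
    std::uint64_t maxc : 17;
    std::uint64_t iptr : 47;
};

static_assert(16 + 24 + 24 + 17 + 47 == 128);
static_assert(sizeof(slot_allocator_sketch) == 16);

// 16-byte allocator + 24 slots of u16 = 64 bytes per hashtable bucket.
static_assert(16 + 24 * sizeof(std::uint16_t) == 64);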
@@ -798,12 +801,6 @@
// Allocation pool, pointers to allocated semaphores
atomic_t<u16> slots[max_threads];
-// For collision statistics (32 middle bits)
-atomic_t<u32> first_ptr;
-// For collision statistics (bit difference stick flags)
-atomic_t<u32> diff_lz, diff_tz, diff_pop;
static atomic_t<u16>* slot_alloc(std::uintptr_t ptr) noexcept;
static void slot_free(std::uintptr_t ptr, atomic_t<u16>* slot, u32 tls_slot) noexcept;
@@ -811,10 +808,11 @@
template <typename F>
static auto slot_search(std::uintptr_t iptr, u32 size, u64 thread_id, __m128i mask, F func) noexcept;
-void register_collisions(std::uintptr_t ptr);
+// Somehow update information about collisions (TODO)
+void register_collisions(std::uintptr_t ptr, u64 max_coll);
};
-static_assert(sizeof(root_info) == 256);
+static_assert(sizeof(root_info) == 64);
}
// Main hashtable for atomic wait.
@@ -887,27 +885,23 @@ atomic_t<u16>* root_info::slot_alloc(std::uintptr_t ptr) noexcept
return nullptr;
}
+if (bits.iptr == 0)
+bits.iptr = ptr;
+if (bits.maxc == 0 && bits.iptr != ptr && bits.ref)
+bits.maxc = 1;
bits.ref++;
-if (~bits.high)
+if (bits.bits != (1ull << max_threads) - 1)
{
-const u32 id = std::countr_one(bits.high);
-bits.high |= bits.high + 1;
+const u32 id = std::countr_one(bits.bits);
+bits.bits |= bits.bits + 1;
return _this->slots + id;
}
-if (~bits.low << 16)
-{
-const u32 id = std::countr_one(bits.low);
-bits.low |= bits.low + 1;
-return _this->slots + 64 + id;
-}
return nullptr;
});
-_this->register_collisions(ptr);
if (slot)
{
break;
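The allocation fast path above relies on a pair of bit tricks: std::countr_one(x) returns the index of the lowest clear bit, and x |= x + 1 sets exactly that bit, because adding 1 carries through the trailing ones. A minimal sketch under a hypothetical helper name:

#include <bit>
#include <cstdint>

// Claim the lowest free (zero) bit in a mask and return its index.
// The caller must ensure the mask is not already full, as the new
// code does with the bits != (1ull << max_threads) - 1 check above.
inline int alloc_lowest_free(std::uint64_t& mask)
{
    const int id = std::countr_one(mask); // index of the lowest zero bit
    mask |= mask + 1;                     // the +1 carry lands on that bit
    return id;
}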
@@ -918,7 +912,7 @@ atomic_t<u16>* root_info::slot_alloc(std::uintptr_t ptr) noexcept
if (limit == max_distance) [[unlikely]]
{
fmt::raw_error("Distance limit (585) exceeded for the atomic wait hashtable.");
fmt::raw_error("Distance limit (500) exceeded for the atomic wait hashtable.");
return nullptr;
}
}
@@ -926,44 +920,17 @@ atomic_t<u16>* root_info::slot_alloc(std::uintptr_t ptr) noexcept
return slot;
}
-void root_info::register_collisions(std::uintptr_t ptr)
+void root_info::register_collisions(std::uintptr_t ptr, u64 max_coll)
{
-u32 ptr32 = static_cast<u32>(ptr >> 16);
-u32 first = first_ptr.load();
-if (!first && first != ptr32)
+bits.atomic_op([&](slot_allocator& bits)
{
-// Register first used pointer
-first = first_ptr.compare_and_swap(0, ptr32);
-}
-if (first && first != ptr32)
-{
-// Difference bits between pointers
-u32 diff = first ^ ptr32;
-// The most significant different bit
-u32 diff1 = std::countl_zero(diff);
-if (diff1 < 32)
-{
-diff_lz |= 1u << diff1;
-}
-u32 diff2 = std::countr_zero(diff);
-if (diff2 < 32)
-{
-diff_tz |= 1u << diff2;
-}
-diff = (diff & 0xaaaaaaaa) / 2 + (diff & 0x55555555);
-diff = (diff & 0xcccccccc) / 4 + (diff & 0x33333333);
-diff = (diff & 0xf0f0f0f0) / 16 + (diff & 0x0f0f0f0f);
-diff = (diff & 0xff00ff00) / 256 + (diff & 0x00ff00ff);
-diff_pop |= 1u << static_cast<u8>((diff >> 16) + diff - 1);
-}
+if (bits.iptr == 0)
+bits.iptr = ptr;
+if (bits.maxc == 0 && bits.iptr != ptr)
+bits.maxc = 1;
+if (bits.maxc < max_coll)
+bits.maxc = max_coll;
+});
}
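The bits.atomic_op(...) call above is the codebase's read-modify-write helper on atomic_t<>: it applies the lambda to a local copy of the value and publishes it with compare-and-swap, retrying on contention. Roughly equivalent to this std::atomic sketch (simplified to a plain copyable payload; the name is illustrative):

#include <atomic>

// Rough equivalent of atomic_t<T>::atomic_op: mutate a local copy,
// then try to publish it; retry if another thread won the race.
template <typename T, typename F>
void atomic_op_sketch(std::atomic<T>& a, F func)
{
    T old = a.load();
    for (;;)
    {
        T next = old;
        func(next);
        if (a.compare_exchange_weak(old, next))
            break; // on failure, old is refreshed with the current value
    }
}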
void root_info::slot_free(std::uintptr_t iptr, atomic_t<u16>* slot, u32 tls_slot) noexcept
@@ -1008,14 +975,7 @@ void root_info::slot_free(std::uintptr_t iptr, atomic_t<u16>* slot, u32 tls_slot
if (_this == curr.current)
{
-if (diff < 64)
-{
-bits.high &= ~(1ull << diff);
-}
-else
-{
-bits.low &= ~(1ull << (diff - 64));
-}
+bits.bits &= ~(1ull << diff);
}
});
@@ -1044,19 +1004,9 @@ FORCE_INLINE auto root_info::slot_search(std::uintptr_t iptr, u32 size, u64 thread_id, __m128i mask, F func) noexcept
u16 cond_ids[max_threads];
u32 cond_count = 0;
-u64 high_val = bits.high;
-u64 low_val = bits.low;
+u64 bits_val = bits.bits;
-for (u64 bits = high_val; bits; bits &= bits - 1)
-{
-if (u16 cond_id = _this->slots[std::countr_zero(bits)])
-{
-utils::prefetch_read(s_cond_list + cond_id);
-cond_ids[cond_count++] = cond_id;
-}
-}
-for (u64 bits = low_val; bits; bits &= bits - 1)
+for (u64 bits = bits_val; bits; bits &= bits - 1)
{
if (u16 cond_id = _this->slots[std::countr_zero(bits)])
{
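The surviving loop walks every set bit of the allocation mask: bits &= bits - 1 clears the lowest set bit on each iteration, and std::countr_zero(bits) yields that bit's index. A self-contained sketch of the pattern, under an illustrative name:

#include <bit>
#include <cstdint>

// Visit the index of every set bit, lowest first: x & (x - 1)
// clears the lowest set bit, so each iteration consumes one bit.
template <typename F>
void for_each_set_bit(std::uint64_t mask, F func)
{
    for (std::uint64_t bits = mask; bits; bits &= bits - 1)
    {
        func(std::countr_zero(bits)); // index of the current lowest set bit
    }
}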
@@ -1651,14 +1601,14 @@ atomic_wait_engine::notify_all(const void* data, u32 size, __m128i mask)
namespace atomic_wait
{
-extern void parse_hashtable(bool(*cb)(u64 id, u16 refs, u32 ptr, u32 stats))
+extern void parse_hashtable(bool(*cb)(u64 id, u32 refs, u64 ptr, u32 max_coll))
{
for (u64 i = 0; i < s_hashtable_size; i++)
{
const auto root = &s_hashtable[i];
const auto slot = root->bits.load();
if (cb(i, static_cast<u16>(slot.ref), root->first_ptr.load(), root->diff_lz | root->diff_tz | root->diff_pop))
if (cb(i, static_cast<u32>(slot.ref), slot.iptr, static_cast<u32>(slot.maxc)))
{
break;
}
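For callers, the new parse_hashtable contract is: refs and maxc arrive widened from the bucket's bitfields, iptr is the first waiter address the bucket saw (used to count occupied buckets), and returning true from the callback stops the walk early. A minimal consumer sketch matching the new signature; the function and counter names are illustrative, and the callback must be captureless because cb is a plain function pointer (u32/u64 are the codebase's integer aliases):

#include <algorithm>

static u64 g_used_buckets = 0;
static u64 g_worst_collisions = 0;

void dump_wait_stats()
{
    g_used_buckets = 0;
    g_worst_collisions = 0;

    atomic_wait::parse_hashtable([](u64 /*id*/, u32 refs, u64 iptr, u32 max_coll) -> bool
    {
        g_used_buckets += (refs != 0 || iptr != 0);
        g_worst_collisions = std::max<u64>(g_worst_collisions, max_coll);
        return false; // keep iterating over all buckets
    });
}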