diff --git a/rpcs3/util/atomic.cpp b/rpcs3/util/atomic.cpp
index 24d0885042..c85503f5ff 100644
--- a/rpcs3/util/atomic.cpp
+++ b/rpcs3/util/atomic.cpp
@@ -23,7 +23,7 @@
 #include "endian.hpp"
 
 // Total number of entries, should be a power of 2.
-static constexpr std::size_t s_hashtable_size = 1u << 18;
+static constexpr std::size_t s_hashtable_size = 1u << 17;
 
 // Reference counter combined with shifted pointer (which is assumed to be 47 bit)
 static constexpr std::uintptr_t s_ref_mask = (1u << 17) - 1;
@@ -696,12 +696,25 @@ cond_id_lock(u32 cond_id, __m128i mask, u64 thread_id = 0, std::uintptr_t iptr =
 	return nullptr;
 }
 
+namespace
+{
+	struct alignas(16) slot_allocator
+	{
+		u64 ref : 16 = 0;
+		u64 low : 48 = 0;
+		u64 high = 0;
+
+		constexpr slot_allocator() noexcept = default;
+	};
+}
+
 namespace atomic_wait
 {
-	// Need to spare 16 bits for max distance
-	static constexpr u64 max_threads = 48;
+	// Need to spare 16 bits for ref counter
+	static constexpr u64 max_threads = 112;
 
-	static constexpr u64 thread_mask = (1ull << max_threads) - 1;
+	// Can only allow extended allocations go as far as this (about 585)
+	static constexpr u64 max_distance = UINT16_MAX / max_threads;
 
 	// Thread list
 	struct alignas(64) root_info
@@ -709,7 +722,7 @@ namespace atomic_wait
 		constexpr root_info() noexcept = default;
 
 		// Allocation bits (least significant)
-		atomic_t<u64> bits{};
+		atomic_t<slot_allocator> bits{};
 
 		// Allocation pool, pointers to allocated semaphores
 		atomic_t<u16> slots[max_threads]{};
@@ -720,9 +733,6 @@ namespace atomic_wait
 		// For collision statistics (bit difference stick flags)
 		atomic_t<u32> diff_lz{}, diff_tz{}, diff_pop{};
 
-		// Total reference counter
-		atomic_t<u64> threads{};
-
 		atomic_t<u16>* slot_alloc(std::uintptr_t ptr) noexcept;
 
 		root_info* slot_free(atomic_t<u16>* slot) noexcept;
@@ -731,7 +741,7 @@ namespace atomic_wait
 		auto slot_search(std::uintptr_t iptr, u64 thread_id, __m128i mask, F func) noexcept;
 	};
 
-	static_assert(sizeof(root_info) == 128);
+	static_assert(sizeof(root_info) == 256);
 }
 
 // Main hashtable for atomic wait.
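The declaration hunks above trade the old single 64-bit allocation mask (with the probe distance packed into its upper bits) plus a separate thread counter for one 16-byte slot_allocator that is updated as a single atomic value: a 16-bit reference counter and 64 + 48 = 112 allocation bits, one per waiter slot. Below is a minimal standalone sketch of that layout and of the arithmetic behind max_distance; it is illustrative only (the name slot_allocator_sketch is not from the patch) and not the patch's code.

#include <cstdint>

// Sketch of the assumed layout: ref and low pack into the first u64,
// high fills the second, so the whole value fits a 128-bit atomic update.
struct alignas(16) slot_allocator_sketch
{
	std::uint64_t ref : 16; // per-entry waiter reference counter (takes over for the removed threads member)
	std::uint64_t low : 48; // allocation bits for slots 64..111
	std::uint64_t high;     // allocation bits for slots 0..63
};

static_assert(sizeof(slot_allocator_sketch) == 16);
static_assert(48 + 64 == 112);          // max_threads per root_info
static_assert(UINT16_MAX / 112 == 585); // the "about 585" max_distance mentioned in the comment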
@@ -758,68 +768,81 @@ u64 atomic_wait::get_unique_tsc()
 
 atomic_t<u16>* atomic_wait::root_info::slot_alloc(std::uintptr_t ptr) noexcept
 {
-	if (!threads.try_inc(UINT16_MAX + 1))
+	auto slot = bits.atomic_op([this](slot_allocator& bits) -> atomic_t<u16>*
 	{
-		fmt::raw_error("Thread limit " STRINGIZE(UINT16_MAX) " reached in a single hashtable slot.");
-		return nullptr;
-	}
-
-	auto* _this = this;
-
-	u64 limit = 0;
-
-	while (true)
-	{
-		const auto [_bits, ok] = _this->bits.fetch_op([](u64& bits)
+		// Increment reference counter
+		if (bits.ref == UINT16_MAX)
 		{
-			// Check free slot
-			if (~bits & thread_mask)
+			fmt::raw_error("Thread limit " STRINGIZE(UINT16_MAX) " reached for a single hashtable slot.");
+			return nullptr;
+		}
+
+		bits.ref++;
+
+		// Check free slots
+		if (~bits.high)
+		{
+			// Set lowest clear bit
+			const u32 id = std::countr_one(bits.high);
+			bits.high |= bits.high + 1;
+			return this->slots + id;
+		}
+
+		if (~bits.low << 16)
+		{
+			const u32 id = std::countr_one(bits.low);
+			bits.low |= bits.low + 1;
+			return this->slots + 64 + id;
+		}
+
+		return nullptr;
+	});
+
+	u32 limit = 0;
+
+	for (auto* _this = this + 1; !slot;)
+	{
+		auto [_old, slot2] = _this->bits.fetch_op([_this](slot_allocator& bits) -> atomic_t<u16>*
+		{
+			if (~bits.high)
 			{
-				// Set lowest clear bit
-				bits |= bits + 1;
-				return true;
+				const u32 id = std::countr_one(bits.high);
+				bits.high |= bits.high + 1;
+				return _this->slots + id;
 			}
 
-			return false;
+			if (~bits.low << 16)
+			{
+				const u32 id = std::countr_one(bits.low);
+				bits.low |= bits.low + 1;
+				return _this->slots + 64 + id;
+			}
+
+			return nullptr;
 		});
 
-		if (ok)
+		if (slot2)
 		{
-			const u32 slot_n = std::countr_one(_bits);
-			{
-				const u16 v = _this->slots[slot_n].load();
-			}
-
-			return &_this->slots[slot_n];
+			slot = slot2;
+			break;
 		}
 
 		// Keep trying adjacent slots in the hashtable, they are often free due to alignment.
 		_this++;
 		limit++;
 
+		if (limit == max_distance) [[unlikely]]
+		{
+			fmt::raw_error("Distance limit (585) exceeded for the atomic wait hashtable.");
+			return nullptr;
+		}
+
 		if (_this == std::end(s_hashtable)) [[unlikely]]
 		{
 			_this = s_hashtable;
 		}
 	}
 
-	if (limit)
-	{
-		// Make slot "extended"
-		bits.fetch_op([&](u64& val)
-		{
-			if ((val >> max_threads) >= limit) [[likely]]
-			{
-				return false;
-			}
-
-			// Replace with max value
-			val &= thread_mask;
-			val |= limit << max_threads;
-			return true;
-		});
-	}
-
 	u32 ptr32 = static_cast<u32>(ptr >> 16);
 	u32 first = first_ptr.load();
 
@@ -856,6 +879,8 @@ atomic_t<u16>* atomic_wait::root_info::slot_alloc(std::uintptr_t ptr) noexcept
 
 		diff_pop |= 1u << static_cast<u32>((diff >> 16) + diff - 1);
 	}
+
+	return slot;
 }
 
 atomic_wait::root_info* atomic_wait::root_info::slot_free(atomic_t<u16>* slot) noexcept
@@ -880,7 +905,9 @@ atomic_wait::root_info* atomic_wait::root_info::slot_free(atomic_t<u16>* slot) n
 		return nullptr;
 	}
 
-	verify(HERE), slot == &_this->slots[slot - _this->slots];
+	const u32 diff = static_cast<u32>(slot - _this->slots);
+
+	verify(HERE), slot == &_this->slots[diff];
 
 	const u32 cond_id = slot->exchange(0);
 
@@ -889,68 +916,84 @@ atomic_wait::root_info* atomic_wait::root_info::slot_free(atomic_t<u16>* slot) n
 		cond_free(cond_id);
 	}
 
-	_this->bits &= ~(1ull << (slot - _this->slots));
-
-	auto cnt = this->threads--;
-
-	verify(HERE), cnt;
-
-	if (cnt > 1)
+	if (_this != this)
 	{
-		return _this;
-	}
-
-	// Only the last waiter does opportunistic cleanup attempt
-	while (this->threads < max_threads)
-	{
-		auto [old, ok] = this->bits.fetch_op([this](u64& val)
+		// Reset allocation bit in the adjacent hashtable slot
+		_this->bits.atomic_op([diff](slot_allocator& bits)
 		{
-			if (!val || !(~val & thread_mask) || this->threads >= max_threads)
+			if (diff < 64)
 			{
-				return false;
+				bits.high &= ~(1ull << diff);
+			}
+			else
+			{
+				bits.low &= ~(1ull << (diff - 64));
 			}
-
-			// Try to clean distance mask
-			val &= thread_mask;
-			return true;
 		});
-
-		if (!old || ok)
-		{
-			break;
-		}
 	}
 
+	// Reset reference counter
+	bits.atomic_op([&](slot_allocator& bits)
+	{
+		verify(HERE), bits.ref--;
+
+		if (_this == this)
+		{
+			if (diff < 64)
+			{
+				bits.high &= ~(1ull << diff);
+			}
+			else
+			{
+				bits.low &= ~(1ull << (diff - 64));
+			}
+		}
+	});
+
 	return _this;
 }
 
 template <typename F>
 FORCE_INLINE auto atomic_wait::root_info::slot_search(std::uintptr_t iptr, u64 thread_id, __m128i mask, F func) noexcept
 {
-	const u64 bits_val = this->bits.load();
-	const u64 max_order = bits_val >> max_threads;
+	u32 index = 0;
+	u32 total = 0;
+	u64 limit = 0;
 
-	auto* _this = this;
-
-	u32 order = 0;
-
-	u64 new_val = bits_val & thread_mask;
-
-	while (new_val)
+	for (auto* _this = this;;)
 	{
-		u32 cond_ids[max_threads];
-		u32 cond_max = 0;
+		const auto bits = _this->bits.load();
 
-		for (u64 bits = new_val; bits; bits &= bits - 1)
+		u16 cond_ids[max_threads];
+		u32 cond_count = 0;
+
+		u64 high_val = bits.high;
+		u64 low_val = bits.low;
+
+		if (_this == this)
 		{
-			if (const u32 cond_id = _this->slots[std::countr_zero(bits)])
+			limit = bits.ref;
+		}
+
+		for (u64 bits = high_val; bits; bits &= bits - 1)
+		{
+			if (u16 cond_id = _this->slots[std::countr_zero(bits)])
 			{
 				utils::prefetch_read(s_cond_list + cond_id);
-				cond_ids[cond_max++] = cond_id;
+				cond_ids[cond_count++] = cond_id;
 			}
 		}
 
-		for (u32 i = 0; i < cond_max; i++)
+		for (u64 bits = low_val; bits; bits &= bits - 1)
+		{
+			if (u16 cond_id = _this->slots[std::countr_zero(bits)])
+			{
+				utils::prefetch_read(s_cond_list + cond_id);
+				cond_ids[cond_count++] = cond_id;
+			}
+		}
+
+		for (u32 i = 0; i < cond_count; i++)
 		{
 			if (cond_id_lock(cond_ids[i], mask, thread_id, iptr))
 			{
@@ -961,10 +1004,17 @@ FORCE_INLINE auto atomic_wait::root_info::slot_search(std::uintptr_t iptr, u64 t
 			}
 		}
 
-		_this++;
-		order++;
+		total += cond_count;
 
-		if (order >= max_order)
+		if (total >= limit)
+		{
+			return;
+		}
+
+		_this++;
+		index++;
+
+		if (index == max_distance)
 		{
 			return;
 		}
@@ -973,8 +1023,6 @@
 		{
 			_this = s_hashtable;
 		}
-
-		new_val = _this->bits.load() & thread_mask;
 	}
 }
 
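The allocation path in slot_alloc claims a slot per 64-bit half with two bit tricks: std::countr_one(mask) returns the index of the lowest clear bit, and mask |= mask + 1 sets exactly that bit (the +1 carries through the trailing ones and stops at the first zero); slot_free clears it again with mask &= ~(1ull << id). A small self-contained demonstration follows; it is illustrative only, with hypothetical helper names, and not taken from the patch.

#include <bit>
#include <cassert>
#include <cstdint>

// Claim the lowest clear bit of the mask and return its index.
// Assumes the mask is not already full (in the patch, a full high half
// falls back to the low half, then to adjacent hashtable entries).
static unsigned claim_lowest_clear(std::uint64_t& mask)
{
	const unsigned id = std::countr_one(mask); // count trailing ones = index of first zero bit
	mask |= mask + 1;                          // the +1 carries into exactly that zero bit
	return id;
}

// Release a previously claimed bit (what slot_free does per half).
static void release(std::uint64_t& mask, unsigned id)
{
	mask &= ~(std::uint64_t{1} << id);
}

int main()
{
	std::uint64_t mask = 0b10111; // slots 0, 1, 2 and 4 taken
	assert(claim_lowest_clear(mask) == 3);
	assert(mask == 0b11111);
	release(mask, 1);
	assert(mask == 0b11101);
}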