mirror of
https://github.com/RPCS3/rpcs3.git
synced 2025-03-29 22:20:48 +00:00
atomic.cpp: various cleanups and fixes
Add pointer comparison to notifiers (to prevent spurious wakeups). Fix a bug with a possible double notification in raw_notify(). Fix a bug with incorrect allocatin bit slots for cond_handle. Add a semaphore counter to track max allowed number of threads. Use #define for some constants to STRINGIZE them in errors. Add some error messages when certain limits are reached. Fix a bug with a wrong check simply throwing std::abort. Use "special" notify_all patch with batch processing for every arch. Fix Win7 bug who no one probably noticed.
This commit is contained in:
parent
cc07e5306e
commit
1bb7c74c93
@ -80,6 +80,8 @@ ptr_cmp(const void* data, u32 size, __m128i old128, __m128i mask128)
|
|||||||
{
|
{
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
default:
|
default:
|
||||||
{
|
{
|
||||||
@ -138,7 +140,7 @@ cmp_mask(u32 size1, __m128i mask1, __m128i val1, u32 size2, __m128i mask2, __m12
|
|||||||
namespace
|
namespace
|
||||||
{
|
{
|
||||||
// Essentially a fat semaphore
|
// Essentially a fat semaphore
|
||||||
struct cond_handle
|
struct alignas(64) cond_handle
|
||||||
{
|
{
|
||||||
#ifdef _WIN32
|
#ifdef _WIN32
|
||||||
u64 tid = GetCurrentThreadId();
|
u64 tid = GetCurrentThreadId();
|
||||||
@ -147,6 +149,8 @@ namespace
|
|||||||
#endif
|
#endif
|
||||||
atomic_t<u32> sync{};
|
atomic_t<u32> sync{};
|
||||||
u32 size{};
|
u32 size{};
|
||||||
|
u64 tsc0{};
|
||||||
|
const void* ptr{};
|
||||||
__m128i mask{};
|
__m128i mask{};
|
||||||
__m128i oldv{};
|
__m128i oldv{};
|
||||||
|
|
||||||
@ -155,22 +159,46 @@ namespace
|
|||||||
std::condition_variable cond;
|
std::condition_variable cond;
|
||||||
std::mutex mtx;
|
std::mutex mtx;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
bool forced_wakeup()
|
||||||
|
{
|
||||||
|
const auto [_old, ok] = sync.fetch_op([](u32& val)
|
||||||
|
{
|
||||||
|
if (val == 1 || val == 2)
|
||||||
|
{
|
||||||
|
val = 3;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
});
|
||||||
|
|
||||||
|
// Prevent collision between normal wake-up and forced one
|
||||||
|
return ok && _old == 1;
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
#ifndef USE_STD
|
||||||
|
static_assert(sizeof(cond_handle) == 64);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
// Arbitrary max allowed thread number (to fit in 15 bits)
|
// Max allowed thread number is chosen to fit in 15 bits
|
||||||
static constexpr u32 s_max_conds = 512 * 64 - 1;
|
static std::aligned_storage_t<sizeof(cond_handle), alignof(cond_handle)> s_cond_list[INT16_MAX]{};
|
||||||
|
|
||||||
static std::aligned_storage_t<sizeof(cond_handle), alignof(cond_handle)> s_cond_list[s_max_conds]{};
|
// Used to allow concurrent notifying
|
||||||
|
static atomic_t<u16> s_cond_refs[INT16_MAX]{};
|
||||||
|
|
||||||
atomic_t<u64, 64> s_cond_bits[s_max_conds / 64];
|
// Allocation bits
|
||||||
|
static atomic_t<u64, 64> s_cond_bits[::align<u32>(INT16_MAX, 64) / 64]{};
|
||||||
|
|
||||||
atomic_t<u32, 64> s_cond_sema{0};
|
// Allocation semaphore
|
||||||
|
static atomic_t<u32, 64> s_cond_sema{0};
|
||||||
|
|
||||||
static u32 cond_alloc()
|
static u32 cond_alloc()
|
||||||
{
|
{
|
||||||
// Determine whether there is a free slot or not
|
// Determine whether there is a free slot or not
|
||||||
if (!s_cond_sema.try_inc(s_max_conds + 1))
|
if (!s_cond_sema.try_inc(INT16_MAX + 1))
|
||||||
{
|
{
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
@ -184,9 +212,9 @@ static u32 cond_alloc()
|
|||||||
const u32 start = __rdtsc();
|
const u32 start = __rdtsc();
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
for (u32 i = start * 8;; i++)
|
for (u32 i = start;; i++)
|
||||||
{
|
{
|
||||||
const u32 group = i % (s_max_conds / 64);
|
const u32 group = i % ::size32(s_cond_bits);
|
||||||
|
|
||||||
const auto [bits, ok] = s_cond_bits[group].fetch_op([](u64& bits)
|
const auto [bits, ok] = s_cond_bits[group].fetch_op([](u64& bits)
|
||||||
{
|
{
|
||||||
@ -212,14 +240,14 @@ static u32 cond_alloc()
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO: unreachable
|
// Unreachable
|
||||||
std::abort();
|
std::abort();
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
static cond_handle* cond_get(u32 cond_id)
|
static cond_handle* cond_get(u32 cond_id)
|
||||||
{
|
{
|
||||||
if (cond_id - 1 < s_max_conds) [[likely]]
|
if (cond_id - 1 < u32{INT16_MAX}) [[likely]]
|
||||||
{
|
{
|
||||||
return std::launder(reinterpret_cast<cond_handle*>(s_cond_list + (cond_id - 1)));
|
return std::launder(reinterpret_cast<cond_handle*>(s_cond_list + (cond_id - 1)));
|
||||||
}
|
}
|
||||||
@ -229,7 +257,7 @@ static cond_handle* cond_get(u32 cond_id)
|
|||||||
|
|
||||||
static void cond_free(u32 cond_id)
|
static void cond_free(u32 cond_id)
|
||||||
{
|
{
|
||||||
if (cond_id - 1 >= s_max_conds)
|
if (cond_id - 1 >= u32{INT16_MAX})
|
||||||
{
|
{
|
||||||
fprintf(stderr, "cond_free(): bad id %u" HERE "\n", cond_id);
|
fprintf(stderr, "cond_free(): bad id %u" HERE "\n", cond_id);
|
||||||
std::abort();
|
std::abort();
|
||||||
@ -247,6 +275,8 @@ static void cond_free(u32 cond_id)
|
|||||||
|
|
||||||
namespace
|
namespace
|
||||||
{
|
{
|
||||||
|
#define MAX_THREADS (56)
|
||||||
|
|
||||||
struct alignas(128) sync_var
|
struct alignas(128) sync_var
|
||||||
{
|
{
|
||||||
constexpr sync_var() noexcept = default;
|
constexpr sync_var() noexcept = default;
|
||||||
@ -254,17 +284,17 @@ namespace
|
|||||||
// Reference counter, owning pointer, collision bit and optionally selected slot
|
// Reference counter, owning pointer, collision bit and optionally selected slot
|
||||||
atomic_t<u64> addr_ref{};
|
atomic_t<u64> addr_ref{};
|
||||||
|
|
||||||
// Allocated semaphore bits (max 56, to make total size 128)
|
// Semaphores (allocated in reverse order), 0 means empty, 0x8000 is notifier lock
|
||||||
atomic_t<u64> sema_bits{};
|
atomic_t<u16> sema_data[MAX_THREADS]{};
|
||||||
|
|
||||||
// Semaphores (one per thread), data is platform-specific but 0 means empty
|
// Allocated semaphore bits (to make total size 128)
|
||||||
atomic_t<u16> sema_data[56]{};
|
atomic_t<u64> sema_bits{};
|
||||||
|
|
||||||
atomic_t<u16>* sema_alloc()
|
atomic_t<u16>* sema_alloc()
|
||||||
{
|
{
|
||||||
const auto [bits, ok] = sema_bits.fetch_op([](u64& bits)
|
const auto [bits, ok] = sema_bits.fetch_op([](u64& bits)
|
||||||
{
|
{
|
||||||
if (bits + 1 < (1ull << 56))
|
if (bits + 1 < (1ull << MAX_THREADS))
|
||||||
{
|
{
|
||||||
// Set lowest clear bit
|
// Set lowest clear bit
|
||||||
bits |= bits + 1;
|
bits |= bits + 1;
|
||||||
@ -280,6 +310,8 @@ namespace
|
|||||||
return &sema_data[std::countr_one(bits)];
|
return &sema_data[std::countr_one(bits)];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TODO: support extension if reached
|
||||||
|
fmt::raw_error("Thread limit " STRINGIZE(MAX_THREADS) " for a single address reached in atomic wait.");
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -287,6 +319,7 @@ namespace
|
|||||||
{
|
{
|
||||||
if (sema < sema_data || sema >= std::end(sema_data))
|
if (sema < sema_data || sema >= std::end(sema_data))
|
||||||
{
|
{
|
||||||
|
fprintf(stderr, "sema_free(): bad sema ptr %p" HERE "\n", sema);
|
||||||
std::abort();
|
std::abort();
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -307,6 +340,8 @@ namespace
|
|||||||
};
|
};
|
||||||
|
|
||||||
static_assert(sizeof(sync_var) == 128);
|
static_assert(sizeof(sync_var) == 128);
|
||||||
|
|
||||||
|
#undef MAX_THREADS
|
||||||
}
|
}
|
||||||
|
|
||||||
// Main hashtable for atomic wait.
|
// Main hashtable for atomic wait.
|
||||||
@ -324,16 +359,28 @@ namespace
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Number of search groups (defines max slot branch count as gcount * 64)
|
// Number of search groups (defines max slot branch count as gcount * 64)
|
||||||
static constexpr u32 s_slot_gcount = (s_hashtable_power > 7 ? 4096 : 256) / 64;
|
#define MAX_SLOTS (4096)
|
||||||
|
|
||||||
// Array of slot branch objects
|
// Array of slot branch objects
|
||||||
alignas(128) static slot_info s_slot_list[s_slot_gcount * 64]{};
|
alignas(128) static slot_info s_slot_list[MAX_SLOTS]{};
|
||||||
|
|
||||||
// Allocation bits
|
// Allocation bits
|
||||||
static atomic_t<u64, 64> s_slot_bits[s_slot_gcount]{};
|
static atomic_t<u64, 64> s_slot_bits[MAX_SLOTS / 64]{};
|
||||||
|
|
||||||
|
// Allocation semaphore
|
||||||
|
static atomic_t<u32, 64> s_slot_sema{0};
|
||||||
|
|
||||||
|
static_assert(MAX_SLOTS % 64 == 0);
|
||||||
|
|
||||||
static u64 slot_alloc()
|
static u64 slot_alloc()
|
||||||
{
|
{
|
||||||
|
// Determine whether there is a free slot or not
|
||||||
|
if (!s_slot_sema.try_inc(MAX_SLOTS + 1))
|
||||||
|
{
|
||||||
|
fmt::raw_error("Hashtable extension slot limit " STRINGIZE(MAX_SLOTS) " reached in atomic wait.");
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
// Diversify search start points to reduce contention and increase immediate success chance
|
// Diversify search start points to reduce contention and increase immediate success chance
|
||||||
#ifdef _WIN32
|
#ifdef _WIN32
|
||||||
const u32 start = GetCurrentProcessorNumber();
|
const u32 start = GetCurrentProcessorNumber();
|
||||||
@ -343,9 +390,9 @@ static u64 slot_alloc()
|
|||||||
const u32 start = __rdtsc();
|
const u32 start = __rdtsc();
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
for (u32 i = 0;; i++)
|
for (u32 i = start;; i++)
|
||||||
{
|
{
|
||||||
const u32 group = (i + start * 8) % s_slot_gcount;
|
const u32 group = i % ::size32(s_slot_bits);
|
||||||
|
|
||||||
const auto [bits, ok] = s_slot_bits[group].fetch_op([](u64& bits)
|
const auto [bits, ok] = s_slot_bits[group].fetch_op([](u64& bits)
|
||||||
{
|
{
|
||||||
@ -366,11 +413,13 @@ static u64 slot_alloc()
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO: unreachable
|
// Unreachable
|
||||||
std::abort();
|
std::abort();
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#undef MAX_SLOTS
|
||||||
|
|
||||||
static sync_var* slot_get(std::uintptr_t iptr, sync_var* loc, u64 lv = 0)
|
static sync_var* slot_get(std::uintptr_t iptr, sync_var* loc, u64 lv = 0)
|
||||||
{
|
{
|
||||||
if (!loc)
|
if (!loc)
|
||||||
@ -407,6 +456,9 @@ static void slot_free(u64 id)
|
|||||||
// Reset allocation bit
|
// Reset allocation bit
|
||||||
id = (id & s_slot_mask) / one_v<s_slot_mask>;
|
id = (id & s_slot_mask) / one_v<s_slot_mask>;
|
||||||
s_slot_bits[id / 64] &= ~(1ull << (id % 64));
|
s_slot_bits[id / 64] &= ~(1ull << (id % 64));
|
||||||
|
|
||||||
|
// Reset semaphore
|
||||||
|
s_slot_sema--;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void slot_free(std::uintptr_t iptr, sync_var* loc, u64 lv = 0)
|
static void slot_free(std::uintptr_t iptr, sync_var* loc, u64 lv = 0)
|
||||||
@ -415,10 +467,8 @@ static void slot_free(std::uintptr_t iptr, sync_var* loc, u64 lv = 0)
|
|||||||
|
|
||||||
if ((value & s_pointer_mask) != (iptr & s_pointer_mask))
|
if ((value & s_pointer_mask) != (iptr & s_pointer_mask))
|
||||||
{
|
{
|
||||||
if ((value & s_waiter_mask) == 0 || (value & s_collision_bit) == 0)
|
ASSERT(value & s_waiter_mask);
|
||||||
{
|
ASSERT(value & s_collision_bit);
|
||||||
std::abort();
|
|
||||||
}
|
|
||||||
|
|
||||||
// Get the number of leading equal bits to determine subslot
|
// Get the number of leading equal bits to determine subslot
|
||||||
const u64 eq_bits = std::countl_zero<u64>((((iptr ^ value) & (s_pointer_mask >> lv)) | ~s_pointer_mask) << 16);
|
const u64 eq_bits = std::countl_zero<u64>((((iptr ^ value) & (s_pointer_mask >> lv)) | ~s_pointer_mask) << 16);
|
||||||
@ -430,7 +480,7 @@ static void slot_free(std::uintptr_t iptr, sync_var* loc, u64 lv = 0)
|
|||||||
// Actual cleanup in reverse order
|
// Actual cleanup in reverse order
|
||||||
auto [_old, ok] = loc->addr_ref.fetch_op([&](u64& value)
|
auto [_old, ok] = loc->addr_ref.fetch_op([&](u64& value)
|
||||||
{
|
{
|
||||||
if (value & s_waiter_mask)
|
ASSERT(value & s_waiter_mask);
|
||||||
{
|
{
|
||||||
value -= one_v<s_waiter_mask>;
|
value -= one_v<s_waiter_mask>;
|
||||||
|
|
||||||
@ -443,15 +493,10 @@ static void slot_free(std::uintptr_t iptr, sync_var* loc, u64 lv = 0)
|
|||||||
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
std::abort();
|
|
||||||
});
|
});
|
||||||
|
|
||||||
if (ok > 1 && _old & s_collision_bit)
|
if (ok > 1 && _old & s_collision_bit)
|
||||||
{
|
{
|
||||||
if (loc->sema_bits)
|
|
||||||
std::abort();
|
|
||||||
|
|
||||||
// Deallocate slot on last waiter
|
// Deallocate slot on last waiter
|
||||||
slot_free(_old);
|
slot_free(_old);
|
||||||
}
|
}
|
||||||
@ -562,7 +607,7 @@ atomic_storage_futex::wait(const void* data, u32 size, __m128i old_value, u64 ti
|
|||||||
|
|
||||||
if (cond_id == 0)
|
if (cond_id == 0)
|
||||||
{
|
{
|
||||||
fmt::raw_error("Thread limit (32767) reached in atomic wait.");
|
fmt::raw_error("Thread limit " STRINGIZE(INT16_MAX) " reached in atomic wait.");
|
||||||
}
|
}
|
||||||
|
|
||||||
auto sema = slot->sema_alloc();
|
auto sema = slot->sema_alloc();
|
||||||
@ -584,13 +629,15 @@ atomic_storage_futex::wait(const void* data, u32 size, __m128i old_value, u64 ti
|
|||||||
// Save for notifiers
|
// Save for notifiers
|
||||||
const auto cond = cond_get(cond_id);
|
const auto cond = cond_get(cond_id);
|
||||||
|
|
||||||
// Store some info for notifiers
|
// Store some info for notifiers (some may be unused)
|
||||||
cond->size = size;
|
cond->size = size;
|
||||||
cond->mask = mask;
|
cond->mask = mask;
|
||||||
cond->oldv = old_value;
|
cond->oldv = old_value;
|
||||||
|
cond->ptr = data;
|
||||||
|
cond->tsc0 = __rdtsc();
|
||||||
|
|
||||||
cond->sync = 1;
|
cond->sync = 1;
|
||||||
sema->release(cond_id);
|
sema->store(static_cast<u16>(cond_id));
|
||||||
|
|
||||||
#ifdef USE_STD
|
#ifdef USE_STD
|
||||||
// Lock mutex
|
// Lock mutex
|
||||||
@ -604,7 +651,7 @@ atomic_storage_futex::wait(const void* data, u32 size, __m128i old_value, u64 ti
|
|||||||
bool fallback = false;
|
bool fallback = false;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
while (ptr_cmp(data, size, old_value, mask) && cond->sync != 3)
|
while (ptr_cmp(data, size, old_value, mask))
|
||||||
{
|
{
|
||||||
#ifdef USE_FUTEX
|
#ifdef USE_FUTEX
|
||||||
struct timespec ts;
|
struct timespec ts;
|
||||||
@ -652,7 +699,7 @@ atomic_storage_futex::wait(const void* data, u32 size, __m128i old_value, u64 ti
|
|||||||
|
|
||||||
if (fallback) [[unlikely]]
|
if (fallback) [[unlikely]]
|
||||||
{
|
{
|
||||||
if (!cond->sync.compare_and_swap_test(2, 1))
|
if (cond->sync.load() == 3 || !cond->sync.compare_and_swap_test(2, 1))
|
||||||
{
|
{
|
||||||
fallback = false;
|
fallback = false;
|
||||||
break;
|
break;
|
||||||
@ -751,15 +798,6 @@ alert_sema(atomic_t<u16>* sema, const void* data, u64 info, u32 size, __m128i ma
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Dirty optimization: prevent attempting to lock dead or uninitialized sync vars
|
|
||||||
u32 sync_var = 0;
|
|
||||||
std::memcpy(&sync_var, reinterpret_cast<char*>(s_cond_list) + (sizeof(cond_handle) * (id - 1) + offsetof(cond_handle, sync)), sizeof(sync_var));
|
|
||||||
|
|
||||||
if (!sync_var)
|
|
||||||
{
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Set notify lock
|
// Set notify lock
|
||||||
id |= 0x8000;
|
id |= 0x8000;
|
||||||
return true;
|
return true;
|
||||||
@ -774,43 +812,30 @@ alert_sema(atomic_t<u16>* sema, const void* data, u64 info, u32 size, __m128i ma
|
|||||||
|
|
||||||
ok = false;
|
ok = false;
|
||||||
|
|
||||||
if (cond && cond->sync && (!size ? (!info || cond->tid == info) : cmp_mask(size, mask, new_value, cond->size, cond->mask, cond->oldv)))
|
if (cond && cond->sync && (!size ? (!info || cond->tid == info) : cond->ptr == data && cmp_mask(size, mask, new_value, cond->size, cond->mask, cond->oldv)))
|
||||||
{
|
{
|
||||||
if ((!size && cond->sync.exchange(3) == 1) || (size && cond->sync.load() == 1 && cond->sync.compare_and_swap_test(1, 2)))
|
if ((!size && cond->forced_wakeup()) || (size && cond->sync.load() == 1 && cond->sync.compare_and_swap_test(1, 2)))
|
||||||
{
|
{
|
||||||
// Imminent notification
|
ok = true;
|
||||||
if (!size || !info)
|
|
||||||
{
|
|
||||||
s_tls_notify_cb(data, 0);
|
|
||||||
}
|
|
||||||
|
|
||||||
#ifdef USE_FUTEX
|
#ifdef USE_FUTEX
|
||||||
// Use "wake all" arg for robustness, only 1 thread is expected
|
// Use "wake all" arg for robustness, only 1 thread is expected
|
||||||
futex(&cond->sync, FUTEX_WAKE_PRIVATE, 0x7fff'ffff);
|
futex(&cond->sync, FUTEX_WAKE_PRIVATE, 0x7fff'ffff);
|
||||||
ok = true;
|
|
||||||
#elif defined(USE_STD)
|
#elif defined(USE_STD)
|
||||||
// Not super efficient: locking is required to avoid lost notifications
|
// Not super efficient: locking is required to avoid lost notifications
|
||||||
cond->mtx.lock();
|
cond->mtx.lock();
|
||||||
cond->mtx.unlock();
|
cond->mtx.unlock();
|
||||||
cond->cond.notify_all();
|
cond->cond.notify_all();
|
||||||
ok = true;
|
|
||||||
#elif defined(_WIN32)
|
#elif defined(_WIN32)
|
||||||
if (NtWaitForAlertByThreadId)
|
if (NtWaitForAlertByThreadId)
|
||||||
{
|
{
|
||||||
if (NtAlertThreadByThreadId(cond->tid) == NTSTATUS_SUCCESS)
|
// Sets some sticky alert bit, at least I believe so
|
||||||
{
|
NtAlertThreadByThreadId(cond->tid);
|
||||||
// Could be some dead thread otherwise
|
|
||||||
ok = true;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
// Can wait in rare cases, which is its annoying weakness
|
// Can wait in rare cases, which is its annoying weakness
|
||||||
if (NtReleaseKeyedEvent(nullptr, sema, 1, nullptr) == NTSTATUS_SUCCESS)
|
NtReleaseKeyedEvent(nullptr, sema, 1, nullptr);
|
||||||
{
|
|
||||||
// Can't fail
|
|
||||||
ok = true;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
@ -866,6 +891,8 @@ bool atomic_storage_futex::raw_notify(const void* data, u64 thread_id)
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
s_tls_notify_cb(data, 0);
|
||||||
|
|
||||||
u64 progress = 0;
|
u64 progress = 0;
|
||||||
|
|
||||||
for (u64 bits = slot->sema_bits.load(); bits; bits &= bits - 1)
|
for (u64 bits = slot->sema_bits.load(); bits; bits &= bits - 1)
|
||||||
@ -906,6 +933,8 @@ atomic_storage_futex::notify_one(const void* data, u32 size, __m128i mask, __m12
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
s_tls_notify_cb(data, 0);
|
||||||
|
|
||||||
u64 progress = 0;
|
u64 progress = 0;
|
||||||
|
|
||||||
for (u64 bits = slot->sema_bits; bits; bits &= bits - 1)
|
for (u64 bits = slot->sema_bits; bits; bits &= bits - 1)
|
||||||
@ -922,7 +951,7 @@ atomic_storage_futex::notify_one(const void* data, u32 size, __m128i mask, __m12
|
|||||||
s_tls_notify_cb(data, -1);
|
s_tls_notify_cb(data, -1);
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
SAFE_BUFFERS void
|
||||||
#ifdef _WIN32
|
#ifdef _WIN32
|
||||||
__vectorcall
|
__vectorcall
|
||||||
#endif
|
#endif
|
||||||
@ -937,18 +966,19 @@ atomic_storage_futex::notify_all(const void* data, u32 size, __m128i mask, __m12
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
s_tls_notify_cb(data, 0);
|
||||||
|
|
||||||
u64 progress = 0;
|
u64 progress = 0;
|
||||||
|
|
||||||
#if defined(_WIN32) && !defined(USE_FUTEX) && !defined(USE_STD)
|
#if defined(_WIN32) && !defined(USE_FUTEX)
|
||||||
// Special path for Windows 7
|
// Special path for Windows 7
|
||||||
if (!NtAlertThreadByThreadId)
|
if (!NtAlertThreadByThreadId)
|
||||||
|
#elif defined(USE_STD)
|
||||||
{
|
{
|
||||||
// Make a copy to filter out waiters that fail some checks
|
// Make a copy to filter out waiters that fail some checks
|
||||||
u64 copy = slot->sema_bits.load();
|
u64 copy = slot->sema_bits.load();
|
||||||
u64 lock = 0;
|
u64 lock = 0;
|
||||||
u32 lock_ids[56]{};
|
u32 lock_ids[64]{};
|
||||||
|
|
||||||
static LARGE_INTEGER instant{};
|
|
||||||
|
|
||||||
for (u64 bits = copy; bits; bits &= bits - 1)
|
for (u64 bits = copy; bits; bits &= bits - 1)
|
||||||
{
|
{
|
||||||
@ -963,15 +993,6 @@ atomic_storage_futex::notify_all(const void* data, u32 size, __m128i mask, __m12
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
u32 sync_var = 0;
|
|
||||||
std::memcpy(&sync_var, reinterpret_cast<char*>(s_cond_list) + (sizeof(cond_handle) * (id - 1) + offsetof(cond_handle, sync)), sizeof(sync_var));
|
|
||||||
|
|
||||||
if (!sync_var)
|
|
||||||
{
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Set notify lock
|
|
||||||
id |= 0x8000;
|
id |= 0x8000;
|
||||||
return true;
|
return true;
|
||||||
});
|
});
|
||||||
@ -984,15 +1005,10 @@ atomic_storage_futex::notify_all(const void* data, u32 size, __m128i mask, __m12
|
|||||||
|
|
||||||
const auto cond = cond_get(cond_id);
|
const auto cond = cond_get(cond_id);
|
||||||
|
|
||||||
if (cond && cond->sync && cmp_mask(size, mask, new_value, cond->size, cond->mask, cond->oldv))
|
if (cond && cond->sync && cond->ptr == data && cmp_mask(size, mask, new_value, cond->size, cond->mask, cond->oldv))
|
||||||
{
|
{
|
||||||
if (cond->sync.load() == 1 && cond->sync.compare_and_swap_test(1, 2))
|
if (cond->sync.load() == 1 && cond->sync.compare_and_swap_test(1, 2))
|
||||||
{
|
{
|
||||||
if (bits == copy)
|
|
||||||
{
|
|
||||||
s_tls_notify_cb(data, 0);
|
|
||||||
}
|
|
||||||
|
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -1011,11 +1027,26 @@ atomic_storage_futex::notify_all(const void* data, u32 size, __m128i mask, __m12
|
|||||||
|
|
||||||
const auto sema = &slot->sema_data[id];
|
const auto sema = &slot->sema_data[id];
|
||||||
|
|
||||||
|
#ifdef USE_STD
|
||||||
|
const auto cond = cond_get(lock_ids[id]);
|
||||||
|
|
||||||
|
// Optimistic non-blocking path
|
||||||
|
if (cond->mtx.try_lock())
|
||||||
|
{
|
||||||
|
cond->mtx.unlock();
|
||||||
|
cond->cond.notify_all();
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
#elif defined(_WIN32)
|
||||||
if (NtReleaseKeyedEvent(nullptr, sema, 1, &instant) != NTSTATUS_SUCCESS)
|
if (NtReleaseKeyedEvent(nullptr, sema, 1, &instant) != NTSTATUS_SUCCESS)
|
||||||
{
|
{
|
||||||
// Failed to notify immediately
|
// Failed to notify immediately
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
s_tls_notify_cb(data, ++progress);
|
s_tls_notify_cb(data, ++progress);
|
||||||
|
|
||||||
@ -1027,7 +1058,14 @@ atomic_storage_futex::notify_all(const void* data, u32 size, __m128i mask, __m12
|
|||||||
// Proceed with remaining bits using "normal" blocking waiting
|
// Proceed with remaining bits using "normal" blocking waiting
|
||||||
for (u64 bits = copy; bits; bits &= bits - 1)
|
for (u64 bits = copy; bits; bits &= bits - 1)
|
||||||
{
|
{
|
||||||
NtReleaseKeyedEvent(nullptr, &slot->sema_data[std::countr_zero(bits)], 1, nullptr);
|
#ifdef USE_STD
|
||||||
|
const auto cond = cond_get(lock_ids[std::countr_zero(bits)]);
|
||||||
|
cond->mtx.lock();
|
||||||
|
cond->mtx.unlock();
|
||||||
|
cond->cond.notify_all();
|
||||||
|
#elif defined(_WIN32)
|
||||||
|
NtReleaseKeyedEvent(nullptr, sema, 1, nullptr);
|
||||||
|
#endif
|
||||||
s_tls_notify_cb(data, ++progress);
|
s_tls_notify_cb(data, ++progress);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1040,8 +1078,6 @@ atomic_storage_futex::notify_all(const void* data, u32 size, __m128i mask, __m12
|
|||||||
|
|
||||||
if (sema->fetch_and(0x7fff) == 0x8000)
|
if (sema->fetch_and(0x7fff) == 0x8000)
|
||||||
{
|
{
|
||||||
const u32 id = std::countr_zero(bits);
|
|
||||||
|
|
||||||
cond_free(lock_ids[id]);
|
cond_free(lock_ids[id]);
|
||||||
|
|
||||||
slot->sema_bits &= ~(1ull << id);
|
slot->sema_bits &= ~(1ull << id);
|
||||||
|
Loading…
x
Reference in New Issue
Block a user