[libc-commits] [libc] [libc] Perform bitfield zero initialization wave-parallel (PR #143607)
Joseph Huber via libc-commits
libc-commits at lists.llvm.org
Wed Jun 11 06:15:52 PDT 2025
https://github.com/jhuber6 updated https://github.com/llvm/llvm-project/pull/143607
From 29119bd8572925a2e8d7b001b2910e7aad167df3 Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Tue, 10 Jun 2025 16:31:57 -0500
Subject: [PATCH] [libc] Perform bitfield zero initialization wave-parallel
Summary:
We need to set the bitfield memory to zero because the system does not
guarantee zeroed-out memory. Even if fresh pages are zero, the system
allows re-use, so we would need a `kfd`-level API to skip this step.
Because we can't skip it, this patch updates the logic to perform the zero
initialization wave-parallel. This reduces the amount of time it takes
to allocate a fresh slab by up to a tenth.
This has the unfortunate side effect that the control flow is more
convoluted and we waste some extra registers, but it's worth it to
reduce the slab allocation latency.
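In other words, the initialization becomes a lane-strided loop: each
participating lane zeroes every Nth 32-bit word of the bitfield, where N is
the number of lanes sharing the slab, so the lanes cover disjoint words and
the whole bitfield is cleared in one pass. A minimal standalone sketch of the
pattern (the function name and the `lane_id`/`num_lanes` parameters are
illustrative stand-ins for the values the patch derives from
`gpu::get_lane_mask()`, `cpp::popcount(uniform)`, and `impl::lane_count()`):

  #include <cstdint>

  // Each of 'num_lanes' cooperating lanes zeroes a disjoint, strided subset
  // of the 'words' 32-bit words, starting at its own lane index. No two
  // lanes ever touch the same word, so no synchronization is needed.
  static void zero_words_strided(uint32_t *bitfield, uint32_t words,
                                 uint32_t lane_id, uint32_t num_lanes) {
    for (uint32_t i = lane_id; i < words; i += num_lanes)
      bitfield[i] = 0;
  }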
---
libc/src/__support/GPU/allocator.cpp | 72 ++++++++++++++++++----------
1 file changed, 46 insertions(+), 26 deletions(-)
diff --git a/libc/src/__support/GPU/allocator.cpp b/libc/src/__support/GPU/allocator.cpp
index 135ced3df704c..679a4309529f2 100644
--- a/libc/src/__support/GPU/allocator.cpp
+++ b/libc/src/__support/GPU/allocator.cpp
@@ -157,10 +157,19 @@ struct Slab {
Header *header = reinterpret_cast<Header *>(memory);
header->chunk_size = chunk_size;
header->global_index = global_index;
+ }
- // This memset is expensive and likely not necessary for the current 'kfd'
- // driver. Until zeroed pages are exposed by the API we must be careful.
- __builtin_memset(get_bitfield(), 0, bitfield_bytes(chunk_size));
+ // Set the necessary bitfield bytes to zero in parallel using many lanes. This
+ // must be called before the bitfield can be accessed safely; memory is not
+ // guaranteed to be zero initialized in the current implementation.
+ void initialize(uint64_t uniform) {
+ uint64_t mask = gpu::get_lane_mask();
+ uint32_t *bitfield = get_bitfield();
+ uint32_t workers = cpp::popcount(uniform);
+ uint32_t words = (bitfield_bytes(get_chunk_size()) + sizeof(uint32_t) - 1) /
+ sizeof(uint32_t);
+ for (uint32_t i = impl::lane_count(mask & uniform); i < words; i += workers)
+ bitfield[i] = 0;
}
// Get the number of chunks that can theoretically fit inside this slab.
@@ -283,7 +292,7 @@ struct Slab {
/// A wait-free guard around a pointer resource to be created dynamically if
/// space is available and freed once there are no more users.
-template <typename T> struct GuardPtr {
+struct GuardPtr {
private:
struct RefCounter {
// Indicates that the object is in its deallocation phase and thus invalid.
@@ -339,32 +348,25 @@ template <typename T> struct GuardPtr {
cpp::Atomic<uint64_t> counter{0};
};
- cpp::Atomic<T *> ptr{nullptr};
+ cpp::Atomic<Slab *> ptr{nullptr};
RefCounter ref{};
// Should be called by a single lane for each different pointer.
template <typename... Args>
- T *try_lock_impl(uint32_t n, uint64_t &count, Args &&...args) {
- T *expected = ptr.load(cpp::MemoryOrder::RELAXED);
+ Slab *try_lock_impl(uint32_t n, uint64_t &count, Args &&...args) {
+ Slab *expected = ptr.load(cpp::MemoryOrder::RELAXED);
if (!expected &&
- ptr.compare_exchange_strong(expected, reinterpret_cast<T *>(SENTINEL),
- cpp::MemoryOrder::RELAXED,
- cpp::MemoryOrder::RELAXED)) {
+ ptr.compare_exchange_strong(
+ expected, reinterpret_cast<Slab *>(SENTINEL),
+ cpp::MemoryOrder::RELAXED, cpp::MemoryOrder::RELAXED)) {
count = cpp::numeric_limits<uint64_t>::max();
- void *raw = impl::rpc_allocate(sizeof(T));
+ void *raw = impl::rpc_allocate(sizeof(Slab));
if (!raw)
return nullptr;
- T *mem = new (raw) T(cpp::forward<Args>(args)...);
-
- cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
- ptr.store(mem, cpp::MemoryOrder::RELAXED);
- cpp::atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
- if (!ref.acquire(n, count))
- ref.reset(n, count);
- return mem;
+ return new (raw) Slab(cpp::forward<Args>(args)...);
}
- if (!expected || expected == reinterpret_cast<T *>(SENTINEL))
+ if (!expected || expected == reinterpret_cast<Slab *>(SENTINEL))
return nullptr;
if (!ref.acquire(n, count))
@@ -374,15 +376,25 @@ template <typename T> struct GuardPtr {
return ptr.load(cpp::MemoryOrder::RELAXED);
}
+ // Finalize the associated memory and signal that it is ready to use by
+ // resetting the counter.
+ void finalize(Slab *mem, uint32_t n, uint64_t &count) {
+ cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
+ ptr.store(mem, cpp::MemoryOrder::RELAXED);
+ cpp::atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
+ if (!ref.acquire(n, count))
+ ref.reset(n, count);
+ }
+
public:
// Attempt to lock access to the pointer, potentially creating it if empty.
// The uniform mask represents which lanes share the same pointer. For each
// uniform value we elect a leader to handle it on behalf of the other lanes.
template <typename... Args>
- T *try_lock(uint64_t lane_mask, uint64_t uniform, uint64_t &count,
- Args &&...args) {
+ Slab *try_lock(uint64_t lane_mask, uint64_t uniform, uint64_t &count,
+ Args &&...args) {
count = 0;
- T *result = nullptr;
+ Slab *result = nullptr;
if (gpu::get_lane_id() == uint32_t(cpp::countr_zero(uniform)))
result = try_lock_impl(cpp::popcount(uniform), count,
cpp::forward<Args>(args)...);
@@ -392,6 +404,14 @@ template <typename T> struct GuardPtr {
if (!result)
return nullptr;
+ // We defer storing the newly allocated slab until now so that we can use
+ // multiple lanes to initialize it and release it for use.
+ if (count == cpp::numeric_limits<uint64_t>::max()) {
+ result->initialize(uniform);
+ if (gpu::get_lane_id() == uint32_t(cpp::countr_zero(uniform)))
+ finalize(result, cpp::popcount(uniform), count);
+ }
+
if (count != cpp::numeric_limits<uint64_t>::max())
count = count - cpp::popcount(uniform) + impl::lane_count(uniform) + 1;
@@ -403,8 +423,8 @@ template <typename T> struct GuardPtr {
cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
if (gpu::get_lane_id() == uint32_t(cpp::countr_zero(mask)) &&
ref.release(cpp::popcount(mask))) {
- T *p = ptr.load(cpp::MemoryOrder::RELAXED);
- p->~T();
+ Slab *p = ptr.load(cpp::MemoryOrder::RELAXED);
+ p->~Slab();
impl::rpc_free(p);
cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
ptr.store(nullptr, cpp::MemoryOrder::RELAXED);
@@ -417,7 +437,7 @@ template <typename T> struct GuardPtr {
};
// The global array used to search for a valid slab to allocate from.
-static GuardPtr<Slab> slots[ARRAY_SIZE] = {};
+static GuardPtr slots[ARRAY_SIZE] = {};
// Tries to find a slab in the table that can support the given chunk size.
static Slab *find_slab(uint32_t chunk_size) {
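The GuardPtr changes exist to support this: the elected leader reserves the
slot with the SENTINEL value and allocates the raw memory, every lane in the
uniform group then helps zero the bitfield via initialize(), and only
afterwards does the leader publish the pointer behind a release fence in
finalize(). A rough CPU analogue of that reserve-then-publish ordering using
plain C++ atomics (the Payload type and try_create() are hypothetical; the
patch's reference counting, wave-parallel init, and RPC allocation are
omitted):

  #include <atomic>
  #include <cstdint>

  struct Payload { uint32_t bits[8] = {}; };

  static std::atomic<Payload *> slot{nullptr};
  static Payload *const SENTINEL = reinterpret_cast<Payload *>(uintptr_t(-1));

  Payload *try_create() {
    Payload *expected = slot.load(std::memory_order_relaxed);
    // Reserve the empty slot so no other thread constructs concurrently.
    if (!expected && slot.compare_exchange_strong(expected, SENTINEL,
                                                  std::memory_order_relaxed)) {
      Payload *mem = new Payload(); // construct and zero privately
      // Publish with release semantics so readers observe initialized memory.
      slot.store(mem, std::memory_order_release);
      return mem;
    }
    // A sentinel means another thread is mid-construction; retry later.
    return expected == SENTINEL ? nullptr : expected;
  }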