[libc-commits] [libc] [libc] Perform bitfield zero initialization wave-parallel (PR #143607)
Joseph Huber via libc-commits
libc-commits at lists.llvm.org
Wed Jun 11 15:56:16 PDT 2025
https://github.com/jhuber6 updated https://github.com/llvm/llvm-project/pull/143607
From 26fb1aad8aba3f53a20bd407d93115d2ff8c2a2a Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Tue, 10 Jun 2025 16:31:57 -0500
Subject: [PATCH 1/2] [libc] Perform bitfield zero initialization wave-parallel
Summary:
We need to set the bitfield memory to zero because the system does not
guarantee zeroed-out memory. Even if fresh pages are zero, the system
allows re-use, so we would need a `kfd`-level API to skip this step.
Because we can't, this patch updates the logic to perform the zero
initialization wave-parallel. This reduces the time it takes to
allocate a fresh slab by up to a tenth.
This has the unfortunate side effect that the control flow is more
convoluted and we waste some extra registers, but it's worth it to
reduce the slab allocation latency.
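
As a rough illustration (not part of the patch), the wave-parallel zeroing
boils down to a strided memset in which each participating lane clears
every Nth 32-bit word. The sketch below uses hypothetical `lane_id` and
`active_lanes` parameters in place of the lane's rank within the uniform
mask and the mask's popcount:

  #include <stdint.h>

  // Strided zero-fill: worker 'lane_id' out of 'active_lanes' writes words
  // lane_id, lane_id + active_lanes, lane_id + 2 * active_lanes, and so on.
  // On the GPU every lane in the uniform mask runs this loop concurrently,
  // so together the lanes cover the whole bitfield without overlapping.
  static void lane_strided_memset(uint32_t *words, uint32_t value,
                                  uint32_t num_words, uint32_t lane_id,
                                  uint32_t active_lanes) {
    for (uint32_t i = lane_id; i < num_words; i += active_lanes)
      words[i] = value;
  }

In the patch itself the starting index comes from impl::lane_count(mask &
uniform), which gives each participating lane its rank within the mask, so
the starting offsets stay dense even when the lane mask has gaps.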
---
libc/src/__support/GPU/allocator.cpp | 42 ++++++++++++++++++++--------
1 file changed, 31 insertions(+), 11 deletions(-)
diff --git a/libc/src/__support/GPU/allocator.cpp b/libc/src/__support/GPU/allocator.cpp
index ecc0de1cb6ec3..679a4309529f2 100644
--- a/libc/src/__support/GPU/allocator.cpp
+++ b/libc/src/__support/GPU/allocator.cpp
@@ -157,10 +157,19 @@ struct Slab {
Header *header = reinterpret_cast<Header *>(memory);
header->chunk_size = chunk_size;
header->global_index = global_index;
+ }
- // This memset is expensive and likely not necessary for the current 'kfd'
- // driver. Until zeroed pages are exposed by the API we must be careful.
- __builtin_memset(get_bitfield(), 0, bitfield_bytes(chunk_size));
+ // Set the necessary bitfield bytes to zero in parallel using many lanes. This
+ // must be called before the bitfield can be accessed safely; memory is not
+ // guaranteed to be zero initialized in the current implementation.
+ void initialize(uint64_t uniform) {
+ uint64_t mask = gpu::get_lane_mask();
+ uint32_t *bitfield = get_bitfield();
+ uint32_t workers = cpp::popcount(uniform);
+ uint32_t words = (bitfield_bytes(get_chunk_size()) + sizeof(uint32_t) - 1) /
+ sizeof(uint32_t);
+ for (uint32_t i = impl::lane_count(mask & uniform); i < words; i += workers)
+ bitfield[i] = 0;
}
// Get the number of chunks that can theoretically fit inside this slab.
@@ -354,14 +363,7 @@ struct GuardPtr {
void *raw = impl::rpc_allocate(sizeof(Slab));
if (!raw)
return nullptr;
- Slab *mem = new (raw) Slab(cpp::forward<Args>(args)...);
-
- cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
- ptr.store(mem, cpp::MemoryOrder::RELAXED);
- cpp::atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
- if (!ref.acquire(n, count))
- ref.reset(n, count);
- return mem;
+ return new (raw) Slab(cpp::forward<Args>(args)...);
}
if (!expected || expected == reinterpret_cast<Slab *>(SENTINEL))
@@ -374,6 +376,16 @@ struct GuardPtr {
return ptr.load(cpp::MemoryOrder::RELAXED);
}
+ // Finalize the associated memory and signal that it is ready to use by
+ // resetting the counter.
+ void finalize(Slab *mem, uint32_t n, uint64_t &count) {
+ cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
+ ptr.store(mem, cpp::MemoryOrder::RELAXED);
+ cpp::atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
+ if (!ref.acquire(n, count))
+ ref.reset(n, count);
+ }
+
public:
// Attempt to lock access to the pointer, potentially creating it if empty.
// The uniform mask represents which lanes share the same pointer. For each
@@ -392,6 +404,14 @@ struct GuardPtr {
if (!result)
return nullptr;
+ // We defer storing the newly allocated slab until now so that we can use
+ // multiple lanes to initialize it and release it for use.
+ if (count == cpp::numeric_limits<uint64_t>::max()) {
+ result->initialize(uniform);
+ if (gpu::get_lane_id() == uint32_t(cpp::countr_zero(uniform)))
+ finalize(result, cpp::popcount(uniform), count);
+ }
+
if (count != cpp::numeric_limits<uint64_t>::max())
count = count - cpp::popcount(uniform) + impl::lane_count(uniform) + 1;
From ef1b896d9bcff7e0010dd81936cf94236ec1eab4 Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Wed, 11 Jun 2025 17:56:06 -0500
Subject: [PATCH 2/2] Matt comments
---
libc/src/__support/GPU/allocator.cpp | 18 +++++++++++-------
1 file changed, 11 insertions(+), 7 deletions(-)
diff --git a/libc/src/__support/GPU/allocator.cpp b/libc/src/__support/GPU/allocator.cpp
index 679a4309529f2..66ab155e5c299 100644
--- a/libc/src/__support/GPU/allocator.cpp
+++ b/libc/src/__support/GPU/allocator.cpp
@@ -129,6 +129,14 @@ static inline constexpr T round_up(const T x) {
return (x + N) & ~(N - 1);
}
+// Perform a lane parallel memset on a uint32_t pointer.
+void uniform_memset(uint32_t *s, uint32_t c, uint32_t n, uint64_t uniform) {
+ uint64_t mask = gpu::get_lane_mask();
+ uint32_t workers = cpp::popcount(uniform);
+ for (uint32_t i = impl::lane_count(mask & uniform); i < n; i += workers)
+ s[i] = c;
+}
+
} // namespace impl
/// A slab allocator used to hand out identically sized slabs of memory.
@@ -163,13 +171,9 @@ struct Slab {
// must be called before the bitfield can be accessed safely; memory is not
// guaranteed to be zero initialized in the current implementation.
void initialize(uint64_t uniform) {
- uint64_t mask = gpu::get_lane_mask();
- uint32_t *bitfield = get_bitfield();
- uint32_t workers = cpp::popcount(uniform);
- uint32_t words = (bitfield_bytes(get_chunk_size()) + sizeof(uint32_t) - 1) /
- sizeof(uint32_t);
- for (uint32_t i = impl::lane_count(mask & uniform); i < words; i += workers)
- bitfield[i] = 0;
+ uint32_t size = (bitfield_bytes(get_chunk_size()) + sizeof(uint32_t) - 1) /
+ sizeof(uint32_t);
+ impl::uniform_memset(get_bitfield(), 0, size, uniform);
}
// Get the number of chunks that can theoretically fit inside this slab.