[libc-commits] [libc] 6cc42b3 - [libc] Various GPU allocator tweaks and optimizations (#184368)

Tue Mar 3 07:59:23 PST 2026

Author: Joseph Huber
Date: 2026-03-03T09:59:02-06:00
New Revision: 6cc42b39556d33a968a899fd3243bcb707ae7169

URL: https://github.com/llvm/llvm-project/commit/6cc42b39556d33a968a899fd3243bcb707ae7169
DIFF: https://github.com/llvm/llvm-project/commit/6cc42b39556d33a968a899fd3243bcb707ae7169.diff

LOG: [libc] Various GPU allocator tweaks and optimizations (#184368)

Summary:
A handful of low-hanging-fruit tweaks, mostly preventing redundant loads
and unnecessary widening. Also includes some fixes, such as nullptr
handling, incorrect rounding, and oversized bitfields.

Added: 
    

Modified: 
    libc/src/__support/GPU/allocator.cpp

Removed: 
    


################################################################################
diff  --git a/libc/src/__support/GPU/allocator.cpp b/libc/src/__support/GPU/allocator.cpp
index 02fdcd9759ffe..24f98f1b8d08d 100644

--- a/libc/src/__support/GPU/allocator.cpp
+++ b/libc/src/__support/GPU/allocator.cpp
@@ -124,15 +124,9 @@ static inline constexpr uint32_t get_chunk_id(uint32_t x) {
   return cpp::popcount(y) + 3 * (BITS_IN_WORD - cpp::countl_zero(y)) - 7;
 }
 
-// Rounds to the nearest power of two.
-template <uint32_t N, typename T>
-static inline constexpr T round_up(const T x) {
-  static_assert(((N - 1) & N) == 0, "N must be a power of two");
-  return (x + N) & ~(N - 1);
-}
-
 // Perform a lane parallel memset on a uint32_t pointer.
-void uniform_memset(uint32_t *s, uint32_t c, uint32_t n, uint64_t lane_mask) {
+static inline void uniform_memset(uint32_t *s, uint32_t c, uint32_t n,
+                                  uint64_t lane_mask) {
   uint32_t workers = cpp::popcount(lane_mask);
   for (uint32_t i = impl::lane_count(lane_mask, gpu::get_lane_id()); i < n;
        i += workers)
@@ -223,9 +217,8 @@ struct Slab {
     if (get_cached_chunk_size() <= get_chunk_size())
       return;
 
-    uint32_t size = (bitfield_bytes(get_chunk_size()) + sizeof(uint32_t) - 1) /
-                    sizeof(uint32_t);
-    impl::uniform_memset(get_bitfield(), 0, size, lane_mask);
+    impl::uniform_memset(get_bitfield(), 0, bitfield_words(get_chunk_size()),
+                         lane_mask);
   }
 
   // Get the number of chunks that can theoretically fit inside this slab.
@@ -233,11 +226,16 @@ struct Slab {
     return SLAB_SIZE / chunk_size;
   }
 
-  // Get the number of bytes needed to contain the bitfield bits.
+  // Get the number of uint32_t words needed for the bitfield.
+  constexpr static uint32_t bitfield_words(uint32_t chunk_size) {
+    return (num_chunks(chunk_size) + BITS_IN_WORD - 1) / BITS_IN_WORD;
+  }
+
+  // Get the number of bytes reserved for the bitfield region with padding.
   constexpr static uint32_t bitfield_bytes(uint32_t chunk_size) {
-    return __builtin_align_up(
-        ((num_chunks(chunk_size) + BITS_IN_WORD - 1) / BITS_IN_WORD) * 8,
-        MIN_ALIGNMENT + 1);
+    return __builtin_align_up(bitfield_words(chunk_size) *
+                                  uint32_t(sizeof(uint32_t)),
+                              __GCC_DESTRUCTIVE_SIZE << 1);
   }
 
   // The actual amount of memory available excluding the bitfield and metadata.
@@ -245,11 +243,6 @@ struct Slab {
     return SLAB_SIZE - bitfield_bytes(chunk_size) - sizeof(Header);
   }
 
-  // The number of chunks that can be stored in this slab.
-  constexpr static uint32_t available_chunks(uint32_t chunk_size) {
-    return available_bytes(chunk_size) / chunk_size;
-  }
-
   // The length in bits of the bitfield.
   constexpr static uint32_t usable_bits(uint32_t chunk_size) {
     return available_bytes(chunk_size) / chunk_size;
@@ -295,8 +288,8 @@ struct Slab {
 
   // Randomly walks the bitfield until it finds a free bit. Allocations attempt
   // to put lanes right next to each other for better caching and convergence.
-  void *allocate(uint64_t uniform, uint32_t reserved) {
-    uint32_t chunk_size = get_chunk_size();
+  void *allocate(uint64_t uniform, uint32_t reserved, uint32_t chunk_size) {
+    uint32_t bits = usable_bits(chunk_size);
     uint32_t state = impl::entropy();
 
     // Try to find the empty bit in the bitfield to finish the allocation. We
@@ -308,17 +301,17 @@ struct Slab {
          lane_mask = gpu::ballot(uniform, !result)) {
       if (!result) {
         // Each lane tries to claim one bit in a single contiguous mask.
-        uint32_t id = impl::lane_count(uniform & lane_mask, gpu::get_lane_id());
-        uint32_t index = (start + id) % usable_bits(chunk_size);
+        uint32_t id = impl::lane_count(lane_mask, gpu::get_lane_id());
+        uint32_t index = (start + id) % bits;
         uint32_t slot = index / BITS_IN_WORD;
         uint32_t bit = index % BITS_IN_WORD;
 
         // Get the mask of bits destined for the same slot and coalesce it.
         uint32_t leader = impl::get_leader_id(
-            uniform & gpu::ballot(lane_mask, !id || index % BITS_IN_WORD == 0),
+            gpu::ballot(lane_mask, !id || index % BITS_IN_WORD == 0),
             gpu::get_lane_id());
-        uint32_t length = cpp::popcount(uniform & lane_mask) -
-                          impl::lane_count(uniform & lane_mask, leader);
+        uint32_t length =
+            cpp::popcount(lane_mask) - impl::lane_count(lane_mask, leader);
         uint32_t bitmask =
             static_cast<uint32_t>(
                 (uint64_t(1) << cpp::min(length, BITS_IN_WORD)) - 1)
@@ -415,7 +408,7 @@ struct GuardPtr {
 
     // Returns the current reference count, potentially helping a releasing
     // thread.
-    uint64_t read() {
+    uint32_t read() {
       auto val = counter.load(cpp::MemoryOrder::RELAXED);
       if (val == 0 && RECLAIM &&
           counter.compare_exchange_strong(val, INVALID | HELPED,
@@ -529,7 +522,7 @@ struct GuardPtr {
   }
 
   // Get the current value of the reference counter.
-  uint64_t use_count() { return ref.read(); }
+  uint32_t use_count() { return ref.read(); }
 };
 
 // The global array used to search for a valid slab to allocate from.
@@ -563,8 +556,8 @@ static Slab *find_slab(uint32_t chunk_size, uint64_t lane_mask,
         !offset ? start
                 : (impl::get_start_index(chunk_size) + offset - 1) % ARRAY_SIZE;
 
-    bool available = !offset || slots[index].use_count() <
-                                    Slab::available_chunks(chunk_size);
+    bool available =
+        !offset || slots[index].use_count() < Slab::usable_bits(chunk_size);
     uint64_t slab_mask = gpu::ballot(lane_mask, !result && available);
     if (slab_mask & impl::id_in_mask()) {
       Slab *slab = slots[index].try_lock(slab_mask, uniform & slab_mask,
@@ -574,11 +567,9 @@ static Slab *find_slab(uint32_t chunk_size, uint64_t lane_mask,
       // Otherwise, we need to free the claimed lock and continue. In the case
       // of out-of-memory we receive a sentinel value and return a failure.
       uint64_t locked_mask = gpu::ballot(
-          slab_mask, slab && reserved < Slab::available_chunks(chunk_size) &&
+          slab_mask, slab && reserved < Slab::usable_bits(chunk_size) &&
                          slab->get_chunk_size() == chunk_size);
-      uint64_t failed_mask = gpu::ballot(
-          slab_mask, slab && (reserved >= Slab::available_chunks(chunk_size) ||
-                              slab->get_chunk_size() != chunk_size));
+      uint64_t failed_mask = gpu::ballot(slab_mask, slab) & ~locked_mask;
       if (locked_mask & impl::id_in_mask()) {
         if (index != start)
           indices[chunk_id].store(index, cpp::MemoryOrder::RELAXED);
@@ -605,7 +596,7 @@ void *allocate(uint64_t size) {
 
   // Allocations requiring a full slab or more go directly to memory.
   if (size >= SLAB_SIZE / 2)
-    return impl::rpc_allocate(impl::round_up<SLAB_SIZE>(size));
+    return impl::rpc_allocate(__builtin_align_up(size, SLAB_SIZE));
 
   // Try to find a slab for the rounded up chunk size and allocate from it.
   uint32_t chunk_size = impl::get_chunk_size(static_cast<uint32_t>(size));
@@ -616,7 +607,7 @@ void *allocate(uint64_t size) {
   if (!slab)
     return nullptr;
 
-  void *ptr = slab->allocate(uniform, reserved);
+  void *ptr = slab->allocate(uniform, reserved, chunk_size);
   return ptr;
 }
 
@@ -683,7 +674,7 @@ void *aligned_allocate(uint32_t alignment, uint64_t size) {
   // alignment and then round up. The index logic will round down properly.
   uint64_t rounded = size + alignment - MIN_ALIGNMENT;
   void *ptr = gpu::allocate(rounded);
-  return __builtin_align_up(ptr, alignment);
+  return ptr ? __builtin_align_up(ptr, alignment) : ptr;
 }
 
 } // namespace gpu