[libc-commits] [libc] 5634484 - [libc] Hoist GPU allocator loop invariants from find_slab (#184803)
via libc-commits
libc-commits at lists.llvm.org
Thu Mar 5 06:36:55 PST 2026
Author: Joseph Huber
Date: 2026-03-05T08:36:50-06:00
New Revision: 5634484b32df726270224814315253c8eb17fb62
URL: https://github.com/llvm/llvm-project/commit/5634484b32df726270224814315253c8eb17fb62
DIFF: https://github.com/llvm/llvm-project/commit/5634484b32df726270224814315253c8eb17fb62.diff
LOG: [libc] Hoist GPU allocator loop invariants from find_slab (#184803)
Summary:
This improves performance, as these variables apparently were not
eligible for LICM.
Added:
Modified:
libc/src/__support/GPU/allocator.cpp
Removed:
################################################################################
diff --git a/libc/src/__support/GPU/allocator.cpp b/libc/src/__support/GPU/allocator.cpp
index 24f98f1b8d08d..d8013d26566ff 100644
--- a/libc/src/__support/GPU/allocator.cpp
+++ b/libc/src/__support/GPU/allocator.cpp
@@ -548,34 +548,34 @@ static Slab *find_slab(uint32_t chunk_size, uint64_t lane_mask,
// We start at the index of the last successful allocation for this kind.
uint32_t chunk_id = impl::get_chunk_id(chunk_size);
uint32_t start = indices[chunk_id].load(cpp::MemoryOrder::RELAXED);
+ uint32_t usable = Slab::usable_bits(chunk_size);
+ uint32_t base = impl::get_start_index(chunk_size);
+ uint64_t id = impl::id_in_mask();
Slab *result = nullptr;
for (uint32_t offset = 0;
gpu::ballot(lane_mask, !result) && offset <= ARRAY_SIZE; ++offset) {
- uint32_t index =
- !offset ? start
- : (impl::get_start_index(chunk_size) + offset - 1) % ARRAY_SIZE;
+ uint32_t index = !offset ? start : (base + offset - 1) % ARRAY_SIZE;
- bool available =
- !offset || slots[index].use_count() < Slab::usable_bits(chunk_size);
+ bool available = !offset || slots[index].use_count() < usable;
uint64_t slab_mask = gpu::ballot(lane_mask, !result && available);
- if (slab_mask & impl::id_in_mask()) {
+ if (slab_mask & id) {
Slab *slab = slots[index].try_lock(slab_mask, uniform & slab_mask,
reserved, chunk_size, index);
// If we find a slab with a matching chunk size then we store the result.
// Otherwise, we need to free the claimed lock and continue. In the case
// of out-of-memory we receive a sentinel value and return a failure.
- uint64_t locked_mask = gpu::ballot(
- slab_mask, slab && reserved < Slab::usable_bits(chunk_size) &&
- slab->get_chunk_size() == chunk_size);
+ uint64_t locked_mask =
+ gpu::ballot(slab_mask, slab && reserved < usable &&
+ slab->get_chunk_size() == chunk_size);
uint64_t failed_mask = gpu::ballot(slab_mask, slab) & ~locked_mask;
- if (locked_mask & impl::id_in_mask()) {
+ if (locked_mask & id) {
if (index != start)
indices[chunk_id].store(index, cpp::MemoryOrder::RELAXED);
uniform = uniform & locked_mask;
result = slab;
- } else if (failed_mask & impl::id_in_mask()) {
+ } else if (failed_mask & id) {
slots[index].unlock(failed_mask & uniform);
} else if (!slab && impl::is_sentinel(reserved)) {
result =
More information about the libc-commits
mailing list