[libc-commits] [libc] [libc] Hoist GPU allocator loop invariants from find_slab (PR #184803)

Joseph Huber via libc-commits libc-commits at lists.llvm.org
Thu Mar 5 06:36:49 PST 2026


https://github.com/jhuber6 created https://github.com/llvm/llvm-project/pull/184803

Summary:
This improves performance: these variables were apparently not eligible
for LICM (loop-invariant code motion), so they are now hoisted manually.

From a7e66c7ce21d4ae73ae111bb8c9e3576cabd34d1 Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Wed, 4 Mar 2026 18:31:02 -0600
Subject: [PATCH] [libc] Hoist GPU allocator loop invariants from find_slab

Summary:
This improves performance: these variables were apparently not eligible
for LICM (loop-invariant code motion), so they are now hoisted manually.
---
 libc/src/__support/GPU/allocator.cpp | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/libc/src/__support/GPU/allocator.cpp b/libc/src/__support/GPU/allocator.cpp
index 24f98f1b8d08d..d8013d26566ff 100644
--- a/libc/src/__support/GPU/allocator.cpp
+++ b/libc/src/__support/GPU/allocator.cpp
@@ -548,34 +548,34 @@ static Slab *find_slab(uint32_t chunk_size, uint64_t lane_mask,
   // We start at the index of the last successful allocation for this kind.
   uint32_t chunk_id = impl::get_chunk_id(chunk_size);
   uint32_t start = indices[chunk_id].load(cpp::MemoryOrder::RELAXED);
+  uint32_t usable = Slab::usable_bits(chunk_size);
+  uint32_t base = impl::get_start_index(chunk_size);
+  uint64_t id = impl::id_in_mask();
 
   Slab *result = nullptr;
   for (uint32_t offset = 0;
        gpu::ballot(lane_mask, !result) && offset <= ARRAY_SIZE; ++offset) {
-    uint32_t index =
-        !offset ? start
-                : (impl::get_start_index(chunk_size) + offset - 1) % ARRAY_SIZE;
+    uint32_t index = !offset ? start : (base + offset - 1) % ARRAY_SIZE;
 
-    bool available =
-        !offset || slots[index].use_count() < Slab::usable_bits(chunk_size);
+    bool available = !offset || slots[index].use_count() < usable;
     uint64_t slab_mask = gpu::ballot(lane_mask, !result && available);
-    if (slab_mask & impl::id_in_mask()) {
+    if (slab_mask & id) {
       Slab *slab = slots[index].try_lock(slab_mask, uniform & slab_mask,
                                          reserved, chunk_size, index);
 
       // If we find a slab with a matching chunk size then we store the result.
       // Otherwise, we need to free the claimed lock and continue. In the case
       // of out-of-memory we receive a sentinel value and return a failure.
-      uint64_t locked_mask = gpu::ballot(
-          slab_mask, slab && reserved < Slab::usable_bits(chunk_size) &&
-                         slab->get_chunk_size() == chunk_size);
+      uint64_t locked_mask =
+          gpu::ballot(slab_mask, slab && reserved < usable &&
+                                     slab->get_chunk_size() == chunk_size);
       uint64_t failed_mask = gpu::ballot(slab_mask, slab) & ~locked_mask;
-      if (locked_mask & impl::id_in_mask()) {
+      if (locked_mask & id) {
         if (index != start)
           indices[chunk_id].store(index, cpp::MemoryOrder::RELAXED);
         uniform = uniform & locked_mask;
         result = slab;
-      } else if (failed_mask & impl::id_in_mask()) {
+      } else if (failed_mask & id) {
         slots[index].unlock(failed_mask & uniform);
       } else if (!slab && impl::is_sentinel(reserved)) {
         result =



More information about the libc-commits mailing list