[libc-commits] [libc] 9975dfd - [libc] Small performance improvements to GPU allocator
Joseph Huber via libc-commits
libc-commits at lists.llvm.org
Mon Jul 28 07:23:36 PDT 2025
Author: Joseph Huber
Date: 2025-07-28T09:23:29-05:00
New Revision: 9975dfdf800d9881b704a988bc004ec81639fe67
URL: https://github.com/llvm/llvm-project/commit/9975dfdf800d9881b704a988bc004ec81639fe67
DIFF: https://github.com/llvm/llvm-project/commit/9975dfdf800d9881b704a988bc004ec81639fe67.diff
LOG: [libc] Small performance improvements to GPU allocator
Summary:
This slightly improves performance in a few places. First, we
optimistically assume the cached slab has ample space, which lets us
skip the atomic load on the highly contended counter when the
allocation is likely to succeed. Second, we no longer call `match_any`
twice, since we can compute the uniform lane mask for a slab at the
moment we return it. Third, we always choose a random start index on a
32-bit word boundary. This means that in the fast case we fulfill the
allocation with a single `fetch_or`, and otherwise we quickly move to a
known free bit. This nets around a 7.75% improvement on the fast path.
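
To make the third point concrete, here is a standalone sketch (plain C++
with hypothetical helper names, not the allocator's actual code) of the
word-aligned fast path: round the random start down to a 32-bit word, try
to claim a bit with a single `fetch_or`, and on failure derive a known
free bit in the same word from the value that `fetch_or` returned.

    #include <atomic>
    #include <cstdint>

    constexpr uint32_t BITS_IN_WORD = 32;

    // Mirrors __builtin_align_down(index, BITS_IN_WORD) from the patch.
    static inline uint32_t align_down_to_word(uint32_t index) {
      return index & ~(BITS_IN_WORD - 1);
    }

    // Hypothetical helper: try to claim `bit` in `word`. Returns true if this
    // caller set the bit; otherwise writes the lowest still-free bit in the
    // word (or BITS_IN_WORD if the word is full) to `next_free_bit` so the
    // retry path can jump straight to it.
    static inline bool try_claim(std::atomic<uint32_t> &word, uint32_t bit,
                                 uint32_t &next_free_bit) {
      uint32_t before = word.fetch_or(1u << bit, std::memory_order_relaxed);
      if (!(before & (1u << bit)))
        return true; // Fast path: the allocation took exactly one fetch_or.
      uint32_t after = before | (1u << bit);
      next_free_bit = ~after ? __builtin_ctz(~after) : BITS_IN_WORD;
      return false;
    }

In the allocator itself the bitfield lives inside the slab and the start
index comes from `impl::xorshift32`, but the control flow is analogous.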
Added:
Modified:
libc/src/__support/GPU/allocator.cpp
libc/test/integration/src/stdlib/gpu/malloc_stress.cpp
Removed:
################################################################################
diff --git a/libc/src/__support/GPU/allocator.cpp b/libc/src/__support/GPU/allocator.cpp
index f115d2a26606a..5af7e3aab6acb 100644
--- a/libc/src/__support/GPU/allocator.cpp
+++ b/libc/src/__support/GPU/allocator.cpp
@@ -145,7 +145,7 @@ static inline constexpr bool is_pow2(uint64_t x) {
// Where this chunk size should start looking in the global array. Small
// allocations are much more likely than large ones, so we give them the most
// space. We use a cubic easing function normalized on the possible chunks.
-static inline constexpr uint32_t start_index(uint32_t chunk_size) {
+static inline constexpr uint32_t get_start_index(uint32_t chunk_size) {
constexpr uint32_t max_chunk = impl::get_chunk_id(SLAB_SIZE / 2);
uint64_t norm =
(1 << 16) - (impl::get_chunk_id(chunk_size) << 16) / max_chunk;
@@ -270,10 +270,10 @@ struct Slab {
continue;
// We try using any known empty bits from the previous attempt first.
- uint32_t start = gpu::shuffle(mask, cpp::countr_zero(uniform & mask),
- ~after ? (old_index & ~(BITS_IN_WORD - 1)) +
- cpp::countr_zero(~after)
- : impl::xorshift32(state));
+ uint32_t start = gpu::shuffle(
+ mask, cpp::countr_zero(uniform & mask),
+ ~after ? (old_index & ~(BITS_IN_WORD - 1)) + cpp::countr_zero(~after)
+ : __builtin_align_down(impl::xorshift32(state), BITS_IN_WORD));
uint32_t id = impl::lane_count(uniform & mask);
uint32_t index = (start + id) % usable_bits(chunk_size);
@@ -475,7 +475,7 @@ static GuardPtr slots[ARRAY_SIZE] = {};
// Keep a cache of the last successful slot for each chunk size. Initialize it
// to an even spread of the total size. Must be updated if the chunking scheme
// changes.
-#define S(X) (impl::start_index(X))
+#define S(X) (impl::get_start_index(X))
static cpp::Atomic<uint32_t> indices[] = {
S(16), S(32), S(48), S(64), S(96), S(112), S(128),
S(192), S(224), S(256), S(384), S(448), S(512), S(768),
@@ -487,18 +487,18 @@ static cpp::Atomic<uint32_t> indices[] = {
#undef S
// Tries to find a slab in the table that can support the given chunk size.
-static Slab *find_slab(uint32_t chunk_size) {
+static Slab *find_slab(uint32_t chunk_size, uint64_t &uniform) {
// We start at the index of the last successful allocation for this kind.
uint32_t chunk_id = impl::get_chunk_id(chunk_size);
uint32_t start = indices[chunk_id].load(cpp::MemoryOrder::RELAXED);
- uint64_t uniform = gpu::match_any(gpu::get_lane_mask(), chunk_size);
for (uint32_t offset = 0; offset <= ARRAY_SIZE; ++offset) {
uint32_t index =
!offset ? start
- : (impl::start_index(chunk_size) + offset - 1) % ARRAY_SIZE;
+ : (impl::get_start_index(chunk_size) + offset - 1) % ARRAY_SIZE;
- if (slots[index].use_count() < Slab::available_chunks(chunk_size)) {
+ if (!offset ||
+ slots[index].use_count() < Slab::available_chunks(chunk_size)) {
uint64_t lane_mask = gpu::get_lane_mask();
uint64_t reserved = 0;
@@ -521,13 +521,17 @@ static Slab *find_slab(uint32_t chunk_size) {
slab->get_chunk_size() == chunk_size) {
if (index != start)
indices[chunk_id].store(index, cpp::MemoryOrder::RELAXED);
+ uniform = uniform & gpu::get_lane_mask();
return slab;
} else if (slab && (reserved > Slab::available_chunks(chunk_size) ||
slab->get_chunk_size() != chunk_size)) {
slots[index].unlock(gpu::get_lane_mask(),
gpu::get_lane_mask() & uniform);
} else if (!slab && reserved == SENTINEL) {
+ uniform = uniform & gpu::get_lane_mask();
return nullptr;
+ } else {
+ sleep_briefly();
}
}
}
@@ -554,12 +558,12 @@ void *allocate(uint64_t size) {
// Try to find a slab for the rounded up chunk size and allocate from it.
uint32_t chunk_size = impl::get_chunk_size(static_cast<uint32_t>(size));
- Slab *slab = find_slab(chunk_size);
+ uint64_t uniform = gpu::match_any(gpu::get_lane_mask(), chunk_size);
+ Slab *slab = find_slab(chunk_size, uniform);
if (!slab || slab == reinterpret_cast<Slab *>(SENTINEL))
return nullptr;
uint64_t lane_mask = gpu::get_lane_mask();
- uint64_t uniform = gpu::match_any(lane_mask, slab->get_global_index());
void *ptr = slab->allocate(lane_mask, uniform);
return ptr;
}
diff --git a/libc/test/integration/src/stdlib/gpu/malloc_stress.cpp b/libc/test/integration/src/stdlib/gpu/malloc_stress.cpp
index 77479f85dc5cc..1c954fea31d49 100644
--- a/libc/test/integration/src/stdlib/gpu/malloc_stress.cpp
+++ b/libc/test/integration/src/stdlib/gpu/malloc_stress.cpp
@@ -14,6 +14,20 @@
using namespace LIBC_NAMESPACE;
+static inline uint32_t entropy() {
+ return (static_cast<uint32_t>(gpu::processor_clock()) ^
+ (gpu::get_thread_id_x() * 0x632be59b) ^
+ (gpu::get_block_id_x() * 0x85157af5)) *
+ 0x9e3779bb;
+}
+
+static inline uint32_t xorshift32(uint32_t &state) {
+ state ^= state << 13;
+ state ^= state >> 17;
+ state ^= state << 5;
+ return state * 0x9e3779bb;
+}
+
static inline void use(uint8_t *ptr, uint32_t size) {
EXPECT_NE(ptr, nullptr);
for (int i = 0; i < size; ++i)
@@ -34,5 +48,16 @@ TEST_MAIN(int, char **, char **) {
for (int i = 0; i < 256; ++i)
free(ptrs[i]);
+
+ uint32_t state = entropy();
+ for (int i = 0; i < 1024; ++i) {
+ if (xorshift32(state) % 2) {
+ uint64_t size = xorshift32(state) % 256 + 1;
+ void *ptr = malloc(size);
+ ASSERT_TRUE(ptr);
+ ASSERT_TRUE(__builtin_is_aligned(ptr, 16));
+ free(ptr);
+ }
+ }
return 0;
}