[libc-commits] [libc] 5dc9937 - [libc] Improve starting indices for GPU allocation (#150432)
via libc-commits
libc-commits at lists.llvm.org
Mon Jul 28 05:54:52 PDT 2025
Author: Joseph Huber
Date: 2025-07-28T07:54:48-05:00
New Revision: 5dc9937ea910f807d3e7325669053c5740545875
URL: https://github.com/llvm/llvm-project/commit/5dc9937ea910f807d3e7325669053c5740545875
DIFF: https://github.com/llvm/llvm-project/commit/5dc9937ea910f807d3e7325669053c5740545875.diff
LOG: [libc] Improve starting indices for GPU allocation (#150432)
Summary:
The slots in this allocation scheme are statically allocated. All sizes
share the same array of slots, but are given different starting
locations to space them apart. The previous implementation used a
trivial linear slice. This is inefficient because it gives the most
likely allocations (1-1024 bytes) no more space than a highly unlikely
one (1 MiB).
This patch uses a cubic easing function to gradually shrink the gaps.
For example, we used to get around 700 free slots for a 16-byte
allocation; now we get around 2100 before it starts encroaching on the
32-byte allocation space. This could be improved further, but I think
this is sufficient.
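For illustration, here is a minimal, self-contained C++ sketch of the easing
curve. The constants ARRAY_SIZE, SLAB_SIZE, and MIN_CHUNK and the chunk_id
helper are assumptions standing in for the real values and the
impl::get_chunk_id helper in allocator.cpp; the point is only to show how the
cubic curve front-loads the slot budget toward small chunk sizes.

// A minimal, standalone sketch of the cubic-easing placement (not the real
// allocator). ARRAY_SIZE, SLAB_SIZE, MIN_CHUNK, and chunk_id are illustrative
// stand-ins chosen only to show the shape of the curve.
#include <cstdint>
#include <cstdio>

constexpr uint32_t ARRAY_SIZE = 8192;           // assumed slot count
constexpr uint32_t SLAB_SIZE = 2 * 1024 * 1024; // assumed slab size
constexpr uint32_t MIN_CHUNK = 16;              // assumed smallest chunk

// Ordinal of a power-of-two chunk size (stand-in for impl::get_chunk_id).
constexpr uint32_t chunk_id(uint32_t chunk_size) {
  uint32_t id = 0;
  for (uint32_t size = MIN_CHUNK; size < chunk_size; size *= 2)
    ++id;
  return id;
}

// Cubic easing in 16.16 fixed point: the smallest chunk maps to index 0 and
// the largest to ARRAY_SIZE - 1, with the gaps shrinking as sizes grow.
constexpr uint32_t start_index(uint32_t chunk_size) {
  constexpr uint32_t max_chunk = chunk_id(SLAB_SIZE / 2);
  uint64_t norm = (1 << 16) -
                  (static_cast<uint64_t>(chunk_id(chunk_size)) << 16) / max_chunk;
  uint64_t bias = (norm * norm * norm) >> 32;
  uint64_t inv = (1 << 16) - bias;
  return static_cast<uint32_t>(((ARRAY_SIZE - 1) * inv) >> 16);
}

int main() {
  // The distance between consecutive start indices is the number of slots a
  // chunk size can use before encroaching on the next size's region.
  for (uint32_t size = MIN_CHUNK; size <= SLAB_SIZE / 2; size *= 2)
    std::printf("chunk %8u starts at slot %u\n", size, start_index(size));
}

Printing the start indices for consecutive power-of-two sizes shows the gap
(the per-size slot budget) shrinking as the chunk size grows, which is the
behavior the patch describes.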
Added:
Modified:
libc/src/__support/GPU/allocator.cpp
Removed:
################################################################################
diff --git a/libc/src/__support/GPU/allocator.cpp b/libc/src/__support/GPU/allocator.cpp
index 866aea7b69d4e..f115d2a26606a 100644
--- a/libc/src/__support/GPU/allocator.cpp
+++ b/libc/src/__support/GPU/allocator.cpp
@@ -142,10 +142,16 @@ static inline constexpr bool is_pow2(uint64_t x) {
return x && (x & (x - 1)) == 0;
}
-// Where this chunk size should start looking in the global array.
-static inline constexpr uint32_t start_index(uint32_t chunk_index) {
- return (ARRAY_SIZE * impl::get_chunk_id(chunk_index)) /
- impl::get_chunk_id(SLAB_SIZE / 2);
+// Where this chunk size should start looking in the global array. Small
+// allocations are much more likely than large ones, so we give them the most
+// space. We use a cubic easing function normalized on the possible chunks.
+static inline constexpr uint32_t start_index(uint32_t chunk_size) {
+ constexpr uint32_t max_chunk = impl::get_chunk_id(SLAB_SIZE / 2);
+ uint64_t norm =
+ (1 << 16) - (impl::get_chunk_id(chunk_size) << 16) / max_chunk;
+ uint64_t bias = (norm * norm * norm) >> 32;
+ uint64_t inv = (1 << 16) - bias;
+ return static_cast<uint32_t>(((ARRAY_SIZE - 1) * inv) >> 16);
}
} // namespace impl
@@ -487,9 +493,10 @@ static Slab *find_slab(uint32_t chunk_size) {
uint32_t start = indices[chunk_id].load(cpp::MemoryOrder::RELAXED);
uint64_t uniform = gpu::match_any(gpu::get_lane_mask(), chunk_size);
- for (uint32_t offset = 0; offset < ARRAY_SIZE; ++offset) {
+ for (uint32_t offset = 0; offset <= ARRAY_SIZE; ++offset) {
uint32_t index =
- !offset ? start : (impl::start_index(chunk_size) + offset) % ARRAY_SIZE;
+ !offset ? start
+ : (impl::start_index(chunk_size) + offset - 1) % ARRAY_SIZE;
if (slots[index].use_count() < Slab::available_chunks(chunk_size)) {
uint64_t lane_mask = gpu::get_lane_mask();