[libc-commits] [libc] [libc] Cache the most recently used slot for a chunk size (PR #149751)
Matt Arsenault via libc-commits
libc-commits at lists.llvm.org
Mon Jul 21 00:51:31 PDT 2025
================
@@ -451,66 +454,65 @@ struct GuardPtr {
// The global array used to search for a valid slab to allocate from.
static GuardPtr slots[ARRAY_SIZE] = {};
+// Keep a cache of the last successful slot for each chunk size. Initialize it
+// to an even spread of the total size. Must be updated if the chunking scheme
+// changes.
+#define S(X) (impl::start_index(X))
+static cpp::Atomic<uint32_t> indicies[] = {
+ S(16), S(32), S(48), S(64), S(96), S(112), S(128),
+ S(192), S(224), S(256), S(384), S(448), S(512), S(768),
+ S(896), S(1024), S(1536), S(1792), S(2048), S(3072), S(3584),
+ S(4096), S(6144), S(7168), S(8192), S(12288), S(14336), S(16384),
+ S(24576), S(28672), S(32768), S(49152), S(57344), S(65536), S(98304),
+ S(114688), S(131072), S(196608), S(229376), S(262144), S(393216), S(458752),
+ S(524288), S(786432), S(917504), S(1048576)};
+#undef S
+
// Tries to find a slab in the table that can support the given chunk size.
static Slab *find_slab(uint32_t chunk_size) {
// We start at a hashed value to spread out different chunk sizes.
- uint32_t start = impl::hash(chunk_size);
- uint64_t lane_mask = gpu::get_lane_mask();
- uint64_t uniform = gpu::match_any(lane_mask, chunk_size);
-
- Slab *result = nullptr;
- uint32_t nudge = 0;
- for (uint64_t mask = lane_mask; mask;
- mask = gpu::ballot(lane_mask, !result), ++nudge) {
- uint32_t index = cpp::numeric_limits<uint32_t>::max();
- for (uint32_t offset = nudge / MAX_TRIES;
- gpu::ballot(lane_mask, index == cpp::numeric_limits<uint32_t>::max());
- offset += cpp::popcount(uniform & lane_mask)) {
- uint32_t candidate =
- (start + offset + impl::lane_count(uniform & lane_mask)) % ARRAY_SIZE;
- uint64_t available =
- gpu::ballot(lane_mask, slots[candidate].use_count() <
- Slab::available_chunks(chunk_size));
- uint32_t new_index = gpu::shuffle(
- lane_mask, cpp::countr_zero(available & uniform), candidate);
-
- // Each uniform group will use the first empty slot they find.
- if ((index == cpp::numeric_limits<uint32_t>::max() &&
- (available & uniform)))
- index = new_index;
-
- // Guaruntees that this loop will eventuall exit if there is no space.
- if (offset >= ARRAY_SIZE) {
- result = reinterpret_cast<Slab *>(SENTINEL);
- index = 0;
- }
- }
+ uint32_t chunk_id = impl::get_chunk_id(chunk_size);
+ uint32_t start = indicies[chunk_id].load(cpp::MemoryOrder::RELAXED);
+ uint64_t uniform = gpu::match_any(gpu::get_lane_mask(), chunk_size);
+
+ for (uint32_t offset = 0; offset < ARRAY_SIZE; ++offset) {
+ uint32_t index =
+ !offset ? start : (impl::start_index(chunk_size) + offset) % ARRAY_SIZE;
- // Try to claim a slot for the found slot.
- if (!result) {
+ if (slots[index].use_count() < Slab::available_chunks(chunk_size)) {
+ uint64_t lane_mask = gpu::get_lane_mask();
uint64_t reserved = 0;
- Slab *slab = slots[index].try_lock(lane_mask & mask, uniform & mask,
+
+ Slab *slab = slots[index].try_lock(lane_mask, uniform & lane_mask,
reserved, chunk_size, index);
+
+ // If there is a slab allocation in progress we retry a few times.
+ for (uint32_t retries = 0;
+ retries < MAX_TRIES && !slab && reserved != SENTINEL; retries++) {
+ uint64_t lane_mask = gpu::get_lane_mask();
+ slab = slots[index].try_lock(lane_mask, uniform & lane_mask, reserved,
+ chunk_size, index);
+ sleep_briefly();
+ }
+
// If we find a slab with a matching chunk size then we store the result.
// Otherwise, we need to free the claimed lock and continue. In the case
- // of out-of-memory we return a sentinel value.
+ // of out-of-memory we recieve a sentinel value and return a failure.
----------------
arsenm wrote:
```suggestion
// of out-of-memory we receive a sentinel value and return a failure.
```
https://github.com/llvm/llvm-project/pull/149751
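
The pattern under review is simple to see in isolation: keep one relaxed-atomic start index per chunk-size class, begin the slab search at that cached index, fall back to a linear probe from the hashed position, and record the winning index so the next allocation of the same size starts where the last one succeeded. The sketch below is an illustrative, self-contained C++ rendering of that idea only; the names and sizes (`SLOT_COUNT`, `NUM_CLASSES`, `SLOT_CAPACITY`, `hashed_start`, `use_count`, `find_slot`) are placeholders rather than the PR's identifiers, and the GPU lane-mask, uniform-group, and `try_lock` retry handling in the real code is omitted.

```cpp
// Illustrative sketch (not the PR's code): a per chunk-size-class cache of the
// last successful slot index, consulted before the regular linear probe.
#include <array>
#include <atomic>
#include <cstdint>
#include <optional>

constexpr uint32_t SLOT_COUNT = 128;   // hypothetical slot-table size
constexpr uint32_t NUM_CLASSES = 45;   // hypothetical number of chunk-size classes
constexpr uint32_t SLOT_CAPACITY = 8;  // hypothetical chunks available per slot

// Toy slot table: chunks currently handed out per slot (zero-initialized).
std::array<std::atomic<uint32_t>, SLOT_COUNT> use_count{};

// Cached start index per size class; zero-initialized here, whereas the PR
// seeds its table with an even spread across the array.
std::array<std::atomic<uint32_t>, NUM_CLASSES> cached_start{};

// Stand-in for hashing a chunk size to a starting position in the table.
uint32_t hashed_start(uint32_t chunk_size) { return chunk_size % SLOT_COUNT; }

// Probe the table starting at the cached index, then scan linearly from the
// hashed position; publish whichever index succeeds for the next caller.
std::optional<uint32_t> find_slot(uint32_t chunk_class, uint32_t chunk_size) {
  uint32_t start = cached_start[chunk_class].load(std::memory_order_relaxed);
  for (uint32_t offset = 0; offset < SLOT_COUNT; ++offset) {
    uint32_t index =
        offset == 0 ? start : (hashed_start(chunk_size) + offset) % SLOT_COUNT;
    uint32_t expected = use_count[index].load(std::memory_order_relaxed);
    // Claim one chunk if the slot still has room. A single CAS attempt keeps
    // the sketch short; a real allocator would retry on contention.
    if (expected < SLOT_CAPACITY &&
        use_count[index].compare_exchange_strong(expected, expected + 1)) {
      cached_start[chunk_class].store(index, std::memory_order_relaxed);
      return index;
    }
  }
  return std::nullopt; // no slot has room for this chunk size
}
```

Relaxed ordering is enough for the cache itself: a stale start index only costs extra probing, never correctness, which matches the RELAXED load used in the hunk above.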