[libc-commits] [libc] [libc] Cache the most recently used slot for a chunk size (PR #149751)
Joseph Huber via libc-commits
libc-commits at lists.llvm.org
Sun Jul 20 20:48:04 PDT 2025
https://github.com/jhuber6 updated https://github.com/llvm/llvm-project/pull/149751
From 41bb7c8a93a65dd527eae7db894547ae492c6fd5 Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Tue, 8 Jul 2025 16:08:45 -0500
Subject: [PATCH] [libc] Cache the most recently used slot for a chunk size
Summary:
This patch changes the `find_slab` logic to cache the most recently
successful slot for each chunk size. This means the common fast path is
now a single atomic load on this cached index. I removed the SIMT
shuffling logic that performed slab lookups wave-parallel. Here I am
treating the actual traversal as comparatively unlikely, so it is
acceptable that it now takes longer. Ideally, one thread finds a slot
and shares it with the rest so we only pay that cost once.
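To make the intent concrete, here is a minimal standalone sketch of the
caching scheme, using std::atomic in place of the in-tree cpp::Atomic.
TABLE_SIZE, use_count, slot_has_room(), and find_slot() are illustrative
placeholders rather than the allocator's real types, and the reservation
itself is simplified to a plain counter:

  #include <atomic>
  #include <cstdint>

  constexpr uint32_t TABLE_SIZE = 128;         // placeholder table size
  std::atomic<uint32_t> use_count[TABLE_SIZE]; // stand-in per-slot state
  std::atomic<uint32_t> cached_index{0};       // last successful slot

  // Stand-in for slots[i].use_count() < Slab::available_chunks(size).
  static bool slot_has_room(uint32_t i) {
    return use_count[i].load(std::memory_order_relaxed) < 4;
  }

  static uint32_t find_slot() {
    // Fast path: a single relaxed atomic load of the cached index.
    uint32_t start = cached_index.load(std::memory_order_relaxed);
    for (uint32_t offset = 0; offset < TABLE_SIZE; ++offset) {
      uint32_t index = (start + offset) % TABLE_SIZE;
      if (slot_has_room(index)) {
        use_count[index].fetch_add(1, std::memory_order_relaxed);
        // Cache the winner so the next lookup starts here directly.
        if (index != start)
          cached_index.store(index, std::memory_order_relaxed);
        return index;
      }
    }
    return TABLE_SIZE; // table exhausted
  }

In the patch itself the actual claim goes through the existing try_lock()
path; the sketch only shows where the cached index is read and updated.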
---
libc/src/__support/GPU/allocator.cpp | 118 ++++++++++++++-------------
1 file changed, 60 insertions(+), 58 deletions(-)
diff --git a/libc/src/__support/GPU/allocator.cpp b/libc/src/__support/GPU/allocator.cpp
index 7923fbb2c1c24..23f30ece4461a 100644
--- a/libc/src/__support/GPU/allocator.cpp
+++ b/libc/src/__support/GPU/allocator.cpp
@@ -34,13 +34,12 @@ constexpr static uint32_t BITS_IN_WORD = sizeof(uint32_t) * 8;
constexpr static uint32_t MIN_SIZE = 16;
constexpr static uint32_t MIN_ALIGNMENT = MIN_SIZE - 1;
+// The number of times to attempt claiming an in-progress slab allocation.
+constexpr static uint32_t MAX_TRIES = 128;
+
// A sentinel used to indicate an invalid but non-null pointer value.
constexpr static uint64_t SENTINEL = cpp::numeric_limits<uint64_t>::max();
-// The number of times we will try starting on a single index before skipping
-// past it.
-constexpr static uint32_t MAX_TRIES = 512;
-
static_assert(!(ARRAY_SIZE & (ARRAY_SIZE - 1)), "Must be a power of two");
namespace impl {
@@ -92,20 +91,10 @@ static inline uint32_t xorshift32(uint32_t &state) {
return state * 0x9e3779bb;
}
-// Final stage of murmurhash used to get a unique index for the global array
-static inline uint32_t hash(uint32_t x) {
- x ^= x >> 16;
- x *= 0x85ebca6b;
- x ^= x >> 13;
- x *= 0xc2b2ae35;
- x ^= x >> 16;
- return x;
-}
-
// Rounds the input value to the closest permitted chunk size. Here we accept
// the sum of the closest three powers of two. For a 2MiB slab size this is 48
// different chunk sizes, giving a worst-case utilization of 87.5% (at most
// 12.5% internal fragmentation).
-static inline uint32_t get_chunk_size(uint32_t x) {
+static inline constexpr uint32_t get_chunk_size(uint32_t x) {
uint32_t y = x < MIN_SIZE ? MIN_SIZE : x;
uint32_t pow2 = BITS_IN_WORD - cpp::countl_zero(y - 1);
@@ -123,6 +112,16 @@ static inline uint32_t get_chunk_size(uint32_t x) {
return (s3 + MIN_ALIGNMENT) & ~MIN_ALIGNMENT;
}
+// Converts a chunk size into an index suitable for a statically sized array.
+static inline constexpr uint32_t get_chunk_id(uint32_t x) {
+ uint32_t y = x >> 4;
+ if (x <= MIN_SIZE)
+ return 0;
+ if (x < MIN_SIZE << 2)
+ return cpp::popcount(y);
+ return cpp::popcount(y) + 3 * (BITS_IN_WORD - cpp::countl_zero(y)) - 7;
+}
+
// Rounds to the nearest power of two.
template <uint32_t N, typename T>
static inline constexpr T round_up(const T x) {
@@ -143,6 +142,10 @@ static inline constexpr bool is_pow2(uint64_t x) {
return x && (x & (x - 1)) == 0;
}
+// Maps a chunk size to a starting index, spreading the cached chunk ids
+// evenly across the global array. The largest cached chunk size is 1 MiB.
+static inline constexpr uint32_t start_index(uint32_t chunk_size) {
+  return (ARRAY_SIZE * impl::get_chunk_id(chunk_size)) /
+         (impl::get_chunk_id(1048576) + 1);
+}
+
} // namespace impl
/// A slab allocator used to hand out identically sized slabs of memory.
@@ -451,66 +454,65 @@ struct GuardPtr {
// The global array used to search for a valid slab to allocate from.
static GuardPtr slots[ARRAY_SIZE] = {};
+// Keep a cache of the last successful slot for each chunk size. Initialize
+// it to an even spread across the total array size. This must be updated if
+// the chunking scheme changes.
+#define S(X) (impl::start_index(X))
+static cpp::Atomic<uint32_t> indices[] = {
+ S(16), S(32), S(48), S(64), S(96), S(112), S(128),
+ S(192), S(224), S(256), S(384), S(448), S(512), S(768),
+ S(896), S(1024), S(1536), S(1792), S(2048), S(3072), S(3584),
+ S(4096), S(6144), S(7168), S(8192), S(12288), S(14336), S(16384),
+ S(24576), S(28672), S(32768), S(49152), S(57344), S(65536), S(98304),
+ S(114688), S(131072), S(196608), S(229376), S(262144), S(393216), S(458752),
+ S(524288), S(786432), S(917504), S(1048576)};
+#undef S
+
// Tries to find a slab in the table that can support the given chunk size.
static Slab *find_slab(uint32_t chunk_size) {
-  // We start at a hashed value to spread out different chunk sizes.
+  // We start at the cached index of the last successful allocation.
- uint32_t start = impl::hash(chunk_size);
- uint64_t lane_mask = gpu::get_lane_mask();
- uint64_t uniform = gpu::match_any(lane_mask, chunk_size);
-
- Slab *result = nullptr;
- uint32_t nudge = 0;
- for (uint64_t mask = lane_mask; mask;
- mask = gpu::ballot(lane_mask, !result), ++nudge) {
- uint32_t index = cpp::numeric_limits<uint32_t>::max();
- for (uint32_t offset = nudge / MAX_TRIES;
- gpu::ballot(lane_mask, index == cpp::numeric_limits<uint32_t>::max());
- offset += cpp::popcount(uniform & lane_mask)) {
- uint32_t candidate =
- (start + offset + impl::lane_count(uniform & lane_mask)) % ARRAY_SIZE;
- uint64_t available =
- gpu::ballot(lane_mask, slots[candidate].use_count() <
- Slab::available_chunks(chunk_size));
- uint32_t new_index = gpu::shuffle(
- lane_mask, cpp::countr_zero(available & uniform), candidate);
-
- // Each uniform group will use the first empty slot they find.
- if ((index == cpp::numeric_limits<uint32_t>::max() &&
- (available & uniform)))
- index = new_index;
-
- // Guaruntees that this loop will eventuall exit if there is no space.
- if (offset >= ARRAY_SIZE) {
- result = reinterpret_cast<Slab *>(SENTINEL);
- index = 0;
- }
- }
+ uint32_t chunk_id = impl::get_chunk_id(chunk_size);
+  uint32_t start = indices[chunk_id].load(cpp::MemoryOrder::RELAXED);
+ uint64_t uniform = gpu::match_any(gpu::get_lane_mask(), chunk_size);
+
+ for (uint32_t offset = 0; offset < ARRAY_SIZE; ++offset) {
+ uint32_t index =
+ !offset ? start : (impl::start_index(chunk_size) + offset) % ARRAY_SIZE;
- // Try to claim a slot for the found slot.
- if (!result) {
+ if (slots[index].use_count() < Slab::available_chunks(chunk_size)) {
+ uint64_t lane_mask = gpu::get_lane_mask();
uint64_t reserved = 0;
- Slab *slab = slots[index].try_lock(lane_mask & mask, uniform & mask,
+
+ Slab *slab = slots[index].try_lock(lane_mask, uniform & lane_mask,
reserved, chunk_size, index);
+
+      // If there is a slab allocation in progress, we retry a few times.
+ for (uint32_t retries = 0;
+ retries < MAX_TRIES && !slab && reserved != SENTINEL; retries++) {
+ uint64_t lane_mask = gpu::get_lane_mask();
+ slab = slots[index].try_lock(lane_mask, uniform & lane_mask, reserved,
+ chunk_size, index);
+ sleep_briefly();
+ }
+
// If we find a slab with a matching chunk size then we store the result.
// Otherwise, we need to free the claimed lock and continue. In the case
- // of out-of-memory we return a sentinel value.
+      // of out-of-memory we receive a sentinel value and return a failure.
if (slab && reserved <= Slab::available_chunks(chunk_size) &&
slab->get_chunk_size() == chunk_size) {
- result = slab;
+ if (index != start)
+          indices[chunk_id].store(index, cpp::MemoryOrder::RELAXED);
+ return slab;
} else if (slab && (reserved > Slab::available_chunks(chunk_size) ||
slab->get_chunk_size() != chunk_size)) {
- if (slab->get_chunk_size() != chunk_size)
- start = index + 1;
slots[index].unlock(gpu::get_lane_mask(),
gpu::get_lane_mask() & uniform);
- } else if (!slab && reserved == cpp::numeric_limits<uint64_t>::max()) {
- result = reinterpret_cast<Slab *>(SENTINEL);
- } else {
- sleep_briefly();
+ } else if (!slab && reserved == SENTINEL) {
+ return nullptr;
}
}
}
- return result;
+ return nullptr;
}
// Release the lock associated with a given slab.
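As a standalone sanity check on the chunking scheme, the sketch below copies
get_chunk_id() from this patch, swapping cpp::popcount and cpp::countl_zero
for their C++20 <bit> equivalents, and asserts that the 46 chunk sizes used
to initialize the cache of indices map onto the dense ids 0 through 45:

  #include <bit>
  #include <cassert>
  #include <cstdint>

  constexpr uint32_t MIN_SIZE = 16;
  constexpr uint32_t BITS_IN_WORD = sizeof(uint32_t) * 8;

  // Copied from the patch, with cpp:: swapped for std::.
  constexpr uint32_t get_chunk_id(uint32_t x) {
    uint32_t y = x >> 4;
    if (x <= MIN_SIZE)
      return 0;
    if (x < MIN_SIZE << 2)
      return std::popcount(y);
    return std::popcount(y) + 3 * (BITS_IN_WORD - std::countl_zero(y)) - 7;
  }

  int main() {
    // The same sizes used to initialize the cache of indices above.
    constexpr uint32_t sizes[] = {
        16,     32,     48,     64,     96,     112,    128,    192,
        224,    256,    384,    448,    512,    768,    896,    1024,
        1536,   1792,   2048,   3072,   3584,   4096,   6144,   7168,
        8192,   12288,  14336,  16384,  24576,  28672,  32768,  49152,
        57344,  65536,  98304,  114688, 131072, 196608, 229376, 262144,
        393216, 458752, 524288, 786432, 917504, 1048576};
    for (uint32_t i = 0; i < sizeof(sizes) / sizeof(sizes[0]); ++i)
      assert(get_chunk_id(sizes[i]) == i);
    return 0;
  }

If the chunking scheme changes, both this check and the initializer above
would need to be updated to match.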