[libc-commits] [libc] 6cc42b3 - [libc] Various GPU allocator tweaks and optimizations (#184368)
via libc-commits
libc-commits at lists.llvm.org
Tue Mar 3 07:59:23 PST 2026
Author: Joseph Huber
Date: 2026-03-03T09:59:02-06:00
New Revision: 6cc42b39556d33a968a899fd3243bcb707ae7169
URL: https://github.com/llvm/llvm-project/commit/6cc42b39556d33a968a899fd3243bcb707ae7169
DIFF: https://github.com/llvm/llvm-project/commit/6cc42b39556d33a968a899fd3243bcb707ae7169.diff
LOG: [libc] Various GPU allocator tweaks and optimizations (#184368)
Summary:
Some low-hanging fruit tweaks. Mostly preventing redundant loads and
unnecessary widening. Some fixes as well, like nullptr handling,
incorrect rounding, and oversized bitfields.
Added:
Modified:
libc/src/__support/GPU/allocator.cpp
Removed:
################################################################################
diff --git a/libc/src/__support/GPU/allocator.cpp b/libc/src/__support/GPU/allocator.cpp
index 02fdcd9759ffe..24f98f1b8d08d 100644
--- a/libc/src/__support/GPU/allocator.cpp
+++ b/libc/src/__support/GPU/allocator.cpp
@@ -124,15 +124,9 @@ static inline constexpr uint32_t get_chunk_id(uint32_t x) {
return cpp::popcount(y) + 3 * (BITS_IN_WORD - cpp::countl_zero(y)) - 7;
}
-// Rounds to the nearest power of two.
-template <uint32_t N, typename T>
-static inline constexpr T round_up(const T x) {
- static_assert(((N - 1) & N) == 0, "N must be a power of two");
- return (x + N) & ~(N - 1);
-}
-
// Perform a lane parallel memset on a uint32_t pointer.
-void uniform_memset(uint32_t *s, uint32_t c, uint32_t n, uint64_t lane_mask) {
+static inline void uniform_memset(uint32_t *s, uint32_t c, uint32_t n,
+ uint64_t lane_mask) {
uint32_t workers = cpp::popcount(lane_mask);
for (uint32_t i = impl::lane_count(lane_mask, gpu::get_lane_id()); i < n;
i += workers)
@@ -223,9 +217,8 @@ struct Slab {
if (get_cached_chunk_size() <= get_chunk_size())
return;
- uint32_t size = (bitfield_bytes(get_chunk_size()) + sizeof(uint32_t) - 1) /
- sizeof(uint32_t);
- impl::uniform_memset(get_bitfield(), 0, size, lane_mask);
+ impl::uniform_memset(get_bitfield(), 0, bitfield_words(get_chunk_size()),
+ lane_mask);
}
// Get the number of chunks that can theoretically fit inside this slab.
@@ -233,11 +226,16 @@ struct Slab {
return SLAB_SIZE / chunk_size;
}
- // Get the number of bytes needed to contain the bitfield bits.
+ // Get the number of uint32_t words needed for the bitfield.
+ constexpr static uint32_t bitfield_words(uint32_t chunk_size) {
+ return (num_chunks(chunk_size) + BITS_IN_WORD - 1) / BITS_IN_WORD;
+ }
+
+ // Get the number of bytes reserved for the bitfield region with padding.
constexpr static uint32_t bitfield_bytes(uint32_t chunk_size) {
- return __builtin_align_up(
- ((num_chunks(chunk_size) + BITS_IN_WORD - 1) / BITS_IN_WORD) * 8,
- MIN_ALIGNMENT + 1);
+ return __builtin_align_up(bitfield_words(chunk_size) *
+ uint32_t(sizeof(uint32_t)),
+ __GCC_DESTRUCTIVE_SIZE << 1);
}
// The actual amount of memory available excluding the bitfield and metadata.
@@ -245,11 +243,6 @@ struct Slab {
return SLAB_SIZE - bitfield_bytes(chunk_size) - sizeof(Header);
}
- // The number of chunks that can be stored in this slab.
- constexpr static uint32_t available_chunks(uint32_t chunk_size) {
- return available_bytes(chunk_size) / chunk_size;
- }
-
// The length in bits of the bitfield.
constexpr static uint32_t usable_bits(uint32_t chunk_size) {
return available_bytes(chunk_size) / chunk_size;
@@ -295,8 +288,8 @@ struct Slab {
// Randomly walks the bitfield until it finds a free bit. Allocations attempt
// to put lanes right next to each other for better caching and convergence.
- void *allocate(uint64_t uniform, uint32_t reserved) {
- uint32_t chunk_size = get_chunk_size();
+ void *allocate(uint64_t uniform, uint32_t reserved, uint32_t chunk_size) {
+ uint32_t bits = usable_bits(chunk_size);
uint32_t state = impl::entropy();
// Try to find the empty bit in the bitfield to finish the allocation. We
@@ -308,17 +301,17 @@ struct Slab {
lane_mask = gpu::ballot(uniform, !result)) {
if (!result) {
// Each lane tries to claim one bit in a single contiguous mask.
- uint32_t id = impl::lane_count(uniform & lane_mask, gpu::get_lane_id());
- uint32_t index = (start + id) % usable_bits(chunk_size);
+ uint32_t id = impl::lane_count(lane_mask, gpu::get_lane_id());
+ uint32_t index = (start + id) % bits;
uint32_t slot = index / BITS_IN_WORD;
uint32_t bit = index % BITS_IN_WORD;
// Get the mask of bits destined for the same slot and coalesce it.
uint32_t leader = impl::get_leader_id(
- uniform & gpu::ballot(lane_mask, !id || index % BITS_IN_WORD == 0),
+ gpu::ballot(lane_mask, !id || index % BITS_IN_WORD == 0),
gpu::get_lane_id());
- uint32_t length = cpp::popcount(uniform & lane_mask) -
- impl::lane_count(uniform & lane_mask, leader);
+ uint32_t length =
+ cpp::popcount(lane_mask) - impl::lane_count(lane_mask, leader);
uint32_t bitmask =
static_cast<uint32_t>(
(uint64_t(1) << cpp::min(length, BITS_IN_WORD)) - 1)
@@ -415,7 +408,7 @@ struct GuardPtr {
// Returns the current reference count, potentially helping a releasing
// thread.
- uint64_t read() {
+ uint32_t read() {
auto val = counter.load(cpp::MemoryOrder::RELAXED);
if (val == 0 && RECLAIM &&
counter.compare_exchange_strong(val, INVALID | HELPED,
@@ -529,7 +522,7 @@ struct GuardPtr {
}
// Get the current value of the reference counter.
- uint64_t use_count() { return ref.read(); }
+ uint32_t use_count() { return ref.read(); }
};
// The global array used to search for a valid slab to allocate from.
@@ -563,8 +556,8 @@ static Slab *find_slab(uint32_t chunk_size, uint64_t lane_mask,
!offset ? start
: (impl::get_start_index(chunk_size) + offset - 1) % ARRAY_SIZE;
- bool available = !offset || slots[index].use_count() <
- Slab::available_chunks(chunk_size);
+ bool available =
+ !offset || slots[index].use_count() < Slab::usable_bits(chunk_size);
uint64_t slab_mask = gpu::ballot(lane_mask, !result && available);
if (slab_mask & impl::id_in_mask()) {
Slab *slab = slots[index].try_lock(slab_mask, uniform & slab_mask,
@@ -574,11 +567,9 @@ static Slab *find_slab(uint32_t chunk_size, uint64_t lane_mask,
// Otherwise, we need to free the claimed lock and continue. In the case
// of out-of-memory we receive a sentinel value and return a failure.
uint64_t locked_mask = gpu::ballot(
- slab_mask, slab && reserved < Slab::available_chunks(chunk_size) &&
+ slab_mask, slab && reserved < Slab::usable_bits(chunk_size) &&
slab->get_chunk_size() == chunk_size);
- uint64_t failed_mask = gpu::ballot(
- slab_mask, slab && (reserved >= Slab::available_chunks(chunk_size) ||
- slab->get_chunk_size() != chunk_size));
+ uint64_t failed_mask = gpu::ballot(slab_mask, slab) & ~locked_mask;
if (locked_mask & impl::id_in_mask()) {
if (index != start)
indices[chunk_id].store(index, cpp::MemoryOrder::RELAXED);
@@ -605,7 +596,7 @@ void *allocate(uint64_t size) {
// Allocations requiring a full slab or more go directly to memory.
if (size >= SLAB_SIZE / 2)
- return impl::rpc_allocate(impl::round_up<SLAB_SIZE>(size));
+ return impl::rpc_allocate(__builtin_align_up(size, SLAB_SIZE));
// Try to find a slab for the rounded up chunk size and allocate from it.
uint32_t chunk_size = impl::get_chunk_size(static_cast<uint32_t>(size));
@@ -616,7 +607,7 @@ void *allocate(uint64_t size) {
if (!slab)
return nullptr;
- void *ptr = slab->allocate(uniform, reserved);
+ void *ptr = slab->allocate(uniform, reserved, chunk_size);
return ptr;
}
@@ -683,7 +674,7 @@ void *aligned_allocate(uint32_t alignment, uint64_t size) {
// alignment and then round up. The index logic will round down properly.
uint64_t rounded = size + alignment - MIN_ALIGNMENT;
void *ptr = gpu::allocate(rounded);
- return __builtin_align_up(ptr, alignment);
+ return ptr ? __builtin_align_up(ptr, alignment) : ptr;
}
} // namespace gpu
More information about the libc-commits mailing list