[libc-commits] [libc] 185f078 - [libc] Improve SIMT control flow in the GPU allocator
Joseph Huber via libc-commits
libc-commits at lists.llvm.org
Mon Jan 12 06:24:31 PST 2026
Author: Joseph Huber
Date: 2026-01-12T08:24:17-06:00
New Revision: 185f078a6fbb7ff4d1a7f9ac5d0416a8964e1006
URL: https://github.com/llvm/llvm-project/commit/185f078a6fbb7ff4d1a7f9ac5d0416a8964e1006
DIFF: https://github.com/llvm/llvm-project/commit/185f078a6fbb7ff4d1a7f9ac5d0416a8964e1006.diff
LOG: [libc] Improve SIMT control flow in the GPU allocator
Summary:
Volta's independent thread scheduling is very difficult to work with. This is
a first attempt to make the logic more sound when lanes execute independently.
It isn't all that's required, but it ends up improving control flow for AMDGPU
as well.
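To make the change concrete, here is a minimal illustrative sketch (not part of this commit) of the pattern the patch moves toward: capture the lane mask once while the lanes are still converged, ballot the divergent condition over that mask, and have each lane test its own bit (as impl::id_in_mask() does in the diff) rather than re-querying gpu::get_lane_mask() inside divergent code. The gpu:: helpers are the ones used in the diff below; before_pattern, after_pattern, and the wants_init predicate are made up for illustration.

// Illustrative sketch only; assumes the gpu:: helpers used in the diff.
#include "src/__support/GPU/utils.h"

#include <stdint.h>

namespace {

// Mirrors impl::id_in_mask() from the patch: this lane's bit in a lane mask.
uint64_t id_in_mask() { return 1ull << gpu::get_lane_id(); }

// Fragile under independent thread scheduling: lanes may reach the divergent
// region at different times, so get_lane_mask() can return a different mask
// on different lanes of the same logical group.
void before_pattern(bool wants_init) {
  if (wants_init) {
    uint64_t mask = gpu::get_lane_mask(); // not necessarily uniform here
    (void)mask; // ... cooperative work keyed off `mask` ...
  }
}

// The pattern the patch adopts: take the mask while converged, ballot the
// condition over that mask, and branch on this lane's bit. Every lane that
// enters the branch then agrees on exactly one mask, `init_mask`.
void after_pattern(bool wants_init) {
  uint64_t lane_mask = gpu::get_lane_mask(); // taken while converged
  uint64_t init_mask = gpu::ballot(lane_mask, wants_init);
  if (init_mask & id_in_mask()) {
    // ... cooperative work keyed off `init_mask` ...
  }
}

} // namespace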
Added:
Modified:
libc/src/__support/GPU/allocator.cpp
Removed:
################################################################################
diff --git a/libc/src/__support/GPU/allocator.cpp b/libc/src/__support/GPU/allocator.cpp
index 813a2a48331cb..7182180ab3613 100644
--- a/libc/src/__support/GPU/allocator.cpp
+++ b/libc/src/__support/GPU/allocator.cpp
@@ -38,7 +38,7 @@ constexpr static uint32_t MIN_SIZE = 16;
constexpr static uint32_t MIN_ALIGNMENT = MIN_SIZE - 1;
// The number of times to attempt claiming an in-progress slab allocation.
-constexpr static uint32_t MAX_TRIES = 1024;
+constexpr static uint32_t MAX_TRIES = 128;
// The number of previously allocated slabs we will keep in memory.
constexpr static uint32_t CACHED_SLABS = 8;
@@ -136,11 +136,11 @@ static inline constexpr T round_up(const T x) {
}
// Perform a lane parallel memset on a uint32_t pointer.
-void uniform_memset(uint32_t *s, uint32_t c, uint32_t n, uint64_t uniform) {
- uint64_t mask = gpu::get_lane_mask();
+void uniform_memset(uint32_t *s, uint32_t c, uint32_t n, uint64_t lane_mask,
+ uint64_t uniform) {
uint32_t workers = cpp::popcount(uniform);
- for (uint32_t i = impl::lane_count(mask & uniform, gpu::get_lane_id()); i < n;
- i += workers)
+ for (uint32_t i = impl::lane_count(lane_mask & uniform, gpu::get_lane_id());
+ i < n; i += workers)
s[i] = c;
}
@@ -176,6 +176,9 @@ template <typename T> bool is_sentinel(const T &x) {
return x == cpp::numeric_limits<T>::max();
}
+// Returns the current lane's position in the lane mask.
+uint64_t id_in_mask() { return 1ull << gpu::get_lane_id(); }
+
} // namespace impl
/// A slab allocator used to hand out identically sized slabs of memory.
@@ -220,14 +223,14 @@ struct Slab {
// Set the necessary bitfield bytes to zero in parallel using many lanes. This
// must be called before the bitfield can be accessed safely, memory is not
// guaranteed to be zero initialized in the current implementation.
- void initialize(uint64_t uniform) {
+ void initialize(uint64_t lane_mask, uint64_t uniform) {
// If this is a re-used slab the memory is already set to zero.
if (get_cached_chunk_size() <= get_chunk_size())
return;
uint32_t size = (bitfield_bytes(get_chunk_size()) + sizeof(uint32_t) - 1) /
sizeof(uint32_t);
- impl::uniform_memset(get_bitfield(), 0, size, uniform);
+ impl::uniform_memset(get_bitfield(), 0, size, lane_mask, uniform);
}
// Get the number of chunks that can theoretically fit inside this slab.
@@ -495,20 +498,18 @@ struct GuardPtr {
result = gpu::shuffle(lane_mask, cpp::countr_zero(uniform), result);
count = gpu::shuffle(lane_mask, cpp::countr_zero(uniform), count);
- if (!result)
- return nullptr;
-
// We defer storing the newly allocated slab until now so that we can use
// multiple lanes to initialize it and release it for use.
- if (impl::is_sentinel(count)) {
- result->initialize(uniform);
+ uint64_t slab_mask =
+ gpu::ballot(lane_mask, result && impl::is_sentinel(count));
+ if (slab_mask & impl::id_in_mask()) {
+ result->initialize(slab_mask, uniform);
if (gpu::get_lane_id() == uint32_t(cpp::countr_zero(uniform)))
finalize(result, cpp::popcount(uniform), count);
- count =
- gpu::shuffle(gpu::get_lane_mask(), cpp::countr_zero(uniform), count);
+ count = gpu::shuffle(slab_mask, cpp::countr_zero(uniform), count);
}
- if (!impl::is_sentinel(count))
+ if (result)
count = count - cpp::popcount(uniform) +
impl::lane_count(uniform, gpu::get_lane_id());
@@ -553,52 +554,56 @@ static cpp::Atomic<uint32_t> indices[] = {
#undef S
// Tries to find a slab in the table that can support the given chunk size.
-static Slab *find_slab(uint32_t chunk_size, uint64_t &uniform,
- uint32_t &reserved) {
+static Slab *find_slab(uint32_t chunk_size, uint64_t lane_mask,
+ uint64_t &uniform, uint32_t &reserved) {
// We start at the index of the last successful allocation for this kind.
uint32_t chunk_id = impl::get_chunk_id(chunk_size);
uint32_t start = indices[chunk_id].load(cpp::MemoryOrder::RELAXED);
- for (uint32_t offset = 0; offset <= ARRAY_SIZE; ++offset) {
+ Slab *result = nullptr;
+ for (uint32_t offset = 0;
+ gpu::ballot(lane_mask, !result) && offset <= ARRAY_SIZE; ++offset) {
uint32_t index =
!offset ? start
: (impl::get_start_index(chunk_size) + offset - 1) % ARRAY_SIZE;
- if (!offset ||
- slots[index].use_count() < Slab::available_chunks(chunk_size)) {
- uint64_t lane_mask = gpu::get_lane_mask();
-
- Slab *slab = slots[index].try_lock(lane_mask, uniform & lane_mask,
+ bool available = !offset || slots[index].use_count() <
+ Slab::available_chunks(chunk_size);
+ uint64_t slab_mask = gpu::ballot(uniform, !result && available);
+ if (slab_mask & impl::id_in_mask()) {
+ Slab *slab = slots[index].try_lock(slab_mask, uniform & slab_mask,
reserved, chunk_size, index);
// If we find a slab with a matching chunk size then we store the result.
// Otherwise, we need to free the claimed lock and continue. In the case
// of out-of-memory we receive a sentinel value and return a failure.
- if (slab && reserved < Slab::available_chunks(chunk_size) &&
- slab->get_chunk_size() == chunk_size) {
+ uint64_t locked_mask = gpu::ballot(
+ slab_mask, slab && reserved < Slab::available_chunks(chunk_size) &&
+ slab->get_chunk_size() == chunk_size);
+ uint64_t failed_mask = gpu::ballot(
+ slab_mask, slab && (reserved >= Slab::available_chunks(chunk_size) ||
+ slab->get_chunk_size() != chunk_size));
+ if (locked_mask & impl::id_in_mask()) {
if (index != start)
indices[chunk_id].store(index, cpp::MemoryOrder::RELAXED);
- uniform = uniform & gpu::get_lane_mask();
- return slab;
- } else if (slab && (reserved >= Slab::available_chunks(chunk_size) ||
- slab->get_chunk_size() != chunk_size)) {
- slots[index].unlock(gpu::get_lane_mask(),
- gpu::get_lane_mask() & uniform);
+ uniform = uniform & locked_mask;
+ result = slab;
+ } else if (failed_mask & impl::id_in_mask()) {
+ slots[index].unlock(failed_mask, failed_mask & uniform);
} else if (!slab && impl::is_sentinel(reserved)) {
- uniform = uniform & gpu::get_lane_mask();
- return nullptr;
+ result =
+ reinterpret_cast<Slab *>(cpp::numeric_limits<uintptr_t>::max());
} else {
sleep_briefly();
}
}
}
- return nullptr;
+ return !impl::is_sentinel(result) ? result : nullptr;
}
// Release the lock associated with a given slab.
-static void release_slab(Slab *slab) {
+static void release_slab(uint64_t lane_mask, Slab *slab) {
uint32_t index = slab->get_global_index();
- uint64_t lane_mask = gpu::get_lane_mask();
uint64_t uniform = gpu::match_any(lane_mask, index);
slots[index].unlock(lane_mask, uniform);
}
@@ -615,9 +620,10 @@ void *allocate(uint64_t size) {
// Try to find a slab for the rounded up chunk size and allocate from it.
uint32_t chunk_size = impl::get_chunk_size(static_cast<uint32_t>(size));
- uint64_t uniform = gpu::match_any(gpu::get_lane_mask(), chunk_size);
+ uint64_t lane_mask = gpu::get_lane_mask();
+ uint64_t uniform = gpu::match_any(lane_mask, chunk_size);
uint32_t reserved = 0;
- Slab *slab = find_slab(chunk_size, uniform, reserved);
+ Slab *slab = find_slab(chunk_size, lane_mask, uniform, reserved);
if (!slab)
return nullptr;
@@ -634,10 +640,11 @@ void deallocate(void *ptr) {
return impl::rpc_free(ptr);
// The original slab pointer is the 2MiB boundary using the given pointer.
+ uint64_t lane_mask = gpu::get_lane_mask();
Slab *slab = cpp::launder(reinterpret_cast<Slab *>(
(reinterpret_cast<uintptr_t>(ptr) & ~SLAB_ALIGNMENT)));
slab->deallocate(ptr);
- release_slab(slab);
+ release_slab(lane_mask, slab);
}
void *reallocate(void *ptr, uint64_t size) {
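The find_slab() rewrite above applies the same idea to loop exits: rather than returning from inside divergent control flow while other lanes in the original mask are still searching, every lane now carries a per-lane result and the loop condition ballots over the captured lane_mask. A simplified, illustrative shape of that loop (not the actual allocator code; attempt is a made-up stand-in for the slots[index].try_lock() call) could look like:

// Illustrative sketch only; simplified shape of the converged retry loop.
#include "src/__support/GPU/utils.h"

#include <stdint.h>

template <typename F>
void *converged_search(uint64_t lane_mask, uint32_t max_iters, F attempt) {
  void *result = nullptr;
  // Every lane keeps iterating until the ballot over the *captured* mask
  // shows that no participating lane still needs a result, or we give up.
  for (uint32_t i = 0; gpu::ballot(lane_mask, !result) && i < max_iters; ++i) {
    // Which lanes still need a result this iteration?
    uint64_t active = gpu::ballot(lane_mask, !result);
    if (active & (1ull << gpu::get_lane_id()))
      // All lanes inside this branch agree on `active`, so it can be handed
      // to cooperative helpers the way find_slab() hands out slab_mask.
      result = attempt(active);
  }
  return result;
}

The real find_slab() additionally stores an out-of-memory sentinel into result so that failing lanes also stop participating, which is why it returns nullptr when impl::is_sentinel(result) is true.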