[libc-commits] [libc] [libc] Coalesce bitfield access in GPU malloc (PR #142692)
via libc-commits
libc-commits at lists.llvm.org
Tue Jun 3 16:30:39 PDT 2025
llvmbot wrote:
@llvm/pr-subscribers-libc
Author: Joseph Huber (jhuber6)
Summary:
This improves performance by reducing the number of RMW operations we
need to issue against a single slot. It speeds up repeated allocations
under low contention by about ten percent.
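
To make the intent concrete, here is a minimal, hypothetical host-side sketch (plain C++ atomics, not the allocator's real types or GPU intrinsics) of why coalescing helps: n requesters each setting one bit of the same word via individual `fetch_or` calls issue n RMW operations, while ORing a combined mask issues one, and the single returned prior value still tells every requester whether its bit was free:

```cuda
#include <atomic>
#include <cstdint>

// One fetch_or per requested bit: n RMW operations on the same word.
inline void claim_each(std::atomic<uint32_t> &word, const uint32_t *bits,
                       int n, uint32_t *before) {
  for (int i = 0; i < n; ++i)
    before[i] = word.fetch_or(1u << bits[i], std::memory_order_relaxed);
}

// Coalesced: one fetch_or claims every requested bit at once. The single
// prior value is enough for each requester to test whether its own bit
// was previously clear, i.e. whether its claim succeeded.
inline uint32_t claim_coalesced(std::atomic<uint32_t> &word,
                                const uint32_t *bits, int n) {
  uint32_t mask = 0;
  for (int i = 0; i < n; ++i)
    mask |= 1u << bits[i];
  return word.fetch_or(mask, std::memory_order_relaxed);
}
```

On the GPU the combining happens across the lanes of a warp or wavefront rather than in a loop, which is what the diff below implements with `gpu::match_any` and a single elected lane.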
---
Full diff: https://github.com/llvm/llvm-project/pull/142692.diff
1 file affected:
- (modified) libc/src/__support/GPU/allocator.cpp (+28-12)
``````````diff
diff --git a/libc/src/__support/GPU/allocator.cpp b/libc/src/__support/GPU/allocator.cpp
index ca68cbcedd48a..59f4b47a3a890 100644
--- a/libc/src/__support/GPU/allocator.cpp
+++ b/libc/src/__support/GPU/allocator.cpp
@@ -129,6 +129,11 @@ static inline constexpr T round_up(const T x) {
return (x + N) & ~(N - 1);
}
+// Branch-free minimum of two integers.
+static inline constexpr uint32_t min(const uint32_t &x, const uint32_t &y) {
+ return y ^ ((x ^ y) & -(x < y));
+}
+
} // namespace impl
/// A slab allocator used to hand out identically sized slabs of memory.
@@ -229,24 +234,35 @@ struct Slab {
// The uniform mask represents which lanes contain a uniform target pointer.
// We attempt to place these next to each other.
- // TODO: We should coalesce these bits and use the result of `fetch_or` to
- // search for free bits in parallel.
void *result = nullptr;
for (uint64_t mask = lane_mask; mask;
mask = gpu::ballot(lane_mask, !result)) {
- uint32_t id = impl::lane_count(uniform & mask);
- uint32_t index =
- (gpu::broadcast_value(lane_mask, impl::xorshift32(state)) + id) %
- usable_bits(chunk_size);
+ if (result)
+ continue;
+
+ uint32_t start = gpu::broadcast_value(lane_mask, impl::xorshift32(state));
+ uint32_t id = impl::lane_count(uniform & mask);
+ uint32_t index = (start + id) % usable_bits(chunk_size);
uint32_t slot = index / BITS_IN_WORD;
uint32_t bit = index % BITS_IN_WORD;
- if (!result) {
- uint32_t before = cpp::AtomicRef(get_bitfield()[slot])
- .fetch_or(1u << bit, cpp::MemoryOrder::RELAXED);
- if (~before & (1 << bit))
- result = ptr_from_index(index, chunk_size);
- }
+
+ // Get the mask of bits destined for the same slot and coalesce it.
+ uint64_t match = uniform & gpu::match_any(mask, slot);
+ uint32_t bitmask =
+ static_cast<uint32_t>(
+ (1ull << impl::min(cpp::popcount(match), BITS_IN_WORD)) - 1)
+ << bit;
+
+ uint32_t before = 0;
+ if (gpu::get_lane_id() == static_cast<uint32_t>(cpp::countr_zero(match)))
+ before = cpp::AtomicRef(get_bitfield()[slot])
+ .fetch_or(bitmask, cpp::MemoryOrder::RELAXED);
+ before = gpu::shuffle(mask, cpp::countr_zero(match), before);
+ if (~before & (1 << bit))
+ result = ptr_from_index(index, chunk_size);
+ else
+ sleep_briefly();
}
cpp::atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
``````````
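
The `impl::min` helper added at the top of the patch is the classic branch-free minimum: `-(x < y)` is all ones when `x < y` and zero otherwise, so the masked XOR either rewrites `y` into `x` or vanishes. The clamp matters because on a 64-lane wavefront `popcount(match)` can exceed 32, and limiting the shift amount to `BITS_IN_WORD` keeps `1ull << n` well defined while still producing the full-word mask. A quick compile-time check of the identity (`branchless_min` is a local name for illustration):

```cuda
#include <cstdint>

// -(x < y) is 0xFFFFFFFF when x < y and 0 otherwise, so the masked XOR
// either turns y into x or leaves y unchanged.
constexpr uint32_t branchless_min(uint32_t x, uint32_t y) {
  return y ^ ((x ^ y) & -static_cast<uint32_t>(x < y));
}

static_assert(branchless_min(3, 7) == 3, "x < y selects x");
static_assert(branchless_min(7, 3) == 3, "x >= y keeps y");
static_assert(branchless_min(5, 5) == 5, "equal values are unchanged");
```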
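
The core of the change is a leader-election pattern: lanes whose target bits fall in the same 32-bit word form a match group, the lowest lane in the group issues a single `fetch_or` with the group's combined mask, and the prior word value is broadcast back so every lane can test its own bit. For readers more familiar with raw CUDA intrinsics, here is a rough, hypothetical analogue (it carries over the patch's assumption that a group's lanes are placed at consecutive bits starting at the leader's bit, and uses `threadIdx.x % 32` as the lane id for a 1-D block; it is a sketch, not the PR's code):

```cuda
#include <cstdint>

__device__ void *try_claim(uint32_t *bitfield, uint32_t slot, uint32_t bit,
                           void *candidate) {
  uint32_t active = __activemask();
  // Group the active lanes that target the same word of the bitfield.
  uint32_t match = __match_any_sync(active, slot);
  uint32_t leader = __ffs(match) - 1; // lowest lane in the group
  uint32_t before = 0;
  if (threadIdx.x % 32 == leader) {
    // The group's bits form a contiguous run starting at the leader's
    // bit, so one OR of the run claims them all in a single RMW.
    uint32_t run = static_cast<uint32_t>((1ull << __popc(match)) - 1) << bit;
    before = atomicOr(&bitfield[slot], run);
  }
  // Broadcast the prior word value so each lane can check its own bit.
  before = __shfl_sync(active, before, leader);
  return (~before & (1u << bit)) ? candidate : nullptr;
}
```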
https://github.com/llvm/llvm-project/pull/142692