[libc-commits] [libc] [libc] Implement efficient 'malloc' on the GPU (PR #140156)

Joseph Huber via libc-commits libc-commits at lists.llvm.org
Fri May 16 18:47:44 PDT 2025


https://github.com/jhuber6 updated https://github.com/llvm/llvm-project/pull/140156

From 2cfe7360979e68b1b7cd595745182aa7f534818d Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Thu, 15 May 2025 17:46:47 -0500
Subject: [PATCH 01/10] [libc] Implement efficient 'malloc' on the GPU

Summary:
This is the big patch that implements an efficient device-side `malloc`
on the GPU. This is a first pass; many improvements will be made later.

The scheme revolves around using a global reference counted pointer to
hand out access to a dynamically created and destroyed slab interface.
The slab is simply a large bitfield with one bit for each chunk. All
allocations within a slab are the same size, so differently sized
allocations are served by different slabs.

Allocation thus consists of searching for or creating a slab for the
desired chunk size, reserving space, and then searching for a free bit.
Freeing clears the bit and then releases the space.

This interface allows memory to dynamically grow and shrink. Future
patches will have different modes to allow fast first-time-use as well
as a non-RPC version.
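
For intuition only, here is a stripped-down, host-side sketch of the
bitfield bookkeeping. The `ToySlab` type and its names are hypothetical
and not part of this patch; the real implementation below adds a random
walk over the bitfield, lane coalescing, GPU-scoped atomics, and the
reference-counted slab table.

#include <atomic>
#include <cstdint>
#include <cstdio>

// Toy, single-word model of one slab: a 64-bit mask with one bit per
// fixed-size chunk. The real allocator uses a much larger bitfield and a
// randomized starting index to reduce contention between threads.
struct ToySlab {
  static constexpr uint32_t CHUNKS = 64;     // chunks tracked by one word
  static constexpr uint32_t CHUNK_SIZE = 16; // bytes per chunk
  std::atomic<uint64_t> bits{0};             // 1 = chunk in use
  uint8_t memory[CHUNKS * CHUNK_SIZE] = {};

  void *allocate() {
    for (uint32_t i = 0; i < CHUNKS; ++i) {
      uint64_t before = bits.fetch_or(1ull << i, std::memory_order_relaxed);
      if (!(before & (1ull << i))) // this thread flipped the bit first
        return &memory[i * CHUNK_SIZE];
    }
    return nullptr; // slab is full
  }

  void deallocate(void *ptr) {
    uint64_t i = (static_cast<uint8_t *>(ptr) - memory) / CHUNK_SIZE;
    bits.fetch_and(~(1ull << i), std::memory_order_relaxed);
  }
};

int main() {
  ToySlab slab;
  void *p = slab.allocate();
  std::printf("chunk offset: %td\n", static_cast<uint8_t *>(p) - slab.memory);
  slab.deallocate(p);
  return 0;
}

Freeing only clears a bit, so the slab itself can be reclaimed once the
reference count on its table slot drops to zero.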
---
 libc/src/__support/GPU/CMakeLists.txt         |   3 +
 libc/src/__support/GPU/allocator.cpp          | 437 +++++++++++++++++-
 .../integration/src/stdlib/CMakeLists.txt     |   3 +
 .../integration/src/stdlib/gpu/CMakeLists.txt |  19 +
 .../integration/src/stdlib/gpu/malloc.cpp     |  31 ++
 5 files changed, 486 insertions(+), 7 deletions(-)
 create mode 100644 libc/test/integration/src/stdlib/gpu/CMakeLists.txt
 create mode 100644 libc/test/integration/src/stdlib/gpu/malloc.cpp

diff --git a/libc/src/__support/GPU/CMakeLists.txt b/libc/src/__support/GPU/CMakeLists.txt
index 9b359f65cdb33..4ffee011be961 100644
--- a/libc/src/__support/GPU/CMakeLists.txt
+++ b/libc/src/__support/GPU/CMakeLists.txt
@@ -18,5 +18,8 @@ add_object_library(
   DEPENDS
     libc.src.__support.common
     libc.src.__support.RPC.rpc_client
+    libc.src.__support.CPP.atomic
+    libc.src.__support.CPP.bit
+    libc.src.__support.CPP.new
     .utils
 )
diff --git a/libc/src/__support/GPU/allocator.cpp b/libc/src/__support/GPU/allocator.cpp
index ac335a1b9aab0..d26422ff79fc8 100644
--- a/libc/src/__support/GPU/allocator.cpp
+++ b/libc/src/__support/GPU/allocator.cpp
@@ -5,17 +5,37 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
+//
+// This file implements a parallel allocator intended for use on a GPU device.
+// The core algorithm is a slab allocator using a random walk over a bitfield for
+// maximum parallel progress. Slab handling is done by a wait-free reference
+// counted guard. The first use of a slab will create it from system memory for
+// re-use. The last use will invalidate it and free the memory.
+//
+//===----------------------------------------------------------------------===//
 
 #include "allocator.h"
 
+#include "src/__support/CPP/atomic.h"
+#include "src/__support/CPP/bit.h"
+#include "src/__support/CPP/new.h"
 #include "src/__support/GPU/utils.h"
 #include "src/__support/RPC/rpc_client.h"
-#include "src/__support/macros/config.h"
+#include "src/__support/threads/sleep.h"
 
 namespace LIBC_NAMESPACE_DECL {
-namespace {
 
-void *rpc_allocate(uint64_t size) {
+constexpr static uint64_t MAX_SIZE = /* 64 GiB */ 64ull * 1024 * 1024 * 1024;
+constexpr static uint64_t SLAB_SIZE = /* 2 MiB */ 2ull * 1024 * 1024;
+constexpr static uint64_t ARRAY_SIZE = MAX_SIZE / SLAB_SIZE;
+constexpr static uint32_t BITS_IN_WORD = sizeof(uint32_t) * 8;
+
+static_assert(!(ARRAY_SIZE & (ARRAY_SIZE - 1)), "Must be a power of two");
+
+namespace impl {
+// Allocates more memory from the system through the RPC interface. All
+// allocations from the system MUST be aligned on a 2MiB boundary.
+static void *rpc_allocate(uint64_t size) {
   void *ptr = nullptr;
   rpc::Client::Port port = rpc::client.open<LIBC_MALLOC>();
   port.send_and_recv(
@@ -27,7 +47,8 @@ void *rpc_allocate(uint64_t size) {
   return ptr;
 }
 
-void rpc_free(void *ptr) {
+// Deallocates the associated system memory.
+static void rpc_free(void *ptr) {
   rpc::Client::Port port = rpc::client.open<LIBC_FREE>();
   port.send([=](rpc::Buffer *buffer, uint32_t) {
     buffer->data[0] = reinterpret_cast<uintptr_t>(ptr);
@@ -35,13 +56,415 @@ void rpc_free(void *ptr) {
   port.close();
 }
 
-} // namespace
+// Convert a potentially disjoint bitmask into an increasing integer for use
+// with indexing.
+static inline uint32_t lane_count(uint64_t lane_mask) {
+  return cpp::popcount(lane_mask & ((1ull << gpu::get_lane_id()) - 1));
+}
+
+// Obtain an initial value to seed a random number generator.
+static inline uint32_t entropy() {
+  return (static_cast<uint32_t>(gpu::processor_clock()) ^
+          (gpu::get_thread_id_x() * 0x632be59b) ^
+          (gpu::get_block_id_x() * 0x85157af5)) *
+         0x9e3779bb;
+}
+
+// Generate a random number and update the state using the xorshift32 PRNG.
+static inline uint32_t xorshift32(uint32_t &state) {
+  state ^= state << 13;
+  state ^= state >> 17;
+  state ^= state << 5;
+  return state * 0x9e3779bb;
+}
+
+// Final stage of murmurhash used to get a unique index for the global array
+static inline uint32_t hash(uint32_t x) {
+  x ^= x >> 16;
+  x *= 0x85ebca6b;
+  x ^= x >> 13;
+  x *= 0xc2b2ae35;
+  x ^= x >> 16;
+  return x;
+}
+
+// Rounds the input value to the closest permitted chunk size. Here we accept
+// the sum of the closest three powers of two. For a 2MiB slab size this is 48
+// different chunk sizes.
+static inline uint32_t get_chunk_size(uint32_t x) {
+  uint32_t y = x < 16 ? 16 : x;
+  uint32_t pow2 = BITS_IN_WORD - cpp::countl_zero(y - 1);
+
+  uint32_t s0 = 0b0100 << (pow2 - 3);
+  uint32_t s1 = 0b0110 << (pow2 - 3);
+  uint32_t s2 = 0b0111 << (pow2 - 3);
+  uint32_t s3 = 0b1000 << (pow2 - 3);
+
+  if (s0 > y)
+    return (s0 + 15) & ~15;
+  else if (s1 > y)
+    return (s1 + 15) & ~15;
+  else if (s2 > y)
+    return (s2 + 15) & ~15;
+  return (s3 + 15) & ~15;
+}
+
+} // namespace impl
+
+/// A slab allocator used to hand out identically sized chunks of memory.
+/// Allocation is done through random walks of a bitfield until a free bit is
+/// encountered. This reduces contention and is highly parallel on a GPU.
+struct Slab {
+
+  // Initialize the slab with its chunk size and index in the global table for
+  // use when freeing.
+  Slab(uint32_t chunk_size, uint32_t global_index) {
+    get_chunk_size() = chunk_size;
+    get_global_index() = global_index;
+
+    // This memset is expensive and likely not necessary for the current 'kfd'
+    // driver. Until zeroed pages are exposed by the API we must be careful.
+    __builtin_memset(get_bitfield(), 0, bitfield_bytes(chunk_size));
+  }
+
+  // Get the number of chunks that can theoretically fit inside this array.
+  static uint32_t num_chunks(uint32_t chunk_size) {
+    return SLAB_SIZE / chunk_size;
+  }
+
+  // Get the number of bytes needed to contain the bitfield bits.
+  static uint32_t bitfield_bytes(uint32_t chunk_size) {
+    return ((num_chunks(chunk_size) + BITS_IN_WORD - 1) / BITS_IN_WORD) *
+           sizeof(uint32_t);
+  }
+
+  // The actual amount of memory available excluding the bitfield and metadata.
+  static uint32_t available_bytes(uint32_t chunk_size) {
+    return SLAB_SIZE - 2 * bitfield_bytes(chunk_size) - 4 * sizeof(uint32_t);
+  }
+
+  // The number of chunks that can be stored in this slab.
+  static uint32_t available_chunks(uint32_t chunk_size) {
+    return available_bytes(chunk_size) / chunk_size;
+  }
+
+  // The length in bits of the bitfield.
+  static uint32_t usable_bits(uint32_t chunk_size) {
+    return ((available_bytes(chunk_size) + chunk_size - 1) / chunk_size);
+  }
+
+  // Get the location in the memory where we will store the chunk size.
+  uint32_t &get_chunk_size() { return *reinterpret_cast<uint32_t *>(memory); }
+
+  // Get the location in the memory where we will store the global index.
+  uint32_t &get_global_index() {
+    return *reinterpret_cast<uint32_t *>(memory + sizeof(uint32_t));
+  }
+
+  // Get a pointer to where the bitfield is located in the memory.
+  uint32_t *get_bitfield() {
+    return reinterpret_cast<uint32_t *>(memory + 4 * sizeof(uint32_t));
+  }
+
+  // Get a pointer to where the actual memory to be allocated lives.
+  uint8_t *get_memory(uint32_t chunk_size) {
+    return reinterpret_cast<uint8_t *>(memory) + bitfield_bytes(chunk_size) +
+           4 * sizeof(uint32_t);
+  }
+
+  // Get a pointer to the actual memory given an index into the bitfield.
+  void *ptr_from_index(uint32_t index, uint32_t chunk_size) {
+    return get_memory(chunk_size) + index * chunk_size;
+  }
+
+  // Convert a pointer back into its bitfield index using its offset.
+  uint32_t index_from_ptr(void *ptr, uint32_t chunk_size) {
+    return static_cast<uint32_t>(reinterpret_cast<uint8_t *>(ptr) -
+                                 get_memory(chunk_size)) /
+           chunk_size;
+  }
+
+  // Randomly walks the bitfield until it finds a free bit in the bitfield.
+  // Allocations attempt to put lanes right next to eachother for better
+  // caching and convergence.
+  void *allocate(uint64_t lane_mask, uint64_t uniform) {
+    uint32_t chunk_size = get_chunk_size();
+    uint32_t *bitfield = get_bitfield();
+    uint32_t state = impl::entropy();
+    void *result = nullptr;
+    // The uniform mask represents which lanes contain a uniform target pointer.
+    // We attempt to place these next to eachother in the bitfield.
+    // TODO: We should coalesce these bits and use the result of `fetch_or` to
+    //       search for free bits in parallel.
+    for (uint64_t mask = ~0ull; mask; mask = gpu::ballot(lane_mask, !result)) {
+      uint32_t id = impl::lane_count(uniform & mask);
+      uint32_t index =
+          (gpu::broadcast_value(lane_mask, impl::xorshift32(state)) + id) %
+          usable_bits(chunk_size);
+
+      uint32_t slot = index / BITS_IN_WORD;
+      uint32_t bit = index % BITS_IN_WORD;
+      if (mask & (1ull << gpu::get_lane_id())) {
+        uint32_t before = __scoped_atomic_fetch_or(
+            &bitfield[slot], 1 << bit, __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE);
+        if (~before & (1 << bit)) {
+          result = ptr_from_index(index, chunk_size);
+        }
+      }
+    }
+
+    cpp::atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
+    return result;
+  }
+
+  // Deallocates memory by resetting its corresponding bit in the bitfield.
+  void deallocate(void *ptr) {
+    uint32_t chunk_size = get_chunk_size();
+    uint32_t index = index_from_ptr(ptr, chunk_size);
+    uint32_t slot = index / BITS_IN_WORD;
+    uint32_t bit = index % BITS_IN_WORD;
+    uint32_t bitmask = 1 << bit;
+
+    uint32_t *bitfield = get_bitfield();
+    cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
+    __scoped_atomic_fetch_and(&bitfield[slot], ~bitmask, __ATOMIC_RELAXED,
+                              __MEMORY_SCOPE_DEVICE);
+  }
+
+  // The actual memory the slab will manage. All offsets are calculated at
+  // runtime with the chunk size to keep the interface convergent when a warp or
+  // wavefront is handling multiple sizes at once.
+  uint8_t memory[SLAB_SIZE];
+};
+
+/// A wait-free guard around a pointer resource to be created dynamically if
+/// space is available and freed once there are no more users.
+template <typename T> struct GuardPtr {
+private:
+  struct RefCounter {
+    // Indicates that the object is in its deallocation phase and thus invalid.
+    static constexpr uint64_t invalid = 1ull << 63;
+
+    // If a read preempts an unlock call we indicate this so the following
+    // unlock call can swap out the helped bit and maintain exclusive ownership.
+    static constexpr uint64_t helped = 1ull << 62;
+
+    // Resets the reference counter, cannot be reset to zero safely.
+    void reset(uint32_t n, uint64_t &count) {
+      counter.store(n, cpp::MemoryOrder::RELAXED);
+      count = n;
+    }
+
+    // Acquire a slot in the reference counter if it is not invalid.
+    bool acquire(uint32_t n, uint64_t &count) {
+      count = counter.fetch_add(n, cpp::MemoryOrder::RELAXED) + n;
+      return (count & invalid) == 0;
+    }
+
+    // Release a slot in the reference counter. This function should only be
+    // called following a valid acquire call.
+    bool release(uint32_t n) {
+      // If this thread caused the counter to reach zero we try to invalidate it
+      // and obtain exclusive rights to destroy it. If the CAS failed, either
+      // another thread resurrected the counter and we quit, or a parallel read
+      // helped us invalidate it. For the latter, claim that flag and return.
+      if (counter.fetch_sub(n, cpp::MemoryOrder::RELAXED) == n) {
+        uint64_t expected = 0;
+        if (counter.compare_exchange_strong(expected, invalid,
+                                            cpp::MemoryOrder::RELAXED,
+                                            cpp::MemoryOrder::RELAXED))
+          return true;
+        else if ((expected & helped) &&
+                 (counter.exchange(invalid, cpp::MemoryOrder::RELAXED) &
+                  helped))
+          return true;
+      }
+      return false;
+    }
+
+    // Returns the current reference count, potentially helping a releasing
+    // thread.
+    uint64_t read() {
+      auto val = counter.load(cpp::MemoryOrder::RELAXED);
+      if (val == 0 && counter.compare_exchange_strong(
+                          val, invalid | helped, cpp::MemoryOrder::RELAXED))
+        return 0;
+      return (val & invalid) ? 0 : val;
+    }
+
+    cpp::Atomic<uint64_t> counter{0};
+  };
+
+  cpp::Atomic<T *> ptr{nullptr};
+  RefCounter ref{};
+
+  // A sentinel value used to claim the pointer slot.
+  static constexpr uint64_t sentinel = ~0ULL;
+
+  template <typename... Args>
+  T *try_lock_impl(uint32_t n, uint64_t &count, Args &&...args) {
+    T *expected = ptr.load(cpp::MemoryOrder::RELAXED);
+    if (!expected &&
+        ptr.compare_exchange_strong(expected, reinterpret_cast<T *>(sentinel),
+                                    cpp::MemoryOrder::RELAXED,
+                                    cpp::MemoryOrder::RELAXED)) {
+
+      T *mem = reinterpret_cast<T *>(impl::rpc_allocate(sizeof(T)));
+      if (!mem)
+        return nullptr;
+      new (mem) T(cpp::forward<Args>(args)...);
+
+      cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
+      ptr.store(mem, cpp::MemoryOrder::RELAXED);
+      cpp::atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
+      if (!ref.acquire(n, count))
+        ref.reset(n, count);
+      return mem;
+    }
+
+    if (!expected || expected == reinterpret_cast<T *>(sentinel))
+      return nullptr;
+
+    if (!ref.acquire(n, count))
+      return nullptr;
+
+    cpp::atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
+    return ptr.load(cpp::MemoryOrder::RELAXED);
+  }
+
+public:
+  // Attempt to lock access to the pointer, potentially creating it if empty.
+  // The uniform mask represents which lanes share the same pointer. For each
+  // uniform value we elect a leader to handle it on behalf of the other lanes.
+  template <typename... Args>
+  T *try_lock(uint64_t lane_mask, uint64_t unifrom, uint64_t &count,
+              Args &&...args) {
+    count = 0;
+    T *result = nullptr;
+    if (gpu::get_lane_id() == uint32_t(cpp::countr_zero(unifrom)))
+      result = try_lock_impl(cpp::popcount(unifrom), count,
+                             cpp::forward<Args>(args)...);
+    result = gpu::shuffle(lane_mask, cpp::countr_zero(unifrom), result);
+
+    if (!result)
+      return nullptr;
+
+    // Obtain the value of the reference counter for each lane given the
+    // aggregate value.
+    count = gpu::shuffle(lane_mask, cpp::countr_zero(unifrom), count) -
+            cpp::popcount(unifrom) + impl::lane_count(unifrom) + 1;
+    return result;
+  }
+
+  // Release the associated lock on the pointer, potentially destroying it.
+  void unlock(uint64_t lane_mask, uint64_t mask) {
+    cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
+    if (gpu::get_lane_id() == uint32_t(cpp::countr_zero(mask)) &&
+        ref.release(cpp::popcount(mask))) {
+      T *p = ptr.load(cpp::MemoryOrder::RELAXED);
+      p->~T();
+      impl::rpc_free(p);
+      cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
+      ptr.store(nullptr, cpp::MemoryOrder::RELAXED);
+    }
+    gpu::sync_lane(lane_mask);
+  }
+
+  // Get the current value of the reference counter.
+  uint64_t use_count() { return ref.read(); }
+};
+
+// The global array used to search for a valid slab to allocate from.
+static GuardPtr<Slab> slots[ARRAY_SIZE] = {};
+
+// Tries to find a slab in the table that can support the given chunk size.
+static Slab *find_slab(uint32_t chunk_size) {
+  // We start at a hashed value to spread out different chunk sizes.
+  uint32_t start = impl::hash(chunk_size);
+  for (uint32_t offset = 0; offset < ARRAY_SIZE;) {
+    uint32_t index = (offset + start) % ARRAY_SIZE;
+
+    // If this slot is too full we move on to the next slot.
+    if (slots[index].use_count() >= Slab::available_chunks(chunk_size)) {
+      offset++;
+      sleep_briefly();
+      continue;
+    }
+
+    uint64_t lane_mask = gpu::get_lane_mask();
+    uint64_t uniform = gpu::match_any(lane_mask, index);
+    uint64_t reserved = 0;
+    Slab *slab =
+        slots[index].try_lock(lane_mask, uniform, reserved, chunk_size, index);
+    gpu::sync_lane(lane_mask);
+
+    // We successfully obtained a slab with enough space for our allocation.
+    // This guarantees that a call to Slab::allocate will always succeed.
+    if (slab && reserved <= Slab::available_chunks(chunk_size) &&
+        slab->get_chunk_size() == chunk_size)
+      return slab;
+
+    // We encountered either a full slab or a slab with an incompatible chunk
+    // size. Move to the next slot.
+    if (slab && reserved > Slab::available_chunks(chunk_size) &&
+        slab->get_chunk_size() == chunk_size) {
+      slots[index].unlock(gpu::get_lane_mask(), gpu::get_lane_mask() & uniform);
+      offset++;
+    }
+
+    // The slab is in the process of being initialized. Start at the beginning
+    // to prevent too many slab allocations from happening at once.
+    if (!slab && reserved == 0)
+      offset = 0;
+    sleep_briefly();
+  }
+  return nullptr;
+}
+
+// Release the lock associated with a given slab.
+static void release_slab(Slab *slab) {
+  uint32_t index = slab->get_global_index();
+  uint64_t lane_mask = gpu::get_lane_mask();
+  uint64_t uniform = gpu::match_any(lane_mask, index);
+  slots[index].unlock(lane_mask, uniform);
+}
 
 namespace gpu {
 
-void *allocate(uint64_t size) { return rpc_allocate(size); }
+void *allocate(uint64_t size) {
+  if (!size)
+    return nullptr;
+
+  // Allocations larger than a single slab go directly to memory.
+  if (size >= SLAB_SIZE / 2)
+    return impl::rpc_allocate(size);
+
+  // Try to find a slab for the rounded up chunk size and allocate from it.
+  uint32_t chunk_size = impl::get_chunk_size(static_cast<uint32_t>(size));
+  Slab *slab = find_slab(chunk_size);
+  if (!slab)
+    return nullptr;
 
-void deallocate(void *ptr) { rpc_free(ptr); }
+  uint64_t lane_mask = gpu::get_lane_mask();
+  uint64_t uniform = gpu::match_any(lane_mask, slab->get_global_index());
+  void *ptr = slab->allocate(lane_mask, uniform);
+  return ptr;
+}
+
+void deallocate(void *ptr) {
+  if (!ptr)
+    return;
+
+  // All non-slab allocations will be aligned on a 2MiB boundary.
+  if ((reinterpret_cast<uintptr_t>(ptr) & 0x1fffff) == 0)
+    return impl::rpc_free(ptr);
+
+  // The original slab pointer is the 2MiB boundary below the given pointer.
+  Slab *slab =
+      reinterpret_cast<Slab *>((reinterpret_cast<uintptr_t>(ptr) & ~0x1fffff));
+  slab->deallocate(ptr);
+  release_slab(slab);
+}
 
 } // namespace gpu
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/test/integration/src/stdlib/CMakeLists.txt b/libc/test/integration/src/stdlib/CMakeLists.txt
index 1efdf607defe9..1773d9fc9f0f5 100644
--- a/libc/test/integration/src/stdlib/CMakeLists.txt
+++ b/libc/test/integration/src/stdlib/CMakeLists.txt
@@ -1,3 +1,6 @@
+if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_OS})
+  add_subdirectory(${LIBC_TARGET_OS})
+endif()
 add_custom_target(stdlib-integration-tests)
 add_dependencies(libc-integration-tests stdlib-integration-tests)
 
diff --git a/libc/test/integration/src/stdlib/gpu/CMakeLists.txt b/libc/test/integration/src/stdlib/gpu/CMakeLists.txt
new file mode 100644
index 0000000000000..d0958e8271434
--- /dev/null
+++ b/libc/test/integration/src/stdlib/gpu/CMakeLists.txt
@@ -0,0 +1,19 @@
+add_custom_target(stdlib-gpu-integration-tests)
+add_dependencies(libc-integration-tests stdlib-gpu-integration-tests)
+
+# TODO: Test on NVPTX, requires CUDA VMEM API.
+if(NOT LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
+  add_integration_test(
+    malloc
+    SUITE
+      stdlib-gpu-integration-tests
+    SRCS
+      malloc.cpp
+    DEPENDS
+      libc.src.stdlib.malloc
+      libc.src.stdlib.free
+    LOADER_ARGS
+      --threads 256
+      --blocks 1024
+  )
+endif()
diff --git a/libc/test/integration/src/stdlib/gpu/malloc.cpp b/libc/test/integration/src/stdlib/gpu/malloc.cpp
new file mode 100644
index 0000000000000..a506a9b83ed2e
--- /dev/null
+++ b/libc/test/integration/src/stdlib/gpu/malloc.cpp
@@ -0,0 +1,31 @@
+//===-- Test for parallel GPU malloc interface ----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "test/IntegrationTest/test.h"
+
+#include "src/__support/GPU/utils.h"
+#include "src/stdlib/free.h"
+#include "src/stdlib/malloc.h"
+
+using namespace LIBC_NAMESPACE;
+
+TEST_MAIN(int, char **, char **) {
+  int *convergent = reinterpret_cast<int *>(LIBC_NAMESPACE::malloc(16));
+  EXPECT_NE(convergent, nullptr);
+  *convergent = 1;
+  EXPECT_EQ(*convergent, 1);
+  LIBC_NAMESPACE::free(convergent);
+
+  int *divergent = reinterpret_cast<int *>(
+      LIBC_NAMESPACE::malloc((gpu::get_thread_id() + 1) * 16));
+  EXPECT_NE(divergent, nullptr);
+  *divergent = 1;
+  EXPECT_EQ(*divergent, 1);
+  LIBC_NAMESPACE::free(divergent);
+  return 0;
+}

From aade759da79427478763ac347ed98fd058207dae Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Thu, 15 May 2025 20:06:49 -0500
Subject: [PATCH 02/10] Address comments

---
 libc/src/__support/GPU/allocator.cpp | 40 +++++++++++++++++-----------
 1 file changed, 25 insertions(+), 15 deletions(-)

diff --git a/libc/src/__support/GPU/allocator.cpp b/libc/src/__support/GPU/allocator.cpp
index d26422ff79fc8..f937b4dd93655 100644
--- a/libc/src/__support/GPU/allocator.cpp
+++ b/libc/src/__support/GPU/allocator.cpp
@@ -57,12 +57,13 @@ static void rpc_free(void *ptr) {
 }
 
 // Convert a potentially disjoint bitmask into an increasing integer for use
-// with indexing.
+// with indexing between gpu lanes.
 static inline uint32_t lane_count(uint64_t lane_mask) {
   return cpp::popcount(lane_mask & ((1ull << gpu::get_lane_id()) - 1));
 }
 
-// Obtain an initial value to seed a random number generator.
+// Obtain an initial value to seed a random number generator. We use the rounded
+// multiples of the golden ratio from xorshift* as additional spreading.
 static inline uint32_t entropy() {
   return (static_cast<uint32_t>(gpu::processor_clock()) ^
           (gpu::get_thread_id_x() * 0x632be59b) ^
@@ -70,7 +71,7 @@ static inline uint32_t entropy() {
          0x9e3779bb;
 }
 
-// Generate a random number and update the state using the xorshift32 PRNG.
+// Generate a random number and update the state using the xorshift*32 PRNG.
 static inline uint32_t xorshift32(uint32_t &state) {
   state ^= state << 13;
   state ^= state >> 17;
@@ -114,13 +115,23 @@ static inline uint32_t get_chunk_size(uint32_t x) {
 /// A slab allocator used to hand out identically sized chunks of memory.
 /// Allocation is done through random walks of a bitfield until a free bit is
 /// encountered. This reduces contention and is highly parallel on a GPU.
+///
+/// 0       4           8       16                 ...                     2 MiB
+/// ┌────────┬──────────┬────────┬──────────────────┬──────────────────────────┐
+/// │ chunk  │  index   │  pad   │    bitfield[]    │         memory[]         │
+/// └────────┴──────────┴────────┴──────────────────┴──────────────────────────┘
+///
+/// The size of the bitfield is the slab size divided by the chunk size divided
+/// by the number of bits per word. We pad the interface to ensure 16 byte
+/// alignment and to indicate that if the pointer is not aligned by 2MiB it
+/// belongs to a slab rather than the global allocator.
 struct Slab {
 
   // Initialize the slab with its chunk size and index in the global table for
   // use when freeing.
   Slab(uint32_t chunk_size, uint32_t global_index) {
-    get_chunk_size() = chunk_size;
-    get_global_index() = global_index;
+    *reinterpret_cast<uint32_t *>(&memory[0]) = chunk_size;
+    *reinterpret_cast<uint32_t *>(&memory[sizeof(uint32_t)]) = global_index;
 
     // This memset is expensive and likely not necessary for the current 'kfd'
     // driver. Until zeroed pages are exposed by the API we must be careful.
@@ -154,11 +165,13 @@ struct Slab {
   }
 
   // Get the location in the memory where we will store the chunk size.
-  uint32_t &get_chunk_size() { return *reinterpret_cast<uint32_t *>(memory); }
+  uint32_t get_chunk_size() const {
+    return *reinterpret_cast<const uint32_t *>(memory);
+  }
 
   // Get the location in the memory where we will store the global index.
-  uint32_t &get_global_index() {
-    return *reinterpret_cast<uint32_t *>(memory + sizeof(uint32_t));
+  uint32_t get_global_index() const {
+    return *reinterpret_cast<const uint32_t *>(memory + sizeof(uint32_t));
   }
 
   // Get a pointer to where the bitfield is located in the memory.
@@ -189,7 +202,6 @@ struct Slab {
   // caching and convergence.
   void *allocate(uint64_t lane_mask, uint64_t uniform) {
     uint32_t chunk_size = get_chunk_size();
-    uint32_t *bitfield = get_bitfield();
     uint32_t state = impl::entropy();
     void *result = nullptr;
     // The uniform mask represents which lanes contain a uniform target pointer.
@@ -205,8 +217,8 @@ struct Slab {
       uint32_t slot = index / BITS_IN_WORD;
       uint32_t bit = index % BITS_IN_WORD;
       if (mask & (1ull << gpu::get_lane_id())) {
-        uint32_t before = __scoped_atomic_fetch_or(
-            &bitfield[slot], 1 << bit, __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE);
+        uint32_t before = cpp::AtomicRef<uint32_t>(get_bitfield()[slot])
+                              .fetch_or(1u << bit, cpp::MemoryOrder::RELAXED);
         if (~before & (1 << bit)) {
           result = ptr_from_index(index, chunk_size);
         }
@@ -223,12 +235,10 @@ struct Slab {
     uint32_t index = index_from_ptr(ptr, chunk_size);
     uint32_t slot = index / BITS_IN_WORD;
     uint32_t bit = index % BITS_IN_WORD;
-    uint32_t bitmask = 1 << bit;
 
-    uint32_t *bitfield = get_bitfield();
     cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
-    __scoped_atomic_fetch_and(&bitfield[slot], ~bitmask, __ATOMIC_RELAXED,
-                              __MEMORY_SCOPE_DEVICE);
+    cpp::AtomicRef<uint32_t>(get_bitfield()[slot])
+        .fetch_and(~(1u << bit), cpp::MemoryOrder::RELAXED);
   }
 
   // The actual memory the slab will manage. All offsets are calculated at

From e275c3ba17b504ecda086afa031e6e6c82fcfb4c Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Thu, 15 May 2025 22:44:23 -0500
Subject: [PATCH 03/10] Slightly better test coverage

---
 libc/test/integration/src/stdlib/gpu/malloc.cpp |  9 +++++++++
 libc/test/src/stdlib/malloc_test.cpp            | 12 ++++++++++++
 2 files changed, 21 insertions(+)

diff --git a/libc/test/integration/src/stdlib/gpu/malloc.cpp b/libc/test/integration/src/stdlib/gpu/malloc.cpp
index a506a9b83ed2e..ab19d8d8efba9 100644
--- a/libc/test/integration/src/stdlib/gpu/malloc.cpp
+++ b/libc/test/integration/src/stdlib/gpu/malloc.cpp
@@ -27,5 +27,14 @@ TEST_MAIN(int, char **, char **) {
   *divergent = 1;
   EXPECT_EQ(*divergent, 1);
   LIBC_NAMESPACE::free(divergent);
+
+  if (gpu::get_lane_id() % 2) {
+    int *masked = reinterpret_cast<int *>(
+        LIBC_NAMESPACE::malloc((gpu::get_thread_id() + 1) * 16));
+    EXPECT_NE(masked, nullptr);
+    *masked = 1;
+    EXPECT_EQ(*masked, 1);
+    LIBC_NAMESPACE::free(masked);
+  }
   return 0;
 }
diff --git a/libc/test/src/stdlib/malloc_test.cpp b/libc/test/src/stdlib/malloc_test.cpp
index d9023cf56d9fe..a8b32b7a430c9 100644
--- a/libc/test/src/stdlib/malloc_test.cpp
+++ b/libc/test/src/stdlib/malloc_test.cpp
@@ -17,3 +17,15 @@ TEST(LlvmLibcMallocTest, Allocate) {
   EXPECT_EQ(*ptr, 1);
   LIBC_NAMESPACE::free(ptr);
 }
+
+TEST(LlvmLibcMallocTest, Nullptr) {
+  int *ptr = reinterpret_cast<int *>(LIBC_NAMESPACE::malloc(0));
+  EXPECT_EQ(reinterpret_cast<void *>(ptr), static_cast<void *>(nullptr));
+  LIBC_NAMESPACE::free(ptr);
+}
+
+TEST(LlvmLibcMallocTest, LargeAllocation) {
+  int *ptr = reinterpret_cast<int *>(LIBC_NAMESPACE::malloc(2ul * 1024 * 1024));
+  EXPECT_NE(reinterpret_cast<void *>(ptr), static_cast<void *>(nullptr));
+  LIBC_NAMESPACE::free(ptr);
+}

From f048d1378c4052c349f321c491f9895f2fd71a92 Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Fri, 16 May 2025 07:51:51 -0500
Subject: [PATCH 04/10] Matt comments

---
 libc/src/__support/GPU/allocator.cpp            | 4 ++--
 libc/test/integration/src/stdlib/gpu/malloc.cpp | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/libc/src/__support/GPU/allocator.cpp b/libc/src/__support/GPU/allocator.cpp
index f937b4dd93655..b53486036b3ac 100644
--- a/libc/src/__support/GPU/allocator.cpp
+++ b/libc/src/__support/GPU/allocator.cpp
@@ -103,9 +103,9 @@ static inline uint32_t get_chunk_size(uint32_t x) {
 
   if (s0 > y)
     return (s0 + 15) & ~15;
-  else if (s1 > y)
+  if (s1 > y)
     return (s1 + 15) & ~15;
-  else if (s2 > y)
+  if (s2 > y)
     return (s2 + 15) & ~15;
   return (s3 + 15) & ~15;
 }
diff --git a/libc/test/integration/src/stdlib/gpu/malloc.cpp b/libc/test/integration/src/stdlib/gpu/malloc.cpp
index ab19d8d8efba9..7880206b1aaaa 100644
--- a/libc/test/integration/src/stdlib/gpu/malloc.cpp
+++ b/libc/test/integration/src/stdlib/gpu/malloc.cpp
@@ -28,7 +28,7 @@ TEST_MAIN(int, char **, char **) {
   EXPECT_EQ(*divergent, 1);
   LIBC_NAMESPACE::free(divergent);
 
-  if (gpu::get_lane_id() % 2) {
+  if (gpu::get_lane_id() & 1) {
     int *masked = reinterpret_cast<int *>(
         LIBC_NAMESPACE::malloc((gpu::get_thread_id() + 1) * 16));
     EXPECT_NE(masked, nullptr);

From 53e29880e47170a4de8e768c019d26d07f8a37dd Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Fri, 16 May 2025 08:05:05 -0500
Subject: [PATCH 05/10] Fix OOM deadlock

---
 libc/src/__support/GPU/allocator.cpp | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/libc/src/__support/GPU/allocator.cpp b/libc/src/__support/GPU/allocator.cpp
index b53486036b3ac..7e653e470ffe2 100644
--- a/libc/src/__support/GPU/allocator.cpp
+++ b/libc/src/__support/GPU/allocator.cpp
@@ -311,6 +311,7 @@ template <typename T> struct GuardPtr {
   // A sentinel value used to claim the pointer slot.
   static constexpr uint64_t sentinel = ~0ULL;
 
+  // Should be called by a single lane for each different pointer.
   template <typename... Args>
   T *try_lock_impl(uint32_t n, uint64_t &count, Args &&...args) {
     T *expected = ptr.load(cpp::MemoryOrder::RELAXED);
@@ -318,7 +319,7 @@ template <typename T> struct GuardPtr {
         ptr.compare_exchange_strong(expected, reinterpret_cast<T *>(sentinel),
                                     cpp::MemoryOrder::RELAXED,
                                     cpp::MemoryOrder::RELAXED)) {
-
+      count = ~0ull;
       T *mem = reinterpret_cast<T *>(impl::rpc_allocate(sizeof(T)));
       if (!mem)
         return nullptr;
@@ -422,6 +423,10 @@ static Slab *find_slab(uint32_t chunk_size) {
       offset++;
     }
 
+    // Malloc returned a null pointer and we are out-of-memory.
+    if (!slab && reserved == ~0ull)
+      return nullptr;
+
     // The slab is in the process of being initialized. Start at the beginning
     // to prevent too many slab allocations from happening at once.
     if (!slab && reserved == 0)

From b791a7c14271745c5442f808bc7f8b7ee720e11f Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Fri, 16 May 2025 10:46:02 -0500
Subject: [PATCH 06/10] Some cleanups

---
 libc/src/__support/GPU/allocator.cpp | 25 ++++++++++++++++---------
 1 file changed, 16 insertions(+), 9 deletions(-)

diff --git a/libc/src/__support/GPU/allocator.cpp b/libc/src/__support/GPU/allocator.cpp
index 7e653e470ffe2..14abb1d9680d9 100644
--- a/libc/src/__support/GPU/allocator.cpp
+++ b/libc/src/__support/GPU/allocator.cpp
@@ -29,6 +29,7 @@ constexpr static uint64_t MAX_SIZE = /* 64 GiB */ 64ull * 1024 * 1024 * 1024;
 constexpr static uint64_t SLAB_SIZE = /* 2 MiB */ 2ull * 1024 * 1024;
 constexpr static uint64_t ARRAY_SIZE = MAX_SIZE / SLAB_SIZE;
 constexpr static uint32_t BITS_IN_WORD = sizeof(uint32_t) * 8;
+constexpr static uint32_t MIN_SIZE = 16;
 
 static_assert(!(ARRAY_SIZE & (ARRAY_SIZE - 1)), "Must be a power of two");
 
@@ -91,9 +92,9 @@ static inline uint32_t hash(uint32_t x) {
 
 // Rounds the input value to the closest permitted chunk size. Here we accept
 // the sum of the closest three powers of two. For a 2MiB slab size this is 48
-// different chunk sizes.
+// different chunk sizes. This gives us an average chunk utilization of 87.5%.
 static inline uint32_t get_chunk_size(uint32_t x) {
-  uint32_t y = x < 16 ? 16 : x;
+  uint32_t y = x < MIN_SIZE ? MIN_SIZE : x;
   uint32_t pow2 = BITS_IN_WORD - cpp::countl_zero(y - 1);
 
   uint32_t s0 = 0b0100 << (pow2 - 3);
@@ -110,6 +111,13 @@ static inline uint32_t get_chunk_size(uint32_t x) {
   return (s3 + 15) & ~15;
 }
 
+// Rounds up to a multiple of N, which must be a power of two.
+template <uint32_t N, typename T>
+static inline constexpr T round_up(const T x) {
+  static_assert(((N - 1) & N) == 0, "N must be a power of two");
+  return (x + N) & ~(N - 1);
+}
+
 } // namespace impl
 
 /// A slab allocator used to hand out identically sized chunks of memory.
@@ -126,7 +134,6 @@ static inline uint32_t get_chunk_size(uint32_t x) {
 /// alignment and to indicate that if the pointer is not aligned by 2MiB it
 /// belongs to a slab rather than the global allocator.
 struct Slab {
-
   // Initialize the slab with its chunk size and index in the global table for
   // use when freeing.
   Slab(uint32_t chunk_size, uint32_t global_index) {
@@ -151,7 +158,7 @@ struct Slab {
 
   // The actual amount of memory available excluding the bitfield and metadata.
   static uint32_t available_bytes(uint32_t chunk_size) {
-    return SLAB_SIZE - 2 * bitfield_bytes(chunk_size) - 4 * sizeof(uint32_t);
+    return SLAB_SIZE - 2 * bitfield_bytes(chunk_size) - MIN_SIZE;
   }
 
   // The number of chunks that can be stored in this slab.
@@ -176,13 +183,13 @@ struct Slab {
 
   // Get a pointer to where the bitfield is located in the memory.
   uint32_t *get_bitfield() {
-    return reinterpret_cast<uint32_t *>(memory + 4 * sizeof(uint32_t));
+    return reinterpret_cast<uint32_t *>(memory + MIN_SIZE);
   }
 
   // Get a pointer to where the actual memory to be allocated lives.
   uint8_t *get_memory(uint32_t chunk_size) {
     return reinterpret_cast<uint8_t *>(memory) + bitfield_bytes(chunk_size) +
-           4 * sizeof(uint32_t);
+           MIN_SIZE;
   }
 
   // Get a pointer to the actual memory given an index into the bitfield.
@@ -217,7 +224,7 @@ struct Slab {
       uint32_t slot = index / BITS_IN_WORD;
       uint32_t bit = index % BITS_IN_WORD;
       if (mask & (1ull << gpu::get_lane_id())) {
-        uint32_t before = cpp::AtomicRef<uint32_t>(get_bitfield()[slot])
+        uint32_t before = cpp::AtomicRef(get_bitfield()[slot])
                               .fetch_or(1u << bit, cpp::MemoryOrder::RELAXED);
         if (~before & (1 << bit)) {
           result = ptr_from_index(index, chunk_size);
@@ -237,7 +244,7 @@ struct Slab {
     uint32_t bit = index % BITS_IN_WORD;
 
     cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
-    cpp::AtomicRef<uint32_t>(get_bitfield()[slot])
+    cpp::AtomicRef(get_bitfield()[slot])
         .fetch_and(~(1u << bit), cpp::MemoryOrder::RELAXED);
   }
 
@@ -452,7 +459,7 @@ void *allocate(uint64_t size) {
 
   // Allocations larger than a single slab go directly to memory.
   if (size >= SLAB_SIZE / 2)
-    return impl::rpc_allocate(size);
+    return impl::rpc_allocate(impl::round_up<SLAB_SIZE>(size));
 
   // Try to find a slab for the rounded up chunk size and allocate from it.
   uint32_t chunk_size = impl::get_chunk_size(static_cast<uint32_t>(size));

From 3df88df183930b56ceea356367a4d6f9c87c05d8 Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Fri, 16 May 2025 11:10:27 -0500
Subject: [PATCH 07/10] Add a more strenuous test that goes through 3 GiB of memory

---
 .../integration/src/stdlib/gpu/CMakeLists.txt | 14 +++++++
 .../src/stdlib/gpu/malloc_stress.cpp          | 38 +++++++++++++++++++
 2 files changed, 52 insertions(+)
 create mode 100644 libc/test/integration/src/stdlib/gpu/malloc_stress.cpp

diff --git a/libc/test/integration/src/stdlib/gpu/CMakeLists.txt b/libc/test/integration/src/stdlib/gpu/CMakeLists.txt
index d0958e8271434..26c877b1b6ae6 100644
--- a/libc/test/integration/src/stdlib/gpu/CMakeLists.txt
+++ b/libc/test/integration/src/stdlib/gpu/CMakeLists.txt
@@ -16,4 +16,18 @@ if(NOT LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
       --threads 256
       --blocks 1024
   )
+
+  add_integration_test(
+    malloc_stress
+    SUITE
+      stdlib-gpu-integration-tests
+    SRCS
+      malloc_stress.cpp
+    DEPENDS
+      libc.src.stdlib.malloc
+      libc.src.stdlib.free
+    LOADER_ARGS
+      --threads 256
+      --blocks 2048
+  )
 endif()
diff --git a/libc/test/integration/src/stdlib/gpu/malloc_stress.cpp b/libc/test/integration/src/stdlib/gpu/malloc_stress.cpp
new file mode 100644
index 0000000000000..77479f85dc5cc
--- /dev/null
+++ b/libc/test/integration/src/stdlib/gpu/malloc_stress.cpp
@@ -0,0 +1,38 @@
+//===-- Test for parallel GPU malloc interface ----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "test/IntegrationTest/test.h"
+
+#include "src/__support/GPU/utils.h"
+#include "src/stdlib/free.h"
+#include "src/stdlib/malloc.h"
+
+using namespace LIBC_NAMESPACE;
+
+static inline void use(uint8_t *ptr, uint32_t size) {
+  EXPECT_NE(ptr, nullptr);
+  for (int i = 0; i < size; ++i)
+    ptr[i] = uint8_t(i + gpu::get_thread_id());
+
+  // Try to detect if some other thread manages to clobber our memory.
+  for (int i = 0; i < size; ++i)
+    EXPECT_EQ(ptr[i], uint8_t(i + gpu::get_thread_id()));
+}
+
+TEST_MAIN(int, char **, char **) {
+  void *ptrs[256];
+  for (int i = 0; i < 256; ++i)
+    ptrs[i] = malloc(gpu::get_lane_id() % 2 ? 16 : 32);
+
+  for (int i = 0; i < 256; ++i)
+    use(reinterpret_cast<uint8_t *>(ptrs[i]), gpu::get_lane_id() % 2 ? 16 : 32);
+
+  for (int i = 0; i < 256; ++i)
+    free(ptrs[i]);
+  return 0;
+}

From 109796ea999f4545215dfa9b530d74abe1544702 Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Fri, 16 May 2025 12:06:12 -0500
Subject: [PATCH 08/10] Improve comment

---
 libc/src/__support/GPU/allocator.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/libc/src/__support/GPU/allocator.cpp b/libc/src/__support/GPU/allocator.cpp
index 14abb1d9680d9..317ab4236c95b 100644
--- a/libc/src/__support/GPU/allocator.cpp
+++ b/libc/src/__support/GPU/allocator.cpp
@@ -35,7 +35,9 @@ static_assert(!(ARRAY_SIZE & (ARRAY_SIZE - 1)), "Must be a power of two");
 
 namespace impl {
 // Allocates more memory from the system through the RPC interface. All
-// allocations from the system MUST be aligned on a 2MiB boundary.
+// allocations from the system MUST be aligned on a 2MiB boundary. The default
+// HSA allocator has this behavior for any allocation >= 2MiB and the CUDA
+// driver provides an alignment field for virtual memory allocations.
 static void *rpc_allocate(uint64_t size) {
   void *ptr = nullptr;
   rpc::Client::Port port = rpc::client.open<LIBC_MALLOC>();

From a7bf8ab15605e96d8444203cbdefa83794eded2b Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Fri, 16 May 2025 12:37:36 -0500
Subject: [PATCH 09/10] Michael comments

---
 libc/src/__support/GPU/allocator.cpp | 47 ++++++++++++++--------------
 1 file changed, 24 insertions(+), 23 deletions(-)

diff --git a/libc/src/__support/GPU/allocator.cpp b/libc/src/__support/GPU/allocator.cpp
index 317ab4236c95b..1382bdc634b81 100644
--- a/libc/src/__support/GPU/allocator.cpp
+++ b/libc/src/__support/GPU/allocator.cpp
@@ -28,6 +28,7 @@ namespace LIBC_NAMESPACE_DECL {
 constexpr static uint64_t MAX_SIZE = /* 64 GiB */ 64ull * 1024 * 1024 * 1024;
 constexpr static uint64_t SLAB_SIZE = /* 2 MiB */ 2ull * 1024 * 1024;
 constexpr static uint64_t ARRAY_SIZE = MAX_SIZE / SLAB_SIZE;
+constexpr static uint64_t SLAB_ALIGNMENT = SLAB_SIZE - 1;
 constexpr static uint32_t BITS_IN_WORD = sizeof(uint32_t) * 8;
 constexpr static uint32_t MIN_SIZE = 16;
 
@@ -62,7 +63,7 @@ static void rpc_free(void *ptr) {
 // Convert a potentially disjoint bitmask into an increasing integer for use
 // with indexing between gpu lanes.
 static inline uint32_t lane_count(uint64_t lane_mask) {
-  return cpp::popcount(lane_mask & ((1ull << gpu::get_lane_id()) - 1));
+  return cpp::popcount(lane_mask & ((uint64_t(1) << gpu::get_lane_id()) - 1));
 }
 
 // Obtain an initial value to seed a random number generator. We use the rounded
@@ -225,7 +226,7 @@ struct Slab {
 
       uint32_t slot = index / BITS_IN_WORD;
       uint32_t bit = index % BITS_IN_WORD;
-      if (mask & (1ull << gpu::get_lane_id())) {
+      if (mask & (uint64_t(1) << gpu::get_lane_id())) {
         uint32_t before = cpp::AtomicRef(get_bitfield()[slot])
                               .fetch_or(1u << bit, cpp::MemoryOrder::RELAXED);
         if (~before & (1 << bit)) {
@@ -262,11 +263,11 @@ template <typename T> struct GuardPtr {
 private:
   struct RefCounter {
     // Indicates that the object is in its deallocation phase and thus invalid.
-    static constexpr uint64_t invalid = 1ull << 63;
+    static constexpr uint64_t INVALID = uint64_t(1) << 63;
 
     // If a read preempts an unlock call we indicate this so the following
     // unlock call can swap out the helped bit and maintain exclusive ownership.
-    static constexpr uint64_t helped = 1ull << 62;
+    static constexpr uint64_t HELPED = uint64_t(1) << 62;
 
     // Resets the reference counter, cannot be reset to zero safely.
     void reset(uint32_t n, uint64_t &count) {
@@ -277,7 +278,7 @@ template <typename T> struct GuardPtr {
     // Acquire a slot in the reference counter if it is not invalid.
     bool acquire(uint32_t n, uint64_t &count) {
       count = counter.fetch_add(n, cpp::MemoryOrder::RELAXED) + n;
-      return (count & invalid) == 0;
+      return (count & INVALID) == 0;
     }
 
     // Release a slot in the reference counter. This function should only be
@@ -289,13 +290,13 @@ template <typename T> struct GuardPtr {
       // helped us invalidate it. For the latter, claim that flag and return.
       if (counter.fetch_sub(n, cpp::MemoryOrder::RELAXED) == n) {
         uint64_t expected = 0;
-        if (counter.compare_exchange_strong(expected, invalid,
+        if (counter.compare_exchange_strong(expected, INVALID,
                                             cpp::MemoryOrder::RELAXED,
                                             cpp::MemoryOrder::RELAXED))
           return true;
-        else if ((expected & helped) &&
-                 (counter.exchange(invalid, cpp::MemoryOrder::RELAXED) &
-                  helped))
+        else if ((expected & HELPED) &&
+                 (counter.exchange(INVALID, cpp::MemoryOrder::RELAXED) &
+                  HELPED))
           return true;
       }
       return false;
@@ -306,9 +307,9 @@ template <typename T> struct GuardPtr {
     uint64_t read() {
       auto val = counter.load(cpp::MemoryOrder::RELAXED);
       if (val == 0 && counter.compare_exchange_strong(
-                          val, invalid | helped, cpp::MemoryOrder::RELAXED))
+                          val, INVALID | HELPED, cpp::MemoryOrder::RELAXED))
         return 0;
-      return (val & invalid) ? 0 : val;
+      return (val & INVALID) ? 0 : val;
     }
 
     cpp::Atomic<uint64_t> counter{0};
@@ -318,7 +319,7 @@ template <typename T> struct GuardPtr {
   RefCounter ref{};
 
   // A sentinel value used to claim the pointer slot.
-  static constexpr uint64_t sentinel = ~0ULL;
+  static constexpr uint64_t sentinel = cpp::numeric_limits<uint64_t>::max();
 
   // Should be called by a single lane for each different pointer.
   template <typename... Args>
@@ -328,7 +329,7 @@ template <typename T> struct GuardPtr {
         ptr.compare_exchange_strong(expected, reinterpret_cast<T *>(sentinel),
                                     cpp::MemoryOrder::RELAXED,
                                     cpp::MemoryOrder::RELAXED)) {
-      count = ~0ull;
+      count = cpp::numeric_limits<uint64_t>::max();
       T *mem = reinterpret_cast<T *>(impl::rpc_allocate(sizeof(T)));
       if (!mem)
         return nullptr;
@@ -357,22 +358,22 @@ template <typename T> struct GuardPtr {
   // The uniform mask represents which lanes share the same pointer. For each
   // uniform value we elect a leader to handle it on behalf of the other lanes.
   template <typename... Args>
-  T *try_lock(uint64_t lane_mask, uint64_t unifrom, uint64_t &count,
+  T *try_lock(uint64_t lane_mask, uint64_t uniform, uint64_t &count,
               Args &&...args) {
     count = 0;
     T *result = nullptr;
-    if (gpu::get_lane_id() == uint32_t(cpp::countr_zero(unifrom)))
-      result = try_lock_impl(cpp::popcount(unifrom), count,
+    if (gpu::get_lane_id() == uint32_t(cpp::countr_zero(uniform)))
+      result = try_lock_impl(cpp::popcount(uniform), count,
                              cpp::forward<Args>(args)...);
-    result = gpu::shuffle(lane_mask, cpp::countr_zero(unifrom), result);
+    result = gpu::shuffle(lane_mask, cpp::countr_zero(uniform), result);
 
     if (!result)
       return nullptr;
 
     // Obtain the value of the reference counter for each lane given the
     // aggregate value.
-    count = gpu::shuffle(lane_mask, cpp::countr_zero(unifrom), count) -
-            cpp::popcount(unifrom) + impl::lane_count(unifrom) + 1;
+    count = gpu::shuffle(lane_mask, cpp::countr_zero(uniform), count) -
+            cpp::popcount(uniform) + impl::lane_count(uniform) + 1;
     return result;
   }
 
@@ -433,7 +434,7 @@ static Slab *find_slab(uint32_t chunk_size) {
     }
 
     // Malloc returned a null pointer and we are out-of-memory.
-    if (!slab && reserved == ~0ull)
+    if (!slab && reserved == cpp::numeric_limits<uint64_t>::max())
       return nullptr;
 
     // The slab is in the process of being initialized. Start at the beginning
@@ -480,12 +481,12 @@ void deallocate(void *ptr) {
     return;
 
   // All non-slab allocations will be aligned on a 2MiB boundary.
-  if ((reinterpret_cast<uintptr_t>(ptr) & 0x1fffff) == 0)
+  if ((reinterpret_cast<uintptr_t>(ptr) & SLAB_ALIGNMENT) == 0)
     return impl::rpc_free(ptr);
 
   // The original slab pointer is the 2MiB boundary below the given pointer.
-  Slab *slab =
-      reinterpret_cast<Slab *>((reinterpret_cast<uintptr_t>(ptr) & ~0x1fffff));
+  Slab *slab = reinterpret_cast<Slab *>(
+      (reinterpret_cast<uintptr_t>(ptr) & ~SLAB_ALIGNMENT));
   slab->deallocate(ptr);
   release_slab(slab);
 }

From 1ce74c106f03ac18f42cfff9dfbfcbcfc13aa788 Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Fri, 16 May 2025 20:47:31 -0500
Subject: [PATCH 10/10] Comments

---
 libc/src/__support/GPU/allocator.cpp | 40 +++++++++++++++-------------
 1 file changed, 22 insertions(+), 18 deletions(-)

diff --git a/libc/src/__support/GPU/allocator.cpp b/libc/src/__support/GPU/allocator.cpp
index 1382bdc634b81..b3d99cf134c74 100644
--- a/libc/src/__support/GPU/allocator.cpp
+++ b/libc/src/__support/GPU/allocator.cpp
@@ -137,11 +137,18 @@ static inline constexpr T round_up(const T x) {
 /// alignment and to indicate that if the pointer is not aligned by 2MiB it
 /// belongs to a slab rather than the global allocator.
 struct Slab {
+  // Header metadata for the slab, aligned to the minimum alignment.
+  struct alignas(MIN_SIZE) Header {
+    uint32_t chunk_size;
+    uint32_t global_index;
+  };
+
   // Initialize the slab with its chunk size and index in the global table for
   // use when freeing.
   Slab(uint32_t chunk_size, uint32_t global_index) {
-    *reinterpret_cast<uint32_t *>(&memory[0]) = chunk_size;
-    *reinterpret_cast<uint32_t *>(&memory[sizeof(uint32_t)]) = global_index;
+    Header *header = reinterpret_cast<Header *>(memory);
+    header->chunk_size = chunk_size;
+    header->global_index = global_index;
 
     // This memset is expensive and likely not necessary for the current 'kfd'
     // driver. Until zeroed pages are exposed by the API we must be careful.
@@ -155,13 +162,12 @@ struct Slab {
 
   // Get the number of bytes needed to contain the bitfield bits.
   static uint32_t bitfield_bytes(uint32_t chunk_size) {
-    return ((num_chunks(chunk_size) + BITS_IN_WORD - 1) / BITS_IN_WORD) *
-           sizeof(uint32_t);
+    return ((num_chunks(chunk_size) + BITS_IN_WORD - 1) / BITS_IN_WORD) * 8;
   }
 
   // The actual amount of memory available excluding the bitfield and metadata.
   static uint32_t available_bytes(uint32_t chunk_size) {
-    return SLAB_SIZE - 2 * bitfield_bytes(chunk_size) - MIN_SIZE;
+    return SLAB_SIZE - bitfield_bytes(chunk_size) - sizeof(Header);
   }
 
   // The number of chunks that can be stored in this slab.
@@ -171,7 +177,7 @@ struct Slab {
 
   // The length in bits of the bitfield.
   static uint32_t usable_bits(uint32_t chunk_size) {
-    return ((available_bytes(chunk_size) + chunk_size - 1) / chunk_size);
+    return available_bytes(chunk_size) / chunk_size;
   }
 
   // Get the location in the memory where we will store the chunk size.
@@ -186,13 +192,13 @@ struct Slab {
 
   // Get a pointer to where the bitfield is located in the memory.
   uint32_t *get_bitfield() {
-    return reinterpret_cast<uint32_t *>(memory + MIN_SIZE);
+    return reinterpret_cast<uint32_t *>(memory + sizeof(Header));
   }
 
   // Get a pointer to where the actual memory to be allocated lives.
   uint8_t *get_memory(uint32_t chunk_size) {
     return reinterpret_cast<uint8_t *>(memory) + bitfield_bytes(chunk_size) +
-           MIN_SIZE;
+           sizeof(Header);
   }
 
   // Get a pointer to the actual memory given an index into the bitfield.
@@ -207,15 +213,14 @@ struct Slab {
            chunk_size;
   }
 
-  // Randomly walks the bitfield until it finds a free bit in the bitfield.
-  // Allocations attempt to put lanes right next to eachother for better
-  // caching and convergence.
+  // Randomly walks the bitfield until it finds a free bit. Allocations attempt
+  // to put lanes right next to each other for better caching and convergence.
   void *allocate(uint64_t lane_mask, uint64_t uniform) {
     uint32_t chunk_size = get_chunk_size();
     uint32_t state = impl::entropy();
     void *result = nullptr;
     // The uniform mask represents which lanes contain a uniform target pointer.
-    // We attempt to place these next to eachother in the bitfield.
+    // We attempt to place these next to each other.
     // TODO: We should coalesce these bits and use the result of `fetch_or` to
     //       search for free bits in parallel.
     for (uint64_t mask = ~0ull; mask; mask = gpu::ballot(lane_mask, !result)) {
@@ -229,9 +234,8 @@ struct Slab {
       if (mask & (uint64_t(1) << gpu::get_lane_id())) {
         uint32_t before = cpp::AtomicRef(get_bitfield()[slot])
                               .fetch_or(1u << bit, cpp::MemoryOrder::RELAXED);
-        if (~before & (1 << bit)) {
+        if (~before & (1 << bit))
           result = ptr_from_index(index, chunk_size);
-        }
       }
     }
 
@@ -319,14 +323,14 @@ template <typename T> struct GuardPtr {
   RefCounter ref{};
 
   // A sentinel value used to claim the pointer slot.
-  static constexpr uint64_t sentinel = cpp::numeric_limits<uint64_t>::max();
+  static constexpr uint64_t SENTINEL = cpp::numeric_limits<uint64_t>::max();
 
   // Should be called by a single lane for each different pointer.
   template <typename... Args>
   T *try_lock_impl(uint32_t n, uint64_t &count, Args &&...args) {
     T *expected = ptr.load(cpp::MemoryOrder::RELAXED);
     if (!expected &&
-        ptr.compare_exchange_strong(expected, reinterpret_cast<T *>(sentinel),
+        ptr.compare_exchange_strong(expected, reinterpret_cast<T *>(SENTINEL),
                                     cpp::MemoryOrder::RELAXED,
                                     cpp::MemoryOrder::RELAXED)) {
       count = cpp::numeric_limits<uint64_t>::max();
@@ -343,7 +347,7 @@ template <typename T> struct GuardPtr {
       return mem;
     }
 
-    if (!expected || expected == reinterpret_cast<T *>(sentinel))
+    if (!expected || expected == reinterpret_cast<T *>(SENTINEL))
       return nullptr;
 
     if (!ref.acquire(n, count))
@@ -460,7 +464,7 @@ void *allocate(uint64_t size) {
   if (!size)
     return nullptr;
 
-  // Allocations larger than a single slab go directly to memory.
+  // Allocations requiring a full slab or more go directly to memory.
   if (size >= SLAB_SIZE / 2)
     return impl::rpc_allocate(impl::round_up<SLAB_SIZE>(size));
 


