[libc-commits] [libc] [libc] Implement efficient 'malloc' on the GPU (PR #140156)
Joseph Huber via libc-commits
libc-commits at lists.llvm.org
Fri May 16 07:41:11 PDT 2025
================
@@ -27,21 +47,439 @@ void *rpc_allocate(uint64_t size) {
return ptr;
}
-void rpc_free(void *ptr) {
+// Deallocates the associated system memory.
+static void rpc_free(void *ptr) {
rpc::Client::Port port = rpc::client.open<LIBC_FREE>();
port.send([=](rpc::Buffer *buffer, uint32_t) {
buffer->data[0] = reinterpret_cast<uintptr_t>(ptr);
});
port.close();
}
-} // namespace
+// Convert a potentially disjoint bitmask into an increasing integer used to
+// index between GPU lanes.
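+// For example, with lane_mask = 0b1011 the active lanes 0, 1, and 3 map to
+// the indices 0, 1, and 2 respectively.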
+static inline uint32_t lane_count(uint64_t lane_mask) {
+ return cpp::popcount(lane_mask & ((1ull << gpu::get_lane_id()) - 1));
+}
+
+// Obtain an initial value to seed a random number generator. We use the rounded
+// multiples of the golden ratio from xorshift* as additional spreading.
+static inline uint32_t entropy() {
+ return (static_cast<uint32_t>(gpu::processor_clock()) ^
+ (gpu::get_thread_id_x() * 0x632be59b) ^
+ (gpu::get_block_id_x() * 0x85157af5)) *
+ 0x9e3779bb;
+}
+
+// Generate a random number and update the state using the xorshift*32 PRNG.
+static inline uint32_t xorshift32(uint32_t &state) {
+ state ^= state << 13;
+ state ^= state >> 17;
+ state ^= state << 5;
+ return state * 0x9e3779bb;
+}
+
+// Final stage of MurmurHash used to get a unique index for the global array.
+static inline uint32_t hash(uint32_t x) {
+ x ^= x >> 16;
+ x *= 0x85ebca6b;
+ x ^= x >> 13;
+ x *= 0xc2b2ae35;
+ x ^= x >> 16;
+ return x;
+}
+
+// Rounds the input value to the closest permitted chunk size. Here we accept
+// sums of at most three adjacent powers of two. For a 2MiB slab size this is
+// 48 different chunk sizes.
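+// For example, a request of 100 bytes considers the candidates 64, 96, 112,
+// and 128 and is served from the 112 byte chunk size.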
+static inline uint32_t get_chunk_size(uint32_t x) {
+ uint32_t y = x < 16 ? 16 : x;
+ uint32_t pow2 = BITS_IN_WORD - cpp::countl_zero(y - 1);
+
+ uint32_t s0 = 0b0100 << (pow2 - 3);
+ uint32_t s1 = 0b0110 << (pow2 - 3);
+ uint32_t s2 = 0b0111 << (pow2 - 3);
+ uint32_t s3 = 0b1000 << (pow2 - 3);
+
+ if (s0 > y)
+ return (s0 + 15) & ~15;
+ if (s1 > y)
+ return (s1 + 15) & ~15;
+ if (s2 > y)
+ return (s2 + 15) & ~15;
+ return (s3 + 15) & ~15;
+}
+
+} // namespace impl
+
+/// A slab allocator used to hand out identically sized chunks of memory.
+/// Allocation is done through random walks of a bitfield until a free bit is
+/// encountered. This reduces contention and is highly parallel on a GPU.
+///
+/// 0 4 8 16 ... 2 MiB
+/// ┌────────┬──────────┬────────┬──────────────────┬──────────────────────────┐
+/// │ chunk │ index │ pad │ bitfield[] │ memory[] │
+/// └────────┴──────────┴────────┴──────────────────┴──────────────────────────┘
+///
+/// The size of the bitfield is the slab size divided by the chunk size divided
+/// by the number of bits per word. We pad the metadata to ensure 16 byte
+/// alignment and so that a pointer which is not 2MiB aligned can be recognized
+/// as belonging to a slab rather than the global allocator.
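+/// For example, with 16 byte chunks the bitfield occupies 16 KiB and the
+/// usable memory[] region begins 16400 bytes into the slab.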
+struct Slab {
+
+ // Initialize the slab with its chunk size and index in the global table for
+ // use when freeing.
+ Slab(uint32_t chunk_size, uint32_t global_index) {
+ *reinterpret_cast<uint32_t *>(&memory[0]) = chunk_size;
+ *reinterpret_cast<uint32_t *>(&memory[sizeof(uint32_t)]) = global_index;
+
+ // This memset is expensive and likely not necessary for the current 'kfd'
+ // driver. Until zeroed pages are exposed by the API we must be careful.
+ __builtin_memset(get_bitfield(), 0, bitfield_bytes(chunk_size));
+ }
+
+ // Get the number of chunks that can theoretically fit inside this array.
+ static uint32_t num_chunks(uint32_t chunk_size) {
+ return SLAB_SIZE / chunk_size;
+ }
+
+ // Get the number of bytes needed to contain the bitfield bits.
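+  // For example, with 128 byte chunks a 2 MiB slab needs 16384 bits, or 2 KiB.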
+ static uint32_t bitfield_bytes(uint32_t chunk_size) {
+ return ((num_chunks(chunk_size) + BITS_IN_WORD - 1) / BITS_IN_WORD) *
+ sizeof(uint32_t);
+ }
+
+ // The actual amount of memory available excluding the bitfield and metadata.
+ static uint32_t available_bytes(uint32_t chunk_size) {
+ return SLAB_SIZE - 2 * bitfield_bytes(chunk_size) - 4 * sizeof(uint32_t);
+ }
+
+ // The number of chunks that can be stored in this slab.
+ static uint32_t available_chunks(uint32_t chunk_size) {
+ return available_bytes(chunk_size) / chunk_size;
+ }
+
+  // The number of usable bits in the bitfield.
+ static uint32_t usable_bits(uint32_t chunk_size) {
+ return ((available_bytes(chunk_size) + chunk_size - 1) / chunk_size);
+ }
+
+  // Get the chunk size that is stored at the start of the slab's memory.
+ uint32_t get_chunk_size() const {
+ return *reinterpret_cast<const uint32_t *>(memory);
+ }
+
+  // Get the global index that is stored after the chunk size.
+ uint32_t get_global_index() const {
+ return *reinterpret_cast<const uint32_t *>(memory + sizeof(uint32_t));
+ }
+
+ // Get a pointer to where the bitfield is located in the memory.
+ uint32_t *get_bitfield() {
+ return reinterpret_cast<uint32_t *>(memory + 4 * sizeof(uint32_t));
+ }
+
+ // Get a pointer to where the actual memory to be allocated lives.
+ uint8_t *get_memory(uint32_t chunk_size) {
+ return reinterpret_cast<uint8_t *>(memory) + bitfield_bytes(chunk_size) +
+ 4 * sizeof(uint32_t);
+ }
+
+ // Get a pointer to the actual memory given an index into the bitfield.
+ void *ptr_from_index(uint32_t index, uint32_t chunk_size) {
+ return get_memory(chunk_size) + index * chunk_size;
+ }
+
+ // Convert a pointer back into its bitfield index using its offset.
+ uint32_t index_from_ptr(void *ptr, uint32_t chunk_size) {
+ return static_cast<uint32_t>(reinterpret_cast<uint8_t *>(ptr) -
+ get_memory(chunk_size)) /
+ chunk_size;
+ }
+
+  // Randomly walks the bitfield until it finds a free bit. Allocations attempt
+  // to put lanes right next to each other for better caching and convergence.
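+  // For example, if several lanes in a warp target the same slab they all
+  // derive the same random base index and attempt to claim adjacent bits.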
+ void *allocate(uint64_t lane_mask, uint64_t uniform) {
+ uint32_t chunk_size = get_chunk_size();
+ uint32_t state = impl::entropy();
+ void *result = nullptr;
+ // The uniform mask represents which lanes contain a uniform target pointer.
+    // We attempt to place these next to each other in the bitfield.
+ // TODO: We should coalesce these bits and use the result of `fetch_or` to
+ // search for free bits in parallel.
+ for (uint64_t mask = ~0ull; mask; mask = gpu::ballot(lane_mask, !result)) {
+ uint32_t id = impl::lane_count(uniform & mask);
+ uint32_t index =
+ (gpu::broadcast_value(lane_mask, impl::xorshift32(state)) + id) %
+ usable_bits(chunk_size);
+
+ uint32_t slot = index / BITS_IN_WORD;
+ uint32_t bit = index % BITS_IN_WORD;
+ if (mask & (1ull << gpu::get_lane_id())) {
+ uint32_t before = cpp::AtomicRef<uint32_t>(get_bitfield()[slot])
+ .fetch_or(1u << bit, cpp::MemoryOrder::RELAXED);
+ if (~before & (1 << bit)) {
+ result = ptr_from_index(index, chunk_size);
+ }
+ }
+ }
+
+ cpp::atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
----------------
jhuber6 wrote:
I set device scope to be the default in the LLVM libc implementation. Probably not 'correct' if this were something we exported, but it's way more convenient for the types of things `libc` cares about.
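For reference, the relaxed `fetch_or` followed by the acquire fence corresponds roughly to the standard C++ sketch below. Standard `<atomic>` has no scope parameter, so the device-scope default discussed here has no direct equivalent, and the helper name `try_claim_bit` is illustrative rather than part of the patch:

#include <atomic>
#include <cstdint>

// Minimal sketch of the claim pattern from Slab::allocate on a single word.
bool try_claim_bit(std::atomic<std::uint32_t> &word, std::uint32_t bit) {
  std::uint32_t before = word.fetch_or(1u << bit, std::memory_order_relaxed);
  bool won = !(before & (1u << bit));
  // An acquire fence here pairs with a release on whatever path frees the
  // chunk, making prior writes to that memory visible to the new owner.
  std::atomic_thread_fence(std::memory_order_acquire);
  return won;
}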
https://github.com/llvm/llvm-project/pull/140156