[libc-commits] [libc] [libc] Implement efficient 'malloc' on the GPU (PR #140156)
Daniel Thornburgh via libc-commits
libc-commits at lists.llvm.org
Mon May 19 11:31:17 PDT 2025
================
@@ -27,21 +54,459 @@ void *rpc_allocate(uint64_t size) {
return ptr;
}
-void rpc_free(void *ptr) {
+// Deallocates the associated system memory.
+static void rpc_free(void *ptr) {
rpc::Client::Port port = rpc::client.open<LIBC_FREE>();
port.send([=](rpc::Buffer *buffer, uint32_t) {
buffer->data[0] = reinterpret_cast<uintptr_t>(ptr);
});
port.close();
}
-} // namespace
+// Convert a potentially disjoint lane mask into an increasing integer for use
+// when indexing between active GPU lanes.
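+// For example, lane_mask = 0b1011 yields 0, 1, and 2 for lanes 0, 1, and 3.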
+static inline uint32_t lane_count(uint64_t lane_mask) {
+ return cpp::popcount(lane_mask & ((uint64_t(1) << gpu::get_lane_id()) - 1));
+}
+
+// Obtain an initial value to seed a random number generator. We multiply by
+// the rounded golden ratio constant from xorshift* for additional spreading.
+static inline uint32_t entropy() {
+ return (static_cast<uint32_t>(gpu::processor_clock()) ^
+ (gpu::get_thread_id_x() * 0x632be59b) ^
+ (gpu::get_block_id_x() * 0x85157af5)) *
+ 0x9e3779bb;
+}
+
+// Generate a random number and update the state using the xorshift32* PRNG.
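+// Seed per-thread state once, e.g. from entropy(), then draw values
+// repeatedly:
+//   uint32_t state = entropy();
+//   uint32_t r = xorshift32(state);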
+static inline uint32_t xorshift32(uint32_t &state) {
+ state ^= state << 13;
+ state ^= state >> 17;
+ state ^= state << 5;
+ return state * 0x9e3779bb;
+}
+
+// Final mixing stage of MurmurHash used to get a unique index into the global
+// array.
+static inline uint32_t hash(uint32_t x) {
+ x ^= x >> 16;
+ x *= 0x85ebca6b;
+ x ^= x >> 13;
+ x *= 0xc2b2ae35;
+ x ^= x >> 16;
+ return x;
+}
+
+// Rounds the input value to the closest permitted chunk size. Here we accept
+// the sum of the closest three powers of two. For a 2 MiB slab size this is 48
+// different chunk sizes. This gives us an average chunk utilization of 87.5%,
+// i.e. roughly 12.5% internal fragmentation.
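+// For example, a request of 100 bytes gives pow2 = 7 and candidate sizes {64,
+// 96, 112, 128}; the smallest candidate greater than 100 is 112, which is
+// already a multiple of 16.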
+static inline uint32_t get_chunk_size(uint32_t x) {
+ uint32_t y = x < MIN_SIZE ? MIN_SIZE : x;
+ uint32_t pow2 = BITS_IN_WORD - cpp::countl_zero(y - 1);
+
+ uint32_t s0 = 0b0100 << (pow2 - 3);
+ uint32_t s1 = 0b0110 << (pow2 - 3);
+ uint32_t s2 = 0b0111 << (pow2 - 3);
+ uint32_t s3 = 0b1000 << (pow2 - 3);
+
+ if (s0 > y)
+ return (s0 + 15) & ~15;
+ if (s1 > y)
+ return (s1 + 15) & ~15;
+ if (s2 > y)
+ return (s2 + 15) & ~15;
+ return (s3 + 15) & ~15;
+}
+
+// Rounds up to the next multiple of N, where N must be a power of two.
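+// For example, round_up<16>(17) == 32; note that an exact multiple is bumped
+// to the next one, so round_up<16>(16) == 32.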
+template <uint32_t N, typename T>
+static inline constexpr T round_up(const T x) {
+ static_assert(((N - 1) & N) == 0, "N must be a power of two");
+ return (x + N) & ~(N - 1);
+}
+
+} // namespace impl
+
+/// A slab allocator used to hand out identically sized chunks of memory.
+/// Allocation is done through random walks of a bitfield until a free bit is
+/// encountered. This reduces contention and is highly parallel on a GPU.
+///
+/// 0 4 8 16 ... 2 MiB
+/// ┌────────┬──────────┬────────┬──────────────────┬──────────────────────────┐
+/// │ chunk │ index │ pad │ bitfield[] │ memory[] │
+/// └────────┴──────────┴────────┴──────────────────┴──────────────────────────┘
+///
+/// The size of the bitfield is the slab size divided by the chunk size divided
+/// by the number of bits per word. We pad the header to ensure 16-byte
+/// alignment and so that a pointer not aligned to the 2 MiB slab size can be
+/// recognized as belonging to a slab rather than the global allocator.
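+/// For example, a 2 MiB slab with 4096 byte chunks holds at most 512 chunks
+/// and therefore needs a 512 bit bitfield.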
+struct Slab {
+ // Header metadata for the slab, aligned to the minimum alignment.
+ struct alignas(MIN_SIZE) Header {
+ uint32_t chunk_size;
+ uint32_t global_index;
+ };
+
+ // Initialize the slab with its chunk size and index in the global table for
+ // use when freeing.
+ Slab(uint32_t chunk_size, uint32_t global_index) {
+ Header *header = reinterpret_cast<Header *>(memory);
+ header->chunk_size = chunk_size;
+ header->global_index = global_index;
+
+ // This memset is expensive and likely not necessary for the current 'kfd'
+ // driver. Until zeroed pages are exposed by the API we must be careful.
+ __builtin_memset(get_bitfield(), 0, bitfield_bytes(chunk_size));
+ }
+
+  // Get the number of chunks that can theoretically fit inside this slab.
+ static uint32_t num_chunks(uint32_t chunk_size) {
+ return SLAB_SIZE / chunk_size;
+ }
+
+ // Get the number of bytes needed to contain the bitfield bits.
+ static uint32_t bitfield_bytes(uint32_t chunk_size) {
+ return ((num_chunks(chunk_size) + BITS_IN_WORD - 1) / BITS_IN_WORD) * 8;
+ }
+
+ // The actual amount of memory available excluding the bitfield and metadata.
+ static uint32_t available_bytes(uint32_t chunk_size) {
+ return SLAB_SIZE - bitfield_bytes(chunk_size) - sizeof(Header);
+ }
+
+ // The number of chunks that can be stored in this slab.
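+  // For example, a 2 MiB slab with 4096 byte chunks loses part of one chunk to
+  // the header and bitfield, leaving 511 usable chunks.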
+ static uint32_t available_chunks(uint32_t chunk_size) {
+ return available_bytes(chunk_size) / chunk_size;
+ }
+
+ // The length in bits of the bitfield.
+ static uint32_t usable_bits(uint32_t chunk_size) {
+ return available_bytes(chunk_size) / chunk_size;
+ }
+
+  // Get the chunk size that is stored in the slab's header.
+ uint32_t get_chunk_size() const {
----------------
mysterymath wrote:
Adding a `Header *header()` accessor (or `get_header()` if you like) would allow simplifying or deleting some code:
- The constructor contents would just read `header()->chunk_size =` and `header()->global_index =`.
- `get_chunk_size()` and `get_global_index()` could be removed entirely; their call sites would just be `header()->chunk_size` and `header()->global_index`
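A minimal sketch of what that could look like (untested; it assumes the `memory` member used in the constructor and adds a `const` overload for the `const` getters):

```cpp
// Returns the header stored at the start of the slab's memory.
Header *header() { return reinterpret_cast<Header *>(memory); }
const Header *header() const {
  return reinterpret_cast<const Header *>(memory);
}
```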
https://github.com/llvm/llvm-project/pull/140156