[libc-commits] [libc] [libc] Efficiently implement `aligned_alloc` for AMDGPU (PR #146585)

Joseph Huber via libc-commits libc-commits at lists.llvm.org
Wed Jul 2 06:42:39 PDT 2025


https://github.com/jhuber6 updated https://github.com/llvm/llvm-project/pull/146585

>From a1bbe313f97a7f49a761d07f16b99d0e10822e2d Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Tue, 1 Jul 2025 13:02:45 -0500
Subject: [PATCH 1/4] [libc] Efficiently implement `aligned_alloc` for AMDGPU

Summary:
This patch uses the actual allocator interface to implement
`aligned_alloc`. We do this by simply rounding up the amount allocated
and then aligning the returned pointer upward within that allocation.
Because of how index calculation works, any offset within an allocated
pointer will still map to the same chunk, so the pointer can be adjusted
internally and `free` will still release the correct chunk.
---
 libc/src/__support/GPU/allocator.cpp          | 28 ++++++++++++++++++
 libc/src/__support/GPU/allocator.h            |  1 +
 libc/src/stdlib/gpu/aligned_alloc.cpp         | 18 ++++++------
 .../integration/src/stdlib/gpu/CMakeLists.txt | 15 ++++++++++
 .../src/stdlib/gpu/aligned_alloc.cpp          | 29 +++++++++++++++++++
 5 files changed, 82 insertions(+), 9 deletions(-)
 create mode 100644 libc/test/integration/src/stdlib/gpu/aligned_alloc.cpp

diff --git a/libc/src/__support/GPU/allocator.cpp b/libc/src/__support/GPU/allocator.cpp
index b2f2953e4f285..fb2b73d6ea0ac 100644
--- a/libc/src/__support/GPU/allocator.cpp
+++ b/libc/src/__support/GPU/allocator.cpp
@@ -138,6 +138,11 @@ void uniform_memset(uint32_t *s, uint32_t c, uint32_t n, uint64_t uniform) {
     s[i] = c;
 }
 
+// Returns true if 'x' is a non-zero power of two, i.e. exactly one bit is set.
+static inline constexpr bool is_pow2(uint64_t x) {
+  return x && (x & (x - 1)) == 0;
+}
+
 } // namespace impl
 
 /// A slab allocator used to hand out identically sized slabs of memory.
@@ -572,5 +577,28 @@ void *reallocate(void *ptr, uint64_t size) {
   return new_ptr;
 }
 
+void *aligned_allocate(uint64_t alignment, uint64_t size) {
+  // All alignment values must be a non-zero power of two.
+  if (!impl::is_pow2(alignment))
+    return nullptr;
+
+  // If the requested alignment is less than what we already provide this is
+  // just a normal allocation.
+  if (alignment < MIN_ALIGNMENT + 1)
+    return gpu::allocate(size);
+
+  // We can't handle alignments greater than 2MiB so we simply fail.
+  if (alignment > SLAB_ALIGNMENT + 1)
+    return nullptr;
+
+  // Trying to handle allocation internally would break the assumption that each
+  // chunk is identical to each other. Allocate enough memory with worst-case
+  // alignment and then round up. The index logic will round down properly.
+  uint64_t rounded = size + alignment - 1;
+  void *ptr = gpu::allocate(rounded);
+  return reinterpret_cast<void *>(
+      (reinterpret_cast<uintptr_t>(ptr) + alignment - 1) & ~(alignment - 1));
+}
+
 } // namespace gpu
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/__support/GPU/allocator.h b/libc/src/__support/GPU/allocator.h
index 757f3a406015b..65bc8dfb11889 100644
--- a/libc/src/__support/GPU/allocator.h
+++ b/libc/src/__support/GPU/allocator.h
@@ -18,6 +18,7 @@ namespace gpu {
 void *allocate(uint64_t size);
 void deallocate(void *ptr);
 void *reallocate(void *ptr, uint64_t size);
+void *aligned_allocate(uint64_t alignment, uint64_t size);
 
 } // namespace gpu
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/stdlib/gpu/aligned_alloc.cpp b/libc/src/stdlib/gpu/aligned_alloc.cpp
index cd2c7e55128fe..bdab001ee0be3 100644
--- a/libc/src/stdlib/gpu/aligned_alloc.cpp
+++ b/libc/src/stdlib/gpu/aligned_alloc.cpp
@@ -15,15 +15,15 @@
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(void *, aligned_alloc, (size_t alignment, size_t size)) {
-  if ((alignment & -alignment) != alignment)
-    return nullptr;
-
-  void *ptr = gpu::allocate(size);
-  if ((reinterpret_cast<uintptr_t>(ptr) & (alignment - 1)) != 0) {
-    gpu::deallocate(ptr);
-    return nullptr;
-  }
-  return ptr;
+  // FIXME: NVIDIA targets currently use the built-in 'malloc' which we cannot
+  // reason with. But we still need to provide this function for compatibility.
+#ifndef LIBC_TARGET_ARCH_IS_NVPTX
+  return gpu::aligned_allocate(alignment, size);
+#else
+  (void)alignment;
+  (void)size;
+  return nullptr;
+#endif
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/test/integration/src/stdlib/gpu/CMakeLists.txt b/libc/test/integration/src/stdlib/gpu/CMakeLists.txt
index 69e1909fe78ed..5f9a215bb8738 100644
--- a/libc/test/integration/src/stdlib/gpu/CMakeLists.txt
+++ b/libc/test/integration/src/stdlib/gpu/CMakeLists.txt
@@ -32,6 +32,21 @@ if(NOT LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
       --blocks 1024
   )
 
+  add_integration_test(
+    aligned_alloc
+    SUITE
+      stdlib-gpu-integration-tests
+    SRCS
+      aligned_alloc.cpp
+    DEPENDS
+      libc.src.stdlib.aligned_alloc
+      libc.src.stdlib.malloc
+      libc.src.stdlib.free
+    LOADER_ARGS
+      --threads 256
+      --blocks 128
+  )
+
   add_integration_test(
     malloc_stress
     SUITE
diff --git a/libc/test/integration/src/stdlib/gpu/aligned_alloc.cpp b/libc/test/integration/src/stdlib/gpu/aligned_alloc.cpp
new file mode 100644
index 0000000000000..4fd2992ff0026
--- /dev/null
+++ b/libc/test/integration/src/stdlib/gpu/aligned_alloc.cpp
@@ -0,0 +1,29 @@
+#include "test/IntegrationTest/test.h"
+
+#include "src/__support/GPU/utils.h"
+#include "src/stdlib/aligned_alloc.h" // Adjust path if needed
+#include "src/stdlib/free.h"
+
+using namespace LIBC_NAMESPACE;
+
+TEST_MAIN(int, char **, char **) {
+  // aligned_alloc with valid alignment and size
+  void *ptr = LIBC_NAMESPACE::aligned_alloc(32, 16);
+  EXPECT_NE(ptr, nullptr);
+  EXPECT_EQ(reinterpret_cast<uintptr_t>(ptr) & (32 - 1), 0U);
+
+  LIBC_NAMESPACE::free(ptr);
+
+  // aligned_alloc fails if the alignment is not a power of two
+  void *bad_align = LIBC_NAMESPACE::aligned_alloc(30, 99);
+  EXPECT_EQ(bad_align, nullptr);
+
+  // aligned_alloc with an alignment and size that diverge across threads.
+  size_t alignment = 1 << (__gpu_lane_id() % 8 + 1);
+  void *div =
+      LIBC_NAMESPACE::aligned_alloc(alignment, (gpu::get_thread_id() + 1) * 4);
+  EXPECT_NE(div, nullptr);
+  EXPECT_EQ(reinterpret_cast<uintptr_t>(div) & (alignment - 1), 0U);
+
+  return 0;
+}

>From e341f42bde44094b566a43af48ac0fafb064000d Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Tue, 1 Jul 2025 20:30:43 -0500
Subject: [PATCH 2/4] Builtin

---
 libc/src/__support/GPU/allocator.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/libc/src/__support/GPU/allocator.cpp b/libc/src/__support/GPU/allocator.cpp
index fb2b73d6ea0ac..a575737abcf00 100644
--- a/libc/src/__support/GPU/allocator.cpp
+++ b/libc/src/__support/GPU/allocator.cpp
@@ -596,8 +596,7 @@ void *aligned_allocate(uint64_t alignment, uint64_t size) {
   // alignment and then round up. The index logic will round down properly.
   uint64_t rounded = size + alignment - 1;
   void *ptr = gpu::allocate(rounded);
-  return reinterpret_cast<void *>(
-      (reinterpret_cast<uintptr_t>(ptr) + alignment - 1) & ~(alignment - 1));
+  return __builtin_align_up(ptr, alignment);
 }
 
 } // namespace gpu

>From 9ed3947a7d84c20d532b356a83cf8d36bfce368a Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Wed, 2 Jul 2025 08:39:32 -0500
Subject: [PATCH 3/4] uint32_t

---
 libc/src/__support/GPU/allocator.cpp  | 2 +-
 libc/src/__support/GPU/allocator.h    | 2 +-
 libc/src/stdlib/gpu/aligned_alloc.cpp | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/libc/src/__support/GPU/allocator.cpp b/libc/src/__support/GPU/allocator.cpp
index a575737abcf00..628a93c47cc93 100644
--- a/libc/src/__support/GPU/allocator.cpp
+++ b/libc/src/__support/GPU/allocator.cpp
@@ -577,7 +577,7 @@ void *reallocate(void *ptr, uint64_t size) {
   return new_ptr;
 }
 
-void *aligned_allocate(uint64_t alignment, uint64_t size) {
+void *aligned_allocate(uint32_t alignment, uint64_t size) {
   // All alignment values must be a non-zero power of two.
   if (!impl::is_pow2(alignment))
     return nullptr;
diff --git a/libc/src/__support/GPU/allocator.h b/libc/src/__support/GPU/allocator.h
index 65bc8dfb11889..a7cf8bceef272 100644
--- a/libc/src/__support/GPU/allocator.h
+++ b/libc/src/__support/GPU/allocator.h
@@ -18,7 +18,7 @@ namespace gpu {
 void *allocate(uint64_t size);
 void deallocate(void *ptr);
 void *reallocate(void *ptr, uint64_t size);
-void *aligned_allocate(uint64_t alignment, uint64_t size);
+void *aligned_allocate(uint32_t alignment, uint64_t size);
 
 } // namespace gpu
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/stdlib/gpu/aligned_alloc.cpp b/libc/src/stdlib/gpu/aligned_alloc.cpp
index bdab001ee0be3..34a7eae618fec 100644
--- a/libc/src/stdlib/gpu/aligned_alloc.cpp
+++ b/libc/src/stdlib/gpu/aligned_alloc.cpp
@@ -18,7 +18,7 @@ LLVM_LIBC_FUNCTION(void *, aligned_alloc, (size_t alignment, size_t size)) {
   // FIXME: NVIDIA targets currently use the built-in 'malloc' which we cannot
   // reason with. But we still need to provide this function for compatibility.
 #ifndef LIBC_TARGET_ARCH_IS_NVPTX
-  return gpu::aligned_allocate(alignment, size);
+  return gpu::aligned_allocate(static_cast<uint32_t>(alignment), size);
 #else
   (void)alignment;
   (void)size;

>From 716472d5f704b25801049aa807d032ba9405eaa3 Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Wed, 2 Jul 2025 08:42:28 -0500
Subject: [PATCH 4/4] Builtin

---
 libc/test/integration/src/stdlib/gpu/aligned_alloc.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libc/test/integration/src/stdlib/gpu/aligned_alloc.cpp b/libc/test/integration/src/stdlib/gpu/aligned_alloc.cpp
index 4fd2992ff0026..b966e6953cc25 100644
--- a/libc/test/integration/src/stdlib/gpu/aligned_alloc.cpp
+++ b/libc/test/integration/src/stdlib/gpu/aligned_alloc.cpp
@@ -10,7 +10,7 @@ TEST_MAIN(int, char **, char **) {
   // aligned_alloc with valid alignment and size
   void *ptr = LIBC_NAMESPACE::aligned_alloc(32, 16);
   EXPECT_NE(ptr, nullptr);
-  EXPECT_EQ(reinterpret_cast<uintptr_t>(ptr) & (32 - 1), 0U);
+  EXPECT_TRUE(__builtin_is_aligned(ptr, 32));
 
   LIBC_NAMESPACE::free(ptr);
 
@@ -23,7 +23,7 @@ TEST_MAIN(int, char **, char **) {
   void *div =
       LIBC_NAMESPACE::aligned_alloc(alignment, (gpu::get_thread_id() + 1) * 4);
   EXPECT_NE(div, nullptr);
-  EXPECT_EQ(reinterpret_cast<uintptr_t>(div) & (alignment - 1), 0U);
+  EXPECT_TRUE(__builtin_is_aligned(div, alignment));
 
   return 0;
 }



More information about the libc-commits mailing list