[libc-commits] [libc] a621308 - [libc] Implement basic `malloc` and `free` support on the GPU

Joseph Huber via libc-commits libc-commits at lists.llvm.org
Mon Jun 5 15:57:04 PDT 2023


Author: Joseph Huber
Date: 2023-06-05T17:56:53-05:00
New Revision: a6213088812f813ce8527139f4aa9c03f40c7c1d

URL: https://github.com/llvm/llvm-project/commit/a6213088812f813ce8527139f4aa9c03f40c7c1d
DIFF: https://github.com/llvm/llvm-project/commit/a6213088812f813ce8527139f4aa9c03f40c7c1d.diff

LOG: [libc] Implement basic `malloc` and `free` support on the GPU

This patch adds support for the `malloc` and `free` functions. These
currently aren't implemented in-tree so we first add the interface
files.

This patch provides the most basic support for a true `malloc` and
`free` by using the RPC interface. This is functional, but in the future
we will want to implement a more intelligent system and primarily use
the RPC interface more as a `brk()` or `sbrk()` interface only called
when absolutely necessary. We will need to design an intelligent
allocator in the future.

The semantics of these memory allocations will need to be checked. I am
somewhat iffy on the details. I've heard that HSA can allocate
asynchronously which seems to work with my tests at least. CUDA uses an
implicit synchronization scheme so we need to use an explicitly separate
stream from the one launching the kernel or the default stream. I will
need to test the NVPTX case.

I would appreciate it if anyone more experienced with the implementation
details here could chime in for the HSA and CUDA cases.

Reviewed By: sivachandra

Differential Revision: https://reviews.llvm.org/D151735

Added: 
    libc/src/stdlib/free.h
    libc/src/stdlib/gpu/CMakeLists.txt
    libc/src/stdlib/gpu/free.cpp
    libc/src/stdlib/gpu/malloc.cpp
    libc/src/stdlib/malloc.h
    libc/test/src/stdlib/malloc_test.cpp

Modified: 
    libc/src/__support/RPC/rpc.h
    libc/src/stdlib/CMakeLists.txt
    libc/test/src/stdlib/CMakeLists.txt
    libc/utils/gpu/loader/CMakeLists.txt
    libc/utils/gpu/loader/Server.h
    libc/utils/gpu/loader/amdgpu/Loader.cpp
    libc/utils/gpu/loader/nvptx/Loader.cpp

Removed: 
    


################################################################################
diff  --git a/libc/src/__support/RPC/rpc.h b/libc/src/__support/RPC/rpc.h
index de08ec063915c..e2b043c98e884 100644
--- a/libc/src/__support/RPC/rpc.h
+++ b/libc/src/__support/RPC/rpc.h
@@ -35,9 +35,11 @@ enum Opcode : uint16_t {
   NOOP = 0,
   PRINT_TO_STDERR = 1,
   EXIT = 2,
-  TEST_INCREMENT = 3,
-  TEST_INTERFACE = 4,
-  TEST_STREAM = 5,
+  MALLOC = 3,
+  FREE = 4,
+  TEST_INCREMENT = 5,
+  TEST_INTERFACE = 6,
+  TEST_STREAM = 7,
 };
 
 /// A fixed size channel used to communicate between the RPC client and server.

diff  --git a/libc/src/stdlib/CMakeLists.txt b/libc/src/stdlib/CMakeLists.txt
index bffa45f20932a..1a55db1c70edc 100644
--- a/libc/src/stdlib/CMakeLists.txt
+++ b/libc/src/stdlib/CMakeLists.txt
@@ -287,10 +287,23 @@ if(LLVM_LIBC_INCLUDE_SCUDO)
     DEPENDS
       ${SCUDO_DEPS}
   )
+elseif(LIBC_TARGET_ARCHITECTURE_IS_GPU)
+  add_entrypoint_external(
+    calloc
+  )
+  add_entrypoint_external(
+    realloc
+  )
+  add_entrypoint_external(
+    aligned_alloc
+  )
 else()
   add_entrypoint_external(
     malloc
   )
+  add_entrypoint_external(
+    free
+  )
   add_entrypoint_external(
     calloc
   )
@@ -300,9 +313,6 @@ else()
   add_entrypoint_external(
     aligned_alloc
   )
-  add_entrypoint_external(
-    free
-  )
 endif()
 
 if(NOT LLVM_LIBC_FULL_BUILD)
@@ -356,3 +366,19 @@ add_entrypoint_object(
   DEPENDS
     .${LIBC_TARGET_OS}.abort
 )
+
+if(LIBC_TARGET_ARCHITECTURE_IS_GPU)
+  add_entrypoint_object(
+    malloc
+    ALIAS
+    DEPENDS
+      .${LIBC_TARGET_OS}.malloc
+  )
+
+  add_entrypoint_object(
+    free
+    ALIAS
+    DEPENDS
+      .${LIBC_TARGET_OS}.free
+  )
+endif()

diff  --git a/libc/src/stdlib/free.h b/libc/src/stdlib/free.h
new file mode 100644
index 0000000000000..81299376f9735
--- /dev/null
+++ b/libc/src/stdlib/free.h
@@ -0,0 +1,20 @@
+//===-- Implementation header for free --------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <stdlib.h>
+
+#ifndef LLVM_LIBC_SRC_STDLIB_FREE_H
+#define LLVM_LIBC_SRC_STDLIB_FREE_H
+
+namespace __llvm_libc {
+
+void free(void *ptr);
+
+} // namespace __llvm_libc
+
+#endif // LLVM_LIBC_SRC_STDLIB_FREE_H

diff  --git a/libc/src/stdlib/gpu/CMakeLists.txt b/libc/src/stdlib/gpu/CMakeLists.txt
new file mode 100644
index 0000000000000..6a5d57623fd41
--- /dev/null
+++ b/libc/src/stdlib/gpu/CMakeLists.txt
@@ -0,0 +1,21 @@
+add_entrypoint_object(
+  malloc
+  SRCS
+    malloc.cpp
+  HDRS
+    ../malloc.h
+  DEPENDS
+    libc.include.stdlib
+    libc.src.__support.RPC.rpc_client
+)
+
+add_entrypoint_object(
+  free
+  SRCS
+    free.cpp
+  HDRS
+    ../free.h
+  DEPENDS
+    libc.include.stdlib
+    libc.src.__support.RPC.rpc_client
+)

diff  --git a/libc/src/stdlib/gpu/free.cpp b/libc/src/stdlib/gpu/free.cpp
new file mode 100644
index 0000000000000..6889349d52b98
--- /dev/null
+++ b/libc/src/stdlib/gpu/free.cpp
@@ -0,0 +1,23 @@
+//===-- GPU Implementation of free ----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/stdlib/free.h"
+#include "src/__support/RPC/rpc_client.h"
+#include "src/__support/common.h"
+
+namespace __llvm_libc {
+
+LLVM_LIBC_FUNCTION(void, free, (void *ptr)) {
+  rpc::Client::Port port = rpc::client.open<rpc::FREE>();
+  port.send([=](rpc::Buffer *buffer) {
+    buffer->data[0] = reinterpret_cast<uintptr_t>(ptr);
+  });
+  port.close();
+}
+
+} // namespace __llvm_libc

diff  --git a/libc/src/stdlib/gpu/malloc.cpp b/libc/src/stdlib/gpu/malloc.cpp
new file mode 100644
index 0000000000000..e9c37b85e709c
--- /dev/null
+++ b/libc/src/stdlib/gpu/malloc.cpp
@@ -0,0 +1,26 @@
+//===-- GPU Implementation of malloc --------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/stdlib/malloc.h"
+#include "src/__support/RPC/rpc_client.h"
+#include "src/__support/common.h"
+
+namespace __llvm_libc {
+
+LLVM_LIBC_FUNCTION(void *, malloc, (size_t size)) {
+  void *ptr = nullptr;
+  rpc::Client::Port port = rpc::client.open<rpc::MALLOC>();
+  port.send_and_recv([=](rpc::Buffer *buffer) { buffer->data[0] = size; },
+                     [&](rpc::Buffer *buffer) {
+                       ptr = reinterpret_cast<void *>(buffer->data[0]);
+                     });
+  port.close();
+  return ptr;
+}
+
+} // namespace __llvm_libc

diff  --git a/libc/src/stdlib/malloc.h b/libc/src/stdlib/malloc.h
new file mode 100644
index 0000000000000..62e4ca0bce0ac
--- /dev/null
+++ b/libc/src/stdlib/malloc.h
@@ -0,0 +1,20 @@
+//===-- Implementation header for malloc ------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <stdlib.h>
+
+#ifndef LLVM_LIBC_SRC_STDLIB_MALLOC_H
+#define LLVM_LIBC_SRC_STDLIB_MALLOC_H
+
+namespace __llvm_libc {
+
+void *malloc(size_t size);
+
+} // namespace __llvm_libc
+
+#endif // LLVM_LIBC_SRC_STDLIB_MALLOC_H

diff  --git a/libc/test/src/stdlib/CMakeLists.txt b/libc/test/src/stdlib/CMakeLists.txt
index 8a2107367ad7f..071ffd244d1fa 100644
--- a/libc/test/src/stdlib/CMakeLists.txt
+++ b/libc/test/src/stdlib/CMakeLists.txt
@@ -320,4 +320,19 @@ if(LLVM_LIBC_FULL_BUILD)
       libc.src.signal.raise
   )
 
+  # Only the GPU has an in-tree 'malloc' implementation.
+  if(LIBC_TARGET_ARCHITECTURE_IS_GPU)
+    add_libc_test(
+      malloc_test
+      HERMETIC_TEST_ONLY
+      SUITE
+        libc-stdlib-tests
+      SRCS
+        malloc_test.cpp
+      DEPENDS
+        libc.include.stdlib
+        libc.src.stdlib.malloc
+        libc.src.stdlib.free
+    )
+  endif()
 endif()

diff  --git a/libc/test/src/stdlib/malloc_test.cpp b/libc/test/src/stdlib/malloc_test.cpp
new file mode 100644
index 0000000000000..579d5dca53f00
--- /dev/null
+++ b/libc/test/src/stdlib/malloc_test.cpp
@@ -0,0 +1,19 @@
+//===-- Unittests for malloc ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/stdlib/free.h"
+#include "src/stdlib/malloc.h"
+#include "test/UnitTest/Test.h"
+
+TEST(LlvmLibcMallocTest, Allocate) {
+  int *ptr = reinterpret_cast<int *>(__llvm_libc::malloc(sizeof(int)));
+  EXPECT_NE(reinterpret_cast<void *>(ptr), static_cast<void *>(nullptr));
+  *ptr = 1;
+  EXPECT_EQ(*ptr, 1);
+  __llvm_libc::free(ptr);
+}

diff  --git a/libc/utils/gpu/loader/CMakeLists.txt b/libc/utils/gpu/loader/CMakeLists.txt
index 689cf086b4763..dc9ee6fe3523f 100644
--- a/libc/utils/gpu/loader/CMakeLists.txt
+++ b/libc/utils/gpu/loader/CMakeLists.txt
@@ -14,10 +14,17 @@ endif()
 find_package(CUDAToolkit QUIET)
 # The CUDA loader requires LLVM to traverse the ELF image for symbols.
 find_package(LLVM QUIET)
-if(CUDAToolkit_FOUND AND LLVM_FOUND)
+if(CUDAToolkit_FOUND AND LLVM_FOUND AND
+   ${CUDAToolkit_VERSION} VERSION_GREATER_EQUAL "11.2")
   add_subdirectory(nvptx)
 else()
-  message(STATUS "Skipping CUDA loader for gpu target, no CUDA was detected")
+  if(${CUDAToolkit_VERSION} VERSION_LESS "11.2")
+    message(WARNING 
+      "Skipping CUDA loader for gpu target, CUDA must be version 11.2 or later.
+       Found CUDA Version ${CUDAToolkit_VERSION}")
+  else()
+    message(STATUS "Skipping CUDA loader for gpu target, no CUDA was detected")
+  endif()
 endif()
 
 # Add a custom target to be used for testing.

diff  --git a/libc/utils/gpu/loader/Server.h b/libc/utils/gpu/loader/Server.h
index f4e39afa94121..a8dffb67a7868 100644
--- a/libc/utils/gpu/loader/Server.h
+++ b/libc/utils/gpu/loader/Server.h
@@ -21,7 +21,8 @@ static __llvm_libc::rpc::Server server;
 
 /// Queries the RPC client at least once and performs server-side work if there
 /// are any active requests.
-void handle_server() {
+template <typename Alloc, typename Dealloc>
+void handle_server(Alloc allocator, Dealloc deallocator) {
   using namespace __llvm_libc;
 
   // Continue servicing the client until there is no work left and we return.
@@ -50,6 +51,19 @@ void handle_server() {
       });
       break;
     }
+    case rpc::Opcode::MALLOC: {
+      port->recv_and_send([&](rpc::Buffer *buffer) {
+        buffer->data[0] =
+            reinterpret_cast<uintptr_t>(allocator(buffer->data[0]));
+      });
+      break;
+    }
+    case rpc::Opcode::FREE: {
+      port->recv([&](rpc::Buffer *buffer) {
+        deallocator(reinterpret_cast<void *>(buffer->data[0]));
+      });
+      break;
+    }
     case rpc::Opcode::TEST_INCREMENT: {
       port->recv_and_send([](rpc::Buffer *buffer) {
         reinterpret_cast<uint64_t *>(buffer->data)[0] += 1;

diff  --git a/libc/utils/gpu/loader/amdgpu/Loader.cpp b/libc/utils/gpu/loader/amdgpu/Loader.cpp
index a98b557b877c4..3d1f95a05b3c1 100644
--- a/libc/utils/gpu/loader/amdgpu/Loader.cpp
+++ b/libc/utils/gpu/loader/amdgpu/Loader.cpp
@@ -134,6 +134,7 @@ hsa_status_t get_agent_memory_pool(hsa_agent_t agent,
 template <typename args_t>
 hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
                            hsa_amd_memory_pool_t kernargs_pool,
+                           hsa_amd_memory_pool_t coarsegrained_pool,
                            hsa_queue_t *queue, const LaunchParameters &params,
                            const char *kernel_name, args_t kernel_args) {
   // Look up the '_start' kernel in the loaded executable.
@@ -142,6 +143,21 @@ hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
           executable, kernel_name, &dev_agent, &symbol))
     return err;
 
+  auto allocator = [&](uint64_t size) -> void * {
+    void *dev_ptr = nullptr;
+    if (hsa_status_t err =
+            hsa_amd_memory_pool_allocate(coarsegrained_pool, size,
+                                         /*flags=*/0, &dev_ptr))
+      handle_error(err);
+    hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_ptr);
+    return dev_ptr;
+  };
+
+  auto deallocator = [](void *ptr) -> void {
+    if (hsa_status_t err = hsa_amd_memory_pool_free(ptr))
+      handle_error(err);
+  };
+
   // Retrieve different properties of the kernel symbol used for launch.
   uint64_t kernel;
   uint32_t args_size;
@@ -219,11 +235,11 @@ hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
   while (hsa_signal_wait_scacquire(
              packet->completion_signal, HSA_SIGNAL_CONDITION_EQ, 0,
              /*timeout_hint=*/1024, HSA_WAIT_STATE_ACTIVE) != 0)
-    handle_server();
+    handle_server(allocator, deallocator);
 
   // Handle the server one more time in case the kernel exited with a pending
   // send still in flight.
-  handle_server();
+  handle_server(allocator, deallocator);
 
   // Destroy the resources acquired to launch the kernel and return.
   if (hsa_status_t err = hsa_amd_memory_pool_free(args))
@@ -366,14 +382,15 @@ int load(int argc, char **argv, char **envp, void *image, size_t size,
 
   LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1};
   begin_args_t init_args = {argc, dev_argv, dev_envp, rpc_shared_buffer};
-  if (hsa_status_t err =
-          launch_kernel(dev_agent, executable, kernargs_pool, queue,
-                        single_threaded_params, "_begin.kd", init_args))
+  if (hsa_status_t err = launch_kernel(
+          dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,
+          single_threaded_params, "_begin.kd", init_args))
     handle_error(err);
 
   start_args_t args = {argc, dev_argv, dev_envp, dev_ret};
-  if (hsa_status_t err = launch_kernel(dev_agent, executable, kernargs_pool,
-                                       queue, params, "_start.kd", args))
+  if (hsa_status_t err =
+          launch_kernel(dev_agent, executable, kernargs_pool,
+                        coarsegrained_pool, queue, params, "_start.kd", args))
     handle_error(err);
 
   // Create a memory signal and copy the return value back from the device into
@@ -402,9 +419,9 @@ int load(int argc, char **argv, char **envp, void *image, size_t size,
   int ret = *static_cast<int *>(host_ret);
 
   end_args_t fini_args = {ret};
-  if (hsa_status_t err =
-          launch_kernel(dev_agent, executable, kernargs_pool, queue,
-                        single_threaded_params, "_end.kd", fini_args))
+  if (hsa_status_t err = launch_kernel(
+          dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,
+          single_threaded_params, "_end.kd", fini_args))
     handle_error(err);
 
   // Free the memory allocated for the device.

diff  --git a/libc/utils/gpu/loader/nvptx/Loader.cpp b/libc/utils/gpu/loader/nvptx/Loader.cpp
index 7879deea65a0a..142a1bb9b3618 100644
--- a/libc/utils/gpu/loader/nvptx/Loader.cpp
+++ b/libc/utils/gpu/loader/nvptx/Loader.cpp
@@ -174,6 +174,29 @@ CUresult launch_kernel(CUmodule binary, CUstream stream,
                          CU_LAUNCH_PARAM_BUFFER_SIZE, &args_size,
                          CU_LAUNCH_PARAM_END};
 
+  // Initialize a non-blocking CUDA stream to allocate memory if needed. This
+  // needs to be done on a separate stream or else it will deadlock with the
+  // executing kernel.
+  CUstream memory_stream;
+  if (CUresult err = cuStreamCreate(&memory_stream, CU_STREAM_NON_BLOCKING))
+    handle_error(err);
+
+  auto allocator = [&](uint64_t size) -> void * {
+    CUdeviceptr dev_ptr;
+    if (CUresult err = cuMemAllocAsync(&dev_ptr, size, memory_stream))
+      handle_error(err);
+
+    // Wait until the memory allocation is complete.
+    while (cuStreamQuery(memory_stream) == CUDA_ERROR_NOT_READY)
+      ;
+    return reinterpret_cast<void *>(dev_ptr);
+  };
+  auto deallocator = [&](void *ptr) -> void {
+    if (CUresult err =
+            cuMemFreeAsync(reinterpret_cast<CUdeviceptr>(ptr), memory_stream))
+      handle_error(err);
+  };
+
   // Call the kernel with the given arguments.
   if (CUresult err = cuLaunchKernel(
           function, params.num_blocks_x, params.num_blocks_y,
@@ -184,11 +207,11 @@ CUresult launch_kernel(CUmodule binary, CUstream stream,
   // Wait until the kernel has completed execution on the device. Periodically
   // check the RPC client for work to be performed on the server.
   while (cuStreamQuery(stream) == CUDA_ERROR_NOT_READY)
-    handle_server();
+    handle_server(allocator, deallocator);
 
   // Handle the server one more time in case the kernel exited with a pending
   // send still in flight.
-  handle_server();
+  handle_server(allocator, deallocator);
 
   return CUDA_SUCCESS;
 }


        


More information about the libc-commits mailing list