[libc-commits] [libc] 2e1c0ec - [libc] Support global constructors and destructors on NVPTX

Joseph Huber via libc-commits libc-commits at lists.llvm.org
Thu May 4 05:13:11 PDT 2023


Author: Joseph Huber
Date: 2023-05-04T07:13:00-05:00
New Revision: 2e1c0ec6297958f73ca5ed35ce47803ea0f48dba

URL: https://github.com/llvm/llvm-project/commit/2e1c0ec6297958f73ca5ed35ce47803ea0f48dba
DIFF: https://github.com/llvm/llvm-project/commit/2e1c0ec6297958f73ca5ed35ce47803ea0f48dba.diff

LOG: [libc] Support global constructors and destructors on NVPTX

This patch adds the necessary hacks to support global constructors and
destructors. This is an incredibly hacky process caused by the primary
fact that Nvidia does not provide any binary tools and very little
linker support. We first had to emit references to these functions and
their priority in D149451. Then we dig them out of the module once it's
loaded to manually create the list that the linker should have made for
us. This patch also contains a few Nvidia specific hacks, but it passes
the test, albeit with a stack size warning from `ptxas` for the
callback. But this should be fine given the resource usage of a common
test.

This also adds a dependency on LLVM to the NVPTX loader, which hopefully doesn't
cause problems with our CUDA buildbot.

Depends on D149451

Reviewed By: tra

Differential Revision: https://reviews.llvm.org/D149527

Added: 
    

Modified: 
    libc/cmake/modules/LLVMLibCTestRules.cmake
    libc/startup/gpu/nvptx/CMakeLists.txt
    libc/startup/gpu/nvptx/start.cpp
    libc/test/IntegrationTest/test.cpp
    libc/test/integration/startup/gpu/CMakeLists.txt
    libc/test/integration/startup/gpu/init_fini_array_test.cpp
    libc/utils/gpu/loader/CMakeLists.txt
    libc/utils/gpu/loader/nvptx/CMakeLists.txt
    libc/utils/gpu/loader/nvptx/Loader.cpp

Removed: 
    


################################################################################
diff  --git a/libc/cmake/modules/LLVMLibCTestRules.cmake b/libc/cmake/modules/LLVMLibCTestRules.cmake
index b9c05a1d0c8d6..a3a34136604de 100644
--- a/libc/cmake/modules/LLVMLibCTestRules.cmake
+++ b/libc/cmake/modules/LLVMLibCTestRules.cmake
@@ -497,12 +497,12 @@ function(add_integration_test test_name)
   # The GPU build requires overriding the default CMake triple and architecture.
   if(LIBC_GPU_TARGET_ARCHITECTURE_IS_AMDGPU)
     target_compile_options(${fq_build_target_name} PRIVATE
-                           -mcpu=${LIBC_GPU_TARGET_ARCHITECTURE} -flto
-                           --target=${LIBC_GPU_TARGET_TRIPLE})
+                           -mcpu=${LIBC_GPU_TARGET_ARCHITECTURE}
+                           -flto --target=${LIBC_GPU_TARGET_TRIPLE})
   elseif(LIBC_GPU_TARGET_ARCHITECTURE_IS_NVPTX)
     get_nvptx_compile_options(nvptx_options ${LIBC_GPU_TARGET_ARCHITECTURE})
     target_compile_options(${fq_build_target_name} PRIVATE
-                           ${nvptx_options}
+                           ${nvptx_options} -fno-use-cxa-atexit
                            --target=${LIBC_GPU_TARGET_TRIPLE})
   endif()
 

diff  --git a/libc/startup/gpu/nvptx/CMakeLists.txt b/libc/startup/gpu/nvptx/CMakeLists.txt
index b8a9f49d5be53..49fa489c6129b 100644
--- a/libc/startup/gpu/nvptx/CMakeLists.txt
+++ b/libc/startup/gpu/nvptx/CMakeLists.txt
@@ -6,6 +6,8 @@ add_startup_object(
   DEPENDS
     libc.src.__support.RPC.rpc_client
     libc.src.__support.GPU.utils
+    libc.src.stdlib.exit
+    libc.src.stdlib.atexit
   COMPILE_OPTIONS
     -ffreestanding # To avoid compiler warnings about calling the main function.
     -fno-builtin

diff  --git a/libc/startup/gpu/nvptx/start.cpp b/libc/startup/gpu/nvptx/start.cpp
index 7b88e30f7f370..fe09666a33de7 100644
--- a/libc/startup/gpu/nvptx/start.cpp
+++ b/libc/startup/gpu/nvptx/start.cpp
@@ -8,6 +8,8 @@
 
 #include "src/__support/GPU/utils.h"
 #include "src/__support/RPC/rpc_client.h"
+#include "src/stdlib/atexit.h"
+#include "src/stdlib/exit.h"
 
 extern "C" int main(int argc, char **argv, char **envp);
 
@@ -15,21 +17,79 @@ namespace __llvm_libc {
 
 static cpp::Atomic<uint32_t> lock = 0;
 
-static cpp::Atomic<uint32_t> init = 0;
+static cpp::Atomic<uint32_t> count = 0;
 
-void init_rpc(void *in, void *out, void *buffer) {
-  // Only a single thread should update the RPC data.
+extern "C" {
+// Nvidia's 'nvlink' linker does not provide these symbols. We instead need
+// to manually create them and update the globals in the loader implementation.
+uintptr_t *__init_array_start [[gnu::visibility("protected")]];
+uintptr_t *__init_array_end [[gnu::visibility("protected")]];
+uintptr_t *__fini_array_start [[gnu::visibility("protected")]];
+uintptr_t *__fini_array_end [[gnu::visibility("protected")]];
+}
+
+using InitCallback = void(int, char **, char **);
+using FiniCallback = void(void);
+
+static uint64_t get_grid_size() {
+  return gpu::get_num_threads() * gpu::get_num_blocks();
+}
+
+static void call_init_array_callbacks(int argc, char **argv, char **env) {
+  size_t init_array_size = __init_array_end - __init_array_start;
+  for (size_t i = 0; i < init_array_size; ++i)
+    reinterpret_cast<InitCallback *>(__init_array_start[i])(argc, argv, env);
+}
+
+static void call_fini_array_callbacks() {
+  size_t fini_array_size = __fini_array_end - __fini_array_start;
+  for (size_t i = 0; i < fini_array_size; ++i)
+    reinterpret_cast<FiniCallback *>(__fini_array_start[i])();
+}
+
+// TODO: Put this in a separate kernel and call it with one thread.
+void initialize(int argc, char **argv, char **env, void *in, void *out,
+                void *buffer) {
+  // We need a single GPU thread to perform the initialization of the global
+  // constructors and data. We simply mask off all but a single thread and
+  // execute.
+  count.fetch_add(1, cpp::MemoryOrder::RELAXED);
   if (gpu::get_thread_id() == 0 && gpu::get_block_id() == 0) {
+    // We need to set up the RPC client first in case any of the constructors
+    // require it.
     rpc::client.reset(&lock, in, out, buffer);
-    init.store(1, cpp::MemoryOrder::RELAXED);
+
+    // We want the fini array callbacks to be run after other atexit
+    // callbacks are run. So, we register them before running the init
+    // array callbacks as they can potentially register their own atexit
+    // callbacks.
+    // FIXME: The function pointer escaping this TU causes warnings.
+    __llvm_libc::atexit(&call_fini_array_callbacks);
+    call_init_array_callbacks(argc, argv, env);
   }
 
-  // Wait until the previous thread signals that the data has been written.
-  while (!init.load(cpp::MemoryOrder::RELAXED))
+  // We wait until every single thread launched on the GPU has seen the
+  // initialization code. This will get very, very slow for high thread counts,
+  // but for testing purposes it is unlikely to matter.
+  while (count.load(cpp::MemoryOrder::RELAXED) != get_grid_size())
     rpc::sleep_briefly();
+  gpu::sync_threads();
+}
 
-  // Wait for the threads in the block to converge and fence the write.
+// TODO: Put this in a separate kernel and call it with one thread.
+void finalize(int retval) {
+  // We wait until every single thread launched on the GPU has finished
+  // executing and reached the finalize region.
+  count.fetch_sub(1, cpp::MemoryOrder::RELAXED);
+  while (count.load(cpp::MemoryOrder::RELAXED) != 0)
+    rpc::sleep_briefly();
   gpu::sync_threads();
+  if (gpu::get_thread_id() == 0 && gpu::get_block_id() == 0) {
+    // Only a single thread should call `exit` here, the rest should gracefully
+    // return from the kernel. This is so only one thread calls the destructors
+    // registered with 'atexit' above.
+    __llvm_libc::exit(retval);
+  }
 }
 
 } // namespace __llvm_libc
@@ -37,7 +97,9 @@ void init_rpc(void *in, void *out, void *buffer) {
 extern "C" [[gnu::visibility("protected"), clang::nvptx_kernel]] void
 _start(int argc, char **argv, char **envp, int *ret, void *in, void *out,
        void *buffer) {
-  __llvm_libc::init_rpc(in, out, buffer);
+  __llvm_libc::initialize(argc, argv, envp, in, out, buffer);
 
   __atomic_fetch_or(ret, main(argc, argv, envp), __ATOMIC_RELAXED);
+
+  __llvm_libc::finalize(*ret);
 }

diff  --git a/libc/test/IntegrationTest/test.cpp b/libc/test/IntegrationTest/test.cpp
index 4d2a7f08cc06e..e86e0a8d22c82 100644
--- a/libc/test/IntegrationTest/test.cpp
+++ b/libc/test/IntegrationTest/test.cpp
@@ -22,6 +22,7 @@ int memcmp(const void *lhs, const void *rhs, size_t count);
 void *memcpy(void *__restrict, const void *__restrict, size_t);
 void *memmove(void *dst, const void *src, size_t count);
 void *memset(void *ptr, int value, size_t count);
+int atexit(void (*func)(void));
 
 } // namespace __llvm_libc
 
@@ -44,6 +45,9 @@ void *memset(void *ptr, int value, size_t count) {
   return __llvm_libc::memset(ptr, value, count);
 }
 
+// This is needed if the test was compiled with '-fno-use-cxa-atexit'.
+int atexit(void (*func)(void)) { return __llvm_libc::atexit(func); }
+
 } // extern "C"
 
 // Integration tests cannot use the SCUDO standalone allocator as SCUDO pulls

diff  --git a/libc/test/integration/startup/gpu/CMakeLists.txt b/libc/test/integration/startup/gpu/CMakeLists.txt
index ab3f4c39fe48b..754f36d8789cc 100644
--- a/libc/test/integration/startup/gpu/CMakeLists.txt
+++ b/libc/test/integration/startup/gpu/CMakeLists.txt
@@ -26,12 +26,9 @@ add_integration_test(
     --threads 1
 )
 
-# Constructors are currently only supported on AMDGPU.
-if(LIBC_GPU_TARGET_ARCHITECTURE_IS_AMDGPU)
-  add_integration_test(
-    init_fini_array_test
-    SUITE libc-startup-tests
-    SRCS
-      init_fini_array_test.cpp
-  )
-endif()
+add_integration_test(
+  init_fini_array_test
+  SUITE libc-startup-tests
+  SRCS
+    init_fini_array_test.cpp
+)

diff  --git a/libc/test/integration/startup/gpu/init_fini_array_test.cpp b/libc/test/integration/startup/gpu/init_fini_array_test.cpp
index 23064e1e85aa7..1e61711f0fc4d 100644
--- a/libc/test/integration/startup/gpu/init_fini_array_test.cpp
+++ b/libc/test/integration/startup/gpu/init_fini_array_test.cpp
@@ -53,7 +53,7 @@ __attribute__((destructor(1))) void reset_initval() {
   initval = 0;
 }
 
-TEST_MAIN() {
+TEST_MAIN(int argc, char **argv, char **env) {
   ASSERT_EQ(global.get(GLOBAL_INDEX), INITVAL_INITIALIZER);
   ASSERT_EQ(initval, INITVAL_INITIALIZER);
   return 0;

diff  --git a/libc/utils/gpu/loader/CMakeLists.txt b/libc/utils/gpu/loader/CMakeLists.txt
index 3f63ef0bc90e4..689cf086b4763 100644
--- a/libc/utils/gpu/loader/CMakeLists.txt
+++ b/libc/utils/gpu/loader/CMakeLists.txt
@@ -12,7 +12,9 @@ else()
 endif()
 
 find_package(CUDAToolkit QUIET)
-if(CUDAToolkit_FOUND)
+# The CUDA loader requires LLVM to traverse the ELF image for symbols.
+find_package(LLVM QUIET)
+if(CUDAToolkit_FOUND AND LLVM_FOUND)
   add_subdirectory(nvptx)
 else()
   message(STATUS "Skipping CUDA loader for gpu target, no CUDA was detected")

diff  --git a/libc/utils/gpu/loader/nvptx/CMakeLists.txt b/libc/utils/gpu/loader/nvptx/CMakeLists.txt
index f88914383a98e..9e85357920678 100644
--- a/libc/utils/gpu/loader/nvptx/CMakeLists.txt
+++ b/libc/utils/gpu/loader/nvptx/CMakeLists.txt
@@ -1,8 +1,14 @@
 add_executable(nvptx_loader Loader.cpp)
 add_dependencies(nvptx_loader libc.src.__support.RPC.rpc)
 
+if(NOT LLVM_ENABLE_RTTI)
+  target_compile_options(nvptx_loader PRIVATE -fno-rtti)
+endif()
+target_include_directories(nvptx_loader PRIVATE ${LLVM_INCLUDE_DIRS})
 target_link_libraries(nvptx_loader
   PRIVATE
   gpu_loader
   CUDA::cuda_driver
+  LLVMObject
+  LLVMSupport
 )

diff  --git a/libc/utils/gpu/loader/nvptx/Loader.cpp b/libc/utils/gpu/loader/nvptx/Loader.cpp
index eb8db7f48572e..baf8baaff7cda 100644
--- a/libc/utils/gpu/loader/nvptx/Loader.cpp
+++ b/libc/utils/gpu/loader/nvptx/Loader.cpp
@@ -17,10 +17,18 @@
 #include "Server.h"
 
 #include "cuda.h"
+
+#include "llvm/Object/ELF.h"
+#include "llvm/Object/ELFObjectFile.h"
+
 #include <cstddef>
 #include <cstdio>
 #include <cstdlib>
 #include <cstring>
+#include <vector>
+
+using namespace llvm;
+using namespace object;
 
 /// The arguments to the '_start' kernel.
 struct kernel_args_t {
@@ -51,11 +59,122 @@ static void handle_error(const char *msg) {
   exit(EXIT_FAILURE);
 }
 
+// Gets the names of all the globals that contain functions to initialize or
+// deinitialize. We need to do this manually because the NVPTX toolchain does
+// not contain the necessary binary manipulation tools.
+template <typename Alloc>
+Expected<void *> get_ctor_dtor_array(const void *image, const size_t size,
+                                     Alloc allocator, CUmodule binary) {
+  auto mem_buffer = MemoryBuffer::getMemBuffer(
+      StringRef(reinterpret_cast<const char *>(image), size), "image",
+      /*RequiresNullTerminator=*/false);
+  Expected<ELF64LEObjectFile> elf_or_err =
+      ELF64LEObjectFile::create(*mem_buffer);
+  if (!elf_or_err)
+    handle_error(toString(elf_or_err.takeError()).c_str());
+
+  std::vector<std::pair<const char *, uint16_t>> ctors;
+  std::vector<std::pair<const char *, uint16_t>> dtors;
+  // CUDA has no way to iterate over all the symbols so we need to inspect the
+  // ELF directly using the LLVM libraries.
+  for (const auto &symbol : elf_or_err->symbols()) {
+    auto name_or_err = symbol.getName();
+    if (!name_or_err)
+      handle_error(toString(name_or_err.takeError()).c_str());
+
+    // Search for all symbols that contain a constructor or destructor.
+    if (!name_or_err->starts_with("__init_array_object_") &&
+        !name_or_err->starts_with("__fini_array_object_"))
+      continue;
+
+    uint16_t priority;
+    if (name_or_err->rsplit('_').second.getAsInteger(10, priority))
+      handle_error("Invalid priority for constructor or destructor");
+
+    if (name_or_err->starts_with("__init"))
+      ctors.emplace_back(std::make_pair(name_or_err->data(), priority));
+    else
+      dtors.emplace_back(std::make_pair(name_or_err->data(), priority));
+  }
+  // Lower priority constructors are run before higher ones. The reverse is true
+  // for destructors.
+  llvm::sort(ctors, [](auto x, auto y) { return x.second < y.second; });
+  llvm::sort(dtors, [](auto x, auto y) { return x.second < y.second; });
+  llvm::reverse(dtors);
+
+  // Allocate host pinned memory to make these arrays visible to the GPU.
+  CUdeviceptr *dev_memory = reinterpret_cast<CUdeviceptr *>(allocator(
+      ctors.size() * sizeof(CUdeviceptr) + dtors.size() * sizeof(CUdeviceptr)));
+  uint64_t global_size = 0;
+
+  // Get the address of the global and then store the address of the constructor
+  // function to call in the constructor array.
+  CUdeviceptr *dev_ctors_start = dev_memory;
+  CUdeviceptr *dev_ctors_end = dev_ctors_start + ctors.size();
+  for (uint64_t i = 0; i < ctors.size(); ++i) {
+    CUdeviceptr dev_ptr;
+    if (CUresult err =
+            cuModuleGetGlobal(&dev_ptr, &global_size, binary, ctors[i].first))
+      handle_error(err);
+    if (CUresult err =
+            cuMemcpyDtoH(&dev_ctors_start[i], dev_ptr, sizeof(uintptr_t)))
+      handle_error(err);
+  }
+
+  // Get the address of the global and then store the address of the destructor
+  // function to call in the destructor array.
+  CUdeviceptr *dev_dtors_start = dev_ctors_end;
+  CUdeviceptr *dev_dtors_end = dev_dtors_start + dtors.size();
+  for (uint64_t i = 0; i < dtors.size(); ++i) {
+    CUdeviceptr dev_ptr;
+    if (CUresult err =
+            cuModuleGetGlobal(&dev_ptr, &global_size, binary, dtors[i].first))
+      handle_error(err);
+    if (CUresult err =
+            cuMemcpyDtoH(&dev_dtors_start[i], dev_ptr, sizeof(uintptr_t)))
+      handle_error(err);
+  }
+
+  // Obtain the address of the pointers the startup implementation uses to
+  // iterate the constructors and destructors.
+  CUdeviceptr init_start;
+  if (CUresult err = cuModuleGetGlobal(&init_start, &global_size, binary,
+                                       "__init_array_start"))
+    handle_error(err);
+  CUdeviceptr init_end;
+  if (CUresult err = cuModuleGetGlobal(&init_end, &global_size, binary,
+                                       "__init_array_end"))
+    handle_error(err);
+  CUdeviceptr fini_start;
+  if (CUresult err = cuModuleGetGlobal(&fini_start, &global_size, binary,
+                                       "__fini_array_start"))
+    handle_error(err);
+  CUdeviceptr fini_end;
+  if (CUresult err = cuModuleGetGlobal(&fini_end, &global_size, binary,
+                                       "__fini_array_end"))
+    handle_error(err);
+
+  // Copy the pointers to the newly written array to the symbols so the startup
+  // implementation can iterate them.
+  if (CUresult err =
+          cuMemcpyHtoD(init_start, &dev_ctors_start, sizeof(uintptr_t)))
+    handle_error(err);
+  if (CUresult err = cuMemcpyHtoD(init_end, &dev_ctors_end, sizeof(uintptr_t)))
+    handle_error(err);
+  if (CUresult err =
+          cuMemcpyHtoD(fini_start, &dev_dtors_start, sizeof(uintptr_t)))
+    handle_error(err);
+  if (CUresult err = cuMemcpyHtoD(fini_end, &dev_dtors_end, sizeof(uintptr_t)))
+    handle_error(err);
+
+  return dev_memory;
+}
+
 int load(int argc, char **argv, char **envp, void *image, size_t size,
          const LaunchParameters &params) {
+
   if (CUresult err = cuInit(0))
     handle_error(err);
-
   // Obtain the first device found on the system.
   CUdevice device;
   if (CUresult err = cuDeviceGet(&device, 0))
@@ -91,6 +210,11 @@ int load(int argc, char **argv, char **envp, void *image, size_t size,
       handle_error(err);
     return dev_ptr;
   };
+
+  auto memory_or_err = get_ctor_dtor_array(image, size, allocator, binary);
+  if (!memory_or_err)
+    handle_error(toString(memory_or_err.takeError()).c_str());
+
   void *dev_argv = copy_argument_vector(argc, argv, allocator);
   if (!dev_argv)
     handle_error("Failed to allocate device argv");
@@ -153,6 +277,8 @@ int load(int argc, char **argv, char **envp, void *image, size_t size,
     handle_error(err);
 
   // Free the memory allocated for the device.
+  if (CUresult err = cuMemFreeHost(*memory_or_err))
+    handle_error(err);
   if (CUresult err = cuMemFree(dev_ret))
     handle_error(err);
   if (CUresult err = cuMemFreeHost(dev_argv))


        


More information about the libc-commits mailing list