[libc-commits] [libc] 2bef46d - [libc] Add a loader utility for NVPTX architectures for testing
Joseph Huber via libc-commits
libc-commits at lists.llvm.org
Fri Mar 24 18:04:51 PDT 2023
Author: Joseph Huber
Date: 2023-03-24T20:04:42-05:00
New Revision: 2bef46d2ad872794c83a49f1da12b1b20835f75d
URL: https://github.com/llvm/llvm-project/commit/2bef46d2ad872794c83a49f1da12b1b20835f75d
DIFF: https://github.com/llvm/llvm-project/commit/2bef46d2ad872794c83a49f1da12b1b20835f75d.diff
LOG: [libc] Add a loader utility for NVPTX architectures for testing
This patch adds `nvptx_loader`, a loader utility targeting the CUDA driver
API to launch NVPTX images. It takes a GPU image on the command line and
launches the `_start` kernel with the appropriate arguments. The `_start`
kernel is provided by the already implemented `nvptx/start.cpp`, so an
application with a `main` function can be compiled and run as follows.
```
clang++ --target=nvptx64-nvidia-cuda main.cpp crt1.o -march=sm_70 -o image
./nvptx_loader image args to kernel
```
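Here, `main.cpp` is any ordinary program with a `main` function. As a purely
hypothetical sketch (not part of this patch), a minimal test program could
simply exit with a value derived from its arguments, which `load()` copies
back from the device:
```
// Hypothetical main.cpp for the example above. The program communicates only
// through its return value.
int main(int argc, char **argv) {
  // Exit with the number of arguments passed on the nvptx_loader command line.
  return argc;
}
```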
This implementation is not yet tested and does not support RPC; enabling RPC
requires further development to work around NVIDIA-specific limitations in
atomics and linking.
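The argv and environment copying logic is now shared between the AMDGPU and
NVPTX loaders via allocator-parameterized helpers in `Loader.h`. As a rough,
host-only illustration (not part of this patch), plain `malloc` can stand in
for the real device allocators (`hsa_amd_memory_pool_allocate` or
`cuMemAllocHost`):
```
// Illustrative sketch only: malloc stands in for the device allocators used
// by the real loaders. Requires the Loader.h introduced in this patch.
#include "Loader.h"

#include <cstdio>
#include <cstdlib>

int main(int argc, char **argv) {
  // Any callable that returns nullptr on failure works as the allocator.
  auto allocator = [](uint64_t size) -> void * { return std::malloc(size); };

  void **copied_argv =
      static_cast<void **>(copy_argument_vector(argc, argv, allocator));
  if (!copied_argv)
    return EXIT_FAILURE;

  // Every slot now points at a private copy of the corresponding string.
  for (int i = 0; i < argc; ++i)
    std::printf("%s\n", static_cast<char *>(copied_argv[i]));
  return EXIT_SUCCESS;
}
```
Each loader supplies its own allocation lambda with this shape, so the copying
loop itself lives in one place.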
Reviewed By: jdoerfert
Differential Revision: https://reviews.llvm.org/D146681
Added:
libc/utils/gpu/loader/nvptx/CMakeLists.txt
libc/utils/gpu/loader/nvptx/Loader.cpp
Modified:
libc/utils/gpu/loader/CMakeLists.txt
libc/utils/gpu/loader/Loader.h
libc/utils/gpu/loader/amdgpu/Loader.cpp
Removed:
################################################################################
diff --git a/libc/utils/gpu/loader/CMakeLists.txt b/libc/utils/gpu/loader/CMakeLists.txt
index f643bfcd9abbd..5e9f0a1a49559 100644
--- a/libc/utils/gpu/loader/CMakeLists.txt
+++ b/libc/utils/gpu/loader/CMakeLists.txt
@@ -8,6 +8,13 @@ else()
message(STATUS "Skipping HSA loader for gpu target, no HSA was detected")
endif()
+find_package(CUDAToolkit QUIET)
+if(CUDAToolkit_FOUND)
+ add_subdirectory(nvptx)
+else()
+ message(STATUS "Skipping CUDA loader for gpu target, no CUDA was detected")
+endif()
+
# Add a custom target to be used for testing.
if(TARGET amdhsa_loader AND LIBC_GPU_TARGET_ARCHITECTURE_IS_AMDGPU)
add_custom_target(libc.utils.gpu.loader)
diff --git a/libc/utils/gpu/loader/Loader.h b/libc/utils/gpu/loader/Loader.h
index aecd6db25b54a..dad96c10d37c8 100644
--- a/libc/utils/gpu/loader/Loader.h
+++ b/libc/utils/gpu/loader/Loader.h
@@ -6,9 +6,46 @@
//
//===----------------------------------------------------------------------===//
+#ifndef LLVM_LIBC_UTILS_GPU_LOADER_LOADER_H
+#define LLVM_LIBC_UTILS_GPU_LOADER_LOADER_H
+
+#include <cstring>
#include <stddef.h>
/// Generic interface to load the \p image and launch execution of the _start
/// kernel on the target device. Copies \p argc and \p argv to the device.
/// Returns the final value of the `main` function on the device.
int load(int argc, char **argv, char **envp, void *image, size_t size);
+
+/// Copy the system's argument vector to GPU memory allocated using \p alloc.
+template <typename Allocator>
+void *copy_argument_vector(int argc, char **argv, Allocator alloc) {
+ void *dev_argv = alloc(argc * sizeof(char *));
+ if (dev_argv == nullptr)
+ return nullptr;
+
+ for (int i = 0; i < argc; ++i) {
+ size_t size = strlen(argv[i]) + 1;
+ void *dev_str = alloc(size);
+ if (dev_str == nullptr)
+ return nullptr;
+
+ // Copy the string into the newly allocated device buffer and record its
+ // address in the device argv array.
+ std::memcpy(dev_str, argv[i], size);
+ static_cast<void **>(dev_argv)[i] = dev_str;
+ }
+ return dev_argv;
+};
+
+/// Copy the system's environment to GPU memory allocated using \p alloc.
+template <typename Allocator>
+void *copy_environment(char **envp, Allocator alloc) {
+ int envc = 0;
+ for (char **env = envp; *env != 0; ++env)
+ ++envc;
+
+ return copy_argument_vector(envc, envp, alloc);
+};
+
+#endif
diff --git a/libc/utils/gpu/loader/amdgpu/Loader.cpp b/libc/utils/gpu/loader/amdgpu/Loader.cpp
index fcb5119a0f87e..d090b98c5a2ea 100644
--- a/libc/utils/gpu/loader/amdgpu/Loader.cpp
+++ b/libc/utils/gpu/loader/amdgpu/Loader.cpp
@@ -75,6 +75,11 @@ static void handle_error(hsa_status_t code) {
exit(EXIT_FAILURE);
}
+static void handle_error(const char *msg) {
+ fprintf(stderr, "%s\n", msg);
+ exit(EXIT_FAILURE);
+}
+
/// Generic interface for iterating using the HSA callbacks.
template <typename elem_ty, typename func_ty, typename callback_ty>
hsa_status_t iterate(func_ty func, callback_ty cb) {
@@ -279,50 +284,23 @@ int load(int argc, char **argv, char **envp, void *image, size_t size) {
// Allocate fine-grained memory on the host to hold the pointer array for the
// copied argv and allow the GPU agent to access it.
- void *dev_argv;
- if (hsa_status_t err =
- hsa_amd_memory_pool_allocate(finegrained_pool, argc * sizeof(char *),
- /*flags=*/0, &dev_argv))
- handle_error(err);
- hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_argv);
-
- // Copy each string in the argument vector to global memory on the device.
- for (int i = 0; i < argc; ++i) {
- size_t size = strlen(argv[i]) + 1;
- void *dev_str;
+ auto allocator = [&](uint64_t size) -> void * {
+ void *dev_ptr = nullptr;
if (hsa_status_t err = hsa_amd_memory_pool_allocate(finegrained_pool, size,
- /*flags=*/0, &dev_str))
+ /*flags=*/0, &dev_ptr))
handle_error(err);
- hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_str);
- // Load the host memory buffer with the pointer values of the newly
- // allocated strings.
- std::memcpy(dev_str, argv[i], size);
- static_cast<void **>(dev_argv)[i] = dev_str;
- }
+ hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_ptr);
+ return dev_ptr;
+ };
+ void *dev_argv = copy_argument_vector(argc, argv, allocator);
+ if (!dev_argv)
+ handle_error("Failed to allocate device argv");
// Allocate fine-grained memory on the host to hold the pointer array for the
// copied environment array and allow the GPU agent to access it.
- int envc = 0;
- for (char **env = envp; *env != 0; ++env)
- ++envc;
- void *dev_envp;
- if (hsa_status_t err =
- hsa_amd_memory_pool_allocate(finegrained_pool, envc * sizeof(char *),
- /*flags=*/0, &dev_envp))
- handle_error(err);
- hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_envp);
- for (int i = 0; i < envc; ++i) {
- size_t size = strlen(envp[i]) + 1;
- void *dev_str;
- if (hsa_status_t err = hsa_amd_memory_pool_allocate(finegrained_pool, size,
- /*flags=*/0, &dev_str))
- handle_error(err);
- hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_str);
- // Load the host memory buffer with the pointer values of the newly
- // allocated strings.
- std::memcpy(dev_str, envp[i], size);
- static_cast<void **>(dev_envp)[i] = dev_str;
- }
+ void *dev_envp = copy_environment(envp, allocator);
+ if (!dev_envp)
+ handle_error("Failed to allocate device environment");
// Allocate space for the return pointer and initialize it to zero.
void *dev_ret;
diff --git a/libc/utils/gpu/loader/nvptx/CMakeLists.txt b/libc/utils/gpu/loader/nvptx/CMakeLists.txt
new file mode 100644
index 0000000000000..2cad2774ce3d4
--- /dev/null
+++ b/libc/utils/gpu/loader/nvptx/CMakeLists.txt
@@ -0,0 +1,9 @@
+add_executable(nvptx_loader Loader.cpp)
+add_dependencies(nvptx_loader libc.src.__support.RPC.rpc)
+
+target_include_directories(nvptx_loader PRIVATE ${LIBC_SOURCE_DIR})
+target_link_libraries(nvptx_loader
+ PRIVATE
+ gpu_loader
+ CUDA::cuda_driver
+)
diff --git a/libc/utils/gpu/loader/nvptx/Loader.cpp b/libc/utils/gpu/loader/nvptx/Loader.cpp
new file mode 100644
index 0000000000000..55531fdc588eb
--- /dev/null
+++ b/libc/utils/gpu/loader/nvptx/Loader.cpp
@@ -0,0 +1,147 @@
+//===-- Loader Implementation for NVPTX devices --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a simple loader to run images supporting the NVPTX
+// architecture. The file launches the '_start' kernel, which should be
+// provided by the device application's startup code and will ultimately call
+// the 'main' function.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Loader.h"
+
+#include "cuda.h"
+#include <cstddef>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+
+/// The arguments to the '_start' kernel.
+struct kernel_args_t {
+ int argc;
+ void *argv;
+ void *envp;
+ void *ret;
+ void *inbox;
+ void *outbox;
+ void *buffer;
+};
+
+static void handle_error(CUresult err) {
+ if (err == CUDA_SUCCESS)
+ return;
+
+ const char *err_str = nullptr;
+ CUresult result = cuGetErrorString(err, &err_str);
+ if (result != CUDA_SUCCESS)
+ fprintf(stderr, "Unknown Error\n");
+ else
+ fprintf(stderr, "%s\n", err_str);
+ exit(1);
+}
+
+static void handle_error(const char *msg) {
+ fprintf(stderr, "%s\n", msg);
+ exit(EXIT_FAILURE);
+}
+
+int load(int argc, char **argv, char **envp, void *image, size_t size) {
+ if (CUresult err = cuInit(0))
+ handle_error(err);
+
+ // Obtain the first device found on the system.
+ CUdevice device;
+ if (CUresult err = cuDeviceGet(&device, 0))
+ handle_error(err);
+
+ // Initialize the CUDA context and claim it for this execution.
+ CUcontext context;
+ if (CUresult err = cuDevicePrimaryCtxRetain(&context, device))
+ handle_error(err);
+ if (CUresult err = cuCtxSetCurrent(context))
+ handle_error(err);
+
+ // Initialize a non-blocking CUDA stream to execute the kernel.
+ CUstream stream;
+ if (CUresult err = cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING))
+ handle_error(err);
+
+ // Load the image into a CUDA module.
+ CUmodule binary;
+ if (CUresult err = cuModuleLoadDataEx(&binary, image, 0, nullptr, nullptr))
+ handle_error(err);
+
+ // Look up the '_start' kernel in the loaded module.
+ CUfunction function;
+ if (CUresult err = cuModuleGetFunction(&function, binary, "_start"))
+ handle_error(err);
+
+ // Allocate pinned memory on the host to hold the pointer array for the
+ // copied argv and allow the GPU device to access it.
+ auto allocator = [&](uint64_t size) -> void * {
+ void *dev_ptr;
+ if (CUresult err = cuMemAllocHost(&dev_ptr, size))
+ handle_error(err);
+ return dev_ptr;
+ };
+ void *dev_argv = copy_argument_vector(argc, argv, allocator);
+ if (!dev_argv)
+ handle_error("Failed to allocate device argv");
+
+ // Allocate pinned memory on the host to hold the pointer array for the
+ // copied environment array and allow the GPU device to access it.
+ void *dev_envp = copy_environment(envp, allocator);
+ if (!dev_envp)
+ handle_error("Failed to allocate device environment");
+
+ // Allocate space for the return pointer and initialize it to zero.
+ CUdeviceptr dev_ret;
+ if (CUresult err = cuMemAlloc(&dev_ret, sizeof(int)))
+ handle_error(err);
+ if (CUresult err = cuMemsetD32(dev_ret, 0, 1))
+ handle_error(err);
+
+ // Set up the arguments to the '_start' kernel on the GPU.
+ // TODO: Set up the RPC server implementation.
+ uint64_t args_size = sizeof(kernel_args_t);
+ kernel_args_t args;
+ std::memset(&args, 0, args_size);
+ args.argc = argc;
+ args.argv = dev_argv;
+ args.envp = dev_envp;
+ args.ret = reinterpret_cast<void *>(dev_ret);
+ void *args_config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, &args,
+ CU_LAUNCH_PARAM_BUFFER_SIZE, &args_size,
+ CU_LAUNCH_PARAM_END};
+
+ // Call the kernel with the given arguments.
+ if (CUresult err =
+ cuLaunchKernel(function, /*gridDimX=*/1, /*gridDimY=*/1,
+ /*gridDimZ=*/1, /*blockDimX=*/1, /*blockDimY=*/1,
+ /*blockDimZ=*/1, 0, stream, nullptr, args_config))
+ handle_error(err);
+
+ // TODO: Query the RPC server periodically while the kernel is running.
+ while (cuStreamQuery(stream) == CUDA_ERROR_NOT_READY)
+ ;
+
+ // Copy the return value back from the kernel and wait.
+ int host_ret = 0;
+ if (CUresult err = cuMemcpyDtoH(&host_ret, dev_ret, sizeof(int)))
+ handle_error(err);
+
+ if (CUresult err = cuStreamSynchronize(stream))
+ handle_error(err);
+
+ // Destroy the context and the loaded binary.
+ if (CUresult err = cuModuleUnload(binary))
+ handle_error(err);
+ if (CUresult err = cuDevicePrimaryCtxRelease(device))
+ handle_error(err);
+ return host_ret;
+}