[libc] [llvm] [libc] Make GPU `_end` kernel only call exit callbacks (PR #162371)
Joseph Huber via llvm-commits
llvm-commits at lists.llvm.org
Tue Oct 7 13:50:51 PDT 2025
https://github.com/jhuber6 created https://github.com/llvm/llvm-project/pull/162371
Summary:
We use the infrastructure to stand up a pretend hosted environment on
the GPU. Part of that is calling exit codes and handling the callback.
Exiting from inside a GPU region is problematic as it actually relies on
a lot of GPU magic behind the scenes. This is at least *correct* now as
we use `quick_exit` on the CPU when the GPU calls `exit`. However,
calling `quick_exit` will interfere with instrumentation or benchmarking
that excepts a nice teardown order. For normal execution we should do
the friendly option and let the loader utility clean everything up
manually.
>From cdc1453363531fde395b7f74b5135e1faaef3c34 Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Tue, 7 Oct 2025 15:24:55 -0500
Subject: [PATCH] [libc] Make GPU `_end` kernel only call exit callbacks
Summary:
We use the infrastructure to stand up a pretend hosted environment on
the GPU. Part of that is calling exit codes and handling the callback.
Exiting from inside a GPU region is problematic as it actually relies on
a lot of GPU magic behind the scenes. This is at least *correct* now as
we use `quick_exit` on the CPU when the GPU calls `exit`. However,
calling `quick_exit` will interfere with instrumentation or benchmarking
that excepts a nice teardown order. For normal execution we should do
the friendly option and let the loader utility clean everything up
manually.
---
libc/startup/gpu/amdgpu/start.cpp | 10 +++++-----
libc/startup/gpu/nvptx/start.cpp | 9 +++++----
llvm/tools/llvm-gpu-loader/amdhsa.cpp | 4 ++--
llvm/tools/llvm-gpu-loader/llvm-gpu-loader.h | 4 +---
llvm/tools/llvm-gpu-loader/nvptx.cpp | 4 ++--
5 files changed, 15 insertions(+), 16 deletions(-)
diff --git a/libc/startup/gpu/amdgpu/start.cpp b/libc/startup/gpu/amdgpu/start.cpp
index 8bd0c3a938d02..bf6c96ea7c617 100644
--- a/libc/startup/gpu/amdgpu/start.cpp
+++ b/libc/startup/gpu/amdgpu/start.cpp
@@ -14,6 +14,7 @@
#include "src/stdlib/exit.h"
extern "C" int main(int argc, char **argv, char **envp);
+extern "C" void __cxa_finalize(void *dso);
namespace LIBC_NAMESPACE_DECL {
@@ -68,9 +69,8 @@ _start(int argc, char **argv, char **envp, int *ret) {
extern "C" [[gnu::visibility("protected"), clang::amdgpu_kernel,
clang::amdgpu_flat_work_group_size(1, 1),
clang::amdgpu_max_num_work_groups(1)]] void
-_end(int retval) {
- // Only a single thread should call `exit` here, the rest should gracefully
- // return from the kernel. This is so only one thread calls the destructors
- // registred with 'atexit' above.
- LIBC_NAMESPACE::exit(retval);
+_end() {
+ // Only a single thread should call the destructors registred with 'atexit'.
+ // The loader utility will handle the actual exit and return code cleanly.
+ __cxa_finalize(nullptr);
}
diff --git a/libc/startup/gpu/nvptx/start.cpp b/libc/startup/gpu/nvptx/start.cpp
index bc529b36f5097..f6e0c849b842c 100644
--- a/libc/startup/gpu/nvptx/start.cpp
+++ b/libc/startup/gpu/nvptx/start.cpp
@@ -14,6 +14,7 @@
#include "src/stdlib/exit.h"
extern "C" int main(int argc, char **argv, char **envp);
+extern "C" void __cxa_finalize(void *dso);
namespace LIBC_NAMESPACE_DECL {
@@ -71,8 +72,8 @@ _start(int argc, char **argv, char **envp, int *ret) {
}
extern "C" [[gnu::visibility("protected"), clang::nvptx_kernel]] void
-_end(int retval) {
- // To finis the execution we invoke all the callbacks registered via 'atexit'
- // and then exit with the appropriate return value.
- LIBC_NAMESPACE::exit(retval);
+_end() {
+ // Only a single thread should call the destructors registred with 'atexit'.
+ // The loader utility will handle the actual exit and return code cleanly.
+ __cxa_finalize(nullptr);
}
diff --git a/llvm/tools/llvm-gpu-loader/amdhsa.cpp b/llvm/tools/llvm-gpu-loader/amdhsa.cpp
index be1b6b7993920..5715058d8cfac 100644
--- a/llvm/tools/llvm-gpu-loader/amdhsa.cpp
+++ b/llvm/tools/llvm-gpu-loader/amdhsa.cpp
@@ -192,7 +192,7 @@ hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
// Initialize all the arguments (explicit and implicit) to zero, then set the
// explicit arguments to the values created above.
std::memset(args, 0, args_size);
- std::memcpy(args, &kernel_args, sizeof(args_t));
+ std::memcpy(args, &kernel_args, std::is_empty_v<args_t> ? 0 : sizeof(args_t));
// Initialize the necessary implicit arguments to the proper values.
int dims = 1 + (params.num_blocks_y * params.num_threads_y != 1) +
@@ -563,7 +563,7 @@ int load_amdhsa(int argc, const char **argv, const char **envp, void *image,
// Save the return value and perform basic clean-up.
int ret = *static_cast<int *>(host_ret);
- end_args_t fini_args = {ret};
+ end_args_t fini_args = {};
if (hsa_status_t err = launch_kernel(
dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,
server, single_threaded_params, "_end.kd", fini_args,
diff --git a/llvm/tools/llvm-gpu-loader/llvm-gpu-loader.h b/llvm/tools/llvm-gpu-loader/llvm-gpu-loader.h
index ed34d0bace978..08861c29b4fa4 100644
--- a/llvm/tools/llvm-gpu-loader/llvm-gpu-loader.h
+++ b/llvm/tools/llvm-gpu-loader/llvm-gpu-loader.h
@@ -41,9 +41,7 @@ struct start_args_t {
};
/// The arguments to the '_end' kernel.
-struct end_args_t {
- int argc;
-};
+struct end_args_t {};
/// Generic interface to load the \p image and launch execution of the _start
/// kernel on the target device. Copies \p argc and \p argv to the device.
diff --git a/llvm/tools/llvm-gpu-loader/nvptx.cpp b/llvm/tools/llvm-gpu-loader/nvptx.cpp
index 781a045e71a94..82b455249ad24 100644
--- a/llvm/tools/llvm-gpu-loader/nvptx.cpp
+++ b/llvm/tools/llvm-gpu-loader/nvptx.cpp
@@ -177,7 +177,7 @@ CUresult launch_kernel(CUmodule binary, CUstream stream, rpc::Server &server,
handle_error(err);
// Set up the arguments to the '_start' kernel on the GPU.
- uint64_t args_size = sizeof(args_t);
+ uint64_t args_size = std::is_empty_v<args_t> ? 0 : sizeof(args_t);
void *args_config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, &kernel_args,
CU_LAUNCH_PARAM_BUFFER_SIZE, &args_size,
CU_LAUNCH_PARAM_END};
@@ -342,7 +342,7 @@ int load_nvptx(int argc, const char **argv, const char **envp, void *image,
if (CUresult err = cuStreamSynchronize(stream))
handle_error(err);
- end_args_t fini_args = {host_ret};
+ end_args_t fini_args = {};
if (CUresult err =
launch_kernel(binary, stream, server, single_threaded_params, "_end",
fini_args, print_resource_usage))
More information about the llvm-commits
mailing list