[llvm] [offload] Fix CUDA args size by subtracting tail padding (PR #172249)
Kevin Sala Penades via llvm-commits
llvm-commits at lists.llvm.org
Sun Dec 14 21:14:11 PST 2025
https://github.com/kevinsala updated https://github.com/llvm/llvm-project/pull/172249
>From f6c79b2e7d70f0483778d0706a538fc58ebe0fa6 Mon Sep 17 00:00:00 2001
From: Kevin Sala <salapenades1 at llnl.gov>
Date: Sun, 14 Dec 2025 20:26:29 -0800
Subject: [PATCH 1/4] [offload] Fix CUDA args size by subtracting tail padding
---
.../cuda/dynamic_cuda/cuda.cpp | 1 +
.../plugins-nextgen/cuda/dynamic_cuda/cuda.h | 1 +
offload/plugins-nextgen/cuda/src/rtl.cpp | 29 +++++++++++++++++--
.../offloading/CUDA/basic_launch_multi_arg.cu | 3 --
4 files changed, 29 insertions(+), 5 deletions(-)
diff --git a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
index e7a1ca38b3c13..f630e8d850706 100644
--- a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
+++ b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
@@ -81,6 +81,7 @@ DLWRAP(cuDevicePrimaryCtxSetFlags, 2)
DLWRAP(cuDevicePrimaryCtxRetain, 2)
DLWRAP(cuModuleLoadDataEx, 5)
DLWRAP(cuOccupancyMaxPotentialBlockSize, 6)
+DLWRAP(cuFuncGetParamInfo, 4)
DLWRAP(cuDeviceCanAccessPeer, 3)
DLWRAP(cuCtxEnablePeerAccess, 2)
diff --git a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
index a470d6df1079d..7e42c66dddabb 100644
--- a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
+++ b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
@@ -390,5 +390,6 @@ CUresult cuMemGetAllocationGranularity(size_t *granularity,
CUmemAllocationGranularity_flags option);
CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
CUoccupancyB2DSize, size_t, int);
+CUresult cuFuncGetParamInfo(CUfunction, size_t, size_t *, size_t *);
#endif
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index a27c6f3de0cd3..6ac48255693d8 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -149,7 +149,8 @@ struct CUDAKernelTy : public GenericKernelTy {
// The maximum number of threads cannot exceed the maximum of the kernel.
MaxNumThreads = std::min(MaxNumThreads, (uint32_t)MaxThreads);
- return Plugin::success();
+ // Retrieve the size of the arguments.
+ return initArgsSize();
}
/// Launch the CUDA kernel function.
@@ -173,11 +174,29 @@ struct CUDAKernelTy : public GenericKernelTy {
}
private:
+ /// Initialize the size of the arguments.
+ Error initArgsSize() {
+ CUresult Res;
+ size_t ArgOffset, ArgSize;
+ size_t Arg = 0;
+
+ // Find the last argument to know the total size of the arguments.
+ while ((Res = cuFuncGetParamInfo(Func, Arg++, &ArgOffset, &ArgSize)) == CUDA_SUCCESS)
+ ArgsSize = ArgOffset + ArgSize;
+
+ if (Res != CUDA_ERROR_INVALID_VALUE)
+ return Plugin::check(Res, "error in cuFuncGetParamInfo: %s");
+ return Plugin::success();
+ }
+
/// The CUDA kernel function to execute.
CUfunction Func;
/// The maximum amount of dynamic shared memory per thread group. By default,
/// this is set to 48 KB.
mutable uint32_t MaxDynCGroupMemLimit = 49152;
+
+ /// The size of the kernel arguments.
+ size_t ArgsSize = 0;
};
/// Class wrapping a CUDA stream reference. These are the objects handled by the
@@ -1430,6 +1449,11 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
AsyncInfoWrapperTy &AsyncInfoWrapper) const {
CUDADeviceTy &CUDADevice = static_cast<CUDADeviceTy &>(GenericDevice);
+ // The args size passed in LaunchParams may have tail padding, which is not
+ // accepted by the CUDA driver.
+ if (ArgsSize > LaunchParams.Size)
+ return Plugin::error(ErrorCode::INVALID_BINARY, "mismatch in kernel arguments");
+
CUstream Stream;
if (auto Err = CUDADevice.getStream(AsyncInfoWrapper, Stream))
return Err;
@@ -1437,9 +1461,10 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
uint32_t MaxDynCGroupMem =
std::max(KernelArgs.DynCGroupMem, GenericDevice.getDynamicMemorySize());
+ size_t ConfigArgsSize = ArgsSize;
void *Config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, LaunchParams.Data,
CU_LAUNCH_PARAM_BUFFER_SIZE,
- reinterpret_cast<void *>(&LaunchParams.Size),
+ reinterpret_cast<void *>(&ConfigArgsSize),
CU_LAUNCH_PARAM_END};
// If we are running an RPC server we want to wake up the server thread
diff --git a/offload/test/offloading/CUDA/basic_launch_multi_arg.cu b/offload/test/offloading/CUDA/basic_launch_multi_arg.cu
index 7a32983f51f7c..4e0f3a41a7a0c 100644
--- a/offload/test/offloading/CUDA/basic_launch_multi_arg.cu
+++ b/offload/test/offloading/CUDA/basic_launch_multi_arg.cu
@@ -6,9 +6,6 @@
// clang-format on
// REQUIRES: gpu
-//
-// FIXME: https://github.com/llvm/llvm-project/issues/161265
-// UNSUPPORTED: gpu
#include <stdio.h>
>From cdd16a7392a8bca067a6dcfec98f674c63be5e84 Mon Sep 17 00:00:00 2001
From: Kevin Sala <salapenades1 at llnl.gov>
Date: Sun, 14 Dec 2025 20:30:00 -0800
Subject: [PATCH 2/4] Fix format
---
offload/plugins-nextgen/cuda/src/rtl.cpp | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index 6ac48255693d8..a60c3ca032d76 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -181,7 +181,8 @@ struct CUDAKernelTy : public GenericKernelTy {
size_t Arg = 0;
// Find the last argument to know the total size of the arguments.
- while ((Res = cuFuncGetParamInfo(Func, Arg++, &ArgOffset, &ArgSize)) == CUDA_SUCCESS)
+ while ((Res = cuFuncGetParamInfo(Func, Arg++, &ArgOffset, &ArgSize)) ==
+ CUDA_SUCCESS)
ArgsSize = ArgOffset + ArgSize;
if (Res != CUDA_ERROR_INVALID_VALUE)
@@ -1452,7 +1453,8 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
// The args size passed in LaunchParams may have tail padding, which is not
// accepted by the CUDA driver.
if (ArgsSize > LaunchParams.Size)
- return Plugin::error(ErrorCode::INVALID_BINARY, "mismatch in kernel arguments");
+ return Plugin::error(ErrorCode::INVALID_BINARY,
+ "mismatch in kernel arguments");
CUstream Stream;
if (auto Err = CUDADevice.getStream(AsyncInfoWrapper, Stream))
>From 2bbfdd22234b4ed4b8b4b7dd707413798e366ea3 Mon Sep 17 00:00:00 2001
From: Kevin Sala <salapenades1 at llnl.gov>
Date: Sun, 14 Dec 2025 20:37:39 -0800
Subject: [PATCH 3/4] Fix comments
---
offload/plugins-nextgen/cuda/src/rtl.cpp | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index a60c3ca032d76..3c41694bf9dc4 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -180,6 +180,8 @@ struct CUDAKernelTy : public GenericKernelTy {
size_t ArgOffset, ArgSize;
size_t Arg = 0;
+ ArgsSize = 0;
+
// Find the last argument to know the total size of the arguments.
while ((Res = cuFuncGetParamInfo(Func, Arg++, &ArgOffset, &ArgSize)) ==
CUDA_SUCCESS)
@@ -197,7 +199,7 @@ struct CUDAKernelTy : public GenericKernelTy {
mutable uint32_t MaxDynCGroupMemLimit = 49152;
/// The size of the kernel arguments.
- size_t ArgsSize = 0;
+ size_t ArgsSize;
};
/// Class wrapping a CUDA stream reference. These are the objects handled by the
@@ -1453,7 +1455,7 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
// The args size passed in LaunchParams may have tail padding, which is not
// accepted by the CUDA driver.
if (ArgsSize > LaunchParams.Size)
- return Plugin::error(ErrorCode::INVALID_BINARY,
+ return Plugin::error(ErrorCode::INVALID_ARGUMENT,
"mismatch in kernel arguments");
CUstream Stream;
>From 8934542d9a84787d56b9757fc1596c516dd6f465 Mon Sep 17 00:00:00 2001
From: Kevin Sala <salapenades1 at llnl.gov>
Date: Sun, 14 Dec 2025 21:11:55 -0800
Subject: [PATCH 4/4] Add offload test with multiple kernel args
---
.../OffloadAPI/device_code/CMakeLists.txt | 2 ++
.../unittests/OffloadAPI/device_code/multiargs.cpp | 3 +++
.../unittests/OffloadAPI/kernel/olLaunchKernel.cpp | 14 ++++++++++++++
3 files changed, 19 insertions(+)
create mode 100644 offload/unittests/OffloadAPI/device_code/multiargs.cpp
diff --git a/offload/unittests/OffloadAPI/device_code/CMakeLists.txt b/offload/unittests/OffloadAPI/device_code/CMakeLists.txt
index 1a042e1b38315..22ebacf62e83e 100644
--- a/offload/unittests/OffloadAPI/device_code/CMakeLists.txt
+++ b/offload/unittests/OffloadAPI/device_code/CMakeLists.txt
@@ -2,6 +2,7 @@ add_offload_test_device_code(foo.cpp foo)
add_offload_test_device_code(bar.cpp bar)
# Compile with optimizations to eliminate AMDGPU implicit arguments.
add_offload_test_device_code(noargs.cpp noargs -O3)
+add_offload_test_device_code(multiargs.cpp multiargs -O3)
add_offload_test_device_code(byte.cpp byte)
add_offload_test_device_code(localmem.cpp localmem)
add_offload_test_device_code(localmem_reduction.cpp localmem_reduction)
@@ -15,6 +16,7 @@ add_custom_target(offload_device_binaries DEPENDS
foo.bin
bar.bin
noargs.bin
+ multiargs.bin
byte.bin
localmem.bin
localmem_reduction.bin
diff --git a/offload/unittests/OffloadAPI/device_code/multiargs.cpp b/offload/unittests/OffloadAPI/device_code/multiargs.cpp
new file mode 100644
index 0000000000000..265dad124e91e
--- /dev/null
+++ b/offload/unittests/OffloadAPI/device_code/multiargs.cpp
@@ -0,0 +1,3 @@
+#include <gpuintrin.h>
+
+extern "C" __gpu_kernel void multiargs(char, int *, short) { (void)0; }
diff --git a/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp b/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp
index c9eca36a4d447..0845b9a1afdb7 100644
--- a/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp
+++ b/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp
@@ -55,6 +55,7 @@ struct LaunchSingleKernelTestBase : LaunchKernelTestBase {
KERNEL_TEST(Foo, foo)
KERNEL_TEST(NoArgs, noargs)
+KERNEL_TEST(MultiArgs, multiargs)
KERNEL_TEST(Byte, byte)
KERNEL_TEST(LocalMem, localmem)
KERNEL_TEST(LocalMemReduction, localmem_reduction)
@@ -135,6 +136,19 @@ TEST_P(olLaunchKernelNoArgsTest, Success) {
ASSERT_SUCCESS(olSyncQueue(Queue));
}
+TEST_P(olLaunchKernelMultiArgsTest, Success) {
+  struct {
+    char A;
+    int *B;
+    short C;
+  } Args{0, nullptr, 0}; // Mirrors the multiargs(char, int *, short) kernel.
+
+  ASSERT_SUCCESS(
+      olLaunchKernel(Queue, Device, Kernel, &Args, sizeof(Args), &LaunchArgs));
+
+  ASSERT_SUCCESS(olSyncQueue(Queue));
+}
+
TEST_P(olLaunchKernelFooTest, SuccessSynchronous) {
void *Mem;
ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED,
More information about the llvm-commits
mailing list