[llvm] [offload] Fix CUDA args size by subtracting tail padding (PR #172249)
Kevin Sala Penades via llvm-commits
llvm-commits at lists.llvm.org
Sun Dec 14 20:30:42 PST 2025
https://github.com/kevinsala updated https://github.com/llvm/llvm-project/pull/172249
From f6c79b2e7d70f0483778d0706a538fc58ebe0fa6 Mon Sep 17 00:00:00 2001
From: Kevin Sala <salapenades1 at llnl.gov>
Date: Sun, 14 Dec 2025 20:26:29 -0800
Subject: [PATCH 1/2] [offload] Fix CUDA args size by subtracting tail padding
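The block of kernel arguments built on the host may carry tail padding,
so its sizeof() can exceed the exact parameter footprint the CUDA driver
validates for CU_LAUNCH_PARAM_BUFFER_SIZE. A minimal sketch of the
mismatch, using a hypothetical argument layout (not taken from this
patch):

  #include <cstddef>
  #include <cstdio>

  // Hypothetical kernel argument block: a pointer followed by a short.
  struct KernelArgsBlock {
    void *Ptr;  // offset 0, size 8
    short Flag; // offset 8, size 2
  }; // sizeof == 16 on common 64-bit ABIs: 6 bytes of tail padding

  int main() {
    size_t Unpadded = offsetof(KernelArgsBlock, Flag) + sizeof(short);
    // Prints "sizeof=16 unpadded=10": the driver expects 10, not 16.
    printf("sizeof=%zu unpadded=%zu\n", sizeof(KernelArgsBlock), Unpadded);
  }

This patch queries cuFuncGetParamInfo for each parameter and uses the
offset plus size of the last one as the exact size to pass the driver.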
---
.../cuda/dynamic_cuda/cuda.cpp | 1 +
.../plugins-nextgen/cuda/dynamic_cuda/cuda.h | 1 +
offload/plugins-nextgen/cuda/src/rtl.cpp | 29 +++++++++++++++++--
.../offloading/CUDA/basic_launch_multi_arg.cu | 3 --
4 files changed, 29 insertions(+), 5 deletions(-)
diff --git a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
index e7a1ca38b3c13..f630e8d850706 100644
--- a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
+++ b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
@@ -81,6 +81,7 @@ DLWRAP(cuDevicePrimaryCtxSetFlags, 2)
DLWRAP(cuDevicePrimaryCtxRetain, 2)
DLWRAP(cuModuleLoadDataEx, 5)
DLWRAP(cuOccupancyMaxPotentialBlockSize, 6)
+DLWRAP(cuFuncGetParamInfo, 4)
DLWRAP(cuDeviceCanAccessPeer, 3)
DLWRAP(cuCtxEnablePeerAccess, 2)
diff --git a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
index a470d6df1079d..7e42c66dddabb 100644
--- a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
+++ b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
@@ -390,5 +390,6 @@ CUresult cuMemGetAllocationGranularity(size_t *granularity,
CUmemAllocationGranularity_flags option);
CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
CUoccupancyB2DSize, size_t, int);
+CUresult cuFuncGetParamInfo(CUfunction, size_t, size_t *, size_t *);
#endif
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index a27c6f3de0cd3..6ac48255693d8 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -149,7 +149,8 @@ struct CUDAKernelTy : public GenericKernelTy {
// The maximum number of threads cannot exceed the maximum of the kernel.
MaxNumThreads = std::min(MaxNumThreads, (uint32_t)MaxThreads);
- return Plugin::success();
+ // Retrieve the size of the arguments.
+ return initArgsSize();
}
/// Launch the CUDA kernel function.
@@ -173,11 +174,29 @@ struct CUDAKernelTy : public GenericKernelTy {
}
private:
+ /// Initialize the size of the arguments.
+ Error initArgsSize() {
+ CUresult Res;
+ size_t ArgOffset, ArgSize;
+ size_t Arg = 0;
+
+ // Find the last argument; its offset plus size is the total unpadded size.
+ while ((Res = cuFuncGetParamInfo(Func, Arg++, &ArgOffset, &ArgSize)) == CUDA_SUCCESS)
+ ArgsSize = ArgOffset + ArgSize;
+
+ if (Res != CUDA_ERROR_INVALID_VALUE)
+ return Plugin::check(Res, "error in cuFuncGetParamInfo: %s");
+ return Plugin::success();
+ }
+
/// The CUDA kernel function to execute.
CUfunction Func;
/// The maximum amount of dynamic shared memory per thread group. By default,
/// this is set to 48 KB.
mutable uint32_t MaxDynCGroupMemLimit = 49152;
+
+ /// The size of the kernel arguments.
+ size_t ArgsSize = 0;
};
/// Class wrapping a CUDA stream reference. These are the objects handled by the
@@ -1430,6 +1449,11 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
AsyncInfoWrapperTy &AsyncInfoWrapper) const {
CUDADeviceTy &CUDADevice = static_cast<CUDADeviceTy &>(GenericDevice);
+ // The args size passed in LaunchParams may have tail padding, which is not
+ // accepted by the CUDA driver.
+ if (ArgsSize > LaunchParams.Size)
+ return Plugin::error(ErrorCode::INVALID_BINARY, "mismatch in kernel arguments");
+
CUstream Stream;
if (auto Err = CUDADevice.getStream(AsyncInfoWrapper, Stream))
return Err;
@@ -1437,9 +1461,10 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
uint32_t MaxDynCGroupMem =
std::max(KernelArgs.DynCGroupMem, GenericDevice.getDynamicMemorySize());
+ size_t ConfigArgsSize = ArgsSize;
void *Config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, LaunchParams.Data,
CU_LAUNCH_PARAM_BUFFER_SIZE,
- reinterpret_cast<void *>(&LaunchParams.Size),
+ reinterpret_cast<void *>(&ConfigArgsSize),
CU_LAUNCH_PARAM_END};
// If we are running an RPC server we want to wake up the server thread
diff --git a/offload/test/offloading/CUDA/basic_launch_multi_arg.cu b/offload/test/offloading/CUDA/basic_launch_multi_arg.cu
index 7a32983f51f7c..4e0f3a41a7a0c 100644
--- a/offload/test/offloading/CUDA/basic_launch_multi_arg.cu
+++ b/offload/test/offloading/CUDA/basic_launch_multi_arg.cu
@@ -6,9 +6,6 @@
// clang-format on
// REQUIRES: gpu
-//
-// FIXME: https://github.com/llvm/llvm-project/issues/161265
-// UNSUPPORTED: gpu
#include <stdio.h>
From cdd16a7392a8bca067a6dcfec98f674c63be5e84 Mon Sep 17 00:00:00 2001
From: Kevin Sala <salapenades1 at llnl.gov>
Date: Sun, 14 Dec 2025 20:30:00 -0800
Subject: [PATCH 2/2] Fix format
---
offload/plugins-nextgen/cuda/src/rtl.cpp | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index 6ac48255693d8..a60c3ca032d76 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -181,7 +181,8 @@ struct CUDAKernelTy : public GenericKernelTy {
size_t Arg = 0;
// Find the last argument; its offset plus size is the total unpadded size.
- while ((Res = cuFuncGetParamInfo(Func, Arg++, &ArgOffset, &ArgSize)) == CUDA_SUCCESS)
+ while ((Res = cuFuncGetParamInfo(Func, Arg++, &ArgOffset, &ArgSize)) ==
+ CUDA_SUCCESS)
ArgsSize = ArgOffset + ArgSize;
if (Res != CUDA_ERROR_INVALID_VALUE)
@@ -1452,7 +1453,8 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
// The args size passed in LaunchParams may have tail padding, which is not
// accepted by the CUDA driver.
if (ArgsSize > LaunchParams.Size)
- return Plugin::error(ErrorCode::INVALID_BINARY, "mismatch in kernel arguments");
+ return Plugin::error(ErrorCode::INVALID_BINARY,
+ "mismatch in kernel arguments");
CUstream Stream;
if (auto Err = CUDADevice.getStream(AsyncInfoWrapper, Stream))
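For anyone who wants to reproduce the probe outside the plugin, here is
a minimal standalone sketch against the CUDA driver API (it assumes you
already have a loaded CUfunction; cuFuncGetParamInfo requires CUDA 12.4
or newer, and this is an illustration, not the plugin code):

  #include <cuda.h>

  // Returns offset + size of the last kernel parameter, i.e. the
  // unpadded argument size, or 0 for a kernel with no parameters.
  static size_t getUnpaddedArgsSize(CUfunction Func) {
    size_t Offset = 0, Size = 0, Total = 0;
    for (size_t I = 0;; ++I) {
      // CUDA_ERROR_INVALID_VALUE marks one past the last parameter.
      if (cuFuncGetParamInfo(Func, I, &Offset, &Size) != CUDA_SUCCESS)
        break;
      Total = Offset + Size;
    }
    return Total;
  }

The loop mirrors initArgsSize() above: the last successful query yields
the exact footprint that CU_LAUNCH_PARAM_BUFFER_SIZE should advertise.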