[llvm] Revert "[offload] Fix CUDA args size by subtracting tail padding" (PR #173199)
via llvm-commits
llvm-commits at lists.llvm.org
Sun Dec 21 10:58:11 PST 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-offload
Author: Michał Górny (mgorny)
<details>
<summary>Changes</summary>
Reverts llvm/llvm-project#172249. The changes introduced a unittest that does not compile, per https://github.com/llvm/llvm-project/pull/172249#issuecomment-3679292737.
---
Full diff: https://github.com/llvm/llvm-project/pull/173199.diff
7 Files Affected:
- (modified) offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp (-1)
- (modified) offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h (-1)
- (modified) offload/plugins-nextgen/cuda/src/rtl.cpp (+2-31)
- (modified) offload/test/offloading/CUDA/basic_launch_multi_arg.cu (+3)
- (modified) offload/unittests/OffloadAPI/device_code/CMakeLists.txt (-2)
- (removed) offload/unittests/OffloadAPI/device_code/multiargs.cpp (-3)
- (modified) offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp (-14)
``````````diff
diff --git a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
index f630e8d850706..e7a1ca38b3c13 100644
--- a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
+++ b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
@@ -81,7 +81,6 @@ DLWRAP(cuDevicePrimaryCtxSetFlags, 2)
DLWRAP(cuDevicePrimaryCtxRetain, 2)
DLWRAP(cuModuleLoadDataEx, 5)
DLWRAP(cuOccupancyMaxPotentialBlockSize, 6)
-DLWRAP(cuFuncGetParamInfo, 4)
DLWRAP(cuDeviceCanAccessPeer, 3)
DLWRAP(cuCtxEnablePeerAccess, 2)
diff --git a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
index 7e42c66dddabb..a470d6df1079d 100644
--- a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
+++ b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
@@ -390,6 +390,5 @@ CUresult cuMemGetAllocationGranularity(size_t *granularity,
CUmemAllocationGranularity_flags option);
CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
CUoccupancyB2DSize, size_t, int);
-CUresult cuFuncGetParamInfo(CUfunction, size_t, size_t *, size_t *);
#endif
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index 3c41694bf9dc4..a27c6f3de0cd3 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -149,8 +149,7 @@ struct CUDAKernelTy : public GenericKernelTy {
// The maximum number of threads cannot exceed the maximum of the kernel.
MaxNumThreads = std::min(MaxNumThreads, (uint32_t)MaxThreads);
- // Retrieve the size of the arguments.
- return initArgsSize();
+ return Plugin::success();
}
/// Launch the CUDA kernel function.
@@ -174,32 +173,11 @@ struct CUDAKernelTy : public GenericKernelTy {
}
private:
- /// Initialize the size of the arguments.
- Error initArgsSize() {
- CUresult Res;
- size_t ArgOffset, ArgSize;
- size_t Arg = 0;
-
- ArgsSize = 0;
-
- // Find the last argument to know the total size of the arguments.
- while ((Res = cuFuncGetParamInfo(Func, Arg++, &ArgOffset, &ArgSize)) ==
- CUDA_SUCCESS)
- ArgsSize = ArgOffset + ArgSize;
-
- if (Res != CUDA_ERROR_INVALID_VALUE)
- return Plugin::check(Res, "error in cuFuncGetParamInfo: %s");
- return Plugin::success();
- }
-
/// The CUDA kernel function to execute.
CUfunction Func;
/// The maximum amount of dynamic shared memory per thread group. By default,
/// this is set to 48 KB.
mutable uint32_t MaxDynCGroupMemLimit = 49152;
-
- /// The size of the kernel arguments.
- size_t ArgsSize;
};
/// Class wrapping a CUDA stream reference. These are the objects handled by the
@@ -1452,12 +1430,6 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
AsyncInfoWrapperTy &AsyncInfoWrapper) const {
CUDADeviceTy &CUDADevice = static_cast<CUDADeviceTy &>(GenericDevice);
- // The args size passed in LaunchParams may have tail padding, which is not
- // accepted by the CUDA driver.
- if (ArgsSize > LaunchParams.Size)
- return Plugin::error(ErrorCode::INVALID_ARGUMENT,
- "mismatch in kernel arguments");
-
CUstream Stream;
if (auto Err = CUDADevice.getStream(AsyncInfoWrapper, Stream))
return Err;
@@ -1465,10 +1437,9 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
uint32_t MaxDynCGroupMem =
std::max(KernelArgs.DynCGroupMem, GenericDevice.getDynamicMemorySize());
- size_t ConfigArgsSize = ArgsSize;
void *Config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, LaunchParams.Data,
CU_LAUNCH_PARAM_BUFFER_SIZE,
- reinterpret_cast<void *>(&ConfigArgsSize),
+ reinterpret_cast<void *>(&LaunchParams.Size),
CU_LAUNCH_PARAM_END};
// If we are running an RPC server we want to wake up the server thread
diff --git a/offload/test/offloading/CUDA/basic_launch_multi_arg.cu b/offload/test/offloading/CUDA/basic_launch_multi_arg.cu
index 4e0f3a41a7a0c..7a32983f51f7c 100644
--- a/offload/test/offloading/CUDA/basic_launch_multi_arg.cu
+++ b/offload/test/offloading/CUDA/basic_launch_multi_arg.cu
@@ -6,6 +6,9 @@
// clang-format on
// REQUIRES: gpu
+//
+// FIXME: https://github.com/llvm/llvm-project/issues/161265
+// UNSUPPORTED: gpu
#include <stdio.h>
diff --git a/offload/unittests/OffloadAPI/device_code/CMakeLists.txt b/offload/unittests/OffloadAPI/device_code/CMakeLists.txt
index 22ebacf62e83e..1a042e1b38315 100644
--- a/offload/unittests/OffloadAPI/device_code/CMakeLists.txt
+++ b/offload/unittests/OffloadAPI/device_code/CMakeLists.txt
@@ -2,7 +2,6 @@ add_offload_test_device_code(foo.cpp foo)
add_offload_test_device_code(bar.cpp bar)
# Compile with optimizations to eliminate AMDGPU implicit arguments.
add_offload_test_device_code(noargs.cpp noargs -O3)
-add_offload_test_device_code(multiargs.cpp multiargs -O3)
add_offload_test_device_code(byte.cpp byte)
add_offload_test_device_code(localmem.cpp localmem)
add_offload_test_device_code(localmem_reduction.cpp localmem_reduction)
@@ -16,7 +15,6 @@ add_custom_target(offload_device_binaries DEPENDS
foo.bin
bar.bin
noargs.bin
- multiargs.bin
byte.bin
localmem.bin
localmem_reduction.bin
diff --git a/offload/unittests/OffloadAPI/device_code/multiargs.cpp b/offload/unittests/OffloadAPI/device_code/multiargs.cpp
deleted file mode 100644
index 265dad124e91e..0000000000000
--- a/offload/unittests/OffloadAPI/device_code/multiargs.cpp
+++ /dev/null
@@ -1,3 +0,0 @@
-#include <gpuintrin.h>
-
-extern "C" __gpu_kernel void multiargs(char, int *, short) { (void)0; }
diff --git a/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp b/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp
index 0845b9a1afdb7..c9eca36a4d447 100644
--- a/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp
+++ b/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp
@@ -55,7 +55,6 @@ struct LaunchSingleKernelTestBase : LaunchKernelTestBase {
KERNEL_TEST(Foo, foo)
KERNEL_TEST(NoArgs, noargs)
-KERNEL_TEST(MultiArgs, multiargs)
KERNEL_TEST(Byte, byte)
KERNEL_TEST(LocalMem, localmem)
KERNEL_TEST(LocalMemReduction, localmem_reduction)
@@ -136,19 +135,6 @@ TEST_P(olLaunchKernelNoArgsTest, Success) {
ASSERT_SUCCESS(olSyncQueue(Queue));
}
-TEST_P(olLaunchKernelMultiTest, Success) {
- struct {
- char A;
- int *B;
- short C;
- } Args{0, nullptr, 0};
-
- ASSERT_SUCCESS(
- olLaunchKernel(Queue, Device, Kernel, Args, sizeof(Args), &LaunchArgs));
-
- ASSERT_SUCCESS(olSyncQueue(Queue));
-}
-
TEST_P(olLaunchKernelFooTest, SuccessSynchronous) {
void *Mem;
ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED,
``````````
</details>
https://github.com/llvm/llvm-project/pull/173199
More information about the llvm-commits
mailing list