[clang] [llvm] [clang][CUDA] Avoid accounting for tail padding in LLVM offloading (PR #156229)
via llvm-commits
llvm-commits at lists.llvm.org
Sun Aug 31 00:49:08 PDT 2025
llvmbot wrote:
@llvm/pr-subscribers-clang
Author: Kevin Sala Penades (kevinsala)
Changes:
It seems that `cuLaunchKernel` expects the argument buffer size (`CU_LAUNCH_PARAM_BUFFER_SIZE`) to exclude tail padding. For example, for a kernel with arguments `int *A, short B`, the function requires a size of 12 bytes. However, we currently pass `sizeof(struct { int *A; short B; })`, which evaluates to 16 bytes.
This commit exposes both sizes through `KernelLaunchParamsTy` so that each plugin can decide which one to use. It fixes the `offload/test/offloading/CUDA/basic_launch_multi_arg.cu` test on NVIDIA GPUs, which was failing with the error _too many resources requested for launch_.
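For illustration, the difference between the two sizes can be reproduced on the host with plain `offsetof` arithmetic. This is a minimal sketch, not part of the patch: the struct is a hypothetical mirror of the flattened argument struct clang builds for the test's `square(int *Dst, short Q, int *Src, short P)` kernel, assuming a typical 64-bit host ABI.

```cpp
#include <cstddef>
#include <cstdio>

// Hypothetical mirror of the flattened argument struct for
// square(int *Dst, short Q, int *Src, short P).
struct KernelArgs {
  int *Dst; // offset 0
  short Q;  // offset 8
  int *Src; // offset 16
  short P;  // offset 24; last byte of real data at 25
};          // sizeof == 32 because of 6 bytes of tail padding

int main() {
  // What was passed before this patch: includes tail padding.
  size_t WithTailPadding = sizeof(KernelArgs);
  // What cuLaunchKernel appears to expect: stop after the last argument.
  size_t NoTailPadding = offsetof(KernelArgs, P) + sizeof(short);
  printf("%zu vs %zu\n", WithTailPadding, NoTailPadding); // prints "32 vs 26"
}
```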
---
Full diff: https://github.com/llvm/llvm-project/pull/156229.diff
5 Files Affected:
- (modified) clang/lib/CodeGen/CGCUDANV.cpp (+19-5)
- (modified) offload/include/Shared/APITypes.h (+2)
- (modified) offload/plugins-nextgen/common/src/PluginInterface.cpp (+3-1)
- (modified) offload/plugins-nextgen/cuda/src/rtl.cpp (+1-1)
- (modified) offload/test/offloading/CUDA/basic_launch_multi_arg.cu (+8)
``````````diff
diff --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp
index 5090a0559eab2..1f3492d57c6a1 100644
--- a/clang/lib/CodeGen/CGCUDANV.cpp
+++ b/clang/lib/CodeGen/CGCUDANV.cpp
@@ -327,9 +327,10 @@ void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF,
 /// (void*, short, void*) is passed as {void **, short *, void **} to the launch
 /// function. For the LLVM/offload launch we flatten the arguments into the
 /// struct directly. In addition, we include the size of the arguments, thus
-/// pass {sizeof({void *, short, void *}), ptr to {void *, short, void *},
-/// nullptr}. The last nullptr needs to be initialized to an array of pointers
-/// pointing to the arguments if we want to offload to the host.
+/// pass {size of ({void *, short, void *}) without tail padding, ptr to {void
+/// *, short, void *}, nullptr}. The last nullptr needs to be initialized to an
+/// array of pointers pointing to the arguments if we want to offload to the
+/// host.
 Address CGNVCUDARuntime::prepareKernelArgsLLVMOffload(CodeGenFunction &CGF,
                                                       FunctionArgList &Args) {
   SmallVector<llvm::Type *> ArgTypes, KernelLaunchParamsTypes;
@@ -339,6 +340,7 @@ Address CGNVCUDARuntime::prepareKernelArgsLLVMOffload(CodeGenFunction &CGF,
   auto *Int64Ty = CGF.Builder.getInt64Ty();
   KernelLaunchParamsTypes.push_back(Int64Ty);
+  KernelLaunchParamsTypes.push_back(Int64Ty);
   KernelLaunchParamsTypes.push_back(PtrTy);
   KernelLaunchParamsTypes.push_back(PtrTy);
@@ -351,12 +353,24 @@ Address CGNVCUDARuntime::prepareKernelArgsLLVMOffload(CodeGenFunction &CGF,
       "kernel_launch_params");
 
   auto KernelArgsSize = CGM.getDataLayout().getTypeAllocSize(KernelArgsTy);
+
+  // Avoid accounting the tail padding for CUDA.
+  auto KernelArgsSizeNoTailPadding = llvm::TypeSize::getZero();
+  if (auto N = KernelArgsTy->getNumElements()) {
+    auto *SL = CGM.getDataLayout().getStructLayout(KernelArgsTy);
+    KernelArgsSizeNoTailPadding = SL->getElementOffset(N - 1);
+    KernelArgsSizeNoTailPadding += CGM.getDataLayout().getTypeAllocSize(
+        KernelArgsTy->getElementType(N - 1));
+  }
+
   CGF.Builder.CreateStore(llvm::ConstantInt::get(Int64Ty, KernelArgsSize),
                           CGF.Builder.CreateStructGEP(KernelLaunchParams, 0));
-  CGF.Builder.CreateStore(KernelArgs.emitRawPointer(CGF),
+  CGF.Builder.CreateStore(llvm::ConstantInt::get(Int64Ty, KernelArgsSizeNoTailPadding),
                           CGF.Builder.CreateStructGEP(KernelLaunchParams, 1));
-  CGF.Builder.CreateStore(llvm::Constant::getNullValue(PtrTy),
+  CGF.Builder.CreateStore(KernelArgs.emitRawPointer(CGF),
                           CGF.Builder.CreateStructGEP(KernelLaunchParams, 2));
+  CGF.Builder.CreateStore(llvm::Constant::getNullValue(PtrTy),
+                          CGF.Builder.CreateStructGEP(KernelLaunchParams, 3));
 
   for (unsigned i = 0; i < Args.size(); ++i) {
     auto *ArgVal = CGF.Builder.CreateLoad(CGF.GetAddrOfLocalVar(Args[i]));
diff --git a/offload/include/Shared/APITypes.h b/offload/include/Shared/APITypes.h
index 8c150b6bfc2d4..52725a0474c6a 100644
--- a/offload/include/Shared/APITypes.h
+++ b/offload/include/Shared/APITypes.h
@@ -121,6 +121,8 @@ static_assert(sizeof(KernelArgsTy) ==
 struct KernelLaunchParamsTy {
   /// Size of the Data array.
   size_t Size = 0;
+  /// Size of the Data array without tail padding.
+  size_t SizeNoTailPadding = 0;
   /// Flat array of kernel parameters.
   void *Data = nullptr;
   /// Ptrs to the Data entries. Only strictly required for the host plugin.
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index d4b5f914c6672..238f6dccc6640 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -627,7 +627,9 @@ KernelLaunchParamsTy GenericKernelTy::prepareArgs(
         (void *)((intptr_t)ArgPtrs[I - KLEOffset] + ArgOffsets[I - KLEOffset]);
     Ptrs[I] = &Args[I];
   }
-  return KernelLaunchParamsTy{sizeof(void *) * NumArgs, &Args[0], &Ptrs[0]};
+
+  size_t ArgsSize = sizeof(void *) * NumArgs;
+  return KernelLaunchParamsTy{ArgsSize, ArgsSize, &Args[0], &Ptrs[0]};
 }
 
 uint32_t GenericKernelTy::getNumThreads(GenericDeviceTy &GenericDevice,
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index c7984287f7533..ddb21f1678a6a 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -1430,7 +1430,7 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
   void *Config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, LaunchParams.Data,
                     CU_LAUNCH_PARAM_BUFFER_SIZE,
-                    reinterpret_cast<void *>(&LaunchParams.Size),
+                    reinterpret_cast<void *>(&LaunchParams.SizeNoTailPadding),
                     CU_LAUNCH_PARAM_END};
 
   // If we are running an RPC server we want to wake up the server thread
diff --git a/offload/test/offloading/CUDA/basic_launch_multi_arg.cu b/offload/test/offloading/CUDA/basic_launch_multi_arg.cu
index 1f84a0e1288d4..ab6f753150932 100644
--- a/offload/test/offloading/CUDA/basic_launch_multi_arg.cu
+++ b/offload/test/offloading/CUDA/basic_launch_multi_arg.cu
@@ -23,6 +23,10 @@ __global__ void square(int *Dst, short Q, int *Src, short P) {
   Src[1] = P;
 }
 
+__global__ void accumulate(short Q, int *Dst, char P) {
+  *Dst += Q + P;
+}
+
 int main(int argc, char **argv) {
   int DevNo = 0;
   int *Ptr = reinterpret_cast<int *>(llvm_omp_target_alloc_shared(4, DevNo));
@@ -39,5 +43,9 @@ int main(int argc, char **argv) {
   // CHECK: Ptr [[Ptr]], *Ptr: 42
   printf("Src: %i : %i\n", Src[0], Src[1]);
   // CHECK: Src: 3 : 4
+  accumulate<<<1, 1>>>(3, Ptr, 7);
+  printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr);
+  // CHECK: Ptr [[Ptr]], *Ptr: 52
   llvm_omp_target_free_shared(Ptr, DevNo);
+  llvm_omp_target_free_shared(Src, DevNo);
 }
``````````
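The clang-side computation can also be exercised in isolation. The following is a minimal sketch, not part of the patch: it rebuilds the same last-element-offset-plus-alloc-size logic as `prepareKernelArgsLLVMOffload` against an assumed x86-64-style data layout string, using the same flattened struct as the `square` kernel above.

```cpp
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

int main() {
  LLVMContext Ctx;
  // Flattened argument struct for square(int *Dst, short Q, int *Src, short P):
  // {ptr, i16, ptr, i16}.
  auto *KernelArgsTy =
      StructType::get(Ctx, {PointerType::get(Ctx, 0), Type::getInt16Ty(Ctx),
                            PointerType::get(Ctx, 0), Type::getInt16Ty(Ctx)});

  // Assumed x86-64-style data layout (64-bit pointers, i16 aligned to 2).
  DataLayout DL("e-m:e-i64:64-f80:128-n8:16:32:64-S128");

  // Alloc size includes tail padding; this is what was passed before.
  uint64_t WithTailPadding = DL.getTypeAllocSize(KernelArgsTy);

  // Offset of the last element plus its alloc size, as the patch computes.
  uint64_t NoTailPadding = 0;
  if (unsigned N = KernelArgsTy->getNumElements()) {
    const StructLayout *SL = DL.getStructLayout(KernelArgsTy);
    NoTailPadding = SL->getElementOffset(N - 1) +
                    DL.getTypeAllocSize(KernelArgsTy->getElementType(N - 1));
  }

  outs() << WithTailPadding << " vs " << NoTailPadding << "\n"; // "32 vs 26"
}
```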
https://github.com/llvm/llvm-project/pull/156229