[clang] [llvm] [clang][CUDA] Avoid accounting for tail padding in LLVM offloading (PR #156229)

Wed Sep 3 10:44:44 PDT 2025

kevinsala wrote:

My understanding is that the CUDA Driver API requires the arguments in the buffer to be placed with the proper alignment (i.e., padding between fields). However, the trailing padding after the last element should not be accounted. Otherwise, if it's accounted, the `cuLaunchKernel` call fails.

The documentation of `cuLaunchKernel` says:

> Kernel parameters can also be packaged by the application into a single buffer that is passed in via the extra parameter. This places the burden on the application of knowing each kernel parameter's size and alignment/padding within the buffer.

and

> CU_LAUNCH_PARAM_BUFFER_SIZE, which specifies that the next value in extra will be a pointer to a size_t containing the size of the buffer specified with CU_LAUNCH_PARAM_BUFFER_POINTER.

The following code is a simple reproducer that works directly on top of the CUDA Driver API:

`kernel.cu`:
```c++
extern "C" __global__ void kernel(int *arg1, short arg2, int *arg3, short arg4) {
  *arg1 = arg2;
  *arg3 = arg4;
}
```

`main.cu`:
```c++
#include <cstdio>
#include <cuda.h>

#define CU_CHECK(err) \
  do { \
    CUresult err__ = (err); \
    if (err__ != CUDA_SUCCESS) { \
      const char *errStr; \
      cuGetErrorString(err__, &errStr); \
      fprintf(stderr, "Error: %s\n", errStr ? errStr : "Unknown"); \
      exit(1); \
    } \
  } while (0)

int main(int argc, char **argv) {
  CU_CHECK(cuInit(0));

  CUdevice device;
  CU_CHECK(cuDeviceGet(&device, 0));

  CUcontext context;
  CU_CHECK(cuCtxCreate(&context, 0, device));

  CUmodule module;
  CU_CHECK(cuModuleLoad(&module, "kernel.cubin"));

  CUfunction kernel;
  CU_CHECK(cuModuleGetFunction(&kernel, module, "kernel"));

  CUdeviceptr d_arg1, d_arg3;
  CU_CHECK(cuMemAlloc(&d_arg1, sizeof(int)));
  CU_CHECK(cuMemAlloc(&d_arg3, sizeof(int)));

  short arg2 = 2, arg4 = 4;

  struct Args {
    CUdeviceptr arg1;
    short arg2;
    CUdeviceptr arg3;
    short arg4;
  };

  Args args = { d_arg1, arg2, d_arg3, arg4 };
  size_t size = 8 + 8 + 8 + 2; // OK
  // size_t size = sizeof(Args); // ERROR

  void *config[] = { CU_LAUNCH_PARAM_BUFFER_POINTER, &args,
                     CU_LAUNCH_PARAM_BUFFER_SIZE,
                     reinterpret_cast<void *>(&size),
                     CU_LAUNCH_PARAM_END };

  CU_CHECK(cuLaunchKernel(
    kernel, 1, 1, 1, 1, 1, 1,
    0, 0, nullptr, config
  ));

  CU_CHECK(cuCtxSynchronize());

  int h_out = 0;
  CU_CHECK(cuMemcpyDtoH(&h_out, d_arg1, sizeof(int)));
  printf("Result from kernel: %d\n", h_out);

  CU_CHECK(cuMemFree(d_arg1));
  CU_CHECK(cuMemFree(d_arg3));
  CU_CHECK(cuModuleUnload(module));
  CU_CHECK(cuCtxDestroy(context));

  return 0;
}
```

Commands to build the reproducer:
```sh
nvcc -arch=sm_90 --cubin kernel.cu -o kernel.cubin
nvcc -arch=sm_90 main.cu -o main -lcuda
./main
```

The work works at is it, passing the size skipping the trailing padding. If it is replaced by `sizeof(Args)`, the CUDA call fails.

https://github.com/llvm/llvm-project/pull/156229