[clang-tools-extra] [clang-tidy] Add documentation and smoke test for CUDA (PR #173699)

Sat Dec 27 02:25:09 PST 2025

https://github.com/zeyi2 updated https://github.com/llvm/llvm-project/pull/173699

>From 1fe8fff7fdf129c117d86d4bac19c877032b5f3d Mon Sep 17 00:00:00 2001
From: mtx <mitchell.xu2 at gmail.com>
Date: Sat, 27 Dec 2025 12:12:09 +0800
Subject: [PATCH 1/3] [clang-tidy] Add documentation and smoke test for CUDA

---
 clang-tools-extra/docs/clang-tidy/index.rst        | 14 ++++++++++++++
 .../test/clang-tidy/infrastructure/basic-cuda.cu   |  9 +++++++++
 2 files changed, 23 insertions(+)
 create mode 100644 clang-tools-extra/test/clang-tidy/infrastructure/basic-cuda.cu

diff --git a/clang-tools-extra/docs/clang-tidy/index.rst b/clang-tools-extra/docs/clang-tidy/index.rst
index 34da529902308..38aabc77540cf 100644
--- a/clang-tools-extra/docs/clang-tidy/index.rst
+++ b/clang-tools-extra/docs/clang-tidy/index.rst
@@ -349,6 +349,20 @@ An overview of all the command-line options:
         some-check.SomeOption: 'some value'
       ...
 
+Running Clang-Tidy on CUDA Files
+--------------------------------
+
+:program:`clang-tidy` supports analyzing CUDA source files.
+To correctly process host-side code, specify the CUDA toolkit path using
+``--cuda-path`` and limit compilation to the host with ``--cuda-host-only``.
+
+.. code-block:: console
+
+  $ clang-tidy source.cu -- --cuda-path=/path/to/cuda --cuda-host-only
+
+Using ``--cuda-host-only`` is recommended as it skips device-side compilation,
+speeding up the analysis and avoiding potential device-specific errors.
+
 Clang-Tidy Automation
 =====================
 
diff --git a/clang-tools-extra/test/clang-tidy/infrastructure/basic-cuda.cu b/clang-tools-extra/test/clang-tidy/infrastructure/basic-cuda.cu
new file mode 100644
index 0000000000000..3bc605d864461
--- /dev/null
+++ b/clang-tools-extra/test/clang-tidy/infrastructure/basic-cuda.cu
@@ -0,0 +1,9 @@
+// RUN: clang-tidy %s -checks='-*,modernize-use-nullptr' -- -nocudainc -nocudalib --cuda-host-only | FileCheck %s
+
+#define __global__ __attribute__((global))
+
+// CHECK: :[[@LINE+1]]:38: warning: use nullptr [modernize-use-nullptr]
+__global__ void kernel(int *p) { p = 0; }
+
+// CHECK: :[[@LINE+1]]:11: warning: use nullptr [modernize-use-nullptr]
+void *p = 0;

>From 13e3f45d598d5a62e475c0256b629a6e4fad103c Mon Sep 17 00:00:00 2001
From: mtx <mitchell.xu2 at gmail.com>
Date: Sat, 27 Dec 2025 17:53:47 +0800
Subject: [PATCH 2/3] Address review feedback

---
 clang-tools-extra/docs/clang-tidy/index.rst   |  17 +-
 .../usr/local/cuda/include/cuda_runtime.h     | 253 ++++++++++++++++++
 .../clang-tidy/infrastructure/basic-cuda.cu   |  11 +-
 3 files changed, 273 insertions(+), 8 deletions(-)
 create mode 100644 clang-tools-extra/test/clang-tidy/infrastructure/Inputs/CUDA/usr/local/cuda/include/cuda_runtime.h

diff --git a/clang-tools-extra/docs/clang-tidy/index.rst b/clang-tools-extra/docs/clang-tidy/index.rst
index 38aabc77540cf..4a0bab8693878 100644
--- a/clang-tools-extra/docs/clang-tidy/index.rst
+++ b/clang-tools-extra/docs/clang-tidy/index.rst
@@ -352,16 +352,21 @@ An overview of all the command-line options:
 Running Clang-Tidy on CUDA Files
 --------------------------------
 
-:program:`clang-tidy` supports analyzing CUDA source files.
-To correctly process host-side code, specify the CUDA toolkit path using
-``--cuda-path`` and limit compilation to the host with ``--cuda-host-only``.
+:program:`clang-tidy` supports analyzing CUDA source files. To ensure correct
+header resolution, it is important to specify the CUDA toolkit path using
+``--cuda-path``. For more details on how Clang handles CUDA, see
+`Compiling CUDA with Clang <https://llvm.org/docs/CompileCudaWithLLVM.html>`_.
 
 .. code-block:: console
 
-  $ clang-tidy source.cu -- --cuda-path=/path/to/cuda --cuda-host-only
+  $ clang-tidy source.cu -- --cuda-path=/path/to/cuda
 
-Using ``--cuda-host-only`` is recommended as it skips device-side compilation,
-speeding up the analysis and avoiding potential device-specific errors.
+By default, :program:`clang-tidy` will compile the code for the host. To
+analyze device-side code, use the ``--cuda-device-only`` flag:
+
+.. code-block:: console
+
+  $ clang-tidy source.cu -- --cuda-path=/path/to/cuda --cuda-device-only
 
 Clang-Tidy Automation
 =====================
diff --git a/clang-tools-extra/test/clang-tidy/infrastructure/Inputs/CUDA/usr/local/cuda/include/cuda_runtime.h b/clang-tools-extra/test/clang-tidy/infrastructure/Inputs/CUDA/usr/local/cuda/include/cuda_runtime.h
new file mode 100644
index 0000000000000..421fa4dd7dbae
--- /dev/null
+++ b/clang-tools-extra/test/clang-tidy/infrastructure/Inputs/CUDA/usr/local/cuda/include/cuda_runtime.h
@@ -0,0 +1,253 @@
+/* Minimal declarations for CUDA support.  Testing purposes only. */
+
+#include <stddef.h>
+
+#if __HIP__ || __CUDA__
+#define __constant__ __attribute__((constant))
+#define __device__ __attribute__((device))
+#define __global__ __attribute__((global))
+#define __host__ __attribute__((host))
+#define __shared__ __attribute__((shared))
+#if __HIP__
+#define __managed__ __attribute__((managed))
+#endif
+#define __launch_bounds__(...) __attribute__((launch_bounds(__VA_ARGS__)))
+#define __grid_constant__ __attribute__((grid_constant))
+#define __cluster_dims__(...) __attribute__((cluster_dims(__VA_ARGS__)))
+#define __no_cluster__ __attribute__((no_cluster))
+#else
+#define __constant__
+#define __device__
+#define __global__
+#define __host__
+#define __shared__
+#define __managed__
+#define __launch_bounds__(...)
+#define __grid_constant__
+#define __cluster_dims__(...)
+#define __no_cluster__
+#endif
+
+struct dim3 {
+  unsigned x, y, z;
+  __host__ __device__ dim3(unsigned x, unsigned y = 1, unsigned z = 1) : x(x), y(y), z(z) {}
+};
+
+#if __HIP__ || HIP_PLATFORM
+typedef struct hipStream *hipStream_t;
+typedef enum hipError {} hipError_t;
+int hipConfigureCall(dim3 gridSize, dim3 blockSize, size_t sharedSize = 0,
+                     hipStream_t stream = 0);
+extern "C" hipError_t __hipPushCallConfiguration(dim3 gridSize, dim3 blockSize,
+                                                 size_t sharedSize = 0,
+                                                 hipStream_t stream = 0);
+#ifndef __HIP_API_PER_THREAD_DEFAULT_STREAM__
+extern "C" hipError_t hipLaunchKernel(const void *func, dim3 gridDim,
+                                      dim3 blockDim, void **args,
+                                      size_t sharedMem,
+                                      hipStream_t stream);
+#else
+extern "C" hipError_t hipLaunchKernel_spt(const void *func, dim3 gridDim,
+                                      dim3 blockDim, void **args,
+                                      size_t sharedMem,
+                                      hipStream_t stream);
+#endif // __HIP_API_PER_THREAD_DEFAULT_STREAM__
+#elif __OFFLOAD_VIA_LLVM__
+extern "C" unsigned __llvmPushCallConfiguration(dim3 gridDim, dim3 blockDim,
+                                     size_t sharedMem = 0, void *stream = 0);
+extern "C" unsigned llvmLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
+                          void **args, size_t sharedMem = 0, void *stream = 0);
+#else
+typedef struct cudaStream *cudaStream_t;
+typedef enum cudaError {} cudaError_t;
+extern "C" int cudaConfigureCall(dim3 gridSize, dim3 blockSize,
+                                 size_t sharedSize = 0,
+                                 cudaStream_t stream = 0);
+extern "C" int __cudaPushCallConfiguration(dim3 gridSize, dim3 blockSize,
+                                           size_t sharedSize = 0,
+                                           cudaStream_t stream = 0);
+extern "C" cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim,
+                                        dim3 blockDim, void **args,
+                                        size_t sharedMem, cudaStream_t stream);
+extern "C" cudaError_t cudaLaunchKernel_ptsz(const void *func, dim3 gridDim,
+                                        dim3 blockDim, void **args,
+                                        size_t sharedMem, cudaStream_t stream);
+extern "C" __device__ cudaError_t cudaLaunchDevice(void *func,
+                                                   void *parameterBuffer,
+                                                   dim3 gridDim, dim3 blockDim,
+                                                   unsigned int sharedMem,
+                                                   cudaStream_t stream);
+extern "C" __device__ void *cudaGetParameterBuffer(size_t alignment,
+                                                   size_t size);
+#endif
+
+extern "C" __device__ int printf(const char*, ...);
+
+struct char1 {
+  char x;
+  __host__ __device__ char1(char x = 0) : x(x) {}
+};
+struct char2 {
+  char x, y;
+  __host__ __device__ char2(char x = 0, char y = 0) : x(x), y(y) {}
+};
+struct char4 {
+  char x, y, z, w;
+  __host__ __device__ char4(char x = 0, char y = 0, char z = 0, char w = 0) : x(x), y(y), z(z), w(w) {}
+};
+
+struct uchar1 {
+  unsigned char x;
+  __host__ __device__ uchar1(unsigned char x = 0) : x(x) {}
+};
+struct uchar2 {
+  unsigned char x, y;
+  __host__ __device__ uchar2(unsigned char x = 0, unsigned char y = 0) : x(x), y(y) {}
+};
+struct uchar4 {
+  unsigned char x, y, z, w;
+  __host__ __device__ uchar4(unsigned char x = 0, unsigned char y = 0, unsigned char z = 0, unsigned char w = 0) : x(x), y(y), z(z), w(w) {}
+};
+
+struct short1 {
+  short x;
+  __host__ __device__ short1(short x = 0) : x(x) {}
+};
+struct short2 {
+  short x, y;
+  __host__ __device__ short2(short x = 0, short y = 0) : x(x), y(y) {}
+};
+struct short4 {
+  short x, y, z, w;
+  __host__ __device__ short4(short x = 0, short y = 0, short z = 0, short w = 0) : x(x), y(y), z(z), w(w) {}
+};
+
+struct ushort1 {
+  unsigned short x;
+  __host__ __device__ ushort1(unsigned short x = 0) : x(x) {}
+};
+struct ushort2 {
+  unsigned short x, y;
+  __host__ __device__ ushort2(unsigned short x = 0, unsigned short y = 0) : x(x), y(y) {}
+};
+struct ushort4 {
+  unsigned short x, y, z, w;
+  __host__ __device__ ushort4(unsigned short x = 0, unsigned short y = 0, unsigned short z = 0, unsigned short w = 0) : x(x), y(y), z(z), w(w) {}
+};
+
+struct int1 {
+  int x;
+  __host__ __device__ int1(int x = 0) : x(x) {}
+};
+struct int2 {
+  int x, y;
+  __host__ __device__ int2(int x = 0, int y = 0) : x(x), y(y) {}
+};
+struct int4 {
+  int x, y, z, w;
+  __host__ __device__ int4(int x = 0, int y = 0, int z = 0, int w = 0) : x(x), y(y), z(z), w(w) {}
+};
+
+struct uint1 {
+  unsigned x;
+  __host__ __device__ uint1(unsigned x = 0) : x(x) {}
+};
+struct uint2 {
+  unsigned x, y;
+  __host__ __device__ uint2(unsigned x = 0, unsigned y = 0) : x(x), y(y) {}
+};
+struct uint3 {
+  unsigned x, y, z;
+  __host__ __device__ uint3(unsigned x = 0, unsigned y = 0, unsigned z = 0) : x(x), y(y), z(z) {}
+};
+struct uint4 {
+  unsigned x, y, z, w;
+  __host__ __device__ uint4(unsigned x = 0, unsigned y = 0, unsigned z = 0, unsigned w = 0) : x(x), y(y), z(z), w(w) {}
+};
+
+struct longlong1 {
+  long long x;
+  __host__ __device__ longlong1(long long x = 0) : x(x) {}
+};
+struct longlong2 {
+  long long x, y;
+  __host__ __device__ longlong2(long long x = 0, long long y = 0) : x(x), y(y) {}
+};
+struct longlong4 {
+  long long x, y, z, w;
+  __host__ __device__ longlong4(long long x = 0, long long y = 0, long long z = 0, long long w = 0) : x(x), y(y), z(z), w(w) {}
+};
+
+struct ulonglong1 {
+  unsigned long long x;
+  __host__ __device__ ulonglong1(unsigned long long x = 0) : x(x) {}
+};
+struct ulonglong2 {
+  unsigned long long x, y;
+  __host__ __device__ ulonglong2(unsigned long long x = 0, unsigned long long y = 0) : x(x), y(y) {}
+};
+struct ulonglong4 {
+  unsigned long long x, y, z, w;
+  __host__ __device__ ulonglong4(unsigned long long x = 0, unsigned long long y = 0, unsigned long long z = 0, unsigned long long w = 0) : x(x), y(y), z(z), w(w) {}
+};
+
+struct float1 {
+  float x;
+  __host__ __device__ float1(float x = 0) : x(x) {}
+};
+struct float2 {
+  float x, y;
+  __host__ __device__ float2(float x = 0, float y = 0) : x(x), y(y) {}
+};
+struct float4 {
+  float x, y, z, w;
+  __host__ __device__ float4(float x = 0, float y = 0, float z = 0, float w = 0) : x(x), y(y), z(z), w(w) {}
+};
+
+struct double1 {
+  double x;
+  __host__ __device__ double1(double x = 0) : x(x) {}
+};
+struct double2 {
+  double x, y;
+  __host__ __device__ double2(double x = 0, double y = 0) : x(x), y(y) {}
+};
+struct double4 {
+  double x, y, z, w;
+  __host__ __device__ double4(double x = 0, double y = 0, double z = 0, double w = 0) : x(x), y(y), z(z), w(w) {}
+};
+
+typedef unsigned long long cudaTextureObject_t;
+typedef unsigned long long cudaSurfaceObject_t;
+
+enum cudaTextureReadMode {
+  cudaReadModeNormalizedFloat,
+  cudaReadModeElementType
+};
+
+enum cudaSurfaceBoundaryMode {
+  cudaBoundaryModeZero,
+  cudaBoundaryModeClamp,
+  cudaBoundaryModeTrap
+};
+
+enum {
+  cudaTextureType1D,
+  cudaTextureType2D,
+  cudaTextureType3D,
+  cudaTextureTypeCubemap,
+  cudaTextureType1DLayered,
+  cudaTextureType2DLayered,
+  cudaTextureTypeCubemapLayered
+};
+
+struct textureReference { };
+template <class T, int texType = cudaTextureType1D,
+          enum cudaTextureReadMode mode = cudaReadModeElementType>
+struct __attribute__((device_builtin_texture_type)) texture
+    : public textureReference {};
+
+struct surfaceReference { int desc; };
+
+template <typename T, int dim = 1>
+struct __attribute__((device_builtin_surface_type)) surface : public surfaceReference {};
diff --git a/clang-tools-extra/test/clang-tidy/infrastructure/basic-cuda.cu b/clang-tools-extra/test/clang-tidy/infrastructure/basic-cuda.cu
index 3bc605d864461..37b3b5ab7ade6 100644
--- a/clang-tools-extra/test/clang-tidy/infrastructure/basic-cuda.cu
+++ b/clang-tools-extra/test/clang-tidy/infrastructure/basic-cuda.cu
@@ -1,6 +1,13 @@
-// RUN: clang-tidy %s -checks='-*,modernize-use-nullptr' -- -nocudainc -nocudalib --cuda-host-only | FileCheck %s
+// RUN: clang-tidy %s -checks='-*,modernize-use-nullptr' -- \
+// RUN:   --cuda-path=%S/Inputs/CUDA/usr/local/cuda \
+// RUN:   -nocudalib -nocudainc -I %S/Inputs/CUDA/usr/local/cuda/include \
+// RUN:   --cuda-host-only | FileCheck %s
+// RUN: clang-tidy %s -checks='-*,modernize-use-nullptr' -- \
+// RUN:   --cuda-path=%S/Inputs/CUDA/usr/local/cuda \
+// RUN:   -nocudalib -nocudainc -I %S/Inputs/CUDA/usr/local/cuda/include \
+// RUN:   --cuda-device-only | FileCheck %s
 
-#define __global__ __attribute__((global))
+#include <cuda_runtime.h>
 
 // CHECK: :[[@LINE+1]]:38: warning: use nullptr [modernize-use-nullptr]
 __global__ void kernel(int *p) { p = 0; }

>From 05c0eb5d389cad87e6c6ff680801e1bb1e1d85ad Mon Sep 17 00:00:00 2001
From: mtx <mitchell.xu2 at gmail.com>
Date: Sat, 27 Dec 2025 18:21:41 +0800
Subject: [PATCH 3/3] Cleanup

---
 .../CUDA/{usr/local/cuda/include => }/cuda_runtime.h      | 0
 .../test/clang-tidy/infrastructure/basic-cuda.cu          | 8 ++++----
 2 files changed, 4 insertions(+), 4 deletions(-)
 rename clang-tools-extra/test/clang-tidy/infrastructure/Inputs/CUDA/{usr/local/cuda/include => }/cuda_runtime.h (100%)

diff --git a/clang-tools-extra/test/clang-tidy/infrastructure/Inputs/CUDA/usr/local/cuda/include/cuda_runtime.h b/clang-tools-extra/test/clang-tidy/infrastructure/Inputs/CUDA/cuda_runtime.h
similarity index 100%
rename from clang-tools-extra/test/clang-tidy/infrastructure/Inputs/CUDA/usr/local/cuda/include/cuda_runtime.h
rename to clang-tools-extra/test/clang-tidy/infrastructure/Inputs/CUDA/cuda_runtime.h
diff --git a/clang-tools-extra/test/clang-tidy/infrastructure/basic-cuda.cu b/clang-tools-extra/test/clang-tidy/infrastructure/basic-cuda.cu
index 37b3b5ab7ade6..db0260b9bc5d8 100644
--- a/clang-tools-extra/test/clang-tidy/infrastructure/basic-cuda.cu
+++ b/clang-tools-extra/test/clang-tidy/infrastructure/basic-cuda.cu
@@ -1,10 +1,10 @@
 // RUN: clang-tidy %s -checks='-*,modernize-use-nullptr' -- \
-// RUN:   --cuda-path=%S/Inputs/CUDA/usr/local/cuda \
-// RUN:   -nocudalib -nocudainc -I %S/Inputs/CUDA/usr/local/cuda/include \
+// RUN:   --cuda-path=%S/Inputs/CUDA \
+// RUN:   -nocudalib -nocudainc -I %S/Inputs/CUDA \
 // RUN:   --cuda-host-only | FileCheck %s
 // RUN: clang-tidy %s -checks='-*,modernize-use-nullptr' -- \
-// RUN:   --cuda-path=%S/Inputs/CUDA/usr/local/cuda \
-// RUN:   -nocudalib -nocudainc -I %S/Inputs/CUDA/usr/local/cuda/include \
+// RUN:   --cuda-path=%S/Inputs/CUDA \
+// RUN:   -nocudalib -nocudainc -I %S/Inputs/CUDA \
 // RUN:   --cuda-device-only | FileCheck %s
 
 #include <cuda_runtime.h>