[clang-tools-extra] [clang-tidy] Add documentation and smoke test for CUDA (PR #173699)

Sun Dec 28 02:13:35 PST 2025

https://github.com/zeyi2 updated https://github.com/llvm/llvm-project/pull/173699

>From 1fe8fff7fdf129c117d86d4bac19c877032b5f3d Mon Sep 17 00:00:00 2001
From: mtx <mitchell.xu2 at gmail.com>
Date: Sat, 27 Dec 2025 12:12:09 +0800
Subject: [PATCH 1/4] [clang-tidy] Add documentation and smoke test for CUDA

---
 clang-tools-extra/docs/clang-tidy/index.rst        | 14 ++++++++++++++
 .../test/clang-tidy/infrastructure/basic-cuda.cu   |  9 +++++++++
 2 files changed, 23 insertions(+)
 create mode 100644 clang-tools-extra/test/clang-tidy/infrastructure/basic-cuda.cu

diff --git a/clang-tools-extra/docs/clang-tidy/index.rst b/clang-tools-extra/docs/clang-tidy/index.rst
index 34da529902308..38aabc77540cf 100644
--- a/clang-tools-extra/docs/clang-tidy/index.rst
+++ b/clang-tools-extra/docs/clang-tidy/index.rst
@@ -349,6 +349,20 @@ An overview of all the command-line options:
         some-check.SomeOption: 'some value'
       ...
 
+Running Clang-Tidy on CUDA Files
+--------------------------------
+
+:program:`clang-tidy` supports analyzing CUDA source files.
+To correctly process host-side code, specify the CUDA toolkit path using
+``--cuda-path`` and limit compilation to the host with ``--cuda-host-only``.
+
+.. code-block:: console
+
+  $ clang-tidy source.cu -- --cuda-path=/path/to/cuda --cuda-host-only
+
+Using ``--cuda-host-only`` is recommended as it skips device-side compilation,
+speeding up the analysis and avoiding potential device-specific errors.
+
 Clang-Tidy Automation
 =====================
 
diff --git a/clang-tools-extra/test/clang-tidy/infrastructure/basic-cuda.cu b/clang-tools-extra/test/clang-tidy/infrastructure/basic-cuda.cu
new file mode 100644
index 0000000000000..3bc605d864461
--- /dev/null
+++ b/clang-tools-extra/test/clang-tidy/infrastructure/basic-cuda.cu
@@ -0,0 +1,9 @@
+// RUN: clang-tidy %s -checks='-*,modernize-use-nullptr' -- -nocudainc -nocudalib --cuda-host-only | FileCheck %s
+
+#define __global__ __attribute__((global))
+
+// CHECK: :[[@LINE+1]]:38: warning: use nullptr [modernize-use-nullptr]
+__global__ void kernel(int *p) { p = 0; }
+
+// CHECK: :[[@LINE+1]]:11: warning: use nullptr [modernize-use-nullptr]
+void *p = 0;

>From 13e3f45d598d5a62e475c0256b629a6e4fad103c Mon Sep 17 00:00:00 2001
From: mtx <mitchell.xu2 at gmail.com>
Date: Sat, 27 Dec 2025 17:53:47 +0800
Subject: [PATCH 2/4] Address review feedback

---
 clang-tools-extra/docs/clang-tidy/index.rst   |  17 +-
 .../usr/local/cuda/include/cuda_runtime.h     | 253 ++++++++++++++++++
 .../clang-tidy/infrastructure/basic-cuda.cu   |  11 +-
 3 files changed, 273 insertions(+), 8 deletions(-)
 create mode 100644 clang-tools-extra/test/clang-tidy/infrastructure/Inputs/CUDA/usr/local/cuda/include/cuda_runtime.h

diff --git a/clang-tools-extra/docs/clang-tidy/index.rst b/clang-tools-extra/docs/clang-tidy/index.rst
index 38aabc77540cf..4a0bab8693878 100644
--- a/clang-tools-extra/docs/clang-tidy/index.rst
+++ b/clang-tools-extra/docs/clang-tidy/index.rst
@@ -352,16 +352,21 @@ An overview of all the command-line options:
 Running Clang-Tidy on CUDA Files
 --------------------------------
 
-:program:`clang-tidy` supports analyzing CUDA source files.
-To correctly process host-side code, specify the CUDA toolkit path using
-``--cuda-path`` and limit compilation to the host with ``--cuda-host-only``.
+:program:`clang-tidy` supports analyzing CUDA source files. To ensure correct
+header resolution, it is important to specify the CUDA toolkit path using
+``--cuda-path``. For more details on how Clang handles CUDA, see
+`Compiling CUDA with Clang <https://llvm.org/docs/CompileCudaWithLLVM.html>`_.
 
 .. code-block:: console
 
-  $ clang-tidy source.cu -- --cuda-path=/path/to/cuda --cuda-host-only
+  $ clang-tidy source.cu -- --cuda-path=/path/to/cuda
 
-Using ``--cuda-host-only`` is recommended as it skips device-side compilation,
-speeding up the analysis and avoiding potential device-specific errors.
+By default, :program:`clang-tidy` will compile the code for the host. To
+analyze device-side code, use the ``--cuda-device-only`` flag:
+
+.. code-block:: console
+
+  $ clang-tidy source.cu -- --cuda-path=/path/to/cuda --cuda-device-only
 
 Clang-Tidy Automation
 =====================
diff --git a/clang-tools-extra/test/clang-tidy/infrastructure/Inputs/CUDA/usr/local/cuda/include/cuda_runtime.h b/clang-tools-extra/test/clang-tidy/infrastructure/Inputs/CUDA/usr/local/cuda/include/cuda_runtime.h
new file mode 100644
index 0000000000000..421fa4dd7dbae
--- /dev/null
+++ b/clang-tools-extra/test/clang-tidy/infrastructure/Inputs/CUDA/usr/local/cuda/include/cuda_runtime.h
@@ -0,0 +1,253 @@
+/* Minimal declarations for CUDA support.  Testing purposes only. */
+
+#include <stddef.h>
+
+#if __HIP__ || __CUDA__
+#define __constant__ __attribute__((constant))
+#define __device__ __attribute__((device))
+#define __global__ __attribute__((global))
+#define __host__ __attribute__((host))
+#define __shared__ __attribute__((shared))
+#if __HIP__
+#define __managed__ __attribute__((managed))
+#endif
+#define __launch_bounds__(...) __attribute__((launch_bounds(__VA_ARGS__)))
+#define __grid_constant__ __attribute__((grid_constant))
+#define __cluster_dims__(...) __attribute__((cluster_dims(__VA_ARGS__)))
+#define __no_cluster__ __attribute__((no_cluster))
+#else
+#define __constant__
+#define __device__
+#define __global__
+#define __host__
+#define __shared__
+#define __managed__
+#define __launch_bounds__(...)
+#define __grid_constant__
+#define __cluster_dims__(...)
+#define __no_cluster__
+#endif
+
+struct dim3 {
+  unsigned x, y, z;
+  __host__ __device__ dim3(unsigned x, unsigned y = 1, unsigned z = 1) : x(x), y(y), z(z) {}
+};
+
+#if __HIP__ || HIP_PLATFORM
+typedef struct hipStream *hipStream_t;
+typedef enum hipError {} hipError_t;
+int hipConfigureCall(dim3 gridSize, dim3 blockSize, size_t sharedSize = 0,
+                     hipStream_t stream = 0);
+extern "C" hipError_t __hipPushCallConfiguration(dim3 gridSize, dim3 blockSize,
+                                                 size_t sharedSize = 0,
+                                                 hipStream_t stream = 0);
+#ifndef __HIP_API_PER_THREAD_DEFAULT_STREAM__
+extern "C" hipError_t hipLaunchKernel(const void *func, dim3 gridDim,
+                                      dim3 blockDim, void **args,
+                                      size_t sharedMem,
+                                      hipStream_t stream);
+#else
+extern "C" hipError_t hipLaunchKernel_spt(const void *func, dim3 gridDim,
+                                      dim3 blockDim, void **args,
+                                      size_t sharedMem,
+                                      hipStream_t stream);
+#endif // __HIP_API_PER_THREAD_DEFAULT_STREAM__
+#elif __OFFLOAD_VIA_LLVM__
+extern "C" unsigned __llvmPushCallConfiguration(dim3 gridDim, dim3 blockDim,
+                                     size_t sharedMem = 0, void *stream = 0);
+extern "C" unsigned llvmLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
+                          void **args, size_t sharedMem = 0, void *stream = 0);
+#else
+typedef struct cudaStream *cudaStream_t;
+typedef enum cudaError {} cudaError_t;
+extern "C" int cudaConfigureCall(dim3 gridSize, dim3 blockSize,
+                                 size_t sharedSize = 0,
+                                 cudaStream_t stream = 0);
+extern "C" int __cudaPushCallConfiguration(dim3 gridSize, dim3 blockSize,
+                                           size_t sharedSize = 0,
+                                           cudaStream_t stream = 0);
+extern "C" cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim,
+                                        dim3 blockDim, void **args,
+                                        size_t sharedMem, cudaStream_t stream);
+extern "C" cudaError_t cudaLaunchKernel_ptsz(const void *func, dim3 gridDim,
+                                        dim3 blockDim, void **args,
+                                        size_t sharedMem, cudaStream_t stream);
+extern "C" __device__ cudaError_t cudaLaunchDevice(void *func,
+                                                   void *parameterBuffer,
+                                                   dim3 gridDim, dim3 blockDim,
+                                                   unsigned int sharedMem,
+                                                   cudaStream_t stream);
+extern "C" __device__ void *cudaGetParameterBuffer(size_t alignment,
+                                                   size_t size);
+#endif
+
+extern "C" __device__ int printf(const char*, ...);
+
+struct char1 {
+  char x;
+  __host__ __device__ char1(char x = 0) : x(x) {}
+};
+struct char2 {
+  char x, y;
+  __host__ __device__ char2(char x = 0, char y = 0) : x(x), y(y) {}
+};
+struct char4 {
+  char x, y, z, w;
+  __host__ __device__ char4(char x = 0, char y = 0, char z = 0, char w = 0) : x(x), y(y), z(z), w(w) {}
+};
+
+struct uchar1 {
+  unsigned char x;
+  __host__ __device__ uchar1(unsigned char x = 0) : x(x) {}
+};
+struct uchar2 {
+  unsigned char x, y;
+  __host__ __device__ uchar2(unsigned char x = 0, unsigned char y = 0) : x(x), y(y) {}
+};
+struct uchar4 {
+  unsigned char x, y, z, w;
+  __host__ __device__ uchar4(unsigned char x = 0, unsigned char y = 0, unsigned char z = 0, unsigned char w = 0) : x(x), y(y), z(z), w(w) {}
+};
+
+struct short1 {
+  short x;
+  __host__ __device__ short1(short x = 0) : x(x) {}
+};
+struct short2 {
+  short x, y;
+  __host__ __device__ short2(short x = 0, short y = 0) : x(x), y(y) {}
+};
+struct short4 {
+  short x, y, z, w;
+  __host__ __device__ short4(short x = 0, short y = 0, short z = 0, short w = 0) : x(x), y(y), z(z), w(w) {}
+};
+
+struct ushort1 {
+  unsigned short x;
+  __host__ __device__ ushort1(unsigned short x = 0) : x(x) {}
+};
+struct ushort2 {
+  unsigned short x, y;
+  __host__ __device__ ushort2(unsigned short x = 0, unsigned short y = 0) : x(x), y(y) {}
+};
+struct ushort4 {
+  unsigned short x, y, z, w;
+  __host__ __device__ ushort4(unsigned short x = 0, unsigned short y = 0, unsigned short z = 0, unsigned short w = 0) : x(x), y(y), z(z), w(w) {}
+};
+
+struct int1 {
+  int x;
+  __host__ __device__ int1(int x = 0) : x(x) {}
+};
+struct int2 {
+  int x, y;
+  __host__ __device__ int2(int x = 0, int y = 0) : x(x), y(y) {}
+};
+struct int4 {
+  int x, y, z, w;
+  __host__ __device__ int4(int x = 0, int y = 0, int z = 0, int w = 0) : x(x), y(y), z(z), w(w) {}
+};
+
+struct uint1 {
+  unsigned x;
+  __host__ __device__ uint1(unsigned x = 0) : x(x) {}
+};
+struct uint2 {
+  unsigned x, y;
+  __host__ __device__ uint2(unsigned x = 0, unsigned y = 0) : x(x), y(y) {}
+};
+struct uint3 {
+  unsigned x, y, z;
+  __host__ __device__ uint3(unsigned x = 0, unsigned y = 0, unsigned z = 0) : x(x), y(y), z(z) {}
+};
+struct uint4 {
+  unsigned x, y, z, w;
+  __host__ __device__ uint4(unsigned x = 0, unsigned y = 0, unsigned z = 0, unsigned w = 0) : x(x), y(y), z(z), w(w) {}
+};
+
+struct longlong1 {
+  long long x;
+  __host__ __device__ longlong1(long long x = 0) : x(x) {}
+};
+struct longlong2 {
+  long long x, y;
+  __host__ __device__ longlong2(long long x = 0, long long y = 0) : x(x), y(y) {}
+};
+struct longlong4 {
+  long long x, y, z, w;
+  __host__ __device__ longlong4(long long x = 0, long long y = 0, long long z = 0, long long w = 0) : x(x), y(y), z(z), w(w) {}
+};
+
+struct ulonglong1 {
+  unsigned long long x;
+  __host__ __device__ ulonglong1(unsigned long long x = 0) : x(x) {}
+};
+struct ulonglong2 {
+  unsigned long long x, y;
+  __host__ __device__ ulonglong2(unsigned long long x = 0, unsigned long long y = 0) : x(x), y(y) {}
+};
+struct ulonglong4 {
+  unsigned long long x, y, z, w;
+  __host__ __device__ ulonglong4(unsigned long long x = 0, unsigned long long y = 0, unsigned long long z = 0, unsigned long long w = 0) : x(x), y(y), z(z), w(w) {}
+};
+
+struct float1 {
+  float x;
+  __host__ __device__ float1(float x = 0) : x(x) {}
+};
+struct float2 {
+  float x, y;
+  __host__ __device__ float2(float x = 0, float y = 0) : x(x), y(y) {}
+};
+struct float4 {
+  float x, y, z, w;
+  __host__ __device__ float4(float x = 0, float y = 0, float z = 0, float w = 0) : x(x), y(y), z(z), w(w) {}
+};
+
+struct double1 {
+  double x;
+  __host__ __device__ double1(double x = 0) : x(x) {}
+};
+struct double2 {
+  double x, y;
+  __host__ __device__ double2(double x = 0, double y = 0) : x(x), y(y) {}
+};
+struct double4 {
+  double x, y, z, w;
+  __host__ __device__ double4(double x = 0, double y = 0, double z = 0, double w = 0) : x(x), y(y), z(z), w(w) {}
+};
+
+typedef unsigned long long cudaTextureObject_t;
+typedef unsigned long long cudaSurfaceObject_t;
+
+enum cudaTextureReadMode {
+  cudaReadModeNormalizedFloat,
+  cudaReadModeElementType
+};
+
+enum cudaSurfaceBoundaryMode {
+  cudaBoundaryModeZero,
+  cudaBoundaryModeClamp,
+  cudaBoundaryModeTrap
+};
+
+enum {
+  cudaTextureType1D,
+  cudaTextureType2D,
+  cudaTextureType3D,
+  cudaTextureTypeCubemap,
+  cudaTextureType1DLayered,
+  cudaTextureType2DLayered,
+  cudaTextureTypeCubemapLayered
+};
+
+struct textureReference { };
+template <class T, int texType = cudaTextureType1D,
+          enum cudaTextureReadMode mode = cudaReadModeElementType>
+struct __attribute__((device_builtin_texture_type)) texture
+    : public textureReference {};
+
+struct surfaceReference { int desc; };
+
+template <typename T, int dim = 1>
+struct __attribute__((device_builtin_surface_type)) surface : public surfaceReference {};
diff --git a/clang-tools-extra/test/clang-tidy/infrastructure/basic-cuda.cu b/clang-tools-extra/test/clang-tidy/infrastructure/basic-cuda.cu
index 3bc605d864461..37b3b5ab7ade6 100644
--- a/clang-tools-extra/test/clang-tidy/infrastructure/basic-cuda.cu
+++ b/clang-tools-extra/test/clang-tidy/infrastructure/basic-cuda.cu
@@ -1,6 +1,13 @@
-// RUN: clang-tidy %s -checks='-*,modernize-use-nullptr' -- -nocudainc -nocudalib --cuda-host-only | FileCheck %s
+// RUN: clang-tidy %s -checks='-*,modernize-use-nullptr' -- \
+// RUN:   --cuda-path=%S/Inputs/CUDA/usr/local/cuda \
+// RUN:   -nocudalib -nocudainc -I %S/Inputs/CUDA/usr/local/cuda/include \
+// RUN:   --cuda-host-only | FileCheck %s
+// RUN: clang-tidy %s -checks='-*,modernize-use-nullptr' -- \
+// RUN:   --cuda-path=%S/Inputs/CUDA/usr/local/cuda \
+// RUN:   -nocudalib -nocudainc -I %S/Inputs/CUDA/usr/local/cuda/include \
+// RUN:   --cuda-device-only | FileCheck %s
 
-#define __global__ __attribute__((global))
+#include <cuda_runtime.h>
 
 // CHECK: :[[@LINE+1]]:38: warning: use nullptr [modernize-use-nullptr]
 __global__ void kernel(int *p) { p = 0; }

>From 05c0eb5d389cad87e6c6ff680801e1bb1e1d85ad Mon Sep 17 00:00:00 2001
From: mtx <mitchell.xu2 at gmail.com>
Date: Sat, 27 Dec 2025 18:21:41 +0800
Subject: [PATCH 3/4] Cleanup

---
 .../CUDA/{usr/local/cuda/include => }/cuda_runtime.h      | 0
 .../test/clang-tidy/infrastructure/basic-cuda.cu          | 8 ++++----
 2 files changed, 4 insertions(+), 4 deletions(-)
 rename clang-tools-extra/test/clang-tidy/infrastructure/Inputs/CUDA/{usr/local/cuda/include => }/cuda_runtime.h (100%)

diff --git a/clang-tools-extra/test/clang-tidy/infrastructure/Inputs/CUDA/usr/local/cuda/include/cuda_runtime.h b/clang-tools-extra/test/clang-tidy/infrastructure/Inputs/CUDA/cuda_runtime.h
similarity index 100%
rename from clang-tools-extra/test/clang-tidy/infrastructure/Inputs/CUDA/usr/local/cuda/include/cuda_runtime.h
rename to clang-tools-extra/test/clang-tidy/infrastructure/Inputs/CUDA/cuda_runtime.h
diff --git a/clang-tools-extra/test/clang-tidy/infrastructure/basic-cuda.cu b/clang-tools-extra/test/clang-tidy/infrastructure/basic-cuda.cu
index 37b3b5ab7ade6..db0260b9bc5d8 100644
--- a/clang-tools-extra/test/clang-tidy/infrastructure/basic-cuda.cu
+++ b/clang-tools-extra/test/clang-tidy/infrastructure/basic-cuda.cu
@@ -1,10 +1,10 @@
 // RUN: clang-tidy %s -checks='-*,modernize-use-nullptr' -- \
-// RUN:   --cuda-path=%S/Inputs/CUDA/usr/local/cuda \
-// RUN:   -nocudalib -nocudainc -I %S/Inputs/CUDA/usr/local/cuda/include \
+// RUN:   --cuda-path=%S/Inputs/CUDA \
+// RUN:   -nocudalib -nocudainc -I %S/Inputs/CUDA \
 // RUN:   --cuda-host-only | FileCheck %s
 // RUN: clang-tidy %s -checks='-*,modernize-use-nullptr' -- \
-// RUN:   --cuda-path=%S/Inputs/CUDA/usr/local/cuda \
-// RUN:   -nocudalib -nocudainc -I %S/Inputs/CUDA/usr/local/cuda/include \
+// RUN:   --cuda-path=%S/Inputs/CUDA \
+// RUN:   -nocudalib -nocudainc -I %S/Inputs/CUDA \
 // RUN:   --cuda-device-only | FileCheck %s
 
 #include <cuda_runtime.h>

>From b1d5098b114341da7487e453ff7a46841d835d9f Mon Sep 17 00:00:00 2001
From: mtx <mitchell.xu2 at gmail.com>
Date: Sun, 28 Dec 2025 18:12:35 +0800
Subject: [PATCH 4/4] Address review feedback

---
 clang-tools-extra/docs/clang-tidy/index.rst   |  16 +-
 .../test/clang-tidy/check_clang_tidy.py       |   2 +-
 .../infrastructure/Inputs/CUDA/cuda.h         |  64 +++++
 .../infrastructure/Inputs/CUDA/cuda_runtime.h | 253 ------------------
 .../clang-tidy/infrastructure/basic-cuda.cu   |  16 --
 .../clang-tidy/infrastructure/cuda-basic.cu   |  13 +
 6 files changed, 91 insertions(+), 273 deletions(-)
 create mode 100644 clang-tools-extra/test/clang-tidy/infrastructure/Inputs/CUDA/cuda.h
 delete mode 100644 clang-tools-extra/test/clang-tidy/infrastructure/Inputs/CUDA/cuda_runtime.h
 delete mode 100644 clang-tools-extra/test/clang-tidy/infrastructure/basic-cuda.cu
 create mode 100644 clang-tools-extra/test/clang-tidy/infrastructure/cuda-basic.cu

diff --git a/clang-tools-extra/docs/clang-tidy/index.rst b/clang-tools-extra/docs/clang-tidy/index.rst
index 4a0bab8693878..6bd38db966396 100644
--- a/clang-tools-extra/docs/clang-tidy/index.rst
+++ b/clang-tools-extra/docs/clang-tidy/index.rst
@@ -357,12 +357,22 @@ header resolution, it is important to specify the CUDA toolkit path using
 ``--cuda-path``. For more details on how Clang handles CUDA, see
 `Compiling CUDA with Clang <https://llvm.org/docs/CompileCudaWithLLVM.html>`_.
 
+If you are using a GCC + NVCC build setup, the compiler command database will
+contain NVCC-specific flags that :program:`clang-tidy` does not understand.
+
+In this case, you should use the ``RemovedArgs`` configuration option (or
+``--removed-arg`` command-line option) to remove these flags, and
+``ExtraArgs`` (or ``--extra-arg``) to provide the ``--cuda-path``.
+
+For example, to remove the NVCC-specific ``-xcu`` flag:
+
 .. code-block:: console
 
-  $ clang-tidy source.cu -- --cuda-path=/path/to/cuda
+  $ clang-tidy source.cu --removed-arg="-xcu" --extra-arg="--cuda-path=/path/to/cuda"
 
-By default, :program:`clang-tidy` will compile the code for the host. To
-analyze device-side code, use the ``--cuda-device-only`` flag:
+By default, :program:`clang-tidy` will use the host compilation, which is
+sufficient to analyze both host and device code. To specifically perform device
+compilation, use the ``--cuda-device-only`` flag:
 
 .. code-block:: console
 
diff --git a/clang-tools-extra/test/clang-tidy/check_clang_tidy.py b/clang-tools-extra/test/clang-tidy/check_clang_tidy.py
index b173ecf4fbdca..4e42f0b12516b 100755
--- a/clang-tools-extra/test/clang-tidy/check_clang_tidy.py
+++ b/clang-tools-extra/test/clang-tidy/check_clang_tidy.py
@@ -110,7 +110,7 @@ def __init__(self, args: argparse.Namespace, extra_args: List[str]) -> None:
 
         file_name_with_extension = self.assume_file_name or self.input_file_name
         _, extension = os.path.splitext(file_name_with_extension)
-        if extension not in [".c", ".hpp", ".m", ".mm"]:
+        if extension not in [".c", ".hpp", ".m", ".mm", ".cu"]:
             extension = ".cpp"
         self.temp_file_name = self.temp_file_name + extension
 
diff --git a/clang-tools-extra/test/clang-tidy/infrastructure/Inputs/CUDA/cuda.h b/clang-tools-extra/test/clang-tidy/infrastructure/Inputs/CUDA/cuda.h
new file mode 100644
index 0000000000000..562b75025c798
--- /dev/null
+++ b/clang-tools-extra/test/clang-tidy/infrastructure/Inputs/CUDA/cuda.h
@@ -0,0 +1,64 @@
+/* Minimal declarations for CUDA support.  Testing purposes only. */
+
+#include <stddef.h>
+
+// Make this file work with nvcc, for testing compatibility.
+
+#ifndef __NVCC__
+#define __constant__ __attribute__((constant))
+#define __device__ __attribute__((device))
+#define __global__ __attribute__((global))
+#define __host__ __attribute__((host))
+#define __shared__ __attribute__((shared))
+#define __managed__ __attribute__((managed))
+#define __grid_constant__ __attribute__((grid_constant))
+#define __launch_bounds__(...) __attribute__((launch_bounds(__VA_ARGS__)))
+#define __cluster_dims__(...) __attribute__((cluster_dims(__VA_ARGS__)))
+#define __no_cluster__ __attribute__((no_cluster))
+
+struct dim3 {
+  unsigned x, y, z;
+  __host__ __device__ dim3(unsigned x, unsigned y = 1, unsigned z = 1) : x(x), y(y), z(z) {}
+};
+
+#ifdef __HIP__
+typedef struct hipStream *hipStream_t;
+typedef enum hipError {} hipError_t;
+int hipConfigureCall(dim3 gridSize, dim3 blockSize, size_t sharedSize = 0,
+                     hipStream_t stream = 0); // NOLINT
+extern "C" hipError_t __hipPushCallConfiguration(dim3 gridSize, dim3 blockSize,
+                                                 size_t sharedSize = 0,
+                                                 hipStream_t stream = 0); // NOLINT
+extern "C" hipError_t hipLaunchKernel(const void *func, dim3 gridDim,
+                                      dim3 blockDim, void **args,
+                                      size_t sharedMem,
+                                      hipStream_t stream);
+#else
+typedef struct cudaStream *cudaStream_t;
+typedef enum cudaError {} cudaError_t;
+
+extern "C" int cudaConfigureCall(dim3 gridSize, dim3 blockSize,
+                                 size_t sharedSize = 0,
+                                 cudaStream_t stream = 0); // NOLINT
+extern "C" int __cudaPushCallConfiguration(dim3 gridSize, dim3 blockSize,
+                                           size_t sharedSize = 0,
+                                           cudaStream_t stream = 0); // NOLINT
+extern "C" cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim,
+                                        dim3 blockDim, void **args,
+                                        size_t sharedMem, cudaStream_t stream);
+extern "C" __device__ cudaError_t cudaLaunchDevice(void *func,
+                                                   void *parameterBuffer,
+                                                   dim3 gridDim, dim3 blockDim,
+                                                   unsigned int sharedMem,
+                                                   cudaStream_t stream);
+extern "C" __device__ void *cudaGetParameterBuffer(size_t alignment,
+                                                   size_t size);
+#endif
+
+// Host- and device-side placement new overloads.
+void *operator new(__SIZE_TYPE__, void *p) { return p; }
+void *operator new[](__SIZE_TYPE__, void *p) { return p; }
+__device__ void *operator new(__SIZE_TYPE__, void *p) { return p; }
+__device__ void *operator new[](__SIZE_TYPE__, void *p) { return p; }
+
+#endif // !__NVCC__
\ No newline at end of file
diff --git a/clang-tools-extra/test/clang-tidy/infrastructure/Inputs/CUDA/cuda_runtime.h b/clang-tools-extra/test/clang-tidy/infrastructure/Inputs/CUDA/cuda_runtime.h
deleted file mode 100644
index 421fa4dd7dbae..0000000000000
--- a/clang-tools-extra/test/clang-tidy/infrastructure/Inputs/CUDA/cuda_runtime.h
+++ /dev/null
@@ -1,253 +0,0 @@
-/* Minimal declarations for CUDA support.  Testing purposes only. */
-
-#include <stddef.h>
-
-#if __HIP__ || __CUDA__
-#define __constant__ __attribute__((constant))
-#define __device__ __attribute__((device))
-#define __global__ __attribute__((global))
-#define __host__ __attribute__((host))
-#define __shared__ __attribute__((shared))
-#if __HIP__
-#define __managed__ __attribute__((managed))
-#endif
-#define __launch_bounds__(...) __attribute__((launch_bounds(__VA_ARGS__)))
-#define __grid_constant__ __attribute__((grid_constant))
-#define __cluster_dims__(...) __attribute__((cluster_dims(__VA_ARGS__)))
-#define __no_cluster__ __attribute__((no_cluster))
-#else
-#define __constant__
-#define __device__
-#define __global__
-#define __host__
-#define __shared__
-#define __managed__
-#define __launch_bounds__(...)
-#define __grid_constant__
-#define __cluster_dims__(...)
-#define __no_cluster__
-#endif
-
-struct dim3 {
-  unsigned x, y, z;
-  __host__ __device__ dim3(unsigned x, unsigned y = 1, unsigned z = 1) : x(x), y(y), z(z) {}
-};
-
-#if __HIP__ || HIP_PLATFORM
-typedef struct hipStream *hipStream_t;
-typedef enum hipError {} hipError_t;
-int hipConfigureCall(dim3 gridSize, dim3 blockSize, size_t sharedSize = 0,
-                     hipStream_t stream = 0);
-extern "C" hipError_t __hipPushCallConfiguration(dim3 gridSize, dim3 blockSize,
-                                                 size_t sharedSize = 0,
-                                                 hipStream_t stream = 0);
-#ifndef __HIP_API_PER_THREAD_DEFAULT_STREAM__
-extern "C" hipError_t hipLaunchKernel(const void *func, dim3 gridDim,
-                                      dim3 blockDim, void **args,
-                                      size_t sharedMem,
-                                      hipStream_t stream);
-#else
-extern "C" hipError_t hipLaunchKernel_spt(const void *func, dim3 gridDim,
-                                      dim3 blockDim, void **args,
-                                      size_t sharedMem,
-                                      hipStream_t stream);
-#endif // __HIP_API_PER_THREAD_DEFAULT_STREAM__
-#elif __OFFLOAD_VIA_LLVM__
-extern "C" unsigned __llvmPushCallConfiguration(dim3 gridDim, dim3 blockDim,
-                                     size_t sharedMem = 0, void *stream = 0);
-extern "C" unsigned llvmLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
-                          void **args, size_t sharedMem = 0, void *stream = 0);
-#else
-typedef struct cudaStream *cudaStream_t;
-typedef enum cudaError {} cudaError_t;
-extern "C" int cudaConfigureCall(dim3 gridSize, dim3 blockSize,
-                                 size_t sharedSize = 0,
-                                 cudaStream_t stream = 0);
-extern "C" int __cudaPushCallConfiguration(dim3 gridSize, dim3 blockSize,
-                                           size_t sharedSize = 0,
-                                           cudaStream_t stream = 0);
-extern "C" cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim,
-                                        dim3 blockDim, void **args,
-                                        size_t sharedMem, cudaStream_t stream);
-extern "C" cudaError_t cudaLaunchKernel_ptsz(const void *func, dim3 gridDim,
-                                        dim3 blockDim, void **args,
-                                        size_t sharedMem, cudaStream_t stream);
-extern "C" __device__ cudaError_t cudaLaunchDevice(void *func,
-                                                   void *parameterBuffer,
-                                                   dim3 gridDim, dim3 blockDim,
-                                                   unsigned int sharedMem,
-                                                   cudaStream_t stream);
-extern "C" __device__ void *cudaGetParameterBuffer(size_t alignment,
-                                                   size_t size);
-#endif
-
-extern "C" __device__ int printf(const char*, ...);
-
-struct char1 {
-  char x;
-  __host__ __device__ char1(char x = 0) : x(x) {}
-};
-struct char2 {
-  char x, y;
-  __host__ __device__ char2(char x = 0, char y = 0) : x(x), y(y) {}
-};
-struct char4 {
-  char x, y, z, w;
-  __host__ __device__ char4(char x = 0, char y = 0, char z = 0, char w = 0) : x(x), y(y), z(z), w(w) {}
-};
-
-struct uchar1 {
-  unsigned char x;
-  __host__ __device__ uchar1(unsigned char x = 0) : x(x) {}
-};
-struct uchar2 {
-  unsigned char x, y;
-  __host__ __device__ uchar2(unsigned char x = 0, unsigned char y = 0) : x(x), y(y) {}
-};
-struct uchar4 {
-  unsigned char x, y, z, w;
-  __host__ __device__ uchar4(unsigned char x = 0, unsigned char y = 0, unsigned char z = 0, unsigned char w = 0) : x(x), y(y), z(z), w(w) {}
-};
-
-struct short1 {
-  short x;
-  __host__ __device__ short1(short x = 0) : x(x) {}
-};
-struct short2 {
-  short x, y;
-  __host__ __device__ short2(short x = 0, short y = 0) : x(x), y(y) {}
-};
-struct short4 {
-  short x, y, z, w;
-  __host__ __device__ short4(short x = 0, short y = 0, short z = 0, short w = 0) : x(x), y(y), z(z), w(w) {}
-};
-
-struct ushort1 {
-  unsigned short x;
-  __host__ __device__ ushort1(unsigned short x = 0) : x(x) {}
-};
-struct ushort2 {
-  unsigned short x, y;
-  __host__ __device__ ushort2(unsigned short x = 0, unsigned short y = 0) : x(x), y(y) {}
-};
-struct ushort4 {
-  unsigned short x, y, z, w;
-  __host__ __device__ ushort4(unsigned short x = 0, unsigned short y = 0, unsigned short z = 0, unsigned short w = 0) : x(x), y(y), z(z), w(w) {}
-};
-
-struct int1 {
-  int x;
-  __host__ __device__ int1(int x = 0) : x(x) {}
-};
-struct int2 {
-  int x, y;
-  __host__ __device__ int2(int x = 0, int y = 0) : x(x), y(y) {}
-};
-struct int4 {
-  int x, y, z, w;
-  __host__ __device__ int4(int x = 0, int y = 0, int z = 0, int w = 0) : x(x), y(y), z(z), w(w) {}
-};
-
-struct uint1 {
-  unsigned x;
-  __host__ __device__ uint1(unsigned x = 0) : x(x) {}
-};
-struct uint2 {
-  unsigned x, y;
-  __host__ __device__ uint2(unsigned x = 0, unsigned y = 0) : x(x), y(y) {}
-};
-struct uint3 {
-  unsigned x, y, z;
-  __host__ __device__ uint3(unsigned x = 0, unsigned y = 0, unsigned z = 0) : x(x), y(y), z(z) {}
-};
-struct uint4 {
-  unsigned x, y, z, w;
-  __host__ __device__ uint4(unsigned x = 0, unsigned y = 0, unsigned z = 0, unsigned w = 0) : x(x), y(y), z(z), w(w) {}
-};
-
-struct longlong1 {
-  long long x;
-  __host__ __device__ longlong1(long long x = 0) : x(x) {}
-};
-struct longlong2 {
-  long long x, y;
-  __host__ __device__ longlong2(long long x = 0, long long y = 0) : x(x), y(y) {}
-};
-struct longlong4 {
-  long long x, y, z, w;
-  __host__ __device__ longlong4(long long x = 0, long long y = 0, long long z = 0, long long w = 0) : x(x), y(y), z(z), w(w) {}
-};
-
-struct ulonglong1 {
-  unsigned long long x;
-  __host__ __device__ ulonglong1(unsigned long long x = 0) : x(x) {}
-};
-struct ulonglong2 {
-  unsigned long long x, y;
-  __host__ __device__ ulonglong2(unsigned long long x = 0, unsigned long long y = 0) : x(x), y(y) {}
-};
-struct ulonglong4 {
-  unsigned long long x, y, z, w;
-  __host__ __device__ ulonglong4(unsigned long long x = 0, unsigned long long y = 0, unsigned long long z = 0, unsigned long long w = 0) : x(x), y(y), z(z), w(w) {}
-};
-
-struct float1 {
-  float x;
-  __host__ __device__ float1(float x = 0) : x(x) {}
-};
-struct float2 {
-  float x, y;
-  __host__ __device__ float2(float x = 0, float y = 0) : x(x), y(y) {}
-};
-struct float4 {
-  float x, y, z, w;
-  __host__ __device__ float4(float x = 0, float y = 0, float z = 0, float w = 0) : x(x), y(y), z(z), w(w) {}
-};
-
-struct double1 {
-  double x;
-  __host__ __device__ double1(double x = 0) : x(x) {}
-};
-struct double2 {
-  double x, y;
-  __host__ __device__ double2(double x = 0, double y = 0) : x(x), y(y) {}
-};
-struct double4 {
-  double x, y, z, w;
-  __host__ __device__ double4(double x = 0, double y = 0, double z = 0, double w = 0) : x(x), y(y), z(z), w(w) {}
-};
-
-typedef unsigned long long cudaTextureObject_t;
-typedef unsigned long long cudaSurfaceObject_t;
-
-enum cudaTextureReadMode {
-  cudaReadModeNormalizedFloat,
-  cudaReadModeElementType
-};
-
-enum cudaSurfaceBoundaryMode {
-  cudaBoundaryModeZero,
-  cudaBoundaryModeClamp,
-  cudaBoundaryModeTrap
-};
-
-enum {
-  cudaTextureType1D,
-  cudaTextureType2D,
-  cudaTextureType3D,
-  cudaTextureTypeCubemap,
-  cudaTextureType1DLayered,
-  cudaTextureType2DLayered,
-  cudaTextureTypeCubemapLayered
-};
-
-struct textureReference { };
-template <class T, int texType = cudaTextureType1D,
-          enum cudaTextureReadMode mode = cudaReadModeElementType>
-struct __attribute__((device_builtin_texture_type)) texture
-    : public textureReference {};
-
-struct surfaceReference { int desc; };
-
-template <typename T, int dim = 1>
-struct __attribute__((device_builtin_surface_type)) surface : public surfaceReference {};
diff --git a/clang-tools-extra/test/clang-tidy/infrastructure/basic-cuda.cu b/clang-tools-extra/test/clang-tidy/infrastructure/basic-cuda.cu
deleted file mode 100644
index db0260b9bc5d8..0000000000000
--- a/clang-tools-extra/test/clang-tidy/infrastructure/basic-cuda.cu
+++ /dev/null
@@ -1,16 +0,0 @@
-// RUN: clang-tidy %s -checks='-*,modernize-use-nullptr' -- \
-// RUN:   --cuda-path=%S/Inputs/CUDA \
-// RUN:   -nocudalib -nocudainc -I %S/Inputs/CUDA \
-// RUN:   --cuda-host-only | FileCheck %s
-// RUN: clang-tidy %s -checks='-*,modernize-use-nullptr' -- \
-// RUN:   --cuda-path=%S/Inputs/CUDA \
-// RUN:   -nocudalib -nocudainc -I %S/Inputs/CUDA \
-// RUN:   --cuda-device-only | FileCheck %s
-
-#include <cuda_runtime.h>
-
-// CHECK: :[[@LINE+1]]:38: warning: use nullptr [modernize-use-nullptr]
-__global__ void kernel(int *p) { p = 0; }
-
-// CHECK: :[[@LINE+1]]:11: warning: use nullptr [modernize-use-nullptr]
-void *p = 0;
diff --git a/clang-tools-extra/test/clang-tidy/infrastructure/cuda-basic.cu b/clang-tools-extra/test/clang-tidy/infrastructure/cuda-basic.cu
new file mode 100644
index 0000000000000..fda58da96db83
--- /dev/null
+++ b/clang-tools-extra/test/clang-tidy/infrastructure/cuda-basic.cu
@@ -0,0 +1,13 @@
+// RUN: %check_clang_tidy %s modernize-use-nullptr %t -- -- \
+// RUN:   --cuda-path=%S/Inputs/CUDA \
+// RUN:   -nocudalib -nocudainc -I %S/Inputs/CUDA
+
+#include <cuda.h>
+
+__global__ void kernel(int *p) { p = 0; }
+// CHECK-MESSAGES: :[[@LINE-1]]:38: warning: use nullptr [modernize-use-nullptr]
+// CHECK-FIXES: __global__ void kernel(int *p) { p = nullptr; }
+
+void *p = 0;
+// CHECK-MESSAGES: :[[@LINE-1]]:11: warning: use nullptr [modernize-use-nullptr]
+// CHECK-FIXES: void *p = nullptr;