[clang] [CIR][CUDA] Upstream device stub mangling (PR #177790)
via cfe-commits
cfe-commits at lists.llvm.org
Sat Jan 24 11:24:26 PST 2026
llvmbot wrote:
@llvm/pr-subscribers-clang
Author: David Rivera (RiverDave)
Changes:
Note that this is the start of a series of incremental patches; the stub body is therefore left empty for now, since filling it in requires a portion of the actual CUDA runtime integration that is deferred to a later PR.
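
For context, the scheme mirrors classic CodeGen: when the host side references a `__global__` kernel as a stub (`KernelReferenceKind::Stub`), the `__device_stub__` prefix is spliced into the symbol, directly onto the identifier for `extern "C"` kernels and inside the Itanium mangling for C++ ones. A minimal sketch of the naming scheme (hypothetical helper, not the actual CIRGen code):

```cpp
// Sketch only: hypothetical helper illustrating the naming scheme;
// the real logic lives in getMangledNameImpl in CIRGenModule.cpp.
#include <string>

std::string stubIdentifier(const std::string &kernelIdent) {
  // extern "C" kernels: this prefixed identifier is the symbol itself.
  // C++ kernels: conceptually, this is the identifier the Itanium
  // mangler sees, so the prefix lands inside the mangled name.
  return "__device_stub__" + kernelIdent;
}
// stubIdentifier("ckernel")  -> "__device_stub__ckernel"
// stubIdentifier("nskernel") -> mangled as "_ZN2ns23__device_stub__nskernelEv"
```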
---
Full diff: https://github.com/llvm/llvm-project/pull/177790.diff
4 Files Affected:
- (modified) clang/lib/CIR/CodeGen/CIRGenFunction.cpp (+3-1)
- (modified) clang/lib/CIR/CodeGen/CIRGenModule.cpp (+8-2)
- (added) clang/test/CIR/CodeGen/CUDA/kernel-stub-name.cu (+22)
- (added) clang/test/CIR/CodeGen/inputs/cuda.h (+74)
``````````diff
diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp
index f2d73720a9c2b..4c212b06019ea 100644
--- a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp
@@ -748,7 +748,9 @@ cir::FuncOp CIRGenFunction::generateCode(clang::GlobalDecl gd, cir::FuncOp fn,
emitConstructorBody(args);
} else if (getLangOpts().CUDA && !getLangOpts().CUDAIsDevice &&
funcDecl->hasAttr<CUDAGlobalAttr>()) {
- getCIRGenModule().errorNYI(bodyRange, "CUDA kernel");
+ // TODO(cir): Emit device stub body with kernel launch runtime calls
+ // (emitDeviceStub). For now, emit an empty stub.
+ assert(!cir::MissingFeatures::cudaSupport());
} else if (isa<CXXMethodDecl>(funcDecl) &&
cast<CXXMethodDecl>(funcDecl)->isLambdaStaticInvoker()) {
// The lambda static invoker function is special, because it forwards or
diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.cpp b/clang/lib/CIR/CodeGen/CIRGenModule.cpp
index 61d84f197e6ec..b535eab913a5d 100644
--- a/clang/lib/CIR/CodeGen/CIRGenModule.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenModule.cpp
@@ -1772,9 +1772,15 @@ static std::string getMangledNameImpl(CIRGenModule &cgm, GlobalDecl gd,
cgm.errorNYI(nd->getSourceRange(), "getMangledName: X86RegCall");
} else if (fd && fd->hasAttr<CUDAGlobalAttr>() &&
gd.getKernelReferenceKind() == KernelReferenceKind::Stub) {
- cgm.errorNYI(nd->getSourceRange(), "getMangledName: CUDA device stub");
+ out << "__device_stub__" << ii->getName();
+ } else if (fd &&
+ DeviceKernelAttr::isOpenCLSpelling(
+ fd->getAttr<DeviceKernelAttr>()) &&
+ gd.getKernelReferenceKind() == KernelReferenceKind::Stub) {
+ cgm.errorNYI(nd->getSourceRange(), "getMangledName: OpenCL Stub");
+ } else {
+ out << ii->getName();
}
- out << ii->getName();
}
// Check if the module name hash should be appended for internal linkage
diff --git a/clang/test/CIR/CodeGen/CUDA/kernel-stub-name.cu b/clang/test/CIR/CodeGen/CUDA/kernel-stub-name.cu
new file mode 100644
index 0000000000000..6d5efb69827e3
--- /dev/null
+++ b/clang/test/CIR/CodeGen/CUDA/kernel-stub-name.cu
@@ -0,0 +1,22 @@
+// Based on clang/test/CodeGenCUDA/kernel-stub-name.cu.
+
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-cir %s \
+// RUN: -x cuda -o %t.cir
+// RUN: FileCheck --input-file=%t.cir %s
+
+#include "../inputs/cuda.h"
+
+// CHECK: cir.func {{.*}} @__device_stub__ckernel()
+// CHECK-NEXT: cir.return
+// CHECK-NEXT: }
+extern "C" __global__ void ckernel() {}
+
+// CHECK: cir.func {{.*}} @_ZN2ns23__device_stub__nskernelEv()
+namespace ns {
+__global__ void nskernel() {}
+} // namespace ns
+
+// CHECK: cir.func {{.*}} @_Z25__device_stub__kernelfuncIiEvv()
+template <class T>
+__global__ void kernelfunc() {}
+template __global__ void kernelfunc<int>();
diff --git a/clang/test/CIR/CodeGen/inputs/cuda.h b/clang/test/CIR/CodeGen/inputs/cuda.h
new file mode 100644
index 0000000000000..204bf2972088d
--- /dev/null
+++ b/clang/test/CIR/CodeGen/inputs/cuda.h
@@ -0,0 +1,74 @@
+/* Minimal declarations for CUDA support. Testing purposes only. */
+/* From test/CodeGenCUDA/Inputs/cuda.h. */
+#include <stddef.h>
+
+#if __HIP__ || __CUDA__
+#define __constant__ __attribute__((constant))
+#define __device__ __attribute__((device))
+#define __global__ __attribute__((global))
+#define __host__ __attribute__((host))
+#define __shared__ __attribute__((shared))
+#if __HIP__
+#define __managed__ __attribute__((managed))
+#endif
+#define __launch_bounds__(...) __attribute__((launch_bounds(__VA_ARGS__)))
+#define __grid_constant__ __attribute__((grid_constant))
+#else
+#define __constant__
+#define __device__
+#define __global__
+#define __host__
+#define __shared__
+#define __managed__
+#define __launch_bounds__(...)
+#define __grid_constant__
+#endif
+
+struct dim3 {
+ unsigned x, y, z;
+ __host__ __device__ dim3(unsigned x, unsigned y = 1, unsigned z = 1) : x(x), y(y), z(z) {}
+};
+
+#if __HIP__ || HIP_PLATFORM
+typedef struct hipStream *hipStream_t;
+typedef enum hipError {} hipError_t;
+int hipConfigureCall(dim3 gridSize, dim3 blockSize, size_t sharedSize = 0,
+ hipStream_t stream = 0);
+extern "C" hipError_t __hipPushCallConfiguration(dim3 gridSize, dim3 blockSize,
+ size_t sharedSize = 0,
+ hipStream_t stream = 0);
+#ifndef __HIP_API_PER_THREAD_DEFAULT_STREAM__
+extern "C" hipError_t hipLaunchKernel(const void *func, dim3 gridDim,
+ dim3 blockDim, void **args,
+ size_t sharedMem,
+ hipStream_t stream);
+#else
+extern "C" hipError_t hipLaunchKernel_spt(const void *func, dim3 gridDim,
+ dim3 blockDim, void **args,
+ size_t sharedMem,
+ hipStream_t stream);
+#endif // __HIP_API_PER_THREAD_DEFAULT_STREAM__
+#elif __OFFLOAD_VIA_LLVM__
+extern "C" unsigned __llvmPushCallConfiguration(dim3 gridDim, dim3 blockDim,
+ size_t sharedMem = 0, void *stream = 0);
+extern "C" unsigned llvmLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
+ void **args, size_t sharedMem = 0, void *stream = 0);
+#else
+typedef struct cudaStream *cudaStream_t;
+typedef enum cudaError {} cudaError_t;
+extern "C" int cudaConfigureCall(dim3 gridSize, dim3 blockSize,
+ size_t sharedSize = 0,
+ cudaStream_t stream = 0);
+extern "C" int __cudaPushCallConfiguration(dim3 gridSize, dim3 blockSize,
+ size_t sharedSize = 0,
+ cudaStream_t stream = 0);
+extern "C" cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim,
+ dim3 blockDim, void **args,
+ size_t sharedMem, cudaStream_t stream);
+extern "C" cudaError_t cudaLaunchKernel_ptsz(const void *func, dim3 gridDim,
+ dim3 blockDim, void **args,
+ size_t sharedMem, cudaStream_t stream);
+
+#endif
+
+extern "C" __device__ int printf(const char*, ...);
``````````
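
For readers decoding the CHECK lines in the new test: the length prefixes in the Itanium mangling count the prefixed identifier, e.g. `23` = strlen("__device_stub__nskernel"). A small host-only sketch (not part of the patch) that verifies the expected symbols with the Itanium ABI demangler:

```cpp
// Host-only sketch (not part of the patch): demangle the expected
// stub symbols with abi::__cxa_demangle to check the test's CHECK
// lines by hand.
#include <cxxabi.h>
#include <cstdio>
#include <cstdlib>

static void show(const char *sym) {
  int status = 0;
  char *dem = abi::__cxa_demangle(sym, nullptr, nullptr, &status);
  std::printf("%s -> %s\n", sym, status == 0 ? dem : "(not a mangled name)");
  std::free(dem);
}

int main() {
  show("_ZN2ns23__device_stub__nskernelEv");  // ns::__device_stub__nskernel()
  show("_Z25__device_stub__kernelfuncIiEvv"); // void __device_stub__kernelfunc<int>()
}
```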
https://github.com/llvm/llvm-project/pull/177790