[clang] [CIR][CUDA] Upstream device stub body emission and name mangling (PR #177790)
David Rivera via cfe-commits
cfe-commits at lists.llvm.org
Fri Jan 30 10:08:43 PST 2026
https://github.com/RiverDave updated https://github.com/llvm/llvm-project/pull/177790
>From f703a61ff33f2d8026cf4bece0416afc331e4dbf Mon Sep 17 00:00:00 2001
From: David Rivera <davidriverg at gmail.com>
Date: Sat, 24 Jan 2026 14:12:48 -0500
Subject: [PATCH 1/8] [CIR][CUDA] Upstream device stub mangling
---
clang/lib/CIR/CodeGen/CIRGenFunction.cpp | 4 +-
clang/lib/CIR/CodeGen/CIRGenModule.cpp | 10 ++-
.../test/CIR/CodeGen/CUDA/kernel-stub-name.cu | 22 ++++++
clang/test/CIR/CodeGen/inputs/cuda.h | 74 +++++++++++++++++++
4 files changed, 107 insertions(+), 3 deletions(-)
create mode 100644 clang/test/CIR/CodeGen/CUDA/kernel-stub-name.cu
create mode 100644 clang/test/CIR/CodeGen/inputs/cuda.h
diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp
index f2d73720a9c2b..4c212b06019ea 100644
--- a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp
@@ -748,7 +748,9 @@ cir::FuncOp CIRGenFunction::generateCode(clang::GlobalDecl gd, cir::FuncOp fn,
emitConstructorBody(args);
} else if (getLangOpts().CUDA && !getLangOpts().CUDAIsDevice &&
funcDecl->hasAttr<CUDAGlobalAttr>()) {
- getCIRGenModule().errorNYI(bodyRange, "CUDA kernel");
+ // TODO(cir): Emit device stub body with kernel launch runtime calls
+ // (emitDeviceStub). For now, emit an empty stub.
+ assert(!cir::MissingFeatures::cudaSupport());
} else if (isa<CXXMethodDecl>(funcDecl) &&
cast<CXXMethodDecl>(funcDecl)->isLambdaStaticInvoker()) {
// The lambda static invoker function is special, because it forwards or
diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.cpp b/clang/lib/CIR/CodeGen/CIRGenModule.cpp
index 61d84f197e6ec..b535eab913a5d 100644
--- a/clang/lib/CIR/CodeGen/CIRGenModule.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenModule.cpp
@@ -1772,9 +1772,15 @@ static std::string getMangledNameImpl(CIRGenModule &cgm, GlobalDecl gd,
cgm.errorNYI(nd->getSourceRange(), "getMangledName: X86RegCall");
} else if (fd && fd->hasAttr<CUDAGlobalAttr>() &&
gd.getKernelReferenceKind() == KernelReferenceKind::Stub) {
- cgm.errorNYI(nd->getSourceRange(), "getMangledName: CUDA device stub");
+ out << "__device_stub__" << ii->getName();
+ } else if (fd &&
+ DeviceKernelAttr::isOpenCLSpelling(
+ fd->getAttr<DeviceKernelAttr>()) &&
+ gd.getKernelReferenceKind() == KernelReferenceKind::Stub) {
+ cgm.errorNYI(nd->getSourceRange(), "getMangledName: OpenCL Stub");
+ } else {
+ out << ii->getName();
}
- out << ii->getName();
}
// Check if the module name hash should be appended for internal linkage
diff --git a/clang/test/CIR/CodeGen/CUDA/kernel-stub-name.cu b/clang/test/CIR/CodeGen/CUDA/kernel-stub-name.cu
new file mode 100644
index 0000000000000..6d5efb69827e3
--- /dev/null
+++ b/clang/test/CIR/CodeGen/CUDA/kernel-stub-name.cu
@@ -0,0 +1,22 @@
+// Based on clang/test/CodeGenCUDA/kernel-stub-name.cu.
+
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-cir %s \
+// RUN: -x cuda -o %t.cir
+// RUN: FileCheck --input-file=%t.cir %s
+
+#include "../inputs/cuda.h"
+
+// CHECK: cir.func {{.*}} @__device_stub__ckernel()
+// CHECK-NEXT: cir.return
+// CHECK-NEXT: }
+extern "C" __global__ void ckernel() {}
+
+// CHECK: cir.func {{.*}} @_ZN2ns23__device_stub__nskernelEv()
+namespace ns {
+__global__ void nskernel() {}
+} // namespace ns
+
+// CHECK: cir.func {{.*}} @_Z25__device_stub__kernelfuncIiEvv()
+template <class T>
+__global__ void kernelfunc() {}
+template __global__ void kernelfunc<int>();
diff --git a/clang/test/CIR/CodeGen/inputs/cuda.h b/clang/test/CIR/CodeGen/inputs/cuda.h
new file mode 100644
index 0000000000000..204bf2972088d
--- /dev/null
+++ b/clang/test/CIR/CodeGen/inputs/cuda.h
@@ -0,0 +1,74 @@
+/* Minimal declarations for CUDA support. Testing purposes only. */
+/* From test/CodeGenCUDA/Inputs/cuda.h. */
+#include <stddef.h>
+
+#if __HIP__ || __CUDA__
+#define __constant__ __attribute__((constant))
+#define __device__ __attribute__((device))
+#define __global__ __attribute__((global))
+#define __host__ __attribute__((host))
+#define __shared__ __attribute__((shared))
+#if __HIP__
+#define __managed__ __attribute__((managed))
+#endif
+#define __launch_bounds__(...) __attribute__((launch_bounds(__VA_ARGS__)))
+#define __grid_constant__ __attribute__((grid_constant))
+#else
+#define __constant__
+#define __device__
+#define __global__
+#define __host__
+#define __shared__
+#define __managed__
+#define __launch_bounds__(...)
+#define __grid_constant__
+#endif
+
+struct dim3 {
+ unsigned x, y, z;
+ __host__ __device__ dim3(unsigned x, unsigned y = 1, unsigned z = 1) : x(x), y(y), z(z) {}
+};
+
+#if __HIP__ || HIP_PLATFORM
+typedef struct hipStream *hipStream_t;
+typedef enum hipError {} hipError_t;
+int hipConfigureCall(dim3 gridSize, dim3 blockSize, size_t sharedSize = 0,
+ hipStream_t stream = 0);
+extern "C" hipError_t __hipPushCallConfiguration(dim3 gridSize, dim3 blockSize,
+ size_t sharedSize = 0,
+ hipStream_t stream = 0);
+#ifndef __HIP_API_PER_THREAD_DEFAULT_STREAM__
+extern "C" hipError_t hipLaunchKernel(const void *func, dim3 gridDim,
+ dim3 blockDim, void **args,
+ size_t sharedMem,
+ hipStream_t stream);
+#else
+extern "C" hipError_t hipLaunchKernel_spt(const void *func, dim3 gridDim,
+ dim3 blockDim, void **args,
+ size_t sharedMem,
+ hipStream_t stream);
+#endif // __HIP_API_PER_THREAD_DEFAULT_STREAM__
+#elif __OFFLOAD_VIA_LLVM__
+extern "C" unsigned __llvmPushCallConfiguration(dim3 gridDim, dim3 blockDim,
+ size_t sharedMem = 0, void *stream = 0);
+extern "C" unsigned llvmLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
+ void **args, size_t sharedMem = 0, void *stream = 0);
+#else
+typedef struct cudaStream *cudaStream_t;
+typedef enum cudaError {} cudaError_t;
+extern "C" int cudaConfigureCall(dim3 gridSize, dim3 blockSize,
+ size_t sharedSize = 0,
+ cudaStream_t stream = 0);
+extern "C" int __cudaPushCallConfiguration(dim3 gridSize, dim3 blockSize,
+ size_t sharedSize = 0,
+ cudaStream_t stream = 0);
+extern "C" cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim,
+ dim3 blockDim, void **args,
+ size_t sharedMem, cudaStream_t stream);
+extern "C" cudaError_t cudaLaunchKernel_ptsz(const void *func, dim3 gridDim,
+ dim3 blockDim, void **args,
+ size_t sharedMem, cudaStream_t stream);
+
+#endif
+
+extern "C" __device__ int printf(const char*, ...);
>From 1892d27374892cb04af492c4cc63a9129056d257 Mon Sep 17 00:00:00 2001
From: David Rivera <davidriverg at gmail.com>
Date: Sat, 24 Jan 2026 14:39:09 -0500
Subject: [PATCH 2/8] make test include cleaner
---
clang/test/CIR/CodeGen/CUDA/kernel-stub-name.cu | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/clang/test/CIR/CodeGen/CUDA/kernel-stub-name.cu b/clang/test/CIR/CodeGen/CUDA/kernel-stub-name.cu
index 6d5efb69827e3..da2dbd9843c7c 100644
--- a/clang/test/CIR/CodeGen/CUDA/kernel-stub-name.cu
+++ b/clang/test/CIR/CodeGen/CUDA/kernel-stub-name.cu
@@ -1,10 +1,10 @@
// Based on clang/test/CodeGenCUDA/kernel-stub-name.cu.
// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-cir %s \
-// RUN: -x cuda -o %t.cir
+// RUN: -I%S/../inputs/ -x cuda -o %t.cir
// RUN: FileCheck --input-file=%t.cir %s
-#include "../inputs/cuda.h"
+#include "cuda.h"
// CHECK: cir.func {{.*}} @__device_stub__ckernel()
// CHECK-NEXT: cir.return
>From 601edf35ccd87c52d5319922238ebb3196265a6c Mon Sep 17 00:00:00 2001
From: David Rivera <davidriverg at gmail.com>
Date: Tue, 27 Jan 2026 15:59:09 -0500
Subject: [PATCH 3/8] [CIR][CUDA][HIP] Implement stub body emission
---
clang/lib/CIR/CodeGen/CIRGenCUDANV.cpp | 358 ++++++++++++++++++
clang/lib/CIR/CodeGen/CIRGenCUDARuntime.cpp | 20 +
clang/lib/CIR/CodeGen/CIRGenCUDARuntime.h | 50 +++
clang/lib/CIR/CodeGen/CIRGenFunction.cpp | 4 +-
clang/lib/CIR/CodeGen/CIRGenModule.cpp | 14 +-
clang/lib/CIR/CodeGen/CIRGenModule.h | 9 +
clang/lib/CIR/CodeGen/CMakeLists.txt | 2 +
clang/test/CIR/CodeGen/CUDA/kernel-call.cu | 18 +
.../test/CIR/CodeGen/CUDA/kernel-stub-name.cu | 8 +-
clang/test/CIR/CodeGen/inputs/cuda.h | 6 +
10 files changed, 481 insertions(+), 8 deletions(-)
create mode 100644 clang/lib/CIR/CodeGen/CIRGenCUDANV.cpp
create mode 100644 clang/lib/CIR/CodeGen/CIRGenCUDARuntime.cpp
create mode 100644 clang/lib/CIR/CodeGen/CIRGenCUDARuntime.h
create mode 100644 clang/test/CIR/CodeGen/CUDA/kernel-call.cu
diff --git a/clang/lib/CIR/CodeGen/CIRGenCUDANV.cpp b/clang/lib/CIR/CodeGen/CIRGenCUDANV.cpp
new file mode 100644
index 0000000000000..acdc811b7a308
--- /dev/null
+++ b/clang/lib/CIR/CodeGen/CIRGenCUDANV.cpp
@@ -0,0 +1,358 @@
+//===- CIRGenCUDANV.cpp - Interface to NVIDIA CUDA Runtime -----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This provides an abstract class for CUDA CIR generation. Concrete
+// subclasses of this implement code generation for specific OpenCL
+// runtime libraries.
+//
+//===----------------------------------------------------------------------===//
+
+#include "CIRGenCUDARuntime.h"
+#include "CIRGenFunction.h"
+#include "CIRGenModule.h"
+#include "mlir/IR/Operation.h"
+#include "clang/AST/ASTContext.h"
+#include "clang/AST/Decl.h"
+#include "clang/AST/GlobalDecl.h"
+#include "clang/Basic/AddressSpaces.h"
+#include "clang/Basic/Cuda.h"
+#include "clang/CIR/Dialect/IR/CIRDialect.h"
+#include "clang/CIR/Dialect/IR/CIRTypes.h"
+#include "llvm/Support/Casting.h"
+
+using namespace clang;
+using namespace clang::CIRGen;
+
+namespace {
+
+class CIRGenNVCUDARuntime : public CIRGenCUDARuntime {
+protected:
+ StringRef Prefix;
+
+ // Map a device stub function to a symbol for identifying kernel in host
+ // code. For CUDA, the symbol for identifying the kernel is the same as the
+ // device stub function. For HIP, they are different.
+ llvm::DenseMap<StringRef, mlir::Operation *> kernelHandles;
+
+ // Map a kernel handle to the kernel stub.
+ llvm::DenseMap<mlir::Operation *, mlir::Operation *> kernelStubs;
+ // Mangle context for device.
+ std::unique_ptr<MangleContext> deviceMC;
+
+private:
+ void emitDeviceStubBodyNew(CIRGenFunction &cgf, cir::FuncOp fn,
+ FunctionArgList &args);
+ mlir::Value prepareKernelArgs(CIRGenFunction &cgf, mlir::Location loc,
+ FunctionArgList &args);
+ mlir::Operation *getKernelHandle(cir::FuncOp fn, GlobalDecl gd) override;
+ std::string addPrefixToName(StringRef funcName) const;
+ std::string addUnderscoredPrefixToName(StringRef funcName) const;
+
+public:
+ CIRGenNVCUDARuntime(CIRGenModule &cgm);
+ ~CIRGenNVCUDARuntime();
+
+ void emitDeviceStub(CIRGenFunction &cgf, cir::FuncOp fn,
+ FunctionArgList &args) override;
+};
+
+} // namespace
+
+std::string CIRGenNVCUDARuntime::addPrefixToName(StringRef funcName) const {
+ return (Prefix + funcName).str();
+}
+
+std::string
+CIRGenNVCUDARuntime::addUnderscoredPrefixToName(StringRef funcName) const {
+ return ("__" + Prefix + funcName).str();
+}
+
+static std::unique_ptr<MangleContext> initDeviceMC(CIRGenModule &cgm) {
+ // If the host and device have different C++ ABIs, mark it as the device
+ // mangle context so that the mangling needs to retrieve the additional
+ // device lambda mangling number instead of the regular host one.
+ if (cgm.getASTContext().getAuxTargetInfo() &&
+ cgm.getASTContext().getTargetInfo().getCXXABI().isMicrosoft() &&
+ cgm.getASTContext().getAuxTargetInfo()->getCXXABI().isItaniumFamily()) {
+ return std::unique_ptr<MangleContext>(
+ cgm.getASTContext().createDeviceMangleContext(
+ *cgm.getASTContext().getAuxTargetInfo()));
+ }
+
+ return std::unique_ptr<MangleContext>(cgm.getASTContext().createMangleContext(
+ cgm.getASTContext().getAuxTargetInfo()));
+}
+
+CIRGenNVCUDARuntime::CIRGenNVCUDARuntime(CIRGenModule &cgm)
+ : CIRGenCUDARuntime(cgm), deviceMC(initDeviceMC(cgm)) {
+ if (cgm.getLangOpts().OffloadViaLLVM)
+ llvm_unreachable("NYI");
+ else if (cgm.getLangOpts().HIP)
+ Prefix = "hip";
+ else
+ Prefix = "cuda";
+}
+
+mlir::Value CIRGenNVCUDARuntime::prepareKernelArgs(CIRGenFunction &cgf,
+ mlir::Location loc,
+ FunctionArgList &args) {
+ auto &builder = cgm.getBuilder();
+
+ // Build void *args[] and populate with the addresses of kernel arguments.
+ auto voidPtrArrayTy = cir::ArrayType::get(cgm.voidPtrTy, args.size());
+ mlir::Value kernelArgs = builder.createAlloca(
+ loc, cir::PointerType::get(voidPtrArrayTy), voidPtrArrayTy, "kernel_args",
+ CharUnits::fromQuantity(16));
+
+ mlir::Value kernelArgsDecayed =
+ builder.createCast(cir::CastKind::array_to_ptrdecay, kernelArgs,
+ cir::PointerType::get(cgm.voidPtrTy));
+
+ for (auto [i, arg] : llvm::enumerate(args)) {
+ mlir::Value index =
+ builder.getConstInt(loc, llvm::APInt(/*numBits=*/32, i));
+ mlir::Value storePos =
+ builder.createPtrStride(loc, kernelArgsDecayed, index);
+
+ // Get the address of the argument and cast the store destination to match
+ // its pointer-to-pointer type. This is needed because upstream's
+ // createStore doesn't auto-bitcast like the incubator version.
+ mlir::Value argAddr = cgf.getAddrOfLocalVar(arg).getPointer();
+ mlir::Value storePosTyped = builder.createBitcast(
+ storePos, cir::PointerType::get(argAddr.getType()));
+
+ builder.CIRBaseBuilderTy::createStore(loc, argAddr, storePosTyped);
+ }
+
+ return kernelArgsDecayed;
+}
+
+// CUDA 9.0+ uses new way to launch kernels. Parameters are packed in a local
+// array and kernels are launched using cudaLaunchKernel().
+void CIRGenNVCUDARuntime::emitDeviceStubBodyNew(CIRGenFunction &cgf,
+ cir::FuncOp fn,
+ FunctionArgList &args) {
+
+ // This requires arguments to be sent to kernels in a different way.
+ if (cgm.getLangOpts().OffloadViaLLVM)
+ cgm.errorNYI("Offload via LLVM");
+
+ auto &builder = cgm.getBuilder();
+ auto loc = fn.getLoc();
+
+ // For [cuda|hip]LaunchKernel, we must add another layer of indirection
+ // to arguments. For example, for function `add(int a, float b)`,
+ // we need to pass it as `void *args[2] = { &a, &b }`.
+ mlir::Value kernelArgs = prepareKernelArgs(cgf, loc, args);
+
+ // Lookup cudaLaunchKernel/hipLaunchKernel function.
+ // HIP kernel launching API name depends on -fgpu-default-stream option. For
+ // the default value 'legacy', it is hipLaunchKernel. For 'per-thread',
+ // it is hipLaunchKernel_spt.
+ // cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
+ // void **args, size_t sharedMem,
+ // cudaStream_t stream);
+ // hipError_t hipLaunchKernel[_spt](const void *func, dim3 gridDim,
+ // dim3 blockDim, void **args,
+ // size_t sharedMem, hipStream_t stream);
+ TranslationUnitDecl *tuDecl = cgm.getASTContext().getTranslationUnitDecl();
+ DeclContext *dc = TranslationUnitDecl::castToDeclContext(tuDecl);
+
+ // The default stream is usually stream 0 (the legacy default stream).
+ // For per-thread default stream, we need a different LaunchKernel function.
+ std::string kernelLaunchAPI = "LaunchKernel";
+ if (cgm.getLangOpts().GPUDefaultStream ==
+ LangOptions::GPUDefaultStreamKind::PerThread)
+ cgm.errorNYI("CUDA/HIP Stream per thread");
+
+ std::string launchKernelName = addPrefixToName(kernelLaunchAPI);
+ const IdentifierInfo &launchII =
+ cgm.getASTContext().Idents.get(launchKernelName);
+ FunctionDecl *cudaLaunchKernelFD = nullptr;
+ for (auto *result : dc->lookup(&launchII)) {
+ if (FunctionDecl *fd = dyn_cast<FunctionDecl>(result))
+ cudaLaunchKernelFD = fd;
+ }
+
+ if (cudaLaunchKernelFD == nullptr) {
+ cgm.error(cgf.curFuncDecl->getLocation(),
+ "Can't find declaration for " + launchKernelName);
+ return;
+ }
+
+ // Use this function to retrieve arguments for cudaLaunchKernel:
+ // int __[cuda|hip]PopCallConfiguration(dim3 *gridDim, dim3 *blockDim, size_t
+ // *sharedMem, cudaStream_t *stream)
+ //
+ // Here [cuda|hip]Stream_t, while also being the 6th argument of
+ // [cuda|hip]LaunchKernel, is a pointer to some opaque struct.
+
+ mlir::Type dim3Ty = cgf.getTypes().convertType(
+ cudaLaunchKernelFD->getParamDecl(1)->getType());
+ mlir::Type streamTy = cgf.getTypes().convertType(
+ cudaLaunchKernelFD->getParamDecl(5)->getType());
+
+ mlir::Value gridDim =
+ builder.createAlloca(loc, cir::PointerType::get(dim3Ty), dim3Ty,
+ "grid_dim", CharUnits::fromQuantity(8));
+ mlir::Value blockDim =
+ builder.createAlloca(loc, cir::PointerType::get(dim3Ty), dim3Ty,
+ "block_dim", CharUnits::fromQuantity(8));
+ mlir::Value sharedMem =
+ builder.createAlloca(loc, cir::PointerType::get(cgm.sizeTy), cgm.sizeTy,
+ "shared_mem", cgm.getSizeAlign());
+ mlir::Value stream =
+ builder.createAlloca(loc, cir::PointerType::get(streamTy), streamTy,
+ "stream", cgm.getPointerAlign());
+
+ cir::FuncOp popConfig = cgm.createRuntimeFunction(
+ cir::FuncType::get({gridDim.getType(), blockDim.getType(),
+ sharedMem.getType(), stream.getType()},
+ cgm.sInt32Ty),
+ addUnderscoredPrefixToName("PopCallConfiguration"));
+ cgf.emitRuntimeCall(loc, popConfig, {gridDim, blockDim, sharedMem, stream});
+
+ // Now emit the call to cudaLaunchKernel
+ // [cuda|hip]Error_t [cuda|hip]LaunchKernel(const void *func, dim3 gridDim,
+ // dim3 blockDim,
+ // void **args, size_t sharedMem,
+ // [cuda|hip]Stream_t stream);
+
+ // We now either pick the function or the stub global for cuda, hip
+ // resepectively.
+ auto kernel = [&]() {
+ if (auto globalOp = llvm::dyn_cast_or_null<cir::GlobalOp>(
+ kernelHandles[fn.getSymName()])) {
+ auto kernelTy = cir::PointerType::get(globalOp.getSymType());
+ mlir::Value kernel = cir::GetGlobalOp::create(builder, loc, kernelTy,
+ globalOp.getSymName());
+ return kernel;
+ }
+ if (auto funcOp = llvm::dyn_cast_or_null<cir::FuncOp>(
+ kernelHandles[fn.getSymName()])) {
+ auto kernelTy = cir::PointerType::get(funcOp.getFunctionType());
+ mlir::Value kernel =
+ cir::GetGlobalOp::create(builder, loc, kernelTy, funcOp.getSymName());
+ mlir::Value func = builder.createBitcast(kernel, cgm.voidPtrTy);
+ return func;
+ }
+ assert(false && "Expected stub handle to be cir::GlobalOp or funcOp");
+ }();
+
+ CallArgList launchArgs;
+ launchArgs.add(RValue::get(kernel),
+ cudaLaunchKernelFD->getParamDecl(0)->getType());
+ launchArgs.add(
+ RValue::getAggregate(Address(gridDim, CharUnits::fromQuantity(8))),
+ cudaLaunchKernelFD->getParamDecl(1)->getType());
+ launchArgs.add(
+ RValue::getAggregate(Address(blockDim, CharUnits::fromQuantity(8))),
+ cudaLaunchKernelFD->getParamDecl(2)->getType());
+ launchArgs.add(RValue::get(kernelArgs),
+ cudaLaunchKernelFD->getParamDecl(3)->getType());
+ launchArgs.add(
+ RValue::get(builder.CIRBaseBuilderTy::createLoad(loc, sharedMem)),
+ cudaLaunchKernelFD->getParamDecl(4)->getType());
+ launchArgs.add(RValue::get(builder.CIRBaseBuilderTy::createLoad(loc, stream)),
+ cudaLaunchKernelFD->getParamDecl(5)->getType());
+
+ mlir::Type launchTy =
+ cgm.getTypes().convertType(cudaLaunchKernelFD->getType());
+ mlir::Operation *cudaKernelLauncherFn = cgm.createRuntimeFunction(
+ cast<cir::FuncType>(launchTy), launchKernelName);
+ const auto &callInfo =
+ cgm.getTypes().arrangeFunctionDeclaration(cudaLaunchKernelFD);
+ cgf.emitCall(callInfo, CIRGenCallee::forDirect(cudaKernelLauncherFn),
+ ReturnValueSlot(), launchArgs);
+
+ if (cgm.getASTContext().getTargetInfo().getCXXABI().isMicrosoft() &&
+ !cgf.getLangOpts().HIP)
+ cgm.errorNYI("MSVC CUDA stub handling");
+}
+
+void CIRGenNVCUDARuntime::emitDeviceStub(CIRGenFunction &cgf, cir::FuncOp fn,
+ FunctionArgList &args) {
+
+ if (auto globalOp =
+ llvm::dyn_cast<cir::GlobalOp>(kernelHandles[fn.getSymName()])) {
+ auto &builder = cgm.getBuilder();
+ auto fnPtrTy = globalOp.getSymType();
+ auto sym = mlir::FlatSymbolRefAttr::get(fn.getSymNameAttr());
+ auto gv = cir::GlobalViewAttr::get(fnPtrTy, sym);
+
+ globalOp->setAttr("initial_value", gv);
+ globalOp->removeAttr("sym_visibility");
+ globalOp->setAttr("alignment", builder.getI64IntegerAttr(
+ cgm.getPointerAlign().getQuantity()));
+ }
+
+ // CUDA 9.0 changed the way to launch kernels.
+ if (CudaFeatureEnabled(cgm.getTarget().getSDKVersion(),
+ CudaFeature::CUDA_USES_NEW_LAUNCH) ||
+ (cgm.getLangOpts().HIP && cgm.getLangOpts().HIPUseNewLaunchAPI) ||
+ cgm.getLangOpts().OffloadViaLLVM)
+ emitDeviceStubBodyNew(cgf, fn, args);
+ else
+ cgm.errorNYI("Emit Stub Body Legacy");
+}
+
+CIRGenCUDARuntime *clang::CIRGen::createNVCUDARuntime(CIRGenModule &cgm) {
+ return new CIRGenNVCUDARuntime(cgm);
+}
+
+CIRGenNVCUDARuntime::~CIRGenNVCUDARuntime() {}
+
+mlir::Operation *CIRGenNVCUDARuntime::getKernelHandle(cir::FuncOp fn,
+ GlobalDecl GD) {
+
+ // Check if we already have a kernel handle for this function
+ auto Loc = kernelHandles.find(fn.getSymName());
+ if (Loc != kernelHandles.end()) {
+ auto OldHandle = Loc->second;
+ // Here we know that the fn did not change. Return it
+ if (kernelStubs[OldHandle] == fn)
+ return OldHandle;
+
+ // We've found the function name, but F itself has changed, so we need to
+ // update the references.
+ if (cgm.getLangOpts().HIP) {
+ // For HIP compilation the handle itself does not change, so we only need
+ // to update the Stub value.
+ kernelStubs[OldHandle] = fn;
+ return OldHandle;
+ }
+ // For non-HIP compilation, erase the old Stub and fall-through to creating
+ // new entries.
+ kernelStubs.erase(OldHandle);
+ }
+
+ // If not targeting HIP, store the function itself
+ if (!cgm.getLangOpts().HIP) {
+ kernelHandles[fn.getSymName()] = fn;
+ kernelStubs[fn] = fn;
+ return fn;
+ }
+
+ // Create a new CIR global variable to represent the kernel handle
+ auto &builder = cgm.getBuilder();
+ auto globalName = cgm.getMangledName(
+ GD.getWithKernelReferenceKind(KernelReferenceKind::Kernel));
+ const VarDecl *varDecl = llvm::dyn_cast_or_null<VarDecl>(GD.getDecl());
+ cir::GlobalOp globalOp =
+ cgm.getOrCreateCIRGlobal(globalName, fn.getFunctionType().getReturnType(),
+ LangAS::Default, varDecl, NotForDefinition);
+
+ globalOp->setAttr("alignment", builder.getI64IntegerAttr(
+ cgm.getPointerAlign().getQuantity()));
+
+ // Store references
+ kernelHandles[fn.getSymName()] = globalOp;
+ kernelStubs[globalOp] = fn;
+
+ return globalOp;
+}
\ No newline at end of file
diff --git a/clang/lib/CIR/CodeGen/CIRGenCUDARuntime.cpp b/clang/lib/CIR/CodeGen/CIRGenCUDARuntime.cpp
new file mode 100644
index 0000000000000..c438c968c24ce
--- /dev/null
+++ b/clang/lib/CIR/CodeGen/CIRGenCUDARuntime.cpp
@@ -0,0 +1,20 @@
+//===----- CIRGenCUDARuntime.cpp - Interface to CUDA Runtimes -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This provides an abstract class for CUDA CIR generation. Concrete
+// subclasses of this implement code generation for specific CUDA
+// runtime libraries.
+//
+//===----------------------------------------------------------------------===//
+
+#include "CIRGenCUDARuntime.h"
+
+using namespace clang;
+using namespace CIRGen;
+
+CIRGenCUDARuntime::~CIRGenCUDARuntime() {}
diff --git a/clang/lib/CIR/CodeGen/CIRGenCUDARuntime.h b/clang/lib/CIR/CodeGen/CIRGenCUDARuntime.h
new file mode 100644
index 0000000000000..a0809c1d185b8
--- /dev/null
+++ b/clang/lib/CIR/CodeGen/CIRGenCUDARuntime.h
@@ -0,0 +1,50 @@
+//===------ CIRGenCUDARuntime.h - Interface to CUDA Runtimes -----*- C++ -*-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This provides an abstract class for CUDA CIR generation. Concrete
+// subclasses of this implement code generation for specific OpenCL
+// runtime libraries.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_LIB_CIR_CIRGENCUDARUNTIME_H
+#define LLVM_CLANG_LIB_CIR_CIRGENCUDARUNTIME_H
+
+#include "clang/CIR/Dialect/IR/CIRDialect.h"
+
+namespace clang {
+class CUDAKernelCallExpr;
+}
+
+namespace clang::CIRGen {
+
+class CIRGenFunction;
+class CIRGenModule;
+class FunctionArgList;
+class RValue;
+class ReturnValueSlot;
+
+class CIRGenCUDARuntime {
+protected:
+ CIRGenModule &cgm;
+
+public:
+ CIRGenCUDARuntime(CIRGenModule &cgm) : cgm(cgm) {}
+ virtual ~CIRGenCUDARuntime();
+
+ virtual void emitDeviceStub(CIRGenFunction &cgf, cir::FuncOp fn,
+ FunctionArgList &args) = 0;
+
+ virtual mlir::Operation *getKernelHandle(cir::FuncOp fn, GlobalDecl gd) = 0;
+};
+
+CIRGenCUDARuntime *createNVCUDARuntime(CIRGenModule &cgm);
+
+} // namespace clang::CIRGen
+
+#endif // LLVM_CLANG_LIB_CIR_CIRGENCUDARUNTIME_H
diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp
index 4c212b06019ea..c900797e54c81 100644
--- a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp
@@ -748,9 +748,7 @@ cir::FuncOp CIRGenFunction::generateCode(clang::GlobalDecl gd, cir::FuncOp fn,
emitConstructorBody(args);
} else if (getLangOpts().CUDA && !getLangOpts().CUDAIsDevice &&
funcDecl->hasAttr<CUDAGlobalAttr>()) {
- // TODO(cir): Emit device stub body with kernel launch runtime calls
- // (emitDeviceStub). For now, emit an empty stub.
- assert(!cir::MissingFeatures::cudaSupport());
+ cgm.getCUDARuntime().emitDeviceStub(*this, fn, args);
} else if (isa<CXXMethodDecl>(funcDecl) &&
cast<CXXMethodDecl>(funcDecl)->isLambdaStaticInvoker()) {
// The lambda static invoker function is special, because it forwards or
diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.cpp b/clang/lib/CIR/CodeGen/CIRGenModule.cpp
index b535eab913a5d..8cef5408bbfc1 100644
--- a/clang/lib/CIR/CodeGen/CIRGenModule.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenModule.cpp
@@ -11,6 +11,7 @@
//===----------------------------------------------------------------------===//
#include "CIRGenModule.h"
+#include "CIRGenCUDARuntime.h"
#include "CIRGenCXXABI.h"
#include "CIRGenConstantEmitter.h"
#include "CIRGenFunction.h"
@@ -31,6 +32,7 @@
#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/Location.h"
#include "mlir/IR/MLIRContext.h"
+#include "mlir/IR/Operation.h"
#include "mlir/IR/Verifier.h"
#include <algorithm>
@@ -68,7 +70,8 @@ CIRGenModule::CIRGenModule(mlir::MLIRContext &mlirContext,
langOpts(astContext.getLangOpts()), codeGenOpts(cgo),
theModule{mlir::ModuleOp::create(mlir::UnknownLoc::get(&mlirContext))},
diags(diags), target(astContext.getTargetInfo()),
- abi(createCXXABI(*this)), genTypes(*this), vtables(*this) {
+ abi(createCXXABI(*this)), genTypes(*this), vtables(*this),
+ cudaRuntime(clang::CIRGen::createNVCUDARuntime((*this))) {
// Initialize cached types
voidTy = cir::VoidType::get(&getMLIRContext());
@@ -1748,6 +1751,15 @@ cir::FuncOp CIRGenModule::getAddrOfFunction(clang::GlobalDecl gd,
cir::FuncOp func =
getOrCreateCIRFunction(mangledName, funcType, gd, forVTable, dontDefer,
/*isThunk=*/false, isForDefinition);
+ // Returns kernel handle for HIP kernel stub function.
+ if (langOpts.CUDA && !langOpts.CUDAIsDevice &&
+ cast<FunctionDecl>(gd.getDecl())->hasAttr<CUDAGlobalAttr>()) {
+ mlir::Operation *handle = getCUDARuntime().getKernelHandle(func, gd);
+
+ if (isForDefinition)
+ return func;
+ return mlir::dyn_cast<cir::FuncOp>(*handle);
+ }
return func;
}
diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.h b/clang/lib/CIR/CodeGen/CIRGenModule.h
index 3c4f35bacc4f9..6f301bd83d373 100644
--- a/clang/lib/CIR/CodeGen/CIRGenModule.h
+++ b/clang/lib/CIR/CodeGen/CIRGenModule.h
@@ -15,6 +15,7 @@
#include "CIRGenBuilder.h"
#include "CIRGenCall.h"
+#include "CIRGenCUDARuntime.h"
#include "CIRGenTypeCache.h"
#include "CIRGenTypes.h"
#include "CIRGenVTables.h"
@@ -90,6 +91,9 @@ class CIRGenModule : public CIRGenTypeCache {
/// Holds information about C++ vtables.
CIRGenVTables vtables;
+ /// Holds the CUDA runtime
+ std::unique_ptr<CIRGenCUDARuntime> cudaRuntime;
+
/// Per-function codegen information. Updated everytime emitCIR is called
/// for FunctionDecls's.
CIRGenFunction *curCGF = nullptr;
@@ -593,6 +597,11 @@ class CIRGenModule : public CIRGenTypeCache {
/// Function* for "fabsf".
cir::FuncOp getBuiltinLibFunction(const FunctionDecl *fd, unsigned builtinID);
+ CIRGenCUDARuntime &getCUDARuntime() {
+ assert(cudaRuntime != nullptr);
+ return *cudaRuntime;
+ }
+
mlir::IntegerAttr getSize(CharUnits size) {
return builder.getSizeFromCharUnits(size);
}
diff --git a/clang/lib/CIR/CodeGen/CMakeLists.txt b/clang/lib/CIR/CodeGen/CMakeLists.txt
index 8efa587f31aac..ff5e666a72bef 100644
--- a/clang/lib/CIR/CodeGen/CMakeLists.txt
+++ b/clang/lib/CIR/CodeGen/CMakeLists.txt
@@ -18,6 +18,8 @@ add_clang_library(clangCIR
CIRGenClass.cpp
CIRGenCleanup.cpp
CIRGenCoroutine.cpp
+ CIRGenCUDANV.cpp
+ CIRGenCUDARuntime.cpp
CIRGenCXX.cpp
CIRGenCXXABI.cpp
CIRGenDecl.cpp
diff --git a/clang/test/CIR/CodeGen/CUDA/kernel-call.cu b/clang/test/CIR/CodeGen/CUDA/kernel-call.cu
new file mode 100644
index 0000000000000..d1dae134b0230
--- /dev/null
+++ b/clang/test/CIR/CodeGen/CUDA/kernel-call.cu
@@ -0,0 +1,18 @@
+// Based on clang/test/CodeGenCUDA/kernel-call.cu.
+// Tests device stub body emission for CUDA kernels.
+
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -target-sdk-version=9.2 \
+// RUN: -emit-cir %s -I%S/../inputs/ -x cuda -o %t.cir
+// RUN: FileCheck --input-file=%t.cir %s --check-prefix=CUDA-NEW
+
+
+#include "cuda.h"
+
+
+// TODO: Test CUDA legacy (< 9.0) when legacy stub body is implemented
+// TODO: Test HIP when HIP stub body support is complete
+
+// CUDA-NEW-LABEL: cir.func {{.*}} @_Z21__device_stub__kernelv
+// CUDA-NEW: cir.call @__cudaPopCallConfiguration
+// CUDA-NEW: cir.call @cudaLaunchKernel
+__global__ void kernel() {}
diff --git a/clang/test/CIR/CodeGen/CUDA/kernel-stub-name.cu b/clang/test/CIR/CodeGen/CUDA/kernel-stub-name.cu
index da2dbd9843c7c..0edf256ccf961 100644
--- a/clang/test/CIR/CodeGen/CUDA/kernel-stub-name.cu
+++ b/clang/test/CIR/CodeGen/CUDA/kernel-stub-name.cu
@@ -1,13 +1,13 @@
// Based on clang/test/CodeGenCUDA/kernel-stub-name.cu.
-// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-cir %s \
-// RUN: -I%S/../inputs/ -x cuda -o %t.cir
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -target-sdk-version=9.2 \
+// RUN: -emit-cir %s -I%S/../inputs/ -x cuda -o %t.cir
// RUN: FileCheck --input-file=%t.cir %s
#include "cuda.h"
-// CHECK: cir.func {{.*}} @__device_stub__ckernel()
-// CHECK-NEXT: cir.return
+// CHECK: cir.func {{.*}} @[[CSTUB:__device_stub__ckernel]]()
+// CHECK: cir.return
// CHECK-NEXT: }
extern "C" __global__ void ckernel() {}
diff --git a/clang/test/CIR/CodeGen/inputs/cuda.h b/clang/test/CIR/CodeGen/inputs/cuda.h
index 204bf2972088d..225c7dfdcf0db 100644
--- a/clang/test/CIR/CodeGen/inputs/cuda.h
+++ b/clang/test/CIR/CodeGen/inputs/cuda.h
@@ -37,6 +37,9 @@ int hipConfigureCall(dim3 gridSize, dim3 blockSize, size_t sharedSize = 0,
extern "C" hipError_t __hipPushCallConfiguration(dim3 gridSize, dim3 blockSize,
size_t sharedSize = 0,
hipStream_t stream = 0);
+extern "C" int __hipPopCallConfiguration(dim3 *gridSize, dim3 *blockSize,
+ size_t *sharedSize,
+ hipStream_t *stream);
#ifndef __HIP_API_PER_THREAD_DEFAULT_STREAM__
extern "C" hipError_t hipLaunchKernel(const void *func, dim3 gridDim,
dim3 blockDim, void **args,
@@ -62,6 +65,9 @@ extern "C" int cudaConfigureCall(dim3 gridSize, dim3 blockSize,
extern "C" int __cudaPushCallConfiguration(dim3 gridSize, dim3 blockSize,
size_t sharedSize = 0,
cudaStream_t stream = 0);
+extern "C" int __cudaPopCallConfiguration(dim3 *gridSize, dim3 *blockSize,
+ size_t *sharedSize,
+ cudaStream_t *stream);
extern "C" cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim,
dim3 blockDim, void **args,
size_t sharedMem, cudaStream_t stream);
>From 4509e1ce33324ab380eca5c27e8af8385f6e1a38 Mon Sep 17 00:00:00 2001
From: David Rivera <davidriverg at gmail.com>
Date: Tue, 27 Jan 2026 16:01:55 -0500
Subject: [PATCH 4/8] fix fmt
---
clang/lib/CIR/CodeGen/CIRGenModule.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.h b/clang/lib/CIR/CodeGen/CIRGenModule.h
index 6f301bd83d373..9b12a5fe26e04 100644
--- a/clang/lib/CIR/CodeGen/CIRGenModule.h
+++ b/clang/lib/CIR/CodeGen/CIRGenModule.h
@@ -14,8 +14,8 @@
#define LLVM_CLANG_LIB_CIR_CODEGEN_CIRGENMODULE_H
#include "CIRGenBuilder.h"
-#include "CIRGenCall.h"
#include "CIRGenCUDARuntime.h"
+#include "CIRGenCall.h"
#include "CIRGenTypeCache.h"
#include "CIRGenTypes.h"
#include "CIRGenVTables.h"
>From b260ed6dcade871d9e7bbaf92fda49eff3e61fbb Mon Sep 17 00:00:00 2001
From: David Rivera <davidriverg at gmail.com>
Date: Tue, 27 Jan 2026 16:05:09 -0500
Subject: [PATCH 5/8] nit: parity with og on runtime headers
---
clang/lib/CIR/CodeGen/CIRGenCUDANV.cpp | 5 ++---
clang/lib/CIR/CodeGen/CIRGenCUDARuntime.cpp | 2 +-
clang/lib/CIR/CodeGen/CIRGenCUDARuntime.h | 4 ++--
3 files changed, 5 insertions(+), 6 deletions(-)
diff --git a/clang/lib/CIR/CodeGen/CIRGenCUDANV.cpp b/clang/lib/CIR/CodeGen/CIRGenCUDANV.cpp
index acdc811b7a308..9a6eaafcbd439 100644
--- a/clang/lib/CIR/CodeGen/CIRGenCUDANV.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenCUDANV.cpp
@@ -6,9 +6,8 @@
//
//===----------------------------------------------------------------------===//
//
-// This provides an abstract class for CUDA CIR generation. Concrete
-// subclasses of this implement code generation for specific OpenCL
-// runtime libraries.
+// This provides a class for CUDA code generation targeting the NVIDIA CUDA
+// runtime library.
//
//===----------------------------------------------------------------------===//
diff --git a/clang/lib/CIR/CodeGen/CIRGenCUDARuntime.cpp b/clang/lib/CIR/CodeGen/CIRGenCUDARuntime.cpp
index c438c968c24ce..14189ad7a52f3 100644
--- a/clang/lib/CIR/CodeGen/CIRGenCUDARuntime.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenCUDARuntime.cpp
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
//
-// This provides an abstract class for CUDA CIR generation. Concrete
+// This provides an abstract class for CUDA code generation. Concrete
// subclasses of this implement code generation for specific CUDA
// runtime libraries.
//
diff --git a/clang/lib/CIR/CodeGen/CIRGenCUDARuntime.h b/clang/lib/CIR/CodeGen/CIRGenCUDARuntime.h
index a0809c1d185b8..83eb0c02188ba 100644
--- a/clang/lib/CIR/CodeGen/CIRGenCUDARuntime.h
+++ b/clang/lib/CIR/CodeGen/CIRGenCUDARuntime.h
@@ -6,8 +6,8 @@
//
//===----------------------------------------------------------------------===//
//
-// This provides an abstract class for CUDA CIR generation. Concrete
-// subclasses of this implement code generation for specific OpenCL
+// This provides an abstract class for CUDA code generation. Concrete
+// subclasses of this implement code generation for specific CUDA
// runtime libraries.
//
//===----------------------------------------------------------------------===//
>From 8e3701e1e3605ff48f366b82b91ffb2dc10a6e32 Mon Sep 17 00:00:00 2001
From: David Rivera <davidriverg at gmail.com>
Date: Fri, 30 Jan 2026 13:04:17 -0500
Subject: [PATCH 6/8] address comments and adapt a bunch of lines to proper
coding standards
---
clang/include/clang/AST/ASTContext.h | 2 +
clang/lib/AST/ASTContext.cpp | 12 +++
clang/lib/CIR/CodeGen/CIRGenCUDANV.cpp | 116 +++++++++------------
clang/lib/CIR/CodeGen/CIRGenModule.cpp | 11 +-
clang/lib/CIR/CodeGen/CIRGenModule.h | 2 +
clang/test/CIR/CodeGen/CUDA/kernel-call.cu | 40 ++++++-
6 files changed, 111 insertions(+), 72 deletions(-)
diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h
index 68205dd1c1fd9..c8d6de1689512 100644
--- a/clang/include/clang/AST/ASTContext.h
+++ b/clang/include/clang/AST/ASTContext.h
@@ -2883,6 +2883,8 @@ class ASTContext : public RefCountedBase<ASTContext> {
/// (from the AuxTargetInfo) is a an itanium target.
MangleContext *createDeviceMangleContext(const TargetInfo &T);
+ MangleContext *cudaNVInitDeviceMC();
+
void DeepCollectObjCIvars(const ObjCInterfaceDecl *OI, bool leafClass,
SmallVectorImpl<const ObjCIvarDecl*> &Ivars) const;
diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp
index f52470a4d7458..3f63420cae91e 100644
--- a/clang/lib/AST/ASTContext.cpp
+++ b/clang/lib/AST/ASTContext.cpp
@@ -13224,6 +13224,18 @@ MangleContext *ASTContext::createDeviceMangleContext(const TargetInfo &T) {
llvm_unreachable("Unsupported ABI");
}
+MangleContext *ASTContext::cudaNVInitDeviceMC() {
+ // If the host and device have different C++ ABIs, mark it as the device
+ // mangle context so that the mangling needs to retrieve the additional
+ // device lambda mangling number instead of the regular host one.
+ if (getAuxTargetInfo() && getTargetInfo().getCXXABI().isMicrosoft() &&
+ getAuxTargetInfo()->getCXXABI().isItaniumFamily()) {
+ return createDeviceMangleContext(*getAuxTargetInfo());
+ }
+
+ return createMangleContext(getAuxTargetInfo());
+}
+
CXXABI::~CXXABI() = default;
size_t ASTContext::getSideTableAllocatedMemory() const {
diff --git a/clang/lib/CIR/CodeGen/CIRGenCUDANV.cpp b/clang/lib/CIR/CodeGen/CIRGenCUDANV.cpp
index 9a6eaafcbd439..434c8003af27c 100644
--- a/clang/lib/CIR/CodeGen/CIRGenCUDANV.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenCUDANV.cpp
@@ -1,4 +1,4 @@
-//===- CIRGenCUDANV.cpp - Interface to NVIDIA CUDA Runtime -----===//
+//========- CIRGenCUDANV.cpp - Interface to NVIDIA CUDA Runtime -----=========//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -31,12 +31,12 @@ namespace {
class CIRGenNVCUDARuntime : public CIRGenCUDARuntime {
protected:
- StringRef Prefix;
+ StringRef prefix;
// Map a device stub function to a symbol for identifying kernel in host
// code. For CUDA, the symbol for identifying the kernel is the same as the
// device stub function. For HIP, they are different.
- llvm::DenseMap<StringRef, mlir::Operation *> kernelHandles;
+ llvm::StringMap<mlir::Operation *> kernelHandles;
// Map a kernel handle to the kernel stub.
llvm::DenseMap<mlir::Operation *, mlir::Operation *> kernelStubs;
@@ -63,44 +63,29 @@ class CIRGenNVCUDARuntime : public CIRGenCUDARuntime {
} // namespace
std::string CIRGenNVCUDARuntime::addPrefixToName(StringRef funcName) const {
- return (Prefix + funcName).str();
+ return (prefix + funcName).str();
}
std::string
CIRGenNVCUDARuntime::addUnderscoredPrefixToName(StringRef funcName) const {
- return ("__" + Prefix + funcName).str();
-}
-
-static std::unique_ptr<MangleContext> initDeviceMC(CIRGenModule &cgm) {
- // If the host and device have different C++ ABIs, mark it as the device
- // mangle context so that the mangling needs to retrieve the additional
- // device lambda mangling number instead of the regular host one.
- if (cgm.getASTContext().getAuxTargetInfo() &&
- cgm.getASTContext().getTargetInfo().getCXXABI().isMicrosoft() &&
- cgm.getASTContext().getAuxTargetInfo()->getCXXABI().isItaniumFamily()) {
- return std::unique_ptr<MangleContext>(
- cgm.getASTContext().createDeviceMangleContext(
- *cgm.getASTContext().getAuxTargetInfo()));
- }
-
- return std::unique_ptr<MangleContext>(cgm.getASTContext().createMangleContext(
- cgm.getASTContext().getAuxTargetInfo()));
+ return ("__" + prefix + funcName).str();
}
CIRGenNVCUDARuntime::CIRGenNVCUDARuntime(CIRGenModule &cgm)
- : CIRGenCUDARuntime(cgm), deviceMC(initDeviceMC(cgm)) {
+ : CIRGenCUDARuntime(cgm),
+ deviceMC(cgm.getASTContext().cudaNVInitDeviceMC()) {
if (cgm.getLangOpts().OffloadViaLLVM)
llvm_unreachable("NYI");
else if (cgm.getLangOpts().HIP)
- Prefix = "hip";
+ prefix = "hip";
else
- Prefix = "cuda";
+ prefix = "cuda";
}
mlir::Value CIRGenNVCUDARuntime::prepareKernelArgs(CIRGenFunction &cgf,
mlir::Location loc,
FunctionArgList &args) {
- auto &builder = cgm.getBuilder();
+ CIRGenBuilderTy &builder = cgm.getBuilder();
// Build void *args[] and populate with the addresses of kernel arguments.
auto voidPtrArrayTy = cir::ArrayType::get(cgm.voidPtrTy, args.size());
@@ -112,20 +97,15 @@ mlir::Value CIRGenNVCUDARuntime::prepareKernelArgs(CIRGenFunction &cgf,
builder.createCast(cir::CastKind::array_to_ptrdecay, kernelArgs,
cir::PointerType::get(cgm.voidPtrTy));
- for (auto [i, arg] : llvm::enumerate(args)) {
+ for (const auto &[i, arg] : llvm::enumerate(args)) {
mlir::Value index =
builder.getConstInt(loc, llvm::APInt(/*numBits=*/32, i));
mlir::Value storePos =
builder.createPtrStride(loc, kernelArgsDecayed, index);
-
- // Get the address of the argument and cast the store destination to match
- // its pointer-to-pointer type. This is needed because upstream's
- // createStore doesn't auto-bitcast like the incubator version.
mlir::Value argAddr = cgf.getAddrOfLocalVar(arg).getPointer();
- mlir::Value storePosTyped = builder.createBitcast(
- storePos, cir::PointerType::get(argAddr.getType()));
+ mlir::Value argAsVoid = builder.createBitcast(argAddr, cgm.voidPtrTy);
- builder.CIRBaseBuilderTy::createStore(loc, argAddr, storePosTyped);
+ builder.CIRBaseBuilderTy::createStore(loc, argAsVoid, storePos);
}
return kernelArgsDecayed;
@@ -139,10 +119,13 @@ void CIRGenNVCUDARuntime::emitDeviceStubBodyNew(CIRGenFunction &cgf,
// This requires arguments to be sent to kernels in a different way.
if (cgm.getLangOpts().OffloadViaLLVM)
- cgm.errorNYI("Offload via LLVM");
+ cgm.errorNYI("CIRGenNVCUDARuntime: Offload via LLVM");
+
+ if (cgm.getLangOpts().HIP)
+ cgm.errorNYI("CIRGenNVCUDARuntime: HIP Support");
- auto &builder = cgm.getBuilder();
- auto loc = fn.getLoc();
+ CIRGenBuilderTy &builder = cgm.getBuilder();
+ mlir::Location loc = fn.getLoc();
// For [cuda|hip]LaunchKernel, we must add another layer of indirection
// to arguments. For example, for function `add(int a, float b)`,
@@ -164,7 +147,7 @@ void CIRGenNVCUDARuntime::emitDeviceStubBodyNew(CIRGenFunction &cgf,
// The default stream is usually stream 0 (the legacy default stream).
// For per-thread default stream, we need a different LaunchKernel function.
- std::string kernelLaunchAPI = "LaunchKernel";
+ StringRef kernelLaunchAPI = "LaunchKernel";
if (cgm.getLangOpts().GPUDefaultStream ==
LangOptions::GPUDefaultStreamKind::PerThread)
cgm.errorNYI("CUDA/HIP Stream per thread");
@@ -173,7 +156,7 @@ void CIRGenNVCUDARuntime::emitDeviceStubBodyNew(CIRGenFunction &cgf,
const IdentifierInfo &launchII =
cgm.getASTContext().Idents.get(launchKernelName);
FunctionDecl *cudaLaunchKernelFD = nullptr;
- for (auto *result : dc->lookup(&launchII)) {
+ for (NamedDecl *result : dc->lookup(&launchII)) {
if (FunctionDecl *fd = dyn_cast<FunctionDecl>(result))
cudaLaunchKernelFD = fd;
}
@@ -223,24 +206,25 @@ void CIRGenNVCUDARuntime::emitDeviceStubBodyNew(CIRGenFunction &cgf,
// [cuda|hip]Stream_t stream);
// We now either pick the function or the stub global for cuda, hip
- // resepectively.
- auto kernel = [&]() {
- if (auto globalOp = llvm::dyn_cast_or_null<cir::GlobalOp>(
+ // respectively.
+ mlir::Value kernel = [&]() -> mlir::Value {
+ if (cir::GlobalOp globalOp = llvm::dyn_cast_or_null<cir::GlobalOp>(
kernelHandles[fn.getSymName()])) {
- auto kernelTy = cir::PointerType::get(globalOp.getSymType());
- mlir::Value kernel = cir::GetGlobalOp::create(builder, loc, kernelTy,
- globalOp.getSymName());
- return kernel;
+ cir::PointerType kernelTy = cir::PointerType::get(globalOp.getSymType());
+ mlir::Value kernelVal = cir::GetGlobalOp::create(builder, loc, kernelTy,
+ globalOp.getSymName());
+ return kernelVal;
}
- if (auto funcOp = llvm::dyn_cast_or_null<cir::FuncOp>(
+ if (cir::FuncOp funcOp = llvm::dyn_cast_or_null<cir::FuncOp>(
kernelHandles[fn.getSymName()])) {
- auto kernelTy = cir::PointerType::get(funcOp.getFunctionType());
- mlir::Value kernel =
+ cir::PointerType kernelTy =
+ cir::PointerType::get(funcOp.getFunctionType());
+ mlir::Value kernelVal =
cir::GetGlobalOp::create(builder, loc, kernelTy, funcOp.getSymName());
- mlir::Value func = builder.createBitcast(kernel, cgm.voidPtrTy);
+ mlir::Value func = builder.createBitcast(kernelVal, cgm.voidPtrTy);
return func;
}
- assert(false && "Expected stub handle to be cir::GlobalOp or funcOp");
+ llvm_unreachable("Expected stub handle to be cir::GlobalOp or FuncOp");
}();
CallArgList launchArgs;
@@ -264,7 +248,7 @@ void CIRGenNVCUDARuntime::emitDeviceStubBodyNew(CIRGenFunction &cgf,
cgm.getTypes().convertType(cudaLaunchKernelFD->getType());
mlir::Operation *cudaKernelLauncherFn = cgm.createRuntimeFunction(
cast<cir::FuncType>(launchTy), launchKernelName);
- const auto &callInfo =
+ const CIRGenFunctionInfo &callInfo =
cgm.getTypes().arrangeFunctionDeclaration(cudaLaunchKernelFD);
cgf.emitCall(callInfo, CIRGenCallee::forDirect(cudaKernelLauncherFn),
ReturnValueSlot(), launchArgs);
@@ -279,8 +263,8 @@ void CIRGenNVCUDARuntime::emitDeviceStub(CIRGenFunction &cgf, cir::FuncOp fn,
if (auto globalOp =
llvm::dyn_cast<cir::GlobalOp>(kernelHandles[fn.getSymName()])) {
- auto &builder = cgm.getBuilder();
- auto fnPtrTy = globalOp.getSymType();
+ CIRGenBuilderTy &builder = cgm.getBuilder();
+ mlir::Type fnPtrTy = globalOp.getSymType();
auto sym = mlir::FlatSymbolRefAttr::get(fn.getSymNameAttr());
auto gv = cir::GlobalViewAttr::get(fnPtrTy, sym);
@@ -307,27 +291,27 @@ CIRGenCUDARuntime *clang::CIRGen::createNVCUDARuntime(CIRGenModule &cgm) {
CIRGenNVCUDARuntime::~CIRGenNVCUDARuntime() {}
mlir::Operation *CIRGenNVCUDARuntime::getKernelHandle(cir::FuncOp fn,
- GlobalDecl GD) {
+ GlobalDecl gd) {
// Check if we already have a kernel handle for this function
- auto Loc = kernelHandles.find(fn.getSymName());
- if (Loc != kernelHandles.end()) {
- auto OldHandle = Loc->second;
+ auto it = kernelHandles.find(fn.getSymName());
+ if (it != kernelHandles.end()) {
+ mlir::Operation *oldHandle = it->second;
// Here we know that the fn did not change. Return it
- if (kernelStubs[OldHandle] == fn)
- return OldHandle;
+ if (kernelStubs[oldHandle] == fn)
+ return oldHandle;
// We've found the function name, but F itself has changed, so we need to
// update the references.
if (cgm.getLangOpts().HIP) {
// For HIP compilation the handle itself does not change, so we only need
// to update the Stub value.
- kernelStubs[OldHandle] = fn;
- return OldHandle;
+ kernelStubs[oldHandle] = fn;
+ return oldHandle;
}
// For non-HIP compilation, erase the old Stub and fall-through to creating
// new entries.
- kernelStubs.erase(OldHandle);
+ kernelStubs.erase(oldHandle);
}
// If not targeting HIP, store the function itself
@@ -338,10 +322,10 @@ mlir::Operation *CIRGenNVCUDARuntime::getKernelHandle(cir::FuncOp fn,
}
// Create a new CIR global variable to represent the kernel handle
- auto &builder = cgm.getBuilder();
- auto globalName = cgm.getMangledName(
- GD.getWithKernelReferenceKind(KernelReferenceKind::Kernel));
- const VarDecl *varDecl = llvm::dyn_cast_or_null<VarDecl>(GD.getDecl());
+ CIRGenBuilderTy &builder = cgm.getBuilder();
+ StringRef globalName = cgm.getMangledName(
+ gd.getWithKernelReferenceKind(KernelReferenceKind::Kernel));
+ const VarDecl *varDecl = llvm::dyn_cast_or_null<VarDecl>(gd.getDecl());
cir::GlobalOp globalOp =
cgm.getOrCreateCIRGlobal(globalName, fn.getFunctionType().getReturnType(),
LangAS::Default, varDecl, NotForDefinition);
diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.cpp b/clang/lib/CIR/CodeGen/CIRGenModule.cpp
index 8cef5408bbfc1..6ce66922deb0f 100644
--- a/clang/lib/CIR/CodeGen/CIRGenModule.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenModule.cpp
@@ -70,8 +70,7 @@ CIRGenModule::CIRGenModule(mlir::MLIRContext &mlirContext,
langOpts(astContext.getLangOpts()), codeGenOpts(cgo),
theModule{mlir::ModuleOp::create(mlir::UnknownLoc::get(&mlirContext))},
diags(diags), target(astContext.getTargetInfo()),
- abi(createCXXABI(*this)), genTypes(*this), vtables(*this),
- cudaRuntime(clang::CIRGen::createNVCUDARuntime((*this))) {
+ abi(createCXXABI(*this)), genTypes(*this), vtables(*this) {
// Initialize cached types
voidTy = cir::VoidType::get(&getMLIRContext());
@@ -129,6 +128,10 @@ CIRGenModule::CIRGenModule(mlir::MLIRContext &mlirContext,
cir::OptInfoAttr::get(&mlirContext,
cgo.OptimizationLevel,
cgo.OptimizeSize));
+
+ if (langOpts.CUDA)
+ createCUDARuntime();
+
// Set the module name to be the name of the main file. TranslationUnitDecl
// often contains invalid source locations and isn't a reliable source for the
// module location.
@@ -146,6 +149,10 @@ CIRGenModule::CIRGenModule(mlir::MLIRContext &mlirContext,
CIRGenModule::~CIRGenModule() = default;
+void CIRGenModule::createCUDARuntime() {
+ cudaRuntime.reset(createNVCUDARuntime(*this));
+}
+
/// FIXME: this could likely be a common helper and not necessarily related
/// with codegen.
/// Return the best known alignment for an unknown pointer to a
diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.h b/clang/lib/CIR/CodeGen/CIRGenModule.h
index 9b12a5fe26e04..1c2d2f8277fa8 100644
--- a/clang/lib/CIR/CodeGen/CIRGenModule.h
+++ b/clang/lib/CIR/CodeGen/CIRGenModule.h
@@ -100,6 +100,8 @@ class CIRGenModule : public CIRGenTypeCache {
llvm::SmallVector<mlir::Attribute> globalScopeAsm;
+ void createCUDARuntime();
+
public:
mlir::ModuleOp getModule() const { return theModule; }
CIRGenBuilderTy &getBuilder() { return builder; }
diff --git a/clang/test/CIR/CodeGen/CUDA/kernel-call.cu b/clang/test/CIR/CodeGen/CUDA/kernel-call.cu
index d1dae134b0230..2fca96fe3926c 100644
--- a/clang/test/CIR/CodeGen/CUDA/kernel-call.cu
+++ b/clang/test/CIR/CodeGen/CUDA/kernel-call.cu
@@ -12,7 +12,39 @@
// TODO: Test CUDA legacy (< 9.0) when legacy stub body is implemented
// TODO: Test HIP when HIP stub body support is complete
-// CUDA-NEW-LABEL: cir.func {{.*}} @_Z21__device_stub__kernelv
-// CUDA-NEW: cir.call @__cudaPopCallConfiguration
-// CUDA-NEW: cir.call @cudaLaunchKernel
-__global__ void kernel() {}
+// Check that the stub function is generated with the correct name
+// CUDA-NEW-LABEL: cir.func {{.*}} @_Z21__device_stub__kernelif
+//
+// Check kernel arguments are allocated as local variables
+// CUDA-NEW-DAG: cir.alloca !s32i, {{.*}} ["x", init]
+// CUDA-NEW-DAG: cir.alloca !cir.float, {{.*}} ["y", init]
+//
+// Check void *args[] array is created with correct size (2 args)
+// CUDA-NEW: cir.alloca !cir.array<!cir.ptr<!void> x 2>, {{.*}} ["kernel_args"]
+// CUDA-NEW: cir.cast array_to_ptrdecay
+//
+// Check arguments are stored in the args array via ptr_stride indexing
+// CUDA-NEW: cir.const #cir.int<0>
+// CUDA-NEW: cir.ptr_stride
+// CUDA-NEW: cir.cast bitcast {{.*}} -> !cir.ptr<!void>
+// CUDA-NEW: cir.store {{.*}} !cir.ptr<!void>, !cir.ptr<!cir.ptr<!void>>
+// CUDA-NEW: cir.const #cir.int<1>
+// CUDA-NEW: cir.ptr_stride
+// CUDA-NEW: cir.cast bitcast {{.*}} -> !cir.ptr<!void>
+// CUDA-NEW: cir.store {{.*}} !cir.ptr<!void>, !cir.ptr<!cir.ptr<!void>>
+//
+// Check dim3 grid_dim and block_dim allocas for launch configuration
+// CUDA-NEW-DAG: cir.alloca !rec_dim3, {{.*}} ["grid_dim"]
+// CUDA-NEW-DAG: cir.alloca !rec_dim3, {{.*}} ["block_dim"]
+//
+// Check shared_mem (size_t) and stream allocas
+// CUDA-NEW-DAG: cir.alloca !u64i, {{.*}} ["shared_mem"]
+// CUDA-NEW-DAG: cir.alloca !cir.ptr<!rec_cudaStream>, {{.*}} ["stream"]
+//
+// Check __cudaPopCallConfiguration is called with correct argument types
+// CUDA-NEW: cir.call @__cudaPopCallConfiguration({{.*}}) : (!cir.ptr<!rec_dim3>, !cir.ptr<!rec_dim3>, !cir.ptr<!u64i>, !cir.ptr<!cir.ptr<!rec_cudaStream>>) -> !s32i
+//
+// Check cudaLaunchKernel is called with all 6 arguments:
+// func ptr, gridDim, blockDim, args, sharedMem, stream
+// CUDA-NEW: cir.call @cudaLaunchKernel({{.*}}) : (!cir.ptr<!void>, !rec_dim3, !rec_dim3, !cir.ptr<!cir.ptr<!void>>, !u64i, !cir.ptr<!rec_cudaStream>) -> !u32i
+__global__ void kernel(int x, float y) {}
>From 3eddc7c0ef8b0d9173e6577a9528fa1245839cc0 Mon Sep 17 00:00:00 2001
From: David Rivera <davidriverg at gmail.com>
Date: Fri, 30 Jan 2026 13:06:41 -0500
Subject: [PATCH 7/8] nyi on runtime constructor
---
clang/lib/CIR/CodeGen/CIRGenCUDANV.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/clang/lib/CIR/CodeGen/CIRGenCUDANV.cpp b/clang/lib/CIR/CodeGen/CIRGenCUDANV.cpp
index 434c8003af27c..cb66c33d5ee74 100644
--- a/clang/lib/CIR/CodeGen/CIRGenCUDANV.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenCUDANV.cpp
@@ -75,7 +75,7 @@ CIRGenNVCUDARuntime::CIRGenNVCUDARuntime(CIRGenModule &cgm)
: CIRGenCUDARuntime(cgm),
deviceMC(cgm.getASTContext().cudaNVInitDeviceMC()) {
if (cgm.getLangOpts().OffloadViaLLVM)
- llvm_unreachable("NYI");
+ cgm.errorNYI("CIRGenNVCUDARuntime: Offload via LLVM");
else if (cgm.getLangOpts().HIP)
prefix = "hip";
else
>From f0288c496d62d63448e8d4ddece76d70bfc629e1 Mon Sep 17 00:00:00 2001
From: David Rivera <davidriverg at gmail.com>
Date: Fri, 30 Jan 2026 13:08:25 -0500
Subject: [PATCH 8/8] newlines yo
---
clang/lib/CIR/CodeGen/CIRGenCUDANV.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/clang/lib/CIR/CodeGen/CIRGenCUDANV.cpp b/clang/lib/CIR/CodeGen/CIRGenCUDANV.cpp
index cb66c33d5ee74..ad5da0d11ff02 100644
--- a/clang/lib/CIR/CodeGen/CIRGenCUDANV.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenCUDANV.cpp
@@ -338,4 +338,4 @@ mlir::Operation *CIRGenNVCUDARuntime::getKernelHandle(cir::FuncOp fn,
kernelStubs[globalOp] = fn;
return globalOp;
-}
\ No newline at end of file
+}
More information about the cfe-commits
mailing list