[clang] [CIR][CUDA] Global emission for fatbin symbols (PR #187636)
David Rivera via cfe-commits
cfe-commits at lists.llvm.org
Mon Mar 23 17:46:37 PDT 2026
https://github.com/RiverDave updated https://github.com/llvm/llvm-project/pull/187636
>From 82324601ab8a9d6798ba9d33c9dca4f57dbbc115 Mon Sep 17 00:00:00 2001
From: David Rivera <davidriverg at gmail.com>
Date: Wed, 18 Mar 2026 21:10:59 -0400
Subject: [PATCH 1/3] [CIR][CUDA] Global emission for fatbin symbols
---
.../clang/CIR/Dialect/IR/CIRCUDAAttrs.td | 17 ++
.../clang/CIR/Dialect/IR/CIRDialect.td | 1 +
clang/include/clang/CIR/MissingFeatures.h | 2 +
clang/lib/CIR/CodeGen/CIRGenModule.cpp | 10 ++
.../Dialect/Transforms/LoweringPrepare.cpp | 154 ++++++++++++++++++
clang/test/CIR/CodeGenCUDA/device-stub.cu | 50 ++++++
6 files changed, 234 insertions(+)
create mode 100644 clang/test/CIR/CodeGenCUDA/device-stub.cu
diff --git a/clang/include/clang/CIR/Dialect/IR/CIRCUDAAttrs.td b/clang/include/clang/CIR/Dialect/IR/CIRCUDAAttrs.td
index 257cf396abce7..e85039a094cf8 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIRCUDAAttrs.td
+++ b/clang/include/clang/CIR/Dialect/IR/CIRCUDAAttrs.td
@@ -48,5 +48,22 @@ def CUDAExternallyInitializedAttr : CIR_Attr<"CUDAExternallyInitialized",
The attribute corresponds to the attribute on LLVM with the same name.
}];
}
+def CIR_CUDABinaryHandleAttr : CIR_Attr<
+ "CUDABinaryHandle", "cu.binary_handle"
+> {
+ let summary = "Fat binary handle for device code.";
+ let description =
+ [{
+ This attribute is attached to the ModuleOp and records the binary file
+ name passed to host.
+
+ CUDA first compiles device-side code into a fat binary file. The file
+ name is then passed into host-side code, which is used to create a handle
+ and then generate various registration functions.
+ }];
+
+ let parameters = (ins "std::string":$name);
+ let assemblyFormat = "`<` $name `>`";
+}
#endif // CLANG_CIR_DIALECT_IR_CIRCUDAATTRS_TD
diff --git a/clang/include/clang/CIR/Dialect/IR/CIRDialect.td b/clang/include/clang/CIR/Dialect/IR/CIRDialect.td
index f1f94c868e5b0..f14478e36f3c0 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIRDialect.td
+++ b/clang/include/clang/CIR/Dialect/IR/CIRDialect.td
@@ -75,6 +75,7 @@ def CIR_Dialect : Dialect {
static llvm::StringRef getDefaultFuncAttrsAttrName() { return "default_func_attrs"; }
static llvm::StringRef getResAttrsAttrName() { return "res_attrs"; }
static llvm::StringRef getArgAttrsAttrName() { return "arg_attrs"; }
+ static llvm::StringRef getCUDABinaryHandleAttrName() { return "cir.cu.binary_handle"; }
static llvm::StringRef getAMDGPUCodeObjectVersionAttrName() { return "cir.amdhsa_code_object_version"; }
static llvm::StringRef getAMDGPUPrintfKindAttrName() { return "cir.amdgpu_printf_kind"; }
diff --git a/clang/include/clang/CIR/MissingFeatures.h b/clang/include/clang/CIR/MissingFeatures.h
index 68db08a5580ca..c1425e19dfeda 100644
--- a/clang/include/clang/CIR/MissingFeatures.h
+++ b/clang/include/clang/CIR/MissingFeatures.h
@@ -245,6 +245,8 @@ struct MissingFeatures {
static bool ctorConstLvalueToRvalueConversion() { return false; }
static bool ctorMemcpyizer() { return false; }
static bool cudaSupport() { return false; }
+ static bool hipModuleCtor() { return false; }
+ static bool globalRegistration() { return false; }
static bool dataLayoutTypeIsSized() { return false; }
static bool dataLayoutTypeAllocSize() { return false; }
static bool dataLayoutTypeStoreSize() { return false; }
diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.cpp b/clang/lib/CIR/CodeGen/CIRGenModule.cpp
index f3ab733bf4c6a..9a20cef03d4ce 100644
--- a/clang/lib/CIR/CodeGen/CIRGenModule.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenModule.cpp
@@ -151,6 +151,16 @@ CIRGenModule::CIRGenModule(mlir::MLIRContext &mlirContext,
/*line=*/0,
/*column=*/0));
}
+
+ // Set CUDA GPU binary handle.
+ if (langOpts.CUDA) {
+ std::string cudaBinaryName = codeGenOpts.CudaGpuBinaryFileName;
+ if (!cudaBinaryName.empty()) {
+ theModule->setAttr(
+ cir::CIRDialect::getCUDABinaryHandleAttrName(),
+ cir::CUDABinaryHandleAttr::get(&mlirContext, cudaBinaryName));
+ }
+ }
}
CIRGenModule::~CIRGenModule() = default;
diff --git a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
index 756a453001658..2f2cf5a3d3b44 100644
--- a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
+++ b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
@@ -9,6 +9,7 @@
#include "PassDetail.h"
#include "mlir/IR/Attributes.h"
#include "clang/AST/ASTContext.h"
+#include "clang/AST/Attrs.inc"
#include "clang/AST/Mangle.h"
#include "clang/Basic/Module.h"
#include "clang/Basic/Specifiers.h"
@@ -107,6 +108,17 @@ struct LoweringPreparePass
cir::GlobalLinkageKind linkage = cir::GlobalLinkageKind::ExternalLinkage,
cir::VisibilityKind visibility = cir::VisibilityKind::Default);
+
+ /// ------------
+ /// CUDA registration related
+ /// ------------
+
+ llvm::StringMap<FuncOp> cudaKernelMap;
+
+ /// Build the CUDA module constructor that registers the fat binary
+ /// with the CUDA runtime.
+ void buildCUDAModuleCtor();
+
/// Handle static local variable initialization with guard variables.
void handleStaticLocal(cir::GlobalOp globalOp, cir::GetGlobalOp getGlobalOp);
@@ -1634,11 +1646,150 @@ void LoweringPreparePass::runOnOp(mlir::Operation *op) {
globalCtorList.emplace_back(fnOp.getName(), globalCtor.value());
else if (auto globalDtor = fnOp.getGlobalDtorPriority())
globalDtorList.emplace_back(fnOp.getName(), globalDtor.value());
+
+ if (auto attr = fnOp->getAttr(cir::CUDAKernelNameAttr::getMnemonic())) {
+ auto kernelNameAttr = dyn_cast<CUDAKernelNameAttr>(attr);
+ std::string kernelName = kernelNameAttr.getKernelName();
+ cudaKernelMap[kernelName] = fnOp;
+ }
} else if (auto threeWayCmp = dyn_cast<cir::CmpThreeWayOp>(op)) {
lowerThreeWayCmpOp(threeWayCmp);
}
}
+static std::string getCUDAPrefix(clang::ASTContext *astCtx) {
+ if (astCtx->getLangOpts().HIP)
+ return "hip";
+ return "cuda";
+}
+
+static std::string addUnderscoredPrefix(llvm::StringRef prefix,
+ llvm::StringRef name) {
+ return ("__" + prefix + name).str();
+}
+
+/// Creates a global constructor function for the module:
+///
+/// For CUDA:
+/// \code
+/// void __cuda_module_ctor() {
+/// Handle = __cudaRegisterFatBinary(GpuBinaryBlob);
+/// __cuda_register_globals(Handle);
+/// }
+/// \endcode
+///
+/// For HIP:
+/// \code
+/// void __hip_module_ctor() {
+/// if (__hip_gpubin_handle == 0) {
+/// __hip_gpubin_handle = __hipRegisterFatBinary(GpuBinaryBlob);
+/// __hip_register_globals(__hip_gpubin_handle);
+/// }
+/// }
+/// \endcode
+void LoweringPreparePass::buildCUDAModuleCtor() {
+ bool isHIP = astCtx->getLangOpts().HIP;
+
+ if (isHIP)
+ assert(!cir::MissingFeatures::hipModuleCtor());
+ if (astCtx->getLangOpts().GPURelocatableDeviceCode)
+ llvm_unreachable("GPU RDC NYI");
+
+ // For CUDA without -fgpu-rdc, it's safe to stop generating ctor
+ // if there's nothing to register.
+ if (cudaKernelMap.empty())
+ return;
+
+ // There's no device-side binary, so no need to proceed for CUDA.
+ // HIP has to create an external symbol in this case, which is NYI.
+ mlir::Attribute cudaBinaryHandleAttr =
+ mlirModule->getAttr(CIRDialect::getCUDABinaryHandleAttrName());
+ if (!cudaBinaryHandleAttr) {
+ if (astCtx->getLangOpts().HIP)
+ assert(!cir::MissingFeatures::hipModuleCtor());
+ return;
+ }
+
+ std::string cudaGPUBinaryName =
+ mlir::cast<CUDABinaryHandleAttr>(cudaBinaryHandleAttr).getName();
+
+ llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> gpuBinaryOrErr =
+ llvm::MemoryBuffer::getFile(cudaGPUBinaryName);
+ if (std::error_code ec = gpuBinaryOrErr.getError()) {
+ mlirModule->emitError("cannot open GPU binary file: " + cudaGPUBinaryName +
+ ": " + ec.message());
+ return;
+ }
+ std::unique_ptr<llvm::MemoryBuffer> gpuBinary =
+ std::move(gpuBinaryOrErr.get());
+
+ // Set up common types and builder.
+ std::string cudaPrefix = getCUDAPrefix(astCtx);
+ mlir::Location loc = mlirModule->getLoc();
+ CIRBaseBuilderTy builder(getContext());
+ builder.setInsertionPointToStart(mlirModule.getBody());
+
+ auto voidTy = builder.getVoidTy();
+ auto voidPtrTy = builder.getVoidPtrTy();
+ auto voidPtrPtrTy = builder.getPointerTo(voidPtrTy);
+ auto intTy = builder.getSIntNTy(32);
+ auto charTy = cir::IntType::get(&getContext(), astCtx->getCharWidth(),
+ /*isSigned=*/false);
+
+ // --- Create fatbin globals ---
+
+ // Create the fatbin string constant with GPU binary contents.
+ auto fatbinType =
+ ArrayType::get(&getContext(), charTy, gpuBinary->getBuffer().size());
+ std::string fatbinStrName = addUnderscoredPrefix(cudaPrefix, "_fatbin_str");
+ GlobalOp fatbinStr =
+ GlobalOp::create(builder, loc, fatbinStrName, fatbinType,
+ /*isConstant=*/true, GlobalLinkageKind::PrivateLinkage);
+ fatbinStr.setAlignment(8);
+ fatbinStr.setInitialValueAttr(cir::ConstArrayAttr::get(
+ fatbinType, builder.getStringAttr(gpuBinary->getBuffer())));
+ assert(!cir::MissingFeatures::opGlobalSection());
+ fatbinStr.setPrivate();
+
+ // Create the fatbin wrapper struct:
+ // struct { int magic; int version; void *fatbin; void *unused; };
+ auto fatbinWrapperType = RecordType::get(
+ &getContext(), {intTy, intTy, voidPtrTy, voidPtrTy},
+ /*packed=*/false, /*padded=*/false, RecordType::RecordKind::Struct);
+ std::string fatbinWrapperName =
+ addUnderscoredPrefix(cudaPrefix, "_fatbin_wrapper");
+ GlobalOp fatbinWrapper =
+ GlobalOp::create(builder, loc, fatbinWrapperName, fatbinWrapperType,
+ /*isConstant=*/true, GlobalLinkageKind::PrivateLinkage);
+
+ constexpr unsigned cudaFatMagic = 0x466243b1;
+ constexpr unsigned hipFatMagic = 0x48495046;
+ unsigned fatMagic = isHIP ? hipFatMagic : cudaFatMagic;
+
+ auto magicInit = IntAttr::get(intTy, fatMagic);
+ auto versionInit = IntAttr::get(intTy, 1);
+ auto fatbinStrSymbol =
+ mlir::FlatSymbolRefAttr::get(fatbinStr.getSymNameAttr());
+ auto fatbinInit = GlobalViewAttr::get(voidPtrTy, fatbinStrSymbol);
+ auto unusedInit = builder.getConstNullPtrAttr(voidPtrTy);
+ fatbinWrapper.setInitialValueAttr(cir::ConstRecordAttr::get(
+ fatbinWrapperType,
+ mlir::ArrayAttr::get(&getContext(),
+ {magicInit, versionInit, fatbinInit, unusedInit})));
+
+ // Create the GPU binary handle global variable.
+ std::string gpubinHandleName =
+ addUnderscoredPrefix(cudaPrefix, "_gpubin_handle");
+ GlobalOp gpuBinHandle = GlobalOp::create(
+ builder, loc, gpubinHandleName, voidPtrPtrTy,
+ /*isConstant=*/false, GlobalLinkageKind::InternalLinkage);
+ gpuBinHandle.setInitialValueAttr(builder.getConstNullPtrAttr(voidPtrPtrTy));
+ gpuBinHandle.setPrivate();
+
+ // TODO: ctor/dtor/register_globals
+ assert(!cir::MissingFeatures::globalRegistration());
+}
+
void LoweringPreparePass::runOnOperation() {
mlir::Operation *op = getOperation();
if (isa<::mlir::ModuleOp>(op))
@@ -1659,6 +1810,9 @@ void LoweringPreparePass::runOnOperation() {
runOnOp(o);
buildCXXGlobalInitFunc();
+ if (astCtx->getLangOpts().CUDA && !astCtx->getLangOpts().CUDAIsDevice)
+ buildCUDAModuleCtor();
+
buildGlobalCtorDtorList();
}
diff --git a/clang/test/CIR/CodeGenCUDA/device-stub.cu b/clang/test/CIR/CodeGenCUDA/device-stub.cu
new file mode 100644
index 0000000000000..59bfd5b31d522
--- /dev/null
+++ b/clang/test/CIR/CodeGenCUDA/device-stub.cu
@@ -0,0 +1,50 @@
+// Based on clang/test/CodeGenCUDA/device-stub.cu (incubator).
+
+// Create a dummy GPU binary file for registration.
+// RUN: echo -n "GPU binary would be here." > %t
+
+// CIR output — check fatbin globals are created correctly.
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-cir %s -x cuda \
+// RUN: -target-sdk-version=12.3 -fcuda-include-gpubinary %t -o %t.cir
+// RUN: FileCheck --input-file=%t.cir %s --check-prefix=CIR
+
+// OGCG output — check LLVM IR parity with original codegen.
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -x cuda \
+// RUN: -target-sdk-version=12.3 -fcuda-include-gpubinary %t -o %t.ll
+// RUN: FileCheck --input-file=%t.ll %s --check-prefix=OGCG
+
+// No GPU binary — nothing should be generated.
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-cir %s -x cuda \
+// RUN: -target-sdk-version=12.3 -o %t.nogpu.cir
+// RUN: FileCheck --input-file=%t.nogpu.cir %s --check-prefix=NOGPUBIN
+
+#include "Inputs/cuda.h"
+
+__global__ void kernelfunc(int i, int j, int k) {}
+
+void hostfunc(void) { kernelfunc<<<1, 1>>>(1, 1, 1); }
+
+// Check the fatbin string constant with GPU binary contents.
+// CIR: cir.global "private" constant cir_private @__cuda_fatbin_str = #cir.const_array<"GPU binary would be here."> : !cir.array<!u8i x 25> {alignment = 8 : i64}
+
+// Check the fatbin wrapper struct: { magic, version, ptr to fatbin, null }.
+// CIR: cir.global constant cir_private @__cuda_fatbin_wrapper = #cir.const_record<{
+// CIR-SAME: #cir.int<1180844977> : !s32i,
+// CIR-SAME: #cir.int<1> : !s32i,
+// CIR-SAME: #cir.global_view<@__cuda_fatbin_str> : !cir.ptr<!void>,
+// CIR-SAME: #cir.ptr<null> : !cir.ptr<!void>
+// CIR-SAME: }>
+
+// Check the GPU binary handle global.
+// CIR: cir.global "private" internal @__cuda_gpubin_handle = #cir.ptr<null> : !cir.ptr<!cir.ptr<!void>>
+
+// OGCG: constant [25 x i8] c"GPU binary would be here.", section ".nv_fatbin", align 8
+// OGCG: @__cuda_fatbin_wrapper = internal constant { i32, i32, ptr, ptr } { i32 1180844977, i32 1, ptr @{{.*}}, ptr null }, section ".nvFatBinSegment"
+// OGCG: @__cuda_gpubin_handle = internal global ptr null
+
+// No GPU binary — no registration infrastructure at all.
+// NOGPUBIN-NOT: fatbin
+// NOGPUBIN-NOT: gpubin
+// NOGPUBIN-NOT: __cuda_register_globals
+// NOGPUBIN-NOT: __cuda_module_ctor
+// NOGPUBIN-NOT: __cuda_module_dtor
>From 7a0adcf3668bfb17ee74f89b24fd6b442a502556 Mon Sep 17 00:00:00 2001
From: David Rivera <davidriverg at gmail.com>
Date: Mon, 23 Mar 2026 20:19:09 -0400
Subject: [PATCH 2/3] fix tests and remove unnecessary comments.
---
clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp | 1 -
clang/test/CIR/CodeGenCUDA/device-stub.cu | 3 ---
2 files changed, 4 deletions(-)
diff --git a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
index 2f2cf5a3d3b44..b1d37a943f0ea 100644
--- a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
+++ b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
@@ -108,7 +108,6 @@ struct LoweringPreparePass
cir::GlobalLinkageKind linkage = cir::GlobalLinkageKind::ExternalLinkage,
cir::VisibilityKind visibility = cir::VisibilityKind::Default);
-
/// ------------
/// CUDA registration related
/// ------------
diff --git a/clang/test/CIR/CodeGenCUDA/device-stub.cu b/clang/test/CIR/CodeGenCUDA/device-stub.cu
index 59bfd5b31d522..b3e8baa17c7a4 100644
--- a/clang/test/CIR/CodeGenCUDA/device-stub.cu
+++ b/clang/test/CIR/CodeGenCUDA/device-stub.cu
@@ -3,17 +3,14 @@
// Create a dummy GPU binary file for registration.
// RUN: echo -n "GPU binary would be here." > %t
-// CIR output — check fatbin globals are created correctly.
// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-cir %s -x cuda \
// RUN: -target-sdk-version=12.3 -fcuda-include-gpubinary %t -o %t.cir
// RUN: FileCheck --input-file=%t.cir %s --check-prefix=CIR
-// OGCG output — check LLVM IR parity with original codegen.
// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -x cuda \
// RUN: -target-sdk-version=12.3 -fcuda-include-gpubinary %t -o %t.ll
// RUN: FileCheck --input-file=%t.ll %s --check-prefix=OGCG
-// No GPU binary — nothing should be generated.
// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-cir %s -x cuda \
// RUN: -target-sdk-version=12.3 -o %t.nogpu.cir
// RUN: FileCheck --input-file=%t.nogpu.cir %s --check-prefix=NOGPUBIN
>From c21bf86e0c456fc0f90a672b938ed426d66c41a4 Mon Sep 17 00:00:00 2001
From: David Rivera <davidriverg at gmail.com>
Date: Mon, 23 Mar 2026 20:46:17 -0400
Subject: [PATCH 3/3] fix global builder ordering
---
.../CIR/Dialect/Transforms/LoweringPrepare.cpp | 15 ++++++++-------
1 file changed, 8 insertions(+), 7 deletions(-)
diff --git a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
index b1d37a943f0ea..cc3a80e9842eb 100644
--- a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
+++ b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
@@ -1741,9 +1741,9 @@ void LoweringPreparePass::buildCUDAModuleCtor() {
auto fatbinType =
ArrayType::get(&getContext(), charTy, gpuBinary->getBuffer().size());
std::string fatbinStrName = addUnderscoredPrefix(cudaPrefix, "_fatbin_str");
- GlobalOp fatbinStr =
- GlobalOp::create(builder, loc, fatbinStrName, fatbinType,
- /*isConstant=*/true, GlobalLinkageKind::PrivateLinkage);
+ GlobalOp fatbinStr = GlobalOp::create(builder, loc, fatbinStrName, fatbinType,
+ /*isConstant=*/true, {},
+ GlobalLinkageKind::PrivateLinkage);
fatbinStr.setAlignment(8);
fatbinStr.setInitialValueAttr(cir::ConstArrayAttr::get(
fatbinType, builder.getStringAttr(gpuBinary->getBuffer())));
@@ -1757,9 +1757,9 @@ void LoweringPreparePass::buildCUDAModuleCtor() {
/*packed=*/false, /*padded=*/false, RecordType::RecordKind::Struct);
std::string fatbinWrapperName =
addUnderscoredPrefix(cudaPrefix, "_fatbin_wrapper");
- GlobalOp fatbinWrapper =
- GlobalOp::create(builder, loc, fatbinWrapperName, fatbinWrapperType,
- /*isConstant=*/true, GlobalLinkageKind::PrivateLinkage);
+ GlobalOp fatbinWrapper = GlobalOp::create(
+ builder, loc, fatbinWrapperName, fatbinWrapperType,
+ /*isConstant=*/true, {}, GlobalLinkageKind::PrivateLinkage);
constexpr unsigned cudaFatMagic = 0x466243b1;
constexpr unsigned hipFatMagic = 0x48495046;
@@ -1779,9 +1779,10 @@ void LoweringPreparePass::buildCUDAModuleCtor() {
// Create the GPU binary handle global variable.
std::string gpubinHandleName =
addUnderscoredPrefix(cudaPrefix, "_gpubin_handle");
+
GlobalOp gpuBinHandle = GlobalOp::create(
builder, loc, gpubinHandleName, voidPtrPtrTy,
- /*isConstant=*/false, GlobalLinkageKind::InternalLinkage);
+ /*isConstant=*/false, {}, cir::GlobalLinkageKind::InternalLinkage);
gpuBinHandle.setInitialValueAttr(builder.getConstNullPtrAttr(voidPtrPtrTy));
gpuBinHandle.setPrivate();
More information about the cfe-commits
mailing list