[clang] [CIR][CUDA] Do Runtime Kernel Registration (PR #188926)

Tue Apr 14 12:56:32 PDT 2026

https://github.com/RiverDave updated https://github.com/llvm/llvm-project/pull/188926

>From 2adde40164d6026cb60e7af9e1d8bda7694636bb Mon Sep 17 00:00:00 2001
From: David Rivera <davidriverg at gmail.com>
Date: Wed, 25 Mar 2026 23:14:58 -0400
Subject: [PATCH 1/6] Avoid copies from `std::string`

---
 clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
index 19b14efff1c65..ef9c4556b9412 100644
--- a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
+++ b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
@@ -10,6 +10,7 @@
 #include "mlir/IR/Attributes.h"
 #include "mlir/IR/BuiltinAttributeInterfaces.h"
 #include "mlir/IR/IRMapping.h"
+#include "mlir/IR/BuiltinAttributeInterfaces.h"
 #include "clang/AST/ASTContext.h"
 #include "clang/AST/Mangle.h"
 #include "clang/Basic/Cuda.h"
@@ -1750,9 +1751,9 @@ static llvm::StringRef getCUDAPrefix(clang::ASTContext *astCtx) {
   return "cuda";
 }
 
-static std::string addUnderscoredPrefix(llvm::StringRef prefix,
+static llvm::StringRef addUnderscoredPrefix(llvm::StringRef prefix,
                                         llvm::StringRef name) {
-  return ("__" + prefix + name).str();
+  return ("__" + prefix + name).getSingleStringRef();
 }
 
 /// Creates a global constructor function for the module:
@@ -1839,7 +1840,8 @@ void LoweringPreparePass::buildCUDAModuleCtor() {
   // Create the fatbin string constant with GPU binary contents.
   auto fatbinType =
       ArrayType::get(&getContext(), charTy, gpuBinary->getBuffer().size());
-  std::string fatbinStrName = addUnderscoredPrefix(cudaPrefix, "_fatbin_str");
+  llvm::StringRef fatbinStrName =
+      addUnderscoredPrefix(cudaPrefix, "_fatbin_str");
   GlobalOp fatbinStr = GlobalOp::create(builder, loc, fatbinStrName, fatbinType,
                                         /*isConstant=*/true, {},
                                         GlobalLinkageKind::PrivateLinkage);
@@ -1854,7 +1856,7 @@ void LoweringPreparePass::buildCUDAModuleCtor() {
   auto fatbinWrapperType = RecordType::get(
       &getContext(), {intTy, intTy, voidPtrTy, voidPtrTy},
       /*packed=*/false, /*padded=*/false, RecordType::RecordKind::Struct);
-  std::string fatbinWrapperName =
+  llvm::StringRef fatbinWrapperName =
       addUnderscoredPrefix(cudaPrefix, "_fatbin_wrapper");
   GlobalOp fatbinWrapper = GlobalOp::create(
       builder, loc, fatbinWrapperName, fatbinWrapperType,
@@ -1877,7 +1879,7 @@ void LoweringPreparePass::buildCUDAModuleCtor() {
                            {magicInit, versionInit, fatbinInit, unusedInit})));
 
   // Create the GPU binary handle global variable.
-  std::string gpubinHandleName =
+  llvm::StringRef gpubinHandleName =
       addUnderscoredPrefix(cudaPrefix, "_gpubin_handle");
 
   GlobalOp gpuBinHandle = GlobalOp::create(

>From dc5ee921e0505aef340ec9edc8beb33542443336 Mon Sep 17 00:00:00 2001
From: David Rivera <davidriverg at gmail.com>
Date: Fri, 27 Mar 2026 03:38:57 -0400
Subject: [PATCH 2/6] fix twine crashes

---
 clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
index ef9c4556b9412..9e581b089f946 100644
--- a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
+++ b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
@@ -1751,9 +1751,9 @@ static llvm::StringRef getCUDAPrefix(clang::ASTContext *astCtx) {
   return "cuda";
 }
 
-static llvm::StringRef addUnderscoredPrefix(llvm::StringRef prefix,
+static std::string addUnderscoredPrefix(llvm::StringRef prefix,
                                         llvm::StringRef name) {
-  return ("__" + prefix + name).getSingleStringRef();
+  return ("__" + prefix + name).str();
 }
 
 /// Creates a global constructor function for the module:
@@ -1840,8 +1840,7 @@ void LoweringPreparePass::buildCUDAModuleCtor() {
   // Create the fatbin string constant with GPU binary contents.
   auto fatbinType =
       ArrayType::get(&getContext(), charTy, gpuBinary->getBuffer().size());
-  llvm::StringRef fatbinStrName =
-      addUnderscoredPrefix(cudaPrefix, "_fatbin_str");
+  std::string fatbinStrName = addUnderscoredPrefix(cudaPrefix, "_fatbin_str");
   GlobalOp fatbinStr = GlobalOp::create(builder, loc, fatbinStrName, fatbinType,
                                         /*isConstant=*/true, {},
                                         GlobalLinkageKind::PrivateLinkage);
@@ -1856,7 +1855,7 @@ void LoweringPreparePass::buildCUDAModuleCtor() {
   auto fatbinWrapperType = RecordType::get(
       &getContext(), {intTy, intTy, voidPtrTy, voidPtrTy},
       /*packed=*/false, /*padded=*/false, RecordType::RecordKind::Struct);
-  llvm::StringRef fatbinWrapperName =
+  std::string fatbinWrapperName =
       addUnderscoredPrefix(cudaPrefix, "_fatbin_wrapper");
   GlobalOp fatbinWrapper = GlobalOp::create(
       builder, loc, fatbinWrapperName, fatbinWrapperType,
@@ -1879,7 +1878,7 @@ void LoweringPreparePass::buildCUDAModuleCtor() {
                            {magicInit, versionInit, fatbinInit, unusedInit})));
 
   // Create the GPU binary handle global variable.
-  llvm::StringRef gpubinHandleName =
+  std::string gpubinHandleName =
       addUnderscoredPrefix(cudaPrefix, "_gpubin_handle");
 
   GlobalOp gpuBinHandle = GlobalOp::create(

>From ccbc1bc53478814c8f7af46746cc8d7952e47e8f Mon Sep 17 00:00:00 2001
From: David Rivera <davidriverg at gmail.com>
Date: Thu, 2 Apr 2026 13:45:13 -0400
Subject: [PATCH 3/6] fix fmt

---
 clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
index 9e581b089f946..19b14efff1c65 100644
--- a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
+++ b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
@@ -10,7 +10,6 @@
 #include "mlir/IR/Attributes.h"
 #include "mlir/IR/BuiltinAttributeInterfaces.h"
 #include "mlir/IR/IRMapping.h"
-#include "mlir/IR/BuiltinAttributeInterfaces.h"
 #include "clang/AST/ASTContext.h"
 #include "clang/AST/Mangle.h"
 #include "clang/Basic/Cuda.h"

>From de686986fa84a02bb08be4a913443f39f0a1817c Mon Sep 17 00:00:00 2001
From: David Rivera <davidriverg at gmail.com>
Date: Wed, 25 Mar 2026 22:29:47 -0400
Subject: [PATCH 4/6] [CIR][CUDA] Handle CUDA module constructor and destructor
 emission.

---
 clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
index 19b14efff1c65..2b38f9cbecfba 100644
--- a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
+++ b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
@@ -10,6 +10,7 @@
 #include "mlir/IR/Attributes.h"
 #include "mlir/IR/BuiltinAttributeInterfaces.h"
 #include "mlir/IR/IRMapping.h"
+#include "mlir/IR/Value.h"
 #include "clang/AST/ASTContext.h"
 #include "clang/AST/Mangle.h"
 #include "clang/Basic/Cuda.h"

>From df4d636b90dc39b83d23a39e8d3e643f096594a9 Mon Sep 17 00:00:00 2001
From: David Rivera <davidriverg at gmail.com>
Date: Fri, 27 Mar 2026 04:40:08 -0400
Subject: [PATCH 5/6] [CIR][CUDA] Do Runtime Kernel Registration

---
 .../Dialect/Transforms/LoweringPrepare.cpp    | 121 +++++++++++++++++-
 clang/test/CIR/CodeGenCUDA/device-stub.cu     |  30 ++++-
 2 files changed, 147 insertions(+), 4 deletions(-)

diff --git a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
index 2b38f9cbecfba..de7ab647c177c 100644
--- a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
+++ b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
@@ -10,6 +10,7 @@
 #include "mlir/IR/Attributes.h"
 #include "mlir/IR/BuiltinAttributeInterfaces.h"
 #include "mlir/IR/IRMapping.h"
+#include "mlir/IR/Location.h"
 #include "mlir/IR/Value.h"
 #include "clang/AST/ASTContext.h"
 #include "clang/AST/Mangle.h"
@@ -31,6 +32,7 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/TypeSwitch.h"
 #include "llvm/IR/Instructions.h"
+#include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/VirtualFileSystem.h"
@@ -128,6 +130,9 @@ struct LoweringPreparePass
   /// with the CUDA runtime.
   void buildCUDAModuleCtor();
   std::optional<FuncOp> buildCUDAModuleDtor();
+  std::optional<FuncOp> buildCUDARegisterGlobals();
+  void buildCUDARegisterGlobalFunctions(cir::CIRBaseBuilderTy &builder,
+                                        FuncOp regGlobalFunc);
 
   /// Handle static local variable initialization with guard variables.
   void handleStaticLocal(cir::GlobalOp globalOp, cir::GetGlobalOp getGlobalOp);
@@ -1923,8 +1928,11 @@ void LoweringPreparePass::buildCUDAModuleCtor() {
     mlir::Value gpuBinaryHandleGlobal = builder.createGetGlobal(gpuBinHandle);
     builder.createStore(loc, gpuBinaryHandle, gpuBinaryHandleGlobal);
 
-    // TODO: Generate __cuda_register_globals and emit a call.
-    assert(!cir::MissingFeatures::globalRegistration());
+    // --- Generate __cuda_register_globals and call it ---
+    std::optional<FuncOp> regGlobal = buildCUDARegisterGlobals();
+    if (regGlobal) {
+      builder.createCallOp(loc, *regGlobal, gpuBinaryHandle);
+    }
 
     // From CUDA 10.1 onwards, we must call this function to end registration:
     //      void __cudaRegisterFatBinaryEnd(void **fatbinHandle);
@@ -2006,6 +2014,115 @@ std::optional<FuncOp> LoweringPreparePass::buildCUDAModuleDtor() {
   return dtor;
 }
 
+std::optional<FuncOp> LoweringPreparePass::buildCUDARegisterGlobals() {
+  // There is nothing to register.
+  if (cudaKernelMap.empty())
+    return {};
+
+  cir::CIRBaseBuilderTy builder(getContext());
+  builder.setInsertionPointToStart(mlirModule.getBody());
+
+  mlir::Location loc = mlirModule.getLoc();
+  llvm::StringRef cudaPrefix = getCUDAPrefix(astCtx);
+
+  auto voidTy = VoidType::get(&getContext());
+  auto voidPtrTy = PointerType::get(voidTy);
+  auto voidPtrPtrTy = PointerType::get(voidPtrTy);
+
+  // Create the function:
+  //      void __cuda_register_globals(void **fatbinHandle)
+  std::string regGlobalFuncName =
+      addUnderscoredPrefix(cudaPrefix, "_register_globals");
+  auto regGlobalFuncTy = FuncType::get({voidPtrPtrTy}, voidTy);
+  FuncOp regGlobalFunc =
+      buildRuntimeFunction(builder, regGlobalFuncName, loc, regGlobalFuncTy,
+                           /*linkage=*/GlobalLinkageKind::InternalLinkage);
+  builder.setInsertionPointToStart(regGlobalFunc.addEntryBlock());
+
+  buildCUDARegisterGlobalFunctions(builder, regGlobalFunc);
+  // TODO: Handle shadow registration
+  assert(!cir::MissingFeatures::globalRegistration());
+
+  ReturnOp::create(builder, loc);
+  return regGlobalFunc;
+}
+
+void LoweringPreparePass::buildCUDARegisterGlobalFunctions(
+    cir::CIRBaseBuilderTy &builder, FuncOp regGlobalFunc) {
+  mlir::Location loc = mlirModule.getLoc();
+  llvm::StringRef cudaPrefix = getCUDAPrefix(astCtx);
+  cir::CIRDataLayout dataLayout(mlirModule);
+
+  auto voidTy = VoidType::get(&getContext());
+  auto voidPtrTy = PointerType::get(voidTy);
+  auto voidPtrPtrTy = PointerType::get(voidPtrTy);
+  IntType intTy = builder.getSIntNTy(32);
+  IntType charTy = cir::IntType::get(&getContext(), astCtx->getCharWidth(),
+                                     /*isSigned=*/false);
+
+  // Extract the GPU binary handle argument.
+  mlir::Value fatbinHandle = *regGlobalFunc.args_begin();
+
+  cir::CIRBaseBuilderTy globalBuilder(getContext());
+  globalBuilder.setInsertionPointToStart(mlirModule.getBody());
+
+  // Declare CUDA internal functions:
+  // int __cudaRegisterFunction(
+  //   void **fatbinHandle,
+  //   const char *hostFunc,
+  //   char *deviceFunc,
+  //   const char *deviceName,
+  //   int threadLimit,
+  //   uint3 *tid, uint3 *bid, dim3 *bDim, dim3 *gDim,
+  //   int *wsize
+  // )
+  // OG doesn't care about the types at all. They're treated as void*.
+
+  FuncOp cudaRegisterFunction = buildRuntimeFunction(
+      globalBuilder, addUnderscoredPrefix(cudaPrefix, "RegisterFunction"), loc,
+      FuncType::get({voidPtrPtrTy, voidPtrTy, voidPtrTy, voidPtrTy, intTy,
+                     voidPtrTy, voidPtrTy, voidPtrTy, voidPtrTy, voidPtrTy},
+                    intTy));
+
+  auto makeConstantString = [&](llvm::StringRef str) -> GlobalOp {
+    auto strType = ArrayType::get(&getContext(), charTy, 1 + str.size());
+    auto tmpString = cir::GlobalOp::create(
+        globalBuilder, loc, (".str" + str).str(), strType,
+        /*isConstant=*/true, {},
+        /*linkage=*/cir::GlobalLinkageKind::PrivateLinkage);
+
+    // We must make the string zero-terminated.
+    tmpString.setInitialValueAttr(ConstArrayAttr::get(
+        strType, StringAttr::get(&getContext(), str + "\0")));
+    tmpString.setPrivate();
+    return tmpString;
+  };
+
+  cir::ConstantOp cirNullPtr = builder.getNullPtr(voidPtrTy, loc);
+  bool isHIP = astCtx->getLangOpts().HIP;
+  for (auto kernelName : cudaKernelMap.keys()) {
+    FuncOp deviceStub = cudaKernelMap[kernelName];
+    GlobalOp deviceFuncStr = makeConstantString(kernelName);
+    mlir::Value deviceFunc = builder.createBitcast(
+        builder.createGetGlobal(deviceFuncStr), voidPtrTy);
+
+    if (isHIP) {
+      llvm_unreachable("HIP kernel registration NYI");
+    } else {
+      mlir::Value hostFunc = builder.createBitcast(
+          GetGlobalOp::create(
+              builder, loc, PointerType::get(deviceStub.getFunctionType()),
+              mlir::FlatSymbolRefAttr::get(deviceStub.getSymNameAttr())),
+          voidPtrTy);
+      builder.createCallOp(
+          loc, cudaRegisterFunction,
+          {fatbinHandle, hostFunc, deviceFunc, deviceFunc,
+           ConstantOp::create(builder, loc, IntAttr::get(intTy, -1)),
+           cirNullPtr, cirNullPtr, cirNullPtr, cirNullPtr, cirNullPtr});
+    }
+  }
+}
+
 void LoweringPreparePass::runOnOperation() {
   mlir::Operation *op = getOperation();
   if (isa<::mlir::ModuleOp>(op))
diff --git a/clang/test/CIR/CodeGenCUDA/device-stub.cu b/clang/test/CIR/CodeGenCUDA/device-stub.cu
index 4562bf1523141..0f9d4d68d67ff 100644
--- a/clang/test/CIR/CodeGenCUDA/device-stub.cu
+++ b/clang/test/CIR/CodeGenCUDA/device-stub.cu
@@ -24,7 +24,7 @@ void hostfunc(void) { kernelfunc<<<1, 1>>>(1, 1, 1); }
 // Check module constructor is registered in module attributes.
 // CIR: cir.global_ctors = [#cir.global_ctor<"__cuda_module_ctor", 65535>]
 
-// Check runtime function declarations (appear before dtor in output).
+// Check runtime function declarations.
 // CIR: cir.func private @atexit(!cir.ptr<!cir.func<()>>) -> !s32i
 // CIR: cir.func private @__cudaUnregisterFatBinary(!cir.ptr<!cir.ptr<!void>>)
 
@@ -37,6 +37,25 @@ void hostfunc(void) { kernelfunc<<<1, 1>>>(1, 1, 1); }
 
 // CIR: cir.func private @__cudaRegisterFatBinaryEnd(!cir.ptr<!cir.ptr<!void>>)
 
+// Check the __cudaRegisterFunction runtime declaration:
+//   int __cudaRegisterFunction(void**, void*, void*, void*, int,
+//                              void*, void*, void*, void*, void*)
+// CIR: cir.func private @__cudaRegisterFunction(!cir.ptr<!cir.ptr<!void>>, !cir.ptr<!void>, !cir.ptr<!void>, !cir.ptr<!void>, !s32i, !cir.ptr<!void>, !cir.ptr<!void>, !cir.ptr<!void>, !cir.ptr<!void>, !cir.ptr<!void>) -> !s32i
+
+// Check the device-side name string for kernelfunc (mangled, null-terminated).
+// CIR: cir.global "private" constant cir_private @".str_Z10kernelfunciii" = #cir.const_array<"_Z10kernelfunciii", trailing_zeros> : !cir.array<!u8i x 18>
+
+// Check __cuda_register_globals body: one __cudaRegisterFunction call per kernel.
+// CIR: cir.func internal private @__cuda_register_globals(%arg0: !cir.ptr<!cir.ptr<!void>>
+// CIR-NEXT: %[[NULL:.*]] = cir.const #cir.ptr<null> : !cir.ptr<!void>
+// CIR-NEXT: %[[STR_ADDR:.*]] = cir.get_global @".str_Z10kernelfunciii"
+// CIR-NEXT: %[[DEVICE_FUNC:.*]] = cir.cast bitcast %[[STR_ADDR]]
+// CIR-NEXT: %[[HOST_FUNC_RAW:.*]] = cir.get_global @{{.*}}kernelfunc{{.*}}
+// CIR-NEXT: %[[HOST_FUNC:.*]] = cir.cast bitcast %[[HOST_FUNC_RAW]]
+// CIR-NEXT: %[[THREAD_LIMIT:.*]] = cir.const #cir.int<-1> : !s32i
+// CIR-NEXT: cir.call @__cudaRegisterFunction(%{{.*}}, %[[HOST_FUNC]], %[[DEVICE_FUNC]], %[[DEVICE_FUNC]], %[[THREAD_LIMIT]], %[[NULL]], %[[NULL]], %[[NULL]], %[[NULL]], %[[NULL]])
+// CIR-NEXT: cir.return
+
 // CIR: cir.global "private" constant cir_private @__cuda_fatbin_str = #cir.const_array<"GPU binary would be here."> : !cir.array<!u8i x 25> {alignment = 8 : i64, section = ".nv_fatbin"}
 
 // Check the fatbin wrapper struct: { magic, version, ptr to fatbin, null }, with section.
@@ -53,13 +72,15 @@ void hostfunc(void) { kernelfunc<<<1, 1>>>(1, 1, 1); }
 // CIR: cir.func private @__cudaRegisterFatBinary(!cir.ptr<!void>) -> !cir.ptr<!cir.ptr<!void>>
 
 // Check the module constructor body: register fatbin, store handle,
-// call RegisterFatBinaryEnd (CUDA >= 10.1), then register dtor with atexit.
+// call __cuda_register_globals, call RegisterFatBinaryEnd (CUDA >= 10.1),
+// then register dtor with atexit.
 // CIR: cir.func internal private @__cuda_module_ctor()
 // CIR-NEXT: %[[WRAPPER:.*]] = cir.get_global @__cuda_fatbin_wrapper
 // CIR-NEXT: %[[VOID_PTR:.*]] = cir.cast bitcast %[[WRAPPER]]
 // CIR-NEXT: %[[RET:.*]] = cir.call @__cudaRegisterFatBinary(%[[VOID_PTR]])
 // CIR-NEXT: %[[HANDLE_ADDR:.*]] = cir.get_global @__cuda_gpubin_handle
 // CIR-NEXT: cir.store %[[RET]], %[[HANDLE_ADDR]]
+// CIR-NEXT: cir.call @__cuda_register_globals(%[[RET]])
 // CIR-NEXT: cir.call @__cudaRegisterFatBinaryEnd(%[[RET]])
 // CIR-NEXT: %[[DTOR_PTR:.*]] = cir.get_global @__cuda_module_dtor
 // CIR-NEXT: {{.*}} = cir.call @atexit(%[[DTOR_PTR]])
@@ -70,9 +91,14 @@ void hostfunc(void) { kernelfunc<<<1, 1>>>(1, 1, 1); }
 // OGCG: @__cuda_gpubin_handle = internal global ptr null
 // OGCG: @llvm.global_ctors = appending global {{.*}}@__cuda_module_ctor
 
+// OGCG: define internal void @__cuda_register_globals
+// OGCG: call{{.*}}__cudaRegisterFunction(ptr %0, {{.*}}kernelfunc{{.*}}, ptr @0
+// OGCG: ret void
+
 // OGCG: define internal void @__cuda_module_ctor
 // OGCG: call{{.*}}__cudaRegisterFatBinary(ptr @__cuda_fatbin_wrapper)
 // OGCG: store ptr %{{.*}}, ptr @__cuda_gpubin_handle
+// OGCG-NEXT: call void @__cuda_register_globals
 // OGCG: call i32 @atexit(ptr @__cuda_module_dtor)
 
 // OGCG: define internal void @__cuda_module_dtor

>From 1a81469a70f6587ebe8f0bea58c7ab86ae6e49f9 Mon Sep 17 00:00:00 2001
From: David Rivera <davidriverg at gmail.com>
Date: Tue, 14 Apr 2026 15:56:16 -0400
Subject: [PATCH 6/6] address comment

---
 clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
index de7ab647c177c..9015fe483a7df 100644
--- a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
+++ b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
@@ -1929,8 +1929,7 @@ void LoweringPreparePass::buildCUDAModuleCtor() {
     builder.createStore(loc, gpuBinaryHandle, gpuBinaryHandleGlobal);
 
     // --- Generate __cuda_register_globals and call it ---
-    std::optional<FuncOp> regGlobal = buildCUDARegisterGlobals();
-    if (regGlobal) {
+    if (std::optional<FuncOp> regGlobal = buildCUDARegisterGlobals()) {
       builder.createCallOp(loc, *regGlobal, gpuBinaryHandle);
     }