[flang-commits] [flang] e826168 - [flang][cuda] Add CUFLaunchAttachAttr pass (#174465)

Mon Jan 5 13:05:39 PST 2026

Author: Valentin Clement (バレンタイン クレメン)
Date: 2026-01-05T13:05:34-08:00
New Revision: e826168a24479c7613f098daa19248d4b494ee96

URL: https://github.com/llvm/llvm-project/commit/e826168a24479c7613f098daa19248d4b494ee96
DIFF: https://github.com/llvm/llvm-project/commit/e826168a24479c7613f098daa19248d4b494ee96.diff

LOG: [flang][cuda] Add CUFLaunchAttachAttr pass (#174465)

CUF kernels are generated via gpu.launch and then outlined. The resulting
launch operation needs to have a CUDA attribute attached so that it is
distinguishable from other launches.

Added: 
    flang/lib/Optimizer/Transforms/CUDA/CUFLaunchAttachAttr.cpp
    flang/test/Fir/CUDA/cuda-launch-attach-attr.mlir

Modified: 
    flang/include/flang/Optimizer/Transforms/Passes.h
    flang/include/flang/Optimizer/Transforms/Passes.td
    flang/lib/Optimizer/Transforms/CMakeLists.txt

Removed: 
    


################################################################################
diff  --git a/flang/include/flang/Optimizer/Transforms/Passes.h b/flang/include/flang/Optimizer/Transforms/Passes.h
index 4dcdddaac8ee5..50e8e6c58bf62 100644

--- a/flang/include/flang/Optimizer/Transforms/Passes.h
+++ b/flang/include/flang/Optimizer/Transforms/Passes.h
@@ -9,6 +9,7 @@
 #ifndef FORTRAN_OPTIMIZER_TRANSFORMS_PASSES_H
 #define FORTRAN_OPTIMIZER_TRANSFORMS_PASSES_H
 
+#include "flang/Optimizer/Dialect/CUF/CUFDialect.h"
 #include "flang/Optimizer/Dialect/FIROps.h"
 #include "mlir/Dialect/LLVMIR/LLVMAttrs.h"
 #include "mlir/Dialect/OpenMP/OpenMPDialect.h"

diff  --git a/flang/include/flang/Optimizer/Transforms/Passes.td b/flang/include/flang/Optimizer/Transforms/Passes.td
index d457cbbb32798..a2a3341bfa667 100644
--- a/flang/include/flang/Optimizer/Transforms/Passes.td
+++ b/flang/include/flang/Optimizer/Transforms/Passes.td
@@ -523,6 +523,16 @@ def CUFComputeSharedMemoryOffsetsAndSize
   ];
 }
 
+def CUFLaunchAttachAttr : Pass<"cuf-launch-attach-attr", ""> {
+  let summary = "Attach CUDA attribute to CUF kernel generated launch";
+  let description = [{
+    CUF kernels are generated via gpu.launch and then outlined. The resulting
+    launch operation needs to have a CUDA attribute attached so that it is
+    distinguishable from other launches.
+  }];
+  let dependentDialects = ["cuf::CUFDialect"];
+}
+
 def CUFPredefinedVarToGPU
     : Pass<"cuf-predefined-var-to-gpu", "::mlir::func::FuncOp"> {
   let summary = "Transform predefined variables to GPU operations";

diff  --git a/flang/lib/Optimizer/Transforms/CMakeLists.txt b/flang/lib/Optimizer/Transforms/CMakeLists.txt
index c07921b23f3ff..1e2d6e8f15e92 100644
--- a/flang/lib/Optimizer/Transforms/CMakeLists.txt
+++ b/flang/lib/Optimizer/Transforms/CMakeLists.txt
@@ -10,6 +10,7 @@ add_flang_library(FIRTransforms
   ConstantArgumentGlobalisation.cpp
   ControlFlowConverter.cpp
   CUDA/CUFAllocationConversion.cpp
+  CUDA/CUFLaunchAttachAttr.cpp
   CUDA/CUFPredefinedVarToGPU.cpp
   CUFAddConstructor.cpp
   CUFDeviceGlobal.cpp

diff  --git a/flang/lib/Optimizer/Transforms/CUDA/CUFLaunchAttachAttr.cpp b/flang/lib/Optimizer/Transforms/CUDA/CUFLaunchAttachAttr.cpp
new file mode 100644
index 0000000000000..41a0e5c7dceec
--- /dev/null
+++ b/flang/lib/Optimizer/Transforms/CUDA/CUFLaunchAttachAttr.cpp
@@ -0,0 +1,70 @@
+//===-- CUFLaunchAttachAttr.cpp -------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang/Optimizer/Dialect/CUF/CUFDialect.h"
+#include "flang/Optimizer/Dialect/FIROps.h"
+#include "flang/Optimizer/Dialect/FIROpsSupport.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Transforms/DialectConversion.h"
+
+namespace fir {
+#define GEN_PASS_DEF_CUFLAUNCHATTACHATTR
+#include "flang/Optimizer/Transforms/Passes.h.inc"
+} // namespace fir
+
+using namespace mlir;
+
+namespace {
+
+/// Substring looked for in the kernel name of a `gpu.launch_func`. Only
+/// launches whose kernel name contains it are considered by this pass.
+static constexpr llvm::StringRef cudaKernelInfix = "_cufk_";
+
+/// Attaches `cuf::ProcAttribute::Global` to a `gpu.launch_func`, stored under
+/// the attribute name returned by `cuf::getProcAttrName()`.
+///
+/// The pattern does not inspect the kernel name itself. It relies on the
+/// conversion target built in `CUFLaunchAttachAttr::runOnOperation` to only
+/// present launches that still need the attribute.
+class CUFGPUAttachAttrPattern
+    : public OpRewritePattern<mlir::gpu::LaunchFuncOp> {
+  using OpRewritePattern<mlir::gpu::LaunchFuncOp>::OpRewritePattern;
+  LogicalResult matchAndRewrite(mlir::gpu::LaunchFuncOp op,
+                                PatternRewriter &rewriter) const override {
+    // Perform the mutation through the rewriter so that the driver is
+    // notified of the in-place change rather than having the operation
+    // modified behind its back.
+    rewriter.modifyOpInPlace(op, [&]() {
+      op->setAttr(cuf::getProcAttrName(),
+                  cuf::ProcAttributeAttr::get(op.getContext(),
+                                              cuf::ProcAttribute::Global));
+    });
+    return mlir::success();
+  }
+};
+
+/// Pass driver: runs a partial conversion in which a `gpu.launch_func` is
+/// illegal only when its kernel name contains `cudaKernelInfix` and it does
+/// not yet carry a `cuf::ProcAttributeAttr`. Every other launch is left
+/// untouched.
+struct CUFLaunchAttachAttr
+    : public fir::impl::CUFLaunchAttachAttrBase<CUFLaunchAttachAttr> {
+
+  void runOnOperation() override {
+    auto *context = &this->getContext();
+
+    mlir::RewritePatternSet patterns(context);
+    patterns.add<CUFGPUAttachAttrPattern>(context);
+
+    mlir::ConversionTarget target(*context);
+    // The dynamic legality callback is the single source of truth for
+    // `gpu.launch_func`; no separate addIllegalOp registration is needed.
+    target.addDynamicallyLegalOp<mlir::gpu::LaunchFuncOp>(
+        [](mlir::gpu::LaunchFuncOp op) -> bool {
+          // Launches without the infix in their kernel name are always legal.
+          if (!op.getKernelName().getValue().contains(cudaKernelInfix))
+            return true;
+          // Launches with the infix are legal once the attribute is present.
+          return static_cast<bool>(
+              op.getOperation()->getAttrOfType<cuf::ProcAttributeAttr>(
+                  cuf::getProcAttrName()));
+        });
+
+    if (mlir::failed(mlir::applyPartialConversion(this->getOperation(), target,
+                                                  std::move(patterns)))) {
+      mlir::emitError(mlir::UnknownLoc::get(context),
+                      "Pattern conversion failed\n");
+      this->signalPassFailure();
+    }
+  }
+};
+
+} // end anonymous namespace

diff  --git a/flang/test/Fir/CUDA/cuda-launch-attach-attr.mlir b/flang/test/Fir/CUDA/cuda-launch-attach-attr.mlir
new file mode 100644
index 0000000000000..efaea1d9ba63b
--- /dev/null
+++ b/flang/test/Fir/CUDA/cuda-launch-attach-attr.mlir
@@ -0,0 +1,19 @@
+// RUN: fir-opt --split-input-file --cuf-launch-attach-attr %s | FileCheck %s
+
+module attributes {gpu.container_module} {
+  func.func @_QQmain() attributes {fir.bindc_name = "test"} {
+    %0 = arith.constant 1 : i64
+    %1 = arith.constant 2 : i64
+    %3 = arith.constant 10 : i64
+    gpu.launch_func  @cuda_device_mod::@_QMtest_cufk_20 blocks in (%3, %3, %0) threads in (%3, %3, %0) : i64 
+    gpu.launch_func  @cuda_device_mod::@_QMtest2 blocks in (%3, %3, %0) threads in (%3, %3, %0) : i64
+    gpu.launch_func  @cuda_device_mod::@_QMtest_cufk_22 blocks in (%3, %3, %0) threads in (%3, %3, %0) : i64 {cuf.proc_attr = #cuf.cuda_proc<global>}
+    return
+  }
+  gpu.binary @cuda_device_mod  [#gpu.object<#nvvm.target, "">]
+}
+
+// CHECK-LABEL: func.func @_QQmain()
+// CHECK: gpu.launch_func  @cuda_device_mod::@_QMtest_cufk_20 blocks in ({{.*}}) threads in ({{.*}}) : i64 {cuf.proc_attr = #cuf.cuda_proc<global>}
+// CHECK: gpu.launch_func  @cuda_device_mod::@_QMtest2 blocks in ({{.*}}) threads in ({{.*}}) : i64 {{$}}
+// CHECK: gpu.launch_func  @cuda_device_mod::@_QMtest_cufk_22 blocks in ({{.*}}) threads in ({{.*}}) : i64 {cuf.proc_attr = #cuf.cuda_proc<global>}