[flang-commits] [flang] [flang][cuda] Preserve fir.rebox captured by cuf.kernel via CUDAKernelOpInterface (PR #193890)

via flang-commits flang-commits at lists.llvm.org
Thu Apr 23 20:46:11 PDT 2026


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-flang-fir-hlfir

Author: Zhen Wang (wangzpgi)

<details>
<summary>Changes</summary>

Reland of #<!-- -->193837 (reverted in #<!-- -->193855), now using a marker op interface to avoid the link cycle that broke `BUILD_SHARED_LIBS=ON` builds.

`SimplifyArrayCoorOp` folded `fir.rebox` into `fir.array_coor` across a `cuf.kernel` boundary. CUF lowering needs the captured rebox to materialize a managed-memory descriptor for the kernel; folding it away makes the kernel dereference the host-side descriptor and crash with `cudaErrorIllegalAddress`.

Fix is to add `fir::CUDAKernelOpInterface`, a marker op interface defined in FIRDialect and implemented by `cuf.kernel`. The canonicalization guard queries the interface, so the `TypeIDResolver` symbol lives in `libFIRDialect.so` and no `FIR -> CUF` link edge is introduced.

---
Full diff: https://github.com/llvm/llvm-project/pull/193890.diff


9 Files Affected:

- (modified) flang/include/flang/Optimizer/Dialect/CMakeLists.txt (+5) 
- (added) flang/include/flang/Optimizer/Dialect/CUDAKernelOpInterface.h (+21) 
- (added) flang/include/flang/Optimizer/Dialect/CUDAKernelOpInterface.td (+34) 
- (modified) flang/include/flang/Optimizer/Dialect/CUF/CUFOps.h (+1) 
- (modified) flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td (+2-1) 
- (modified) flang/lib/Optimizer/Dialect/CMakeLists.txt (+2) 
- (added) flang/lib/Optimizer/Dialect/CUDAKernelOpInterface.cpp (+15) 
- (modified) flang/lib/Optimizer/Dialect/FIROps.cpp (+10) 
- (added) flang/test/Fir/array-coor-canonicalization-cuf.fir (+66) 


``````````diff
diff --git a/flang/include/flang/Optimizer/Dialect/CMakeLists.txt b/flang/include/flang/Optimizer/Dialect/CMakeLists.txt
index 26cda19826434..7c4ec2ec598f7 100644
--- a/flang/include/flang/Optimizer/Dialect/CMakeLists.txt
+++ b/flang/include/flang/Optimizer/Dialect/CMakeLists.txt
@@ -40,6 +40,11 @@ mlir_tablegen(FIROperationMoveOpInterface.h.inc -gen-op-interface-decls)
 mlir_tablegen(FIROperationMoveOpInterface.cpp.inc -gen-op-interface-defs)
 add_public_tablegen_target(FIROperationMoveOpInterfaceIncGen)
 
+set(LLVM_TARGET_DEFINITIONS CUDAKernelOpInterface.td)
+mlir_tablegen(CUDAKernelOpInterface.h.inc -gen-op-interface-decls)
+mlir_tablegen(CUDAKernelOpInterface.cpp.inc -gen-op-interface-defs)
+add_public_tablegen_target(CUDAKernelOpInterfaceIncGen)
+
 set(LLVM_TARGET_DEFINITIONS CanonicalizationPatterns.td)
 mlir_tablegen(CanonicalizationPatterns.inc -gen-rewriters)
 add_public_tablegen_target(CanonicalizationPatternsIncGen)
diff --git a/flang/include/flang/Optimizer/Dialect/CUDAKernelOpInterface.h b/flang/include/flang/Optimizer/Dialect/CUDAKernelOpInterface.h
new file mode 100644
index 0000000000000..ed412108ec703
--- /dev/null
+++ b/flang/include/flang/Optimizer/Dialect/CUDAKernelOpInterface.h
@@ -0,0 +1,21 @@
+//===- CUDAKernelOpInterface.h ----------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the FIR marker interface for operations that act as a
+// CUDA kernel boundary (e.g. cuf.kernel).
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef FORTRAN_OPTIMIZER_DIALECT_CUDA_KERNEL_OP_INTERFACE_H
+#define FORTRAN_OPTIMIZER_DIALECT_CUDA_KERNEL_OP_INTERFACE_H
+
+#include "mlir/IR/OpDefinition.h"
+
+#include "flang/Optimizer/Dialect/CUDAKernelOpInterface.h.inc"
+
+#endif // FORTRAN_OPTIMIZER_DIALECT_CUDA_KERNEL_OP_INTERFACE_H
diff --git a/flang/include/flang/Optimizer/Dialect/CUDAKernelOpInterface.td b/flang/include/flang/Optimizer/Dialect/CUDAKernelOpInterface.td
new file mode 100644
index 0000000000000..e60e624ca4912
--- /dev/null
+++ b/flang/include/flang/Optimizer/Dialect/CUDAKernelOpInterface.td
@@ -0,0 +1,34 @@
+//===-- CUDAKernelOpInterface.td ----------------------------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Marker interface for operations that semantically represent a CUDA kernel
+// boundary in Flang's lowered IR (e.g. cuf.kernel). The interface is defined
+// in the FIR dialect so that FIR-level transformations can detect such
+// boundaries without having to depend on the CUF dialect (which already
+// depends on FIR; the reverse edge would form a circular library dependency
+// in BUILD_SHARED_LIBS=ON builds).
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef FORTRAN_DIALECT_CUDA_KERNEL_OP_INTERFACE
+#define FORTRAN_DIALECT_CUDA_KERNEL_OP_INTERFACE
+
+include "mlir/IR/OpBase.td"
+
+def CUDAKernelOpInterface : OpInterface<"CUDAKernelOpInterface"> {
+  let description = [{
+    Marker interface for operations that act as a CUDA kernel boundary in
+    Flang's lowered IR. Operations implementing this interface are treated by
+    FIR transformations as opaque kernel regions whose host-visible captures
+    must not be folded across the boundary.
+  }];
+
+  let cppNamespace = "::fir";
+}
+
+#endif // FORTRAN_DIALECT_CUDA_KERNEL_OP_INTERFACE
diff --git a/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.h b/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.h
index d63d6142e5d66..7088d9d6d3eaa 100644
--- a/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.h
+++ b/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.h
@@ -9,6 +9,7 @@
 #ifndef FORTRAN_OPTIMIZER_DIALECT_CUF_CUFOPS_H
 #define FORTRAN_OPTIMIZER_DIALECT_CUF_CUFOPS_H
 
+#include "flang/Optimizer/Dialect/CUDAKernelOpInterface.h"
 #include "flang/Optimizer/Dialect/CUF/Attributes/CUFAttr.h"
 #include "flang/Optimizer/Dialect/CUF/CUFDialect.h"
 #include "flang/Optimizer/Dialect/FIROperationMoveOpInterface.h"
diff --git a/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td b/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td
index e5134a591e3ce..d5bf5503d3ced 100644
--- a/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td
+++ b/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td
@@ -14,6 +14,7 @@
 #ifndef FORTRAN_DIALECT_CUF_CUF_OPS
 #define FORTRAN_DIALECT_CUF_CUF_OPS
 
+include "flang/Optimizer/Dialect/CUDAKernelOpInterface.td"
 include "flang/Optimizer/Dialect/CUF/CUFDialect.td"
 include "flang/Optimizer/Dialect/CUF/Attributes/CUFAttr.td"
 include "flang/Optimizer/Dialect/FIRAttr.td"
@@ -246,7 +247,7 @@ def cuf_KernelLaunchOp : cuf_Op<"kernel_launch", [CallOpInterface,
 }
 
 def cuf_KernelOp
-    : cuf_Op<"kernel", [AttrSizedOperandSegments,
+    : cuf_Op<"kernel", [AttrSizedOperandSegments, CUDAKernelOpInterface,
                         DeclareOpInterfaceMethods<LoopLikeOpInterface>,
                         DeclareOpInterfaceMethods<OperationMoveOpInterface>]> {
 
diff --git a/flang/lib/Optimizer/Dialect/CMakeLists.txt b/flang/lib/Optimizer/Dialect/CMakeLists.txt
index 0581e18ab0763..8fc076d88b78a 100644
--- a/flang/lib/Optimizer/Dialect/CMakeLists.txt
+++ b/flang/lib/Optimizer/Dialect/CMakeLists.txt
@@ -4,6 +4,7 @@ add_subdirectory(FIRCG)
 add_subdirectory(MIF)
 
 add_flang_library(FIRDialect
+  CUDAKernelOpInterface.cpp
   FIRAttr.cpp
   FIRDialect.cpp
   FIROperationMoveOpInterface.cpp
@@ -16,6 +17,7 @@ add_flang_library(FIRDialect
 
   DEPENDS
   CanonicalizationPatternsIncGen
+  CUDAKernelOpInterfaceIncGen
   FIROperationMoveOpInterfaceIncGen
   FIROpsIncGen
   FIRSafeTempArrayCopyAttrInterfaceIncGen
diff --git a/flang/lib/Optimizer/Dialect/CUDAKernelOpInterface.cpp b/flang/lib/Optimizer/Dialect/CUDAKernelOpInterface.cpp
new file mode 100644
index 0000000000000..1dc192ada4f55
--- /dev/null
+++ b/flang/lib/Optimizer/Dialect/CUDAKernelOpInterface.cpp
@@ -0,0 +1,15 @@
+//===-- CUDAKernelOpInterface.cpp -----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Coding style: https://mlir.llvm.org/getting_started/DeveloperGuide/
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang/Optimizer/Dialect/CUDAKernelOpInterface.h"
+
+#include "flang/Optimizer/Dialect/CUDAKernelOpInterface.cpp.inc"
diff --git a/flang/lib/Optimizer/Dialect/FIROps.cpp b/flang/lib/Optimizer/Dialect/FIROps.cpp
index 4705033945611..253d0df911dd0 100644
--- a/flang/lib/Optimizer/Dialect/FIROps.cpp
+++ b/flang/lib/Optimizer/Dialect/FIROps.cpp
@@ -11,6 +11,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "flang/Optimizer/Dialect/FIROps.h"
+#include "flang/Optimizer/Dialect/CUDAKernelOpInterface.h"
 #include "flang/Optimizer/Dialect/FIRAttr.h"
 #include "flang/Optimizer/Dialect/FIRDialect.h"
 #include "flang/Optimizer/Dialect/FIROpsSupport.h"
@@ -587,6 +588,15 @@ struct SimplifyArrayCoorOp : public mlir::OpRewritePattern<fir::ArrayCoorOp> {
             return mlir::isa<ACC_DATA_ENTRY_OPS>(u);
           }))
         return mlir::failure();
+      // Don't pull in rebox defined outside a CUDA kernel boundary when the
+      // array_coor is inside that kernel. CUF lowering converts such a rebox
+      // into a managed-memory descriptor that the kernel needs to receive as
+      // its argument; folding the rebox away would leave the kernel capturing
+      // the host-side descriptor directly, causing illegal device dereferences
+      // at runtime.
+      if (op->getParentOfType<fir::CUDAKernelOpInterface>() !=
+          reboxOp->getParentOfType<fir::CUDAKernelOpInterface>())
+        return mlir::failure();
       boxedMemref = reboxOp.getBox();
       boxedShape = reboxOp.getShape();
       // Avoid pulling in rebox that performs reshaping.
diff --git a/flang/test/Fir/array-coor-canonicalization-cuf.fir b/flang/test/Fir/array-coor-canonicalization-cuf.fir
new file mode 100644
index 0000000000000..733206d7a797b
--- /dev/null
+++ b/flang/test/Fir/array-coor-canonicalization-cuf.fir
@@ -0,0 +1,66 @@
+// RUN: fir-opt --canonicalize --split-input-file %s | FileCheck %s
+
+// Verify that fir.rebox is NOT folded into fir.array_coor when the rebox is
+// defined outside a cuf.kernel and the array_coor is inside that kernel.
+// CUF lowering converts such a rebox into a managed-memory descriptor that
+// the kernel needs to receive as its argument; folding the rebox away would
+// leave the kernel capturing the host-side descriptor directly, causing
+// illegal device dereferences at runtime.
+// CHECK-LABEL:   func.func @test_cuf_kernel_preserves_captured_rebox(
+// CHECK-SAME:        %[[BOX:.*]]: !fir.box<!fir.array<?xf32>>
+// CHECK-SAME:        %[[VAL:.*]]: f32
+// CHECK:           %[[REBOX:.*]] = fir.rebox %[[BOX]]
+// CHECK:           cuf.kernel
+// CHECK:             %[[COOR:.*]] = fir.array_coor %[[REBOX]]
+// CHECK:             fir.store %[[VAL]] to %[[COOR]]
+func.func @test_cuf_kernel_preserves_captured_rebox(%arg0: !fir.box<!fir.array<?xf32>> {cuf.data_attr = #cuf.cuda<device>}, %arg1: f32) {
+  %c1_i32 = arith.constant 1 : i32
+  %c1 = arith.constant 1 : index
+  %0 = fir.rebox %arg0 : (!fir.box<!fir.array<?xf32>>) -> !fir.box<!fir.array<?xf32>>
+  cuf.kernel<<<%c1_i32, %c1_i32>>> (%arg2 : index) = (%c1 : index) to (%c1 : index) step (%c1 : index) {
+    %1 = fir.array_coor %0 %c1 : (!fir.box<!fir.array<?xf32>>, index) -> !fir.ref<f32>
+    fir.store %arg1 to %1 : !fir.ref<f32>
+    "fir.end"() : () -> ()
+  }
+  return
+}
+
+// -----
+
+// When both the rebox and the array_coor are inside the same cuf.kernel, the
+// capture boundary is not crossed and folding must still happen.
+// CHECK-LABEL:   func.func @test_cuf_kernel_folds_local_rebox(
+// CHECK-SAME:        %[[BOX:.*]]: !fir.box<!fir.array<?xf32>>
+// CHECK-SAME:        %[[VAL:.*]]: f32
+// CHECK-NOT:       fir.rebox
+// CHECK:           cuf.kernel
+// CHECK-NOT:         fir.rebox
+// CHECK:             %[[COOR:.*]] = fir.array_coor %[[BOX]]
+// CHECK:             fir.store %[[VAL]] to %[[COOR]]
+func.func @test_cuf_kernel_folds_local_rebox(%arg0: !fir.box<!fir.array<?xf32>> {cuf.data_attr = #cuf.cuda<device>}, %arg1: f32) {
+  %c1_i32 = arith.constant 1 : i32
+  %c1 = arith.constant 1 : index
+  cuf.kernel<<<%c1_i32, %c1_i32>>> (%arg2 : index) = (%c1 : index) to (%c1 : index) step (%c1 : index) {
+    %0 = fir.rebox %arg0 : (!fir.box<!fir.array<?xf32>>) -> !fir.box<!fir.array<?xf32>>
+    %1 = fir.array_coor %0 %c1 : (!fir.box<!fir.array<?xf32>>, index) -> !fir.ref<f32>
+    fir.store %arg1 to %1 : !fir.ref<f32>
+    "fir.end"() : () -> ()
+  }
+  return
+}
+
+// -----
+
+// Without cuf.kernel, rebox SHOULD be folded normally (regression: the guard
+// must not affect ordinary host code).
+// CHECK-LABEL:   func.func @test_no_cuf_folds_rebox(
+// CHECK-SAME:        %[[BOX:.*]]: !fir.box<!fir.array<?xf32>>) -> !fir.ref<f32> {
+// CHECK-NOT:       fir.rebox
+// CHECK:           %[[COOR:.*]] = fir.array_coor %[[BOX]]
+// CHECK:           return %[[COOR]] : !fir.ref<f32>
+func.func @test_no_cuf_folds_rebox(%arg0: !fir.box<!fir.array<?xf32>>) -> !fir.ref<f32> {
+  %c1 = arith.constant 1 : index
+  %0 = fir.rebox %arg0 : (!fir.box<!fir.array<?xf32>>) -> !fir.box<!fir.array<?xf32>>
+  %1 = fir.array_coor %0 %c1 : (!fir.box<!fir.array<?xf32>>, index) -> !fir.ref<f32>
+  return %1 : !fir.ref<f32>
+}

``````````

</details>


https://github.com/llvm/llvm-project/pull/193890


More information about the flang-commits mailing list