[flang-commits] [flang] [flang][cuda] Preserve fir.rebox captured by cuf.kernel via CUDAKernelOpInterface (PR #193890)
via flang-commits
flang-commits at lists.llvm.org
Thu Apr 23 20:46:11 PDT 2026
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-flang-fir-hlfir
Author: Zhen Wang (wangzpgi)
<details>
<summary>Changes</summary>
Reland of #<!-- -->193837 (reverted in #<!-- -->193855), now using a marker op interface to avoid the link cycle that broke `BUILD_SHARED_LIBS=ON` builds.
`SimplifyArrayCoorOp` folded `fir.rebox` into `fir.array_coor` across a `cuf.kernel` boundary. CUF lowering needs the captured rebox to materialize a managed-memory descriptor for the kernel; folding it away makes the kernel dereference the host-side descriptor and crash with `cudaErrorIllegalAddress`.
Fix is to add `fir::CUDAKernelOpInterface`, a marker op interface defined in FIRDialect and implemented by `cuf.kernel`. The canonicalization guard queries the interface, so the `TypeIDResolver` symbol lives in `libFIRDialect.so` and no `FIR -> CUF` link edge is introduced.
---
Full diff: https://github.com/llvm/llvm-project/pull/193890.diff
9 Files Affected:
- (modified) flang/include/flang/Optimizer/Dialect/CMakeLists.txt (+5)
- (added) flang/include/flang/Optimizer/Dialect/CUDAKernelOpInterface.h (+21)
- (added) flang/include/flang/Optimizer/Dialect/CUDAKernelOpInterface.td (+34)
- (modified) flang/include/flang/Optimizer/Dialect/CUF/CUFOps.h (+1)
- (modified) flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td (+2-1)
- (modified) flang/lib/Optimizer/Dialect/CMakeLists.txt (+2)
- (added) flang/lib/Optimizer/Dialect/CUDAKernelOpInterface.cpp (+15)
- (modified) flang/lib/Optimizer/Dialect/FIROps.cpp (+10)
- (added) flang/test/Fir/array-coor-canonicalization-cuf.fir (+66)
``````````diff
diff --git a/flang/include/flang/Optimizer/Dialect/CMakeLists.txt b/flang/include/flang/Optimizer/Dialect/CMakeLists.txt
index 26cda19826434..7c4ec2ec598f7 100644
--- a/flang/include/flang/Optimizer/Dialect/CMakeLists.txt
+++ b/flang/include/flang/Optimizer/Dialect/CMakeLists.txt
@@ -40,6 +40,11 @@ mlir_tablegen(FIROperationMoveOpInterface.h.inc -gen-op-interface-decls)
mlir_tablegen(FIROperationMoveOpInterface.cpp.inc -gen-op-interface-defs)
add_public_tablegen_target(FIROperationMoveOpInterfaceIncGen)
+set(LLVM_TARGET_DEFINITIONS CUDAKernelOpInterface.td)
+mlir_tablegen(CUDAKernelOpInterface.h.inc -gen-op-interface-decls)
+mlir_tablegen(CUDAKernelOpInterface.cpp.inc -gen-op-interface-defs)
+add_public_tablegen_target(CUDAKernelOpInterfaceIncGen)
+
set(LLVM_TARGET_DEFINITIONS CanonicalizationPatterns.td)
mlir_tablegen(CanonicalizationPatterns.inc -gen-rewriters)
add_public_tablegen_target(CanonicalizationPatternsIncGen)
diff --git a/flang/include/flang/Optimizer/Dialect/CUDAKernelOpInterface.h b/flang/include/flang/Optimizer/Dialect/CUDAKernelOpInterface.h
new file mode 100644
index 0000000000000..ed412108ec703
--- /dev/null
+++ b/flang/include/flang/Optimizer/Dialect/CUDAKernelOpInterface.h
@@ -0,0 +1,21 @@
+//===- CUDAKernelOpInterface.h ----------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the FIR marker interface for operations that act as a
+// CUDA kernel boundary (e.g. cuf.kernel).
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef FORTRAN_OPTIMIZER_DIALECT_CUDA_KERNEL_OP_INTERFACE_H
+#define FORTRAN_OPTIMIZER_DIALECT_CUDA_KERNEL_OP_INTERFACE_H
+
+#include "mlir/IR/OpDefinition.h"
+
+#include "flang/Optimizer/Dialect/CUDAKernelOpInterface.h.inc"
+
+#endif // FORTRAN_OPTIMIZER_DIALECT_CUDA_KERNEL_OP_INTERFACE_H
diff --git a/flang/include/flang/Optimizer/Dialect/CUDAKernelOpInterface.td b/flang/include/flang/Optimizer/Dialect/CUDAKernelOpInterface.td
new file mode 100644
index 0000000000000..e60e624ca4912
--- /dev/null
+++ b/flang/include/flang/Optimizer/Dialect/CUDAKernelOpInterface.td
@@ -0,0 +1,34 @@
+//===-- CUDAKernelOpInterface.td ----------------------------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Marker interface for operations that semantically represent a CUDA kernel
+// boundary in Flang's lowered IR (e.g. cuf.kernel). The interface is defined
+// in the FIR dialect so that FIR-level transformations can detect such
+// boundaries without having to depend on the CUF dialect (which already
+// depends on FIR; the reverse edge would form a circular library dependency
+// in BUILD_SHARED_LIBS=ON builds).
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef FORTRAN_DIALECT_CUDA_KERNEL_OP_INTERFACE
+#define FORTRAN_DIALECT_CUDA_KERNEL_OP_INTERFACE
+
+include "mlir/IR/OpBase.td"
+
+def CUDAKernelOpInterface : OpInterface<"CUDAKernelOpInterface"> {
+ let description = [{
+ Marker interface for operations that act as a CUDA kernel boundary in
+ Flang's lowered IR. Operations implementing this interface are treated by
+ FIR transformations as opaque kernel regions whose host-visible captures
+ must not be folded across the boundary.
+ }];
+
+ let cppNamespace = "::fir";
+}
+
+#endif // FORTRAN_DIALECT_CUDA_KERNEL_OP_INTERFACE
diff --git a/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.h b/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.h
index d63d6142e5d66..7088d9d6d3eaa 100644
--- a/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.h
+++ b/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.h
@@ -9,6 +9,7 @@
#ifndef FORTRAN_OPTIMIZER_DIALECT_CUF_CUFOPS_H
#define FORTRAN_OPTIMIZER_DIALECT_CUF_CUFOPS_H
+#include "flang/Optimizer/Dialect/CUDAKernelOpInterface.h"
#include "flang/Optimizer/Dialect/CUF/Attributes/CUFAttr.h"
#include "flang/Optimizer/Dialect/CUF/CUFDialect.h"
#include "flang/Optimizer/Dialect/FIROperationMoveOpInterface.h"
diff --git a/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td b/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td
index e5134a591e3ce..d5bf5503d3ced 100644
--- a/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td
+++ b/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td
@@ -14,6 +14,7 @@
#ifndef FORTRAN_DIALECT_CUF_CUF_OPS
#define FORTRAN_DIALECT_CUF_CUF_OPS
+include "flang/Optimizer/Dialect/CUDAKernelOpInterface.td"
include "flang/Optimizer/Dialect/CUF/CUFDialect.td"
include "flang/Optimizer/Dialect/CUF/Attributes/CUFAttr.td"
include "flang/Optimizer/Dialect/FIRAttr.td"
@@ -246,7 +247,7 @@ def cuf_KernelLaunchOp : cuf_Op<"kernel_launch", [CallOpInterface,
}
def cuf_KernelOp
- : cuf_Op<"kernel", [AttrSizedOperandSegments,
+ : cuf_Op<"kernel", [AttrSizedOperandSegments, CUDAKernelOpInterface,
DeclareOpInterfaceMethods<LoopLikeOpInterface>,
DeclareOpInterfaceMethods<OperationMoveOpInterface>]> {
diff --git a/flang/lib/Optimizer/Dialect/CMakeLists.txt b/flang/lib/Optimizer/Dialect/CMakeLists.txt
index 0581e18ab0763..8fc076d88b78a 100644
--- a/flang/lib/Optimizer/Dialect/CMakeLists.txt
+++ b/flang/lib/Optimizer/Dialect/CMakeLists.txt
@@ -4,6 +4,7 @@ add_subdirectory(FIRCG)
add_subdirectory(MIF)
add_flang_library(FIRDialect
+ CUDAKernelOpInterface.cpp
FIRAttr.cpp
FIRDialect.cpp
FIROperationMoveOpInterface.cpp
@@ -16,6 +17,7 @@ add_flang_library(FIRDialect
DEPENDS
CanonicalizationPatternsIncGen
+ CUDAKernelOpInterfaceIncGen
FIROperationMoveOpInterfaceIncGen
FIROpsIncGen
FIRSafeTempArrayCopyAttrInterfaceIncGen
diff --git a/flang/lib/Optimizer/Dialect/CUDAKernelOpInterface.cpp b/flang/lib/Optimizer/Dialect/CUDAKernelOpInterface.cpp
new file mode 100644
index 0000000000000..1dc192ada4f55
--- /dev/null
+++ b/flang/lib/Optimizer/Dialect/CUDAKernelOpInterface.cpp
@@ -0,0 +1,15 @@
+//===-- CUDAKernelOpInterface.cpp -----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Coding style: https://mlir.llvm.org/getting_started/DeveloperGuide/
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang/Optimizer/Dialect/CUDAKernelOpInterface.h"
+
+#include "flang/Optimizer/Dialect/CUDAKernelOpInterface.cpp.inc"
diff --git a/flang/lib/Optimizer/Dialect/FIROps.cpp b/flang/lib/Optimizer/Dialect/FIROps.cpp
index 4705033945611..253d0df911dd0 100644
--- a/flang/lib/Optimizer/Dialect/FIROps.cpp
+++ b/flang/lib/Optimizer/Dialect/FIROps.cpp
@@ -11,6 +11,7 @@
//===----------------------------------------------------------------------===//
#include "flang/Optimizer/Dialect/FIROps.h"
+#include "flang/Optimizer/Dialect/CUDAKernelOpInterface.h"
#include "flang/Optimizer/Dialect/FIRAttr.h"
#include "flang/Optimizer/Dialect/FIRDialect.h"
#include "flang/Optimizer/Dialect/FIROpsSupport.h"
@@ -587,6 +588,15 @@ struct SimplifyArrayCoorOp : public mlir::OpRewritePattern<fir::ArrayCoorOp> {
return mlir::isa<ACC_DATA_ENTRY_OPS>(u);
}))
return mlir::failure();
+ // Don't pull in rebox defined outside a CUDA kernel boundary when the
+ // array_coor is inside that kernel. CUF lowering converts such a rebox
+ // into a managed-memory descriptor that the kernel needs to receive as
+ // its argument; folding the rebox away would leave the kernel capturing
+ // the host-side descriptor directly, causing illegal device dereferences
+ // at runtime.
+ if (op->getParentOfType<fir::CUDAKernelOpInterface>() !=
+ reboxOp->getParentOfType<fir::CUDAKernelOpInterface>())
+ return mlir::failure();
boxedMemref = reboxOp.getBox();
boxedShape = reboxOp.getShape();
// Avoid pulling in rebox that performs reshaping.
diff --git a/flang/test/Fir/array-coor-canonicalization-cuf.fir b/flang/test/Fir/array-coor-canonicalization-cuf.fir
new file mode 100644
index 0000000000000..733206d7a797b
--- /dev/null
+++ b/flang/test/Fir/array-coor-canonicalization-cuf.fir
@@ -0,0 +1,66 @@
+// RUN: fir-opt --canonicalize --split-input-file %s | FileCheck %s
+
+// Verify that fir.rebox is NOT folded into fir.array_coor when the rebox is
+// defined outside a cuf.kernel and the array_coor is inside that kernel.
+// CUF lowering converts such a rebox into a managed-memory descriptor that
+// the kernel needs to receive as its argument; folding the rebox away would
+// leave the kernel capturing the host-side descriptor directly, causing
+// illegal device dereferences at runtime.
+// CHECK-LABEL: func.func @test_cuf_kernel_preserves_captured_rebox(
+// CHECK-SAME: %[[BOX:.*]]: !fir.box<!fir.array<?xf32>>
+// CHECK-SAME: %[[VAL:.*]]: f32
+// CHECK: %[[REBOX:.*]] = fir.rebox %[[BOX]]
+// CHECK: cuf.kernel
+// CHECK: %[[COOR:.*]] = fir.array_coor %[[REBOX]]
+// CHECK: fir.store %[[VAL]] to %[[COOR]]
+func.func @test_cuf_kernel_preserves_captured_rebox(%arg0: !fir.box<!fir.array<?xf32>> {cuf.data_attr = #cuf.cuda<device>}, %arg1: f32) {
+ %c1_i32 = arith.constant 1 : i32
+ %c1 = arith.constant 1 : index
+ %0 = fir.rebox %arg0 : (!fir.box<!fir.array<?xf32>>) -> !fir.box<!fir.array<?xf32>>
+ cuf.kernel<<<%c1_i32, %c1_i32>>> (%arg2 : index) = (%c1 : index) to (%c1 : index) step (%c1 : index) {
+ %1 = fir.array_coor %0 %c1 : (!fir.box<!fir.array<?xf32>>, index) -> !fir.ref<f32>
+ fir.store %arg1 to %1 : !fir.ref<f32>
+ "fir.end"() : () -> ()
+ }
+ return
+}
+
+// -----
+
+// When both the rebox and the array_coor are inside the same cuf.kernel, the
+// capture boundary is not crossed and folding must still happen.
+// CHECK-LABEL: func.func @test_cuf_kernel_folds_local_rebox(
+// CHECK-SAME: %[[BOX:.*]]: !fir.box<!fir.array<?xf32>>
+// CHECK-SAME: %[[VAL:.*]]: f32
+// CHECK-NOT: fir.rebox
+// CHECK: cuf.kernel
+// CHECK-NOT: fir.rebox
+// CHECK: %[[COOR:.*]] = fir.array_coor %[[BOX]]
+// CHECK: fir.store %[[VAL]] to %[[COOR]]
+func.func @test_cuf_kernel_folds_local_rebox(%arg0: !fir.box<!fir.array<?xf32>> {cuf.data_attr = #cuf.cuda<device>}, %arg1: f32) {
+ %c1_i32 = arith.constant 1 : i32
+ %c1 = arith.constant 1 : index
+ cuf.kernel<<<%c1_i32, %c1_i32>>> (%arg2 : index) = (%c1 : index) to (%c1 : index) step (%c1 : index) {
+ %0 = fir.rebox %arg0 : (!fir.box<!fir.array<?xf32>>) -> !fir.box<!fir.array<?xf32>>
+ %1 = fir.array_coor %0 %c1 : (!fir.box<!fir.array<?xf32>>, index) -> !fir.ref<f32>
+ fir.store %arg1 to %1 : !fir.ref<f32>
+ "fir.end"() : () -> ()
+ }
+ return
+}
+
+// -----
+
+// Without cuf.kernel, rebox SHOULD be folded normally (regression: the guard
+// must not affect ordinary host code).
+// CHECK-LABEL: func.func @test_no_cuf_folds_rebox(
+// CHECK-SAME: %[[BOX:.*]]: !fir.box<!fir.array<?xf32>>) -> !fir.ref<f32> {
+// CHECK-NOT: fir.rebox
+// CHECK: %[[COOR:.*]] = fir.array_coor %[[BOX]]
+// CHECK: return %[[COOR]] : !fir.ref<f32>
+func.func @test_no_cuf_folds_rebox(%arg0: !fir.box<!fir.array<?xf32>>) -> !fir.ref<f32> {
+ %c1 = arith.constant 1 : index
+ %0 = fir.rebox %arg0 : (!fir.box<!fir.array<?xf32>>) -> !fir.box<!fir.array<?xf32>>
+ %1 = fir.array_coor %0 %c1 : (!fir.box<!fir.array<?xf32>>, index) -> !fir.ref<f32>
+ return %1 : !fir.ref<f32>
+}
``````````
</details>
https://github.com/llvm/llvm-project/pull/193890
More information about the flang-commits
mailing list