[flang-commits] [flang] [flang][cuda] Preserve fir.rebox captured by cuf.kernel in SimplifyArrayCoorOp (PR #193837)

via flang-commits flang-commits at lists.llvm.org
Thu Apr 23 14:00:14 PDT 2026


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-flang-fir-hlfir

Author: Zhen Wang (wangzpgi)

<details>
<summary>Changes</summary>

SimplifyArrayCoorOp folds fir.rebox → fir.array_coor by rewriting the array_coor to consume the rebox's source box directly. This is unsafe when the array_coor is inside a cuf.kernel and the rebox is not: in CUF, a rebox of a device-attributed value (e.g. a dummy with cuf.data_attr = device, or the result of __tgt_acc_get_deviceptr) lowers to _FortranACUFAllocDescriptor, which copies the descriptor into managed memory for the GPU. Folding it out skips the copy, so the kernel dereferences a host-side descriptor on the device, causing cudaErrorIllegalAddress (or cudaErrorUnknown on H100 under ATS).

The pattern already guards this hazard for OpenACC via ACC_COMPUTE_AND_DATA_CONSTRUCT_OPS / ACC_DATA_ENTRY_OPS; this change adds the analogous guard for cuf::KernelOp.

---
Full diff: https://github.com/llvm/llvm-project/pull/193837.diff


3 Files Affected:

- (modified) flang/lib/Optimizer/Dialect/CMakeLists.txt (+1) 
- (modified) flang/lib/Optimizer/Dialect/FIROps.cpp (+10) 
- (added) flang/test/Fir/array-coor-canonicalization-cuf.fir (+67) 


``````````diff
diff --git a/flang/lib/Optimizer/Dialect/CMakeLists.txt b/flang/lib/Optimizer/Dialect/CMakeLists.txt
index 0581e18ab0763..a17cfe3a010de 100644
--- a/flang/lib/Optimizer/Dialect/CMakeLists.txt
+++ b/flang/lib/Optimizer/Dialect/CMakeLists.txt
@@ -20,6 +20,7 @@ add_flang_library(FIRDialect
   FIROpsIncGen
   FIRSafeTempArrayCopyAttrInterfaceIncGen
   CUFAttrsIncGen
+  CUFOpsIncGen
   intrinsics_gen
 
   LINK_LIBS
diff --git a/flang/lib/Optimizer/Dialect/FIROps.cpp b/flang/lib/Optimizer/Dialect/FIROps.cpp
index 4705033945611..0afbb90ca4e08 100644
--- a/flang/lib/Optimizer/Dialect/FIROps.cpp
+++ b/flang/lib/Optimizer/Dialect/FIROps.cpp
@@ -11,6 +11,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "flang/Optimizer/Dialect/FIROps.h"
+#include "flang/Optimizer/Dialect/CUF/CUFOps.h"
 #include "flang/Optimizer/Dialect/FIRAttr.h"
 #include "flang/Optimizer/Dialect/FIRDialect.h"
 #include "flang/Optimizer/Dialect/FIROpsSupport.h"
@@ -587,6 +588,15 @@ struct SimplifyArrayCoorOp : public mlir::OpRewritePattern<fir::ArrayCoorOp> {
             return mlir::isa<ACC_DATA_ENTRY_OPS>(u);
           }))
         return mlir::failure();
+      // Don't pull in rebox defined outside a cuf.kernel when the array_coor
+      // is inside that kernel. CUF codegen converts such a rebox into a
+      // managed-memory descriptor (via _FortranACUFAllocDescriptor) that the
+      // kernel needs to receive as its argument; folding the rebox away would
+      // leave the kernel capturing the host-side descriptor directly, causing
+      // illegal device dereferences at runtime.
+      if (auto kernel = op->getParentOfType<cuf::KernelOp>())
+        if (reboxOp->getParentOfType<cuf::KernelOp>() != kernel)
+          return mlir::failure();
       boxedMemref = reboxOp.getBox();
       boxedShape = reboxOp.getShape();
       // Avoid pulling in rebox that performs reshaping.
diff --git a/flang/test/Fir/array-coor-canonicalization-cuf.fir b/flang/test/Fir/array-coor-canonicalization-cuf.fir
new file mode 100644
index 0000000000000..312290e62785e
--- /dev/null
+++ b/flang/test/Fir/array-coor-canonicalization-cuf.fir
@@ -0,0 +1,67 @@
+// RUN: fir-opt --canonicalize --split-input-file %s | FileCheck %s
+
+// Verify that fir.rebox is NOT folded into fir.array_coor when the rebox
+// is defined outside a cuf.kernel and the array_coor is inside that kernel.
+// CUF codegen turns such a rebox into a managed-memory descriptor (via
+// _FortranACUFAllocDescriptor) that the kernel needs to receive as its
+// argument; folding the rebox away would leave the kernel capturing the
+// host-side descriptor directly, causing illegal device dereferences at
+// runtime.
+// CHECK-LABEL:   func.func @test_cuf_kernel_preserves_captured_rebox(
+// CHECK-SAME:        %[[BOX:.*]]: !fir.box<!fir.array<?xf32>>
+// CHECK-SAME:        %[[VAL:.*]]: f32
+// CHECK:           %[[REBOX:.*]] = fir.rebox %[[BOX]]
+// CHECK:           cuf.kernel
+// CHECK:             %[[COOR:.*]] = fir.array_coor %[[REBOX]]
+// CHECK:             fir.store %[[VAL]] to %[[COOR]]
+func.func @test_cuf_kernel_preserves_captured_rebox(%arg0: !fir.box<!fir.array<?xf32>> {cuf.data_attr = #cuf.cuda<device>}, %arg1: f32) {
+  %c1_i32 = arith.constant 1 : i32
+  %c1 = arith.constant 1 : index
+  %0 = fir.rebox %arg0 : (!fir.box<!fir.array<?xf32>>) -> !fir.box<!fir.array<?xf32>>
+  cuf.kernel<<<%c1_i32, %c1_i32>>> (%arg2 : index) = (%c1 : index) to (%c1 : index) step (%c1 : index) {
+    %1 = fir.array_coor %0 %c1 : (!fir.box<!fir.array<?xf32>>, index) -> !fir.ref<f32>
+    fir.store %arg1 to %1 : !fir.ref<f32>
+    "fir.end"() : () -> ()
+  }
+  return
+}
+
+// -----
+
+// When both the rebox and the array_coor are inside the same cuf.kernel, the
+// capture boundary is not crossed and folding must still happen.
+// CHECK-LABEL:   func.func @test_cuf_kernel_folds_local_rebox(
+// CHECK-SAME:        %[[BOX:.*]]: !fir.box<!fir.array<?xf32>>
+// CHECK-SAME:        %[[VAL:.*]]: f32
+// CHECK-NOT:       fir.rebox
+// CHECK:           cuf.kernel
+// CHECK-NOT:         fir.rebox
+// CHECK:             %[[COOR:.*]] = fir.array_coor %[[BOX]]
+// CHECK:             fir.store %[[VAL]] to %[[COOR]]
+func.func @test_cuf_kernel_folds_local_rebox(%arg0: !fir.box<!fir.array<?xf32>> {cuf.data_attr = #cuf.cuda<device>}, %arg1: f32) {
+  %c1_i32 = arith.constant 1 : i32
+  %c1 = arith.constant 1 : index
+  cuf.kernel<<<%c1_i32, %c1_i32>>> (%arg2 : index) = (%c1 : index) to (%c1 : index) step (%c1 : index) {
+    %0 = fir.rebox %arg0 : (!fir.box<!fir.array<?xf32>>) -> !fir.box<!fir.array<?xf32>>
+    %1 = fir.array_coor %0 %c1 : (!fir.box<!fir.array<?xf32>>, index) -> !fir.ref<f32>
+    fir.store %arg1 to %1 : !fir.ref<f32>
+    "fir.end"() : () -> ()
+  }
+  return
+}
+
+// -----
+
+// Without cuf.kernel, rebox SHOULD be folded normally (regression: the guard
+// must not affect ordinary host code).
+// CHECK-LABEL:   func.func @test_no_cuf_folds_rebox(
+// CHECK-SAME:        %[[BOX:.*]]: !fir.box<!fir.array<?xf32>>) -> !fir.ref<f32> {
+// CHECK-NOT:       fir.rebox
+// CHECK:           %[[COOR:.*]] = fir.array_coor %[[BOX]]
+// CHECK:           return %[[COOR]] : !fir.ref<f32>
+func.func @test_no_cuf_folds_rebox(%arg0: !fir.box<!fir.array<?xf32>>) -> !fir.ref<f32> {
+  %c1 = arith.constant 1 : index
+  %0 = fir.rebox %arg0 : (!fir.box<!fir.array<?xf32>>) -> !fir.box<!fir.array<?xf32>>
+  %1 = fir.array_coor %0 %c1 : (!fir.box<!fir.array<?xf32>>, index) -> !fir.ref<f32>
+  return %1 : !fir.ref<f32>
+}

``````````

</details>


https://github.com/llvm/llvm-project/pull/193837


More information about the flang-commits mailing list