[flang-commits] [flang] [flang][cuda] Preserve fir.rebox captured by cuf.kernel in SimplifyArrayCoorOp (PR #193837)
Zhen Wang via flang-commits
flang-commits at lists.llvm.org
Thu Apr 23 13:59:44 PDT 2026
https://github.com/wangzpgi created https://github.com/llvm/llvm-project/pull/193837
SimplifyArrayCoorOp folds fir.rebox → fir.array_coor by rewriting the array_coor to consume the rebox's source box directly. This is unsafe when the array_coor is inside a cuf.kernel and the rebox is not: in CUF, a rebox of a device-attributed value (e.g. a dummy with cuf.data_attr = device, or the result of __tgt_acc_get_deviceptr) lowers to _FortranACUFAllocDescriptor, which copies the descriptor into managed memory for the GPU. Folding it out skips the copy, so the kernel dereferences a host-side descriptor on the device, causing cudaErrorIllegalAddress (or cudaErrorUnknown on H100 under ATS).
The pattern already guards this hazard for OpenACC via ACC_COMPUTE_AND_DATA_CONSTRUCT_OPS / ACC_DATA_ENTRY_OPS; this change adds the analogous guard for cuf::KernelOp.
>From 77f3c95fdba340e29953fd0da817f5c9ffc64949 Mon Sep 17 00:00:00 2001
From: Zhen Wang <zhenw at nvidia.com>
Date: Thu, 23 Apr 2026 13:55:32 -0700
Subject: [PATCH] Preserve fir.rebox captured by cuf.kernel in
SimplifyArrayCoorOp
---
flang/lib/Optimizer/Dialect/CMakeLists.txt | 1 +
flang/lib/Optimizer/Dialect/FIROps.cpp | 10 +++
.../Fir/array-coor-canonicalization-cuf.fir | 67 +++++++++++++++++++
3 files changed, 78 insertions(+)
create mode 100644 flang/test/Fir/array-coor-canonicalization-cuf.fir
diff --git a/flang/lib/Optimizer/Dialect/CMakeLists.txt b/flang/lib/Optimizer/Dialect/CMakeLists.txt
index 0581e18ab0763..a17cfe3a010de 100644
--- a/flang/lib/Optimizer/Dialect/CMakeLists.txt
+++ b/flang/lib/Optimizer/Dialect/CMakeLists.txt
@@ -20,6 +20,7 @@ add_flang_library(FIRDialect
FIROpsIncGen
FIRSafeTempArrayCopyAttrInterfaceIncGen
CUFAttrsIncGen
+ CUFOpsIncGen
intrinsics_gen
LINK_LIBS
diff --git a/flang/lib/Optimizer/Dialect/FIROps.cpp b/flang/lib/Optimizer/Dialect/FIROps.cpp
index 4705033945611..0afbb90ca4e08 100644
--- a/flang/lib/Optimizer/Dialect/FIROps.cpp
+++ b/flang/lib/Optimizer/Dialect/FIROps.cpp
@@ -11,6 +11,7 @@
//===----------------------------------------------------------------------===//
#include "flang/Optimizer/Dialect/FIROps.h"
+#include "flang/Optimizer/Dialect/CUF/CUFOps.h"
#include "flang/Optimizer/Dialect/FIRAttr.h"
#include "flang/Optimizer/Dialect/FIRDialect.h"
#include "flang/Optimizer/Dialect/FIROpsSupport.h"
@@ -587,6 +588,15 @@ struct SimplifyArrayCoorOp : public mlir::OpRewritePattern<fir::ArrayCoorOp> {
return mlir::isa<ACC_DATA_ENTRY_OPS>(u);
}))
return mlir::failure();
+ // Don't pull in rebox defined outside a cuf.kernel when the array_coor
+ // is inside that kernel. CUF codegen converts such a rebox into a
+ // managed-memory descriptor (via _FortranACUFAllocDescriptor) that the
+ // kernel needs to receive as its argument; folding the rebox away would
+ // leave the kernel capturing the host-side descriptor directly, causing
+ // illegal device dereferences at runtime.
+ if (auto kernel = op->getParentOfType<cuf::KernelOp>())
+ if (reboxOp->getParentOfType<cuf::KernelOp>() != kernel)
+ return mlir::failure();
boxedMemref = reboxOp.getBox();
boxedShape = reboxOp.getShape();
// Avoid pulling in rebox that performs reshaping.
diff --git a/flang/test/Fir/array-coor-canonicalization-cuf.fir b/flang/test/Fir/array-coor-canonicalization-cuf.fir
new file mode 100644
index 0000000000000..312290e62785e
--- /dev/null
+++ b/flang/test/Fir/array-coor-canonicalization-cuf.fir
@@ -0,0 +1,67 @@
+// RUN: fir-opt --canonicalize --split-input-file %s | FileCheck %s
+
+// Verify that fir.rebox is NOT folded into fir.array_coor when the rebox
+// is defined outside a cuf.kernel and the array_coor is inside that kernel.
+// CUF codegen turns such a rebox into a managed-memory descriptor (via
+// _FortranACUFAllocDescriptor) that the kernel needs to receive as its
+// argument; folding the rebox away would leave the kernel capturing the
+// host-side descriptor directly, causing illegal device dereferences at
+// runtime.
+// CHECK-LABEL: func.func @test_cuf_kernel_preserves_captured_rebox(
+// CHECK-SAME: %[[BOX:.*]]: !fir.box<!fir.array<?xf32>>
+// CHECK-SAME: %[[VAL:.*]]: f32
+// CHECK: %[[REBOX:.*]] = fir.rebox %[[BOX]]
+// CHECK: cuf.kernel
+// CHECK: %[[COOR:.*]] = fir.array_coor %[[REBOX]]
+// CHECK: fir.store %[[VAL]] to %[[COOR]]
+func.func @test_cuf_kernel_preserves_captured_rebox(%arg0: !fir.box<!fir.array<?xf32>> {cuf.data_attr = #cuf.cuda<device>}, %arg1: f32) {
+ %c1_i32 = arith.constant 1 : i32
+ %c1 = arith.constant 1 : index
+ %0 = fir.rebox %arg0 : (!fir.box<!fir.array<?xf32>>) -> !fir.box<!fir.array<?xf32>>
+ cuf.kernel<<<%c1_i32, %c1_i32>>> (%arg2 : index) = (%c1 : index) to (%c1 : index) step (%c1 : index) {
+ %1 = fir.array_coor %0 %c1 : (!fir.box<!fir.array<?xf32>>, index) -> !fir.ref<f32>
+ fir.store %arg1 to %1 : !fir.ref<f32>
+ "fir.end"() : () -> ()
+ }
+ return
+}
+
+// -----
+
+// When both the rebox and the array_coor are inside the same cuf.kernel, the
+// capture boundary is not crossed and folding must still happen.
+// CHECK-LABEL: func.func @test_cuf_kernel_folds_local_rebox(
+// CHECK-SAME: %[[BOX:.*]]: !fir.box<!fir.array<?xf32>>
+// CHECK-SAME: %[[VAL:.*]]: f32
+// CHECK-NOT: fir.rebox
+// CHECK: cuf.kernel
+// CHECK-NOT: fir.rebox
+// CHECK: %[[COOR:.*]] = fir.array_coor %[[BOX]]
+// CHECK: fir.store %[[VAL]] to %[[COOR]]
+func.func @test_cuf_kernel_folds_local_rebox(%arg0: !fir.box<!fir.array<?xf32>> {cuf.data_attr = #cuf.cuda<device>}, %arg1: f32) {
+ %c1_i32 = arith.constant 1 : i32
+ %c1 = arith.constant 1 : index
+ cuf.kernel<<<%c1_i32, %c1_i32>>> (%arg2 : index) = (%c1 : index) to (%c1 : index) step (%c1 : index) {
+ %0 = fir.rebox %arg0 : (!fir.box<!fir.array<?xf32>>) -> !fir.box<!fir.array<?xf32>>
+ %1 = fir.array_coor %0 %c1 : (!fir.box<!fir.array<?xf32>>, index) -> !fir.ref<f32>
+ fir.store %arg1 to %1 : !fir.ref<f32>
+ "fir.end"() : () -> ()
+ }
+ return
+}
+
+// -----
+
+// Without cuf.kernel, rebox SHOULD be folded normally (regression: the guard
+// must not affect ordinary host code).
+// CHECK-LABEL: func.func @test_no_cuf_folds_rebox(
+// CHECK-SAME: %[[BOX:.*]]: !fir.box<!fir.array<?xf32>>) -> !fir.ref<f32> {
+// CHECK-NOT: fir.rebox
+// CHECK: %[[COOR:.*]] = fir.array_coor %[[BOX]]
+// CHECK: return %[[COOR]] : !fir.ref<f32>
+func.func @test_no_cuf_folds_rebox(%arg0: !fir.box<!fir.array<?xf32>>) -> !fir.ref<f32> {
+ %c1 = arith.constant 1 : index
+ %0 = fir.rebox %arg0 : (!fir.box<!fir.array<?xf32>>) -> !fir.box<!fir.array<?xf32>>
+ %1 = fir.array_coor %0 %c1 : (!fir.box<!fir.array<?xf32>>, index) -> !fir.ref<f32>
+ return %1 : !fir.ref<f32>
+}
More information about the flang-commits
mailing list