[flang-commits] [flang] [flang][hlfir] Extend InlineHLFIRCopy to inline copy_out with copy-back (PR #202290)
Kareem Ergawy via flang-commits
flang-commits at lists.llvm.org
Mon Jun 8 01:37:41 PDT 2026
https://github.com/ergawy created https://github.com/llvm/llvm-project/pull/202290
Rename `InlineHLFIRCopyIn` to `InlineHLFIRCopy` and extend it to inline the paired `hlfir.copy_out` operation. The copy_out is inlined at its original location, after the call, so that the temporary is deallocated only after the call has completed.
Only copy_out operations that require no copy-back (intent(in)) are inlined; intent(inout/out) copy_in/copy_out pairs are left untransformed.
Based on https://github.com/llvm/llvm-project/pull/179096.
Co-Authored-By: Kazuaki Matsumura <kmatsumura at nvidia.com> (Original author of the changes).
Co-Authored-By: Claude Sonnet 4.6 <noreply at anthropic.com>
>From d78b2645c5745df93cc3f9d3617cd9253175ba33 Mon Sep 17 00:00:00 2001
From: ergawy <kareem.ergawy at gmail.com>
Date: Mon, 8 Jun 2026 01:12:16 -0700
Subject: [PATCH] [flang][hlfir] Extend InlineHLFIRCopy to inline copy_out with
copy-back
Rename `InlineHLFIRCopyIn` to `InlineHLFIRCopy` and extend it to inline
the paired `hlfir.copy_out` operation. The copy_out is inlined at its
original location, after the call, so that the temporary is deallocated
only after the call has completed.
Only copy_out operations that require no copy-back (intent(in)) are
inlined; intent(inout/out) copy_in/copy_out pairs are left untransformed.
Based on https://github.com/llvm/llvm-project/pull/179096.
Co-Authored-By: Kazuaki Matsumura <kmatsumura at nvidia.com>
Co-Authored-By: Claude Sonnet 4.6 <noreply at anthropic.com>
---
flang/include/flang/Optimizer/HLFIR/Passes.td | 4 +-
.../Optimizer/HLFIR/Transforms/CMakeLists.txt | 2 +-
...ineHLFIRCopyIn.cpp => InlineHLFIRCopy.cpp} | 79 +++++++++++++------
flang/lib/Optimizer/Passes/Pipelines.cpp | 2 +-
...lfir-copy-in.fir => inline-hlfir-copy.fir} | 52 +++++-------
5 files changed, 80 insertions(+), 59 deletions(-)
rename flang/lib/Optimizer/HLFIR/Transforms/{InlineHLFIRCopyIn.cpp => InlineHLFIRCopy.cpp} (70%)
rename flang/test/HLFIR/{inline-hlfir-copy-in.fir => inline-hlfir-copy.fir} (78%)
diff --git a/flang/include/flang/Optimizer/HLFIR/Passes.td b/flang/include/flang/Optimizer/HLFIR/Passes.td
index 4973715c1055c..2d57a50acb304 100644
--- a/flang/include/flang/Optimizer/HLFIR/Passes.td
+++ b/flang/include/flang/Optimizer/HLFIR/Passes.td
@@ -104,8 +104,8 @@ def InlineHLFIRAssign : Pass<"inline-hlfir-assign"> {
"device code without inlining all array assignments.">];
}
-def InlineHLFIRCopyIn : Pass<"inline-hlfir-copy-in"> {
- let summary = "Inline hlfir.copy_in operations";
+def InlineHLFIRCopy : Pass<"inline-hlfir-copy"> {
+ let summary = "Inline hlfir.copy_in and hlfir.copy_out operations";
}
def PropagateFortranVariableAttributes : Pass<"propagate-fortran-attrs"> {
diff --git a/flang/lib/Optimizer/HLFIR/Transforms/CMakeLists.txt b/flang/lib/Optimizer/HLFIR/Transforms/CMakeLists.txt
index c0c64c19e3826..5e9d57407ad09 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/CMakeLists.txt
+++ b/flang/lib/Optimizer/HLFIR/Transforms/CMakeLists.txt
@@ -7,7 +7,7 @@ add_flang_library(HLFIRTransforms
InlineElementals.cpp
InlineHLFIRAssign.cpp
SeparateAllocatableAssign.cpp
- InlineHLFIRCopyIn.cpp
+ InlineHLFIRCopy.cpp
LowerHLFIRIntrinsics.cpp
LowerHLFIROrderedAssignments.cpp
ScheduleOrderedAssignments.cpp
diff --git a/flang/lib/Optimizer/HLFIR/Transforms/InlineHLFIRCopyIn.cpp b/flang/lib/Optimizer/HLFIR/Transforms/InlineHLFIRCopy.cpp
similarity index 70%
rename from flang/lib/Optimizer/HLFIR/Transforms/InlineHLFIRCopyIn.cpp
rename to flang/lib/Optimizer/HLFIR/Transforms/InlineHLFIRCopy.cpp
index b4e89b0966e9c..f44db65a7a847 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/InlineHLFIRCopyIn.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/InlineHLFIRCopy.cpp
@@ -1,14 +1,19 @@
-//===- InlineHLFIRCopyIn.cpp - Inline hlfir.copy_in ops -------------------===//
+//===- InlineHLFIRCopy.cpp - Inline hlfir.copy_in/copy_out ops ------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-// Transform hlfir.copy_in array operations into loop nests performing element
-// per element assignments. For simplicity, the inlining is done for trivial
-// data types when the copy_in does not require a corresponding copy_out and
-// when the input array is not behind a pointer. This may change in the future.
+// Transform hlfir.copy_in and hlfir.copy_out array operations into loop nests
+// performing element per element assignments. For simplicity, the inlining is
+// done for trivial data types when the input array is not behind a pointer.
+// This may change in the future.
+//
+// When the copy_in is inlined, the corresponding copy_out is also inlined.
+// Currently only intent(in) (deallocation-only) copy_out ops are inlined;
+// the copy_in/copy_out pair is left as-is when copy-back is required
+// (intent(inout/out)). Copy-back inlining may be added in the future.
//===----------------------------------------------------------------------===//
#include "flang/Optimizer/Builder/FIRBuilder.h"
@@ -21,15 +26,15 @@
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
namespace hlfir {
-#define GEN_PASS_DEF_INLINEHLFIRCOPYIN
+#define GEN_PASS_DEF_INLINEHLFIRCOPY
#include "flang/Optimizer/HLFIR/Passes.h.inc"
} // namespace hlfir
-#define DEBUG_TYPE "inline-hlfir-copy-in"
+#define DEBUG_TYPE "inline-hlfir-copy"
-static llvm::cl::opt<bool> noInlineHLFIRCopyIn(
- "no-inline-hlfir-copy-in",
- llvm::cl::desc("Do not inline hlfir.copy_in operations"),
+static llvm::cl::opt<bool> noInlineHLFIRCopy(
+ "no-inline-hlfir-copy",
+ llvm::cl::desc("Do not inline hlfir.copy_in/copy_out operations"),
llvm::cl::init(false));
namespace {
@@ -42,6 +47,26 @@ class InlineCopyInConversion : public mlir::OpRewritePattern<hlfir::CopyInOp> {
mlir::PatternRewriter &rewriter) const override;
};
+// Inline a deallocation-only copy_out (no copy-back): load the box stored at
+// tempBox and free its base address, i.e. if (wasCopied) { freemem(temp) }.
+static void inlineCopyOut(fir::FirOpBuilder &builder, mlir::Location loc,
+ mlir::Value tempBox, mlir::Value wasCopied,
+ mlir::Type sequenceType) {
+ builder.genIfOp(loc, {}, wasCopied, /*withElseRegion=*/false).genThen([&]() {
+ mlir::Value box = fir::LoadOp::create(builder, loc, tempBox);
+ mlir::Value addr = fir::BoxAddrOp::create(builder, loc, box);
+ auto heapType = fir::HeapType::get(sequenceType);
+ mlir::Value heapAddr = fir::ConvertOp::create(builder, loc, heapType, addr);
+ fir::FreeMemOp::create(builder, loc, heapAddr);
+ });
+}
+
+// Note: We don't have a separate InlineCopyOutConversion pattern.
+// Copy_out inlining is handled by InlineCopyInConversion when it inlines
+// the paired copy_in. For copy_outs that aren't paired with an eligible
+// copy_in (e.g., optional args, assumed-rank, non-trivial types), the
+// copy_out is left as-is and will be lowered to a runtime call.
+
llvm::LogicalResult
InlineCopyInConversion::matchAndRewrite(hlfir::CopyInOp copyIn,
mlir::PatternRewriter &rewriter) const {
@@ -81,6 +106,12 @@ InlineCopyInConversion::matchAndRewrite(hlfir::CopyInOp copyIn,
mlir::Type sequenceType =
hlfir::getFortranElementOrSequenceType(inputVariable.getType());
fir::BoxType resultBoxType = fir::BoxType::get(sequenceType);
+
+ // Compute shape for use in the copy-in loop and temporary declaration.
+ mlir::Value shape = hlfir::genShape(loc, builder, inputVariable);
+ llvm::SmallVector<mlir::Value> extents =
+ hlfir::getIndexExtents(loc, builder, shape);
+
mlir::Value isContiguous =
fir::IsContiguousBoxOp::create(builder, loc, inputVariable);
mlir::Operation::result_range results =
@@ -99,9 +130,6 @@ InlineCopyInConversion::matchAndRewrite(hlfir::CopyInOp copyIn,
mlir::ValueRange{result, builder.createBool(loc, false)});
})
.genElse([&] {
- mlir::Value shape = hlfir::genShape(loc, builder, inputVariable);
- llvm::SmallVector<mlir::Value> extents =
- hlfir::getIndexExtents(loc, builder, shape);
llvm::StringRef tmpName{".tmp.copy_in"};
llvm::SmallVector<mlir::Value> lenParams;
mlir::Value alloc = builder.createHeapTemporary(
@@ -148,20 +176,25 @@ InlineCopyInConversion::matchAndRewrite(hlfir::CopyInOp copyIn,
mlir::OpResult resultBox = results[0];
mlir::OpResult needsCleanup = results[1];
- // Prepare the corresponding copyOut to free the temporary if it is required
+ // Inline the corresponding copyOut (deallocation only).
+ // Store the resultBox first since it's a box value.
auto alloca = fir::AllocaOp::create(builder, loc, resultBox.getType());
- auto store = fir::StoreOp::create(builder, loc, resultBox, alloca);
- rewriter.startOpModification(copyOut);
- copyOut->setOperand(0, store.getMemref());
- copyOut->setOperand(1, needsCleanup);
- rewriter.finalizeOpModification(copyOut);
+ fir::StoreOp::create(builder, loc, resultBox, alloca);
+
+ rewriter.setInsertionPoint(copyOut);
+ fir::FirOpBuilder copyOutBuilder(rewriter, copyOut.getOperation());
+ inlineCopyOut(copyOutBuilder, copyOut.getLoc(), alloca, needsCleanup,
+ sequenceType);
+
+ // Erase the copyOut since we've inlined it
+ rewriter.eraseOp(copyOut);
rewriter.replaceOp(copyIn, {resultBox, builder.genNot(loc, isContiguous)});
return mlir::success();
}
-class InlineHLFIRCopyInPass
- : public hlfir::impl::InlineHLFIRCopyInBase<InlineHLFIRCopyInPass> {
+class InlineHLFIRCopyPass
+ : public hlfir::impl::InlineHLFIRCopyBase<InlineHLFIRCopyPass> {
public:
void runOnOperation() override {
mlir::MLIRContext *context = &getContext();
@@ -172,14 +205,14 @@ class InlineHLFIRCopyInPass
mlir::GreedySimplifyRegionLevel::Disabled);
mlir::RewritePatternSet patterns(context);
- if (!noInlineHLFIRCopyIn) {
+ if (!noInlineHLFIRCopy) {
patterns.insert<InlineCopyInConversion>(context);
}
if (mlir::failed(mlir::applyPatternsGreedily(
getOperation(), std::move(patterns), config))) {
mlir::emitError(getOperation()->getLoc(),
- "failure in hlfir.copy_in inlining");
+ "failure in hlfir.copy_in/copy_out inlining");
signalPassFailure();
}
}
diff --git a/flang/lib/Optimizer/Passes/Pipelines.cpp b/flang/lib/Optimizer/Passes/Pipelines.cpp
index 682e3e48e0a22..cbb0abbeef75d 100644
--- a/flang/lib/Optimizer/Passes/Pipelines.cpp
+++ b/flang/lib/Optimizer/Passes/Pipelines.cpp
@@ -311,7 +311,7 @@ void createHLFIRToFIRPassPipeline(mlir::PassManager &pm,
if (optLevel == llvm::OptimizationLevel::O3) {
addNestedPassToAllTopLevelOperations<PassConstructor>(
- pm, hlfir::createInlineHLFIRCopyIn);
+ pm, hlfir::createInlineHLFIRCopy);
}
} else {
// At O0, only inline scalar-to-array broadcasts. This avoids emitting
diff --git a/flang/test/HLFIR/inline-hlfir-copy-in.fir b/flang/test/HLFIR/inline-hlfir-copy.fir
similarity index 78%
rename from flang/test/HLFIR/inline-hlfir-copy-in.fir
rename to flang/test/HLFIR/inline-hlfir-copy.fir
index f1da1da9f9a5c..d8a96ca2c0b04 100644
--- a/flang/test/HLFIR/inline-hlfir-copy-in.fir
+++ b/flang/test/HLFIR/inline-hlfir-copy.fir
@@ -1,5 +1,5 @@
-// Test inlining of hlfir.copy_in
-// RUN: fir-opt --inline-hlfir-copy-in %s | FileCheck %s
+// Test inlining of hlfir.copy_in and hlfir.copy_out
+// RUN: fir-opt --inline-hlfir-copy %s | FileCheck %s
// Test inlining of hlfir.copy_in that does not require the array to be copied out
func.func private @_test_inline_copy_in(%arg0: !fir.box<!fir.array<?x?x?xf64>> {fir.bindc_name = "x"}, %arg1: !fir.ref<i32> {fir.bindc_name = "i"}, %arg2: !fir.ref<i32> {fir.bindc_name = "j"}) {
@@ -72,16 +72,23 @@ func.func private @_test_inline_copy_in(%arg0: !fir.box<!fir.array<?x?x?xf64>> {
// CHECK: }
// CHECK: fir.result %[[VAL_25:.*]]#0, %[[VAL_3:.*]] : !fir.box<!fir.array<?xf64>>, i1
// CHECK: }
+// CHECK: %[[VAL_ALLOCA:.*]] = fir.alloca !fir.box<!fir.array<?xf64>>
+// CHECK: fir.store %[[VAL_21:.*]]#0 to %[[VAL_ALLOCA:.*]] : !fir.ref<!fir.box<!fir.array<?xf64>>>
// CHECK: %[[VAL_22:.*]] = fir.box_addr %[[VAL_21:.*]]#0 : (!fir.box<!fir.array<?xf64>>) -> !fir.ref<!fir.array<?xf64>>
// CHECK: %[[VAL_23:.*]]:3 = hlfir.associate %[[VAL_5:.*]] {adapt.valuebyref} : (i32) -> (!fir.ref<i32>, !fir.ref<i32>, i1)
// CHECK: fir.call @_QFPsb(%[[VAL_22:.*]], %[[VAL_23:.*]]#0) fastmath<contract> : (!fir.ref<!fir.array<?xf64>>, !fir.ref<i32>) -> ()
-// CHECK: hlfir.copy_out %{{.*}}, %[[VAL_21:.*]]#1 : (!fir.ref<!fir.box<!fir.array<?xf64>>>, i1) -> ()
+// CHECK: fir.if %[[VAL_21:.*]]#1 {
+// CHECK: %[[VAL_BOX:.*]] = fir.load %[[VAL_ALLOCA:.*]] : !fir.ref<!fir.box<!fir.array<?xf64>>>
+// CHECK: %[[VAL_ADDR:.*]] = fir.box_addr %[[VAL_BOX:.*]] : (!fir.box<!fir.array<?xf64>>) -> !fir.ref<!fir.array<?xf64>>
+// CHECK: %[[VAL_HEAP:.*]] = fir.convert %[[VAL_ADDR:.*]] : (!fir.ref<!fir.array<?xf64>>) -> !fir.heap<!fir.array<?xf64>>
+// CHECK: fir.freemem %[[VAL_HEAP:.*]] : !fir.heap<!fir.array<?xf64>>
+// CHECK: }
// CHECK: hlfir.end_associate %[[VAL_23:.*]]#1, %[[VAL_23:.*]]#2 : !fir.ref<i32>, i1
// CHECK: return
// CHECK: }
-// Test not inlining of hlfir.copy_in that requires the array to be copied out
-func.func private @_test_no_inline_copy_in(%arg0: !fir.box<!fir.array<?x?x?xf64>> {fir.bindc_name = "x"}, %arg1: !fir.ref<i32> {fir.bindc_name = "i"}, %arg2: !fir.ref<i32> {fir.bindc_name = "j"}) {
+// Test that hlfir.copy_in is not inlined when copy-back is required (intent(inout))
+func.func private @_test_inline_copy_in_with_copyback(%arg0: !fir.box<!fir.array<?x?x?xf64>> {fir.bindc_name = "x"}, %arg1: !fir.ref<i32> {fir.bindc_name = "i"}, %arg2: !fir.ref<i32> {fir.bindc_name = "j"}) {
%0 = fir.alloca !fir.box<!fir.heap<!fir.array<?xf64>>>
%1 = fir.dummy_scope : !fir.dscope
%2:2 = hlfir.declare %arg1 dummy_scope %1 {uniq_name = "_QFFsb2Ei"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
@@ -113,33 +120,14 @@ func.func private @_test_no_inline_copy_in(%arg0: !fir.box<!fir.array<?x?x?xf64>
return
}
-// CHECK-LABEL: func.func private @_test_no_inline_copy_in(
-// CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<?x?x?xf64>> {fir.bindc_name = "x"},
-// CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<i32> {fir.bindc_name = "i"},
-// CHECK-SAME: %[[VAL_2:.*]]: !fir.ref<i32> {fir.bindc_name = "j"}) {
-// CHECK: %[[VAL_3:.*]] = arith.constant 100 : i32
-// CHECK: %[[VAL_4:.*]] = arith.constant 0 : index
-// CHECK: %[[VAL_5:.*]] = arith.constant 1 : index
-// CHECK: %[[VAL_6:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xf64>>>
-// CHECK: %[[VAL_7:.*]] = fir.dummy_scope : !fir.dscope
-// CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_1:.*]] dummy_scope %[[VAL_7:.*]] {uniq_name = "_QFFsb2Ei"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
-// CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_2:.*]] dummy_scope %[[VAL_7:.*]] {uniq_name = "_QFFsb2Ej"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
-// CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_0:.*]] dummy_scope %[[VAL_7:.*]] {uniq_name = "_QFFsb2Ex"} : (!fir.box<!fir.array<?x?x?xf64>>, !fir.dscope) -> (!fir.box<!fir.array<?x?x?xf64>>, !fir.box<!fir.array<?x?x?xf64>>)
-// CHECK: %[[VAL_11:.*]] = fir.load %[[VAL_8:.*]]#0 : !fir.ref<i32>
-// CHECK: %[[VAL_12:.*]] = fir.convert %[[VAL_11:.*]] : (i32) -> i64
-// CHECK: %[[VAL_13:.*]]:3 = fir.box_dims %[[VAL_10:.*]]#1, %[[VAL_5:.*]] : (!fir.box<!fir.array<?x?x?xf64>>, index) -> (index, index, index)
-// CHECK: %[[VAL_14:.*]] = arith.cmpi sgt, %[[VAL_13:.*]]#1, %[[VAL_4:.*]] : index
-// CHECK: %[[VAL_15:.*]] = arith.select %[[VAL_14:.*]], %[[VAL_13:.*]]#1, %[[VAL_4:.*]] : index
-// CHECK: %[[VAL_16:.*]] = fir.load %[[VAL_9:.*]]#0 : !fir.ref<i32>
-// CHECK: %[[VAL_17:.*]] = fir.convert %[[VAL_16:.*]] : (i32) -> i64
-// CHECK: %[[VAL_18:.*]] = fir.shape %[[VAL_15:.*]] : (index) -> !fir.shape<1>
-// CHECK: %[[VAL_19:.*]] = hlfir.designate %[[VAL_10:.*]]#0 (%[[VAL_12:.*]], %[[VAL_5:.*]]:%[[VAL_13:.*]]#1:%[[VAL_5:.*]], %[[VAL_17:.*]]) shape %[[VAL_18:.*]] : (!fir.box<!fir.array<?x?x?xf64>>, i64, index, index, index, i64, !fir.shape<1>) -> !fir.box<!fir.array<?xf64>>
-// CHECK: %[[VAL_20:.*]]:2 = hlfir.copy_in %[[VAL_19:.*]] to %[[VAL_6:.*]] : (!fir.box<!fir.array<?xf64>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>) -> (!fir.box<!fir.array<?xf64>>, i1)
-// CHECK: %[[VAL_21:.*]] = fir.box_addr %[[VAL_20:.*]]#0 : (!fir.box<!fir.array<?xf64>>) -> !fir.ref<!fir.array<?xf64>>
-// CHECK: %[[VAL_22:.*]]:3 = hlfir.associate %[[VAL_3:.*]] {adapt.valuebyref} : (i32) -> (!fir.ref<i32>, !fir.ref<i32>, i1)
-// CHECK: fir.call @_QFPsb(%[[VAL_21:.*]], %[[VAL_22:.*]]#1) fastmath<contract> : (!fir.ref<!fir.array<?xf64>>, !fir.ref<i32>) -> ()
-// CHECK: hlfir.copy_out %[[VAL_6:.*]], %[[VAL_20:.*]]#1 to %[[VAL_19:.*]] : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>, i1, !fir.box<!fir.array<?xf64>>) -> ()
-// CHECK: hlfir.end_associate %[[VAL_22:.*]]#1, %[[VAL_22:.*]]#2 : !fir.ref<i32>, i1
+// CHECK-LABEL: func.func private @_test_inline_copy_in_with_copyback(
+// CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<?x?x?xf64>> {fir.bindc_name = "x"},
+// CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<i32> {fir.bindc_name = "i"},
+// CHECK-SAME: %[[VAL_2:.*]]: !fir.ref<i32> {fir.bindc_name = "j"}) {
+// CHECK: %[[TMP:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xf64>>>
+// CHECK: %[[COPYIN:.*]]:2 = hlfir.copy_in %{{.*}} to %[[TMP]] : (!fir.box<!fir.array<?xf64>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>) -> (!fir.box<!fir.array<?xf64>>, i1)
+// CHECK: fir.call @_QFPsb(
+// CHECK: hlfir.copy_out %[[TMP]], %[[COPYIN]]#1 to %{{.*}} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>, i1, !fir.box<!fir.array<?xf64>>) -> ()
// CHECK: return
// CHECK: }
More information about the flang-commits
mailing list