[flang-commits] [flang] [flang] Inline hlfir.copy_in for trivial types (PR #138718)

Kajetan Puchalski via flang-commits flang-commits at lists.llvm.org
Wed May 28 12:53:39 PDT 2025


https://github.com/mrkajetanp updated https://github.com/llvm/llvm-project/pull/138718

>From 005c599ffcbb1ee257ad8ca510d8ecd649fcab7b Mon Sep 17 00:00:00 2001
From: Kajetan Puchalski <kajetan.puchalski at arm.com>
Date: Thu, 10 Apr 2025 14:04:52 +0000
Subject: [PATCH 1/5] [flang] Inline hlfir.copy_in for trivial types

hlfir.copy_in implements copying of non-contiguous array slices for
functions that require their array arguments to be contiguous. It does so
through a flang-rt function that calls memcpy/memmove separately on each
element.

For large arrays of trivial types, this can incur considerable overhead
compared to a plain copy loop that is better able to take advantage of
hardware pipelines.

To address that, extend the InlineHLFIRAssign optimisation pass with a
new pattern for inlining hlfir.copy_in operations for trivial types.

For the time being, the pattern is only applied in cases where the
copy-in does not require a corresponding copy-out, such as when the
function being called declares the array parameter as intent(in).

Applying this optimisation reduces the runtime of thornado-mini's
DeleptonizationProblem by about one third.

Signed-off-by: Kajetan Puchalski <kajetan.puchalski at arm.com>
---
 .../HLFIR/Transforms/InlineHLFIRAssign.cpp    | 117 ++++++++++++++++++
 1 file changed, 117 insertions(+)

diff --git a/flang/lib/Optimizer/HLFIR/Transforms/InlineHLFIRAssign.cpp b/flang/lib/Optimizer/HLFIR/Transforms/InlineHLFIRAssign.cpp
index 6e209cce07ad4..38c684eaceb7d 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/InlineHLFIRAssign.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/InlineHLFIRAssign.cpp
@@ -13,6 +13,7 @@
 #include "flang/Optimizer/Analysis/AliasAnalysis.h"
 #include "flang/Optimizer/Builder/FIRBuilder.h"
 #include "flang/Optimizer/Builder/HLFIRTools.h"
+#include "flang/Optimizer/Dialect/FIRType.h"
 #include "flang/Optimizer/HLFIR/HLFIROps.h"
 #include "flang/Optimizer/HLFIR/Passes.h"
 #include "flang/Optimizer/OpenMP/Passes.h"
@@ -127,6 +128,121 @@ class InlineHLFIRAssignConversion
   }
 };
 
+class InlineCopyInConversion : public mlir::OpRewritePattern<hlfir::CopyInOp> {
+public:
+  using mlir::OpRewritePattern<hlfir::CopyInOp>::OpRewritePattern;
+
+  llvm::LogicalResult
+  matchAndRewrite(hlfir::CopyInOp copyIn,
+                  mlir::PatternRewriter &rewriter) const override;
+};
+
+llvm::LogicalResult
+InlineCopyInConversion::matchAndRewrite(hlfir::CopyInOp copyIn,
+                                        mlir::PatternRewriter &rewriter) const {
+  fir::FirOpBuilder builder(rewriter, copyIn.getOperation());
+  mlir::Location loc = copyIn.getLoc();
+  hlfir::Entity inputVariable{copyIn.getVar()};
+  if (!fir::isa_trivial(inputVariable.getFortranElementType()))
+    return rewriter.notifyMatchFailure(copyIn,
+                                       "CopyInOp's data type is not trivial");
+
+  if (fir::isPointerType(inputVariable.getType()))
+    return rewriter.notifyMatchFailure(
+        copyIn, "CopyInOp's input variable is a pointer");
+
+  // There should be exactly one user of WasCopied - the corresponding
+  // CopyOutOp.
+  if (copyIn.getWasCopied().getUses().empty())
+    return rewriter.notifyMatchFailure(copyIn,
+                                       "CopyInOp's WasCopied has no uses");
+  // The copy out should always be present, either to actually copy or just
+  // deallocate memory.
+  auto *copyOut =
+      copyIn.getWasCopied().getUsers().begin().getCurrent().getUser();
+
+  if (!mlir::isa<hlfir::CopyOutOp>(copyOut))
+    return rewriter.notifyMatchFailure(copyIn,
+                                       "CopyInOp has no direct CopyOut");
+
+  // Only inline the copy_in when copy_out does not need to be done, i.e. in
+  // case of intent(in).
+  if (::llvm::cast<hlfir::CopyOutOp>(copyOut).getVar())
+    return rewriter.notifyMatchFailure(copyIn, "CopyIn needs a copy-out");
+
+  inputVariable =
+      hlfir::derefPointersAndAllocatables(loc, builder, inputVariable);
+  mlir::Type resultAddrType = copyIn.getCopiedIn().getType();
+  mlir::Value isContiguous =
+      builder.create<fir::IsContiguousBoxOp>(loc, inputVariable);
+  auto results =
+      builder
+          .genIfOp(loc, {resultAddrType, builder.getI1Type()}, isContiguous,
+                   /*withElseRegion=*/true)
+          .genThen([&]() {
+            mlir::Value falseVal = builder.create<mlir::arith::ConstantOp>(
+                loc, builder.getI1Type(), builder.getBoolAttr(false));
+            builder.create<fir::ResultOp>(
+                loc, mlir::ValueRange{inputVariable, falseVal});
+          })
+          .genElse([&] {
+            auto [temp, cleanup] =
+                hlfir::createTempFromMold(loc, builder, inputVariable);
+            mlir::Value shape = hlfir::genShape(loc, builder, inputVariable);
+            llvm::SmallVector<mlir::Value> extents =
+                hlfir::getIndexExtents(loc, builder, shape);
+            hlfir::LoopNest loopNest = hlfir::genLoopNest(
+                loc, builder, extents, /*isUnordered=*/true,
+                flangomp::shouldUseWorkshareLowering(copyIn));
+            builder.setInsertionPointToStart(loopNest.body);
+            auto elem = hlfir::getElementAt(loc, builder, inputVariable,
+                                            loopNest.oneBasedIndices);
+            elem = hlfir::loadTrivialScalar(loc, builder, elem);
+            auto tempElem = hlfir::getElementAt(loc, builder, temp,
+                                                loopNest.oneBasedIndices);
+            builder.create<hlfir::AssignOp>(loc, elem, tempElem);
+            builder.setInsertionPointAfter(loopNest.outerOp);
+
+            mlir::Value result;
+            // Make sure the result is always a boxed array by boxing it
+            // ourselves if need be.
+            if (mlir::isa<fir::BaseBoxType>(temp.getType())) {
+              result = temp;
+            } else {
+              auto refTy =
+                  fir::ReferenceType::get(temp.getElementOrSequenceType());
+              auto refVal = builder.createConvert(loc, refTy, temp);
+              result =
+                  builder.create<fir::EmboxOp>(loc, resultAddrType, refVal);
+            }
+
+            builder.create<fir::ResultOp>(loc,
+                                          mlir::ValueRange{result, cleanup});
+          })
+          .getResults();
+
+  auto addr = results[0];
+  auto needsCleanup = results[1];
+
+  builder.setInsertionPoint(copyOut);
+  builder.genIfOp(loc, {}, needsCleanup, false).genThen([&] {
+    auto boxAddr = builder.create<fir::BoxAddrOp>(loc, addr);
+    auto heapType = fir::HeapType::get(fir::BoxValue(addr).getBaseTy());
+    auto heapVal = builder.createConvert(loc, heapType, boxAddr.getResult());
+    builder.create<fir::FreeMemOp>(loc, heapVal);
+  });
+  rewriter.eraseOp(copyOut);
+
+  auto tempBox = copyIn.getTempBox();
+
+  rewriter.replaceOp(copyIn, {addr, builder.genNot(loc, isContiguous)});
+
+  // The TempBox is only needed for flang-rt calls which we're no longer
+  // generating.
+  rewriter.eraseOp(tempBox.getDefiningOp());
+  return mlir::success();
+}
+
 class InlineHLFIRAssignPass
     : public hlfir::impl::InlineHLFIRAssignBase<InlineHLFIRAssignPass> {
 public:
@@ -140,6 +256,7 @@ class InlineHLFIRAssignPass
 
     mlir::RewritePatternSet patterns(context);
     patterns.insert<InlineHLFIRAssignConversion>(context);
+    patterns.insert<InlineCopyInConversion>(context);
 
     if (mlir::failed(mlir::applyPatternsGreedily(
             getOperation(), std::move(patterns), config))) {

>From 26d2e491acd54ef942af32c8361e97b24d190625 Mon Sep 17 00:00:00 2001
From: Kajetan Puchalski <kajetan.puchalski at arm.com>
Date: Wed, 7 May 2025 16:04:07 +0000
Subject: [PATCH 2/5] Add tests

Signed-off-by: Kajetan Puchalski <kajetan.puchalski at arm.com>
---
 flang/test/HLFIR/inline-hlfir-assign.fir | 144 +++++++++++++++++++++++
 1 file changed, 144 insertions(+)

diff --git a/flang/test/HLFIR/inline-hlfir-assign.fir b/flang/test/HLFIR/inline-hlfir-assign.fir
index f834e7971e3d5..df7681b9c5c16 100644
--- a/flang/test/HLFIR/inline-hlfir-assign.fir
+++ b/flang/test/HLFIR/inline-hlfir-assign.fir
@@ -353,3 +353,147 @@ func.func @_QPtest_expr_rhs(%arg0: !fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.
 // CHECK:           hlfir.destroy %[[VAL_10]] : !hlfir.expr<?x!fir.logical<4>>
 // CHECK:           return
 // CHECK:         }
+
+// Test inlining of hlfir.copy_in that does not require the array to be copied out
+func.func private @_test_inline_copy_in(%arg0: !fir.box<!fir.array<?x?x?xf64>> {fir.bindc_name = "x"}, %arg1: !fir.ref<i32> {fir.bindc_name = "i"}, %arg2: !fir.ref<i32> {fir.bindc_name = "j"}) {
+  %0 = fir.alloca !fir.box<!fir.heap<!fir.array<?xf64>>>
+  %1 = fir.dummy_scope : !fir.dscope
+  %2:2 = hlfir.declare %arg1 dummy_scope %1 {uniq_name = "_QFFsb2Ei"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+  %3:2 = hlfir.declare %arg2 dummy_scope %1 {uniq_name = "_QFFsb2Ej"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+  %4:2 = hlfir.declare %arg0 dummy_scope %1 {uniq_name = "_QFFsb2Ex"} : (!fir.box<!fir.array<?x?x?xf64>>, !fir.dscope) -> (!fir.box<!fir.array<?x?x?xf64>>, !fir.box<!fir.array<?x?x?xf64>>)
+  %5 = fir.load %2#0 : !fir.ref<i32>
+  %6 = fir.convert %5 : (i32) -> i64
+  %c1 = arith.constant 1 : index
+  %c1_0 = arith.constant 1 : index
+  %7:3 = fir.box_dims %4#1, %c1_0 : (!fir.box<!fir.array<?x?x?xf64>>, index) -> (index, index, index)
+  %c1_1 = arith.constant 1 : index
+  %c0 = arith.constant 0 : index
+  %8 = arith.subi %7#1, %c1 : index
+  %9 = arith.addi %8, %c1_1 : index
+  %10 = arith.divsi %9, %c1_1 : index
+  %11 = arith.cmpi sgt, %10, %c0 : index
+  %12 = arith.select %11, %10, %c0 : index
+  %13 = fir.load %3#0 : !fir.ref<i32>
+  %14 = fir.convert %13 : (i32) -> i64
+  %15 = fir.shape %12 : (index) -> !fir.shape<1>
+  %16 = hlfir.designate %4#0 (%6, %c1:%7#1:%c1_1, %14)  shape %15 : (!fir.box<!fir.array<?x?x?xf64>>, i64, index, index, index, i64, !fir.shape<1>) -> !fir.box<!fir.array<?xf64>>
+  %c100_i32 = arith.constant 100 : i32
+  %17:2 = hlfir.copy_in %16 to %0 : (!fir.box<!fir.array<?xf64>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>) -> (!fir.box<!fir.array<?xf64>>, i1)
+  %18 = fir.box_addr %17#0 : (!fir.box<!fir.array<?xf64>>) -> !fir.ref<!fir.array<?xf64>>
+  %19:3 = hlfir.associate %c100_i32 {adapt.valuebyref} : (i32) -> (!fir.ref<i32>, !fir.ref<i32>, i1)
+  fir.call @_QFPsb(%18, %19#0) fastmath<contract> : (!fir.ref<!fir.array<?xf64>>, !fir.ref<i32>) -> ()
+  hlfir.copy_out %0, %17#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>, i1) -> ()
+  hlfir.end_associate %19#1, %19#2 : !fir.ref<i32>, i1
+  return
+}
+
+// CHECK-LABEL:   func.func private @_test_inline_copy_in(
+// CHECK-SAME:                                          %[[VAL_0:.*]]: !fir.box<!fir.array<?x?x?xf64>> {fir.bindc_name = "x"},
+// CHECK-SAME:                                          %[[VAL_1:.*]]: !fir.ref<i32> {fir.bindc_name = "i"},
+// CHECK-SAME:                                          %[[VAL_2:.*]]: !fir.ref<i32> {fir.bindc_name = "j"}) {
+// CHECK:    %[[VAL_3:.*]] = arith.constant true
+// CHECK:    %[[VAL_4:.*]] = arith.constant false
+// CHECK:    %[[VAL_5:.*]] = arith.constant 100 : i32
+// CHECK:    %[[VAL_6:.*]] = arith.constant 0 : index
+// CHECK:    %[[VAL_7:.*]] = arith.constant 1 : index
+// CHECK:    %[[VAL_8:.*]] = fir.dummy_scope : !fir.dscope
+// CHECK:    %[[VAL_22:.*]]:2 = hlfir.declare %[[VAL_1:.*]] dummy_scope %[[VAL_8:.*]] {uniq_name = "_QFFsb2Ei"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+// CHECK:    %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_2:.*]] dummy_scope %[[VAL_8:.*]] {uniq_name = "_QFFsb2Ej"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+// CHECK:    %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_0:.*]] dummy_scope %[[VAL_8:.*]] {uniq_name = "_QFFsb2Ex"} : (!fir.box<!fir.array<?x?x?xf64>>, !fir.dscope) -> (!fir.box<!fir.array<?x?x?xf64>>, !fir.box<!fir.array<?x?x?xf64>>)
+// CHECK:    %[[VAL_11:.*]] = fir.load %[[VAL_22:.*]]#0 : !fir.ref<i32>
+// CHECK:    %[[VAL_12:.*]] = fir.convert %[[VAL_11:.*]] : (i32) -> i64
+// CHECK:    %[[VAL_13:.*]]:3 = fir.box_dims %[[VAL_10:.*]]#1, %[[VAL_7:.*]] : (!fir.box<!fir.array<?x?x?xf64>>, index) -> (index, index, index)
+// CHECK:    %[[VAL_14:.*]] = arith.cmpi sgt, %[[VAL_13:.*]]#1, %[[VAL_6:.*]] : index
+// CHECK:    %[[VAL_15:.*]] = arith.select %[[VAL_14:.*]], %[[VAL_13:.*]]#1, %[[VAL_6:.*]] : index
+// CHECK:    %[[VAL_16:.*]] = fir.load %[[VAL_9:.*]]#0 : !fir.ref<i32>
+// CHECK:    %[[VAL_17:.*]] = fir.convert %[[VAL_16:.*]] : (i32) -> i64
+// CHECK:    %[[VAL_18:.*]] = fir.shape %[[VAL_15:.*]] : (index) -> !fir.shape<1>
+// CHECK:    %[[VAL_19:.*]] = hlfir.designate %[[VAL_10:.*]]#0 (%[[VAL_12:.*]], %[[VAL_7:.*]]:%[[VAL_13:.*]]#1:%[[VAL_7:.*]], %[[VAL_17:.*]])  shape %[[VAL_18:.*]] : (!fir.box<!fir.array<?x?x?xf64>>, i64, index, index, index, i64, !fir.shape<1>) -> !fir.box<!fir.array<?xf64>>
+// CHECK:    %[[VAL_20:.*]] = fir.is_contiguous_box %[[VAL_19:.*]] whole : (!fir.box<!fir.array<?xf64>>) -> i1
+// CHECK:    %[[VAL_21:.*]]:2 = fir.if %[[VAL_20:.*]] -> (!fir.box<!fir.array<?xf64>>, i1) {
+// CHECK:      fir.result %[[VAL_19:.*]], %[[VAL_4:.*]] : !fir.box<!fir.array<?xf64>>, i1
+// CHECK:    } else {
+// CHECK:      %[[VAL_24:.*]] = fir.allocmem !fir.array<?xf64>, %[[VAL_15:.*]] {bindc_name = ".tmp", uniq_name = ""}
+// CHECK:      %[[VAL_25:.*]]:2 = hlfir.declare %[[VAL_24:.*]](%[[VAL_18:.*]]) {uniq_name = ".tmp"} : (!fir.heap<!fir.array<?xf64>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xf64>>, !fir.heap<!fir.array<?xf64>>)
+// CHECK:      fir.do_loop %arg3 = %[[VAL_7:.*]] to %[[VAL_15:.*]] step %[[VAL_7:.*]] unordered {
+// CHECK:        %[[VAL_26:.*]] = hlfir.designate %[[VAL_19:.*]] (%arg3)  : (!fir.box<!fir.array<?xf64>>, index) -> !fir.ref<f64>
+// CHECK:        %[[VAL_27:.*]] = fir.load %[[VAL_26:.*]] : !fir.ref<f64>
+// CHECK:        %[[VAL_28:.*]] = hlfir.designate %[[VAL_25:.*]]#0 (%arg3)  : (!fir.box<!fir.array<?xf64>>, index) -> !fir.ref<f64>
+// CHECK:        hlfir.assign %[[VAL_27:.*]] to %[[VAL_28:.*]] : f64, !fir.ref<f64>
+// CHECK:      }
+// CHECK:      fir.result %[[VAL_25:.*]]#0, %[[VAL_3:.*]] : !fir.box<!fir.array<?xf64>>, i1
+// CHECK:    }
+// CHECK:    %[[VAL_22:.*]] = fir.box_addr %[[VAL_21:.*]]#0 : (!fir.box<!fir.array<?xf64>>) -> !fir.ref<!fir.array<?xf64>>
+// CHECK:    %[[VAL_23:.*]]:3 = hlfir.associate %[[VAL_5:.*]] {adapt.valuebyref} : (i32) -> (!fir.ref<i32>, !fir.ref<i32>, i1)
+// CHECK:    fir.call @_QFPsb(%[[VAL_22:.*]], %[[VAL_23:.*]]#0) fastmath<contract> : (!fir.ref<!fir.array<?xf64>>, !fir.ref<i32>) -> ()
+// CHECK:    fir.if %[[VAL_21:.*]]#1 {
+// CHECK:      %[[VAL_24:.*]] = fir.box_addr %[[VAL_21:.*]]#0 : (!fir.box<!fir.array<?xf64>>) -> !fir.ref<!fir.array<?xf64>>
+// CHECK:      %[[VAL_25:.*]] = fir.convert %[[VAL_24:.*]] : (!fir.ref<!fir.array<?xf64>>) -> !fir.heap<!fir.array<?xf64>>
+// CHECK:      fir.freemem %[[VAL_25:.*]] : !fir.heap<!fir.array<?xf64>>
+// CHECK:    }
+// CHECK:    hlfir.end_associate %[[VAL_23:.*]]#1, %[[VAL_23:.*]]#2 : !fir.ref<i32>, i1
+// CHECK:    return
+// CHECK:  }
+
+// Test not inlining of hlfir.copy_in that requires the array to be copied out
+func.func private @_test_no_inline_copy_in(%arg0: !fir.box<!fir.array<?x?x?xf64>> {fir.bindc_name = "x"}, %arg1: !fir.ref<i32> {fir.bindc_name = "i"}, %arg2: !fir.ref<i32> {fir.bindc_name = "j"}) {
+  %0 = fir.alloca !fir.box<!fir.heap<!fir.array<?xf64>>>
+  %1 = fir.dummy_scope : !fir.dscope
+  %2:2 = hlfir.declare %arg1 dummy_scope %1 {uniq_name = "_QFFsb2Ei"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+  %3:2 = hlfir.declare %arg2 dummy_scope %1 {uniq_name = "_QFFsb2Ej"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+  %4:2 = hlfir.declare %arg0 dummy_scope %1 {uniq_name = "_QFFsb2Ex"} : (!fir.box<!fir.array<?x?x?xf64>>, !fir.dscope) -> (!fir.box<!fir.array<?x?x?xf64>>, !fir.box<!fir.array<?x?x?xf64>>)
+  %5 = fir.load %2#0 : !fir.ref<i32>
+  %6 = fir.convert %5 : (i32) -> i64
+  %c1 = arith.constant 1 : index
+  %c1_0 = arith.constant 1 : index
+  %7:3 = fir.box_dims %4#1, %c1_0 : (!fir.box<!fir.array<?x?x?xf64>>, index) -> (index, index, index)
+  %c1_1 = arith.constant 1 : index
+  %c0 = arith.constant 0 : index
+  %8 = arith.subi %7#1, %c1 : index
+  %9 = arith.addi %8, %c1_1 : index
+  %10 = arith.divsi %9, %c1_1 : index
+  %11 = arith.cmpi sgt, %10, %c0 : index
+  %12 = arith.select %11, %10, %c0 : index
+  %13 = fir.load %3#0 : !fir.ref<i32>
+  %14 = fir.convert %13 : (i32) -> i64
+  %15 = fir.shape %12 : (index) -> !fir.shape<1>
+  %16 = hlfir.designate %4#0 (%6, %c1:%7#1:%c1_1, %14)  shape %15 : (!fir.box<!fir.array<?x?x?xf64>>, i64, index, index, index, i64, !fir.shape<1>) -> !fir.box<!fir.array<?xf64>>
+  %c100_i32 = arith.constant 100 : i32
+  %17:2 = hlfir.copy_in %16 to %0 : (!fir.box<!fir.array<?xf64>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>) -> (!fir.box<!fir.array<?xf64>>, i1)
+  %18 = fir.box_addr %17#0 : (!fir.box<!fir.array<?xf64>>) -> !fir.ref<!fir.array<?xf64>>
+  %19:3 = hlfir.associate %c100_i32 {adapt.valuebyref} : (i32) -> (!fir.ref<i32>, !fir.ref<i32>, i1)
+  fir.call @_QFPsb(%18, %19#1) fastmath<contract> : (!fir.ref<!fir.array<?xf64>>, !fir.ref<i32>) -> ()
+  hlfir.copy_out %0, %17#1 to %16 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>, i1, !fir.box<!fir.array<?xf64>>) -> ()
+  hlfir.end_associate %19#1, %19#2 : !fir.ref<i32>, i1
+  return
+}
+
+// CHECK-LABEL:  func.func private @_test_no_inline_copy_in(
+// CHECK-SAME:                                             %[[VAL_0:.*]]: !fir.box<!fir.array<?x?x?xf64>> {fir.bindc_name = "x"},
+// CHECK-SAME:                                             %[[VAL_1:.*]]: !fir.ref<i32> {fir.bindc_name = "i"},
+// CHECK-SAME:                                             %[[VAL_2:.*]]: !fir.ref<i32> {fir.bindc_name = "j"}) {
+// CHECK:    %[[VAL_3:.*]] = arith.constant 100 : i32
+// CHECK:    %[[VAL_4:.*]] = arith.constant 0 : index
+// CHECK:    %[[VAL_5:.*]] = arith.constant 1 : index
+// CHECK:    %[[VAL_6:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xf64>>>
+// CHECK:    %[[VAL_7:.*]] = fir.dummy_scope : !fir.dscope
+// CHECK:    %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_1:.*]] dummy_scope %[[VAL_7:.*]] {uniq_name = "_QFFsb2Ei"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+// CHECK:    %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_2:.*]] dummy_scope %[[VAL_7:.*]] {uniq_name = "_QFFsb2Ej"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+// CHECK:    %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_0:.*]] dummy_scope %[[VAL_7:.*]] {uniq_name = "_QFFsb2Ex"} : (!fir.box<!fir.array<?x?x?xf64>>, !fir.dscope) -> (!fir.box<!fir.array<?x?x?xf64>>, !fir.box<!fir.array<?x?x?xf64>>)
+// CHECK:    %[[VAL_11:.*]] = fir.load %[[VAL_8:.*]]#0 : !fir.ref<i32>
+// CHECK:    %[[VAL_12:.*]] = fir.convert %[[VAL_11:.*]] : (i32) -> i64
+// CHECK:    %[[VAL_13:.*]]:3 = fir.box_dims %[[VAL_10:.*]]#1, %[[VAL_5:.*]] : (!fir.box<!fir.array<?x?x?xf64>>, index) -> (index, index, index)
+// CHECK:    %[[VAL_14:.*]] = arith.cmpi sgt, %[[VAL_13:.*]]#1, %[[VAL_4:.*]] : index
+// CHECK:    %[[VAL_15:.*]] = arith.select %[[VAL_14:.*]], %[[VAL_13:.*]]#1, %[[VAL_4:.*]] : index
+// CHECK:    %[[VAL_16:.*]] = fir.load %[[VAL_9:.*]]#0 : !fir.ref<i32>
+// CHECK:    %[[VAL_17:.*]] = fir.convert %[[VAL_16:.*]] : (i32) -> i64
+// CHECK:    %[[VAL_18:.*]] = fir.shape %[[VAL_15:.*]] : (index) -> !fir.shape<1>
+// CHECK:    %[[VAL_19:.*]] = hlfir.designate %[[VAL_10:.*]]#0 (%[[VAL_12:.*]], %[[VAL_5:.*]]:%[[VAL_13:.*]]#1:%[[VAL_5:.*]], %[[VAL_17:.*]])  shape %[[VAL_18:.*]] : (!fir.box<!fir.array<?x?x?xf64>>, i64, index, index, index, i64, !fir.shape<1>) -> !fir.box<!fir.array<?xf64>>
+// CHECK:    %[[VAL_20:.*]]:2 = hlfir.copy_in %[[VAL_19:.*]] to %[[VAL_6:.*]] : (!fir.box<!fir.array<?xf64>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>) -> (!fir.box<!fir.array<?xf64>>, i1)
+// CHECK:    %[[VAL_21:.*]] = fir.box_addr %[[VAL_20:.*]]#0 : (!fir.box<!fir.array<?xf64>>) -> !fir.ref<!fir.array<?xf64>>
+// CHECK:    %[[VAL_22:.*]]:3 = hlfir.associate %[[VAL_3:.*]] {adapt.valuebyref} : (i32) -> (!fir.ref<i32>, !fir.ref<i32>, i1)
+// CHECK:    fir.call @_QFPsb(%[[VAL_21:.*]], %[[VAL_22:.*]]#1) fastmath<contract> : (!fir.ref<!fir.array<?xf64>>, !fir.ref<i32>) -> ()
+// CHECK:    hlfir.copy_out %[[VAL_6:.*]], %[[VAL_20:.*]]#1 to %[[VAL_19:.*]] : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>, i1, !fir.box<!fir.array<?xf64>>) -> ()
+// CHECK:    hlfir.end_associate %[[VAL_22:.*]]#1, %[[VAL_22:.*]]#2 : !fir.ref<i32>, i1
+// CHECK:    return
+// CHECK:  }

>From e51db2f45ad82798937b57e2b5c08e7bcc66deed Mon Sep 17 00:00:00 2001
From: Kajetan Puchalski <kajetan.puchalski at arm.com>
Date: Thu, 8 May 2025 15:15:56 +0000
Subject: [PATCH 3/5] Address Tom's review comments

Signed-off-by: Kajetan Puchalski <kajetan.puchalski at arm.com>
---
 .../HLFIR/Transforms/InlineHLFIRAssign.cpp    | 41 +++++++++++--------
 1 file changed, 23 insertions(+), 18 deletions(-)

diff --git a/flang/lib/Optimizer/HLFIR/Transforms/InlineHLFIRAssign.cpp b/flang/lib/Optimizer/HLFIR/Transforms/InlineHLFIRAssign.cpp
index 38c684eaceb7d..dc545ece8adff 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/InlineHLFIRAssign.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/InlineHLFIRAssign.cpp
@@ -158,16 +158,16 @@ InlineCopyInConversion::matchAndRewrite(hlfir::CopyInOp copyIn,
                                        "CopyInOp's WasCopied has no uses");
   // The copy out should always be present, either to actually copy or just
   // deallocate memory.
-  auto *copyOut =
-      copyIn.getWasCopied().getUsers().begin().getCurrent().getUser();
+  auto copyOut = mlir::dyn_cast<hlfir::CopyOutOp>(
+      copyIn.getWasCopied().getUsers().begin().getCurrent().getUser());
 
-  if (!mlir::isa<hlfir::CopyOutOp>(copyOut))
+  if (!copyOut)
     return rewriter.notifyMatchFailure(copyIn,
                                        "CopyInOp has no direct CopyOut");
 
   // Only inline the copy_in when copy_out does not need to be done, i.e. in
   // case of intent(in).
-  if (::llvm::cast<hlfir::CopyOutOp>(copyOut).getVar())
+  if (copyOut.getVar())
     return rewriter.notifyMatchFailure(copyIn, "CopyIn needs a copy-out");
 
   inputVariable =
@@ -175,7 +175,7 @@ InlineCopyInConversion::matchAndRewrite(hlfir::CopyInOp copyIn,
   mlir::Type resultAddrType = copyIn.getCopiedIn().getType();
   mlir::Value isContiguous =
       builder.create<fir::IsContiguousBoxOp>(loc, inputVariable);
-  auto results =
+  mlir::Operation::result_range results =
       builder
           .genIfOp(loc, {resultAddrType, builder.getI1Type()}, isContiguous,
                    /*withElseRegion=*/true)
@@ -195,11 +195,11 @@ InlineCopyInConversion::matchAndRewrite(hlfir::CopyInOp copyIn,
                 loc, builder, extents, /*isUnordered=*/true,
                 flangomp::shouldUseWorkshareLowering(copyIn));
             builder.setInsertionPointToStart(loopNest.body);
-            auto elem = hlfir::getElementAt(loc, builder, inputVariable,
-                                            loopNest.oneBasedIndices);
+            hlfir::Entity elem = hlfir::getElementAt(
+                loc, builder, inputVariable, loopNest.oneBasedIndices);
             elem = hlfir::loadTrivialScalar(loc, builder, elem);
-            auto tempElem = hlfir::getElementAt(loc, builder, temp,
-                                                loopNest.oneBasedIndices);
+            hlfir::Entity tempElem = hlfir::getElementAt(
+                loc, builder, temp, loopNest.oneBasedIndices);
             builder.create<hlfir::AssignOp>(loc, elem, tempElem);
             builder.setInsertionPointAfter(loopNest.outerOp);
 
@@ -209,9 +209,9 @@ InlineCopyInConversion::matchAndRewrite(hlfir::CopyInOp copyIn,
             if (mlir::isa<fir::BaseBoxType>(temp.getType())) {
               result = temp;
             } else {
-              auto refTy =
+              fir::ReferenceType refTy =
                   fir::ReferenceType::get(temp.getElementOrSequenceType());
-              auto refVal = builder.createConvert(loc, refTy, temp);
+              mlir::Value refVal = builder.createConvert(loc, refTy, temp);
               result =
                   builder.create<fir::EmboxOp>(loc, resultAddrType, refVal);
             }
@@ -221,25 +221,30 @@ InlineCopyInConversion::matchAndRewrite(hlfir::CopyInOp copyIn,
           })
           .getResults();
 
-  auto addr = results[0];
-  auto needsCleanup = results[1];
+  mlir::OpResult addr = results[0];
+  mlir::OpResult needsCleanup = results[1];
 
   builder.setInsertionPoint(copyOut);
-  builder.genIfOp(loc, {}, needsCleanup, false).genThen([&] {
+  builder.genIfOp(loc, {}, needsCleanup, /*withElseRegion=*/false).genThen([&] {
     auto boxAddr = builder.create<fir::BoxAddrOp>(loc, addr);
-    auto heapType = fir::HeapType::get(fir::BoxValue(addr).getBaseTy());
-    auto heapVal = builder.createConvert(loc, heapType, boxAddr.getResult());
+    fir::HeapType heapType =
+        fir::HeapType::get(fir::BoxValue(addr).getBaseTy());
+    mlir::Value heapVal =
+        builder.createConvert(loc, heapType, boxAddr.getResult());
     builder.create<fir::FreeMemOp>(loc, heapVal);
   });
   rewriter.eraseOp(copyOut);
 
-  auto tempBox = copyIn.getTempBox();
+  mlir::Value tempBox = copyIn.getTempBox();
 
   rewriter.replaceOp(copyIn, {addr, builder.genNot(loc, isContiguous)});
 
   // The TempBox is only needed for flang-rt calls which we're no longer
-  // generating.
+  // generating. It should have no uses left at this stage.
+  if (!tempBox.getUses().empty())
+    return mlir::failure();
   rewriter.eraseOp(tempBox.getDefiningOp());
+
   return mlir::success();
 }
 

>From c1af5b1ead7a560112c9896b1cb2bac48b865df3 Mon Sep 17 00:00:00 2001
From: Kajetan Puchalski <kajetan.puchalski at arm.com>
Date: Thu, 22 May 2025 13:37:53 +0000
Subject: [PATCH 4/5] Separate copy_in inlining into its own pass, add flag

Signed-off-by: Kajetan Puchalski <kajetan.puchalski at arm.com>
---
 flang/include/flang/Optimizer/HLFIR/Passes.td |   4 +
 .../Optimizer/HLFIR/Transforms/CMakeLists.txt |   1 +
 .../HLFIR/Transforms/InlineHLFIRAssign.cpp    | 122 ------------
 .../HLFIR/Transforms/InlineHLFIRCopyIn.cpp    | 180 ++++++++++++++++++
 flang/lib/Optimizer/Passes/Pipelines.cpp      |   5 +
 flang/test/HLFIR/inline-hlfir-assign.fir      | 144 --------------
 flang/test/HLFIR/inline-hlfir-copy-in.fir     | 146 ++++++++++++++
 7 files changed, 336 insertions(+), 266 deletions(-)
 create mode 100644 flang/lib/Optimizer/HLFIR/Transforms/InlineHLFIRCopyIn.cpp
 create mode 100644 flang/test/HLFIR/inline-hlfir-copy-in.fir

diff --git a/flang/include/flang/Optimizer/HLFIR/Passes.td b/flang/include/flang/Optimizer/HLFIR/Passes.td
index d445140118073..04d7aec5fe489 100644
--- a/flang/include/flang/Optimizer/HLFIR/Passes.td
+++ b/flang/include/flang/Optimizer/HLFIR/Passes.td
@@ -69,6 +69,10 @@ def InlineHLFIRAssign : Pass<"inline-hlfir-assign"> {
   let summary = "Inline hlfir.assign operations";
 }
 
+def InlineHLFIRCopyIn : Pass<"inline-hlfir-copy-in"> {
+  let summary = "Inline hlfir.copy_in operations";
+}
+
 def PropagateFortranVariableAttributes : Pass<"propagate-fortran-attrs"> {
   let summary = "Propagate FortranVariableFlagsAttr attributes through HLFIR";
 }
diff --git a/flang/lib/Optimizer/HLFIR/Transforms/CMakeLists.txt b/flang/lib/Optimizer/HLFIR/Transforms/CMakeLists.txt
index d959428ebd203..cc74273d9c5d9 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/CMakeLists.txt
+++ b/flang/lib/Optimizer/HLFIR/Transforms/CMakeLists.txt
@@ -5,6 +5,7 @@ add_flang_library(HLFIRTransforms
   ConvertToFIR.cpp
   InlineElementals.cpp
   InlineHLFIRAssign.cpp
+  InlineHLFIRCopyIn.cpp
   LowerHLFIRIntrinsics.cpp
   LowerHLFIROrderedAssignments.cpp
   ScheduleOrderedAssignments.cpp
diff --git a/flang/lib/Optimizer/HLFIR/Transforms/InlineHLFIRAssign.cpp b/flang/lib/Optimizer/HLFIR/Transforms/InlineHLFIRAssign.cpp
index dc545ece8adff..6e209cce07ad4 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/InlineHLFIRAssign.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/InlineHLFIRAssign.cpp
@@ -13,7 +13,6 @@
 #include "flang/Optimizer/Analysis/AliasAnalysis.h"
 #include "flang/Optimizer/Builder/FIRBuilder.h"
 #include "flang/Optimizer/Builder/HLFIRTools.h"
-#include "flang/Optimizer/Dialect/FIRType.h"
 #include "flang/Optimizer/HLFIR/HLFIROps.h"
 #include "flang/Optimizer/HLFIR/Passes.h"
 #include "flang/Optimizer/OpenMP/Passes.h"
@@ -128,126 +127,6 @@ class InlineHLFIRAssignConversion
   }
 };
 
-class InlineCopyInConversion : public mlir::OpRewritePattern<hlfir::CopyInOp> {
-public:
-  using mlir::OpRewritePattern<hlfir::CopyInOp>::OpRewritePattern;
-
-  llvm::LogicalResult
-  matchAndRewrite(hlfir::CopyInOp copyIn,
-                  mlir::PatternRewriter &rewriter) const override;
-};
-
-llvm::LogicalResult
-InlineCopyInConversion::matchAndRewrite(hlfir::CopyInOp copyIn,
-                                        mlir::PatternRewriter &rewriter) const {
-  fir::FirOpBuilder builder(rewriter, copyIn.getOperation());
-  mlir::Location loc = copyIn.getLoc();
-  hlfir::Entity inputVariable{copyIn.getVar()};
-  if (!fir::isa_trivial(inputVariable.getFortranElementType()))
-    return rewriter.notifyMatchFailure(copyIn,
-                                       "CopyInOp's data type is not trivial");
-
-  if (fir::isPointerType(inputVariable.getType()))
-    return rewriter.notifyMatchFailure(
-        copyIn, "CopyInOp's input variable is a pointer");
-
-  // There should be exactly one user of WasCopied - the corresponding
-  // CopyOutOp.
-  if (copyIn.getWasCopied().getUses().empty())
-    return rewriter.notifyMatchFailure(copyIn,
-                                       "CopyInOp's WasCopied has no uses");
-  // The copy out should always be present, either to actually copy or just
-  // deallocate memory.
-  auto copyOut = mlir::dyn_cast<hlfir::CopyOutOp>(
-      copyIn.getWasCopied().getUsers().begin().getCurrent().getUser());
-
-  if (!copyOut)
-    return rewriter.notifyMatchFailure(copyIn,
-                                       "CopyInOp has no direct CopyOut");
-
-  // Only inline the copy_in when copy_out does not need to be done, i.e. in
-  // case of intent(in).
-  if (copyOut.getVar())
-    return rewriter.notifyMatchFailure(copyIn, "CopyIn needs a copy-out");
-
-  inputVariable =
-      hlfir::derefPointersAndAllocatables(loc, builder, inputVariable);
-  mlir::Type resultAddrType = copyIn.getCopiedIn().getType();
-  mlir::Value isContiguous =
-      builder.create<fir::IsContiguousBoxOp>(loc, inputVariable);
-  mlir::Operation::result_range results =
-      builder
-          .genIfOp(loc, {resultAddrType, builder.getI1Type()}, isContiguous,
-                   /*withElseRegion=*/true)
-          .genThen([&]() {
-            mlir::Value falseVal = builder.create<mlir::arith::ConstantOp>(
-                loc, builder.getI1Type(), builder.getBoolAttr(false));
-            builder.create<fir::ResultOp>(
-                loc, mlir::ValueRange{inputVariable, falseVal});
-          })
-          .genElse([&] {
-            auto [temp, cleanup] =
-                hlfir::createTempFromMold(loc, builder, inputVariable);
-            mlir::Value shape = hlfir::genShape(loc, builder, inputVariable);
-            llvm::SmallVector<mlir::Value> extents =
-                hlfir::getIndexExtents(loc, builder, shape);
-            hlfir::LoopNest loopNest = hlfir::genLoopNest(
-                loc, builder, extents, /*isUnordered=*/true,
-                flangomp::shouldUseWorkshareLowering(copyIn));
-            builder.setInsertionPointToStart(loopNest.body);
-            hlfir::Entity elem = hlfir::getElementAt(
-                loc, builder, inputVariable, loopNest.oneBasedIndices);
-            elem = hlfir::loadTrivialScalar(loc, builder, elem);
-            hlfir::Entity tempElem = hlfir::getElementAt(
-                loc, builder, temp, loopNest.oneBasedIndices);
-            builder.create<hlfir::AssignOp>(loc, elem, tempElem);
-            builder.setInsertionPointAfter(loopNest.outerOp);
-
-            mlir::Value result;
-            // Make sure the result is always a boxed array by boxing it
-            // ourselves if need be.
-            if (mlir::isa<fir::BaseBoxType>(temp.getType())) {
-              result = temp;
-            } else {
-              fir::ReferenceType refTy =
-                  fir::ReferenceType::get(temp.getElementOrSequenceType());
-              mlir::Value refVal = builder.createConvert(loc, refTy, temp);
-              result =
-                  builder.create<fir::EmboxOp>(loc, resultAddrType, refVal);
-            }
-
-            builder.create<fir::ResultOp>(loc,
-                                          mlir::ValueRange{result, cleanup});
-          })
-          .getResults();
-
-  mlir::OpResult addr = results[0];
-  mlir::OpResult needsCleanup = results[1];
-
-  builder.setInsertionPoint(copyOut);
-  builder.genIfOp(loc, {}, needsCleanup, /*withElseRegion=*/false).genThen([&] {
-    auto boxAddr = builder.create<fir::BoxAddrOp>(loc, addr);
-    fir::HeapType heapType =
-        fir::HeapType::get(fir::BoxValue(addr).getBaseTy());
-    mlir::Value heapVal =
-        builder.createConvert(loc, heapType, boxAddr.getResult());
-    builder.create<fir::FreeMemOp>(loc, heapVal);
-  });
-  rewriter.eraseOp(copyOut);
-
-  mlir::Value tempBox = copyIn.getTempBox();
-
-  rewriter.replaceOp(copyIn, {addr, builder.genNot(loc, isContiguous)});
-
-  // The TempBox is only needed for flang-rt calls which we're no longer
-  // generating. It should have no uses left at this stage.
-  if (!tempBox.getUses().empty())
-    return mlir::failure();
-  rewriter.eraseOp(tempBox.getDefiningOp());
-
-  return mlir::success();
-}
-
 class InlineHLFIRAssignPass
     : public hlfir::impl::InlineHLFIRAssignBase<InlineHLFIRAssignPass> {
 public:
@@ -261,7 +140,6 @@ class InlineHLFIRAssignPass
 
     mlir::RewritePatternSet patterns(context);
     patterns.insert<InlineHLFIRAssignConversion>(context);
-    patterns.insert<InlineCopyInConversion>(context);
 
     if (mlir::failed(mlir::applyPatternsGreedily(
             getOperation(), std::move(patterns), config))) {
diff --git a/flang/lib/Optimizer/HLFIR/Transforms/InlineHLFIRCopyIn.cpp b/flang/lib/Optimizer/HLFIR/Transforms/InlineHLFIRCopyIn.cpp
new file mode 100644
index 0000000000000..1e2aecaf535a0
--- /dev/null
+++ b/flang/lib/Optimizer/HLFIR/Transforms/InlineHLFIRCopyIn.cpp
@@ -0,0 +1,180 @@
+//===- InlineHLFIRCopyIn.cpp - Inline hlfir.copy_in ops -------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// Transform hlfir.copy_in array operations into loop nests that perform
+// element-by-element assignments. For simplicity, inlining is done only for
+// trivial data types, when the copy_in does not require a copy_out, and when
+// the input array is not behind a pointer. This may change in the future.
+//===----------------------------------------------------------------------===//
+
+#include "flang/Optimizer/Builder/FIRBuilder.h"
+#include "flang/Optimizer/Builder/HLFIRTools.h"
+#include "flang/Optimizer/Dialect/FIRType.h"
+#include "flang/Optimizer/HLFIR/HLFIROps.h"
+#include "flang/Optimizer/OpenMP/Passes.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/Support/LLVM.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+
+namespace hlfir {
+#define GEN_PASS_DEF_INLINEHLFIRCOPYIN
+#include "flang/Optimizer/HLFIR/Passes.h.inc"
+} // namespace hlfir
+
+#define DEBUG_TYPE "inline-hlfir-copy-in"
+
+static llvm::cl::opt<bool> noInlineHLFIRCopyIn(
+    "no-inline-hlfir-copy-in",
+    llvm::cl::desc("Do not inline hlfir.copy_in operations"),
+    llvm::cl::init(false));
+
+namespace {
+class InlineCopyInConversion : public mlir::OpRewritePattern<hlfir::CopyInOp> {
+public:
+  using mlir::OpRewritePattern<hlfir::CopyInOp>::OpRewritePattern;
+
+  llvm::LogicalResult
+  matchAndRewrite(hlfir::CopyInOp copyIn,
+                  mlir::PatternRewriter &rewriter) const override;
+};
+
+llvm::LogicalResult
+InlineCopyInConversion::matchAndRewrite(hlfir::CopyInOp copyIn,
+                                        mlir::PatternRewriter &rewriter) const {
+  fir::FirOpBuilder builder(rewriter, copyIn.getOperation());
+  mlir::Location loc = copyIn.getLoc();
+  hlfir::Entity inputVariable{copyIn.getVar()};
+  if (!fir::isa_trivial(inputVariable.getFortranElementType()))
+    return rewriter.notifyMatchFailure(copyIn,
+                                       "CopyInOp's data type is not trivial");
+
+  if (fir::isPointerType(inputVariable.getType()))
+    return rewriter.notifyMatchFailure(
+        copyIn, "CopyInOp's input variable is a pointer");
+
+  // There should be exactly one user of WasCopied - the corresponding
+  // CopyOutOp.
+  if (copyIn.getWasCopied().getUses().empty())
+    return rewriter.notifyMatchFailure(copyIn,
+                                       "CopyInOp's WasCopied has no uses");
+  // The copy out should always be present, either to actually copy or just
+  // deallocate memory.
+  auto copyOut = mlir::dyn_cast<hlfir::CopyOutOp>(
+      copyIn.getWasCopied().getUsers().begin().getCurrent().getUser());
+
+  if (!copyOut)
+    return rewriter.notifyMatchFailure(copyIn,
+                                       "CopyInOp has no direct CopyOut");
+
+  // Only inline the copy_in when copy_out does not need to be done, i.e. in
+  // case of intent(in).
+  if (copyOut.getVar())
+    return rewriter.notifyMatchFailure(copyIn, "CopyIn needs a copy-out");
+
+  inputVariable =
+      hlfir::derefPointersAndAllocatables(loc, builder, inputVariable);
+  mlir::Type resultAddrType = copyIn.getCopiedIn().getType();
+  mlir::Value isContiguous =
+      builder.create<fir::IsContiguousBoxOp>(loc, inputVariable);
+  mlir::Operation::result_range results =
+      builder
+          .genIfOp(loc, {resultAddrType, builder.getI1Type()}, isContiguous,
+                   /*withElseRegion=*/true)
+          .genThen([&]() {
+            mlir::Value falseVal = builder.create<mlir::arith::ConstantOp>(
+                loc, builder.getI1Type(), builder.getBoolAttr(false));
+            builder.create<fir::ResultOp>(
+                loc, mlir::ValueRange{inputVariable, falseVal});
+          })
+          .genElse([&] {
+            auto [temp, cleanup] =
+                hlfir::createTempFromMold(loc, builder, inputVariable);
+            mlir::Value shape = hlfir::genShape(loc, builder, inputVariable);
+            llvm::SmallVector<mlir::Value> extents =
+                hlfir::getIndexExtents(loc, builder, shape);
+            hlfir::LoopNest loopNest = hlfir::genLoopNest(
+                loc, builder, extents, /*isUnordered=*/true,
+                flangomp::shouldUseWorkshareLowering(copyIn));
+            builder.setInsertionPointToStart(loopNest.body);
+            hlfir::Entity elem = hlfir::getElementAt(
+                loc, builder, inputVariable, loopNest.oneBasedIndices);
+            elem = hlfir::loadTrivialScalar(loc, builder, elem);
+            hlfir::Entity tempElem = hlfir::getElementAt(
+                loc, builder, temp, loopNest.oneBasedIndices);
+            builder.create<hlfir::AssignOp>(loc, elem, tempElem);
+            builder.setInsertionPointAfter(loopNest.outerOp);
+
+            mlir::Value result;
+            // Make sure the result is always a boxed array by boxing it
+            // ourselves if need be.
+            if (mlir::isa<fir::BaseBoxType>(temp.getType())) {
+              result = temp;
+            } else {
+              fir::ReferenceType refTy =
+                  fir::ReferenceType::get(temp.getElementOrSequenceType());
+              mlir::Value refVal = builder.createConvert(loc, refTy, temp);
+              result =
+                  builder.create<fir::EmboxOp>(loc, resultAddrType, refVal);
+            }
+
+            builder.create<fir::ResultOp>(loc,
+                                          mlir::ValueRange{result, cleanup});
+          })
+          .getResults();
+
+  mlir::OpResult addr = results[0];
+  mlir::OpResult needsCleanup = results[1];
+
+  builder.setInsertionPoint(copyOut);
+  builder.genIfOp(loc, {}, needsCleanup, /*withElseRegion=*/false).genThen([&] {
+    auto boxAddr = builder.create<fir::BoxAddrOp>(loc, addr);
+    fir::HeapType heapType =
+        fir::HeapType::get(fir::BoxValue(addr).getBaseTy());
+    mlir::Value heapVal =
+        builder.createConvert(loc, heapType, boxAddr.getResult());
+    builder.create<fir::FreeMemOp>(loc, heapVal);
+  });
+  rewriter.eraseOp(copyOut);
+
+  mlir::Value tempBox = copyIn.getTempBox();
+
+  rewriter.replaceOp(copyIn, {addr, builder.genNot(loc, isContiguous)});
+
+  // The TempBox is only needed for flang-rt calls which we're no longer
+  // generating. It should have no uses left at this stage.
+  if (!tempBox.getUses().empty())
+    return mlir::failure();
+  rewriter.eraseOp(tempBox.getDefiningOp());
+
+  return mlir::success();
+}
+
+class InlineHLFIRCopyInPass
+    : public hlfir::impl::InlineHLFIRCopyInBase<InlineHLFIRCopyInPass> {
+public:
+  void runOnOperation() override {
+    mlir::MLIRContext *context = &getContext();
+
+    mlir::GreedyRewriteConfig config;
+    // Prevent the pattern driver from merging blocks.
+    config.setRegionSimplificationLevel(
+        mlir::GreedySimplifyRegionLevel::Disabled);
+
+    mlir::RewritePatternSet patterns(context);
+    if (!noInlineHLFIRCopyIn) {
+      patterns.insert<InlineCopyInConversion>(context);
+    }
+
+    if (mlir::failed(mlir::applyPatternsGreedily(
+            getOperation(), std::move(patterns), config))) {
+      mlir::emitError(getOperation()->getLoc(),
+                      "failure in hlfir.copy_in inlining");
+      signalPassFailure();
+    }
+  }
+};
+} // namespace
diff --git a/flang/lib/Optimizer/Passes/Pipelines.cpp b/flang/lib/Optimizer/Passes/Pipelines.cpp
index 77751908e35be..1779623fddc5a 100644
--- a/flang/lib/Optimizer/Passes/Pipelines.cpp
+++ b/flang/lib/Optimizer/Passes/Pipelines.cpp
@@ -255,6 +255,11 @@ void createHLFIRToFIRPassPipeline(mlir::PassManager &pm, bool enableOpenMP,
         pm, hlfir::createOptimizedBufferization);
     addNestedPassToAllTopLevelOperations<PassConstructor>(
         pm, hlfir::createInlineHLFIRAssign);
+
+    if (optLevel == llvm::OptimizationLevel::O3) {
+      addNestedPassToAllTopLevelOperations<PassConstructor>(
+          pm, hlfir::createInlineHLFIRCopyIn);
+    }
   }
   pm.addPass(hlfir::createLowerHLFIROrderedAssignments());
   pm.addPass(hlfir::createLowerHLFIRIntrinsics());
diff --git a/flang/test/HLFIR/inline-hlfir-assign.fir b/flang/test/HLFIR/inline-hlfir-assign.fir
index df7681b9c5c16..f834e7971e3d5 100644
--- a/flang/test/HLFIR/inline-hlfir-assign.fir
+++ b/flang/test/HLFIR/inline-hlfir-assign.fir
@@ -353,147 +353,3 @@ func.func @_QPtest_expr_rhs(%arg0: !fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.
 // CHECK:           hlfir.destroy %[[VAL_10]] : !hlfir.expr<?x!fir.logical<4>>
 // CHECK:           return
 // CHECK:         }
-
-// Test inlining of hlfir.copy_in that does not require the array to be copied out
-func.func private @_test_inline_copy_in(%arg0: !fir.box<!fir.array<?x?x?xf64>> {fir.bindc_name = "x"}, %arg1: !fir.ref<i32> {fir.bindc_name = "i"}, %arg2: !fir.ref<i32> {fir.bindc_name = "j"}) {
-  %0 = fir.alloca !fir.box<!fir.heap<!fir.array<?xf64>>>
-  %1 = fir.dummy_scope : !fir.dscope
-  %2:2 = hlfir.declare %arg1 dummy_scope %1 {uniq_name = "_QFFsb2Ei"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
-  %3:2 = hlfir.declare %arg2 dummy_scope %1 {uniq_name = "_QFFsb2Ej"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
-  %4:2 = hlfir.declare %arg0 dummy_scope %1 {uniq_name = "_QFFsb2Ex"} : (!fir.box<!fir.array<?x?x?xf64>>, !fir.dscope) -> (!fir.box<!fir.array<?x?x?xf64>>, !fir.box<!fir.array<?x?x?xf64>>)
-  %5 = fir.load %2#0 : !fir.ref<i32>
-  %6 = fir.convert %5 : (i32) -> i64
-  %c1 = arith.constant 1 : index
-  %c1_0 = arith.constant 1 : index
-  %7:3 = fir.box_dims %4#1, %c1_0 : (!fir.box<!fir.array<?x?x?xf64>>, index) -> (index, index, index)
-  %c1_1 = arith.constant 1 : index
-  %c0 = arith.constant 0 : index
-  %8 = arith.subi %7#1, %c1 : index
-  %9 = arith.addi %8, %c1_1 : index
-  %10 = arith.divsi %9, %c1_1 : index
-  %11 = arith.cmpi sgt, %10, %c0 : index
-  %12 = arith.select %11, %10, %c0 : index
-  %13 = fir.load %3#0 : !fir.ref<i32>
-  %14 = fir.convert %13 : (i32) -> i64
-  %15 = fir.shape %12 : (index) -> !fir.shape<1>
-  %16 = hlfir.designate %4#0 (%6, %c1:%7#1:%c1_1, %14)  shape %15 : (!fir.box<!fir.array<?x?x?xf64>>, i64, index, index, index, i64, !fir.shape<1>) -> !fir.box<!fir.array<?xf64>>
-  %c100_i32 = arith.constant 100 : i32
-  %17:2 = hlfir.copy_in %16 to %0 : (!fir.box<!fir.array<?xf64>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>) -> (!fir.box<!fir.array<?xf64>>, i1)
-  %18 = fir.box_addr %17#0 : (!fir.box<!fir.array<?xf64>>) -> !fir.ref<!fir.array<?xf64>>
-  %19:3 = hlfir.associate %c100_i32 {adapt.valuebyref} : (i32) -> (!fir.ref<i32>, !fir.ref<i32>, i1)
-  fir.call @_QFPsb(%18, %19#0) fastmath<contract> : (!fir.ref<!fir.array<?xf64>>, !fir.ref<i32>) -> ()
-  hlfir.copy_out %0, %17#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>, i1) -> ()
-  hlfir.end_associate %19#1, %19#2 : !fir.ref<i32>, i1
-  return
-}
-
-// CHECK-LABEL:   func.func private @_test_inline_copy_in(
-// CHECK-SAME:                                          %[[VAL_0:.*]]: !fir.box<!fir.array<?x?x?xf64>> {fir.bindc_name = "x"},
-// CHECK-SAME:                                          %[[VAL_1:.*]]: !fir.ref<i32> {fir.bindc_name = "i"},
-// CHECK-SAME:                                          %[[VAL_2:.*]]: !fir.ref<i32> {fir.bindc_name = "j"}) {
-// CHECK:    %[[VAL_3:.*]] = arith.constant true
-// CHECK:    %[[VAL_4:.*]] = arith.constant false
-// CHECK:    %[[VAL_5:.*]] = arith.constant 100 : i32
-// CHECK:    %[[VAL_6:.*]] = arith.constant 0 : index
-// CHECK:    %[[VAL_7:.*]] = arith.constant 1 : index
-// CHECK:    %[[VAL_8:.*]] = fir.dummy_scope : !fir.dscope
-// CHECK:    %[[VAL_22:.*]]:2 = hlfir.declare %[[VAL_1:.*]] dummy_scope %[[VAL_8:.*]] {uniq_name = "_QFFsb2Ei"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
-// CHECK:    %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_2:.*]] dummy_scope %[[VAL_8:.*]] {uniq_name = "_QFFsb2Ej"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
-// CHECK:    %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_0:.*]] dummy_scope %[[VAL_8:.*]] {uniq_name = "_QFFsb2Ex"} : (!fir.box<!fir.array<?x?x?xf64>>, !fir.dscope) -> (!fir.box<!fir.array<?x?x?xf64>>, !fir.box<!fir.array<?x?x?xf64>>)
-// CHECK:    %[[VAL_11:.*]] = fir.load %[[VAL_22:.*]]#0 : !fir.ref<i32>
-// CHECK:    %[[VAL_12:.*]] = fir.convert %[[VAL_11:.*]] : (i32) -> i64
-// CHECK:    %[[VAL_13:.*]]:3 = fir.box_dims %[[VAL_10:.*]]#1, %[[VAL_7:.*]] : (!fir.box<!fir.array<?x?x?xf64>>, index) -> (index, index, index)
-// CHECK:    %[[VAL_14:.*]] = arith.cmpi sgt, %[[VAL_13:.*]]#1, %[[VAL_6:.*]] : index
-// CHECK:    %[[VAL_15:.*]] = arith.select %[[VAL_14:.*]], %[[VAL_13:.*]]#1, %[[VAL_6:.*]] : index
-// CHECK:    %[[VAL_16:.*]] = fir.load %[[VAL_9:.*]]#0 : !fir.ref<i32>
-// CHECK:    %[[VAL_17:.*]] = fir.convert %[[VAL_16:.*]] : (i32) -> i64
-// CHECK:    %[[VAL_18:.*]] = fir.shape %[[VAL_15:.*]] : (index) -> !fir.shape<1>
-// CHECK:    %[[VAL_19:.*]] = hlfir.designate %[[VAL_10:.*]]#0 (%[[VAL_12:.*]], %[[VAL_7:.*]]:%[[VAL_13:.*]]#1:%[[VAL_7:.*]], %[[VAL_17:.*]])  shape %[[VAL_18:.*]] : (!fir.box<!fir.array<?x?x?xf64>>, i64, index, index, index, i64, !fir.shape<1>) -> !fir.box<!fir.array<?xf64>>
-// CHECK:    %[[VAL_20:.*]] = fir.is_contiguous_box %[[VAL_19:.*]] whole : (!fir.box<!fir.array<?xf64>>) -> i1
-// CHECK:    %[[VAL_21:.*]]:2 = fir.if %[[VAL_20:.*]] -> (!fir.box<!fir.array<?xf64>>, i1) {
-// CHECK:      fir.result %[[VAL_19:.*]], %[[VAL_4:.*]] : !fir.box<!fir.array<?xf64>>, i1
-// CHECK:    } else {
-// CHECK:      %[[VAL_24:.*]] = fir.allocmem !fir.array<?xf64>, %[[VAL_15:.*]] {bindc_name = ".tmp", uniq_name = ""}
-// CHECK:      %[[VAL_25:.*]]:2 = hlfir.declare %[[VAL_24:.*]](%[[VAL_18:.*]]) {uniq_name = ".tmp"} : (!fir.heap<!fir.array<?xf64>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xf64>>, !fir.heap<!fir.array<?xf64>>)
-// CHECK:      fir.do_loop %arg3 = %[[VAL_7:.*]] to %[[VAL_15:.*]] step %[[VAL_7:.*]] unordered {
-// CHECK:        %[[VAL_26:.*]] = hlfir.designate %[[VAL_19:.*]] (%arg3)  : (!fir.box<!fir.array<?xf64>>, index) -> !fir.ref<f64>
-// CHECK:        %[[VAL_27:.*]] = fir.load %[[VAL_26:.*]] : !fir.ref<f64>
-// CHECK:        %[[VAL_28:.*]] = hlfir.designate %[[VAL_25:.*]]#0 (%arg3)  : (!fir.box<!fir.array<?xf64>>, index) -> !fir.ref<f64>
-// CHECK:        hlfir.assign %[[VAL_27:.*]] to %[[VAL_28:.*]] : f64, !fir.ref<f64>
-// CHECK:      }
-// CHECK:      fir.result %[[VAL_25:.*]]#0, %[[VAL_3:.*]] : !fir.box<!fir.array<?xf64>>, i1
-// CHECK:    }
-// CHECK:    %[[VAL_22:.*]] = fir.box_addr %[[VAL_21:.*]]#0 : (!fir.box<!fir.array<?xf64>>) -> !fir.ref<!fir.array<?xf64>>
-// CHECK:    %[[VAL_23:.*]]:3 = hlfir.associate %[[VAL_5:.*]] {adapt.valuebyref} : (i32) -> (!fir.ref<i32>, !fir.ref<i32>, i1)
-// CHECK:    fir.call @_QFPsb(%[[VAL_22:.*]], %[[VAL_23:.*]]#0) fastmath<contract> : (!fir.ref<!fir.array<?xf64>>, !fir.ref<i32>) -> ()
-// CHECK:    fir.if %[[VAL_21:.*]]#1 {
-// CHECK:      %[[VAL_24:.*]] = fir.box_addr %[[VAL_21:.*]]#0 : (!fir.box<!fir.array<?xf64>>) -> !fir.ref<!fir.array<?xf64>>
-// CHECK:      %[[VAL_25:.*]] = fir.convert %[[VAL_24:.*]] : (!fir.ref<!fir.array<?xf64>>) -> !fir.heap<!fir.array<?xf64>>
-// CHECK:      fir.freemem %[[VAL_25:.*]] : !fir.heap<!fir.array<?xf64>>
-// CHECK:    }
-// CHECK:    hlfir.end_associate %[[VAL_23:.*]]#1, %[[VAL_23:.*]]#2 : !fir.ref<i32>, i1
-// CHECK:    return
-// CHECK:  }
-
-// Test not inlining of hlfir.copy_in that requires the array to be copied out
-func.func private @_test_no_inline_copy_in(%arg0: !fir.box<!fir.array<?x?x?xf64>> {fir.bindc_name = "x"}, %arg1: !fir.ref<i32> {fir.bindc_name = "i"}, %arg2: !fir.ref<i32> {fir.bindc_name = "j"}) {
-  %0 = fir.alloca !fir.box<!fir.heap<!fir.array<?xf64>>>
-  %1 = fir.dummy_scope : !fir.dscope
-  %2:2 = hlfir.declare %arg1 dummy_scope %1 {uniq_name = "_QFFsb2Ei"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
-  %3:2 = hlfir.declare %arg2 dummy_scope %1 {uniq_name = "_QFFsb2Ej"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
-  %4:2 = hlfir.declare %arg0 dummy_scope %1 {uniq_name = "_QFFsb2Ex"} : (!fir.box<!fir.array<?x?x?xf64>>, !fir.dscope) -> (!fir.box<!fir.array<?x?x?xf64>>, !fir.box<!fir.array<?x?x?xf64>>)
-  %5 = fir.load %2#0 : !fir.ref<i32>
-  %6 = fir.convert %5 : (i32) -> i64
-  %c1 = arith.constant 1 : index
-  %c1_0 = arith.constant 1 : index
-  %7:3 = fir.box_dims %4#1, %c1_0 : (!fir.box<!fir.array<?x?x?xf64>>, index) -> (index, index, index)
-  %c1_1 = arith.constant 1 : index
-  %c0 = arith.constant 0 : index
-  %8 = arith.subi %7#1, %c1 : index
-  %9 = arith.addi %8, %c1_1 : index
-  %10 = arith.divsi %9, %c1_1 : index
-  %11 = arith.cmpi sgt, %10, %c0 : index
-  %12 = arith.select %11, %10, %c0 : index
-  %13 = fir.load %3#0 : !fir.ref<i32>
-  %14 = fir.convert %13 : (i32) -> i64
-  %15 = fir.shape %12 : (index) -> !fir.shape<1>
-  %16 = hlfir.designate %4#0 (%6, %c1:%7#1:%c1_1, %14)  shape %15 : (!fir.box<!fir.array<?x?x?xf64>>, i64, index, index, index, i64, !fir.shape<1>) -> !fir.box<!fir.array<?xf64>>
-  %c100_i32 = arith.constant 100 : i32
-  %17:2 = hlfir.copy_in %16 to %0 : (!fir.box<!fir.array<?xf64>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>) -> (!fir.box<!fir.array<?xf64>>, i1)
-  %18 = fir.box_addr %17#0 : (!fir.box<!fir.array<?xf64>>) -> !fir.ref<!fir.array<?xf64>>
-  %19:3 = hlfir.associate %c100_i32 {adapt.valuebyref} : (i32) -> (!fir.ref<i32>, !fir.ref<i32>, i1)
-  fir.call @_QFPsb(%18, %19#1) fastmath<contract> : (!fir.ref<!fir.array<?xf64>>, !fir.ref<i32>) -> ()
-  hlfir.copy_out %0, %17#1 to %16 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>, i1, !fir.box<!fir.array<?xf64>>) -> ()
-  hlfir.end_associate %19#1, %19#2 : !fir.ref<i32>, i1
-  return
-}
-
-// CHECK-LABEL:  func.func private @_test_no_inline_copy_in(
-// CHECK-SAME:                                             %[[VAL_0:.*]]: !fir.box<!fir.array<?x?x?xf64>> {fir.bindc_name = "x"},
-// CHECK-SAME:                                             %[[VAL_1:.*]]: !fir.ref<i32> {fir.bindc_name = "i"},
-// CHECK-SAME:                                             %[[VAL_2:.*]]: !fir.ref<i32> {fir.bindc_name = "j"}) {
-// CHECK:    %[[VAL_3:.*]] = arith.constant 100 : i32
-// CHECK:    %[[VAL_4:.*]] = arith.constant 0 : index
-// CHECK:    %[[VAL_5:.*]] = arith.constant 1 : index
-// CHECK:    %[[VAL_6:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xf64>>>
-// CHECK:    %[[VAL_7:.*]] = fir.dummy_scope : !fir.dscope
-// CHECK:    %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_1:.*]] dummy_scope %[[VAL_7:.*]] {uniq_name = "_QFFsb2Ei"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
-// CHECK:    %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_2:.*]] dummy_scope %[[VAL_7:.*]] {uniq_name = "_QFFsb2Ej"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
-// CHECK:    %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_0:.*]] dummy_scope %[[VAL_7:.*]] {uniq_name = "_QFFsb2Ex"} : (!fir.box<!fir.array<?x?x?xf64>>, !fir.dscope) -> (!fir.box<!fir.array<?x?x?xf64>>, !fir.box<!fir.array<?x?x?xf64>>)
-// CHECK:    %[[VAL_11:.*]] = fir.load %[[VAL_8:.*]]#0 : !fir.ref<i32>
-// CHECK:    %[[VAL_12:.*]] = fir.convert %[[VAL_11:.*]] : (i32) -> i64
-// CHECK:    %[[VAL_13:.*]]:3 = fir.box_dims %[[VAL_10:.*]]#1, %[[VAL_5:.*]] : (!fir.box<!fir.array<?x?x?xf64>>, index) -> (index, index, index)
-// CHECK:    %[[VAL_14:.*]] = arith.cmpi sgt, %[[VAL_13:.*]]#1, %[[VAL_4:.*]] : index
-// CHECK:    %[[VAL_15:.*]] = arith.select %[[VAL_14:.*]], %[[VAL_13:.*]]#1, %[[VAL_4:.*]] : index
-// CHECK:    %[[VAL_16:.*]] = fir.load %[[VAL_9:.*]]#0 : !fir.ref<i32>
-// CHECK:    %[[VAL_17:.*]] = fir.convert %[[VAL_16:.*]] : (i32) -> i64
-// CHECK:    %[[VAL_18:.*]] = fir.shape %[[VAL_15:.*]] : (index) -> !fir.shape<1>
-// CHECK:    %[[VAL_19:.*]] = hlfir.designate %[[VAL_10:.*]]#0 (%[[VAL_12:.*]], %[[VAL_5:.*]]:%[[VAL_13:.*]]#1:%[[VAL_5:.*]], %[[VAL_17:.*]])  shape %[[VAL_18:.*]] : (!fir.box<!fir.array<?x?x?xf64>>, i64, index, index, index, i64, !fir.shape<1>) -> !fir.box<!fir.array<?xf64>>
-// CHECK:    %[[VAL_20:.*]]:2 = hlfir.copy_in %[[VAL_19:.*]] to %[[VAL_6:.*]] : (!fir.box<!fir.array<?xf64>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>) -> (!fir.box<!fir.array<?xf64>>, i1)
-// CHECK:    %[[VAL_21:.*]] = fir.box_addr %[[VAL_20:.*]]#0 : (!fir.box<!fir.array<?xf64>>) -> !fir.ref<!fir.array<?xf64>>
-// CHECK:    %[[VAL_22:.*]]:3 = hlfir.associate %[[VAL_3:.*]] {adapt.valuebyref} : (i32) -> (!fir.ref<i32>, !fir.ref<i32>, i1)
-// CHECK:    fir.call @_QFPsb(%[[VAL_21:.*]], %[[VAL_22:.*]]#1) fastmath<contract> : (!fir.ref<!fir.array<?xf64>>, !fir.ref<i32>) -> ()
-// CHECK:    hlfir.copy_out %[[VAL_6:.*]], %[[VAL_20:.*]]#1 to %[[VAL_19:.*]] : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>, i1, !fir.box<!fir.array<?xf64>>) -> ()
-// CHECK:    hlfir.end_associate %[[VAL_22:.*]]#1, %[[VAL_22:.*]]#2 : !fir.ref<i32>, i1
-// CHECK:    return
-// CHECK:  }
diff --git a/flang/test/HLFIR/inline-hlfir-copy-in.fir b/flang/test/HLFIR/inline-hlfir-copy-in.fir
new file mode 100644
index 0000000000000..7140e93f19979
--- /dev/null
+++ b/flang/test/HLFIR/inline-hlfir-copy-in.fir
@@ -0,0 +1,146 @@
+// Test inlining of hlfir.copy_in
+// RUN: fir-opt --inline-hlfir-copy-in %s | FileCheck %s
+
+// Test inlining of hlfir.copy_in that does not require the array to be copied out
+func.func private @_test_inline_copy_in(%arg0: !fir.box<!fir.array<?x?x?xf64>> {fir.bindc_name = "x"}, %arg1: !fir.ref<i32> {fir.bindc_name = "i"}, %arg2: !fir.ref<i32> {fir.bindc_name = "j"}) {
+  %0 = fir.alloca !fir.box<!fir.heap<!fir.array<?xf64>>>
+  %1 = fir.dummy_scope : !fir.dscope
+  %2:2 = hlfir.declare %arg1 dummy_scope %1 {uniq_name = "_QFFsb2Ei"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+  %3:2 = hlfir.declare %arg2 dummy_scope %1 {uniq_name = "_QFFsb2Ej"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+  %4:2 = hlfir.declare %arg0 dummy_scope %1 {uniq_name = "_QFFsb2Ex"} : (!fir.box<!fir.array<?x?x?xf64>>, !fir.dscope) -> (!fir.box<!fir.array<?x?x?xf64>>, !fir.box<!fir.array<?x?x?xf64>>)
+  %5 = fir.load %2#0 : !fir.ref<i32>
+  %6 = fir.convert %5 : (i32) -> i64
+  %c1 = arith.constant 1 : index
+  %c1_0 = arith.constant 1 : index
+  %7:3 = fir.box_dims %4#1, %c1_0 : (!fir.box<!fir.array<?x?x?xf64>>, index) -> (index, index, index)
+  %c1_1 = arith.constant 1 : index
+  %c0 = arith.constant 0 : index
+  %8 = arith.subi %7#1, %c1 : index
+  %9 = arith.addi %8, %c1_1 : index
+  %10 = arith.divsi %9, %c1_1 : index
+  %11 = arith.cmpi sgt, %10, %c0 : index
+  %12 = arith.select %11, %10, %c0 : index
+  %13 = fir.load %3#0 : !fir.ref<i32>
+  %14 = fir.convert %13 : (i32) -> i64
+  %15 = fir.shape %12 : (index) -> !fir.shape<1>
+  %16 = hlfir.designate %4#0 (%6, %c1:%7#1:%c1_1, %14)  shape %15 : (!fir.box<!fir.array<?x?x?xf64>>, i64, index, index, index, i64, !fir.shape<1>) -> !fir.box<!fir.array<?xf64>>
+  %c100_i32 = arith.constant 100 : i32
+  %17:2 = hlfir.copy_in %16 to %0 : (!fir.box<!fir.array<?xf64>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>) -> (!fir.box<!fir.array<?xf64>>, i1)
+  %18 = fir.box_addr %17#0 : (!fir.box<!fir.array<?xf64>>) -> !fir.ref<!fir.array<?xf64>>
+  %19:3 = hlfir.associate %c100_i32 {adapt.valuebyref} : (i32) -> (!fir.ref<i32>, !fir.ref<i32>, i1)
+  fir.call @_QFPsb(%18, %19#0) fastmath<contract> : (!fir.ref<!fir.array<?xf64>>, !fir.ref<i32>) -> ()
+  hlfir.copy_out %0, %17#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>, i1) -> ()
+  hlfir.end_associate %19#1, %19#2 : !fir.ref<i32>, i1
+  return
+}
+
+// CHECK-LABEL:   func.func private @_test_inline_copy_in(
+// CHECK-SAME:                                          %[[VAL_0:.*]]: !fir.box<!fir.array<?x?x?xf64>> {fir.bindc_name = "x"},
+// CHECK-SAME:                                          %[[VAL_1:.*]]: !fir.ref<i32> {fir.bindc_name = "i"},
+// CHECK-SAME:                                          %[[VAL_2:.*]]: !fir.ref<i32> {fir.bindc_name = "j"}) {
+// CHECK:    %[[VAL_3:.*]] = arith.constant true
+// CHECK:    %[[VAL_4:.*]] = arith.constant false
+// CHECK:    %[[VAL_5:.*]] = arith.constant 100 : i32
+// CHECK:    %[[VAL_6:.*]] = arith.constant 0 : index
+// CHECK:    %[[VAL_7:.*]] = arith.constant 1 : index
+// CHECK:    %[[VAL_8:.*]] = fir.dummy_scope : !fir.dscope
+// CHECK:    %[[VAL_22:.*]]:2 = hlfir.declare %[[VAL_1:.*]] dummy_scope %[[VAL_8:.*]] {uniq_name = "_QFFsb2Ei"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+// CHECK:    %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_2:.*]] dummy_scope %[[VAL_8:.*]] {uniq_name = "_QFFsb2Ej"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+// CHECK:    %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_0:.*]] dummy_scope %[[VAL_8:.*]] {uniq_name = "_QFFsb2Ex"} : (!fir.box<!fir.array<?x?x?xf64>>, !fir.dscope) -> (!fir.box<!fir.array<?x?x?xf64>>, !fir.box<!fir.array<?x?x?xf64>>)
+// CHECK:    %[[VAL_11:.*]] = fir.load %[[VAL_22:.*]]#0 : !fir.ref<i32>
+// CHECK:    %[[VAL_12:.*]] = fir.convert %[[VAL_11:.*]] : (i32) -> i64
+// CHECK:    %[[VAL_13:.*]]:3 = fir.box_dims %[[VAL_10:.*]]#1, %[[VAL_7:.*]] : (!fir.box<!fir.array<?x?x?xf64>>, index) -> (index, index, index)
+// CHECK:    %[[VAL_14:.*]] = arith.cmpi sgt, %[[VAL_13:.*]]#1, %[[VAL_6:.*]] : index
+// CHECK:    %[[VAL_15:.*]] = arith.select %[[VAL_14:.*]], %[[VAL_13:.*]]#1, %[[VAL_6:.*]] : index
+// CHECK:    %[[VAL_16:.*]] = fir.load %[[VAL_9:.*]]#0 : !fir.ref<i32>
+// CHECK:    %[[VAL_17:.*]] = fir.convert %[[VAL_16:.*]] : (i32) -> i64
+// CHECK:    %[[VAL_18:.*]] = fir.shape %[[VAL_15:.*]] : (index) -> !fir.shape<1>
+// CHECK:    %[[VAL_19:.*]] = hlfir.designate %[[VAL_10:.*]]#0 (%[[VAL_12:.*]], %[[VAL_7:.*]]:%[[VAL_13:.*]]#1:%[[VAL_7:.*]], %[[VAL_17:.*]])  shape %[[VAL_18:.*]] : (!fir.box<!fir.array<?x?x?xf64>>, i64, index, index, index, i64, !fir.shape<1>) -> !fir.box<!fir.array<?xf64>>
+// CHECK:    %[[VAL_20:.*]] = fir.is_contiguous_box %[[VAL_19:.*]] whole : (!fir.box<!fir.array<?xf64>>) -> i1
+// CHECK:    %[[VAL_21:.*]]:2 = fir.if %[[VAL_20:.*]] -> (!fir.box<!fir.array<?xf64>>, i1) {
+// CHECK:      fir.result %[[VAL_19:.*]], %[[VAL_4:.*]] : !fir.box<!fir.array<?xf64>>, i1
+// CHECK:    } else {
+// CHECK:      %[[VAL_24:.*]] = fir.allocmem !fir.array<?xf64>, %[[VAL_15:.*]] {bindc_name = ".tmp", uniq_name = ""}
+// CHECK:      %[[VAL_25:.*]]:2 = hlfir.declare %[[VAL_24:.*]](%[[VAL_18:.*]]) {uniq_name = ".tmp"} : (!fir.heap<!fir.array<?xf64>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xf64>>, !fir.heap<!fir.array<?xf64>>)
+// CHECK:      fir.do_loop %arg3 = %[[VAL_7:.*]] to %[[VAL_15:.*]] step %[[VAL_7:.*]] unordered {
+// CHECK:        %[[VAL_26:.*]] = hlfir.designate %[[VAL_19:.*]] (%arg3)  : (!fir.box<!fir.array<?xf64>>, index) -> !fir.ref<f64>
+// CHECK:        %[[VAL_27:.*]] = fir.load %[[VAL_26:.*]] : !fir.ref<f64>
+// CHECK:        %[[VAL_28:.*]] = hlfir.designate %[[VAL_25:.*]]#0 (%arg3)  : (!fir.box<!fir.array<?xf64>>, index) -> !fir.ref<f64>
+// CHECK:        hlfir.assign %[[VAL_27:.*]] to %[[VAL_28:.*]] : f64, !fir.ref<f64>
+// CHECK:      }
+// CHECK:      fir.result %[[VAL_25:.*]]#0, %[[VAL_3:.*]] : !fir.box<!fir.array<?xf64>>, i1
+// CHECK:    }
+// CHECK:    %[[VAL_22:.*]] = fir.box_addr %[[VAL_21:.*]]#0 : (!fir.box<!fir.array<?xf64>>) -> !fir.ref<!fir.array<?xf64>>
+// CHECK:    %[[VAL_23:.*]]:3 = hlfir.associate %[[VAL_5:.*]] {adapt.valuebyref} : (i32) -> (!fir.ref<i32>, !fir.ref<i32>, i1)
+// CHECK:    fir.call @_QFPsb(%[[VAL_22:.*]], %[[VAL_23:.*]]#0) fastmath<contract> : (!fir.ref<!fir.array<?xf64>>, !fir.ref<i32>) -> ()
+// CHECK:    fir.if %[[VAL_21:.*]]#1 {
+// CHECK:      %[[VAL_24:.*]] = fir.box_addr %[[VAL_21:.*]]#0 : (!fir.box<!fir.array<?xf64>>) -> !fir.ref<!fir.array<?xf64>>
+// CHECK:      %[[VAL_25:.*]] = fir.convert %[[VAL_24:.*]] : (!fir.ref<!fir.array<?xf64>>) -> !fir.heap<!fir.array<?xf64>>
+// CHECK:      fir.freemem %[[VAL_25:.*]] : !fir.heap<!fir.array<?xf64>>
+// CHECK:    }
+// CHECK:    hlfir.end_associate %[[VAL_23:.*]]#1, %[[VAL_23:.*]]#2 : !fir.ref<i32>, i1
+// CHECK:    return
+// CHECK:  }
+
+// Test that hlfir.copy_in is not inlined when the array must be copied out
+func.func private @_test_no_inline_copy_in(%arg0: !fir.box<!fir.array<?x?x?xf64>> {fir.bindc_name = "x"}, %arg1: !fir.ref<i32> {fir.bindc_name = "i"}, %arg2: !fir.ref<i32> {fir.bindc_name = "j"}) {
+  %0 = fir.alloca !fir.box<!fir.heap<!fir.array<?xf64>>>
+  %1 = fir.dummy_scope : !fir.dscope
+  %2:2 = hlfir.declare %arg1 dummy_scope %1 {uniq_name = "_QFFsb2Ei"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+  %3:2 = hlfir.declare %arg2 dummy_scope %1 {uniq_name = "_QFFsb2Ej"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+  %4:2 = hlfir.declare %arg0 dummy_scope %1 {uniq_name = "_QFFsb2Ex"} : (!fir.box<!fir.array<?x?x?xf64>>, !fir.dscope) -> (!fir.box<!fir.array<?x?x?xf64>>, !fir.box<!fir.array<?x?x?xf64>>)
+  %5 = fir.load %2#0 : !fir.ref<i32>
+  %6 = fir.convert %5 : (i32) -> i64
+  %c1 = arith.constant 1 : index
+  %c1_0 = arith.constant 1 : index
+  %7:3 = fir.box_dims %4#1, %c1_0 : (!fir.box<!fir.array<?x?x?xf64>>, index) -> (index, index, index)
+  %c1_1 = arith.constant 1 : index
+  %c0 = arith.constant 0 : index
+  %8 = arith.subi %7#1, %c1 : index
+  %9 = arith.addi %8, %c1_1 : index
+  %10 = arith.divsi %9, %c1_1 : index
+  %11 = arith.cmpi sgt, %10, %c0 : index
+  %12 = arith.select %11, %10, %c0 : index
+  %13 = fir.load %3#0 : !fir.ref<i32>
+  %14 = fir.convert %13 : (i32) -> i64
+  %15 = fir.shape %12 : (index) -> !fir.shape<1>
+  %16 = hlfir.designate %4#0 (%6, %c1:%7#1:%c1_1, %14)  shape %15 : (!fir.box<!fir.array<?x?x?xf64>>, i64, index, index, index, i64, !fir.shape<1>) -> !fir.box<!fir.array<?xf64>>
+  %c100_i32 = arith.constant 100 : i32
+  %17:2 = hlfir.copy_in %16 to %0 : (!fir.box<!fir.array<?xf64>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>) -> (!fir.box<!fir.array<?xf64>>, i1)
+  %18 = fir.box_addr %17#0 : (!fir.box<!fir.array<?xf64>>) -> !fir.ref<!fir.array<?xf64>>
+  %19:3 = hlfir.associate %c100_i32 {adapt.valuebyref} : (i32) -> (!fir.ref<i32>, !fir.ref<i32>, i1)
+  fir.call @_QFPsb(%18, %19#1) fastmath<contract> : (!fir.ref<!fir.array<?xf64>>, !fir.ref<i32>) -> ()
+  hlfir.copy_out %0, %17#1 to %16 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>, i1, !fir.box<!fir.array<?xf64>>) -> ()
+  hlfir.end_associate %19#1, %19#2 : !fir.ref<i32>, i1
+  return
+}
+
+// CHECK-LABEL:  func.func private @_test_no_inline_copy_in(
+// CHECK-SAME:                                             %[[VAL_0:.*]]: !fir.box<!fir.array<?x?x?xf64>> {fir.bindc_name = "x"},
+// CHECK-SAME:                                             %[[VAL_1:.*]]: !fir.ref<i32> {fir.bindc_name = "i"},
+// CHECK-SAME:                                             %[[VAL_2:.*]]: !fir.ref<i32> {fir.bindc_name = "j"}) {
+// CHECK:    %[[VAL_3:.*]] = arith.constant 100 : i32
+// CHECK:    %[[VAL_4:.*]] = arith.constant 0 : index
+// CHECK:    %[[VAL_5:.*]] = arith.constant 1 : index
+// CHECK:    %[[VAL_6:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xf64>>>
+// CHECK:    %[[VAL_7:.*]] = fir.dummy_scope : !fir.dscope
+// CHECK:    %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_1:.*]] dummy_scope %[[VAL_7:.*]] {uniq_name = "_QFFsb2Ei"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+// CHECK:    %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_2:.*]] dummy_scope %[[VAL_7:.*]] {uniq_name = "_QFFsb2Ej"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+// CHECK:    %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_0:.*]] dummy_scope %[[VAL_7:.*]] {uniq_name = "_QFFsb2Ex"} : (!fir.box<!fir.array<?x?x?xf64>>, !fir.dscope) -> (!fir.box<!fir.array<?x?x?xf64>>, !fir.box<!fir.array<?x?x?xf64>>)
+// CHECK:    %[[VAL_11:.*]] = fir.load %[[VAL_8:.*]]#0 : !fir.ref<i32>
+// CHECK:    %[[VAL_12:.*]] = fir.convert %[[VAL_11:.*]] : (i32) -> i64
+// CHECK:    %[[VAL_13:.*]]:3 = fir.box_dims %[[VAL_10:.*]]#1, %[[VAL_5:.*]] : (!fir.box<!fir.array<?x?x?xf64>>, index) -> (index, index, index)
+// CHECK:    %[[VAL_14:.*]] = arith.cmpi sgt, %[[VAL_13:.*]]#1, %[[VAL_4:.*]] : index
+// CHECK:    %[[VAL_15:.*]] = arith.select %[[VAL_14:.*]], %[[VAL_13:.*]]#1, %[[VAL_4:.*]] : index
+// CHECK:    %[[VAL_16:.*]] = fir.load %[[VAL_9:.*]]#0 : !fir.ref<i32>
+// CHECK:    %[[VAL_17:.*]] = fir.convert %[[VAL_16:.*]] : (i32) -> i64
+// CHECK:    %[[VAL_18:.*]] = fir.shape %[[VAL_15:.*]] : (index) -> !fir.shape<1>
+// CHECK:    %[[VAL_19:.*]] = hlfir.designate %[[VAL_10:.*]]#0 (%[[VAL_12:.*]], %[[VAL_5:.*]]:%[[VAL_13:.*]]#1:%[[VAL_5:.*]], %[[VAL_17:.*]])  shape %[[VAL_18:.*]] : (!fir.box<!fir.array<?x?x?xf64>>, i64, index, index, index, i64, !fir.shape<1>) -> !fir.box<!fir.array<?xf64>>
+// CHECK:    %[[VAL_20:.*]]:2 = hlfir.copy_in %[[VAL_19:.*]] to %[[VAL_6:.*]] : (!fir.box<!fir.array<?xf64>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>) -> (!fir.box<!fir.array<?xf64>>, i1)
+// CHECK:    %[[VAL_21:.*]] = fir.box_addr %[[VAL_20:.*]]#0 : (!fir.box<!fir.array<?xf64>>) -> !fir.ref<!fir.array<?xf64>>
+// CHECK:    %[[VAL_22:.*]]:3 = hlfir.associate %[[VAL_3:.*]] {adapt.valuebyref} : (i32) -> (!fir.ref<i32>, !fir.ref<i32>, i1)
+// CHECK:    fir.call @_QFPsb(%[[VAL_21:.*]], %[[VAL_22:.*]]#1) fastmath<contract> : (!fir.ref<!fir.array<?xf64>>, !fir.ref<i32>) -> ()
+// CHECK:    hlfir.copy_out %[[VAL_6:.*]], %[[VAL_20:.*]]#1 to %[[VAL_19:.*]] : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>, i1, !fir.box<!fir.array<?xf64>>) -> ()
+// CHECK:    hlfir.end_associate %[[VAL_22:.*]]#1, %[[VAL_22:.*]]#2 : !fir.ref<i32>, i1
+// CHECK:    return
+// CHECK:  }

>From bed8a6af87cff1a404255c92da06d8e7ca07e908 Mon Sep 17 00:00:00 2001
From: Kajetan Puchalski <kajetan.puchalski at arm.com>
Date: Wed, 28 May 2025 13:44:53 +0000
Subject: [PATCH 5/5] Support arrays behind a pointer, add metadata to disable
 vectorizing

---
 .../flang/Optimizer/Builder/HLFIRTools.h      |  8 ++-
 flang/lib/Optimizer/Builder/HLFIRTools.cpp    | 13 +++-
 .../HLFIR/Transforms/InlineHLFIRCopyIn.cpp    | 66 ++++++++++---------
 flang/test/HLFIR/inline-hlfir-copy-in.fir     |  6 +-
 4 files changed, 55 insertions(+), 38 deletions(-)

diff --git a/flang/include/flang/Optimizer/Builder/HLFIRTools.h b/flang/include/flang/Optimizer/Builder/HLFIRTools.h
index ed00cec04dc39..2cbad6e268a38 100644
--- a/flang/include/flang/Optimizer/Builder/HLFIRTools.h
+++ b/flang/include/flang/Optimizer/Builder/HLFIRTools.h
@@ -374,12 +374,14 @@ struct LoopNest {
 /// loop constructs currently.
 LoopNest genLoopNest(mlir::Location loc, fir::FirOpBuilder &builder,
                      mlir::ValueRange extents, bool isUnordered = false,
-                     bool emitWorkshareLoop = false);
+                     bool emitWorkshareLoop = false,
+                     bool couldVectorize = true);
 inline LoopNest genLoopNest(mlir::Location loc, fir::FirOpBuilder &builder,
                             mlir::Value shape, bool isUnordered = false,
-                            bool emitWorkshareLoop = false) {
+                            bool emitWorkshareLoop = false,
+                            bool couldVectorize = true) {
   return genLoopNest(loc, builder, getIndexExtents(loc, builder, shape),
-                     isUnordered, emitWorkshareLoop);
+                     isUnordered, emitWorkshareLoop, couldVectorize);
 }
 
 /// The type of a callback that generates the body of a reduction
diff --git a/flang/lib/Optimizer/Builder/HLFIRTools.cpp b/flang/lib/Optimizer/Builder/HLFIRTools.cpp
index f24dc2caeedfc..14aae5d7118a1 100644
--- a/flang/lib/Optimizer/Builder/HLFIRTools.cpp
+++ b/flang/lib/Optimizer/Builder/HLFIRTools.cpp
@@ -21,6 +21,7 @@
 #include "mlir/IR/IRMapping.h"
 #include "mlir/Support/LLVM.h"
 #include "llvm/ADT/TypeSwitch.h"
+#include <mlir/Dialect/LLVMIR/LLVMAttrs.h>
 #include <mlir/Dialect/OpenMP/OpenMPDialect.h>
 #include <optional>
 
@@ -932,7 +933,8 @@ mlir::Value hlfir::inlineElementalOp(
 hlfir::LoopNest hlfir::genLoopNest(mlir::Location loc,
                                    fir::FirOpBuilder &builder,
                                    mlir::ValueRange extents, bool isUnordered,
-                                   bool emitWorkshareLoop) {
+                                   bool emitWorkshareLoop,
+                                   bool couldVectorize) {
   emitWorkshareLoop = emitWorkshareLoop && isUnordered;
   hlfir::LoopNest loopNest;
   assert(!extents.empty() && "must have at least one extent");
@@ -967,6 +969,15 @@ hlfir::LoopNest hlfir::genLoopNest(mlir::Location loc,
       auto ub = builder.createConvert(loc, indexType, extent);
       auto doLoop =
           builder.create<fir::DoLoopOp>(loc, one, ub, one, isUnordered);
+      if (!couldVectorize) {
+        mlir::LLVM::LoopVectorizeAttr va{mlir::LLVM::LoopVectorizeAttr::get(
+            builder.getContext(),
+            /*disable=*/builder.getBoolAttr(true), {}, {}, {}, {}, {}, {})};
+        mlir::LLVM::LoopAnnotationAttr la = mlir::LLVM::LoopAnnotationAttr::get(
+            builder.getContext(), {}, /*vectorize=*/va, {}, /*unroll*/ {},
+            /*unroll_and_jam*/ {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {});
+        doLoop.setLoopAnnotationAttr(la);
+      }
       loopNest.body = doLoop.getBody();
       builder.setInsertionPointToStart(loopNest.body);
       // Reverse the indices so they are in column-major order.
diff --git a/flang/lib/Optimizer/HLFIR/Transforms/InlineHLFIRCopyIn.cpp b/flang/lib/Optimizer/HLFIR/Transforms/InlineHLFIRCopyIn.cpp
index 1e2aecaf535a0..d1cbe3241c07b 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/InlineHLFIRCopyIn.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/InlineHLFIRCopyIn.cpp
@@ -52,19 +52,15 @@ InlineCopyInConversion::matchAndRewrite(hlfir::CopyInOp copyIn,
     return rewriter.notifyMatchFailure(copyIn,
                                        "CopyInOp's data type is not trivial");
 
-  if (fir::isPointerType(inputVariable.getType()))
-    return rewriter.notifyMatchFailure(
-        copyIn, "CopyInOp's input variable is a pointer");
-
   // There should be exactly one user of WasCopied - the corresponding
   // CopyOutOp.
-  if (copyIn.getWasCopied().getUses().empty())
-    return rewriter.notifyMatchFailure(copyIn,
-                                       "CopyInOp's WasCopied has no uses");
+  if (!copyIn.getWasCopied().hasOneUse())
+    return rewriter.notifyMatchFailure(
+        copyIn, "CopyInOp's WasCopied has no single user");
   // The copy out should always be present, either to actually copy or just
   // deallocate memory.
   auto copyOut = mlir::dyn_cast<hlfir::CopyOutOp>(
-      copyIn.getWasCopied().getUsers().begin().getCurrent().getUser());
+      copyIn.getWasCopied().user_begin().getCurrent().getUser());
 
   if (!copyOut)
     return rewriter.notifyMatchFailure(copyIn,
@@ -77,28 +73,45 @@ InlineCopyInConversion::matchAndRewrite(hlfir::CopyInOp copyIn,
 
   inputVariable =
       hlfir::derefPointersAndAllocatables(loc, builder, inputVariable);
-  mlir::Type resultAddrType = copyIn.getCopiedIn().getType();
+  mlir::Type sequenceType =
+      hlfir::getFortranElementOrSequenceType(inputVariable.getType());
+  fir::BoxType resultBoxType = fir::BoxType::get(sequenceType);
   mlir::Value isContiguous =
       builder.create<fir::IsContiguousBoxOp>(loc, inputVariable);
   mlir::Operation::result_range results =
       builder
-          .genIfOp(loc, {resultAddrType, builder.getI1Type()}, isContiguous,
+          .genIfOp(loc, {resultBoxType, builder.getI1Type()}, isContiguous,
                    /*withElseRegion=*/true)
           .genThen([&]() {
-            mlir::Value falseVal = builder.create<mlir::arith::ConstantOp>(
-                loc, builder.getI1Type(), builder.getBoolAttr(false));
+            mlir::Value result = inputVariable;
+            if (fir::isPointerType(inputVariable.getType())) {
+              auto boxAddr = builder.create<fir::BoxAddrOp>(loc, inputVariable);
+              fir::ReferenceType refTy = fir::ReferenceType::get(sequenceType);
+              mlir::Value refVal = builder.createConvert(loc, refTy, boxAddr);
+              mlir::Value shape = hlfir::genShape(loc, builder, inputVariable);
+              result = builder.create<fir::EmboxOp>(loc, resultBoxType, refVal,
+                                                    shape);
+            }
             builder.create<fir::ResultOp>(
-                loc, mlir::ValueRange{inputVariable, falseVal});
+                loc, mlir::ValueRange{result, builder.createBool(loc, false)});
           })
           .genElse([&] {
-            auto [temp, cleanup] =
-                hlfir::createTempFromMold(loc, builder, inputVariable);
             mlir::Value shape = hlfir::genShape(loc, builder, inputVariable);
             llvm::SmallVector<mlir::Value> extents =
                 hlfir::getIndexExtents(loc, builder, shape);
-            hlfir::LoopNest loopNest = hlfir::genLoopNest(
-                loc, builder, extents, /*isUnordered=*/true,
-                flangomp::shouldUseWorkshareLowering(copyIn));
+            llvm::StringRef tmpName{".tmp.copy_in"};
+            llvm::SmallVector<mlir::Value> lenParams;
+            mlir::Value alloc = builder.createHeapTemporary(
+                loc, sequenceType, tmpName, extents, lenParams);
+
+            auto declareOp = builder.create<hlfir::DeclareOp>(
+                loc, alloc, tmpName, shape, lenParams,
+                /*dummy_scope=*/nullptr);
+            hlfir::Entity temp{declareOp.getBase()};
+            hlfir::LoopNest loopNest =
+                hlfir::genLoopNest(loc, builder, extents, /*isUnordered=*/true,
+                                   flangomp::shouldUseWorkshareLowering(copyIn),
+                                   /*couldVectorize=*/false);
             builder.setInsertionPointToStart(loopNest.body);
             hlfir::Entity elem = hlfir::getElementAt(
                 loc, builder, inputVariable, loopNest.oneBasedIndices);
@@ -117,12 +130,12 @@ InlineCopyInConversion::matchAndRewrite(hlfir::CopyInOp copyIn,
               fir::ReferenceType refTy =
                   fir::ReferenceType::get(temp.getElementOrSequenceType());
               mlir::Value refVal = builder.createConvert(loc, refTy, temp);
-              result =
-                  builder.create<fir::EmboxOp>(loc, resultAddrType, refVal);
+              result = builder.create<fir::EmboxOp>(loc, resultBoxType, refVal,
+                                                    shape);
             }
 
-            builder.create<fir::ResultOp>(loc,
-                                          mlir::ValueRange{result, cleanup});
+            builder.create<fir::ResultOp>(
+                loc, mlir::ValueRange{result, builder.createBool(loc, true)});
           })
           .getResults();
 
@@ -140,16 +153,7 @@ InlineCopyInConversion::matchAndRewrite(hlfir::CopyInOp copyIn,
   });
   rewriter.eraseOp(copyOut);
 
-  mlir::Value tempBox = copyIn.getTempBox();
-
   rewriter.replaceOp(copyIn, {addr, builder.genNot(loc, isContiguous)});
-
-  // The TempBox is only needed for flang-rt calls which we're no longer
-  // generating. It should have no uses left at this stage.
-  if (!tempBox.getUses().empty())
-    return mlir::failure();
-  rewriter.eraseOp(tempBox.getDefiningOp());
-
   return mlir::success();
 }
 
diff --git a/flang/test/HLFIR/inline-hlfir-copy-in.fir b/flang/test/HLFIR/inline-hlfir-copy-in.fir
index 7140e93f19979..7a5b6e591f7c7 100644
--- a/flang/test/HLFIR/inline-hlfir-copy-in.fir
+++ b/flang/test/HLFIR/inline-hlfir-copy-in.fir
@@ -60,9 +60,9 @@ func.func private @_test_inline_copy_in(%arg0: !fir.box<!fir.array<?x?x?xf64>> {
 // CHECK:    %[[VAL_21:.*]]:2 = fir.if %[[VAL_20:.*]] -> (!fir.box<!fir.array<?xf64>>, i1) {
 // CHECK:      fir.result %[[VAL_19:.*]], %[[VAL_4:.*]] : !fir.box<!fir.array<?xf64>>, i1
 // CHECK:    } else {
-// CHECK:      %[[VAL_24:.*]] = fir.allocmem !fir.array<?xf64>, %[[VAL_15:.*]] {bindc_name = ".tmp", uniq_name = ""}
-// CHECK:      %[[VAL_25:.*]]:2 = hlfir.declare %[[VAL_24:.*]](%[[VAL_18:.*]]) {uniq_name = ".tmp"} : (!fir.heap<!fir.array<?xf64>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xf64>>, !fir.heap<!fir.array<?xf64>>)
-// CHECK:      fir.do_loop %arg3 = %[[VAL_7:.*]] to %[[VAL_15:.*]] step %[[VAL_7:.*]] unordered {
+// CHECK:      %[[VAL_24:.*]] = fir.allocmem !fir.array<?xf64>, %[[VAL_15:.*]] {bindc_name = ".tmp.copy_in", uniq_name = ""}
+// CHECK:      %[[VAL_25:.*]]:2 = hlfir.declare %[[VAL_24:.*]](%[[VAL_18:.*]]) {uniq_name = ".tmp.copy_in"} : (!fir.heap<!fir.array<?xf64>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xf64>>, !fir.heap<!fir.array<?xf64>>)
+// CHECK:      fir.do_loop %arg3 = %[[VAL_7:.*]] to %[[VAL_15:.*]] step %[[VAL_7:.*]] unordered attributes {loopAnnotation = #loop_annotation} {
 // CHECK:        %[[VAL_26:.*]] = hlfir.designate %[[VAL_19:.*]] (%arg3)  : (!fir.box<!fir.array<?xf64>>, index) -> !fir.ref<f64>
 // CHECK:        %[[VAL_27:.*]] = fir.load %[[VAL_26:.*]] : !fir.ref<f64>
 // CHECK:        %[[VAL_28:.*]] = hlfir.designate %[[VAL_25:.*]]#0 (%arg3)  : (!fir.box<!fir.array<?xf64>>, index) -> !fir.ref<f64>



More information about the flang-commits mailing list