[flang-commits] [flang] [flang][acc] Fix cache directive with mapped component (PR #179335)

Tue Feb 3 01:52:08 PST 2026

https://github.com/khaki3 updated https://github.com/llvm/llvm-project/pull/179335

>From c1badede8600647965da4b35ddc3df0d91ae67d3 Mon Sep 17 00:00:00 2001
From: Kazuaki Matsumura <kmatsumura at nvidia.com>
Date: Sat, 31 Jan 2026 16:17:56 -0800
Subject: [PATCH 1/5] [flang][hlfir] Rename InlineHLFIRCopyIn to
 InlineHLFIRCopy and inline copy_out

Rename the pass to InlineHLFIRCopy since it now handles both copy_in and
copy_out operations. When a copy_in is inlined, the corresponding
copy_out (deallocation-only case) is also inlined as a guarded fir.freemem
instead of being left for later lowering to a runtime call.
---
 flang/include/flang/Optimizer/HLFIR/Passes.td |  4 +-
 .../Optimizer/HLFIR/Transforms/CMakeLists.txt |  2 +-
 ...ineHLFIRCopyIn.cpp => InlineHLFIRCopy.cpp} | 68 ++++++++++++++-----
 flang/lib/Optimizer/Passes/Pipelines.cpp      |  2 +-
 ...lfir-copy-in.fir => inline-hlfir-copy.fir} | 13 +++-
 5 files changed, 66 insertions(+), 23 deletions(-)
 rename flang/lib/Optimizer/HLFIR/Transforms/{InlineHLFIRCopyIn.cpp => InlineHLFIRCopy.cpp} (73%)
 rename flang/test/HLFIR/{inline-hlfir-copy-in.fir => inline-hlfir-copy.fir} (95%)

diff --git a/flang/include/flang/Optimizer/HLFIR/Passes.td b/flang/include/flang/Optimizer/HLFIR/Passes.td
index bfff458f7a6c5..88b89de956366 100644
--- a/flang/include/flang/Optimizer/HLFIR/Passes.td
+++ b/flang/include/flang/Optimizer/HLFIR/Passes.td
@@ -73,8 +73,8 @@ def InlineHLFIRAssign : Pass<"inline-hlfir-assign"> {
   let summary = "Inline hlfir.assign operations";
 }
 
-def InlineHLFIRCopyIn : Pass<"inline-hlfir-copy-in"> {
-  let summary = "Inline hlfir.copy_in operations";
+def InlineHLFIRCopy : Pass<"inline-hlfir-copy"> {
+  let summary = "Inline hlfir.copy_in and hlfir.copy_out operations";
 }
 
 def PropagateFortranVariableAttributes : Pass<"propagate-fortran-attrs"> {
diff --git a/flang/lib/Optimizer/HLFIR/Transforms/CMakeLists.txt b/flang/lib/Optimizer/HLFIR/Transforms/CMakeLists.txt
index 5c24fe58b05c4..2005eb9770d30 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/CMakeLists.txt
+++ b/flang/lib/Optimizer/HLFIR/Transforms/CMakeLists.txt
@@ -6,7 +6,7 @@ add_flang_library(HLFIRTransforms
   ExpressionSimplification.cpp
   InlineElementals.cpp
   InlineHLFIRAssign.cpp
-  InlineHLFIRCopyIn.cpp
+  InlineHLFIRCopy.cpp
   LowerHLFIRIntrinsics.cpp
   LowerHLFIROrderedAssignments.cpp
   ScheduleOrderedAssignments.cpp
diff --git a/flang/lib/Optimizer/HLFIR/Transforms/InlineHLFIRCopyIn.cpp b/flang/lib/Optimizer/HLFIR/Transforms/InlineHLFIRCopy.cpp
similarity index 73%
rename from flang/lib/Optimizer/HLFIR/Transforms/InlineHLFIRCopyIn.cpp
rename to flang/lib/Optimizer/HLFIR/Transforms/InlineHLFIRCopy.cpp
index b4e89b0966e9c..7dc0325a711de 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/InlineHLFIRCopyIn.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/InlineHLFIRCopy.cpp
@@ -1,4 +1,4 @@
-//===- InlineHLFIRCopyIn.cpp - Inline hlfir.copy_in ops -------------------===//
+//===- InlineHLFIRCopy.cpp - Inline hlfir.copy_in/copy_out ops ------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -9,6 +9,9 @@
 // per element assignments. For simplicity, the inlining is done for trivial
 // data types when the copy_in does not require a corresponding copy_out and
 // when the input array is not behind a pointer. This may change in the future.
+//
+// When the copy_in is inlined, the corresponding copy_out (deallocation-only
+// case, i.e., when var is null) is also inlined with direct fir.freemem.
 //===----------------------------------------------------------------------===//
 
 #include "flang/Optimizer/Builder/FIRBuilder.h"
@@ -21,15 +24,15 @@
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 
 namespace hlfir {
-#define GEN_PASS_DEF_INLINEHLFIRCOPYIN
+#define GEN_PASS_DEF_INLINEHLFIRCOPY
 #include "flang/Optimizer/HLFIR/Passes.h.inc"
 } // namespace hlfir
 
-#define DEBUG_TYPE "inline-hlfir-copy-in"
+#define DEBUG_TYPE "inline-hlfir-copy"
 
-static llvm::cl::opt<bool> noInlineHLFIRCopyIn(
-    "no-inline-hlfir-copy-in",
-    llvm::cl::desc("Do not inline hlfir.copy_in operations"),
+static llvm::cl::opt<bool> noInlineHLFIRCopy(
+    "no-inline-hlfir-copy",
+    llvm::cl::desc("Do not inline hlfir.copy_in/copy_out operations"),
     llvm::cl::init(false));
 
 namespace {
@@ -42,6 +45,31 @@ class InlineCopyInConversion : public mlir::OpRewritePattern<hlfir::CopyInOp> {
                   mlir::PatternRewriter &rewriter) const override;
 };
 
+// Helper function to inline a copy_out deallocation (no copy-back case).
+// Generates: if (wasCopied) { freemem(box_addr(load(tempBox))) }
+static void inlineCopyOutDeallocation(fir::FirOpBuilder &builder,
+                                      mlir::Location loc,
+                                      mlir::Value tempBox,
+                                      mlir::Value wasCopied,
+                                      mlir::Type sequenceType) {
+  builder.genIfOp(loc, {}, wasCopied, /*withElseRegion=*/false)
+      .genThen([&]() {
+        mlir::Value box = fir::LoadOp::create(builder, loc, tempBox);
+        mlir::Value addr = fir::BoxAddrOp::create(builder, loc, box);
+        auto heapType = fir::HeapType::get(sequenceType);
+        mlir::Value heapAddr =
+            fir::ConvertOp::create(builder, loc, heapType, addr);
+        fir::FreeMemOp::create(builder, loc, heapAddr);
+      });
+}
+
+// Note: We don't have a separate InlineCopyOutConversion pattern.
+// Copy_out inlining is handled by InlineCopyInConversion when it inlines
+// the paired copy_in. For copy_outs that aren't paired with an eligible
+// copy_in (e.g., optional args, assumed-rank, non-trivial types), the
+// copy_out is left as-is and will be lowered to a runtime call.
+// This is the conservative approach for the upstream pass.
+
 llvm::LogicalResult
 InlineCopyInConversion::matchAndRewrite(hlfir::CopyInOp copyIn,
                                         mlir::PatternRewriter &rewriter) const {
@@ -148,20 +176,28 @@ InlineCopyInConversion::matchAndRewrite(hlfir::CopyInOp copyIn,
   mlir::OpResult resultBox = results[0];
   mlir::OpResult needsCleanup = results[1];
 
-  // Prepare the corresponding copyOut to free the temporary if it is required
+  // Inline the corresponding copyOut to free the temporary if it is required.
+  // Generate: if (needsCleanup) { freemem(box_addr(resultBox)) }
+  // We need to store the resultBox first since it's a box value, then generate
+  // the deallocation code at the copyOut location.
   auto alloca = fir::AllocaOp::create(builder, loc, resultBox.getType());
-  auto store = fir::StoreOp::create(builder, loc, resultBox, alloca);
-  rewriter.startOpModification(copyOut);
-  copyOut->setOperand(0, store.getMemref());
-  copyOut->setOperand(1, needsCleanup);
-  rewriter.finalizeOpModification(copyOut);
+  fir::StoreOp::create(builder, loc, resultBox, alloca);
+
+  // Move to the copyOut location to generate the deallocation
+  rewriter.setInsertionPoint(copyOut);
+  fir::FirOpBuilder copyOutBuilder(rewriter, copyOut.getOperation());
+  inlineCopyOutDeallocation(copyOutBuilder, copyOut.getLoc(), alloca,
+                            needsCleanup, sequenceType);
+
+  // Erase the copyOut since we've inlined it
+  rewriter.eraseOp(copyOut);
 
   rewriter.replaceOp(copyIn, {resultBox, builder.genNot(loc, isContiguous)});
   return mlir::success();
 }
 
-class InlineHLFIRCopyInPass
-    : public hlfir::impl::InlineHLFIRCopyInBase<InlineHLFIRCopyInPass> {
+class InlineHLFIRCopyPass
+    : public hlfir::impl::InlineHLFIRCopyBase<InlineHLFIRCopyPass> {
 public:
   void runOnOperation() override {
     mlir::MLIRContext *context = &getContext();
@@ -172,14 +208,14 @@ class InlineHLFIRCopyInPass
         mlir::GreedySimplifyRegionLevel::Disabled);
 
     mlir::RewritePatternSet patterns(context);
-    if (!noInlineHLFIRCopyIn) {
+    if (!noInlineHLFIRCopy) {
       patterns.insert<InlineCopyInConversion>(context);
     }
 
     if (mlir::failed(mlir::applyPatternsGreedily(
             getOperation(), std::move(patterns), config))) {
       mlir::emitError(getOperation()->getLoc(),
-                      "failure in hlfir.copy_in inlining");
+                      "failure in hlfir.copy_in/copy_out inlining");
       signalPassFailure();
     }
   }
diff --git a/flang/lib/Optimizer/Passes/Pipelines.cpp b/flang/lib/Optimizer/Passes/Pipelines.cpp
index 6054675643c64..130f129511c45 100644
--- a/flang/lib/Optimizer/Passes/Pipelines.cpp
+++ b/flang/lib/Optimizer/Passes/Pipelines.cpp
@@ -281,7 +281,7 @@ void createHLFIRToFIRPassPipeline(mlir::PassManager &pm,
 
     if (optLevel == llvm::OptimizationLevel::O3) {
       addNestedPassToAllTopLevelOperations<PassConstructor>(
-          pm, hlfir::createInlineHLFIRCopyIn);
+          pm, hlfir::createInlineHLFIRCopy);
     }
   }
   pm.addPass(hlfir::createLowerHLFIROrderedAssignments());
diff --git a/flang/test/HLFIR/inline-hlfir-copy-in.fir b/flang/test/HLFIR/inline-hlfir-copy.fir
similarity index 95%
rename from flang/test/HLFIR/inline-hlfir-copy-in.fir
rename to flang/test/HLFIR/inline-hlfir-copy.fir
index f1da1da9f9a5c..cd8cc0a87e19d 100644
--- a/flang/test/HLFIR/inline-hlfir-copy-in.fir
+++ b/flang/test/HLFIR/inline-hlfir-copy.fir
@@ -1,5 +1,5 @@
-// Test inlining of hlfir.copy_in
-// RUN: fir-opt --inline-hlfir-copy-in %s | FileCheck %s
+// Test inlining of hlfir.copy_in and hlfir.copy_out
+// RUN: fir-opt --inline-hlfir-copy %s | FileCheck %s
 
 // Test inlining of hlfir.copy_in that does not require the array to be copied out
 func.func private @_test_inline_copy_in(%arg0: !fir.box<!fir.array<?x?x?xf64>> {fir.bindc_name = "x"}, %arg1: !fir.ref<i32> {fir.bindc_name = "i"}, %arg2: !fir.ref<i32> {fir.bindc_name = "j"}) {
@@ -72,10 +72,17 @@ func.func private @_test_inline_copy_in(%arg0: !fir.box<!fir.array<?x?x?xf64>> {
 // CHECK:      }
 // CHECK:      fir.result %[[VAL_25:.*]]#0, %[[VAL_3:.*]] : !fir.box<!fir.array<?xf64>>, i1
 // CHECK:    }
+// CHECK:    %[[VAL_ALLOCA:.*]] = fir.alloca !fir.box<!fir.array<?xf64>>
+// CHECK:    fir.store %[[VAL_21:.*]]#0 to %[[VAL_ALLOCA:.*]] : !fir.ref<!fir.box<!fir.array<?xf64>>>
 // CHECK:    %[[VAL_22:.*]] = fir.box_addr %[[VAL_21:.*]]#0 : (!fir.box<!fir.array<?xf64>>) -> !fir.ref<!fir.array<?xf64>>
 // CHECK:    %[[VAL_23:.*]]:3 = hlfir.associate %[[VAL_5:.*]] {adapt.valuebyref} : (i32) -> (!fir.ref<i32>, !fir.ref<i32>, i1)
 // CHECK:    fir.call @_QFPsb(%[[VAL_22:.*]], %[[VAL_23:.*]]#0) fastmath<contract> : (!fir.ref<!fir.array<?xf64>>, !fir.ref<i32>) -> ()
-// CHECK:    hlfir.copy_out %{{.*}}, %[[VAL_21:.*]]#1 : (!fir.ref<!fir.box<!fir.array<?xf64>>>, i1) -> ()
+// CHECK:    fir.if %[[VAL_21:.*]]#1 {
+// CHECK:      %[[VAL_BOX:.*]] = fir.load %[[VAL_ALLOCA:.*]] : !fir.ref<!fir.box<!fir.array<?xf64>>>
+// CHECK:      %[[VAL_ADDR:.*]] = fir.box_addr %[[VAL_BOX:.*]] : (!fir.box<!fir.array<?xf64>>) -> !fir.ref<!fir.array<?xf64>>
+// CHECK:      %[[VAL_HEAP:.*]] = fir.convert %[[VAL_ADDR:.*]] : (!fir.ref<!fir.array<?xf64>>) -> !fir.heap<!fir.array<?xf64>>
+// CHECK:      fir.freemem %[[VAL_HEAP:.*]] : !fir.heap<!fir.array<?xf64>>
+// CHECK:    }
 // CHECK:    hlfir.end_associate %[[VAL_23:.*]]#1, %[[VAL_23:.*]]#2 : !fir.ref<i32>, i1
 // CHECK:    return
 // CHECK:  }

>From 9bbc0dd587bf831d583e953fbe0bdfa5ec5755f5 Mon Sep 17 00:00:00 2001
From: Kazuaki Matsumura <kmatsumura at nvidia.com>
Date: Mon, 2 Feb 2026 13:22:33 -0800
Subject: [PATCH 2/5] [flang][OpenACC] Fix cache directive with mapped
 component

When a derived type component is mapped via copyin (e.g.,
copyin(data%A(...))), the base address inside the parallel region
comes from the mapped address, not from an hlfir.designate op.
Handle this case by extracting shape/typeparams/attrs only when the base
is defined by a DesignateOp; otherwise they are left unset.
---
 flang/lib/Lower/OpenACC.cpp            | 22 ++++++++++++++--------
 flang/test/Lower/OpenACC/acc-cache.f90 | 25 +++++++++++++++++++++++++
 2 files changed, 39 insertions(+), 8 deletions(-)

diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp
index 183f9e717532a..df7970d4243fa 100644
--- a/flang/lib/Lower/OpenACC.cpp
+++ b/flang/lib/Lower/OpenACC.cpp
@@ -4304,20 +4304,26 @@ genACC(Fortran::lower::AbstractConverter &converter,
           fir::substBase(hostExv, cacheOp.getAccVar());
       converter.bindSymbol(symbol, cacheExv);
     } else {
-      // Must be a derived type component reference.
+      // Derived type component reference.
       assert(designator && "expected designator for non-symbol cache operand");
       std::optional<Fortran::evaluate::Component> componentRef =
           extractComponentFromDesignator(designator);
       assert(componentRef &&
              "expected component reference for derived type cache operand");
-      // Component references are lowered to designate operations.
-      auto designate = base.getDefiningOp<hlfir::DesignateOp>();
-      assert(designate && "expected designate op for component reference");
+      // When component is mapped via copyin, base is the mapped address.
+      mlir::Value shape;
+      llvm::SmallVector<mlir::Value> lenParams;
+      fir::FortranVariableFlagsAttr attrs;
+      if (auto designate = base.getDefiningOp<hlfir::DesignateOp>()) {
+        shape = designate.getShape();
+        lenParams = llvm::SmallVector<mlir::Value>(
+            designate.getTypeparams().begin(), designate.getTypeparams().end());
+        attrs = designate.getFortranAttrsAttr();
+      }
       auto declareOp = hlfir::DeclareOp::create(
-          builder, operandLocation, cacheOp.getAccVar(), asFortran.str(),
-          designate.getShape(), designate.getTypeparams(),
-          /*dummyScope=*/nullptr, /*storage=*/nullptr,
-          /*storageOffset=*/0, designate.getFortranAttrsAttr());
+          builder, operandLocation, cacheOp.getAccVar(), asFortran.str(), shape,
+          lenParams, /*dummyScope=*/nullptr, /*storage=*/nullptr,
+          /*storageOffset=*/0, attrs);
       converter.getSymbolMap().addComponentOverride(*componentRef, declareOp);
     }
   }
diff --git a/flang/test/Lower/OpenACC/acc-cache.f90 b/flang/test/Lower/OpenACC/acc-cache.f90
index 22dd0a84aee8a..bcc0391a3e5e0 100644
--- a/flang/test/Lower/OpenACC/acc-cache.f90
+++ b/flang/test/Lower/OpenACC/acc-cache.f90
@@ -638,6 +638,31 @@ subroutine test_cache_nested_derived_type()
 ! CHECK: acc.yield
 end subroutine
 
+! Test cache with allocatable component in combined construct
+! CHECK-LABEL: func.func @_QPtest_cache_combined_allocatable(
+subroutine test_cache_combined_allocatable(data, C, M)
+  type :: dt
+    real, dimension(:), allocatable :: A
+  end type
+
+  type(dt), intent(inout) :: data
+  real, dimension(:), intent(out) :: C
+  integer, intent(in) :: M
+  integer :: i
+
+  !$acc parallel loop gang vector copyin(data, data%A(-3:M+4)) copyout(C(1:M))
+  do i = 1, M
+    !$acc cache(data%A(i-4:i+4))
+    C(i) = data%A(i)
+  end do
+
+! CHECK: acc.parallel {{.*}} {
+! CHECK: acc.loop
+! CHECK: acc.cache varPtr(%{{.*}}) bounds(%{{.*}}) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {name = "data%a(i-4_4:i+4_4)", structured = false}
+! CHECK: hlfir.declare %{{.*}} {uniq_name = "data%a(i-4_4:i+4_4)"}
+! CHECK: acc.yield
+end subroutine
+
 ! Test cache with temporary in designator bounds - verifies local statement context
 ! doesn't cause issues with temporary cleanup
 ! CHECK-LABEL: func.func @_QPtest_cache_temp_in_designator(

>From b2cbeb0b6f2a0f599c1af4bd3d327d52e1f33afc Mon Sep 17 00:00:00 2001
From: Kazuaki Matsumura <kmatsumura at nvidia.com>
Date: Mon, 2 Feb 2026 13:28:46 -0800
Subject: [PATCH 3/5] Revert "[flang][hlfir] Rename InlineHLFIRCopyIn to
 InlineHLFIRCopy and inline copy_out"

This reverts commit c1badede8600647965da4b35ddc3df0d91ae67d3.
---
 flang/include/flang/Optimizer/HLFIR/Passes.td |  4 +-
 .../Optimizer/HLFIR/Transforms/CMakeLists.txt |  2 +-
 ...ineHLFIRCopy.cpp => InlineHLFIRCopyIn.cpp} | 68 +++++--------------
 flang/lib/Optimizer/Passes/Pipelines.cpp      |  2 +-
 ...lfir-copy.fir => inline-hlfir-copy-in.fir} | 13 +---
 5 files changed, 23 insertions(+), 66 deletions(-)
 rename flang/lib/Optimizer/HLFIR/Transforms/{InlineHLFIRCopy.cpp => InlineHLFIRCopyIn.cpp} (73%)
 rename flang/test/HLFIR/{inline-hlfir-copy.fir => inline-hlfir-copy-in.fir} (95%)

diff --git a/flang/include/flang/Optimizer/HLFIR/Passes.td b/flang/include/flang/Optimizer/HLFIR/Passes.td
index 88b89de956366..bfff458f7a6c5 100644
--- a/flang/include/flang/Optimizer/HLFIR/Passes.td
+++ b/flang/include/flang/Optimizer/HLFIR/Passes.td
@@ -73,8 +73,8 @@ def InlineHLFIRAssign : Pass<"inline-hlfir-assign"> {
   let summary = "Inline hlfir.assign operations";
 }
 
-def InlineHLFIRCopy : Pass<"inline-hlfir-copy"> {
-  let summary = "Inline hlfir.copy_in and hlfir.copy_out operations";
+def InlineHLFIRCopyIn : Pass<"inline-hlfir-copy-in"> {
+  let summary = "Inline hlfir.copy_in operations";
 }
 
 def PropagateFortranVariableAttributes : Pass<"propagate-fortran-attrs"> {
diff --git a/flang/lib/Optimizer/HLFIR/Transforms/CMakeLists.txt b/flang/lib/Optimizer/HLFIR/Transforms/CMakeLists.txt
index 2005eb9770d30..5c24fe58b05c4 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/CMakeLists.txt
+++ b/flang/lib/Optimizer/HLFIR/Transforms/CMakeLists.txt
@@ -6,7 +6,7 @@ add_flang_library(HLFIRTransforms
   ExpressionSimplification.cpp
   InlineElementals.cpp
   InlineHLFIRAssign.cpp
-  InlineHLFIRCopy.cpp
+  InlineHLFIRCopyIn.cpp
   LowerHLFIRIntrinsics.cpp
   LowerHLFIROrderedAssignments.cpp
   ScheduleOrderedAssignments.cpp
diff --git a/flang/lib/Optimizer/HLFIR/Transforms/InlineHLFIRCopy.cpp b/flang/lib/Optimizer/HLFIR/Transforms/InlineHLFIRCopyIn.cpp
similarity index 73%
rename from flang/lib/Optimizer/HLFIR/Transforms/InlineHLFIRCopy.cpp
rename to flang/lib/Optimizer/HLFIR/Transforms/InlineHLFIRCopyIn.cpp
index 7dc0325a711de..b4e89b0966e9c 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/InlineHLFIRCopy.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/InlineHLFIRCopyIn.cpp
@@ -1,4 +1,4 @@
-//===- InlineHLFIRCopy.cpp - Inline hlfir.copy_in/copy_out ops ------------===//
+//===- InlineHLFIRCopyIn.cpp - Inline hlfir.copy_in ops -------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -9,9 +9,6 @@
 // per element assignments. For simplicity, the inlining is done for trivial
 // data types when the copy_in does not require a corresponding copy_out and
 // when the input array is not behind a pointer. This may change in the future.
-//
-// When the copy_in is inlined, the corresponding copy_out (deallocation-only
-// case, i.e., when var is null) is also inlined with direct fir.freemem.
 //===----------------------------------------------------------------------===//
 
 #include "flang/Optimizer/Builder/FIRBuilder.h"
@@ -24,15 +21,15 @@
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 
 namespace hlfir {
-#define GEN_PASS_DEF_INLINEHLFIRCOPY
+#define GEN_PASS_DEF_INLINEHLFIRCOPYIN
 #include "flang/Optimizer/HLFIR/Passes.h.inc"
 } // namespace hlfir
 
-#define DEBUG_TYPE "inline-hlfir-copy"
+#define DEBUG_TYPE "inline-hlfir-copy-in"
 
-static llvm::cl::opt<bool> noInlineHLFIRCopy(
-    "no-inline-hlfir-copy",
-    llvm::cl::desc("Do not inline hlfir.copy_in/copy_out operations"),
+static llvm::cl::opt<bool> noInlineHLFIRCopyIn(
+    "no-inline-hlfir-copy-in",
+    llvm::cl::desc("Do not inline hlfir.copy_in operations"),
     llvm::cl::init(false));
 
 namespace {
@@ -45,31 +42,6 @@ class InlineCopyInConversion : public mlir::OpRewritePattern<hlfir::CopyInOp> {
                   mlir::PatternRewriter &rewriter) const override;
 };
 
-// Helper function to inline a copy_out deallocation (no copy-back case).
-// Generates: if (wasCopied) { freemem(box_addr(load(tempBox))) }
-static void inlineCopyOutDeallocation(fir::FirOpBuilder &builder,
-                                      mlir::Location loc,
-                                      mlir::Value tempBox,
-                                      mlir::Value wasCopied,
-                                      mlir::Type sequenceType) {
-  builder.genIfOp(loc, {}, wasCopied, /*withElseRegion=*/false)
-      .genThen([&]() {
-        mlir::Value box = fir::LoadOp::create(builder, loc, tempBox);
-        mlir::Value addr = fir::BoxAddrOp::create(builder, loc, box);
-        auto heapType = fir::HeapType::get(sequenceType);
-        mlir::Value heapAddr =
-            fir::ConvertOp::create(builder, loc, heapType, addr);
-        fir::FreeMemOp::create(builder, loc, heapAddr);
-      });
-}
-
-// Note: We don't have a separate InlineCopyOutConversion pattern.
-// Copy_out inlining is handled by InlineCopyInConversion when it inlines
-// the paired copy_in. For copy_outs that aren't paired with an eligible
-// copy_in (e.g., optional args, assumed-rank, non-trivial types), the
-// copy_out is left as-is and will be lowered to a runtime call.
-// This is the conservative approach for the upstream pass.
-
 llvm::LogicalResult
 InlineCopyInConversion::matchAndRewrite(hlfir::CopyInOp copyIn,
                                         mlir::PatternRewriter &rewriter) const {
@@ -176,28 +148,20 @@ InlineCopyInConversion::matchAndRewrite(hlfir::CopyInOp copyIn,
   mlir::OpResult resultBox = results[0];
   mlir::OpResult needsCleanup = results[1];
 
-  // Inline the corresponding copyOut to free the temporary if it is required.
-  // Generate: if (needsCleanup) { freemem(box_addr(resultBox)) }
-  // We need to store the resultBox first since it's a box value, then generate
-  // the deallocation code at the copyOut location.
+  // Prepare the corresponding copyOut to free the temporary if it is required
   auto alloca = fir::AllocaOp::create(builder, loc, resultBox.getType());
-  fir::StoreOp::create(builder, loc, resultBox, alloca);
-
-  // Move to the copyOut location to generate the deallocation
-  rewriter.setInsertionPoint(copyOut);
-  fir::FirOpBuilder copyOutBuilder(rewriter, copyOut.getOperation());
-  inlineCopyOutDeallocation(copyOutBuilder, copyOut.getLoc(), alloca,
-                            needsCleanup, sequenceType);
-
-  // Erase the copyOut since we've inlined it
-  rewriter.eraseOp(copyOut);
+  auto store = fir::StoreOp::create(builder, loc, resultBox, alloca);
+  rewriter.startOpModification(copyOut);
+  copyOut->setOperand(0, store.getMemref());
+  copyOut->setOperand(1, needsCleanup);
+  rewriter.finalizeOpModification(copyOut);
 
   rewriter.replaceOp(copyIn, {resultBox, builder.genNot(loc, isContiguous)});
   return mlir::success();
 }
 
-class InlineHLFIRCopyPass
-    : public hlfir::impl::InlineHLFIRCopyBase<InlineHLFIRCopyPass> {
+class InlineHLFIRCopyInPass
+    : public hlfir::impl::InlineHLFIRCopyInBase<InlineHLFIRCopyInPass> {
 public:
   void runOnOperation() override {
     mlir::MLIRContext *context = &getContext();
@@ -208,14 +172,14 @@ class InlineHLFIRCopyPass
         mlir::GreedySimplifyRegionLevel::Disabled);
 
     mlir::RewritePatternSet patterns(context);
-    if (!noInlineHLFIRCopy) {
+    if (!noInlineHLFIRCopyIn) {
       patterns.insert<InlineCopyInConversion>(context);
     }
 
     if (mlir::failed(mlir::applyPatternsGreedily(
             getOperation(), std::move(patterns), config))) {
       mlir::emitError(getOperation()->getLoc(),
-                      "failure in hlfir.copy_in/copy_out inlining");
+                      "failure in hlfir.copy_in inlining");
       signalPassFailure();
     }
   }
diff --git a/flang/lib/Optimizer/Passes/Pipelines.cpp b/flang/lib/Optimizer/Passes/Pipelines.cpp
index 130f129511c45..6054675643c64 100644
--- a/flang/lib/Optimizer/Passes/Pipelines.cpp
+++ b/flang/lib/Optimizer/Passes/Pipelines.cpp
@@ -281,7 +281,7 @@ void createHLFIRToFIRPassPipeline(mlir::PassManager &pm,
 
     if (optLevel == llvm::OptimizationLevel::O3) {
       addNestedPassToAllTopLevelOperations<PassConstructor>(
-          pm, hlfir::createInlineHLFIRCopy);
+          pm, hlfir::createInlineHLFIRCopyIn);
     }
   }
   pm.addPass(hlfir::createLowerHLFIROrderedAssignments());
diff --git a/flang/test/HLFIR/inline-hlfir-copy.fir b/flang/test/HLFIR/inline-hlfir-copy-in.fir
similarity index 95%
rename from flang/test/HLFIR/inline-hlfir-copy.fir
rename to flang/test/HLFIR/inline-hlfir-copy-in.fir
index cd8cc0a87e19d..f1da1da9f9a5c 100644
--- a/flang/test/HLFIR/inline-hlfir-copy.fir
+++ b/flang/test/HLFIR/inline-hlfir-copy-in.fir
@@ -1,5 +1,5 @@
-// Test inlining of hlfir.copy_in and hlfir.copy_out
-// RUN: fir-opt --inline-hlfir-copy %s | FileCheck %s
+// Test inlining of hlfir.copy_in
+// RUN: fir-opt --inline-hlfir-copy-in %s | FileCheck %s
 
 // Test inlining of hlfir.copy_in that does not require the array to be copied out
 func.func private @_test_inline_copy_in(%arg0: !fir.box<!fir.array<?x?x?xf64>> {fir.bindc_name = "x"}, %arg1: !fir.ref<i32> {fir.bindc_name = "i"}, %arg2: !fir.ref<i32> {fir.bindc_name = "j"}) {
@@ -72,17 +72,10 @@ func.func private @_test_inline_copy_in(%arg0: !fir.box<!fir.array<?x?x?xf64>> {
 // CHECK:      }
 // CHECK:      fir.result %[[VAL_25:.*]]#0, %[[VAL_3:.*]] : !fir.box<!fir.array<?xf64>>, i1
 // CHECK:    }
-// CHECK:    %[[VAL_ALLOCA:.*]] = fir.alloca !fir.box<!fir.array<?xf64>>
-// CHECK:    fir.store %[[VAL_21:.*]]#0 to %[[VAL_ALLOCA:.*]] : !fir.ref<!fir.box<!fir.array<?xf64>>>
 // CHECK:    %[[VAL_22:.*]] = fir.box_addr %[[VAL_21:.*]]#0 : (!fir.box<!fir.array<?xf64>>) -> !fir.ref<!fir.array<?xf64>>
 // CHECK:    %[[VAL_23:.*]]:3 = hlfir.associate %[[VAL_5:.*]] {adapt.valuebyref} : (i32) -> (!fir.ref<i32>, !fir.ref<i32>, i1)
 // CHECK:    fir.call @_QFPsb(%[[VAL_22:.*]], %[[VAL_23:.*]]#0) fastmath<contract> : (!fir.ref<!fir.array<?xf64>>, !fir.ref<i32>) -> ()
-// CHECK:    fir.if %[[VAL_21:.*]]#1 {
-// CHECK:      %[[VAL_BOX:.*]] = fir.load %[[VAL_ALLOCA:.*]] : !fir.ref<!fir.box<!fir.array<?xf64>>>
-// CHECK:      %[[VAL_ADDR:.*]] = fir.box_addr %[[VAL_BOX:.*]] : (!fir.box<!fir.array<?xf64>>) -> !fir.ref<!fir.array<?xf64>>
-// CHECK:      %[[VAL_HEAP:.*]] = fir.convert %[[VAL_ADDR:.*]] : (!fir.ref<!fir.array<?xf64>>) -> !fir.heap<!fir.array<?xf64>>
-// CHECK:      fir.freemem %[[VAL_HEAP:.*]] : !fir.heap<!fir.array<?xf64>>
-// CHECK:    }
+// CHECK:    hlfir.copy_out %{{.*}}, %[[VAL_21:.*]]#1 : (!fir.ref<!fir.box<!fir.array<?xf64>>>, i1) -> ()
 // CHECK:    hlfir.end_associate %[[VAL_23:.*]]#1, %[[VAL_23:.*]]#2 : !fir.ref<i32>, i1
 // CHECK:    return
 // CHECK:  }

>From 3f249b601754a332ecfe1798a2df7e199f2d9b76 Mon Sep 17 00:00:00 2001
From: Kazuaki Matsumura <kmatsumura at nvidia.com>
Date: Mon, 2 Feb 2026 13:36:56 -0800
Subject: [PATCH 4/5] [flang][OpenACC] Add more cache directive test cases

Add test cases for:
- cache with copy of whole struct (not explicit component copyin)
- cache with nested derived type in parallel loop with copyin
---
 flang/test/Lower/OpenACC/acc-cache.f90 | 56 +++++++++++++++++++++++++-
 1 file changed, 55 insertions(+), 1 deletion(-)

diff --git a/flang/test/Lower/OpenACC/acc-cache.f90 b/flang/test/Lower/OpenACC/acc-cache.f90
index bcc0391a3e5e0..bcbd6fa339904 100644
--- a/flang/test/Lower/OpenACC/acc-cache.f90
+++ b/flang/test/Lower/OpenACC/acc-cache.f90
@@ -659,7 +659,61 @@ subroutine test_cache_combined_allocatable(data, C, M)
 ! CHECK: acc.parallel {{.*}} {
 ! CHECK: acc.loop
 ! CHECK: acc.cache varPtr(%{{.*}}) bounds(%{{.*}}) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {name = "data%a(i-4_4:i+4_4)", structured = false}
-! CHECK: hlfir.declare %{{.*}} {uniq_name = "data%a(i-4_4:i+4_4)"}
+! CHECK: hlfir.declare %{{.*}} {{{.*}}uniq_name = "data%a(i-4_4:i+4_4)"}
+! CHECK: acc.yield
+end subroutine
+
+! Test cache with copy of whole struct (not explicit component copyin)
+! CHECK-LABEL: func.func @_QPtest_cache_parallel_copy_struct(
+subroutine test_cache_parallel_copy_struct(data, M)
+  type :: dt
+    real, dimension(:), allocatable :: A
+  end type
+
+  type(dt), intent(inout) :: data
+  integer, intent(in) :: M
+  real :: r
+  integer :: i
+
+  !$acc parallel loop copy(data)
+  do i = 1, M
+    !$acc cache(data%A(i))
+    r = data%A(i)
+  end do
+
+! CHECK: acc.parallel {{.*}} {
+! CHECK: acc.loop
+! CHECK: acc.cache varPtr(%{{.*}}) bounds(%{{.*}}) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {name = "data%a(i)", structured = false}
+! CHECK: hlfir.declare %{{.*}} {{{.*}}uniq_name = "data%a(i)"}
+! CHECK: acc.yield
+end subroutine
+
+! Test cache with nested derived type in parallel loop with copyin
+! CHECK-LABEL: func.func @_QPtest_cache_nested_parallel(
+subroutine test_cache_nested_parallel(obj, N)
+  type :: inner
+    real, dimension(:), allocatable :: arr
+  end type
+
+  type :: outer
+    type(inner) :: in
+  end type
+
+  type(outer), intent(inout) :: obj
+  integer, intent(in) :: N
+  real :: r
+  integer :: i
+
+  !$acc parallel loop copyin(obj%in%arr(1:N))
+  do i = 1, N
+    !$acc cache(obj%in%arr(i))
+    r = obj%in%arr(i)
+  end do
+
+! CHECK: acc.parallel {{.*}} {
+! CHECK: acc.loop
+! CHECK: acc.cache varPtr(%{{.*}}) bounds(%{{.*}}) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {name = "obj%in%arr(i)", structured = false}
+! CHECK: hlfir.declare %{{.*}} {{{.*}}uniq_name = "obj%in%arr(i)"}
 ! CHECK: acc.yield
 end subroutine
 

>From 27ea386acf80df062a2de1c91aa8759d63d97871 Mon Sep 17 00:00:00 2001
From: Kazuaki Matsumura <kmatsumura at nvidia.com>
Date: Tue, 3 Feb 2026 01:46:01 -0800
Subject: [PATCH 5/5] [flang][OpenACC] Use FortranVariableOpInterface for cache
 component refs

Address review feedback: use fir::FortranVariableOpInterface instead
of conditionally checking for hlfir::DesignateOp. This interface is
implemented by both DesignateOp and DeclareOp, so it correctly extracts
shape and type parameters in both cases.

Also add a test for an explicit-shape component in a parallel loop with copyin.
---
 flang/lib/Lower/OpenACC.cpp            | 22 +++++++++++-----------
 flang/test/Lower/OpenACC/acc-cache.f90 | 25 +++++++++++++++++++++++++
 2 files changed, 36 insertions(+), 11 deletions(-)

diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp
index df7970d4243fa..058c3ea2e533d 100644
--- a/flang/lib/Lower/OpenACC.cpp
+++ b/flang/lib/Lower/OpenACC.cpp
@@ -4310,19 +4310,19 @@ genACC(Fortran::lower::AbstractConverter &converter,
           extractComponentFromDesignator(designator);
       assert(componentRef &&
              "expected component reference for derived type cache operand");
-      // When component is mapped via copyin, base is the mapped address.
-      mlir::Value shape;
-      llvm::SmallVector<mlir::Value> lenParams;
+      // When component is mapped via a data clause, base may be a declare op
+      // instead of a designate op.
+      auto varIface = base.getDefiningOp<fir::FortranVariableOpInterface>();
+      assert(varIface &&
+             "expected FortranVariableOpInterface for component reference");
       fir::FortranVariableFlagsAttr attrs;
-      if (auto designate = base.getDefiningOp<hlfir::DesignateOp>()) {
-        shape = designate.getShape();
-        lenParams = llvm::SmallVector<mlir::Value>(
-            designate.getTypeparams().begin(), designate.getTypeparams().end());
-        attrs = designate.getFortranAttrsAttr();
-      }
+      if (auto fortranAttrs = varIface.getFortranAttrs())
+        attrs = fir::FortranVariableFlagsAttr::get(builder.getContext(),
+                                                   *fortranAttrs);
       auto declareOp = hlfir::DeclareOp::create(
-          builder, operandLocation, cacheOp.getAccVar(), asFortran.str(), shape,
-          lenParams, /*dummyScope=*/nullptr, /*storage=*/nullptr,
+          builder, operandLocation, cacheOp.getAccVar(), asFortran.str(),
+          varIface.getShape(), varIface.getExplicitTypeParams(),
+          /*dummyScope=*/nullptr, /*storage=*/nullptr,
           /*storageOffset=*/0, attrs);
       converter.getSymbolMap().addComponentOverride(*componentRef, declareOp);
     }
diff --git a/flang/test/Lower/OpenACC/acc-cache.f90 b/flang/test/Lower/OpenACC/acc-cache.f90
index bcbd6fa339904..36874d3c21cdb 100644
--- a/flang/test/Lower/OpenACC/acc-cache.f90
+++ b/flang/test/Lower/OpenACC/acc-cache.f90
@@ -717,6 +717,31 @@ subroutine test_cache_nested_parallel(obj, N)
 ! CHECK: acc.yield
 end subroutine
 
+! Test cache with explicit shape component in parallel loop with copyin
+! CHECK-LABEL: func.func @_QPtest_cache_explicit_shape_comp(
+subroutine test_cache_explicit_shape_comp(data, C, M)
+  type :: dt
+    real, dimension(10) :: A
+  end type
+
+  type(dt), intent(inout) :: data
+  real, dimension(:), intent(out) :: C
+  integer, intent(in) :: M
+  integer :: i
+
+  !$acc parallel loop gang vector copyin(data, data%A(1:M)) copyout(C(1:M))
+  do i = 1, M
+    !$acc cache(data%A(i:i+4))
+    C(i) = data%A(i)
+  end do
+
+! CHECK: acc.parallel {{.*}} {
+! CHECK: acc.loop
+! CHECK: acc.cache varPtr(%{{.*}}) bounds(%{{.*}}) -> !fir.ref<!fir.array<10xf32>> {name = "data%a(i:i+4_4)", structured = false}
+! CHECK: hlfir.declare %{{.*}}(%{{.*}}) {uniq_name = "data%a(i:i+4_4)"}
+! CHECK: acc.yield
+end subroutine
+
 ! Test cache with temporary in designator bounds - verifies local statement context
 ! doesn't cause issues with temporary cleanup
 ! CHECK-LABEL: func.func @_QPtest_cache_temp_in_designator(