[flang-commits] [flang] [flang][HLFIR] Enable inlining of allocatable array assignments (PR #197814)

via flang-commits flang-commits at lists.llvm.org
Tue May 19 08:46:42 PDT 2026


https://github.com/khaki3 updated https://github.com/llvm/llvm-project/pull/197814

>From be7dd161ac277d7eefb21f7365ed985f03ebbe6c Mon Sep 17 00:00:00 2001
From: Kazuaki Matsumura <kmatsumura at nvidia.com>
Date: Thu, 14 May 2026 13:41:25 -0700
Subject: [PATCH 1/8] [flang][HLFIR] Enable inlining of allocatable array
 assignments

Enable the InlineAllocatableExprAssignConversion pattern by default
and add a new InlineAllocatableVarAssignConversion pattern that inlines
allocatable assignments with a variable RHS when alias analysis proves
no aliasing (or, failing that, when array section analysis returns a
result other than Unknown).

Previously, allocatable array assignments (e.g. tmp = abs(t2-t1))
were always lowered to _FortranAAssign runtime calls. This is
problematic for GPU offloading where runtime calls in gang-redundant
regions require special predication handling.

The expr pattern (already implemented, now enabled by default) handles
an hlfir.expr RHS, e.g. one produced by hlfir.elemental. The new
variable pattern handles a non-expr array RHS by generating
realloc-if-needed logic together with an element-by-element copy loop.
---
 .../HLFIR/Transforms/InlineHLFIRAssign.cpp    | 119 +++++++++++++++++-
 .../inline-hlfir-assign-allocatable-expr.fir  |  26 ++--
 flang/test/HLFIR/inline-hlfir-assign.fir      |  11 +-
 .../Integration/OpenMP/workshare-axpy.f90     |   3 +-
 4 files changed, 140 insertions(+), 19 deletions(-)

diff --git a/flang/lib/Optimizer/HLFIR/Transforms/InlineHLFIRAssign.cpp b/flang/lib/Optimizer/HLFIR/Transforms/InlineHLFIRAssign.cpp
index 160efede12bd5..f0ea5d3f80022 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/InlineHLFIRAssign.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/InlineHLFIRAssign.cpp
@@ -38,7 +38,7 @@ static llvm::cl::opt<bool> inlineAllocatableExprAssignFlag(
     "inline-hlfir-allocatable-expr-assign",
     llvm::cl::desc("Enable inlining of allocatable assignments when RHS is an "
                    "hlfir.expr (e.g., from hlfir.elemental)"),
-    llvm::cl::init(false));
+    llvm::cl::init(true));
 
 namespace {
 /// Expand hlfir.assign of array RHS to array LHS into a loop nest
@@ -283,6 +283,119 @@ class InlineAllocatableExprAssignConversion
   }
 };
 
+/// Expand hlfir.assign of a variable RHS to allocatable LHS.
+/// When alias analysis proves no aliasing, we can inline the assignment
+/// as realloc-if-needed + element-by-element copy loop, avoiding the
+/// runtime call.
+///
+/// Example transformation for: allocatable_array = fixed_array
+///   hlfir.assign %var to %alloc realloc : !fir.ref<!fir.array<NxT>>,
+///       !fir.ref<!fir.box<!fir.heap<!fir.array<?xT>>>>
+/// into:
+///   // Realloc if needed (shape mismatch or unallocated)
+///   // Loop over elements:
+///   fir.do_loop %i = %c1 to %extent step %c1 unordered {
+///     %rhs_val = fir.load hlfir.designate %var (%i) : ...
+///     %lhs_elem = hlfir.designate %lhs_box (%i) : ...
+///     hlfir.assign %rhs_val to %lhs_elem : T, !fir.ref<T>
+///   }
+class InlineAllocatableVarAssignConversion
+    : public mlir::OpRewritePattern<hlfir::AssignOp> {
+public:
+  using mlir::OpRewritePattern<hlfir::AssignOp>::OpRewritePattern;
+
+  llvm::LogicalResult
+  matchAndRewrite(hlfir::AssignOp assign,
+                  mlir::PatternRewriter &rewriter) const override {
+    if (!assign.isAllocatableAssignment())
+      return rewriter.notifyMatchFailure(
+          assign, "AssignOp is not an allocatable assignment");
+
+    hlfir::Entity rhs{assign.getRhs()};
+    hlfir::Entity lhs{assign.getLhs()};
+
+    // This pattern handles variable RHS (the expr case is handled by
+    // InlineAllocatableExprAssignConversion).
+    if (mlir::isa<hlfir::ExprType>(rhs.getType()))
+      return rewriter.notifyMatchFailure(assign,
+                                         "RHS is hlfir.expr - use expr pattern");
+
+    if (!rhs.isArray())
+      return rewriter.notifyMatchFailure(assign,
+                                         "AssignOp's RHS is not an array");
+
+    mlir::Type rhsEleTy = rhs.getFortranElementType();
+    if (!fir::isa_trivial(rhsEleTy))
+      return rewriter.notifyMatchFailure(
+          assign, "AssignOp's RHS data type is not trivial");
+
+    mlir::Type lhsEleTy = lhs.getFortranElementType();
+    if (!fir::isa_trivial(lhsEleTy))
+      return rewriter.notifyMatchFailure(
+          assign, "AssignOp's LHS data type is not trivial");
+
+    if (lhsEleTy != rhsEleTy)
+      return rewriter.notifyMatchFailure(assign,
+                                         "RHS/LHS element types mismatch");
+
+    mlir::Type lhsType = lhs.getType();
+    if (!fir::isBoxAddress(lhsType))
+      return rewriter.notifyMatchFailure(assign,
+                                         "LHS is not a reference to a box");
+
+    // Prove LHS and RHS do not alias.
+    fir::AliasAnalysis aliasAnalysis;
+    mlir::AliasResult aliasRes = aliasAnalysis.alias(lhs, rhs);
+    if (!aliasRes.isNo()) {
+      fir::ArraySectionAnalyzer::SlicesOverlapKind overlap =
+          fir::ArraySectionAnalyzer::analyze(lhs, rhs);
+      if (overlap == fir::ArraySectionAnalyzer::SlicesOverlapKind::Unknown) {
+        LLVM_DEBUG(llvm::dbgs()
+                   << "InlineHLFIRAssign (alloc var):\n"
+                   << "\tLHS: " << lhs << "\n"
+                   << "\tRHS: " << rhs << "\n"
+                   << "\tALIAS: " << aliasRes << "\n");
+        return rewriter.notifyMatchFailure(assign, "RHS/LHS may alias");
+      }
+    }
+
+    LLVM_DEBUG(
+        llvm::dbgs()
+        << "InlineHLFIRAssign: inlining allocatable variable assignment\n");
+
+    mlir::Location loc = assign->getLoc();
+    fir::FirOpBuilder builder(rewriter, assign.getOperation());
+    builder.setInsertionPoint(assign);
+
+    mlir::Value rhsShape = hlfir::genShape(loc, builder, rhs);
+    llvm::SmallVector<mlir::Value> rhsExtents =
+        hlfir::getIndexExtents(loc, builder, rhsShape);
+
+    mlir::Value lhsBoxRef = lhs.getFirBase();
+    fir::MutableBoxValue mutableBox(lhsBoxRef, /*lenParameters=*/{},
+                                    /*mutableProperties=*/{});
+
+    bool useWorkshare = flangomp::shouldUseWorkshareLowering(assign);
+    auto storageHandler = [&](fir::ExtendedValue storage) {
+      hlfir::Entity lhsEntity{
+          fir::getBase(fir::factory::createBoxValue(builder, loc, storage))};
+      hlfir::genNoAliasArrayAssignment(loc, builder, rhs, lhsEntity,
+                                       useWorkshare,
+                                       /*temporaryLHS=*/false);
+    };
+
+    llvm::SmallVector<mlir::Value> lenParams;
+    fir::factory::MutableBoxReallocation realloc =
+        fir::factory::genReallocIfNeeded(builder, loc, mutableBox, rhsExtents,
+                                         lenParams, storageHandler);
+    fir::factory::finalizeRealloc(builder, loc, mutableBox, /*lbounds=*/{},
+                                  /*takeLboundsIfRealloc=*/true, realloc);
+
+    rewriter.eraseOp(assign);
+    return mlir::success();
+  }
+};
+
 class InlineHLFIRAssignPass
     : public hlfir::impl::InlineHLFIRAssignBase<InlineHLFIRAssignPass> {
 public:
@@ -299,12 +412,12 @@ class InlineHLFIRAssignPass
     mlir::RewritePatternSet patterns(context);
     patterns.insert<InlineHLFIRAssignConversion>(context);
 
-    // Optionally add the allocatable expr assignment pattern
     if (inlineAllocatableExprAssignFlag) {
       LLVM_DEBUG(llvm::dbgs()
-                 << "InlineHLFIRAssign: enabling allocatable expr assignment "
+                 << "InlineHLFIRAssign: enabling allocatable assignment "
                     "inlining\n");
       patterns.insert<InlineAllocatableExprAssignConversion>(context);
+      patterns.insert<InlineAllocatableVarAssignConversion>(context);
     }
 
     if (mlir::failed(mlir::applyPatternsGreedily(
diff --git a/flang/test/HLFIR/inline-hlfir-assign-allocatable-expr.fir b/flang/test/HLFIR/inline-hlfir-assign-allocatable-expr.fir
index 6f3b5ea0eb794..cb9a88bece726 100644
--- a/flang/test/HLFIR/inline-hlfir-assign-allocatable-expr.fir
+++ b/flang/test/HLFIR/inline-hlfir-assign-allocatable-expr.fir
@@ -1,8 +1,8 @@
 // Test inlining of hlfir.assign for allocatable LHS with hlfir.expr RHS.
-// This tests the -inline-hlfir-allocatable-expr-assign flag.
+// Allocatable expr assignment inlining is enabled by default.
 
-// RUN: fir-opt --inline-hlfir-assign %s | FileCheck %s --check-prefix=DEFAULT
-// RUN: fir-opt -inline-hlfir-allocatable-expr-assign --inline-hlfir-assign %s | FileCheck %s --check-prefix=ENABLED
+// RUN: fir-opt --inline-hlfir-assign %s | FileCheck %s --check-prefix=ENABLED
+// RUN: fir-opt -inline-hlfir-allocatable-expr-assign=false --inline-hlfir-assign %s | FileCheck %s --check-prefix=DISABLED
 
 // Test case: c = cos(a) where c is allocatable
 // This is derived from the flang-529628 test case.
@@ -39,9 +39,9 @@ func.func @test_allocatable_elemental_assign(%arg0: !fir.ref<!fir.box<!fir.heap<
   return
 }
 
-// DEFAULT-LABEL: func.func @test_allocatable_elemental_assign
-// By default (without the option), the allocatable assign should NOT be inlined
-// DEFAULT: hlfir.assign %{{.*}} to %{{.*}} realloc : !hlfir.expr<?xf64>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>
+// DISABLED-LABEL: func.func @test_allocatable_elemental_assign
+// With the option disabled, the allocatable assign should NOT be inlined
+// DISABLED: hlfir.assign %{{.*}} to %{{.*}} realloc : !hlfir.expr<?xf64>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>
 
 // ENABLED-LABEL: func.func @test_allocatable_elemental_assign
 // With the option enabled, the assign should be inlined.
@@ -81,9 +81,13 @@ func.func @test_allocatable_nontrivial_type(%arg0: !fir.ref<!fir.box<!fir.heap<!
 // Character types are not trivial, so this should never be inlined
 // ENABLED-LABEL: func.func @test_allocatable_nontrivial_type
 // ENABLED: hlfir.assign %{{.*}} to %{{.*}} realloc : !hlfir.expr<?x!fir.char<1,10>>
+// DISABLED-LABEL: func.func @test_allocatable_nontrivial_type
+// DISABLED: hlfir.assign %{{.*}} to %{{.*}} realloc : !hlfir.expr<?x!fir.char<1,10>>
 
 
-// Test case: Variable RHS (not hlfir.expr) should NOT be inlined by this pattern
+// Test case: Variable RHS (not hlfir.expr).  LHS and RHS are from different
+// function arguments with no aliasing, so the variable-RHS allocatable
+// pattern should inline the assignment.
 func.func @test_allocatable_variable_rhs(%arg0: !fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>, %arg1: !fir.ref<!fir.array<10xf64>>) {
   %c10 = arith.constant 10 : index
   %shape = fir.shape %c10 : (index) -> !fir.shape<1>
@@ -91,11 +95,13 @@ func.func @test_allocatable_variable_rhs(%arg0: !fir.ref<!fir.box<!fir.heap<!fir
   %a:2 = hlfir.declare %arg0 {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFEa"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>)
   %b:2 = hlfir.declare %arg1(%shape) {uniq_name = "_QFEb"} : (!fir.ref<!fir.array<10xf64>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xf64>>, !fir.ref<!fir.array<10xf64>>)
 
-  // Variable RHS - NOT an hlfir.expr, so the allocatable pattern should NOT match
   hlfir.assign %b#0 to %a#0 realloc : !fir.ref<!fir.array<10xf64>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>
   return
 }
 
-// Variable RHS should keep the original assign (allocatable pattern doesn't match)
 // ENABLED-LABEL: func.func @test_allocatable_variable_rhs
-// ENABLED: hlfir.assign %{{.*}} to %{{.*}} realloc : !fir.ref<!fir.array<10xf64>>
+// ENABLED-NOT: hlfir.assign{{.*}}realloc
+// ENABLED: fir.do_loop
+// ENABLED: hlfir.assign %{{.*}} : f64, !fir.ref<f64>
+// DISABLED-LABEL: func.func @test_allocatable_variable_rhs
+// DISABLED: hlfir.assign %{{.*}} to %{{.*}} realloc : !fir.ref<!fir.array<10xf64>>
diff --git a/flang/test/HLFIR/inline-hlfir-assign.fir b/flang/test/HLFIR/inline-hlfir-assign.fir
index 797ef6e81946a..3cdc8aeed0ade 100644
--- a/flang/test/HLFIR/inline-hlfir-assign.fir
+++ b/flang/test/HLFIR/inline-hlfir-assign.fir
@@ -165,8 +165,8 @@ func.func @_QPtest3(%arg0: !fir.box<!fir.array<?x?xf32>> {fir.bindc_name = "x"})
 // CHECK:         }
 
 
-// The LHS is a whole allocatable, so the assignment may imply
-// allocation. This is not currently supported.
+// LHS is a whole allocatable.  RHS is a local array that cannot alias the
+// allocatable, so the assignment is inlined with realloc-if-needed logic.
 func.func @_QPtest4(%arg0: !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>> {fir.bindc_name = "x"}) {
   %c3 = arith.constant 3 : index
   %0:2 = hlfir.declare %arg0 {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest4Ex"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>)
@@ -177,9 +177,10 @@ func.func @_QPtest4(%arg0: !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>> {f
   return
 }
 // CHECK-LABEL:   func.func @_QPtest4(
-// CHECK-NOT:       hlfir.assign
-// CHECK:           hlfir.assign %{{.*}} to %{{.*}} realloc : !fir.ref<!fir.array<3x3xf32>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>
-// CHECK-NOT:       hlfir.assign
+// CHECK-NOT:       hlfir.assign{{.*}}realloc
+// CHECK:           fir.do_loop
+// CHECK:             hlfir.assign %{{.*}} to %{{.*}} : f32, !fir.ref<f32>
+// CHECK-NOT:       hlfir.assign{{.*}}realloc
 
 
 // LHS is a pointer, but RHS is a subroutine local,
diff --git a/flang/test/Integration/OpenMP/workshare-axpy.f90 b/flang/test/Integration/OpenMP/workshare-axpy.f90
index 12246e54d3432..416e455523795 100644
--- a/flang/test/Integration/OpenMP/workshare-axpy.f90
+++ b/flang/test/Integration/OpenMP/workshare-axpy.f90
@@ -48,7 +48,8 @@ subroutine sb1(a, x, y, z)
 ! FIR:      omp.wsloop {
 ! FIR:        omp.loop_nest
 ! FIR:      omp.single nowait {
-! FIR:        fir.call @_FortranAAssign
+! FIR:        fir.if
+! FIR:          fir.freemem
 ! FIR:        fir.freemem
 ! FIR:        omp.terminator
 ! FIR:      }

>From 786d18ef353dc317db77785a801f93464a7d3a3a Mon Sep 17 00:00:00 2001
From: Kazuaki Matsumura <kmatsumura at nvidia.com>
Date: Thu, 14 May 2026 14:37:49 -0700
Subject: [PATCH 2/8] clang-format fix

---
 .../HLFIR/Transforms/InlineHLFIRAssign.cpp          | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/flang/lib/Optimizer/HLFIR/Transforms/InlineHLFIRAssign.cpp b/flang/lib/Optimizer/HLFIR/Transforms/InlineHLFIRAssign.cpp
index f0ea5d3f80022..7cd7c6d2a0a0e 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/InlineHLFIRAssign.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/InlineHLFIRAssign.cpp
@@ -317,8 +317,8 @@ class InlineAllocatableVarAssignConversion
     // This pattern handles variable RHS (the expr case is handled by
     // InlineAllocatableExprAssignConversion).
     if (mlir::isa<hlfir::ExprType>(rhs.getType()))
-      return rewriter.notifyMatchFailure(assign,
-                                         "RHS is hlfir.expr - use expr pattern");
+      return rewriter.notifyMatchFailure(
+          assign, "RHS is hlfir.expr - use expr pattern");
 
     if (!rhs.isArray())
       return rewriter.notifyMatchFailure(assign,
@@ -350,11 +350,10 @@ class InlineAllocatableVarAssignConversion
       fir::ArraySectionAnalyzer::SlicesOverlapKind overlap =
           fir::ArraySectionAnalyzer::analyze(lhs, rhs);
       if (overlap == fir::ArraySectionAnalyzer::SlicesOverlapKind::Unknown) {
-        LLVM_DEBUG(llvm::dbgs()
-                   << "InlineHLFIRAssign (alloc var):\n"
-                   << "\tLHS: " << lhs << "\n"
-                   << "\tRHS: " << rhs << "\n"
-                   << "\tALIAS: " << aliasRes << "\n");
+        LLVM_DEBUG(llvm::dbgs() << "InlineHLFIRAssign (alloc var):\n"
+                                << "\tLHS: " << lhs << "\n"
+                                << "\tRHS: " << rhs << "\n"
+                                << "\tALIAS: " << aliasRes << "\n");
         return rewriter.notifyMatchFailure(assign, "RHS/LHS may alias");
       }
     }

>From 66a6f71d6431305b102366baf6ae960bfa8b0ff5 Mon Sep 17 00:00:00 2001
From: Kazuaki Matsumura <kmatsumura at nvidia.com>
Date: Fri, 15 May 2026 13:43:34 -0700
Subject: [PATCH 3/8] address review comments: refactor, rename flag, add
 lhs.isArray check

- Rename -inline-hlfir-allocatable-expr-assign to
  -inline-hlfir-allocatable-assign since it controls both expr and var
  patterns.
- Add lhs.isArray() check in shared preconditions.
- Extract checkAllocatableAssignPreconditions() and
  genAllocatableInlineAssign() helpers to eliminate code duplication.
- Propagate access groups attribute through the shared helper.
---
 .../HLFIR/Transforms/InlineHLFIRAssign.cpp    | 328 +++++++-----------
 .../inline-hlfir-assign-allocatable-expr.fir  |   2 +-
 2 files changed, 130 insertions(+), 200 deletions(-)

diff --git a/flang/lib/Optimizer/HLFIR/Transforms/InlineHLFIRAssign.cpp b/flang/lib/Optimizer/HLFIR/Transforms/InlineHLFIRAssign.cpp
index 7cd7c6d2a0a0e..2b081c0aa0568 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/InlineHLFIRAssign.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/InlineHLFIRAssign.cpp
@@ -34,10 +34,9 @@ namespace hlfir {
 
 #define DEBUG_TYPE "inline-hlfir-assign"
 
-static llvm::cl::opt<bool> inlineAllocatableExprAssignFlag(
-    "inline-hlfir-allocatable-expr-assign",
-    llvm::cl::desc("Enable inlining of allocatable assignments when RHS is an "
-                   "hlfir.expr (e.g., from hlfir.elemental)"),
+static llvm::cl::opt<bool> inlineAllocatableAssignFlag(
+    "inline-hlfir-allocatable-assign",
+    llvm::cl::desc("Enable inlining of allocatable array assignments"),
     llvm::cl::init(true));
 
 namespace {
@@ -136,29 +135,86 @@ class InlineHLFIRAssignConversion
   }
 };
 
-/// Expand hlfir.assign of hlfir.expr RHS to allocatable LHS.
-/// When RHS is an hlfir.expr (e.g., from hlfir.elemental), there is no
-/// aliasing concern because expressions don't represent memory locations.
-/// This allows us to inline the assignment even for allocatables.
-///
-/// The generated code:
-/// 1. Gets the shape from the RHS expression
-/// 2. Uses genReallocIfNeeded to handle allocation/reallocation properly
-/// 3. Generates a loop nest to assign elements (via storage handler callback)
-/// 4. Finalizes the reallocation
-///
-/// Example transformation for: allocatable_array = elemental_expr
-///   hlfir.assign %expr to %alloc realloc : !hlfir.expr<?xf64>,
-///                                          !fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>
-/// into:
-///   // Check allocation status and reallocate if needed
-///   // ... (genReallocIfNeeded handles this) ...
-///   // Loop over elements
-///   fir.do_loop %i = %c1 to %extent step %c1 unordered {
-///     %rhs_val = hlfir.apply %expr, %i : ...
-///     %lhs_elem = hlfir.designate %lhs_box (%i) : ...
-///     hlfir.assign %rhs_val to %lhs_elem : f64, !fir.ref<f64>
-///   }
+/// Check common preconditions for inlining an allocatable array assignment.
+/// Returns success if the assignment can be inlined, failure otherwise.
+static llvm::LogicalResult
+checkAllocatableAssignPreconditions(hlfir::AssignOp assign, hlfir::Entity lhs,
+                                    hlfir::Entity rhs,
+                                    mlir::PatternRewriter &rewriter) {
+  if (!assign.isAllocatableAssignment())
+    return rewriter.notifyMatchFailure(
+        assign, "AssignOp is not an allocatable assignment");
+
+  if (!rhs.isArray())
+    return rewriter.notifyMatchFailure(assign,
+                                       "AssignOp's RHS is not an array");
+
+  if (!lhs.isArray())
+    return rewriter.notifyMatchFailure(assign,
+                                       "AssignOp's LHS is not an array");
+
+  mlir::Type rhsEleTy = rhs.getFortranElementType();
+  if (!fir::isa_trivial(rhsEleTy))
+    return rewriter.notifyMatchFailure(
+        assign, "AssignOp's RHS data type is not trivial");
+
+  mlir::Type lhsEleTy = lhs.getFortranElementType();
+  if (!fir::isa_trivial(lhsEleTy))
+    return rewriter.notifyMatchFailure(
+        assign, "AssignOp's LHS data type is not trivial");
+
+  if (lhsEleTy != rhsEleTy)
+    return rewriter.notifyMatchFailure(assign,
+                                       "RHS/LHS element types mismatch");
+
+  if (!fir::isBoxAddress(lhs.getType()))
+    return rewriter.notifyMatchFailure(assign,
+                                       "LHS is not a reference to a box");
+
+  return mlir::success();
+}
+
+/// Generate realloc-if-needed + element-by-element assignment loop for an
+/// allocatable LHS.  The \p genElementAssign callback is invoked inside the
+/// storage handler with the resolved LHS storage, builder, and metadata.
+static void genAllocatableInlineAssign(
+    hlfir::AssignOp assign, hlfir::Entity rhs, hlfir::Entity lhs,
+    fir::FirOpBuilder &builder,
+    llvm::function_ref<void(fir::FirOpBuilder &, mlir::Location,
+                            hlfir::Entity rhs,
+                            const fir::ExtendedValue &lhsStorage,
+                            bool useWorkshare, mlir::ArrayAttr accessGroups)>
+        genElementAssign) {
+  mlir::Location loc = assign->getLoc();
+  builder.setInsertionPoint(assign);
+
+  mlir::Value rhsShape = hlfir::genShape(loc, builder, rhs);
+  llvm::SmallVector<mlir::Value> rhsExtents =
+      hlfir::getIndexExtents(loc, builder, rhsShape);
+
+  fir::MutableBoxValue mutableBox(lhs.getFirBase(), /*lenParameters=*/{},
+                                  /*mutableProperties=*/{});
+
+  bool useWorkshare = flangomp::shouldUseWorkshareLowering(assign);
+  mlir::ArrayAttr accessGroups;
+  if (auto attrs = assign.getOperation()->getAttrOfType<mlir::ArrayAttr>(
+          fir::getAccessGroupsAttrName()))
+    accessGroups = attrs;
+
+  auto storageHandler = [&](fir::ExtendedValue storage) {
+    genElementAssign(builder, loc, rhs, storage, useWorkshare, accessGroups);
+  };
+
+  llvm::SmallVector<mlir::Value> lenParams;
+  fir::factory::MutableBoxReallocation realloc =
+      fir::factory::genReallocIfNeeded(builder, loc, mutableBox, rhsExtents,
+                                       lenParams, storageHandler);
+  fir::factory::finalizeRealloc(builder, loc, mutableBox, /*lbounds=*/{},
+                                /*takeLboundsIfRealloc=*/true, realloc);
+}
+
+/// Inline hlfir.assign of hlfir.expr RHS to allocatable LHS.
+/// Since hlfir.expr values cannot alias memory, no alias analysis is needed.
 class InlineAllocatableExprAssignConversion
     : public mlir::OpRewritePattern<hlfir::AssignOp> {
 public:
@@ -167,138 +223,52 @@ class InlineAllocatableExprAssignConversion
   llvm::LogicalResult
   matchAndRewrite(hlfir::AssignOp assign,
                   mlir::PatternRewriter &rewriter) const override {
-    // This pattern only handles allocatable assignments
-    if (!assign.isAllocatableAssignment())
-      return rewriter.notifyMatchFailure(
-          assign, "AssignOp is not an allocatable assignment");
-
     hlfir::Entity rhs{assign.getRhs()};
     hlfir::Entity lhs{assign.getLhs()};
 
-    // RHS must be an hlfir.expr (this is the key condition - no aliasing)
     if (!mlir::isa<hlfir::ExprType>(rhs.getType()))
-      return rewriter.notifyMatchFailure(
-          assign,
-          "RHS is not an hlfir.expr - cannot inline allocatable assign");
-
-    // RHS must be an array
-    if (!rhs.isArray())
-      return rewriter.notifyMatchFailure(assign,
-                                         "AssignOp's RHS is not an array");
-
-    // Check element types are trivial and match
-    mlir::Type rhsEleTy = rhs.getFortranElementType();
-    if (!fir::isa_trivial(rhsEleTy))
-      return rewriter.notifyMatchFailure(
-          assign, "AssignOp's RHS data type is not trivial");
-
-    mlir::Type lhsEleTy = lhs.getFortranElementType();
-    if (!fir::isa_trivial(lhsEleTy))
-      return rewriter.notifyMatchFailure(
-          assign, "AssignOp's LHS data type is not trivial");
+      return rewriter.notifyMatchFailure(assign, "RHS is not an hlfir.expr");
 
-    if (lhsEleTy != rhsEleTy)
-      return rewriter.notifyMatchFailure(assign,
-                                         "RHS/LHS element types mismatch");
-
-    // LHS must be a reference to a box (allocatable)
-    mlir::Type lhsType = lhs.getType();
-    if (!fir::isBoxAddress(lhsType))
-      return rewriter.notifyMatchFailure(assign,
-                                         "LHS is not a reference to a box");
+    if (mlir::failed(
+            checkAllocatableAssignPreconditions(assign, lhs, rhs, rewriter)))
+      return mlir::failure();
 
     LLVM_DEBUG(llvm::dbgs()
                << "InlineHLFIRAssign: inlining allocatable expr assignment\n");
 
-    mlir::Location loc = assign->getLoc();
     fir::FirOpBuilder builder(rewriter, assign.getOperation());
-    builder.setInsertionPoint(assign);
+    genAllocatableInlineAssign(
+        assign, rhs, lhs, builder,
+        [](fir::FirOpBuilder &builder, mlir::Location loc, hlfir::Entity rhs,
+           const fir::ExtendedValue &lhsStorage, bool useWorkshare,
+           mlir::ArrayAttr accessGroups) {
+          hlfir::Entity lhsEntity{fir::getBase(
+              fir::factory::createBoxValue(builder, loc, lhsStorage))};
+          llvm::SmallVector<mlir::Value> extents =
+              fir::factory::getExtents(loc, builder, lhsStorage);
+          hlfir::LoopNest loopNest = hlfir::genLoopNest(
+              loc, builder, extents, /*isUnordered=*/true, useWorkshare);
+          builder.setInsertionPointToStart(loopNest.body);
+
+          hlfir::Entity rhsElement =
+              hlfir::getElementAt(loc, builder, rhs, loopNest.oneBasedIndices);
+          rhsElement = hlfir::loadTrivialScalar(loc, builder, rhsElement);
+          hlfir::Entity lhsElement = hlfir::getElementAt(
+              loc, builder, lhsEntity, loopNest.oneBasedIndices);
+          hlfir::AssignOp::create(builder, loc, rhsElement, lhsElement,
+                                  /*realloc=*/false,
+                                  /*keep_lhs_length_if_realloc=*/false,
+                                  /*temporary_lhs=*/false);
+          builder.setInsertionPointAfter(loopNest.outerOp);
+        });
 
-    // Get the shape of the RHS expression
-    mlir::Value rhsShape = hlfir::genShape(loc, builder, rhs);
-    llvm::SmallVector<mlir::Value> rhsExtents =
-        hlfir::getIndexExtents(loc, builder, rhsShape);
-
-    // Create a MutableBoxValue for the LHS allocatable
-    mlir::Value lhsBoxRef = lhs.getFirBase();
-
-    // Create MutableBoxValue - for trivial types, no length params needed
-    fir::MutableBoxValue mutableBox(lhsBoxRef, /*lenParameters=*/{},
-                                    /*mutableProperties=*/{});
-
-    // Use genReallocIfNeeded to handle allocation/reallocation properly.
-    // This implements Fortran 10.2.1.3 point 3:
-    // - If not allocated, allocate with RHS shape
-    // - If allocated with same shape, keep existing allocation
-    // - If allocated with different shape, reallocate
-    //
-    // The storage handler callback performs the actual assignment loop.
-    bool useWorkshare = flangomp::shouldUseWorkshareLowering(assign);
-    auto storageHandler = [&](fir::ExtendedValue storage) {
-      hlfir::Entity lhsEntity{
-          fir::getBase(fir::factory::createBoxValue(builder, loc, storage))};
-
-      llvm::SmallVector<mlir::Value> extents =
-          fir::factory::getExtents(loc, builder, storage);
-
-      // Generate loop nest to assign elements
-      hlfir::LoopNest loopNest = hlfir::genLoopNest(
-          loc, builder, extents, /*isUnordered=*/true, useWorkshare);
-      builder.setInsertionPointToStart(loopNest.body);
-
-      // Get RHS element via hlfir.apply
-      hlfir::Entity rhsElement =
-          hlfir::getElementAt(loc, builder, rhs, loopNest.oneBasedIndices);
-      rhsElement = hlfir::loadTrivialScalar(loc, builder, rhsElement);
-
-      // Get LHS element
-      hlfir::Entity lhsElement = hlfir::getElementAt(loc, builder, lhsEntity,
-                                                     loopNest.oneBasedIndices);
-
-      // Assign the element (scalar, non-allocatable)
-      hlfir::AssignOp::create(builder, loc, rhsElement, lhsElement,
-                              /*realloc=*/false,
-                              /*keep_lhs_length_if_realloc=*/false,
-                              /*temporary_lhs=*/false);
-
-      // Restore insertion point after loop
-      builder.setInsertionPointAfter(loopNest.outerOp);
-    };
-
-    // No length params for trivial types
-    llvm::SmallVector<mlir::Value> lenParams;
-
-    // Generate reallocation logic with assignment in the callback
-    fir::factory::MutableBoxReallocation realloc =
-        fir::factory::genReallocIfNeeded(builder, loc, mutableBox, rhsExtents,
-                                         lenParams, storageHandler);
-
-    // Finalize: free old storage if reallocated and update the mutable box
-    fir::factory::finalizeRealloc(builder, loc, mutableBox, /*lbounds=*/{},
-                                  /*takeLboundsIfRealloc=*/true, realloc);
-
-    // Erase the original assign
     rewriter.eraseOp(assign);
     return mlir::success();
   }
 };
 
-/// Expand hlfir.assign of a variable RHS to allocatable LHS.
-/// When alias analysis proves no aliasing, we can inline the assignment
-/// as realloc-if-needed + element-by-element copy loop, avoiding the
-/// runtime call.
-///
-/// Example transformation for: allocatable_array = fixed_array
-///   hlfir.assign %var to %alloc realloc : !fir.ref<!fir.array<NxT>>,
-///       !fir.ref<!fir.box<!fir.heap<!fir.array<?xT>>>>
-/// into:
-///   // Realloc if needed (shape mismatch or unallocated)
-///   // Loop over elements:
-///   fir.do_loop %i = %c1 to %extent step %c1 unordered {
-///     %rhs_val = fir.load hlfir.designate %var (%i) : ...
-///     %lhs_elem = hlfir.designate %lhs_box (%i) : ...
-///     hlfir.assign %rhs_val to %lhs_elem : T, !fir.ref<T>
-///   }
+/// Inline hlfir.assign of a variable RHS to allocatable LHS.
+/// Alias analysis must prove no aliasing between LHS and RHS.
 class InlineAllocatableVarAssignConversion
     : public mlir::OpRewritePattern<hlfir::AssignOp> {
 public:
@@ -307,43 +277,17 @@ class InlineAllocatableVarAssignConversion
   llvm::LogicalResult
   matchAndRewrite(hlfir::AssignOp assign,
                   mlir::PatternRewriter &rewriter) const override {
-    if (!assign.isAllocatableAssignment())
-      return rewriter.notifyMatchFailure(
-          assign, "AssignOp is not an allocatable assignment");
-
     hlfir::Entity rhs{assign.getRhs()};
     hlfir::Entity lhs{assign.getLhs()};
 
-    // This pattern handles variable RHS (the expr case is handled by
-    // InlineAllocatableExprAssignConversion).
     if (mlir::isa<hlfir::ExprType>(rhs.getType()))
       return rewriter.notifyMatchFailure(
           assign, "RHS is hlfir.expr - use expr pattern");
 
-    if (!rhs.isArray())
-      return rewriter.notifyMatchFailure(assign,
-                                         "AssignOp's RHS is not an array");
-
-    mlir::Type rhsEleTy = rhs.getFortranElementType();
-    if (!fir::isa_trivial(rhsEleTy))
-      return rewriter.notifyMatchFailure(
-          assign, "AssignOp's RHS data type is not trivial");
+    if (mlir::failed(
+            checkAllocatableAssignPreconditions(assign, lhs, rhs, rewriter)))
+      return mlir::failure();
 
-    mlir::Type lhsEleTy = lhs.getFortranElementType();
-    if (!fir::isa_trivial(lhsEleTy))
-      return rewriter.notifyMatchFailure(
-          assign, "AssignOp's LHS data type is not trivial");
-
-    if (lhsEleTy != rhsEleTy)
-      return rewriter.notifyMatchFailure(assign,
-                                         "RHS/LHS element types mismatch");
-
-    mlir::Type lhsType = lhs.getType();
-    if (!fir::isBoxAddress(lhsType))
-      return rewriter.notifyMatchFailure(assign,
-                                         "LHS is not a reference to a box");
-
-    // Prove LHS and RHS do not alias.
     fir::AliasAnalysis aliasAnalysis;
     mlir::AliasResult aliasRes = aliasAnalysis.alias(lhs, rhs);
     if (!aliasRes.isNo()) {
@@ -362,33 +306,19 @@ class InlineAllocatableVarAssignConversion
         llvm::dbgs()
         << "InlineHLFIRAssign: inlining allocatable variable assignment\n");
 
-    mlir::Location loc = assign->getLoc();
     fir::FirOpBuilder builder(rewriter, assign.getOperation());
-    builder.setInsertionPoint(assign);
-
-    mlir::Value rhsShape = hlfir::genShape(loc, builder, rhs);
-    llvm::SmallVector<mlir::Value> rhsExtents =
-        hlfir::getIndexExtents(loc, builder, rhsShape);
-
-    mlir::Value lhsBoxRef = lhs.getFirBase();
-    fir::MutableBoxValue mutableBox(lhsBoxRef, /*lenParameters=*/{},
-                                    /*mutableProperties=*/{});
-
-    bool useWorkshare = flangomp::shouldUseWorkshareLowering(assign);
-    auto storageHandler = [&](fir::ExtendedValue storage) {
-      hlfir::Entity lhsEntity{
-          fir::getBase(fir::factory::createBoxValue(builder, loc, storage))};
-      hlfir::genNoAliasArrayAssignment(loc, builder, rhs, lhsEntity,
-                                       useWorkshare,
-                                       /*temporaryLHS=*/false);
-    };
-
-    llvm::SmallVector<mlir::Value> lenParams;
-    fir::factory::MutableBoxReallocation realloc =
-        fir::factory::genReallocIfNeeded(builder, loc, mutableBox, rhsExtents,
-                                         lenParams, storageHandler);
-    fir::factory::finalizeRealloc(builder, loc, mutableBox, /*lbounds=*/{},
-                                  /*takeLboundsIfRealloc=*/true, realloc);
+    genAllocatableInlineAssign(
+        assign, rhs, lhs, builder,
+        [](fir::FirOpBuilder &builder, mlir::Location loc, hlfir::Entity rhs,
+           const fir::ExtendedValue &lhsStorage, bool useWorkshare,
+           mlir::ArrayAttr accessGroups) {
+          hlfir::Entity lhsEntity{fir::getBase(
+              fir::factory::createBoxValue(builder, loc, lhsStorage))};
+          hlfir::genNoAliasArrayAssignment(loc, builder, rhs, lhsEntity,
+                                           useWorkshare,
+                                           /*temporaryLHS=*/false,
+                                           /*combiner=*/nullptr, accessGroups);
+        });
 
     rewriter.eraseOp(assign);
     return mlir::success();
@@ -411,7 +341,7 @@ class InlineHLFIRAssignPass
     mlir::RewritePatternSet patterns(context);
     patterns.insert<InlineHLFIRAssignConversion>(context);
 
-    if (inlineAllocatableExprAssignFlag) {
+    if (inlineAllocatableAssignFlag) {
       LLVM_DEBUG(llvm::dbgs()
                  << "InlineHLFIRAssign: enabling allocatable assignment "
                     "inlining\n");
diff --git a/flang/test/HLFIR/inline-hlfir-assign-allocatable-expr.fir b/flang/test/HLFIR/inline-hlfir-assign-allocatable-expr.fir
index cb9a88bece726..b563b1e1182a1 100644
--- a/flang/test/HLFIR/inline-hlfir-assign-allocatable-expr.fir
+++ b/flang/test/HLFIR/inline-hlfir-assign-allocatable-expr.fir
@@ -2,7 +2,7 @@
 // Allocatable expr assignment inlining is enabled by default.
 
 // RUN: fir-opt --inline-hlfir-assign %s | FileCheck %s --check-prefix=ENABLED
-// RUN: fir-opt -inline-hlfir-allocatable-expr-assign=false --inline-hlfir-assign %s | FileCheck %s --check-prefix=DISABLED
+// RUN: fir-opt -inline-hlfir-allocatable-assign=false --inline-hlfir-assign %s | FileCheck %s --check-prefix=DISABLED
 
 // Test case: c = cos(a) where c is allocatable
 // This is derived from the flang-529628 test case.

>From 967c551a97c0b7ede8d20fc8fea54ff38b9fb3bf Mon Sep 17 00:00:00 2001
From: Kazuaki Matsumura <kmatsumura at nvidia.com>
Date: Mon, 18 May 2026 14:25:19 -0700
Subject: [PATCH 4/8] hoist allocatable reallocation before OpenACC compute
 regions

When an allocatable assignment with realloc semantics appears inside
an acc.kernels/parallel/serial region, hoist the reallocation logic
(genReallocIfNeeded + finalizeRealloc) before the compute region
and keep only the element-wise copy loop inside. This ensures the
LHS array is allocated on the host before the kernel launch, allowing
OpenACC implicit data clauses to handle device memory correctly.

Without this, the allocation happens on the device via malloc and the
host descriptor remains null, causing crashes when accessing the
array after the compute region.
---
 .../HLFIR/Transforms/InlineHLFIRAssign.cpp    | 190 +++++++++++++++---
 1 file changed, 164 insertions(+), 26 deletions(-)

diff --git a/flang/lib/Optimizer/HLFIR/Transforms/InlineHLFIRAssign.cpp b/flang/lib/Optimizer/HLFIR/Transforms/InlineHLFIRAssign.cpp
index 2b081c0aa0568..c43ad50876edc 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/InlineHLFIRAssign.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/InlineHLFIRAssign.cpp
@@ -20,6 +20,7 @@
 #include "flang/Optimizer/HLFIR/HLFIROps.h"
 #include "flang/Optimizer/HLFIR/Passes.h"
 #include "flang/Optimizer/OpenMP/Passes.h"
+#include "mlir/Dialect/OpenACC/OpenACC.h"
 #include "mlir/IR/PatternMatch.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Support/LLVM.h"
@@ -174,6 +175,108 @@ checkAllocatableAssignPreconditions(hlfir::AssignOp assign, hlfir::Entity lhs,
   return mlir::success();
 }
 
+/// Return the enclosing OpenACC compute op (kernels/parallel/serial), or null.
+static mlir::Operation *getEnclosingAccComputeOp(mlir::Operation *op) {
+  while (mlir::Operation *parent = op->getParentOp()) {
+    if (mlir::isa<mlir::acc::KernelsOp, mlir::acc::ParallelOp,
+                  mlir::acc::SerialOp>(parent))
+      return parent;
+    op = parent;
+  }
+  return nullptr;
+}
+
+/// Find a box reference suitable for deriving array extents on the host.
+/// Traces the RHS value back through loads and elementals to find a box
+/// ref defined outside the compute region.
+static mlir::Value findSourceBoxRef(hlfir::Entity rhs) {
+  mlir::Value rhsVal = rhs;
+
+  if (!mlir::isa<hlfir::ExprType>(rhs.getType())) {
+    // Var case: RHS is a loaded box — trace to the ref.
+    if (auto loadOp = rhsVal.getDefiningOp<fir::LoadOp>())
+      return loadOp.getMemref();
+    return {};
+  }
+
+  // Expr case: RHS is hlfir.expr from hlfir.elemental.
+  // Trace shape → fir.box_dims → fir.load → ref.
+  auto elemOp = rhsVal.getDefiningOp<hlfir::ElementalOp>();
+  if (!elemOp)
+    return {};
+  auto shapeOp = elemOp.getShape().getDefiningOp<fir::ShapeOp>();
+  if (!shapeOp)
+    return {};
+  for (mlir::Value ext : shapeOp.getExtents()) {
+    auto boxDimsOp = ext.getDefiningOp<fir::BoxDimsOp>();
+    if (!boxDimsOp)
+      continue;
+    if (auto loadOp = boxDimsOp.getVal().getDefiningOp<fir::LoadOp>())
+      return loadOp.getMemref();
+  }
+  return {};
+}
+
+/// Hoist the reallocation of an allocatable LHS before \p accOp on the host,
+/// then generate the copy loop at the original assign position inside the
+/// compute region.  Returns true on success.
+static bool hoistReallocBeforeAccRegion(
+    hlfir::AssignOp assign, hlfir::Entity rhs, hlfir::Entity lhs,
+    fir::FirOpBuilder &builder, mlir::Operation *accOp,
+    llvm::function_ref<void(fir::FirOpBuilder &, mlir::Location,
+                            hlfir::Entity rhs, hlfir::Entity lhsEntity,
+                            bool useWorkshare, mlir::ArrayAttr accessGroups)>
+        genCopyLoop) {
+  mlir::Location loc = assign->getLoc();
+  unsigned rank = rhs.getRank();
+  if (rank == 0)
+    return false;
+
+  mlir::Value sourceBoxRef = findSourceBoxRef(rhs);
+  if (!sourceBoxRef)
+    return false;
+
+  // 1. Set insertion before the acc compute region.
+  builder.setInsertionPoint(accOp);
+
+  // Derive RHS extents on the host.
+  mlir::Value hostBox = fir::LoadOp::create(builder, loc, sourceBoxRef);
+  llvm::SmallVector<mlir::Value> hostExtents;
+  for (unsigned i = 0; i < rank; ++i) {
+    mlir::Value dim =
+        builder.createIntegerConstant(loc, builder.getIndexType(), i);
+    auto dims = fir::BoxDimsOp::create(builder, loc, builder.getIndexType(),
+                                       builder.getIndexType(),
+                                       builder.getIndexType(), hostBox, dim);
+    hostExtents.push_back(dims.getResult(1));
+  }
+
+  // Generate realloc-if-needed on the host (no-op storage handler).
+  fir::MutableBoxValue mutableBox(lhs.getFirBase(), /*lenParameters=*/{},
+                                  /*mutableProperties=*/{});
+  auto noopHandler = [](fir::ExtendedValue) {};
+  llvm::SmallVector<mlir::Value> lenParams;
+  fir::factory::MutableBoxReallocation realloc =
+      fir::factory::genReallocIfNeeded(builder, loc, mutableBox, hostExtents,
+                                       lenParams, noopHandler);
+  fir::factory::finalizeRealloc(builder, loc, mutableBox, /*lbounds=*/{},
+                                /*takeLboundsIfRealloc=*/true, realloc);
+
+  // 2. Generate the copy loop inside the compute region.
+  builder.setInsertionPoint(assign);
+  mlir::Value lhsBox = fir::LoadOp::create(builder, loc, lhs.getFirBase());
+  hlfir::Entity lhsEntity{lhsBox};
+
+  bool useWorkshare = flangomp::shouldUseWorkshareLowering(assign);
+  mlir::ArrayAttr accessGroups;
+  if (auto attrs = assign.getOperation()->getAttrOfType<mlir::ArrayAttr>(
+          fir::getAccessGroupsAttrName()))
+    accessGroups = attrs;
+
+  genCopyLoop(builder, loc, rhs, lhsEntity, useWorkshare, accessGroups);
+  return true;
+}
+
 /// Generate realloc-if-needed + element-by-element assignment loop for an
 /// allocatable LHS.  The \p genElementAssign callback is invoked inside the
 /// storage handler with the resolved LHS storage, builder, and metadata.
@@ -237,29 +340,49 @@ class InlineAllocatableExprAssignConversion
                << "InlineHLFIRAssign: inlining allocatable expr assignment\n");
 
     fir::FirOpBuilder builder(rewriter, assign.getOperation());
+
+    // When inside an ACC compute region, hoist the reallocation to the host
+    // and keep only the copy loop on the device.
+    auto exprCopyLoop = [](fir::FirOpBuilder &builder, mlir::Location loc,
+                           hlfir::Entity rhs, hlfir::Entity lhsEntity,
+                           bool useWorkshare, mlir::ArrayAttr accessGroups) {
+      mlir::Value rhsShape = hlfir::genShape(loc, builder, rhs);
+      llvm::SmallVector<mlir::Value> extents =
+          hlfir::getIndexExtents(loc, builder, rhsShape);
+      hlfir::LoopNest loopNest = hlfir::genLoopNest(
+          loc, builder, extents, /*isUnordered=*/true, useWorkshare);
+      builder.setInsertionPointToStart(loopNest.body);
+
+      hlfir::Entity rhsElement =
+          hlfir::getElementAt(loc, builder, rhs, loopNest.oneBasedIndices);
+      rhsElement = hlfir::loadTrivialScalar(loc, builder, rhsElement);
+      hlfir::Entity lhsElement = hlfir::getElementAt(loc, builder, lhsEntity,
+                                                     loopNest.oneBasedIndices);
+      hlfir::AssignOp::create(builder, loc, rhsElement, lhsElement,
+                              /*realloc=*/false,
+                              /*keep_lhs_length_if_realloc=*/false,
+                              /*temporary_lhs=*/false);
+      builder.setInsertionPointAfter(loopNest.outerOp);
+    };
+
+    if (auto *accOp = getEnclosingAccComputeOp(assign)) {
+      if (hoistReallocBeforeAccRegion(assign, rhs, lhs, builder, accOp,
+                                      exprCopyLoop)) {
+        rewriter.eraseOp(assign);
+        return mlir::success();
+      }
+    }
+
+    // Default path: realloc + copy together at the assign position.
     genAllocatableInlineAssign(
         assign, rhs, lhs, builder,
-        [](fir::FirOpBuilder &builder, mlir::Location loc, hlfir::Entity rhs,
-           const fir::ExtendedValue &lhsStorage, bool useWorkshare,
-           mlir::ArrayAttr accessGroups) {
+        [&exprCopyLoop](fir::FirOpBuilder &builder, mlir::Location loc,
+                        hlfir::Entity rhs, const fir::ExtendedValue &lhsStorage,
+                        bool useWorkshare, mlir::ArrayAttr accessGroups) {
           hlfir::Entity lhsEntity{fir::getBase(
               fir::factory::createBoxValue(builder, loc, lhsStorage))};
-          llvm::SmallVector<mlir::Value> extents =
-              fir::factory::getExtents(loc, builder, lhsStorage);
-          hlfir::LoopNest loopNest = hlfir::genLoopNest(
-              loc, builder, extents, /*isUnordered=*/true, useWorkshare);
-          builder.setInsertionPointToStart(loopNest.body);
-
-          hlfir::Entity rhsElement =
-              hlfir::getElementAt(loc, builder, rhs, loopNest.oneBasedIndices);
-          rhsElement = hlfir::loadTrivialScalar(loc, builder, rhsElement);
-          hlfir::Entity lhsElement = hlfir::getElementAt(
-              loc, builder, lhsEntity, loopNest.oneBasedIndices);
-          hlfir::AssignOp::create(builder, loc, rhsElement, lhsElement,
-                                  /*realloc=*/false,
-                                  /*keep_lhs_length_if_realloc=*/false,
-                                  /*temporary_lhs=*/false);
-          builder.setInsertionPointAfter(loopNest.outerOp);
+          exprCopyLoop(builder, loc, rhs, lhsEntity, useWorkshare,
+                       accessGroups);
         });
 
     rewriter.eraseOp(assign);
@@ -307,17 +430,32 @@ class InlineAllocatableVarAssignConversion
         << "InlineHLFIRAssign: inlining allocatable variable assignment\n");
 
     fir::FirOpBuilder builder(rewriter, assign.getOperation());
+
+    auto varCopyLoop = [](fir::FirOpBuilder &builder, mlir::Location loc,
+                          hlfir::Entity rhs, hlfir::Entity lhsEntity,
+                          bool useWorkshare, mlir::ArrayAttr accessGroups) {
+      hlfir::genNoAliasArrayAssignment(loc, builder, rhs, lhsEntity,
+                                       useWorkshare,
+                                       /*temporaryLHS=*/false,
+                                       /*combiner=*/nullptr, accessGroups);
+    };
+
+    if (auto *accOp = getEnclosingAccComputeOp(assign)) {
+      if (hoistReallocBeforeAccRegion(assign, rhs, lhs, builder, accOp,
+                                      varCopyLoop)) {
+        rewriter.eraseOp(assign);
+        return mlir::success();
+      }
+    }
+
     genAllocatableInlineAssign(
         assign, rhs, lhs, builder,
-        [](fir::FirOpBuilder &builder, mlir::Location loc, hlfir::Entity rhs,
-           const fir::ExtendedValue &lhsStorage, bool useWorkshare,
-           mlir::ArrayAttr accessGroups) {
+        [&varCopyLoop](fir::FirOpBuilder &builder, mlir::Location loc,
+                       hlfir::Entity rhs, const fir::ExtendedValue &lhsStorage,
+                       bool useWorkshare, mlir::ArrayAttr accessGroups) {
           hlfir::Entity lhsEntity{fir::getBase(
               fir::factory::createBoxValue(builder, loc, lhsStorage))};
-          hlfir::genNoAliasArrayAssignment(loc, builder, rhs, lhsEntity,
-                                           useWorkshare,
-                                           /*temporaryLHS=*/false,
-                                           /*combiner=*/nullptr, accessGroups);
+          varCopyLoop(builder, loc, rhs, lhsEntity, useWorkshare, accessGroups);
         });
 
     rewriter.eraseOp(assign);

>From ca44670ae2cbc259537503a087d5f7038172a39b Mon Sep 17 00:00:00 2001
From: Kazuaki Matsumura <kmatsumura at nvidia.com>
Date: Mon, 18 May 2026 14:34:08 -0700
Subject: [PATCH 5/8] Revert "hoist allocatable reallocation before OpenACC
 compute regions"

This reverts commit 967c551a97c0b7ede8d20fc8fea54ff38b9fb3bf.
---
 .../HLFIR/Transforms/InlineHLFIRAssign.cpp    | 190 +++---------------
 1 file changed, 26 insertions(+), 164 deletions(-)

diff --git a/flang/lib/Optimizer/HLFIR/Transforms/InlineHLFIRAssign.cpp b/flang/lib/Optimizer/HLFIR/Transforms/InlineHLFIRAssign.cpp
index c43ad50876edc..2b081c0aa0568 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/InlineHLFIRAssign.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/InlineHLFIRAssign.cpp
@@ -20,7 +20,6 @@
 #include "flang/Optimizer/HLFIR/HLFIROps.h"
 #include "flang/Optimizer/HLFIR/Passes.h"
 #include "flang/Optimizer/OpenMP/Passes.h"
-#include "mlir/Dialect/OpenACC/OpenACC.h"
 #include "mlir/IR/PatternMatch.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Support/LLVM.h"
@@ -175,108 +174,6 @@ checkAllocatableAssignPreconditions(hlfir::AssignOp assign, hlfir::Entity lhs,
   return mlir::success();
 }
 
-/// Return the enclosing OpenACC compute op (kernels/parallel/serial), or null.
-static mlir::Operation *getEnclosingAccComputeOp(mlir::Operation *op) {
-  while (mlir::Operation *parent = op->getParentOp()) {
-    if (mlir::isa<mlir::acc::KernelsOp, mlir::acc::ParallelOp,
-                  mlir::acc::SerialOp>(parent))
-      return parent;
-    op = parent;
-  }
-  return nullptr;
-}
-
-/// Find a box reference suitable for deriving array extents on the host.
-/// Traces the RHS value back through loads and elementals to find a box
-/// ref defined outside the compute region.
-static mlir::Value findSourceBoxRef(hlfir::Entity rhs) {
-  mlir::Value rhsVal = rhs;
-
-  if (!mlir::isa<hlfir::ExprType>(rhs.getType())) {
-    // Var case: RHS is a loaded box — trace to the ref.
-    if (auto loadOp = rhsVal.getDefiningOp<fir::LoadOp>())
-      return loadOp.getMemref();
-    return {};
-  }
-
-  // Expr case: RHS is hlfir.expr from hlfir.elemental.
-  // Trace shape → fir.box_dims → fir.load → ref.
-  auto elemOp = rhsVal.getDefiningOp<hlfir::ElementalOp>();
-  if (!elemOp)
-    return {};
-  auto shapeOp = elemOp.getShape().getDefiningOp<fir::ShapeOp>();
-  if (!shapeOp)
-    return {};
-  for (mlir::Value ext : shapeOp.getExtents()) {
-    auto boxDimsOp = ext.getDefiningOp<fir::BoxDimsOp>();
-    if (!boxDimsOp)
-      continue;
-    if (auto loadOp = boxDimsOp.getVal().getDefiningOp<fir::LoadOp>())
-      return loadOp.getMemref();
-  }
-  return {};
-}
-
-/// Hoist the reallocation of an allocatable LHS before \p accOp on the host,
-/// then generate the copy loop at the original assign position inside the
-/// compute region.  Returns true on success.
-static bool hoistReallocBeforeAccRegion(
-    hlfir::AssignOp assign, hlfir::Entity rhs, hlfir::Entity lhs,
-    fir::FirOpBuilder &builder, mlir::Operation *accOp,
-    llvm::function_ref<void(fir::FirOpBuilder &, mlir::Location,
-                            hlfir::Entity rhs, hlfir::Entity lhsEntity,
-                            bool useWorkshare, mlir::ArrayAttr accessGroups)>
-        genCopyLoop) {
-  mlir::Location loc = assign->getLoc();
-  unsigned rank = rhs.getRank();
-  if (rank == 0)
-    return false;
-
-  mlir::Value sourceBoxRef = findSourceBoxRef(rhs);
-  if (!sourceBoxRef)
-    return false;
-
-  // 1. Set insertion before the acc compute region.
-  builder.setInsertionPoint(accOp);
-
-  // Derive RHS extents on the host.
-  mlir::Value hostBox = fir::LoadOp::create(builder, loc, sourceBoxRef);
-  llvm::SmallVector<mlir::Value> hostExtents;
-  for (unsigned i = 0; i < rank; ++i) {
-    mlir::Value dim =
-        builder.createIntegerConstant(loc, builder.getIndexType(), i);
-    auto dims = fir::BoxDimsOp::create(builder, loc, builder.getIndexType(),
-                                       builder.getIndexType(),
-                                       builder.getIndexType(), hostBox, dim);
-    hostExtents.push_back(dims.getResult(1));
-  }
-
-  // Generate realloc-if-needed on the host (no-op storage handler).
-  fir::MutableBoxValue mutableBox(lhs.getFirBase(), /*lenParameters=*/{},
-                                  /*mutableProperties=*/{});
-  auto noopHandler = [](fir::ExtendedValue) {};
-  llvm::SmallVector<mlir::Value> lenParams;
-  fir::factory::MutableBoxReallocation realloc =
-      fir::factory::genReallocIfNeeded(builder, loc, mutableBox, hostExtents,
-                                       lenParams, noopHandler);
-  fir::factory::finalizeRealloc(builder, loc, mutableBox, /*lbounds=*/{},
-                                /*takeLboundsIfRealloc=*/true, realloc);
-
-  // 2. Generate the copy loop inside the compute region.
-  builder.setInsertionPoint(assign);
-  mlir::Value lhsBox = fir::LoadOp::create(builder, loc, lhs.getFirBase());
-  hlfir::Entity lhsEntity{lhsBox};
-
-  bool useWorkshare = flangomp::shouldUseWorkshareLowering(assign);
-  mlir::ArrayAttr accessGroups;
-  if (auto attrs = assign.getOperation()->getAttrOfType<mlir::ArrayAttr>(
-          fir::getAccessGroupsAttrName()))
-    accessGroups = attrs;
-
-  genCopyLoop(builder, loc, rhs, lhsEntity, useWorkshare, accessGroups);
-  return true;
-}
-
 /// Generate realloc-if-needed + element-by-element assignment loop for an
 /// allocatable LHS.  The \p genElementAssign callback is invoked inside the
 /// storage handler with the resolved LHS storage, builder, and metadata.
@@ -340,49 +237,29 @@ class InlineAllocatableExprAssignConversion
                << "InlineHLFIRAssign: inlining allocatable expr assignment\n");
 
     fir::FirOpBuilder builder(rewriter, assign.getOperation());
-
-    // When inside an ACC compute region, hoist the reallocation to the host
-    // and keep only the copy loop on the device.
-    auto exprCopyLoop = [](fir::FirOpBuilder &builder, mlir::Location loc,
-                           hlfir::Entity rhs, hlfir::Entity lhsEntity,
-                           bool useWorkshare, mlir::ArrayAttr accessGroups) {
-      mlir::Value rhsShape = hlfir::genShape(loc, builder, rhs);
-      llvm::SmallVector<mlir::Value> extents =
-          hlfir::getIndexExtents(loc, builder, rhsShape);
-      hlfir::LoopNest loopNest = hlfir::genLoopNest(
-          loc, builder, extents, /*isUnordered=*/true, useWorkshare);
-      builder.setInsertionPointToStart(loopNest.body);
-
-      hlfir::Entity rhsElement =
-          hlfir::getElementAt(loc, builder, rhs, loopNest.oneBasedIndices);
-      rhsElement = hlfir::loadTrivialScalar(loc, builder, rhsElement);
-      hlfir::Entity lhsElement = hlfir::getElementAt(loc, builder, lhsEntity,
-                                                     loopNest.oneBasedIndices);
-      hlfir::AssignOp::create(builder, loc, rhsElement, lhsElement,
-                              /*realloc=*/false,
-                              /*keep_lhs_length_if_realloc=*/false,
-                              /*temporary_lhs=*/false);
-      builder.setInsertionPointAfter(loopNest.outerOp);
-    };
-
-    if (auto *accOp = getEnclosingAccComputeOp(assign)) {
-      if (hoistReallocBeforeAccRegion(assign, rhs, lhs, builder, accOp,
-                                      exprCopyLoop)) {
-        rewriter.eraseOp(assign);
-        return mlir::success();
-      }
-    }
-
-    // Default path: realloc + copy together at the assign position.
     genAllocatableInlineAssign(
         assign, rhs, lhs, builder,
-        [&exprCopyLoop](fir::FirOpBuilder &builder, mlir::Location loc,
-                        hlfir::Entity rhs, const fir::ExtendedValue &lhsStorage,
-                        bool useWorkshare, mlir::ArrayAttr accessGroups) {
+        [](fir::FirOpBuilder &builder, mlir::Location loc, hlfir::Entity rhs,
+           const fir::ExtendedValue &lhsStorage, bool useWorkshare,
+           mlir::ArrayAttr accessGroups) {
           hlfir::Entity lhsEntity{fir::getBase(
               fir::factory::createBoxValue(builder, loc, lhsStorage))};
-          exprCopyLoop(builder, loc, rhs, lhsEntity, useWorkshare,
-                       accessGroups);
+          llvm::SmallVector<mlir::Value> extents =
+              fir::factory::getExtents(loc, builder, lhsStorage);
+          hlfir::LoopNest loopNest = hlfir::genLoopNest(
+              loc, builder, extents, /*isUnordered=*/true, useWorkshare);
+          builder.setInsertionPointToStart(loopNest.body);
+
+          hlfir::Entity rhsElement =
+              hlfir::getElementAt(loc, builder, rhs, loopNest.oneBasedIndices);
+          rhsElement = hlfir::loadTrivialScalar(loc, builder, rhsElement);
+          hlfir::Entity lhsElement = hlfir::getElementAt(
+              loc, builder, lhsEntity, loopNest.oneBasedIndices);
+          hlfir::AssignOp::create(builder, loc, rhsElement, lhsElement,
+                                  /*realloc=*/false,
+                                  /*keep_lhs_length_if_realloc=*/false,
+                                  /*temporary_lhs=*/false);
+          builder.setInsertionPointAfter(loopNest.outerOp);
         });
 
     rewriter.eraseOp(assign);
@@ -430,32 +307,17 @@ class InlineAllocatableVarAssignConversion
         << "InlineHLFIRAssign: inlining allocatable variable assignment\n");
 
     fir::FirOpBuilder builder(rewriter, assign.getOperation());
-
-    auto varCopyLoop = [](fir::FirOpBuilder &builder, mlir::Location loc,
-                          hlfir::Entity rhs, hlfir::Entity lhsEntity,
-                          bool useWorkshare, mlir::ArrayAttr accessGroups) {
-      hlfir::genNoAliasArrayAssignment(loc, builder, rhs, lhsEntity,
-                                       useWorkshare,
-                                       /*temporaryLHS=*/false,
-                                       /*combiner=*/nullptr, accessGroups);
-    };
-
-    if (auto *accOp = getEnclosingAccComputeOp(assign)) {
-      if (hoistReallocBeforeAccRegion(assign, rhs, lhs, builder, accOp,
-                                      varCopyLoop)) {
-        rewriter.eraseOp(assign);
-        return mlir::success();
-      }
-    }
-
     genAllocatableInlineAssign(
         assign, rhs, lhs, builder,
-        [&varCopyLoop](fir::FirOpBuilder &builder, mlir::Location loc,
-                       hlfir::Entity rhs, const fir::ExtendedValue &lhsStorage,
-                       bool useWorkshare, mlir::ArrayAttr accessGroups) {
+        [](fir::FirOpBuilder &builder, mlir::Location loc, hlfir::Entity rhs,
+           const fir::ExtendedValue &lhsStorage, bool useWorkshare,
+           mlir::ArrayAttr accessGroups) {
           hlfir::Entity lhsEntity{fir::getBase(
               fir::factory::createBoxValue(builder, loc, lhsStorage))};
-          varCopyLoop(builder, loc, rhs, lhsEntity, useWorkshare, accessGroups);
+          hlfir::genNoAliasArrayAssignment(loc, builder, rhs, lhsEntity,
+                                           useWorkshare,
+                                           /*temporaryLHS=*/false,
+                                           /*combiner=*/nullptr, accessGroups);
         });
 
     rewriter.eraseOp(assign);

>From 7a0f260ce3978277fd835ab052c9b2efd367c2d5 Mon Sep 17 00:00:00 2001
From: Kazuaki Matsumura <kmatsumura at nvidia.com>
Date: Tue, 19 May 2026 02:09:29 -0700
Subject: [PATCH 6/8] add SeparateAllocatableAssign pass to split realloc from
 assign

Add a new HLFIR pass that transforms hlfir.assign ... realloc into
conditional reallocation followed by a non-realloc hlfir.assign.
This decouples host-side allocation from device-side computation,
which is required for OpenACC/OpenMP offloading correctness.

The pass runs unconditionally at all optimization levels, before
InlineHLFIRAssign.
---
 flang/include/flang/Optimizer/HLFIR/Passes.td |  10 ++
 .../Optimizer/HLFIR/Transforms/CMakeLists.txt |   1 +
 .../Transforms/SeparateAllocatableAssign.cpp  | 140 ++++++++++++++++++
 flang/lib/Optimizer/Passes/Pipelines.cpp      |   2 +
 .../test/Driver/mlir-debug-pass-pipeline.f90  |   5 +
 flang/test/Driver/mlir-pass-pipeline.f90      |   5 +
 .../HLFIR/separate-allocatable-assign.fir     |  98 ++++++++++++
 .../Integration/OpenMP/workshare-axpy.f90     |  12 +-
 8 files changed, 264 insertions(+), 9 deletions(-)
 create mode 100644 flang/lib/Optimizer/HLFIR/Transforms/SeparateAllocatableAssign.cpp
 create mode 100644 flang/test/HLFIR/separate-allocatable-assign.fir

diff --git a/flang/include/flang/Optimizer/HLFIR/Passes.td b/flang/include/flang/Optimizer/HLFIR/Passes.td
index dbce6c3c4ebc9..916e80b53c91f 100644
--- a/flang/include/flang/Optimizer/HLFIR/Passes.td
+++ b/flang/include/flang/Optimizer/HLFIR/Passes.td
@@ -84,6 +84,16 @@ def InlineElementals : Pass<"inline-elementals"> {
   let summary = "Inline chained hlfir.elemental operations";
 }
 
+def SeparateAllocatableAssign : Pass<"separate-allocatable-assign"> {
+  let summary = "Separate reallocation from allocatable array assignments";
+  let description = [{
+    Transform `hlfir.assign %rhs to %lhs realloc` into a conditional
+    reallocation of the LHS followed by a non-realloc `hlfir.assign`.
+    This separates host-side allocation from device-side computation
+    for OpenACC/OpenMP offloading and runs at all optimization levels.
+  }];
+}
+
 def InlineHLFIRAssign : Pass<"inline-hlfir-assign"> {
   let summary = "Inline hlfir.assign operations";
 }
diff --git a/flang/lib/Optimizer/HLFIR/Transforms/CMakeLists.txt b/flang/lib/Optimizer/HLFIR/Transforms/CMakeLists.txt
index 5c24fe58b05c4..c0c64c19e3826 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/CMakeLists.txt
+++ b/flang/lib/Optimizer/HLFIR/Transforms/CMakeLists.txt
@@ -6,6 +6,7 @@ add_flang_library(HLFIRTransforms
   ExpressionSimplification.cpp
   InlineElementals.cpp
   InlineHLFIRAssign.cpp
+  SeparateAllocatableAssign.cpp
   InlineHLFIRCopyIn.cpp
   LowerHLFIRIntrinsics.cpp
   LowerHLFIROrderedAssignments.cpp
diff --git a/flang/lib/Optimizer/HLFIR/Transforms/SeparateAllocatableAssign.cpp b/flang/lib/Optimizer/HLFIR/Transforms/SeparateAllocatableAssign.cpp
new file mode 100644
index 0000000000000..4a27b30863ac8
--- /dev/null
+++ b/flang/lib/Optimizer/HLFIR/Transforms/SeparateAllocatableAssign.cpp
@@ -0,0 +1,140 @@
+//===- SeparateAllocatableAssign.cpp - Split realloc from assign
+//-----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// Transform hlfir.assign with realloc semantics into a conditional
+// reallocation of the LHS followed by a plain hlfir.assign (without realloc).
+//
+// Before:
+//   hlfir.assign %rhs to %lhs realloc
+//
+// After:
+//   %shape = shape_of(%rhs)
+//   %new_lhs = genReallocIfNeeded(%lhs, %shape)  // host-side alloc
+//   hlfir.assign %rhs to %new_lhs                // element copy
+//
+// This is useful for OpenACC/OpenMP offloading where the allocation must
+// happen on the host before entering a device compute region.
+//===----------------------------------------------------------------------===//
+
+#include "flang/Optimizer/Builder/BoxValue.h"
+#include "flang/Optimizer/Builder/FIRBuilder.h"
+#include "flang/Optimizer/Builder/HLFIRTools.h"
+#include "flang/Optimizer/Builder/MutableBox.h"
+#include "flang/Optimizer/Dialect/FIROps.h"
+#include "flang/Optimizer/HLFIR/HLFIROps.h"
+#include "flang/Optimizer/HLFIR/Passes.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include "llvm/Support/Debug.h"
+
+namespace hlfir {
+#define GEN_PASS_DEF_SEPARATEALLOCATABLEASSIGN
+#include "flang/Optimizer/HLFIR/Passes.h.inc"
+} // namespace hlfir
+
+#define DEBUG_TYPE "separate-allocatable-assign"
+
+namespace {
+
+class SeparateAllocatableAssignConversion
+    : public mlir::OpRewritePattern<hlfir::AssignOp> {
+public:
+  using mlir::OpRewritePattern<hlfir::AssignOp>::OpRewritePattern;
+
+  llvm::LogicalResult
+  matchAndRewrite(hlfir::AssignOp assign,
+                  mlir::PatternRewriter &rewriter) const override {
+    if (!assign.isAllocatableAssignment())
+      return rewriter.notifyMatchFailure(assign, "not an allocatable assign");
+
+    hlfir::Entity rhs{assign.getRhs()};
+    hlfir::Entity lhs{assign.getLhs()};
+
+    if (!rhs.isArray())
+      return rewriter.notifyMatchFailure(assign, "RHS is not an array");
+
+    if (!lhs.isArray())
+      return rewriter.notifyMatchFailure(assign, "LHS is not an array");
+
+    mlir::Type rhsEleTy = rhs.getFortranElementType();
+    if (!fir::isa_trivial(rhsEleTy))
+      return rewriter.notifyMatchFailure(assign, "RHS type is not trivial");
+
+    mlir::Type lhsEleTy = lhs.getFortranElementType();
+    if (!fir::isa_trivial(lhsEleTy))
+      return rewriter.notifyMatchFailure(assign, "LHS type is not trivial");
+
+    if (lhsEleTy != rhsEleTy)
+      return rewriter.notifyMatchFailure(assign, "element type mismatch");
+
+    if (!fir::isBoxAddress(lhs.getType()))
+      return rewriter.notifyMatchFailure(assign, "LHS is not a box address");
+
+    LLVM_DEBUG(llvm::dbgs() << "SeparateAllocatableAssign: splitting realloc "
+                               "from assign\n");
+
+    mlir::Location loc = assign->getLoc();
+    fir::FirOpBuilder builder(rewriter, assign.getOperation());
+    builder.setInsertionPoint(assign);
+
+    // Get the shape of the RHS for reallocation.
+    mlir::Value rhsShape = hlfir::genShape(loc, builder, rhs);
+    llvm::SmallVector<mlir::Value> rhsExtents =
+        hlfir::getIndexExtents(loc, builder, rhsShape);
+
+    // Perform conditional reallocation of the LHS.
+    fir::MutableBoxValue mutableBox(lhs.getFirBase(), /*lenParameters=*/{},
+                                    /*mutableProperties=*/{});
+
+    auto noopHandler = [](fir::ExtendedValue) {};
+    llvm::SmallVector<mlir::Value> lenParams;
+    fir::factory::MutableBoxReallocation realloc =
+        fir::factory::genReallocIfNeeded(builder, loc, mutableBox, rhsExtents,
+                                         lenParams, noopHandler);
+    fir::factory::finalizeRealloc(builder, loc, mutableBox, /*lbounds=*/{},
+                                  /*takeLboundsIfRealloc=*/true, realloc);
+
+    // Load the (now properly allocated) LHS box and emit a non-realloc assign.
+    mlir::Value lhsBox = fir::LoadOp::create(builder, loc, lhs.getFirBase());
+    hlfir::AssignOp::create(builder, loc, rhs, lhsBox,
+                            /*realloc=*/false,
+                            /*keep_lhs_length_if_realloc=*/false,
+                            assign.isTemporaryLHS());
+
+    rewriter.eraseOp(assign);
+    return mlir::success();
+  }
+};
+
+class SeparateAllocatableAssignPass
+    : public hlfir::impl::SeparateAllocatableAssignBase<
+          SeparateAllocatableAssignPass> {
+public:
+  using SeparateAllocatableAssignBase<
+      SeparateAllocatableAssignPass>::SeparateAllocatableAssignBase;
+
+  void runOnOperation() override {
+    mlir::MLIRContext *context = &getContext();
+
+    mlir::GreedyRewriteConfig config;
+    config.setRegionSimplificationLevel(
+        mlir::GreedySimplifyRegionLevel::Disabled);
+
+    mlir::RewritePatternSet patterns(context);
+    patterns.insert<SeparateAllocatableAssignConversion>(context);
+
+    if (mlir::failed(mlir::applyPatternsGreedily(
+            getOperation(), std::move(patterns), config))) {
+      mlir::emitError(getOperation()->getLoc(),
+                      "failure in separate-allocatable-assign");
+      signalPassFailure();
+    }
+  }
+};
+} // namespace
diff --git a/flang/lib/Optimizer/Passes/Pipelines.cpp b/flang/lib/Optimizer/Passes/Pipelines.cpp
index 920d6f86a355e..38721a22f4db5 100644
--- a/flang/lib/Optimizer/Passes/Pipelines.cpp
+++ b/flang/lib/Optimizer/Passes/Pipelines.cpp
@@ -291,6 +291,8 @@ void createHLFIRToFIRPassPipeline(mlir::PassManager &pm,
   }
   addNestedPassToAllTopLevelOperations<PassConstructor>(
       pm, hlfir::createInlineElementals);
+  addNestedPassToAllTopLevelOperations<PassConstructor>(
+      pm, hlfir::createSeparateAllocatableAssign);
   if (optLevel.isOptimizingForSpeed()) {
     addCanonicalizerPassWithoutRegionSimplification(pm);
     pm.addPass(mlir::createCSEPass());
diff --git a/flang/test/Driver/mlir-debug-pass-pipeline.f90 b/flang/test/Driver/mlir-debug-pass-pipeline.f90
index 3f6bde2ded67b..f0bf8cbf6d1cf 100644
--- a/flang/test/Driver/mlir-debug-pass-pipeline.f90
+++ b/flang/test/Driver/mlir-debug-pass-pipeline.f90
@@ -31,14 +31,19 @@
 ! ALL-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_mapper', 'omp.declare_reduction', 'omp.private']
 ! ALL-NEXT: 'fir.global' Pipeline
 ! ALL-NEXT:   InlineElementals
+! ALL-NEXT:   SeparateAllocatableAssign
 ! ALL-NEXT: 'func.func' Pipeline
 ! ALL-NEXT:   InlineElementals
+! ALL-NEXT:   SeparateAllocatableAssign
 ! ALL-NEXT: 'omp.declare_mapper' Pipeline
 ! ALL-NEXT:   InlineElementals
+! ALL-NEXT:   SeparateAllocatableAssign
 ! ALL-NEXT: 'omp.declare_reduction' Pipeline
 ! ALL-NEXT:   InlineElementals
+! ALL-NEXT:   SeparateAllocatableAssign
 ! ALL-NEXT: 'omp.private' Pipeline
 ! ALL-NEXT:   InlineElementals
+! ALL-NEXT:   SeparateAllocatableAssign
 ! ALL-NEXT: LowerHLFIROrderedAssignments
 ! ALL-NEXT: LowerHLFIRIntrinsics
 ! ALL-NEXT: BufferizeHLFIR
diff --git a/flang/test/Driver/mlir-pass-pipeline.f90 b/flang/test/Driver/mlir-pass-pipeline.f90
index 630076a7947ff..b44f54ed8b46f 100644
--- a/flang/test/Driver/mlir-pass-pipeline.f90
+++ b/flang/test/Driver/mlir-pass-pipeline.f90
@@ -31,18 +31,23 @@
 ! ALL-NEXT:'fir.global' Pipeline
 ! O2-NEXT:   SimplifyHLFIRIntrinsics
 ! ALL:       InlineElementals
+! ALL-NEXT:  SeparateAllocatableAssign
 ! ALL-NEXT:'func.func' Pipeline
 ! O2-NEXT:   SimplifyHLFIRIntrinsics
 ! ALL:       InlineElementals
+! ALL-NEXT:  SeparateAllocatableAssign
 ! ALL-NEXT:'omp.declare_mapper' Pipeline
 ! O2-NEXT:   SimplifyHLFIRIntrinsics
 ! ALL:       InlineElementals
+! ALL-NEXT:  SeparateAllocatableAssign
 ! ALL-NEXT:'omp.declare_reduction' Pipeline
 ! O2-NEXT:   SimplifyHLFIRIntrinsics
 ! ALL:       InlineElementals
+! ALL-NEXT:  SeparateAllocatableAssign
 ! ALL-NEXT:'omp.private' Pipeline
 ! O2-NEXT:   SimplifyHLFIRIntrinsics
 ! ALL:       InlineElementals
+! ALL-NEXT:  SeparateAllocatableAssign
 ! O2-NEXT: Canonicalizer
 ! O2-NEXT: CSE
 ! O2-NEXT: (S) {{.*}} num-cse'd
diff --git a/flang/test/HLFIR/separate-allocatable-assign.fir b/flang/test/HLFIR/separate-allocatable-assign.fir
new file mode 100644
index 0000000000000..40f2d14b6e628
--- /dev/null
+++ b/flang/test/HLFIR/separate-allocatable-assign.fir
@@ -0,0 +1,98 @@
+// Test the separate-allocatable-assign pass.
+// It should transform hlfir.assign ... realloc into conditional reallocation
+// followed by a non-realloc hlfir.assign.
+
+// RUN: fir-opt --separate-allocatable-assign %s | FileCheck %s
+
+// Test: allocatable array assignment with elemental RHS
+func.func @test_expr_rhs(%arg0: !fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>, %arg1: !fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>) {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+
+  %a:2 = hlfir.declare %arg0 {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFEa"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>)
+  %c:2 = hlfir.declare %arg1 {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFEc"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>)
+
+  %a_box = fir.load %a#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>
+  %dims:3 = fir.box_dims %a_box, %c0 : (!fir.box<!fir.heap<!fir.array<?xf64>>>, index) -> (index, index, index)
+  %shape = fir.shape %dims#1 : (index) -> !fir.shape<1>
+
+  %elemental = hlfir.elemental %shape unordered : (!fir.shape<1>) -> !hlfir.expr<?xf64> {
+  ^bb0(%i: index):
+    %lb_offset = arith.subi %dims#0, %c1 : index
+    %idx = arith.addi %i, %lb_offset : index
+    %a_elem = hlfir.designate %a_box (%idx) : (!fir.box<!fir.heap<!fir.array<?xf64>>>, index) -> !fir.ref<f64>
+    %a_val = fir.load %a_elem : !fir.ref<f64>
+    %cos_val = math.cos %a_val fastmath<contract> : f64
+    hlfir.yield_element %cos_val : f64
+  }
+
+  hlfir.assign %elemental to %c#0 realloc : !hlfir.expr<?xf64>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>
+  hlfir.destroy %elemental : !hlfir.expr<?xf64>
+  return
+}
+
+// CHECK-LABEL: func.func @test_expr_rhs
+// The realloc assign should be separated into realloc + non-realloc assign.
+// CHECK-NOT: hlfir.assign{{.*}}realloc
+// CHECK: fir.if
+// CHECK: fir.allocmem
+// CHECK: %[[BOX:.*]] = fir.load %{{.*}} : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>
+// CHECK: hlfir.assign %{{.*}} to %[[BOX]] : !hlfir.expr<?xf64>, !fir.box<!fir.heap<!fir.array<?xf64>>>
+
+// Test: allocatable array assignment with variable RHS
+func.func @test_var_rhs(%arg0: !fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>, %arg1: !fir.ref<!fir.array<10xf64>>) {
+  %c10 = arith.constant 10 : index
+  %shape = fir.shape %c10 : (index) -> !fir.shape<1>
+
+  %a:2 = hlfir.declare %arg0 {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFEa"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>)
+  %b:2 = hlfir.declare %arg1(%shape) {uniq_name = "_QFEb"} : (!fir.ref<!fir.array<10xf64>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xf64>>, !fir.ref<!fir.array<10xf64>>)
+
+  hlfir.assign %b#0 to %a#0 realloc : !fir.ref<!fir.array<10xf64>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>
+  return
+}
+
+// CHECK-LABEL: func.func @test_var_rhs
+// CHECK-NOT: hlfir.assign{{.*}}realloc
+// CHECK: fir.if
+// CHECK: fir.allocmem
+// CHECK: %[[BOX:.*]] = fir.load %{{.*}} : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>
+// CHECK: hlfir.assign %{{.*}} to %[[BOX]] : !fir.ref<!fir.array<10xf64>>, !fir.box<!fir.heap<!fir.array<?xf64>>>
+
+// Test: non-trivial element type should NOT be separated
+func.func @test_nontrivial(%arg0: !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,10>>>>>) {
+  %c10 = arith.constant 10 : index
+  %shape = fir.shape %c10 : (index) -> !fir.shape<1>
+
+  %a:2 = hlfir.declare %arg0 {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFEa"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,10>>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,10>>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,10>>>>>)
+
+  %elemental = hlfir.elemental %shape unordered : (!fir.shape<1>) -> !hlfir.expr<?x!fir.char<1,10>> {
+  ^bb0(%i: index):
+    %str = fir.undefined !fir.char<1,10>
+    hlfir.yield_element %str : !fir.char<1,10>
+  }
+
+  hlfir.assign %elemental to %a#0 realloc : !hlfir.expr<?x!fir.char<1,10>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,10>>>>>
+  hlfir.destroy %elemental : !hlfir.expr<?x!fir.char<1,10>>
+  return
+}
+
+// CHECK-LABEL: func.func @test_nontrivial
+// Character types are not trivial, so the assign should remain
+// CHECK: hlfir.assign %{{.*}} to %{{.*}} realloc
+
+// Test: non-allocatable assign should NOT be modified
+func.func @test_non_allocatable(%arg0: !fir.ref<!fir.array<10xf64>>, %arg1: !fir.ref<!fir.array<10xf64>>) {
+  %c10 = arith.constant 10 : index
+  %shape = fir.shape %c10 : (index) -> !fir.shape<1>
+
+  %a:2 = hlfir.declare %arg0(%shape) {uniq_name = "_QFEa"} : (!fir.ref<!fir.array<10xf64>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xf64>>, !fir.ref<!fir.array<10xf64>>)
+  %b:2 = hlfir.declare %arg1(%shape) {uniq_name = "_QFEb"} : (!fir.ref<!fir.array<10xf64>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xf64>>, !fir.ref<!fir.array<10xf64>>)
+
+  hlfir.assign %b#0 to %a#0 : !fir.ref<!fir.array<10xf64>>, !fir.ref<!fir.array<10xf64>>
+  return
+}
+
+// CHECK-LABEL: func.func @test_non_allocatable
+// Non-allocatable assign should pass through unchanged
+// CHECK: hlfir.assign %{{.*}} to %{{.*}} : !fir.ref<!fir.array<10xf64>>, !fir.ref<!fir.array<10xf64>>
+// CHECK-NOT: fir.if
diff --git a/flang/test/Integration/OpenMP/workshare-axpy.f90 b/flang/test/Integration/OpenMP/workshare-axpy.f90
index 416e455523795..bb5fba751cbda 100644
--- a/flang/test/Integration/OpenMP/workshare-axpy.f90
+++ b/flang/test/Integration/OpenMP/workshare-axpy.f90
@@ -38,21 +38,15 @@ subroutine sb1(a, x, y, z)
 ! HLFIR:}
 
 
-! FIR:  func.func private @_workshare_copy_heap_Uxi32(%{{[a-z0-9]+}}: !fir.ref<!fir.heap<!fir.array<?xi32>>>, %{{[a-z0-9]+}}: !fir.ref<!fir.heap<!fir.array<?xi32>>>
+! FIR:  func.func private @_workshare_copy_box_heap_Uxi32(%{{[a-z0-9]+}}: !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, %{{[a-z0-9]+}}: !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
 ! FIR:  func.func private @_workshare_copy_i32(%{{[a-z0-9]+}}: !fir.ref<i32>, %{{[a-z0-9]+}}: !fir.ref<i32>
 
 ! FIR:  func.func @_QPsb1
 ! FIR:    omp.parallel {
-! FIR:      omp.single copyprivate(%{{[a-z0-9]+}} -> @_workshare_copy_i32 : !fir.ref<i32>, %{{[a-z0-9]+}} -> @_workshare_copy_heap_Uxi32 : !fir.ref<!fir.heap<!fir.array<?xi32>>>) {
+! FIR:      omp.single copyprivate(%{{[a-z0-9]+}} -> @_workshare_copy_i32 : !fir.ref<i32>, %{{[a-z0-9]+}} -> @_workshare_copy_box_heap_Uxi32 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) {
 ! FIR:        fir.allocmem
-! FIR:      omp.wsloop {
+! FIR:      omp.wsloop nowait {
 ! FIR:        omp.loop_nest
-! FIR:      omp.single nowait {
-! FIR:        fir.if
-! FIR:          fir.freemem
-! FIR:        fir.freemem
-! FIR:        omp.terminator
-! FIR:      }
 ! FIR:      omp.barrier
 ! FIR:      omp.terminator
 ! FIR:    }

>From 09914c074775c49ccf606e74a44f256e8eb48297 Mon Sep 17 00:00:00 2001
From: Kazuaki Matsumura <kmatsumura at nvidia.com>
Date: Tue, 19 May 2026 07:34:06 -0700
Subject: [PATCH 7/8] fix test regressions from SeparateAllocatableAssign
 pipeline addition

Update basic-program.fir pipeline checks and
parallel-private-reduction-worstcase.f90 LLVM IR label numbers
to account for the new SeparateAllocatableAssign pass.
---
 flang/test/Fir/basic-program.fir              |  5 +
 .../parallel-private-reduction-worstcase.f90  | 98 +++++++++----------
 2 files changed, 54 insertions(+), 49 deletions(-)

diff --git a/flang/test/Fir/basic-program.fir b/flang/test/Fir/basic-program.fir
index 1e26b388267b6..58bc2d3d56ae3 100644
--- a/flang/test/Fir/basic-program.fir
+++ b/flang/test/Fir/basic-program.fir
@@ -21,18 +21,23 @@ func.func @_QQmain() {
 // PASSES-NEXT: 'fir.global' Pipeline
 // PASSES-NEXT:   SimplifyHLFIRIntrinsics
 // PASSES-NEXT:   InlineElementals
+// PASSES-NEXT:   SeparateAllocatableAssign
 // PASSES-NEXT: 'func.func' Pipeline
 // PASSES-NEXT:   SimplifyHLFIRIntrinsics
 // PASSES-NEXT:   InlineElementals
+// PASSES-NEXT:   SeparateAllocatableAssign
 // PASSES-NEXT:  'omp.declare_mapper' Pipeline
 // PASSES-NEXT:   SimplifyHLFIRIntrinsics
 // PASSES-NEXT:   InlineElementals
+// PASSES-NEXT:   SeparateAllocatableAssign
 // PASSES-NEXT: 'omp.declare_reduction' Pipeline
 // PASSES-NEXT:   SimplifyHLFIRIntrinsics
 // PASSES-NEXT:   InlineElementals
+// PASSES-NEXT:   SeparateAllocatableAssign
 // PASSES-NEXT: 'omp.private' Pipeline
 // PASSES-NEXT:   SimplifyHLFIRIntrinsics
 // PASSES-NEXT:   InlineElementals
+// PASSES-NEXT:   SeparateAllocatableAssign
 // PASSES-NEXT:   Canonicalizer
 // PASSES-NEXT:   CSE
 // PASSES-NEXT:    (S) 0 num-cse'd - Number of operations CSE'd
diff --git a/flang/test/Integration/OpenMP/parallel-private-reduction-worstcase.f90 b/flang/test/Integration/OpenMP/parallel-private-reduction-worstcase.f90
index c6a46691d58f5..2c0de1c833cfb 100644
--- a/flang/test/Integration/OpenMP/parallel-private-reduction-worstcase.f90
+++ b/flang/test/Integration/OpenMP/parallel-private-reduction-worstcase.f90
@@ -83,72 +83,72 @@ subroutine worst_case(a, b, c, d)
 ! CHECK:       omp.private.copy12:                               ; preds = %omp.private.copy
 !                [begin firstprivate copy for first var]
 !                [read the length, is it non-zero?]
-! CHECK:         br i1 %{{.*}}, label %omp.private.copy13, label %omp.private.copy14
+! CHECK:         br i1 %{{.*}}, label %omp.private.copy13, label %omp.private.copy22
 
-! CHECK:       omp.private.copy14:                               ; preds = %omp.private.copy13, %omp.private.copy12
+! CHECK:       omp.private.copy22:                               ; preds = %omp.private.copy21, %omp.private.copy12
 ! CHECK-NEXT:    br label %omp.region.cont11
 
-! CHECK:       omp.region.cont11:                                 ; preds = %omp.private.copy14
+! CHECK:       omp.region.cont11:                                ; preds = %omp.private.copy22
 ! CHECK-NEXT:    %{{.*}} = phi ptr
-! CHECK-NEXT:    br label %omp.private.copy16
+! CHECK-NEXT:    br label %omp.private.copy24
 
-! CHECK:       omp.private.copy16:                               ; preds = %omp.region.cont11
+! CHECK:       omp.private.copy24:                               ; preds = %omp.region.cont11
 !                [begin firstprivate copy for second var]
 !                [read the length, is it non-zero?]
-! CHECK:         br i1 %{{.*}}, label %omp.private.copy17, label %omp.private.copy18
+! CHECK:         br i1 %{{.*}}, label %omp.private.copy25, label %omp.private.copy34
 
-! CHECK:       omp.private.copy18:                               ; preds = %omp.private.copy17, %omp.private.copy16
-! CHECK-NEXT:    br label %omp.region.cont15
+! CHECK:       omp.private.copy34:                               ; preds = %omp.private.copy33, %omp.private.copy24
+! CHECK-NEXT:    br label %omp.region.cont23
 
-! CHECK:       omp.region.cont15:                                ; preds = %omp.private.copy18
+! CHECK:       omp.region.cont23:                                ; preds = %omp.private.copy34
 ! CHECK-NEXT:    %{{.*}} = phi ptr
 ! CHECK-NEXT:    br label %omp.reduction.init
 
-! CHECK:       omp.reduction.init:                               ; preds = %omp.region.cont15
+! CHECK:       omp.reduction.init:                               ; preds = %omp.region.cont23
 !                [deffered stores for results of reduction alloc regions]
 ! CHECK:         br label %[[VAL_96:.*]]
 
 ! CHECK:       omp.reduction.neutral:                            ; preds = %omp.reduction.init
 !                [start of reduction initialization region]
 !                [null check:]
-! CHECK:         br i1 %{{.*}}, label %omp.reduction.neutral20, label %omp.reduction.neutral21
+! CHECK:         br i1 %{{.*}}, label %omp.reduction.neutral36, label %omp.reduction.neutral37
 
-! CHECK:       omp.reduction.neutral21:                          ; preds = %omp.reduction.neutral
+! CHECK:       omp.reduction.neutral37:                          ; preds = %omp.reduction.neutral
 !                [malloc and assign the default value to the reduction variable]
-! CHECK:         br label %omp.reduction.neutral22
+! CHECK:         br label %omp.reduction.neutral38
 
-! CHECK:       omp.reduction.neutral22:                          ; preds = %omp.reduction.neutral20, %omp.reduction.neutral21
-! CHECK-NEXT:    br label %omp.region.cont19
+! CHECK:       omp.reduction.neutral38:                          ; preds = %omp.reduction.neutral36, %omp.reduction.neutral37
+! CHECK-NEXT:    br label %omp.region.cont35
 
-! CHECK:       omp.region.cont19:                                ; preds = %omp.reduction.neutral22
+! CHECK:       omp.region.cont35:                                ; preds = %omp.reduction.neutral38
 ! CHECK-NEXT:    %{{.*}} = phi ptr
-! CHECK-NEXT:    br label %omp.reduction.neutral24
+! CHECK-NEXT:    br label %omp.reduction.neutral40
 
-! CHECK:       omp.reduction.neutral24:                          ; preds = %omp.region.cont19
+! CHECK:       omp.reduction.neutral40:                          ; preds = %omp.region.cont35
 !                [start of reduction initialization region]
 !                [null check:]
-! CHECK:         br i1 %{{.*}}, label %omp.reduction.neutral25, label %omp.reduction.neutral26
+! CHECK:         br i1 %{{.*}}, label %omp.reduction.neutral41, label %omp.reduction.neutral42
 
-! CHECK:       omp.reduction.neutral26:                          ; preds = %omp.reduction.neutral24
+! CHECK:       omp.reduction.neutral42:                          ; preds = %omp.reduction.neutral40
 !                [malloc and assign the default value to the reduction variable]
-! CHECK:         br label %omp.reduction.neutral27
+! CHECK:         br label %omp.reduction.neutral43
 
-! CHECK:       omp.reduction.neutral27:                          ; preds = %omp.reduction.neutral25, %omp.reduction.neutral26
-! CHECK-NEXT:    br label %omp.region.cont23
+! CHECK:       omp.reduction.neutral43:                          ; preds = %omp.reduction.neutral41, %omp.reduction.neutral42
+! CHECK-NEXT:    br label %omp.region.cont39
 
-! CHECK:       omp.region.cont23:                                ; preds = %omp.reduction.neutral27
+! CHECK:       omp.region.cont39:                                ; preds = %omp.reduction.neutral43
 ! CHECK-NEXT:    %{{.*}} = phi ptr
-! CHECK-NEXT:    br label %omp.par.region29
+! CHECK-NEXT:    br label %omp.par.region45
 
-! CHECK:       omp.par.region29:                                 ; preds = %omp.region.cont23
+! CHECK:       omp.par.region45:                                 ; preds = %omp.region.cont39
 !                [call SUM runtime function]
 !                [if (sum(a) == 1)]
-! CHECK:         br i1 %{{.*}}, label %omp.par.region30, label %omp.par.region31
+! CHECK:         br i1 %{{.*}}, label %omp.par.region46, label %omp.par.region47
 
-! CHECK:       omp.par.region31:                                 ; preds = %omp.par.region29
-! CHECK-NEXT:    br label %omp.region.cont28
+! CHECK:       omp.par.region47:                                 ; preds = %omp.par.region45
+! CHECK-NEXT:    br label %omp.region.cont44
 
-! CHECK:       omp.region.cont28:                                ; preds = %omp.par.region30, %omp.par.region31
+! CHECK:       omp.region.cont44:                                ; preds = %omp.par.region46, %omp.par.region47
 !                [omp parallel region done, call into the runtime to complete reduction]
 ! CHECK:         %[[VAL_233:.*]] = call i32 @__kmpc_reduce(
 ! CHECK:         switch i32 %[[VAL_233]], label %reduce.finalize [
@@ -156,16 +156,16 @@ subroutine worst_case(a, b, c, d)
 ! CHECK-NEXT:      i32 2, label %reduce.switch.atomic
 ! CHECK-NEXT:    ]
 
-! CHECK:       reduce.switch.atomic:                             ; preds = %omp.region.cont28
+! CHECK:       reduce.switch.atomic:                             ; preds = %omp.region.cont44
 ! CHECK-NEXT:    unreachable
 
-! CHECK:       reduce.switch.nonatomic:                          ; preds = %omp.region.cont28
+! CHECK:       reduce.switch.nonatomic:                          ; preds = %omp.region.cont44
 ! CHECK-NEXT:    %[[red_private_value_0:.*]] = load ptr, ptr %{{.*}}, align 8
 ! CHECK-NEXT:    br label %omp.reduction.nonatomic.body
 
 !              [various blocks implementing the reduction]
 
-! CHECK:       omp.region.cont36:                                ; preds =
+! CHECK:       omp.region.cont52:                                ; preds =
 ! CHECK-NEXT:    %{{.*}} = phi ptr
 ! CHECK-NEXT:    call void @__kmpc_end_reduce(
 ! CHECK-NEXT:    br label %reduce.finalize
@@ -182,37 +182,37 @@ subroutine worst_case(a, b, c, d)
 
 ! CHECK:       omp.reduction.cleanup:                            ; preds = %.fini
 !                [null check]
-! CHECK:         br i1 %{{.*}}, label %omp.reduction.cleanup42, label %omp.reduction.cleanup43
+! CHECK:         br i1 %{{.*}}, label %omp.reduction.cleanup58, label %omp.reduction.cleanup59
 
-! CHECK:       omp.reduction.cleanup43:                          ; preds = %omp.reduction.cleanup42, %omp.reduction.cleanup
-! CHECK-NEXT:    br label %omp.region.cont41
+! CHECK:       omp.reduction.cleanup59:                          ; preds = %omp.reduction.cleanup58, %omp.reduction.cleanup
+! CHECK-NEXT:    br label %omp.region.cont57
 
-! CHECK:       omp.region.cont41:                                ; preds = %omp.reduction.cleanup43
+! CHECK:       omp.region.cont57:                                ; preds = %omp.reduction.cleanup59
 ! CHECK-NEXT:    %{{.*}} = load ptr, ptr
-! CHECK-NEXT:    br label %omp.reduction.cleanup45
+! CHECK-NEXT:    br label %omp.reduction.cleanup61
 
-! CHECK:       omp.reduction.cleanup45:                          ; preds = %omp.region.cont41
+! CHECK:       omp.reduction.cleanup61:                          ; preds = %omp.region.cont57
 !                [null check]
-! CHECK:         br i1 %{{.*}}, label %omp.reduction.cleanup46, label %omp.reduction.cleanup47
+! CHECK:         br i1 %{{.*}}, label %omp.reduction.cleanup62, label %omp.reduction.cleanup63
 
-! CHECK:       omp.par.region30:                                 ; preds = %omp.par.region29
+! CHECK:       omp.par.region46:                                 ; preds = %omp.par.region45
 ! CHECK-NEXT:    call void @_FortranAStopStatement
 
-! CHECK:       omp.reduction.neutral25:                          ; preds = %omp.reduction.neutral24
+! CHECK:       omp.reduction.neutral41:                          ; preds = %omp.reduction.neutral40
 !                [source length was zero: finish initializing array]
-! CHECK:         br label %omp.reduction.neutral27
+! CHECK:         br label %omp.reduction.neutral43
 
-! CHECK:       omp.reduction.neutral20:                          ; preds = %omp.reduction.neutral
+! CHECK:       omp.reduction.neutral36:                          ; preds = %omp.reduction.neutral
 !                [source length was zero: finish initializing array]
-! CHECK:         br label %omp.reduction.neutral22
+! CHECK:         br label %omp.reduction.neutral38
 
-! CHECK:       omp.private.copy17:                               ; preds = %omp.private.copy16
+! CHECK:       omp.private.copy25:                               ; preds = %omp.private.copy24
 !                [source length was non-zero: call assign runtime]
-! CHECK:         br label %omp.private.copy18
+! CHECK:         br label %omp.private.copy34
 
 ! CHECK:       omp.private.copy13:                               ; preds = %omp.private.copy12
 !                [source length was non-zero: call assign runtime]
-! CHECK:         br label %omp.private.copy14
+! CHECK:         br label %omp.private.copy22
 
 ! CHECK:       omp.private.init8:                               ; preds = %omp.private.init7
 !                [var extent was non-zero: malloc a private array]
@@ -222,5 +222,5 @@ subroutine worst_case(a, b, c, d)
 !                [var extent was non-zero: malloc a private array]
 ! CHECK:         br label %omp.private.init5
 
-! CHECK:       omp.par.exit.exitStub:                           ; preds = %omp.region.cont51
+! CHECK:       omp.par.exit.exitStub:                           ; preds = %omp.region.cont67
 ! CHECK-NEXT:    ret void

>From d51606c121f8c209c2f3544aa07e42c4db41df16 Mon Sep 17 00:00:00 2001
From: Kazuaki Matsumura <kmatsumura at nvidia.com>
Date: Tue, 19 May 2026 08:40:40 -0700
Subject: [PATCH 8/8] address review: fix lower bounds, add alias guard, fix
 header nit

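- Propagate RHS lower bounds to finalizeRealloc (F2018 10.2.1.3) in
  both SeparateAllocatableAssign and InlineHLFIRAssign. Lower bounds
  are taken from the RHS only when it is a variable; an hlfir.expr RHS
  always has LBOUND 1, so no explicit lower bounds are passed for it.
- Add a use-def chain walk in SeparateAllocatableAssign that skips the
  transformation when a variable RHS derives from the LHS (e.g.
  a = a(:n)), avoiding a use-after-free caused by the reallocation
  freeing the old storage. An hlfir.expr RHS is not walked.
- Fix file header formatting nit.
- Add test cases for self-aliasing and lower bounds propagation.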
---
 .../HLFIR/Transforms/InlineHLFIRAssign.cpp    | 12 ++++-
 .../Transforms/SeparateAllocatableAssign.cpp  | 45 +++++++++++++++---
 .../HLFIR/separate-allocatable-assign.fir     | 47 +++++++++++++++++++
 3 files changed, 97 insertions(+), 7 deletions(-)

diff --git a/flang/lib/Optimizer/HLFIR/Transforms/InlineHLFIRAssign.cpp b/flang/lib/Optimizer/HLFIR/Transforms/InlineHLFIRAssign.cpp
index 2b081c0aa0568..94a55e7016997 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/InlineHLFIRAssign.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/InlineHLFIRAssign.cpp
@@ -192,6 +192,16 @@ static void genAllocatableInlineAssign(
   llvm::SmallVector<mlir::Value> rhsExtents =
       hlfir::getIndexExtents(loc, builder, rhsShape);
 
+  // F2018 10.2.1.3: when the LHS is (re-)allocated, its lower bounds
+  // come from LBOUND(rhs).  For variable RHS, extract the actual lower
+  // bounds from the entity; for hlfir.expr RHS, LBOUND is always 1.
+  llvm::SmallVector<mlir::Value> rhsLbounds;
+  if (!mlir::isa<hlfir::ExprType>(rhs.getType())) {
+    auto bounds = hlfir::genBounds(loc, builder, rhs);
+    for (auto &[lb, ub] : bounds)
+      rhsLbounds.push_back(lb);
+  }
+
   fir::MutableBoxValue mutableBox(lhs.getFirBase(), /*lenParameters=*/{},
                                   /*mutableProperties=*/{});
 
@@ -209,7 +219,7 @@ static void genAllocatableInlineAssign(
   fir::factory::MutableBoxReallocation realloc =
       fir::factory::genReallocIfNeeded(builder, loc, mutableBox, rhsExtents,
                                        lenParams, storageHandler);
-  fir::factory::finalizeRealloc(builder, loc, mutableBox, /*lbounds=*/{},
+  fir::factory::finalizeRealloc(builder, loc, mutableBox, rhsLbounds,
                                 /*takeLboundsIfRealloc=*/true, realloc);
 }
 
diff --git a/flang/lib/Optimizer/HLFIR/Transforms/SeparateAllocatableAssign.cpp b/flang/lib/Optimizer/HLFIR/Transforms/SeparateAllocatableAssign.cpp
index 4a27b30863ac8..548fb2f2c7583 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/SeparateAllocatableAssign.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/SeparateAllocatableAssign.cpp
@@ -1,5 +1,4 @@
-//===- SeparateAllocatableAssign.cpp - Split realloc from assign
-//-----------===//
+//===- SeparateAllocatableAssign.cpp - Split realloc from assign ----------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -76,6 +75,33 @@ class SeparateAllocatableAssignConversion
     if (!fir::isBoxAddress(lhs.getType()))
       return rewriter.notifyMatchFailure(assign, "LHS is not a box address");
 
+    // Reallocation frees the old LHS storage. If RHS references that same
+    // storage (e.g. a = a(:n)), the subsequent non-realloc assign would read
+    // freed memory. Walk the RHS use-def chain through defining-op operands;
+    // if it reaches the LHS SSA value itself, keep the original realloc
+    // assign. This is an SSA-identity check; block arguments end the walk.
+    if (!mlir::isa<hlfir::ExprType>(rhs.getType())) {
+      llvm::SmallVector<mlir::Value> worklist = {rhs};
+      llvm::DenseSet<mlir::Value> visited;
+      bool mayAlias = false;
+      while (!worklist.empty()) {
+        mlir::Value v = worklist.pop_back_val();
+        if (!visited.insert(v).second)
+          continue;
+        if (v == lhs) {
+          mayAlias = true;
+          break;
+        }
+        auto *defOp = v.getDefiningOp();
+        if (!defOp)
+          continue;
+        for (mlir::Value operand : defOp->getOperands())
+          worklist.push_back(operand);
+      }
+      if (mayAlias)
+        return rewriter.notifyMatchFailure(assign, "LHS and RHS may alias");
+    }
+
     LLVM_DEBUG(llvm::dbgs() << "SeparateAllocatableAssign: splitting realloc "
                                "from assign\n");
 
@@ -83,12 +109,20 @@ class SeparateAllocatableAssignConversion
     fir::FirOpBuilder builder(rewriter, assign.getOperation());
     builder.setInsertionPoint(assign);
 
-    // Get the shape of the RHS for reallocation.
     mlir::Value rhsShape = hlfir::genShape(loc, builder, rhs);
     llvm::SmallVector<mlir::Value> rhsExtents =
         hlfir::getIndexExtents(loc, builder, rhsShape);
 
-    // Perform conditional reallocation of the LHS.
+    // F2018 10.2.1.3: when the LHS is (re-)allocated, its lower bounds
+    // come from LBOUND(rhs).  For variable RHS, extract the actual lower
+    // bounds from the entity; for hlfir.expr RHS, LBOUND is always 1.
+    llvm::SmallVector<mlir::Value> rhsLbounds;
+    if (!mlir::isa<hlfir::ExprType>(rhs.getType())) {
+      auto bounds = hlfir::genBounds(loc, builder, rhs);
+      for (auto &[lb, ub] : bounds)
+        rhsLbounds.push_back(lb);
+    }
+
     fir::MutableBoxValue mutableBox(lhs.getFirBase(), /*lenParameters=*/{},
                                     /*mutableProperties=*/{});
 
@@ -97,10 +131,9 @@ class SeparateAllocatableAssignConversion
     fir::factory::MutableBoxReallocation realloc =
         fir::factory::genReallocIfNeeded(builder, loc, mutableBox, rhsExtents,
                                          lenParams, noopHandler);
-    fir::factory::finalizeRealloc(builder, loc, mutableBox, /*lbounds=*/{},
+    fir::factory::finalizeRealloc(builder, loc, mutableBox, rhsLbounds,
                                   /*takeLboundsIfRealloc=*/true, realloc);
 
-    // Load the (now properly allocated) LHS box and emit a non-realloc assign.
     mlir::Value lhsBox = fir::LoadOp::create(builder, loc, lhs.getFirBase());
     hlfir::AssignOp::create(builder, loc, rhs, lhsBox,
                             /*realloc=*/false,
diff --git a/flang/test/HLFIR/separate-allocatable-assign.fir b/flang/test/HLFIR/separate-allocatable-assign.fir
index 40f2d14b6e628..f5a78d6ce76cf 100644
--- a/flang/test/HLFIR/separate-allocatable-assign.fir
+++ b/flang/test/HLFIR/separate-allocatable-assign.fir
@@ -96,3 +96,50 @@ func.func @test_non_allocatable(%arg0: !fir.ref<!fir.array<10xf64>>, %arg1: !fir
 // Non-allocatable assign should pass through unchanged
 // CHECK: hlfir.assign %{{.*}} to %{{.*}} : !fir.ref<!fir.array<10xf64>>, !fir.ref<!fir.array<10xf64>>
 // CHECK-NOT: fir.if
+
+// Test: self-aliasing (a = a(:n)) should NOT be separated because realloc
+// would free the old LHS storage that the RHS still references.
+func.func @test_self_alias(%arg0: !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c5 = arith.constant 5 : index
+
+  %a:2 = hlfir.declare %arg0 {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFEa"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>)
+  %a_box = fir.load %a#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
+  %dims:3 = fir.box_dims %a_box, %c0 : (!fir.box<!fir.heap<!fir.array<?xi32>>>, index) -> (index, index, index)
+
+  %shape = fir.shape %c5 : (index) -> !fir.shape<1>
+  %section = hlfir.designate %a_box (%c1:%c5:%c1) shape %shape : (!fir.box<!fir.heap<!fir.array<?xi32>>>, index, index, index, !fir.shape<1>) -> !fir.box<!fir.array<5xi32>>
+
+  hlfir.assign %section to %a#0 realloc : !fir.box<!fir.array<5xi32>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
+  return
+}
+
+// CHECK-LABEL: func.func @test_self_alias
+// Self-aliasing must keep the original realloc assign
+// CHECK: hlfir.assign %{{.*}} to %{{.*}} realloc
+
+// Test: lower bounds from RHS should be preserved during reallocation.
+// source(10:12) has lower bound 10; dest should get lower bound 10 after
+// dest = source.
+func.func @test_lower_bounds(%arg0: !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, %arg1: !fir.ref<!fir.array<3xi32>>) {
+  %c10 = arith.constant 10 : index
+  %c3 = arith.constant 3 : index
+  %shapeshift = fir.shape_shift %c10, %c3 : (index, index) -> !fir.shapeshift<1>
+
+  %dest:2 = hlfir.declare %arg0 {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFEdest"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>)
+  %source:2 = hlfir.declare %arg1(%shapeshift) {uniq_name = "_QFEsource"} : (!fir.ref<!fir.array<3xi32>>, !fir.shapeshift<1>) -> (!fir.box<!fir.array<3xi32>>, !fir.ref<!fir.array<3xi32>>)
+
+  hlfir.assign %source#0 to %dest#0 realloc : !fir.box<!fir.array<3xi32>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
+  return
+}
+
+// CHECK-LABEL: func.func @test_lower_bounds
+// The realloc should be separated with lower bound 10 propagated.
+// CHECK-NOT: hlfir.assign{{.*}}realloc
+// CHECK: %[[C10:.*]] = arith.constant 10 : index
+// CHECK: fir.if
+// CHECK: fir.allocmem
+// Lower bound 10 should appear in the embox/store of the new allocation (TODO: not yet verified; C10 is unused).
+// CHECK: %[[BOX:.*]] = fir.load %{{.*}} : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
+// CHECK: hlfir.assign %{{.*}} to %[[BOX]] : !fir.box<!fir.array<3xi32>>, !fir.box<!fir.heap<!fir.array<?xi32>>>



More information about the flang-commits mailing list