[flang-commits] [flang] [flang] optimize WHERE with identical and disjoint array sections (PR #180279)

Tue Feb 10 02:05:19 PST 2026

https://github.com/jeanPerier updated https://github.com/llvm/llvm-project/pull/180279

>From c47ea7fc614c9a1c02b4548184c5c1d2f869ed45 Mon Sep 17 00:00:00 2001
From: Jean Perier <jperier at nvidia.com>
Date: Fri, 6 Feb 2026 09:11:06 -0800
Subject: [PATCH 1/4] [flang][NFC] Extract ArraySectionAnalyzer from
 OptimizedBufferization

Extract `ArraySectionAnalyzer` from `OptimizedBufferization` into a standalone
analysis utility so it can be reused by other passes (e.g., `ScheduleOrderedAssignments`).

This is an NFC change that moves the `ArraySectionAnalyzer` class and its helper
methods to `flang/Optimizer/Analysis/ArraySectionAnalyzer.h` and `.cpp`.

Also extracts the logic that detects whether an `hlfir.designate` uses the
indices of an elemental operation in storage order (`isDesignatingArrayInOrder`).
---
 .../Optimizer/Analysis/ArraySectionAnalyzer.h | 119 ++++++
 .../Analysis/ArraySectionAnalyzer.cpp         | 300 +++++++++++++++
 flang/lib/Optimizer/Analysis/CMakeLists.txt   |   1 +
 .../Transforms/OptimizedBufferization.cpp     | 362 +-----------------
 4 files changed, 428 insertions(+), 354 deletions(-)
 create mode 100644 flang/include/flang/Optimizer/Analysis/ArraySectionAnalyzer.h
 create mode 100644 flang/lib/Optimizer/Analysis/ArraySectionAnalyzer.cpp

diff --git a/flang/include/flang/Optimizer/Analysis/ArraySectionAnalyzer.h b/flang/include/flang/Optimizer/Analysis/ArraySectionAnalyzer.h
new file mode 100644
index 0000000000000..0a9ff13e30525
--- /dev/null
+++ b/flang/include/flang/Optimizer/Analysis/ArraySectionAnalyzer.h
@@ -0,0 +1,119 @@
+//===- ArraySectionAnalyzer.h - Analyze array sections --------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef FORTRAN_OPTIMIZER_ANALYSIS_ARRAYSECTIONANALYZER_H
+#define FORTRAN_OPTIMIZER_ANALYSIS_ARRAYSECTIONANALYZER_H
+
+#include "mlir/IR/Operation.h"
+#include "mlir/IR/Value.h"
+
+namespace mlir {
+class Operation;
+class Value;
+} // namespace mlir
+
+namespace hlfir {
+class ElementalOpInterface;
+class DesignateOp;
+} // namespace hlfir
+
+namespace fir {
+class ArraySectionAnalyzer {
+public:
+  // The result of the analysis is one of the values below.
+  enum class SlicesOverlapKind {
+    // Slices overlap is unknown.
+    Unknown,
+    // Slices are definitely identical.
+    DefinitelyIdentical,
+    // Slices are definitely disjoint.
+    DefinitelyDisjoint,
+    // Slices may be either disjoint or identical,
+    // i.e. there is definitely no partial overlap.
+    EitherIdenticalOrDisjoint
+  };
+
+  // Analyzes two hlfir.designate results and returns the overlap kind.
+  // The callers may use this method when the alias analysis reports
+  // an alias of some kind, so that we can run Fortran specific analysis
+  // on the array slices to see if they are identical or disjoint.
+  // Note that the alias analysis is not able to give such an answer
+  // about the references.
+  static SlicesOverlapKind analyze(mlir::Value ref1, mlir::Value ref2);
+
+  static bool isDesignatingArrayInOrder(hlfir::DesignateOp designate,
+                                        hlfir::ElementalOpInterface elemental);
+
+private:
+  struct SectionDesc {
+    // An array section is described by <lb, ub, stride> tuple.
+    // If the designator's subscript is not a triple, then
+    // the section descriptor is constructed as <lb, nullptr, nullptr>.
+    mlir::Value lb, ub, stride;
+
+    SectionDesc(mlir::Value lb, mlir::Value ub, mlir::Value stride);
+
+    // Normalize the section descriptor:
+    //   1. If UB is nullptr, then it is set to LB.
+    //   2. If LB==UB, then stride does not matter,
+    //      so it is reset to nullptr.
+    //   3. If STRIDE==1, then it is reset to nullptr.
+    void normalize();
+
+    bool operator==(const SectionDesc &other) const;
+  };
+
+  // Given an operand_iterator over the indices operands,
+  // read the subscript values and return them as SectionDesc
+  // updating the iterator. If isTriplet is true,
+  // the subscript is a triplet, and the result is <lb, ub, stride>.
+  // Otherwise, the subscript is a scalar index, and the result
+  // is <index, nullptr, nullptr>.
+  static SectionDesc readSectionDesc(mlir::Operation::operand_iterator &it,
+                                     bool isTriplet);
+
+  // Return the ordered lower and upper bounds of the section.
+  // If stride is known to be non-negative, then the ordered
+  // bounds match the <lb, ub> of the descriptor.
+  // If stride is known to be negative, then the ordered
+  // bounds are <ub, lb> of the descriptor.
+  // If stride is unknown, we cannot deduce any order,
+  // so the result is <nullptr, nullptr>
+  static std::pair<mlir::Value, mlir::Value>
+  getOrderedBounds(const SectionDesc &desc);
+
+  // Given two array sections <lb1, ub1, stride1> and
+  // <lb2, ub2, stride2>, return true only if the sections
+  // are known to be disjoint.
+  //
+  // For example, for any positive constant C:
+  //   X:Y does not overlap with (Y+C):Z
+  //   X:Y does not overlap with Z:(X-C)
+  static bool areDisjointSections(const SectionDesc &desc1,
+                                  const SectionDesc &desc2);
+
+  // Given two array sections <lb1, ub1, stride1> and
+  // <lb2, ub2, stride2>, return true only if the sections
+  // are known to be identical.
+  //
+  // For example:
+  //   <x, x, stride>
+  //   <x, nullptr, nullptr>
+  //
+  // These sections are identical from the point of view of which array
+  // elements are being addressed, even though the shape
+  // of the array slices might be different.
+  static bool areIdenticalSections(const SectionDesc &desc1,
+                                   const SectionDesc &desc2);
+
+  // Return true, if v1 is known to be less than v2.
+  static bool isLess(mlir::Value v1, mlir::Value v2);
+};
+} // namespace fir
+
+#endif // FORTRAN_OPTIMIZER_ANALYSIS_ARRAYSECTIONANALYZER_H
diff --git a/flang/lib/Optimizer/Analysis/ArraySectionAnalyzer.cpp b/flang/lib/Optimizer/Analysis/ArraySectionAnalyzer.cpp
new file mode 100644
index 0000000000000..f5ee298f0948c
--- /dev/null
+++ b/flang/lib/Optimizer/Analysis/ArraySectionAnalyzer.cpp
@@ -0,0 +1,300 @@
+//===- ArraySectionAnalyzer.cpp - Analyze array sections ------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang/Optimizer/Analysis/ArraySectionAnalyzer.h"
+#include "flang/Optimizer/Dialect/FIROps.h"
+#include "flang/Optimizer/Dialect/FIROpsSupport.h"
+#include "flang/Optimizer/HLFIR/HLFIROps.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "llvm/Support/Debug.h"
+
+#define DEBUG_TYPE "array-section-analyzer"
+
+using namespace fir;
+
+ArraySectionAnalyzer::SectionDesc::SectionDesc(mlir::Value lb, mlir::Value ub,
+                                               mlir::Value stride)
+    : lb(lb), ub(ub), stride(stride) {
+  assert(lb && "lower bound or index must be specified");
+  normalize();
+}
+
+void ArraySectionAnalyzer::SectionDesc::normalize() {
+  if (!ub)
+    ub = lb;
+  if (lb == ub)
+    stride = nullptr;
+  if (stride)
+    if (auto val = fir::getIntIfConstant(stride))
+      if (*val == 1)
+        stride = nullptr;
+}
+
+bool ArraySectionAnalyzer::SectionDesc::operator==(
+    const SectionDesc &other) const {
+  return lb == other.lb && ub == other.ub && stride == other.stride;
+}
+
+ArraySectionAnalyzer::SectionDesc
+ArraySectionAnalyzer::readSectionDesc(mlir::Operation::operand_iterator &it,
+                                      bool isTriplet) {
+  if (isTriplet)
+    return {*it++, *it++, *it++};
+  return {*it++, nullptr, nullptr};
+}
+
+std::pair<mlir::Value, mlir::Value>
+ArraySectionAnalyzer::getOrderedBounds(const SectionDesc &desc) {
+  mlir::Value stride = desc.stride;
+  // Null stride means stride=1.
+  if (!stride)
+    return {desc.lb, desc.ub};
+  // Reverse the bounds, if stride is negative.
+  if (auto val = fir::getIntIfConstant(stride)) {
+    if (*val >= 0)
+      return {desc.lb, desc.ub};
+    else
+      return {desc.ub, desc.lb};
+  }
+
+  return {nullptr, nullptr};
+}
+
+bool ArraySectionAnalyzer::areDisjointSections(const SectionDesc &desc1,
+                                               const SectionDesc &desc2) {
+  auto [lb1, ub1] = getOrderedBounds(desc1);
+  auto [lb2, ub2] = getOrderedBounds(desc2);
+  if (!lb1 || !lb2)
+    return false;
+  // Note that this comparison must be made on the ordered bounds,
+  // otherwise 'a(x:y:1) = a(z:x-1:-1) + 1' may be incorrectly treated
+  // as not overlapping (x=2, y=10, z=9).
+  if (isLess(ub1, lb2) || isLess(ub2, lb1))
+    return true;
+  return false;
+}
+
+bool ArraySectionAnalyzer::areIdenticalSections(const SectionDesc &desc1,
+                                                const SectionDesc &desc2) {
+  if (desc1 == desc2)
+    return true;
+  return false;
+}
+
+ArraySectionAnalyzer::SlicesOverlapKind
+ArraySectionAnalyzer::analyze(mlir::Value ref1, mlir::Value ref2) {
+  if (ref1 == ref2)
+    return SlicesOverlapKind::DefinitelyIdentical;
+
+  auto des1 = ref1.getDefiningOp<hlfir::DesignateOp>();
+  auto des2 = ref2.getDefiningOp<hlfir::DesignateOp>();
+  // We only support a pair of designators right now.
+  if (!des1 || !des2)
+    return SlicesOverlapKind::Unknown;
+
+  if (des1.getMemref() != des2.getMemref()) {
+    // If the bases are different, then there is unknown overlap.
+    LLVM_DEBUG(llvm::dbgs() << "No identical base for:\n"
+                            << des1 << "and:\n"
+                            << des2 << "\n");
+    return SlicesOverlapKind::Unknown;
+  }
+
+  // Require all components of the designators to be the same.
+  // It might be too strict, e.g. we may probably allow for
+  // different type parameters.
+  if (des1.getComponent() != des2.getComponent() ||
+      des1.getComponentShape() != des2.getComponentShape() ||
+      des1.getSubstring() != des2.getSubstring() ||
+      des1.getComplexPart() != des2.getComplexPart() ||
+      des1.getTypeparams() != des2.getTypeparams()) {
+    LLVM_DEBUG(llvm::dbgs() << "Different designator specs for:\n"
+                            << des1 << "and:\n"
+                            << des2 << "\n");
+    return SlicesOverlapKind::Unknown;
+  }
+
+  // Analyze the subscripts.
+  auto des1It = des1.getIndices().begin();
+  auto des2It = des2.getIndices().begin();
+  bool identicalTriplets = true;
+  bool identicalIndices = true;
+  for (auto [isTriplet1, isTriplet2] :
+       llvm::zip(des1.getIsTriplet(), des2.getIsTriplet())) {
+    SectionDesc desc1 = readSectionDesc(des1It, isTriplet1);
+    SectionDesc desc2 = readSectionDesc(des2It, isTriplet2);
+
+    // See if we can prove that any of the sections do not overlap.
+    // This is mostly a Polyhedron/nf performance hack that looks for
+    // particular relations between the lower and upper bounds
+    // of the array sections, e.g. for any positive constant C:
+    //   X:Y does not overlap with (Y+C):Z
+    //   X:Y does not overlap with Z:(X-C)
+    if (areDisjointSections(desc1, desc2))
+      return SlicesOverlapKind::DefinitelyDisjoint;
+
+    if (!areIdenticalSections(desc1, desc2)) {
+      if (isTriplet1 || isTriplet2) {
+        // For example:
+        //   hlfir.designate %6#0 (%c2:%c7999:%c1, %c1:%c120:%c1, %0)
+        //   hlfir.designate %6#0 (%c2:%c7999:%c1, %c1:%c120:%c1, %1)
+        //
+        // If all the triplets (section specifiers) are the same, then
+        // we do not care if %0 is equal to %1 - the slices are either
+        // identical or completely disjoint.
+        //
+        // Also, treat these as identical sections:
+        //   hlfir.designate %6#0 (%c2:%c2:%c1)
+        //   hlfir.designate %6#0 (%c2)
+        identicalTriplets = false;
+        LLVM_DEBUG(llvm::dbgs() << "Triplet mismatch for:\n"
+                                << des1 << "and:\n"
+                                << des2 << "\n");
+      } else {
+        identicalIndices = false;
+        LLVM_DEBUG(llvm::dbgs() << "Indices mismatch for:\n"
+                                << des1 << "and:\n"
+                                << des2 << "\n");
+      }
+    }
+  }
+
+  if (identicalTriplets) {
+    if (identicalIndices)
+      return SlicesOverlapKind::DefinitelyIdentical;
+    else
+      return SlicesOverlapKind::EitherIdenticalOrDisjoint;
+  }
+
+  LLVM_DEBUG(llvm::dbgs() << "Different sections for:\n"
+                          << des1 << "and:\n"
+                          << des2 << "\n");
+  return SlicesOverlapKind::Unknown;
+}
+
+bool ArraySectionAnalyzer::isLess(mlir::Value v1, mlir::Value v2) {
+  auto removeConvert = [](mlir::Value v) -> mlir::Operation * {
+    auto *op = v.getDefiningOp();
+    while (auto conv = mlir::dyn_cast_or_null<fir::ConvertOp>(op))
+      op = conv.getValue().getDefiningOp();
+    return op;
+  };
+
+  auto isPositiveConstant = [](mlir::Value v) -> bool {
+    if (auto val = fir::getIntIfConstant(v))
+      return *val > 0;
+    return false;
+  };
+
+  auto *op1 = removeConvert(v1);
+  auto *op2 = removeConvert(v2);
+  if (!op1 || !op2)
+    return false;
+
+  // Check if they are both constants.
+  if (auto val1 = fir::getIntIfConstant(op1->getResult(0)))
+    if (auto val2 = fir::getIntIfConstant(op2->getResult(0)))
+      return *val1 < *val2;
+
+  // Handle some variable cases (C > 0):
+  //   v2 = v1 + C
+  //   v2 = C + v1
+  //   v1 = v2 - C
+  if (auto addi = mlir::dyn_cast<mlir::arith::AddIOp>(op2))
+    if ((addi.getLhs().getDefiningOp() == op1 &&
+         isPositiveConstant(addi.getRhs())) ||
+        (addi.getRhs().getDefiningOp() == op1 &&
+         isPositiveConstant(addi.getLhs())))
+      return true;
+  if (auto subi = mlir::dyn_cast<mlir::arith::SubIOp>(op1))
+    if (subi.getLhs().getDefiningOp() == op2 &&
+        isPositiveConstant(subi.getRhs()))
+      return true;
+  return false;
+}
+
+/// Returns the array indices for the given hlfir.designate.
+/// It recognizes the computations used to transform the one-based indices
+/// into the array's lb-based indices, and returns the one-based indices
+/// in these cases.
+static llvm::SmallVector<mlir::Value>
+getDesignatorIndices(hlfir::DesignateOp designate) {
+  mlir::Value memref = designate.getMemref();
+
+  // If the object is a box, then the indices may be adjusted
+  // according to the box's lower bound(s). Scan through
+  // the computations to try to find the one-based indices.
+  if (mlir::isa<fir::BaseBoxType>(memref.getType())) {
+    // Look for the following pattern:
+    //   %13 = fir.load %12 : !fir.ref<!fir.box<...>
+    //   %14:3 = fir.box_dims %13, %c0 : (!fir.box<...>, index) -> ...
+    //   %17 = arith.subi %14#0, %c1 : index
+    //   %18 = arith.addi %arg2, %17 : index
+    //   %19 = hlfir.designate %13 (%18)  : (!fir.box<...>, index) -> ...
+    //
+    // %arg2 is a one-based index.
+
+    auto isNormalizedLb = [memref](mlir::Value v, unsigned dim) {
+      // Return true, if v and dim are such that:
+      //   %14:3 = fir.box_dims %13, %dim : (!fir.box<...>, index) -> ...
+      //   %17 = arith.subi %14#0, %c1 : index
+      //   %19 = hlfir.designate %13 (...)  : (!fir.box<...>, index) -> ...
+      if (auto subOp =
+              mlir::dyn_cast_or_null<mlir::arith::SubIOp>(v.getDefiningOp())) {
+        auto cst = fir::getIntIfConstant(subOp.getRhs());
+        if (!cst || *cst != 1)
+          return false;
+        if (auto dimsOp = mlir::dyn_cast_or_null<fir::BoxDimsOp>(
+                subOp.getLhs().getDefiningOp())) {
+          if (memref != dimsOp.getVal() ||
+              dimsOp.getResult(0) != subOp.getLhs())
+            return false;
+          auto dimsOpDim = fir::getIntIfConstant(dimsOp.getDim());
+          return dimsOpDim && dimsOpDim == dim;
+        }
+      }
+      return false;
+    };
+
+    llvm::SmallVector<mlir::Value> newIndices;
+    for (auto index : llvm::enumerate(designate.getIndices())) {
+      if (auto addOp = mlir::dyn_cast_or_null<mlir::arith::AddIOp>(
+              index.value().getDefiningOp())) {
+        for (unsigned opNum = 0; opNum < 2; ++opNum)
+          if (isNormalizedLb(addOp->getOperand(opNum), index.index())) {
+            newIndices.push_back(addOp->getOperand((opNum + 1) % 2));
+            break;
+          }
+
+        // If new one-based index was not added, exit early.
+        if (newIndices.size() <= index.index())
+          break;
+      }
+    }
+
+    // If any of the indices is not adjusted to the array's lb,
+    // then return the original designator indices.
+    if (newIndices.size() != designate.getIndices().size())
+      return designate.getIndices();
+
+    return newIndices;
+  }
+
+  return designate.getIndices();
+}
+
+bool fir::ArraySectionAnalyzer::isDesignatingArrayInOrder(
+    hlfir::DesignateOp designate, hlfir::ElementalOpInterface elemental) {
+
+  auto indices = getDesignatorIndices(designate);
+  auto elementalIndices = elemental.getIndices();
+  if (indices.size() == elementalIndices.size())
+    return std::equal(indices.begin(), indices.end(), elementalIndices.begin(),
+                      elementalIndices.end());
+  return false;
+}
diff --git a/flang/lib/Optimizer/Analysis/CMakeLists.txt b/flang/lib/Optimizer/Analysis/CMakeLists.txt
index c890b969bae34..6a7a648665077 100644
--- a/flang/lib/Optimizer/Analysis/CMakeLists.txt
+++ b/flang/lib/Optimizer/Analysis/CMakeLists.txt
@@ -1,5 +1,6 @@
 add_flang_library(FIRAnalysis
   AliasAnalysis.cpp
+  ArraySectionAnalyzer.cpp
   TBAAForest.cpp
 
   DEPENDS
diff --git a/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp b/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp
index 5351a9a50954f..58891227965f4 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp
@@ -13,6 +13,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "flang/Optimizer/Analysis/AliasAnalysis.h"
+#include "flang/Optimizer/Analysis/ArraySectionAnalyzer.h"
 #include "flang/Optimizer/Builder/FIRBuilder.h"
 #include "flang/Optimizer/Builder/HLFIRTools.h"
 #include "flang/Optimizer/Dialect/FIROps.h"
@@ -88,13 +89,6 @@ class ElementalAssignBufferization
   /// determines if the transformation can be applied to this elemental
   static std::optional<MatchInfo> findMatch(hlfir::ElementalOp elemental);
 
-  /// Returns the array indices for the given hlfir.designate.
-  /// It recognizes the computations used to transform the one-based indices
-  /// into the array's lb-based indices, and returns the one-based indices
-  /// in these cases.
-  static llvm::SmallVector<mlir::Value>
-  getDesignatorIndices(hlfir::DesignateOp designate);
-
 public:
   using mlir::OpRewritePattern<hlfir::ElementalOp>::OpRewritePattern;
 
@@ -167,344 +161,6 @@ containsReadOrWriteEffectOn(const mlir::MemoryEffects::EffectInstance &effect,
   return mlir::AliasResult::NoAlias;
 }
 
-// Helper class for analyzing two array slices represented
-// by two hlfir.designate operations.
-class ArraySectionAnalyzer {
-public:
-  // The result of the analyzis is one of the values below.
-  enum class SlicesOverlapKind {
-    // Slices overlap is unknown.
-    Unknown,
-    // Slices are definitely identical.
-    DefinitelyIdentical,
-    // Slices are definitely disjoint.
-    DefinitelyDisjoint,
-    // Slices may be either disjoint or identical,
-    // i.e. there is definitely no partial overlap.
-    EitherIdenticalOrDisjoint
-  };
-
-  // Analyzes two hlfir.designate results and returns the overlap kind.
-  // The callers may use this method when the alias analysis reports
-  // an alias of some kind, so that we can run Fortran specific analysis
-  // on the array slices to see if they are identical or disjoint.
-  // Note that the alias analysis are not able to give such an answer
-  // about the references.
-  static SlicesOverlapKind analyze(mlir::Value ref1, mlir::Value ref2);
-
-private:
-  struct SectionDesc {
-    // An array section is described by <lb, ub, stride> tuple.
-    // If the designator's subscript is not a triple, then
-    // the section descriptor is constructed as <lb, nullptr, nullptr>.
-    mlir::Value lb, ub, stride;
-
-    SectionDesc(mlir::Value lb, mlir::Value ub, mlir::Value stride)
-        : lb(lb), ub(ub), stride(stride) {
-      assert(lb && "lower bound or index must be specified");
-      normalize();
-    }
-
-    // Normalize the section descriptor:
-    //   1. If UB is nullptr, then it is set to LB.
-    //   2. If LB==UB, then stride does not matter,
-    //      so it is reset to nullptr.
-    //   3. If STRIDE==1, then it is reset to nullptr.
-    void normalize() {
-      if (!ub)
-        ub = lb;
-      if (lb == ub)
-        stride = nullptr;
-      if (stride)
-        if (auto val = fir::getIntIfConstant(stride))
-          if (*val == 1)
-            stride = nullptr;
-    }
-
-    bool operator==(const SectionDesc &other) const {
-      return lb == other.lb && ub == other.ub && stride == other.stride;
-    }
-  };
-
-  // Given an operand_iterator over the indices operands,
-  // read the subscript values and return them as SectionDesc
-  // updating the iterator. If isTriplet is true,
-  // the subscript is a triplet, and the result is <lb, ub, stride>.
-  // Otherwise, the subscript is a scalar index, and the result
-  // is <index, nullptr, nullptr>.
-  static SectionDesc readSectionDesc(mlir::Operation::operand_iterator &it,
-                                     bool isTriplet) {
-    if (isTriplet)
-      return {*it++, *it++, *it++};
-    return {*it++, nullptr, nullptr};
-  }
-
-  // Return the ordered lower and upper bounds of the section.
-  // If stride is known to be non-negative, then the ordered
-  // bounds match the <lb, ub> of the descriptor.
-  // If stride is known to be negative, then the ordered
-  // bounds are <ub, lb> of the descriptor.
-  // If stride is unknown, we cannot deduce any order,
-  // so the result is <nullptr, nullptr>
-  static std::pair<mlir::Value, mlir::Value>
-  getOrderedBounds(const SectionDesc &desc) {
-    mlir::Value stride = desc.stride;
-    // Null stride means stride=1.
-    if (!stride)
-      return {desc.lb, desc.ub};
-    // Reverse the bounds, if stride is negative.
-    if (auto val = fir::getIntIfConstant(stride)) {
-      if (*val >= 0)
-        return {desc.lb, desc.ub};
-      else
-        return {desc.ub, desc.lb};
-    }
-
-    return {nullptr, nullptr};
-  }
-
-  // Given two array sections <lb1, ub1, stride1> and
-  // <lb2, ub2, stride2>, return true only if the sections
-  // are known to be disjoint.
-  //
-  // For example, for any positive constant C:
-  //   X:Y does not overlap with (Y+C):Z
-  //   X:Y does not overlap with Z:(X-C)
-  static bool areDisjointSections(const SectionDesc &desc1,
-                                  const SectionDesc &desc2) {
-    auto [lb1, ub1] = getOrderedBounds(desc1);
-    auto [lb2, ub2] = getOrderedBounds(desc2);
-    if (!lb1 || !lb2)
-      return false;
-    // Note that this comparison must be made on the ordered bounds,
-    // otherwise 'a(x:y:1) = a(z:x-1:-1) + 1' may be incorrectly treated
-    // as not overlapping (x=2, y=10, z=9).
-    if (isLess(ub1, lb2) || isLess(ub2, lb1))
-      return true;
-    return false;
-  }
-
-  // Given two array sections <lb1, ub1, stride1> and
-  // <lb2, ub2, stride2>, return true only if the sections
-  // are known to be identical.
-  //
-  // For example:
-  //   <x, x, stride>
-  //   <x, nullptr, nullptr>
-  //
-  // These sections are identical, from the point of which array
-  // elements are being addresses, even though the shape
-  // of the array slices might be different.
-  static bool areIdenticalSections(const SectionDesc &desc1,
-                                   const SectionDesc &desc2) {
-    if (desc1 == desc2)
-      return true;
-    return false;
-  }
-
-  // Return true, if v1 is known to be less than v2.
-  static bool isLess(mlir::Value v1, mlir::Value v2);
-};
-
-ArraySectionAnalyzer::SlicesOverlapKind
-ArraySectionAnalyzer::analyze(mlir::Value ref1, mlir::Value ref2) {
-  if (ref1 == ref2)
-    return SlicesOverlapKind::DefinitelyIdentical;
-
-  auto des1 = ref1.getDefiningOp<hlfir::DesignateOp>();
-  auto des2 = ref2.getDefiningOp<hlfir::DesignateOp>();
-  // We only support a pair of designators right now.
-  if (!des1 || !des2)
-    return SlicesOverlapKind::Unknown;
-
-  if (des1.getMemref() != des2.getMemref()) {
-    // If the bases are different, then there is unknown overlap.
-    LLVM_DEBUG(llvm::dbgs() << "No identical base for:\n"
-                            << des1 << "and:\n"
-                            << des2 << "\n");
-    return SlicesOverlapKind::Unknown;
-  }
-
-  // Require all components of the designators to be the same.
-  // It might be too strict, e.g. we may probably allow for
-  // different type parameters.
-  if (des1.getComponent() != des2.getComponent() ||
-      des1.getComponentShape() != des2.getComponentShape() ||
-      des1.getSubstring() != des2.getSubstring() ||
-      des1.getComplexPart() != des2.getComplexPart() ||
-      des1.getTypeparams() != des2.getTypeparams()) {
-    LLVM_DEBUG(llvm::dbgs() << "Different designator specs for:\n"
-                            << des1 << "and:\n"
-                            << des2 << "\n");
-    return SlicesOverlapKind::Unknown;
-  }
-
-  // Analyze the subscripts.
-  auto des1It = des1.getIndices().begin();
-  auto des2It = des2.getIndices().begin();
-  bool identicalTriplets = true;
-  bool identicalIndices = true;
-  for (auto [isTriplet1, isTriplet2] :
-       llvm::zip(des1.getIsTriplet(), des2.getIsTriplet())) {
-    SectionDesc desc1 = readSectionDesc(des1It, isTriplet1);
-    SectionDesc desc2 = readSectionDesc(des2It, isTriplet2);
-
-    // See if we can prove that any of the sections do not overlap.
-    // This is mostly a Polyhedron/nf performance hack that looks for
-    // particular relations between the lower and upper bounds
-    // of the array sections, e.g. for any positive constant C:
-    //   X:Y does not overlap with (Y+C):Z
-    //   X:Y does not overlap with Z:(X-C)
-    if (areDisjointSections(desc1, desc2))
-      return SlicesOverlapKind::DefinitelyDisjoint;
-
-    if (!areIdenticalSections(desc1, desc2)) {
-      if (isTriplet1 || isTriplet2) {
-        // For example:
-        //   hlfir.designate %6#0 (%c2:%c7999:%c1, %c1:%c120:%c1, %0)
-        //   hlfir.designate %6#0 (%c2:%c7999:%c1, %c1:%c120:%c1, %1)
-        //
-        // If all the triplets (section speficiers) are the same, then
-        // we do not care if %0 is equal to %1 - the slices are either
-        // identical or completely disjoint.
-        //
-        // Also, treat these as identical sections:
-        //   hlfir.designate %6#0 (%c2:%c2:%c1)
-        //   hlfir.designate %6#0 (%c2)
-        identicalTriplets = false;
-        LLVM_DEBUG(llvm::dbgs() << "Triplet mismatch for:\n"
-                                << des1 << "and:\n"
-                                << des2 << "\n");
-      } else {
-        identicalIndices = false;
-        LLVM_DEBUG(llvm::dbgs() << "Indices mismatch for:\n"
-                                << des1 << "and:\n"
-                                << des2 << "\n");
-      }
-    }
-  }
-
-  if (identicalTriplets) {
-    if (identicalIndices)
-      return SlicesOverlapKind::DefinitelyIdentical;
-    else
-      return SlicesOverlapKind::EitherIdenticalOrDisjoint;
-  }
-
-  LLVM_DEBUG(llvm::dbgs() << "Different sections for:\n"
-                          << des1 << "and:\n"
-                          << des2 << "\n");
-  return SlicesOverlapKind::Unknown;
-}
-
-bool ArraySectionAnalyzer::isLess(mlir::Value v1, mlir::Value v2) {
-  auto removeConvert = [](mlir::Value v) -> mlir::Operation * {
-    auto *op = v.getDefiningOp();
-    while (auto conv = mlir::dyn_cast_or_null<fir::ConvertOp>(op))
-      op = conv.getValue().getDefiningOp();
-    return op;
-  };
-
-  auto isPositiveConstant = [](mlir::Value v) -> bool {
-    if (auto val = fir::getIntIfConstant(v))
-      return *val > 0;
-    return false;
-  };
-
-  auto *op1 = removeConvert(v1);
-  auto *op2 = removeConvert(v2);
-  if (!op1 || !op2)
-    return false;
-
-  // Check if they are both constants.
-  if (auto val1 = fir::getIntIfConstant(op1->getResult(0)))
-    if (auto val2 = fir::getIntIfConstant(op2->getResult(0)))
-      return *val1 < *val2;
-
-  // Handle some variable cases (C > 0):
-  //   v2 = v1 + C
-  //   v2 = C + v1
-  //   v1 = v2 - C
-  if (auto addi = mlir::dyn_cast<mlir::arith::AddIOp>(op2))
-    if ((addi.getLhs().getDefiningOp() == op1 &&
-         isPositiveConstant(addi.getRhs())) ||
-        (addi.getRhs().getDefiningOp() == op1 &&
-         isPositiveConstant(addi.getLhs())))
-      return true;
-  if (auto subi = mlir::dyn_cast<mlir::arith::SubIOp>(op1))
-    if (subi.getLhs().getDefiningOp() == op2 &&
-        isPositiveConstant(subi.getRhs()))
-      return true;
-  return false;
-}
-
-llvm::SmallVector<mlir::Value>
-ElementalAssignBufferization::getDesignatorIndices(
-    hlfir::DesignateOp designate) {
-  mlir::Value memref = designate.getMemref();
-
-  // If the object is a box, then the indices may be adjusted
-  // according to the box's lower bound(s). Scan through
-  // the computations to try to find the one-based indices.
-  if (mlir::isa<fir::BaseBoxType>(memref.getType())) {
-    // Look for the following pattern:
-    //   %13 = fir.load %12 : !fir.ref<!fir.box<...>
-    //   %14:3 = fir.box_dims %13, %c0 : (!fir.box<...>, index) -> ...
-    //   %17 = arith.subi %14#0, %c1 : index
-    //   %18 = arith.addi %arg2, %17 : index
-    //   %19 = hlfir.designate %13 (%18)  : (!fir.box<...>, index) -> ...
-    //
-    // %arg2 is a one-based index.
-
-    auto isNormalizedLb = [memref](mlir::Value v, unsigned dim) {
-      // Return true, if v and dim are such that:
-      //   %14:3 = fir.box_dims %13, %dim : (!fir.box<...>, index) -> ...
-      //   %17 = arith.subi %14#0, %c1 : index
-      //   %19 = hlfir.designate %13 (...)  : (!fir.box<...>, index) -> ...
-      if (auto subOp =
-              mlir::dyn_cast_or_null<mlir::arith::SubIOp>(v.getDefiningOp())) {
-        auto cst = fir::getIntIfConstant(subOp.getRhs());
-        if (!cst || *cst != 1)
-          return false;
-        if (auto dimsOp = mlir::dyn_cast_or_null<fir::BoxDimsOp>(
-                subOp.getLhs().getDefiningOp())) {
-          if (memref != dimsOp.getVal() ||
-              dimsOp.getResult(0) != subOp.getLhs())
-            return false;
-          auto dimsOpDim = fir::getIntIfConstant(dimsOp.getDim());
-          return dimsOpDim && dimsOpDim == dim;
-        }
-      }
-      return false;
-    };
-
-    llvm::SmallVector<mlir::Value> newIndices;
-    for (auto index : llvm::enumerate(designate.getIndices())) {
-      if (auto addOp = mlir::dyn_cast_or_null<mlir::arith::AddIOp>(
-              index.value().getDefiningOp())) {
-        for (unsigned opNum = 0; opNum < 2; ++opNum)
-          if (isNormalizedLb(addOp->getOperand(opNum), index.index())) {
-            newIndices.push_back(addOp->getOperand((opNum + 1) % 2));
-            break;
-          }
-
-        // If new one-based index was not added, exit early.
-        if (newIndices.size() <= index.index())
-          break;
-      }
-    }
-
-    // If any of the indices is not adjusted to the array's lb,
-    // then return the original designator indices.
-    if (newIndices.size() != designate.getIndices().size())
-      return designate.getIndices();
-
-    return newIndices;
-  }
-
-  return designate.getIndices();
-}
-
 std::optional<ElementalAssignBufferization::MatchInfo>
 ElementalAssignBufferization::findMatch(hlfir::ElementalOp elemental) {
   mlir::Operation::user_range users = elemental->getUsers();
@@ -627,22 +283,20 @@ ElementalAssignBufferization::findMatch(hlfir::ElementalOp elemental) {
     if (!res.isPartial()) {
       if (auto designate =
               effect.getValue().getDefiningOp<hlfir::DesignateOp>()) {
-        ArraySectionAnalyzer::SlicesOverlapKind overlap =
-            ArraySectionAnalyzer::analyze(match.array, designate.getMemref());
+        fir::ArraySectionAnalyzer::SlicesOverlapKind overlap =
+            fir::ArraySectionAnalyzer::analyze(match.array,
+                                               designate.getMemref());
         if (overlap ==
-            ArraySectionAnalyzer::SlicesOverlapKind::DefinitelyDisjoint)
+            fir::ArraySectionAnalyzer::SlicesOverlapKind::DefinitelyDisjoint)
           continue;
 
-        if (overlap == ArraySectionAnalyzer::SlicesOverlapKind::Unknown) {
+        if (overlap == fir::ArraySectionAnalyzer::SlicesOverlapKind::Unknown) {
           LLVM_DEBUG(llvm::dbgs() << "possible read conflict: " << designate
                                   << " at " << elemental.getLoc() << "\n");
           return std::nullopt;
         }
-        auto indices = getDesignatorIndices(designate);
-        auto elementalIndices = elemental.getIndices();
-        if (indices.size() == elementalIndices.size() &&
-            std::equal(indices.begin(), indices.end(), elementalIndices.begin(),
-                       elementalIndices.end()))
+        if (fir::ArraySectionAnalyzer::isDesignatingArrayInOrder(designate,
+                                                                 elemental))
           continue;
 
         LLVM_DEBUG(llvm::dbgs() << "possible read conflict: " << designate

>From 8af63f097da51b069cd24df075db0b46fc0bfc1e Mon Sep 17 00:00:00 2001
From: Jean Perier <jperier at nvidia.com>
Date: Fri, 6 Feb 2026 09:11:33 -0800
Subject: [PATCH 2/4] [flang] optimize WHERE with array sections

Improve `ScheduleOrderedAssignments` to avoid creating temporary storage for
masks in `WHERE` constructs when the mask modification is "aligned" with the
assignment (e.g., `where(a(i)>0) a(i)=...`).

- Identify "aligned" conflicts (identical array elements accessed in order)
  using the `ArraySectionAnalyzer` that is extracted from
  OptimizedBufferization.
- Defer saving regions with aligned conflicts, allowing fusion if possible.
- Implement retroactive saving: if a region was modified in a previous run
  (fused via aligned conflict) but is needed by a later split run, insert
  a `SaveEntity` action before the modifying run.
- Use `std::list` for the schedule to support stable iterators for run insertion.
- Update tests to verify fewer temporaries and correct retroactive saves.
- Update flang pipeline at O2 and above to try fusing assignments in
  WHERE/FORALL. This maximizes the chances that mask temps are not
  needed: a mask variable cannot be reused in a later run/loop nest
  if it was modified, even if all the accesses are in order, because being
  in order only matters for accesses generated inside the same loop nest.

This fixes suboptimal code generation where temporaries were created unnecessarily
for common patterns like `where (x > 0) x = ...`.
---
 .../LowerHLFIROrderedAssignments.cpp          |  55 +-
 .../Transforms/ScheduleOrderedAssignments.cpp | 511 ++++++++++++++----
 .../Transforms/ScheduleOrderedAssignments.h   |  49 +-
 flang/lib/Optimizer/Passes/Pipelines.cpp      |   3 +-
 .../order_assignments/inlined-stack-temp.fir  |   8 +-
 .../where-array-sections.f90                  |  90 +++
 .../where-fusing-scheduling.f90               |   3 +-
 .../order_assignments/where-scheduling.f90    |  58 +-
 8 files changed, 595 insertions(+), 182 deletions(-)
 create mode 100644 flang/test/HLFIR/order_assignments/where-array-sections.f90

diff --git a/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIROrderedAssignments.cpp b/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIROrderedAssignments.cpp
index 71b4e91f0110d..a3fd19d95fbbc 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIROrderedAssignments.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIROrderedAssignments.cpp
@@ -96,7 +96,7 @@ struct MaskedArrayExpr {
   /// hlfir.elemental_addr that form the elemental tree producing
   /// the expression value. hlfir.elemental that produce values
   /// used inside transformational operations are not part of this set.
-  llvm::SmallPtrSet<mlir::Operation *, 4> elementalParts{};
+  hlfir::ElementalTree elementalParts;
   /// Was generateNoneElementalPart called?
   bool noneElementalPartWasGenerated = false;
   /// Is this expression the mask expression of the outer where statement?
@@ -900,62 +900,11 @@ bool OrderedAssignmentRewriter::isRequiredInCurrentRun(
   return false;
 }
 
-/// Is the apply using all the elemental indices in order?
-static bool isInOrderApply(hlfir::ApplyOp apply,
-                           hlfir::ElementalOpInterface elemental) {
-  mlir::Region::BlockArgListType elementalIndices = elemental.getIndices();
-  if (elementalIndices.size() != apply.getIndices().size())
-    return false;
-  for (auto [elementalIdx, applyIdx] :
-       llvm::zip(elementalIndices, apply.getIndices()))
-    if (elementalIdx != applyIdx)
-      return false;
-  return true;
-}
-
-/// Gather the tree of hlfir::ElementalOpInterface use-def, if any, starting
-/// from \p elemental, which may be a nullptr.
-static void
-gatherElementalTree(hlfir::ElementalOpInterface elemental,
-                    llvm::SmallPtrSetImpl<mlir::Operation *> &elementalOps,
-                    bool isOutOfOrder) {
-  if (elemental) {
-    // Only inline an applied elemental that must be executed in order if the
-    // applying indices are in order. An hlfir::Elemental may have been created
-    // for a transformational like transpose, and Fortran 2018 standard
-    // section 10.2.3.2, point 10 imply that impure elemental sub-expression
-    // evaluations should not be masked if they are the arguments of
-    // transformational expressions.
-    if (isOutOfOrder && elemental.isOrdered())
-      return;
-    elementalOps.insert(elemental.getOperation());
-    for (mlir::Operation &op : elemental.getElementalRegion().getOps())
-      if (auto apply = mlir::dyn_cast<hlfir::ApplyOp>(op)) {
-        bool isUnorderedApply =
-            isOutOfOrder || !isInOrderApply(apply, elemental);
-        auto maybeElemental =
-            mlir::dyn_cast_or_null<hlfir::ElementalOpInterface>(
-                apply.getExpr().getDefiningOp());
-        gatherElementalTree(maybeElemental, elementalOps, isUnorderedApply);
-      }
-  }
-}
-
 MaskedArrayExpr::MaskedArrayExpr(mlir::Location loc, mlir::Region &region,
                                  bool isOuterMaskExpr)
     : loc{loc}, region{region}, isOuterMaskExpr{isOuterMaskExpr} {
   mlir::Operation &terminator = region.back().back();
-  if (auto elementalAddr =
-          mlir::dyn_cast<hlfir::ElementalOpInterface>(terminator)) {
-    // Vector subscripted designator (hlfir.elemental_addr terminator).
-    gatherElementalTree(elementalAddr, elementalParts, /*isOutOfOrder=*/false);
-    return;
-  }
-  // Try if elemental expression.
-  mlir::Value entity = mlir::cast<hlfir::YieldOp>(terminator).getEntity();
-  auto maybeElemental = mlir::dyn_cast_or_null<hlfir::ElementalOpInterface>(
-      entity.getDefiningOp());
-  gatherElementalTree(maybeElemental, elementalParts, /*isOutOfOrder=*/false);
+  elementalParts = hlfir::ElementalTree::buildElementalTree(terminator);
 }
 
 void MaskedArrayExpr::generateNoneElementalPart(fir::FirOpBuilder &builder,
diff --git a/flang/lib/Optimizer/HLFIR/Transforms/ScheduleOrderedAssignments.cpp b/flang/lib/Optimizer/HLFIR/Transforms/ScheduleOrderedAssignments.cpp
index 63a5803878a2d..c98d967969156 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/ScheduleOrderedAssignments.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/ScheduleOrderedAssignments.cpp
@@ -8,6 +8,7 @@
 
 #include "ScheduleOrderedAssignments.h"
 #include "flang/Optimizer/Analysis/AliasAnalysis.h"
+#include "flang/Optimizer/Analysis/ArraySectionAnalyzer.h"
 #include "flang/Optimizer/Builder/FIRBuilder.h"
 #include "flang/Optimizer/Builder/Todo.h"
 #include "flang/Optimizer/Dialect/Support/FIRContext.h"
@@ -23,7 +24,13 @@
 /// Log RAW or WAW conflict.
 [[maybe_unused]] static void logConflict(llvm::raw_ostream &os,
                                          mlir::Value writtenOrReadVarA,
-                                         mlir::Value writtenVarB);
+                                         mlir::Value writtenVarB,
+                                         bool isAligned = false);
+/// Log when a region must be retroactively saved.
+[[maybe_unused]] static void
+logRetroactiveSave(llvm::raw_ostream &os, mlir::Region &yieldRegion,
+                   hlfir::Run &modifyingRun,
+                   hlfir::RegionAssignOp currentAssign);
 /// Log when an expression evaluation must be saved.
 [[maybe_unused]] static void logSaveEvaluation(llvm::raw_ostream &os,
                                                unsigned runid,
@@ -39,15 +46,129 @@ logStartScheduling(llvm::raw_ostream &os,
                    hlfir::OrderedAssignmentTreeOpInterface root);
 /// Log op if effect value is not known.
 [[maybe_unused]] static void
-logIfUnkownEffectValue(llvm::raw_ostream &os,
-                       mlir::MemoryEffects::EffectInstance effect,
-                       mlir::Operation &op);
+logIfUnknownEffectValue(llvm::raw_ostream &os,
+                        mlir::MemoryEffects::EffectInstance effect,
+                        mlir::Operation &op);
 
 //===----------------------------------------------------------------------===//
 // Scheduling Implementation
 //===----------------------------------------------------------------------===//
 
+/// Is the apply using all the elemental indices in order?
+static bool isInOrderApply(hlfir::ApplyOp apply,
+                           hlfir::ElementalOpInterface elemental) {
+  mlir::Region::BlockArgListType elementalIndices = elemental.getIndices();
+  if (elementalIndices.size() != apply.getIndices().size())
+    return false;
+  for (auto [elementalIdx, applyIdx] :
+       llvm::zip(elementalIndices, apply.getIndices()))
+    if (elementalIdx != applyIdx)
+      return false;
+  return true;
+}
+
+hlfir::ElementalTree
+hlfir::ElementalTree::buildElementalTree(mlir::Operation &regionTerminator) {
+  ElementalTree tree;
+  if (auto elementalAddr =
+          mlir::dyn_cast<hlfir::ElementalOpInterface>(regionTerminator)) {
+    // Vector subscripted designator (hlfir.elemental_addr terminator).
+    tree.gatherElementalTree(elementalAddr, /*isAppliedInOrder=*/true);
+    return tree;
+  }
+  // Try if elemental expression.
+  if (auto yield = mlir::dyn_cast<hlfir::YieldOp>(regionTerminator)) {
+    mlir::Value entity = yield.getEntity();
+    if (auto maybeElemental =
+            mlir::dyn_cast_or_null<hlfir::ElementalOpInterface>(
+                entity.getDefiningOp()))
+      tree.gatherElementalTree(maybeElemental, /*isAppliedInOrder=*/true);
+  }
+  return tree;
+}
+
+// Check if op is an ElementalOpInterface that is part of this elemental tree.
+bool hlfir::ElementalTree::contains(mlir::Operation *op) const {
+  for (auto &p : tree)
+    if (p.first == op)
+      return true;
+  return false;
+}
+
+std::optional<bool> hlfir::ElementalTree::isOrdered(mlir::Operation *op) const {
+  for (auto &p : tree)
+    if (p.first == op)
+      return p.second;
+  return std::nullopt;
+}
+
+void hlfir::ElementalTree::gatherElementalTree(
+    hlfir::ElementalOpInterface elemental, bool isAppliedInOrder) {
+  if (!elemental)
+    return;
+  // Only inline an applied elemental that must be executed in order if the
+  // applying indices are in order. An hlfir::Elemental may have been created
+  // for a transformational like transpose, and Fortran 2018 standard
+  // section 10.2.3.2, point 10 imply that impure elemental sub-expression
+  // evaluations should not be masked if they are the arguments of
+  // transformational expressions.
+  if (!isAppliedInOrder && elemental.isOrdered())
+    return;
+
+  insert(elemental, isAppliedInOrder);
+  for (mlir::Operation &op : elemental.getElementalRegion().getOps())
+    if (auto apply = mlir::dyn_cast<hlfir::ApplyOp>(op)) {
+      bool isUnorderedApply =
+          !isAppliedInOrder || !isInOrderApply(apply, elemental);
+      auto maybeElemental = mlir::dyn_cast_or_null<hlfir::ElementalOpInterface>(
+          apply.getExpr().getDefiningOp());
+      gatherElementalTree(maybeElemental, !isUnorderedApply);
+    }
+}
+
+void hlfir::ElementalTree::insert(hlfir::ElementalOpInterface elementalOp,
+                                  bool isAppliedInOrder) {
+  tree.push_back({elementalOp.getOperation(), isAppliedInOrder});
+}
+
+static bool isInOrderDesignate(hlfir::DesignateOp designate,
+                               hlfir::ElementalTree *tree) {
+  if (!tree)
+    return false;
+  if (auto elemental =
+          designate->getParentOfType<hlfir::ElementalOpInterface>())
+    if (tree->isOrdered(elemental.getOperation()))
+      return fir::ArraySectionAnalyzer::isDesignatingArrayInOrder(designate,
+                                                                  elemental);
+  return false;
+}
+
+hlfir::DetailedEffectInstance::DetailedEffectInstance(
+    mlir::MemoryEffects::Effect *effect, mlir::OpOperand *value,
+    mlir::Value orderedElementalEffectOn)
+    : effectInstance(effect, value),
+      orderedElementalEffectOn(orderedElementalEffectOn) {}
+
+hlfir::DetailedEffectInstance::DetailedEffectInstance(
+    mlir::MemoryEffects::EffectInstance effectInst,
+    mlir::Value orderedElementalEffectOn)
+    : effectInstance(effectInst),
+      orderedElementalEffectOn(orderedElementalEffectOn) {}
+
+hlfir::DetailedEffectInstance
+hlfir::DetailedEffectInstance::getArrayReadEffect(mlir::OpOperand *array) {
+  return DetailedEffectInstance(mlir::MemoryEffects::Read::get(), array,
+                                array->get());
+}
+
+hlfir::DetailedEffectInstance
+hlfir::DetailedEffectInstance::getArrayWriteEffect(mlir::OpOperand *array) {
+  return DetailedEffectInstance(mlir::MemoryEffects::Write::get(), array,
+                                array->get());
+}
+
 namespace {
+
 /// Structure that is in charge of building the schedule. For each
 /// hlfir.region_assign inside an ordered assignment tree, it is walked through
 /// the parent operations and their "leaf" regions (that contain expression
@@ -99,20 +220,25 @@ class Scheduler {
 
   /// After all the dependent evaluation regions have been analyzed, create the
   /// action to evaluate the assignment that was being analyzed.
-  void finishSchedulingAssignment(hlfir::RegionAssignOp assign);
+  void finishSchedulingAssignment(hlfir::RegionAssignOp assign,
+                                  bool leafRegionsMayOnlyRead);
 
   /// Once all the assignments have been analyzed and scheduled, return the
   /// schedule. The scheduler object should not be used after this call.
   hlfir::Schedule moveSchedule() { return std::move(schedule); }
 
 private:
+  struct EvaluationState {
+    bool saved = false;
+    std::optional<hlfir::Schedule::iterator> modifiedInRun;
+  };
+
   /// Save a conflicting region that is evaluating an expression that is
   /// controlling or masking the current assignment, or is evaluating the
   /// RHS/LHS.
-  void
-  saveEvaluation(mlir::Region &yieldRegion,
-                 llvm::ArrayRef<mlir::MemoryEffects::EffectInstance> effects,
-                 bool anyWrite);
+  void saveEvaluation(mlir::Region &yieldRegion,
+                      llvm::ArrayRef<hlfir::DetailedEffectInstance> effects,
+                      bool anyWrite);
 
   /// Can the current assignment be schedule with the previous run. This is
   /// only possible if the assignment and all of its dependencies have no side
@@ -120,19 +246,17 @@ class Scheduler {
   bool canFuseAssignmentWithPreviousRun();
 
   /// Memory effects of the assignments being lowered.
-  llvm::SmallVector<mlir::MemoryEffects::EffectInstance> assignEffects;
+  llvm::SmallVector<hlfir::DetailedEffectInstance> assignEffects;
   /// Memory effects of the evaluations implied by the assignments
   /// being lowered. They do not include the implicit writes
   /// to the LHS of the assignments.
-  llvm::SmallVector<mlir::MemoryEffects::EffectInstance> assignEvaluateEffects;
+  llvm::SmallVector<hlfir::DetailedEffectInstance> assignEvaluateEffects;
   /// Memory effects of the unsaved evaluation region that are controlling or
   /// masking the current assignments.
-  llvm::SmallVector<mlir::MemoryEffects::EffectInstance>
-      parentEvaluationEffects;
+  llvm::SmallVector<hlfir::DetailedEffectInstance> parentEvaluationEffects;
   /// Same as parentEvaluationEffects, but for the current "leaf group" being
   /// analyzed scheduled.
-  llvm::SmallVector<mlir::MemoryEffects::EffectInstance>
-      independentEvaluationEffects;
+  llvm::SmallVector<hlfir::DetailedEffectInstance> independentEvaluationEffects;
 
   /// Were any region saved for the current assignment?
   bool savedAnyRegionForCurrentAssignment = false;
@@ -140,7 +264,10 @@ class Scheduler {
   // Schedule being built.
   hlfir::Schedule schedule;
   /// Leaf regions that have been saved so far.
-  llvm::SmallPtrSet<mlir::Region *, 16> savedRegions;
+  llvm::DenseMap<mlir::Region *, EvaluationState> regionStates;
+  /// Regions that have an aligned conflict with the current assignment.
+  llvm::SmallVector<mlir::Region *> pendingAlignedRegions;
+
   /// Is schedule.back() a schedule that is only saving region with read
   /// effects?
   bool currentRunIsReadOnly = false;
@@ -171,9 +298,10 @@ static bool isForallIndex(mlir::Value var) {
 /// side effect interface, or that are writing temporary variables that may be
 /// hard to identify as such (one would have to prove the write is "local" to
 /// the region even when the alloca may be outside of the region).
-static void gatherMemoryEffects(
+static void gatherMemoryEffectsImpl(
     mlir::Region &region, bool mayOnlyRead,
-    llvm::SmallVectorImpl<mlir::MemoryEffects::EffectInstance> &effects) {
+    llvm::SmallVectorImpl<hlfir::DetailedEffectInstance> &effects,
+    hlfir::ElementalTree *tree = nullptr) {
   /// This analysis is a simple walk of all the operations of the region that is
   /// evaluating and yielding a value. This is a lot simpler and safer than
   /// trying to walk back the SSA DAG from the yielded value. But if desired,
@@ -181,7 +309,7 @@ static void gatherMemoryEffects(
   for (mlir::Operation &op : region.getOps()) {
     if (op.hasTrait<mlir::OpTrait::HasRecursiveMemoryEffects>()) {
       for (mlir::Region &subRegion : op.getRegions())
-        gatherMemoryEffects(subRegion, mayOnlyRead, effects);
+        gatherMemoryEffectsImpl(subRegion, mayOnlyRead, effects, tree);
       // In MLIR, RecursiveMemoryEffects can be combined with
       // MemoryEffectOpInterface to describe extra effects on top of the
       // effects of the nested operations.  However, the presence of
@@ -214,17 +342,42 @@ static void gatherMemoryEffects(
     interface.getEffects(opEffects);
     for (auto &effect : opEffects)
       if (!isForallIndex(effect.getValue())) {
+        mlir::Value array;
+        if (effect.getValue())
+          if (auto designate =
+                  effect.getValue().getDefiningOp<hlfir::DesignateOp>())
+            if (isInOrderDesignate(designate, tree))
+              array = designate.getMemref();
+
         if (mlir::isa<mlir::MemoryEffects::Read>(effect.getEffect())) {
-          LLVM_DEBUG(logIfUnkownEffectValue(llvm::dbgs(), effect, op););
-          effects.push_back(effect);
+          LLVM_DEBUG(logIfUnknownEffectValue(llvm::dbgs(), effect, op););
+          effects.emplace_back(effect, array);
         } else if (!mayOnlyRead &&
                    mlir::isa<mlir::MemoryEffects::Write>(effect.getEffect())) {
-          LLVM_DEBUG(logIfUnkownEffectValue(llvm::dbgs(), effect, op););
-          effects.push_back(effect);
+          LLVM_DEBUG(logIfUnknownEffectValue(llvm::dbgs(), effect, op););
+          effects.emplace_back(effect, array);
         }
       }
   }
 }
+static void gatherMemoryEffects(
+    mlir::Region &region, bool mayOnlyRead,
+    llvm::SmallVectorImpl<hlfir::DetailedEffectInstance> &effects) {
+  if (!region.getParentOfType<hlfir::ForallOp>()) {
+    // TODO: While FORALL assignments may be array assignments, the iteration
+    // space is also driven by the FORALL indices, and it would take a few more
+    // cups of coffee to prove that it is OK to consider that "aligned" access
+    // conflicts can be ignored if they are evaluated inside the same loops.
+    // Besides, it would probably make sense to also deal with "aligned scalar"
+    // accesses for them, as in "forall (i=1:10) x(i) = x(i) + 1".  For now this
+    // feature is disabled inside FORALL.
+    hlfir::ElementalTree tree =
+        hlfir::ElementalTree::buildElementalTree(region.back().back());
+    gatherMemoryEffectsImpl(region, mayOnlyRead, effects, &tree);
+    return;
+  }
+  gatherMemoryEffectsImpl(region, mayOnlyRead, effects, /*tree=*/nullptr);
+}
 
 /// Return the entity yielded by a region, or a null value if the region
 /// is not terminated by a yield.
@@ -246,10 +399,14 @@ static mlir::OpOperand *getYieldedEntity(mlir::Region &region) {
 static void gatherAssignEffects(
     hlfir::RegionAssignOp regionAssign,
     bool userDefAssignmentMayOnlyWriteToAssignedVariable,
-    llvm::SmallVectorImpl<mlir::MemoryEffects::EffectInstance> &assignEffects) {
+    llvm::SmallVectorImpl<hlfir::DetailedEffectInstance> &assignEffects) {
   mlir::OpOperand *assignedVar = getYieldedEntity(regionAssign.getLhsRegion());
   assert(assignedVar && "lhs cannot be an empty region");
-  assignEffects.emplace_back(mlir::MemoryEffects::Write::get(), assignedVar);
+  if (regionAssign->getParentOfType<hlfir::ForallOp>())
+    assignEffects.emplace_back(mlir::MemoryEffects::Write::get(), assignedVar);
+  else
+    assignEffects.emplace_back(
+        hlfir::DetailedEffectInstance::getArrayWriteEffect(assignedVar));
 
   if (!regionAssign.getUserDefinedAssignment().empty()) {
     // The write effect on the INTENT(OUT) LHS argument is already taken
@@ -273,7 +430,7 @@ static void gatherAssignEffects(
 static void gatherAssignEvaluationEffects(
     hlfir::RegionAssignOp regionAssign,
     bool userDefAssignmentMayOnlyWriteToAssignedVariable,
-    llvm::SmallVectorImpl<mlir::MemoryEffects::EffectInstance> &assignEffects) {
+    llvm::SmallVectorImpl<hlfir::DetailedEffectInstance> &assignEffects) {
   gatherMemoryEffects(regionAssign.getLhsRegion(),
                       userDefAssignmentMayOnlyWriteToAssignedVariable,
                       assignEffects);
@@ -308,12 +465,42 @@ static mlir::Value getStorageSource(mlir::Value var) {
   return source;
 }
 
+namespace {
+struct ConflictKind {
+  enum Kind { None, Aligned, Any };
+  Kind kind;
+
+  ConflictKind(Kind k) : kind(k) {}
+
+  static ConflictKind none() { return ConflictKind(None); }
+  static ConflictKind aligned() { return ConflictKind(Aligned); }
+  static ConflictKind any() { return ConflictKind(Any); }
+
+  bool isNone() const { return kind == None; }
+  bool isAligned() const { return kind == Aligned; }
+  bool isAny() const { return kind == Any; }
+
+  // Merge conflicts:
+  // none || none -> none
+  // aligned || <not any> -> aligned
+  // any || _ -> any
+  ConflictKind operator||(const ConflictKind &other) const {
+    if (kind == Any || other.kind == Any)
+      return any();
+    if (kind == Aligned || other.kind == Aligned)
+      return aligned();
+    return none();
+  }
+};
+} // namespace
+
 /// Could there be any read or write in effectsA on a variable written to in
 /// effectsB?
-static bool
-anyRAWorWAW(llvm::ArrayRef<mlir::MemoryEffects::EffectInstance> effectsA,
-            llvm::ArrayRef<mlir::MemoryEffects::EffectInstance> effectsB,
+static ConflictKind
+anyRAWorWAW(llvm::ArrayRef<hlfir::DetailedEffectInstance> effectsA,
+            llvm::ArrayRef<hlfir::DetailedEffectInstance> effectsB,
             fir::AliasAnalysis &aliasAnalysis) {
+  ConflictKind result = ConflictKind::none();
   for (const auto &effectB : effectsB)
     if (mlir::isa<mlir::MemoryEffects::Write>(effectB.getEffect())) {
       mlir::Value writtenVarB = effectB.getValue();
@@ -325,38 +512,64 @@ anyRAWorWAW(llvm::ArrayRef<mlir::MemoryEffects::EffectInstance> effectsA,
           mlir::Value writtenOrReadVarA = effectA.getValue();
           if (!writtenVarB || !writtenOrReadVarA) {
             LLVM_DEBUG(
-                logConflict(llvm::dbgs(), writtenOrReadVarA, writtenVarB););
-            return true; // unknown conflict.
+                logConflict(llvm::dbgs(), writtenOrReadVarA, writtenVarB));
+            return ConflictKind::any(); // unknown conflict.
           }
           writtenOrReadVarA = getStorageSource(writtenOrReadVarA);
           if (!aliasAnalysis.alias(writtenOrReadVarA, writtenVarB).isNo()) {
+            mlir::Value arrayA = effectA.getOrderedElementalEffectOn();
+            mlir::Value arrayB = effectB.getOrderedElementalEffectOn();
+            if (arrayA && arrayB) {
+              if (arrayA == arrayB) {
+                result = result || ConflictKind::aligned();
+                LLVM_DEBUG(logConflict(llvm::dbgs(), writtenOrReadVarA,
+                                       writtenVarB, /*isAligned=*/true));
+                continue;
+              }
+              auto overlap = fir::ArraySectionAnalyzer::analyze(arrayA, arrayB);
+              if (overlap == fir::ArraySectionAnalyzer::SlicesOverlapKind::
+                                 DefinitelyIdentical) {
+                result = result || ConflictKind::aligned();
+                LLVM_DEBUG(logConflict(llvm::dbgs(), writtenOrReadVarA,
+                                       writtenVarB, /*isAligned=*/true));
+                continue;
+              }
+              if (overlap !=
+                  fir::ArraySectionAnalyzer::SlicesOverlapKind::Unknown)
+                continue;
+              LLVM_DEBUG(llvm::dbgs() << "conflicting arrays:" << arrayA
+                                      << " and " << arrayB << "\n");
+              return ConflictKind::any();
+            }
             LLVM_DEBUG(
-                logConflict(llvm::dbgs(), writtenOrReadVarA, writtenVarB););
-            return true;
+                logConflict(llvm::dbgs(), writtenOrReadVarA, writtenVarB));
+            return ConflictKind::any();
           }
         }
     }
-  return false;
+  return result;
 }
 
 /// Could there be any read or write in effectsA on a variable written to in
 /// effectsB, or any read in effectsB on a variable written to in effectsA?
-static bool
-conflict(llvm::ArrayRef<mlir::MemoryEffects::EffectInstance> effectsA,
-         llvm::ArrayRef<mlir::MemoryEffects::EffectInstance> effectsB) {
+static ConflictKind
+conflict(llvm::ArrayRef<hlfir::DetailedEffectInstance> effectsA,
+         llvm::ArrayRef<hlfir::DetailedEffectInstance> effectsB) {
   fir::AliasAnalysis aliasAnalysis;
   // (RAW || WAW) || (WAR || WAW).
-  return anyRAWorWAW(effectsA, effectsB, aliasAnalysis) ||
-         anyRAWorWAW(effectsB, effectsA, aliasAnalysis);
+  ConflictKind result = anyRAWorWAW(effectsA, effectsB, aliasAnalysis);
+  if (result.isAny())
+    return result;
+  return result || anyRAWorWAW(effectsB, effectsA, aliasAnalysis);
 }
 
 /// Could there be any write effects in "effects" affecting memory storages
 /// that are not local to the current region.
 static bool
-anyNonLocalWrite(llvm::ArrayRef<mlir::MemoryEffects::EffectInstance> effects,
+anyNonLocalWrite(llvm::ArrayRef<hlfir::DetailedEffectInstance> effects,
                  mlir::Region &region) {
   return llvm::any_of(
-      effects, [&region](const mlir::MemoryEffects::EffectInstance &effect) {
+      effects, [&region](const hlfir::DetailedEffectInstance &effect) {
         if (mlir::isa<mlir::MemoryEffects::Write>(effect.getEffect())) {
           if (mlir::Value v = effect.getValue()) {
             v = getStorageSource(v);
@@ -393,9 +606,9 @@ void Scheduler::saveEvaluationIfConflict(mlir::Region &yieldRegion,
   // If the region evaluation was previously executed and saved, the saved
   // value will be used when evaluating the current assignment and this has
   // no effects in the current assignment evaluation.
-  if (savedRegions.contains(&yieldRegion))
+  if (regionStates[&yieldRegion].saved)
     return;
-  llvm::SmallVector<mlir::MemoryEffects::EffectInstance> effects;
+  llvm::SmallVector<hlfir::DetailedEffectInstance> effects;
   gatherMemoryEffects(yieldRegion, leafRegionsMayOnlyRead, effects);
   // Yield has no effect as such, but in the context of order assignments.
   // The order assignments will usually read the yielded entity (except for
@@ -404,8 +617,13 @@ void Scheduler::saveEvaluationIfConflict(mlir::Region &yieldRegion,
   // intent(inout)).
   if (yieldIsImplicitRead) {
     mlir::OpOperand *entity = getYieldedEntity(yieldRegion);
-    if (entity && hlfir::isFortranVariableType(entity->get().getType()))
-      effects.emplace_back(mlir::MemoryEffects::Read::get(), entity);
+    if (entity && hlfir::isFortranVariableType(entity->get().getType())) {
+      if (yieldRegion.getParentOfType<hlfir::ForallOp>())
+        effects.emplace_back(mlir::MemoryEffects::Read::get(), entity);
+      else
+        effects.emplace_back(
+            hlfir::DetailedEffectInstance::getArrayReadEffect(entity));
+    }
   }
   if (!leafRegionsMayOnlyRead && anyNonLocalWrite(effects, yieldRegion)) {
     // Region with write effect must be executed only once (unless all writes
@@ -415,33 +633,58 @@ void Scheduler::saveEvaluationIfConflict(mlir::Region &yieldRegion,
                    << "saving eval because write effect prevents re-evaluation"
                    << "\n";);
     saveEvaluation(yieldRegion, effects, /*anyWrite=*/true);
-  } else if (conflict(effects, assignEffects)) {
-    // Region that conflicts with the current assignments must be fully
-    // evaluated and saved before doing the assignment (Note that it may
-    // have already have been evaluated without saving it before, but this
-    // implies that it never conflicted with a prior assignment, so its value
-    // should be the same.)
-    saveEvaluation(yieldRegion, effects, /*anyWrite=*/false);
-  } else if (evaluationsMayConflict &&
-             conflict(effects, assignEvaluateEffects)) {
-    // If evaluations of the assignment may conflict with the yield
-    // evaluations, we have to save yield evaluation.
-    // For example, a WHERE mask might be written by the masked assignment
-    // evaluations, and it has to be saved in this case:
-    //   where (mask) r = f() ! function f modifies mask
-    saveEvaluation(yieldRegion, effects,
-                   anyNonLocalWrite(effects, yieldRegion));
   } else {
-    // Can be executed while doing the assignment.
-    independentEvaluationEffects.append(effects.begin(), effects.end());
+    ConflictKind conflictKind = conflict(effects, assignEffects);
+    if (conflictKind.isAny()) {
+      // Region that conflicts with the current assignments must be fully
+      // evaluated and saved before doing the assignment (Note that it may
+      // have already been evaluated without saving it before, but this
+      // implies that it never conflicted with a prior assignment, so its value
+      // should be the same.)
+      saveEvaluation(yieldRegion, effects, /*anyWrite=*/false);
+    } else {
+      if (conflictKind.isAligned())
+        pendingAlignedRegions.push_back(&yieldRegion);
+
+      if (evaluationsMayConflict &&
+          !conflict(effects, assignEvaluateEffects).isNone()) {
+        // If evaluations of the assignment may conflict with the yield
+        // evaluations, we have to save yield evaluation.
+        // For example, a WHERE mask might be written by the masked assignment
+        // evaluations, and it has to be saved in this case:
+        //   where (mask) r = f() ! function f modifies mask
+        saveEvaluation(yieldRegion, effects,
+                       anyNonLocalWrite(effects, yieldRegion));
+      } else {
+        // Can be executed while doing the assignment.
+        independentEvaluationEffects.append(effects.begin(), effects.end());
+      }
+    }
   }
 }
 
 void Scheduler::saveEvaluation(
     mlir::Region &yieldRegion,
-    llvm::ArrayRef<mlir::MemoryEffects::EffectInstance> effects,
-    bool anyWrite) {
+    llvm::ArrayRef<hlfir::DetailedEffectInstance> effects, bool anyWrite) {
   savedAnyRegionForCurrentAssignment = true;
+  auto &state = regionStates[&yieldRegion];
+  if (state.modifiedInRun) {
+    // The region was modified in a previous run, but we now realize we need its
+    // value. We must save it before that modification run.
+    auto &newRun = *schedule.emplace(*state.modifiedInRun, hlfir::Run{});
+    newRun.actions.emplace_back(hlfir::SaveEntity{&yieldRegion});
+    // We do not have the parent effects from that time easily available here.
+    // However, since we are saving a parent of the current assignment, its
+    // parents are also parents of the current assignment.
+    newRun.memoryEffects.append(parentEvaluationEffects.begin(),
+                                parentEvaluationEffects.end());
+    newRun.memoryEffects.append(effects.begin(), effects.end());
+    state.saved = true;
+    LLVM_DEBUG(
+        logSaveEvaluation(llvm::dbgs(), /*runid=*/0, yieldRegion, anyWrite););
+    return;
+  }
+
   if (anyWrite) {
     // Create a new run just for regions with side effect. Further analysis
     // could try to prove the effects do not conflict with the previous
@@ -465,7 +708,7 @@ void Scheduler::saveEvaluation(
   schedule.back().memoryEffects.append(parentEvaluationEffects.begin(),
                                        parentEvaluationEffects.end());
   schedule.back().memoryEffects.append(effects.begin(), effects.end());
-  savedRegions.insert(&yieldRegion);
+  state.saved = true;
   LLVM_DEBUG(
       logSaveEvaluation(llvm::dbgs(), schedule.size(), yieldRegion, anyWrite););
 }
@@ -476,18 +719,78 @@ bool Scheduler::canFuseAssignmentWithPreviousRun() {
   if (savedAnyRegionForCurrentAssignment || schedule.empty())
     return false;
   auto &previousRunEffects = schedule.back().memoryEffects;
-  return !conflict(previousRunEffects, assignEffects) &&
-         !conflict(previousRunEffects, parentEvaluationEffects) &&
-         !conflict(previousRunEffects, independentEvaluationEffects);
+  return !conflict(previousRunEffects, assignEffects).isAny() &&
+         !conflict(previousRunEffects, parentEvaluationEffects).isAny() &&
+         !conflict(previousRunEffects, independentEvaluationEffects).isAny();
+}
+
+/// Gather the parents of (not included) \p node in reverse execution order.
+static void gatherParents(
+    hlfir::OrderedAssignmentTreeOpInterface node,
+    llvm::SmallVectorImpl<hlfir::OrderedAssignmentTreeOpInterface> &parents) {
+  while (node) {
+    auto parent =
+        mlir::dyn_cast_or_null<hlfir::OrderedAssignmentTreeOpInterface>(
+            node->getParentOp());
+    if (parent && parent.getSubTreeRegion() == node->getParentRegion()) {
+      parents.push_back(parent);
+      node = parent;
+    } else {
+      break;
+    }
+  }
 }
 
-void Scheduler::finishSchedulingAssignment(hlfir::RegionAssignOp assign) {
-  // For now, always schedule each assignment in its own run. They could
-  // be done as part of previous assignment runs if it is proven they have
-  // no conflicting effects.
+// Build the list of the parent nodes for this assignment. The list is built
+// from the closest parent until the ordered assignment tree root (this is the
+// reverse of their execution order).
+static void gatherAssignmentParents(
+    hlfir::RegionAssignOp assign,
+    llvm::SmallVectorImpl<hlfir::OrderedAssignmentTreeOpInterface> &parents) {
+  gatherParents(mlir::cast<hlfir::OrderedAssignmentTreeOpInterface>(
+                    assign.getOperation()),
+                parents);
+}
+
+void Scheduler::finishSchedulingAssignment(hlfir::RegionAssignOp assign,
+                                           bool leafRegionsMayOnlyRead) {
+  // Schedule the assignment in a new run, unless it can be fused with the
+  // previous run (if enabled and proven safe).
   currentRunIsReadOnly = false;
-  if (!tryFusingAssignments || !canFuseAssignmentWithPreviousRun())
+  bool fuse = tryFusingAssignments && canFuseAssignmentWithPreviousRun();
+  if (!fuse) {
+    // If we cannot fuse, we are about to start a new run.
+    // Check if any parent region was modified in a previous run and needs to be
+    // saved.
+    llvm::SmallVector<hlfir::OrderedAssignmentTreeOpInterface> parents;
+    gatherAssignmentParents(assign, parents);
+    for (auto parent : parents) {
+      llvm::SmallVector<mlir::Region *, 4> yieldRegions;
+      parent.getLeafRegions(yieldRegions);
+      for (mlir::Region *yieldRegion : yieldRegions) {
+        if (regionStates[yieldRegion].modifiedInRun &&
+            !regionStates[yieldRegion].saved) {
+          LLVM_DEBUG(logRetroactiveSave(
+              llvm::dbgs(), *yieldRegion,
+              **regionStates[yieldRegion].modifiedInRun, assign));
+          llvm::SmallVector<hlfir::DetailedEffectInstance> effects;
+          gatherMemoryEffects(*yieldRegion, leafRegionsMayOnlyRead, effects);
+          saveEvaluation(*yieldRegion, effects,
+                         anyNonLocalWrite(effects, *yieldRegion));
+        }
+      }
+    }
     schedule.emplace_back(hlfir::Run{});
+  }
+
+  // Mark pending aligned regions as modified in the current run (which is the
+  // last one).
+  auto runIt = std::prev(schedule.end());
+  for (mlir::Region *region : pendingAlignedRegions)
+    if (!regionStates[region].saved)
+      regionStates[region].modifiedInRun = runIt;
+  pendingAlignedRegions.clear();
+
   schedule.back().actions.emplace_back(assign);
   // TODO: when fusing, it would probably be best to filter the
   // parentEvaluationEffects that already in the previous run effects (since
@@ -530,34 +833,6 @@ gatherAssignments(hlfir::OrderedAssignmentTreeOpInterface root,
   }
 }
 
-/// Gather the parents of (not included) \p node in reverse execution order.
-static void gatherParents(
-    hlfir::OrderedAssignmentTreeOpInterface node,
-    llvm::SmallVectorImpl<hlfir::OrderedAssignmentTreeOpInterface> &parents) {
-  while (node) {
-    auto parent =
-        mlir::dyn_cast_or_null<hlfir::OrderedAssignmentTreeOpInterface>(
-            node->getParentOp());
-    if (parent && parent.getSubTreeRegion() == node->getParentRegion()) {
-      parents.push_back(parent);
-      node = parent;
-    } else {
-      break;
-    }
-  }
-}
-
-// Build the list of the parent nodes for this assignment. The list is built
-// from the closest parent until the ordered assignment tree root (this is the
-// revere of their execution order).
-static void gatherAssignmentParents(
-    hlfir::RegionAssignOp assign,
-    llvm::SmallVectorImpl<hlfir::OrderedAssignmentTreeOpInterface> &parents) {
-  gatherParents(mlir::cast<hlfir::OrderedAssignmentTreeOpInterface>(
-                    assign.getOperation()),
-                parents);
-}
-
 hlfir::Schedule
 hlfir::buildEvaluationSchedule(hlfir::OrderedAssignmentTreeOpInterface root,
                                bool tryFusingAssignments) {
@@ -616,7 +891,7 @@ hlfir::buildEvaluationSchedule(hlfir::OrderedAssignmentTreeOpInterface root,
                                          leafRegionsMayOnlyRead,
                                          /*yieldIsImplicitRead=*/false);
     scheduler.finishIndependentEvaluationGroup();
-    scheduler.finishSchedulingAssignment(assign);
+    scheduler.finishSchedulingAssignment(assign, leafRegionsMayOnlyRead);
   }
   return scheduler.moveSchedule();
 }
@@ -704,6 +979,25 @@ static llvm::raw_ostream &printRegionPath(llvm::raw_ostream &os,
   return printRegionId(os, yieldRegion);
 }
 
+[[maybe_unused]] static void
+logRetroactiveSave(llvm::raw_ostream &os, mlir::Region &yieldRegion,
+                   hlfir::Run &modifyingRun,
+                   hlfir::RegionAssignOp currentAssign) {
+  printRegionPath(os, yieldRegion) << " is modified in order by ";
+  bool first = true;
+  for (auto &action : modifyingRun.actions) {
+    if (auto *assign = std::get_if<hlfir::RegionAssignOp>(&action)) {
+      if (!first)
+        os << ", ";
+      printNodePath(os, assign->getOperation());
+      first = false;
+    }
+  }
+  os << " and is needed by ";
+  printNodePath(os, currentAssign.getOperation());
+  os << " that is scheduled in a later run\n";
+}
+
 [[maybe_unused]] static void logSaveEvaluation(llvm::raw_ostream &os,
                                                unsigned runid,
                                                mlir::Region &yieldRegion,
@@ -721,13 +1015,14 @@ logAssignmentEvaluation(llvm::raw_ostream &os, unsigned runid,
 
 [[maybe_unused]] static void logConflict(llvm::raw_ostream &os,
                                          mlir::Value writtenOrReadVarA,
-                                         mlir::Value writtenVarB) {
+                                         mlir::Value writtenVarB,
+                                         bool isAligned) {
   auto printIfValue = [&](mlir::Value var) -> llvm::raw_ostream & {
     if (!var)
       return os << "<unknown>";
     return os << var;
   };
-  os << "conflict: R/W: ";
+  os << "conflict" << (isAligned ? " (aligned)" : "") << ": R/W: ";
   printIfValue(writtenOrReadVarA) << " W:";
   printIfValue(writtenVarB) << "\n";
 }
@@ -743,9 +1038,9 @@ logStartScheduling(llvm::raw_ostream &os,
 }
 
 [[maybe_unused]] static void
-logIfUnkownEffectValue(llvm::raw_ostream &os,
-                       mlir::MemoryEffects::EffectInstance effect,
-                       mlir::Operation &op) {
+logIfUnknownEffectValue(llvm::raw_ostream &os,
+                        mlir::MemoryEffects::EffectInstance effect,
+                        mlir::Operation &op) {
   if (effect.getValue() != nullptr)
     return;
   os << "unknown effected value (";
diff --git a/flang/lib/Optimizer/HLFIR/Transforms/ScheduleOrderedAssignments.h b/flang/lib/Optimizer/HLFIR/Transforms/ScheduleOrderedAssignments.h
index 2ed242edc973a..2bf8a359fd227 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/ScheduleOrderedAssignments.h
+++ b/flang/lib/Optimizer/HLFIR/Transforms/ScheduleOrderedAssignments.h
@@ -15,9 +15,30 @@
 #define OPTIMIZER_HLFIR_TRANSFORM_SCHEDULEORDEREDASSIGNMENTS_H
 
 #include "flang/Optimizer/HLFIR/HLFIROps.h"
+#include "mlir/Interfaces/SideEffectInterfaces.h"
+#include <list>
 
 namespace hlfir {
 
+struct ElementalTree {
+  // Build an elemental tree given a masked region terminator.
+  static ElementalTree buildElementalTree(mlir::Operation &regionTerminator);
+  // Check if op is an ElementalOpInterface that is part of this elemental tree.
+  bool contains(mlir::Operation *op) const;
+
+  std::optional<bool> isOrdered(mlir::Operation *op) const;
+
+private:
+  void gatherElementalTree(hlfir::ElementalOpInterface elemental,
+                           bool isAppliedInOrder);
+  void insert(hlfir::ElementalOpInterface elementalOp, bool isAppliedInOrder);
+  // List of ElementalOpInterface operations forming this tree, as well as a
+  // Boolean to indicate if they are applied in order (that is, if their
+  // indexing space is the same as the one for the array yielded by the mask
+  // region that owns this tree).
+  llvm::SmallVector<std::pair<mlir::Operation *, bool>> tree;
+};
+
 /// Structure to represent that the value yielded by some region
 /// must be fully evaluated and saved for all index values at
 /// a given point of the ordered assignment tree evaluation.
@@ -29,6 +50,30 @@ struct SaveEntity {
   mlir::Value getSavedValue();
 };
 
+class DetailedEffectInstance {
+public:
+  DetailedEffectInstance(mlir::MemoryEffects::Effect *effect,
+                         mlir::OpOperand *value = nullptr,
+                         mlir::Value orderedElementalEffectOn = nullptr);
+  DetailedEffectInstance(mlir::MemoryEffects::EffectInstance effectInstance,
+                         mlir::Value orderedElementalEffectOn = nullptr);
+
+  static DetailedEffectInstance getArrayReadEffect(mlir::OpOperand *array);
+  static DetailedEffectInstance getArrayWriteEffect(mlir::OpOperand *array);
+
+  mlir::Value getValue() const { return effectInstance.getValue(); }
+  mlir::MemoryEffects::Effect *getEffect() const {
+    return effectInstance.getEffect();
+  }
+  mlir::Value getOrderedElementalEffectOn() const {
+    return orderedElementalEffectOn;
+  }
+
+private:
+  mlir::MemoryEffects::EffectInstance effectInstance;
+  mlir::Value orderedElementalEffectOn;
+};
+
 /// A run is a list of actions required to evaluate an ordered assignment tree
 /// that can be done in the same loop nest.
 /// The actions can evaluate and saves element values into temporary or evaluate
@@ -42,11 +87,11 @@ struct Run {
   /// the assignment part of an hlfir::RegionAssignOp.
   using Action = std::variant<hlfir::RegionAssignOp, SaveEntity>;
   llvm::SmallVector<Action> actions;
-  llvm::SmallVector<mlir::MemoryEffects::EffectInstance> memoryEffects;
+  llvm::SmallVector<DetailedEffectInstance> memoryEffects;
 };
 
 /// List of runs to be executed in order to evaluate an order assignment tree.
-using Schedule = llvm::SmallVector<Run>;
+using Schedule = std::list<Run>;
 
 /// Example of schedules and run, and what they mean:
 ///  Fortran: forall (i=i:10) x(i) = y(i)
diff --git a/flang/lib/Optimizer/Passes/Pipelines.cpp b/flang/lib/Optimizer/Passes/Pipelines.cpp
index 6812347a8d39b..f15b0fe20bd9b 100644
--- a/flang/lib/Optimizer/Passes/Pipelines.cpp
+++ b/flang/lib/Optimizer/Passes/Pipelines.cpp
@@ -290,7 +290,8 @@ void createHLFIRToFIRPassPipeline(mlir::PassManager &pm,
           pm, hlfir::createInlineHLFIRCopyIn);
     }
   }
-  pm.addPass(hlfir::createLowerHLFIROrderedAssignments());
+  pm.addPass(hlfir::createLowerHLFIROrderedAssignments(
+      {/*tryFusingAssignments=*/optLevel.isOptimizingForSpeed()}));
   pm.addPass(hlfir::createLowerHLFIRIntrinsics());
 
   hlfir::BufferizeHLFIROptions bufferizeOptions;
diff --git a/flang/test/HLFIR/order_assignments/inlined-stack-temp.fir b/flang/test/HLFIR/order_assignments/inlined-stack-temp.fir
index 0724d019537c0..064b12b9ed812 100644
--- a/flang/test/HLFIR/order_assignments/inlined-stack-temp.fir
+++ b/flang/test/HLFIR/order_assignments/inlined-stack-temp.fir
@@ -198,13 +198,15 @@ func.func @mask_and_rhs_conflict(%arg0: !fir.box<!fir.array<?xi32>>) {
 func.func @test_where_mask_save(%arg0: !fir.box<!fir.array<?xi32>>) {
   %c0 = arith.constant 0 : index
   %c42_i32 = arith.constant 42 : i32
+  %c1 = arith.constant 1 : index
   %0:2 = hlfir.declare %arg0 {uniq_name = "x"} : (!fir.box<!fir.array<?xi32>>) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
   hlfir.where {
     %1:3 = fir.box_dims %0#0, %c0 : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
     %2 = fir.shape %1#1 : (index) -> !fir.shape<1>
     %3 = hlfir.elemental %2 : (!fir.shape<1>) -> !hlfir.expr<?x!fir.logical<4>> {
     ^bb0(%arg1: index):
-      %4 = hlfir.designate %0#0 (%arg1)  : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
+      %add = arith.addi %arg1, %c1 : index
+      %4 = hlfir.designate %0#0 (%add)  : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
       %5 = fir.load %4 : !fir.ref<i32>
       %6 = arith.cmpi sgt, %5, %c42_i32 : i32
       %7 = fir.convert %6 : (i1) -> !fir.logical<4>
@@ -226,12 +228,14 @@ func.func @test_where_mask_save(%arg0: !fir.box<!fir.array<?xi32>>) {
 // CHECK-SAME:                                    %[[VAL_0:.*]]: !fir.box<!fir.array<?xi32>>) {
 // CHECK:           %[[VAL_1:.*]] = arith.constant 0 : index
 // CHECK:           %[[VAL_2:.*]] = arith.constant 42 : i32
+// CHECK:           %[[C1:.*]] = arith.constant 1 : index
 // CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "x"} : (!fir.box<!fir.array<?xi32>>) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
 // CHECK:           %[[VAL_4:.*]]:3 = fir.box_dims %[[VAL_3]]#0, %[[VAL_1]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
 // CHECK:           %[[VAL_5:.*]] = fir.shape %[[VAL_4]]#1 : (index) -> !fir.shape<1>
 // CHECK:           %[[VAL_6:.*]] = hlfir.elemental %[[VAL_5]] : (!fir.shape<1>) -> !hlfir.expr<?x!fir.logical<4>> {
 // CHECK:           ^bb0(%[[VAL_7:.*]]: index):
-// CHECK:             %[[VAL_8:.*]] = hlfir.designate %[[VAL_3]]#0 (%[[VAL_7]])  : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
+// CHECK:             %[[ADD:.*]] = arith.addi %[[VAL_7]], %[[C1]] : index
+// CHECK:             %[[VAL_8:.*]] = hlfir.designate %[[VAL_3]]#0 (%[[ADD]])  : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
 // CHECK:             %[[VAL_9:.*]] = fir.load %[[VAL_8]] : !fir.ref<i32>
 // CHECK:             %[[VAL_10:.*]] = arith.cmpi sgt, %[[VAL_9]], %[[VAL_2]] : i32
 // CHECK:             %[[VAL_11:.*]] = fir.convert %[[VAL_10]] : (i1) -> !fir.logical<4>
diff --git a/flang/test/HLFIR/order_assignments/where-array-sections.f90 b/flang/test/HLFIR/order_assignments/where-array-sections.f90
new file mode 100644
index 0000000000000..2d6195abdcce8
--- /dev/null
+++ b/flang/test/HLFIR/order_assignments/where-array-sections.f90
@@ -0,0 +1,90 @@
+! Test scheduling of WHERE with aligned array sections.
+
+!RUN: bbc -hlfir -o - -pass-pipeline="builtin.module(lower-hlfir-ordered-assignments{fuse-assignments=false})" --debug-only=flang-ordered-assignment -flang-dbg-order-assignment-schedule-only %s 2>&1 | FileCheck %s --check-prefix NOFUSE
+
+!RUN: bbc -hlfir -o - -pass-pipeline="builtin.module(lower-hlfir-ordered-assignments{fuse-assignments=true})" --debug-only=flang-ordered-assignment -flang-dbg-order-assignment-schedule-only %s 2>&1 | FileCheck %s --check-prefix FUSE
+
+!REQUIRES: asserts
+
+subroutine no_temps(var1, var2, var3)
+  implicit none
+  real, contiguous, dimension(:,:) :: var1, var2
+  real, contiguous, dimension(:) :: var3
+
+  where (var2(:,2) < 0.)
+    var2(:,1) = var2(:,1) + var2(:,2)
+    var2(:,1) = var2(:,2)
+    var3(:) = var3(:) - var2(:,2)
+    var2(:,2) = 0.
+  end where
+end
+
+subroutine must_create_mask_temp_if_not_fused(var1, var2, var3)
+  implicit none
+  real, contiguous, dimension(:,:) :: var1, var2
+  real, contiguous, dimension(:) :: var3
+
+  where (var2(:,2) < 0.)
+    var2(:,1) = var2(:,1) + var2(:,2)
+    var2(:,2) = 0. ! -> modifies mask 1-1 
+    var2(:,1) = var2(:,2)
+    var3(:) = var3(:) - var2(:,2)
+  end where
+end
+
+subroutine must_split_and_create_temps(var1, var2, var3)
+  implicit none
+  real, contiguous, dimension(:,:) :: var1, var2
+  real, contiguous, dimension(:) :: var3
+
+  where (var2(:,2) < 0.)
+    var2(:,1) = var2(:,1) + var2(:,2)
+    var2(:,2) = 0. ! -> modifies mask 1-1
+    ! RHS/LHS overlap requires saving RHS and splitting loops, which requires
+    ! also saving the mask before the assignment above.
+    var2(:,1) = var2(2,:) + var2(2,:)
+    var3(:) = var3(:) - var2(:,2)
+  end where
+end
+
+!NOFUSE-LABEL: ------------ scheduling where in _QPno_temps ------------
+!NOFUSE-NEXT: run 1 evaluate: where/region_assign1
+!NOFUSE-NEXT: run 2 evaluate: where/region_assign2
+!NOFUSE-NEXT: conflict (aligned): R/W: %{{.*}} = fir.box_addr %arg2 : (!fir.box<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>> W:%{{.*}} = fir.box_addr %arg2 : (!fir.box<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>>
+!NOFUSE-NEXT: run 3 evaluate: where/region_assign3
+!NOFUSE-NEXT: run 4 evaluate: where/region_assign4
+!NOFUSE-LABEL: ------------ scheduling where in _QPmust_create_mask_temp_if_not_fused ------------
+!NOFUSE-NEXT: run 1 evaluate: where/region_assign1
+!NOFUSE-NEXT: run 2 evaluate: where/region_assign2
+!NOFUSE-NEXT: run 3 evaluate: where/region_assign3
+!NOFUSE-NEXT: conflict (aligned): R/W: %{{.*}} = fir.box_addr %arg2 : (!fir.box<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>> W:%{{.*}} = fir.box_addr %arg2 : (!fir.box<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>>
+!NOFUSE-NEXT: run 4 evaluate: where/region_assign4
+!NOFUSE-LABEL: ------------ scheduling where in _QPmust_split_and_create_temps ------------
+!NOFUSE-NEXT: run 1 evaluate: where/region_assign1
+!NOFUSE-NEXT: run 2 evaluate: where/region_assign2
+!NOFUSE-NEXT: conflicting arrays:%{{.*}} and %{{.*}}
+!NOFUSE-NEXT: run 3 save    : where/region_assign3/rhs
+!NOFUSE-NEXT: run 4 evaluate: where/region_assign3
+!NOFUSE-NEXT: conflict (aligned): R/W: %{{.*}} = fir.box_addr %arg2 : (!fir.box<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>> W:%{{.*}} = fir.box_addr %arg2 : (!fir.box<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>>
+!NOFUSE-NEXT: run 5 evaluate: where/region_assign4
+
+!FUSE-LABEL: ------------ scheduling where in _QPno_temps ------------
+!FUSE-NEXT: run 1 evaluate: where/region_assign1
+!FUSE-NEXT: run 1 evaluate: where/region_assign2
+!FUSE-NEXT: conflict (aligned): R/W: %{{.*}} = fir.box_addr %arg2 : (!fir.box<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>> W:%{{.*}} = fir.box_addr %arg2 : (!fir.box<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>>
+!FUSE-NEXT: run 1 evaluate: where/region_assign3
+!FUSE-NEXT: run 1 evaluate: where/region_assign4
+!FUSE-LABEL: ------------ scheduling where in _QPmust_create_mask_temp_if_not_fused ------------
+!FUSE-NEXT: run 1 evaluate: where/region_assign1
+!FUSE-NEXT: run 1 evaluate: where/region_assign2
+!FUSE-NEXT: run 1 evaluate: where/region_assign3
+!FUSE-NEXT: conflict (aligned): R/W: %{{.*}} = fir.box_addr %arg2 : (!fir.box<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>> W:%{{.*}} = fir.box_addr %arg2 : (!fir.box<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>>
+!FUSE-NEXT: run 1 evaluate: where/region_assign4
+!FUSE-LABEL: ------------ scheduling where in _QPmust_split_and_create_temps ------------
+!FUSE-NEXT: run 1 evaluate: where/region_assign1
+!FUSE-NEXT: run 1 evaluate: where/region_assign2
+!FUSE-NEXT: conflicting arrays:%{{.*}} and %{{.*}}
+!FUSE-NEXT: run 2 save    : where/region_assign3/rhs
+!FUSE-NEXT: run 3 evaluate: where/region_assign3
+!FUSE-NEXT: conflict (aligned): R/W: %{{.*}} = fir.box_addr %arg2 : (!fir.box<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>> W:%{{.*}} = fir.box_addr %arg2 : (!fir.box<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>>
+!FUSE-NEXT: run 3 evaluate: where/region_assign4
diff --git a/flang/test/HLFIR/order_assignments/where-fusing-scheduling.f90 b/flang/test/HLFIR/order_assignments/where-fusing-scheduling.f90
index 08d4092b49aef..1de457f974508 100644
--- a/flang/test/HLFIR/order_assignments/where-fusing-scheduling.f90
+++ b/flang/test/HLFIR/order_assignments/where-fusing-scheduling.f90
@@ -37,5 +37,6 @@ subroutine unfusable(x, y, mask)
 !FUSE-NEXT: run 1 evaluate: where/region_assign2
 !FUSE-LABEL: ------------ scheduling where in _QPunfusable ------------
 !FUSE-NEXT: run 1 evaluate: where/region_assign1
-!FUSE-NEXT: conflict: R/W: <block argument> of type '!fir.box<!fir.array<?xf32>>' at index: 1 W:<block argument> of type '!fir.box<!fir.array<?xf32>>' at index: 1
+!FUSE-NEXT: conflict (aligned): R/W: <block argument> of type '!fir.box<!fir.array<?xf32>>' at index: 1 W:<block argument> of type '!fir.box<!fir.array<?xf32>>' at index: 1
+!FUSE-NEXT: conflicting arrays:{{.*}} and {{.*}}
 !FUSE-NEXT: run 2 evaluate: where/region_assign2
diff --git a/flang/test/HLFIR/order_assignments/where-scheduling.f90 b/flang/test/HLFIR/order_assignments/where-scheduling.f90
index 6feaba0d3389a..496789334b84e 100644
--- a/flang/test/HLFIR/order_assignments/where-scheduling.f90
+++ b/flang/test/HLFIR/order_assignments/where-scheduling.f90
@@ -127,12 +127,30 @@ end function f
   end where
 end subroutine
 
+subroutine where_construct_need_to_be_split_no_temps(x, y)
+  real :: x(:, :), y(:, :)
+  where (y.gt.0.)
+    x = y
+  elsewhere (x(ubound(x,1):1:-1, :).gt.0)
+    y = x
+  end where
+end subroutine
+
+subroutine where_construct_need_to_be_split_with_temps(x, y)
+  real :: x(:, :), y(:, :)
+  where (y.gt.0.)
+    x = y
+    y = 0.
+  elsewhere (x(ubound(x,1):1:-1, :).gt.0)
+    y = x
+  end where
+end subroutine
+
 !CHECK-LABEL: ------------ scheduling where in _QPno_conflict ------------
 !CHECK-NEXT: run 1 evaluate: where/region_assign1
 !CHECK-LABEL: ------------ scheduling where in _QPfake_conflict ------------
-!CHECK-NEXT: conflict: R/W: <block argument> of type '!fir.box<!fir.array<?xf32>>' at index: 0 W:<block argument> of type '!fir.box<!fir.array<?xf32>>' at index: 0
-!CHECK-NEXT: run 1 save    : where/mask
-!CHECK-NEXT: run 2 evaluate: where/region_assign1
+!CHECK-NEXT: conflict (aligned): R/W: <block argument> of type '!fir.box<!fir.array<?xf32>>' at index: 0 W:<block argument> of type '!fir.box<!fir.array<?xf32>>' at index: 0
+!CHECK-NEXT: run 1 evaluate: where/region_assign1
 !CHECK-LABEL: ------------ scheduling where in _QPonly_once ------------
 !CHECK-NEXT: unknown effect: %11 = fir.call @_QPcall_me_only_once() fastmath<contract> : () -> !fir.array<10x!fir.logical<4>>
 !CHECK-NEXT: saving eval because write effect prevents re-evaluation
@@ -148,24 +166,22 @@ end function f
 !CHECK-NEXT: run 2 evaluate: where/elsewhere1/region_assign1
 !CHECK-LABEL: ------------ scheduling where in _QPwhere_construct_conflict ------------
 !CHECK-NEXT: run 1 evaluate: where/region_assign1
-!CHECK-NEXT: conflict: R/W: <block argument> of type '!fir.box<!fir.array<?x?xf32>>' at index: 1 W:<block argument> of type '!fir.box<!fir.array<?x?xf32>>' at index: 1
-!CHECK-NEXT: run 2 save    : where/mask
-!CHECK-NEXT: run 3 evaluate: where/elsewhere1/region_assign1
+!CHECK-NEXT: conflict (aligned): R/W: <block argument> of type '!fir.box<!fir.array<?x?xf32>>' at index: 1 W:<block argument> of type '!fir.box<!fir.array<?x?xf32>>' at index: 1
+!CHECK-NEXT: run 2 evaluate: where/elsewhere1/region_assign1
 !CHECK-LABEL: ------------ scheduling where in _QPwhere_construct_conflict_2 ------------
-!CHECK-NEXT: conflict: R/W: <block argument> of type '!fir.box<!fir.array<?x?xf32>>' at index: 0 W:<block argument> of type '!fir.box<!fir.array<?x?xf32>>' at index: 0
-!CHECK-NEXT: run 1 save    : where/mask
-!CHECK-NEXT: run 2 evaluate: where/region_assign1
-!CHECK-NEXT: conflict: R/W: <block argument> of type '!fir.box<!fir.array<?x?xf32>>' at index: 1 W:<block argument> of type '!fir.box<!fir.array<?x?xf32>>' at index: 1
-!CHECK-NEXT: run 3 save    : where/elsewhere1/mask
-!CHECK-NEXT: run 4 evaluate: where/elsewhere1/region_assign1
+!CHECK-NEXT: conflict (aligned): R/W: <block argument> of type '!fir.box<!fir.array<?x?xf32>>' at index: 0 W:<block argument> of type '!fir.box<!fir.array<?x?xf32>>' at index: 0
+!CHECK-NEXT: run 1 evaluate: where/region_assign1
+!CHECK-NEXT: conflict (aligned): R/W: <block argument> of type '!fir.box<!fir.array<?x?xf32>>' at index: 1 W:<block argument> of type '!fir.box<!fir.array<?x?xf32>>' at index: 1
+!CHECK-NEXT: where/mask is modified in order by where/region_assign1 and is needed by where/elsewhere1/region_assign1 that is scheduled in a later run
+!CHECK-NEXT: run 0 save    : where/mask
+!CHECK-NEXT: run 3 evaluate: where/elsewhere1/region_assign1
 !CHECK-LABEL: ------------ scheduling where in _QPwhere_vector_subscript_conflict_1 ------------
 !CHECK-NEXT: conflict: R/W: <block argument> of type '!fir.ref<!fir.array<10xf32>>' at index: 0 W:<block argument> of type '!fir.ref<!fir.array<10xf32>>' at index: 0
 !CHECK-NEXT: run 1 save    : where/mask
 !CHECK-NEXT: run 2 evaluate: where/region_assign1
 !CHECK-LABEL: ------------ scheduling where in _QPwhere_vector_subscript_conflict_2 ------------
-!CHECK-NEXT: conflict: R/W: <block argument> of type '!fir.ref<!fir.array<10xi32>>' at index: 0 W:<block argument> of type '!fir.ref<!fir.array<10xi32>>' at index: 0
-!CHECK-NEXT: run 1 save    : where/mask
-!CHECK-NEXT: run 2 evaluate: where/region_assign1
+!CHECK-NEXT: conflict (aligned): R/W: <block argument> of type '!fir.ref<!fir.array<10xi32>>' at index: 0 W:<block argument> of type '!fir.ref<!fir.array<10xi32>>' at index: 0
+!CHECK-NEXT: run 1 evaluate: where/region_assign1
 !CHECK-LABEL: ------------ scheduling forall in _QPwhere_in_forall_conflict ------------
 !CHECK-NEXT: conflict: R/W: <block argument> of type '!fir.box<!fir.array<?x?xf32>>' at index: 0 W:<block argument> of type '!fir.box<!fir.array<?x?xf32>>' at index: 0
 !CHECK-NEXT: run 1 save    : forall/where1/mask
@@ -195,3 +211,15 @@ end function f
 !CHECK-NEXT: saving eval because write effect prevents re-evaluation
 !CHECK-NEXT: run 3 save  (w): where/elsewhere1/region_assign1/rhs
 !CHECK-NEXT: run 4 evaluate: where/elsewhere1/region_assign1
+!CHECK-LABEL: ------------ scheduling where in _QPwhere_construct_need_to_be_split_no_temps ------------
+!CHECK-NEXT: run 1 evaluate: where/region_assign1
+!CHECK-NEXT: conflict (aligned): R/W: <block argument> of type '!fir.box<!fir.array<?x?xf32>>' at index: 1 W:<block argument> of type '!fir.box<!fir.array<?x?xf32>>' at index: 1
+!CHECK-NEXT: run 2 evaluate: where/elsewhere1/region_assign1
+!CHECK-LABEL: ------------ scheduling where in _QPwhere_construct_need_to_be_split_with_temps ------------
+!CHECK-NEXT: run 1 evaluate: where/region_assign1
+!CHECK-NEXT: conflict (aligned): R/W: <block argument> of type '!fir.box<!fir.array<?x?xf32>>' at index: 1 W:<block argument> of type '!fir.box<!fir.array<?x?xf32>>' at index: 1
+!CHECK-NEXT: run 2 evaluate: where/region_assign2
+!CHECK-NEXT: conflict (aligned): R/W: <block argument> of type '!fir.box<!fir.array<?x?xf32>>' at index: 1 W:<block argument> of type '!fir.box<!fir.array<?x?xf32>>' at index: 1
+!CHECK-NEXT: where/mask is modified in order by where/region_assign2 and is needed by where/elsewhere1/region_assign1 that is scheduled in a later run
+!CHECK-NEXT: run 0 save    : where/mask
+!CHECK-NEXT: run 4 evaluate: where/elsewhere1/region_assign1

>From 13cdecd5f1d2f318552a4c47737d951a773e5b54 Mon Sep 17 00:00:00 2001
From: Jean Perier <jperier at nvidia.com>
Date: Mon, 9 Feb 2026 02:24:05 -0800
Subject: [PATCH 3/4] handle the EitherIdenticalOrDisjoint as assuming
 identical

---
 .../Transforms/ScheduleOrderedAssignments.cpp | 27 ++++++----
 .../where-array-sections.f90                  | 50 ++++++++++++++++---
 2 files changed, 60 insertions(+), 17 deletions(-)

diff --git a/flang/lib/Optimizer/HLFIR/Transforms/ScheduleOrderedAssignments.cpp b/flang/lib/Optimizer/HLFIR/Transforms/ScheduleOrderedAssignments.cpp
index c98d967969156..29a1a3f3d1016 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/ScheduleOrderedAssignments.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/ScheduleOrderedAssignments.cpp
@@ -364,13 +364,16 @@ static void gatherMemoryEffects(
     mlir::Region &region, bool mayOnlyRead,
     llvm::SmallVectorImpl<hlfir::DetailedEffectInstance> &effects) {
   if (!region.getParentOfType<hlfir::ForallOp>()) {
-    // TODO: While FORALL assignments may be array assignments, the iteration
-    // space is also driven by the FORALL indices, and it would take a bit more
-    // cups of coffee to prove that it is OK consider that "aligned" access
-    // conflicts can be ignored if they are evaluated inside the same loops.
-    // Besides, it would probably make sense to also deal with "aligned scalar"
-    // access for them like in "forall (i=1:10) x(i) = x(i) + 1".  For now this
-    // feature is disabled for inside FORALL.
+    // TODO: leverage array access analysis for FORALL.
+    // While FORALL assignments can be array assignments, the iteration space
+    // is also driven by the FORALL indices, so the way ArraySectionAnalyzer
+    // results are used is not adequate for it.
+    // For instance "disjoint" array access cannot be ignored in:
+    // "forall (i=1:10) x(i+1,:) = x(i,:)".
+    // While identical access can probably also be accepted, this would deserve
+    // more thinking. It would probably make sense to also deal with "aligned
+    // scalar" access for them like in "forall (i=1:10) x(i) = x(i) + 1".  For
+    // now this feature is disabled inside FORALL.
     hlfir::ElementalTree tree =
         hlfir::ElementalTree::buildElementalTree(region.back().back());
     gatherMemoryEffectsImpl(region, mayOnlyRead, effects, &tree);
@@ -528,15 +531,17 @@ anyRAWorWAW(llvm::ArrayRef<hlfir::DetailedEffectInstance> effectsA,
               }
               auto overlap = fir::ArraySectionAnalyzer::analyze(arrayA, arrayB);
               if (overlap == fir::ArraySectionAnalyzer::SlicesOverlapKind::
-                                 DefinitelyIdentical) {
+                                 DefinitelyDisjoint)
+                continue;
+              if (overlap == fir::ArraySectionAnalyzer::SlicesOverlapKind::
+                                 DefinitelyIdentical ||
+                  overlap == fir::ArraySectionAnalyzer::SlicesOverlapKind::
+                                 EitherIdenticalOrDisjoint) {
                 result = result || ConflictKind::aligned();
                 LLVM_DEBUG(logConflict(llvm::dbgs(), writtenOrReadVarA,
                                        writtenVarB, /*isAligned=*/true));
                 continue;
               }
-              if (overlap !=
-                  fir::ArraySectionAnalyzer::SlicesOverlapKind::Unknown)
-                continue;
               LLVM_DEBUG(llvm::dbgs() << "conflicting arrays:" << arrayA
                                       << " and " << arrayB << "\n");
               return ConflictKind::any();
diff --git a/flang/test/HLFIR/order_assignments/where-array-sections.f90 b/flang/test/HLFIR/order_assignments/where-array-sections.f90
index 2d6195abdcce8..aab264d6e105e 100644
--- a/flang/test/HLFIR/order_assignments/where-array-sections.f90
+++ b/flang/test/HLFIR/order_assignments/where-array-sections.f90
@@ -48,43 +48,81 @@ subroutine must_split_and_create_temps(var1, var2, var3)
 end
 
 !NOFUSE-LABEL: ------------ scheduling where in _QPno_temps ------------
+!NOFUSE-NEXT: conflict (aligned): R/W: %{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>> W:%{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>>
 !NOFUSE-NEXT: run 1 evaluate: where/region_assign1
 !NOFUSE-NEXT: run 2 evaluate: where/region_assign2
 !NOFUSE-NEXT: conflict (aligned): R/W: %{{.*}} = fir.box_addr %arg2 : (!fir.box<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>> W:%{{.*}} = fir.box_addr %arg2 : (!fir.box<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>>
 !NOFUSE-NEXT: run 3 evaluate: where/region_assign3
+!NOFUSE-NEXT: conflict (aligned): R/W: %{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>> W:%{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>>
 !NOFUSE-NEXT: run 4 evaluate: where/region_assign4
 !NOFUSE-LABEL: ------------ scheduling where in _QPmust_create_mask_temp_if_not_fused ------------
+!NOFUSE-NEXT: conflict (aligned): R/W: %{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>> W:%{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>>
 !NOFUSE-NEXT: run 1 evaluate: where/region_assign1
+!NOFUSE-NEXT: conflict (aligned): R/W: %{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>> W:%{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>>
 !NOFUSE-NEXT: run 2 evaluate: where/region_assign2
-!NOFUSE-NEXT: run 3 evaluate: where/region_assign3
+!NOFUSE-NEXT: where/mask is modified in order by where/region_assign2 and is needed by where/region_assign3 that is scheduled in a later run
+!NOFUSE-NEXT: run 0 save    : where/mask
+!NOFUSE-NEXT: run 4 evaluate: where/region_assign3
 !NOFUSE-NEXT: conflict (aligned): R/W: %{{.*}} = fir.box_addr %arg2 : (!fir.box<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>> W:%{{.*}} = fir.box_addr %arg2 : (!fir.box<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>>
-!NOFUSE-NEXT: run 4 evaluate: where/region_assign4
+!NOFUSE-NEXT: run 5 evaluate: where/region_assign4
 !NOFUSE-LABEL: ------------ scheduling where in _QPmust_split_and_create_temps ------------
+!NOFUSE-NEXT: conflict (aligned): R/W: %{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>> W:%{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>>
 !NOFUSE-NEXT: run 1 evaluate: where/region_assign1
+!NOFUSE-NEXT: conflict (aligned): R/W: %{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>> W:%{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>>
 !NOFUSE-NEXT: run 2 evaluate: where/region_assign2
 !NOFUSE-NEXT: conflicting arrays:%{{.*}} and %{{.*}}
 !NOFUSE-NEXT: run 3 save    : where/region_assign3/rhs
-!NOFUSE-NEXT: run 4 evaluate: where/region_assign3
+!NOFUSE-NEXT: where/mask is modified in order by where/region_assign2 and is needed by where/region_assign3 that is scheduled in a later run
+!NOFUSE-NEXT: run 0 save    : where/mask
+!NOFUSE-NEXT: run 5 evaluate: where/region_assign3
 !NOFUSE-NEXT: conflict (aligned): R/W: %{{.*}} = fir.box_addr %arg2 : (!fir.box<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>> W:%{{.*}} = fir.box_addr %arg2 : (!fir.box<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>>
-!NOFUSE-NEXT: run 5 evaluate: where/region_assign4
+!NOFUSE-NEXT: run 6 evaluate: where/region_assign4
 
 !FUSE-LABEL: ------------ scheduling where in _QPno_temps ------------
+!FUSE-NEXT: conflict (aligned): R/W: %{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>> W:%{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>>
 !FUSE-NEXT: run 1 evaluate: where/region_assign1
+!FUSE-NEXT: conflict (aligned): R/W: %{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>> W:%{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>>
+!FUSE-NEXT: conflict (aligned): R/W: %{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>> W:%{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>>
+!FUSE-NEXT: conflict (aligned): R/W: %{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>> W:%{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>>
 !FUSE-NEXT: run 1 evaluate: where/region_assign2
 !FUSE-NEXT: conflict (aligned): R/W: %{{.*}} = fir.box_addr %arg2 : (!fir.box<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>> W:%{{.*}} = fir.box_addr %arg2 : (!fir.box<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>>
 !FUSE-NEXT: run 1 evaluate: where/region_assign3
+!FUSE-NEXT: conflict (aligned): R/W: %{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>> W:%{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>>
+!FUSE-NEXT: conflict (aligned): R/W: %{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>> W:%{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>>
+!FUSE-NEXT: conflict (aligned): R/W: %{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>> W:%{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>>
+!FUSE-NEXT: conflict (aligned): R/W: %{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>> W:%{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>>
+!FUSE-NEXT: conflict (aligned): R/W: %{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>> W:%{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>>
+!FUSE-NEXT: conflict (aligned): R/W: %{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>> W:%{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>>
+!FUSE-NEXT: conflict (aligned): R/W: %{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>> W:%{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>>
 !FUSE-NEXT: run 1 evaluate: where/region_assign4
 !FUSE-LABEL: ------------ scheduling where in _QPmust_create_mask_temp_if_not_fused ------------
+!FUSE-NEXT: conflict (aligned): R/W: %{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>> W:%{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>>
 !FUSE-NEXT: run 1 evaluate: where/region_assign1
+!FUSE-NEXT: conflict (aligned): R/W: %{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>> W:%{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>>
+!FUSE-NEXT: conflict (aligned): R/W: %{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>> W:%{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>>
+!FUSE-NEXT: conflict (aligned): R/W: %{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>> W:%{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>>
 !FUSE-NEXT: run 1 evaluate: where/region_assign2
+!FUSE-NEXT: conflict (aligned): R/W: %{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>> W:%{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>>
+!FUSE-NEXT: conflict (aligned): R/W: %{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>> W:%{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>>
+!FUSE-NEXT: conflict (aligned): R/W: %{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>> W:%{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>>
+!FUSE-NEXT: conflict (aligned): R/W: %{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>> W:%{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>>
+!FUSE-NEXT: conflict (aligned): R/W: %{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>> W:%{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>>
 !FUSE-NEXT: run 1 evaluate: where/region_assign3
 !FUSE-NEXT: conflict (aligned): R/W: %{{.*}} = fir.box_addr %arg2 : (!fir.box<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>> W:%{{.*}} = fir.box_addr %arg2 : (!fir.box<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>>
+!FUSE-NEXT: conflict (aligned): R/W: %{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>> W:%{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>>
+!FUSE-NEXT: conflict (aligned): R/W: %{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>> W:%{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>>
 !FUSE-NEXT: run 1 evaluate: where/region_assign4
 !FUSE-LABEL: ------------ scheduling where in _QPmust_split_and_create_temps ------------
+!FUSE-NEXT: conflict (aligned): R/W: %{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>> W:%{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>>
 !FUSE-NEXT: run 1 evaluate: where/region_assign1
+!FUSE-NEXT: conflict (aligned): R/W: %{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>> W:%{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>>
+!FUSE-NEXT: conflict (aligned): R/W: %{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>> W:%{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>>
+!FUSE-NEXT: conflict (aligned): R/W: %{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>> W:%{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>>
 !FUSE-NEXT: run 1 evaluate: where/region_assign2
 !FUSE-NEXT: conflicting arrays:%{{.*}} and %{{.*}}
 !FUSE-NEXT: run 2 save    : where/region_assign3/rhs
-!FUSE-NEXT: run 3 evaluate: where/region_assign3
+!FUSE-NEXT: where/mask is modified in order by where/region_assign1, where/region_assign2 and is needed by where/region_assign3 that is scheduled in a later run
+!FUSE-NEXT: run 0 save    : where/mask
+!FUSE-NEXT: run 4 evaluate: where/region_assign3
 !FUSE-NEXT: conflict (aligned): R/W: %{{.*}} = fir.box_addr %arg2 : (!fir.box<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>> W:%{{.*}} = fir.box_addr %arg2 : (!fir.box<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>>
-!FUSE-NEXT: run 3 evaluate: where/region_assign4
+!FUSE-NEXT: run 4 evaluate: where/region_assign4

>From 1a287bd5e527a0056c7c430e0e64e6035614b2f8 Mon Sep 17 00:00:00 2001
From: Jean Perier <jperier at nvidia.com>
Date: Tue, 10 Feb 2026 01:52:43 -0800
Subject: [PATCH 4/4] add and fix comments

---
 .../Transforms/ScheduleOrderedAssignments.cpp   | 17 ++++++++++++++++-
 .../Transforms/ScheduleOrderedAssignments.h     |  9 ++++++++-
 2 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/flang/lib/Optimizer/HLFIR/Transforms/ScheduleOrderedAssignments.cpp b/flang/lib/Optimizer/HLFIR/Transforms/ScheduleOrderedAssignments.cpp
index 29a1a3f3d1016..6bc5317b25d7a 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/ScheduleOrderedAssignments.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/ScheduleOrderedAssignments.cpp
@@ -469,8 +469,23 @@ static mlir::Value getStorageSource(mlir::Value var) {
 }
 
 namespace {
+
+/// Class to represent conflicts between several accesses (effects) to a memory
+/// location (read after write, write after write).
 struct ConflictKind {
-  enum Kind { None, Aligned, Any };
+  enum Kind {
+    // None: The effects are not affecting the same memory location, or they are
+    // all reads.
+    None,
+    // Aligned: There are both read and write effects affecting the same memory
+    // location, but it is known that these effects are all accessing the memory
+    // location element by element in array order. This means the conflict does
+    // not introduce loop-carried dependencies.
+    Aligned,
+    // Any: There may be both read and write effects affecting the same memory
+    // location in any way.
+    Any
+  };
   Kind kind;
 
   ConflictKind(Kind k) : kind(k) {}
diff --git a/flang/lib/Optimizer/HLFIR/Transforms/ScheduleOrderedAssignments.h b/flang/lib/Optimizer/HLFIR/Transforms/ScheduleOrderedAssignments.h
index 2bf8a359fd227..7f479ab166b15 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/ScheduleOrderedAssignments.h
+++ b/flang/lib/Optimizer/HLFIR/Transforms/ScheduleOrderedAssignments.h
@@ -33,7 +33,7 @@ struct ElementalTree {
                            bool isAppliedInOrder);
   void insert(hlfir::ElementalOpInterface elementalOp, bool isAppliedInOrder);
   // List of ElementalOpInterface operation forming this tree, as well as a
-  // Boolean to indicate if they are applied in order (that is, if there
+  // Boolean to indicate if they are applied in order (that is, if their
   // indexing space is the same as the one for the array yielded by the mask
   // region that owns this tree).
   llvm::SmallVector<std::pair<mlir::Operation *, bool>> tree;
@@ -50,6 +50,11 @@ struct SaveEntity {
   mlir::Value getSavedValue();
 };
 
+/// Wrapper class around mlir::MemoryEffects::EffectInstance that
+/// allows providing an extra array value that indicates that the
+/// effect is done element by element in array order (one element
+/// accessed at each iteration of the ordered assignment iteration
+/// space).
 class DetailedEffectInstance {
 public:
   DetailedEffectInstance(mlir::MemoryEffects::Effect *effect,
@@ -71,6 +76,8 @@ class DetailedEffectInstance {
 
 private:
   mlir::MemoryEffects::EffectInstance effectInstance;
+  // Array whose elements are affected in array order by the
+  // ordered assignment iterations. Null value otherwise.
   mlir::Value orderedElementalEffectOn;
 };