[flang-commits] [flang] 25f5e97 - [flang] optimize WHERE with identical and disjoint array sections (#180279)

via flang-commits flang-commits at lists.llvm.org
Tue Feb 10 02:35:21 PST 2026


Author: jeanPerier
Date: 2026-02-10T11:35:15+01:00
New Revision: 25f5e9732784a9feec563c99ffcaa2b30b00ec72

URL: https://github.com/llvm/llvm-project/commit/25f5e9732784a9feec563c99ffcaa2b30b00ec72
DIFF: https://github.com/llvm/llvm-project/commit/25f5e9732784a9feec563c99ffcaa2b30b00ec72.diff

LOG: [flang] optimize WHERE with identical and disjoint array sections (#180279)

Improve `ScheduleOrderedAssignments` to avoid creating temporary storage
for masks in `WHERE` constructs when the mask modification is "aligned"
with the assignment (e.g., `where(a(i)>0) a(i)=...`).

- Identify "aligned" conflicts (identical array elements accessed in
order) using the `ArraySectionAnalyzer` that was extracted from
OptimizedBufferization.
- Defer saving regions with aligned conflicts, allowing fusion if
possible.
- Implement retroactive saving: if a region was modified in a previous
run (fused via aligned conflict) but is needed by a later split run,
insert a `SaveEntity` action before the modifying run.
- Use `std::list` for the schedule to support stable iterators for run
insertion.
- Update tests to verify fewer temporaries and correct retroactive
saves.
- Update flang pipeline at O2 and above to try fusing assignments in
WHERE/FORALL. This maximizes the chances that mask temps are not needed:
a mask variable cannot be reused in a later run/loop nest if it was
modified, even if all the accesses are in order, because being in order
only matters for accesses generated inside the same loop nest.

This fixes suboptimal code generation where temporaries were created
unnecessarily for common patterns like `where (x > 0) x = ...`.

Added: 
    flang/test/HLFIR/order_assignments/where-array-sections.f90

Modified: 
    flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIROrderedAssignments.cpp
    flang/lib/Optimizer/HLFIR/Transforms/ScheduleOrderedAssignments.cpp
    flang/lib/Optimizer/HLFIR/Transforms/ScheduleOrderedAssignments.h
    flang/lib/Optimizer/Passes/Pipelines.cpp
    flang/test/HLFIR/order_assignments/inlined-stack-temp.fir
    flang/test/HLFIR/order_assignments/where-fusing-scheduling.f90
    flang/test/HLFIR/order_assignments/where-scheduling.f90

Removed: 
    


################################################################################
diff  --git a/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIROrderedAssignments.cpp b/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIROrderedAssignments.cpp
index 71b4e91f0110d..a3fd19d95fbbc 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIROrderedAssignments.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIROrderedAssignments.cpp
@@ -96,7 +96,7 @@ struct MaskedArrayExpr {
   /// hlfir.elemental_addr that form the elemental tree producing
   /// the expression value. hlfir.elemental that produce values
   /// used inside transformational operations are not part of this set.
-  llvm::SmallPtrSet<mlir::Operation *, 4> elementalParts{};
+  hlfir::ElementalTree elementalParts;
   /// Was generateNoneElementalPart called?
   bool noneElementalPartWasGenerated = false;
   /// Is this expression the mask expression of the outer where statement?
@@ -900,62 +900,11 @@ bool OrderedAssignmentRewriter::isRequiredInCurrentRun(
   return false;
 }
 
-/// Is the apply using all the elemental indices in order?
-static bool isInOrderApply(hlfir::ApplyOp apply,
-                           hlfir::ElementalOpInterface elemental) {
-  mlir::Region::BlockArgListType elementalIndices = elemental.getIndices();
-  if (elementalIndices.size() != apply.getIndices().size())
-    return false;
-  for (auto [elementalIdx, applyIdx] :
-       llvm::zip(elementalIndices, apply.getIndices()))
-    if (elementalIdx != applyIdx)
-      return false;
-  return true;
-}
-
-/// Gather the tree of hlfir::ElementalOpInterface use-def, if any, starting
-/// from \p elemental, which may be a nullptr.
-static void
-gatherElementalTree(hlfir::ElementalOpInterface elemental,
-                    llvm::SmallPtrSetImpl<mlir::Operation *> &elementalOps,
-                    bool isOutOfOrder) {
-  if (elemental) {
-    // Only inline an applied elemental that must be executed in order if the
-    // applying indices are in order. An hlfir::Elemental may have been created
-    // for a transformational like transpose, and Fortran 2018 standard
-    // section 10.2.3.2, point 10 imply that impure elemental sub-expression
-    // evaluations should not be masked if they are the arguments of
-    // transformational expressions.
-    if (isOutOfOrder && elemental.isOrdered())
-      return;
-    elementalOps.insert(elemental.getOperation());
-    for (mlir::Operation &op : elemental.getElementalRegion().getOps())
-      if (auto apply = mlir::dyn_cast<hlfir::ApplyOp>(op)) {
-        bool isUnorderedApply =
-            isOutOfOrder || !isInOrderApply(apply, elemental);
-        auto maybeElemental =
-            mlir::dyn_cast_or_null<hlfir::ElementalOpInterface>(
-                apply.getExpr().getDefiningOp());
-        gatherElementalTree(maybeElemental, elementalOps, isUnorderedApply);
-      }
-  }
-}
-
 MaskedArrayExpr::MaskedArrayExpr(mlir::Location loc, mlir::Region &region,
                                  bool isOuterMaskExpr)
     : loc{loc}, region{region}, isOuterMaskExpr{isOuterMaskExpr} {
   mlir::Operation &terminator = region.back().back();
-  if (auto elementalAddr =
-          mlir::dyn_cast<hlfir::ElementalOpInterface>(terminator)) {
-    // Vector subscripted designator (hlfir.elemental_addr terminator).
-    gatherElementalTree(elementalAddr, elementalParts, /*isOutOfOrder=*/false);
-    return;
-  }
-  // Try if elemental expression.
-  mlir::Value entity = mlir::cast<hlfir::YieldOp>(terminator).getEntity();
-  auto maybeElemental = mlir::dyn_cast_or_null<hlfir::ElementalOpInterface>(
-      entity.getDefiningOp());
-  gatherElementalTree(maybeElemental, elementalParts, /*isOutOfOrder=*/false);
+  elementalParts = hlfir::ElementalTree::buildElementalTree(terminator);
 }
 
 void MaskedArrayExpr::generateNoneElementalPart(fir::FirOpBuilder &builder,

diff  --git a/flang/lib/Optimizer/HLFIR/Transforms/ScheduleOrderedAssignments.cpp b/flang/lib/Optimizer/HLFIR/Transforms/ScheduleOrderedAssignments.cpp
index 63a5803878a2d..6bc5317b25d7a 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/ScheduleOrderedAssignments.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/ScheduleOrderedAssignments.cpp
@@ -8,6 +8,7 @@
 
 #include "ScheduleOrderedAssignments.h"
 #include "flang/Optimizer/Analysis/AliasAnalysis.h"
+#include "flang/Optimizer/Analysis/ArraySectionAnalyzer.h"
 #include "flang/Optimizer/Builder/FIRBuilder.h"
 #include "flang/Optimizer/Builder/Todo.h"
 #include "flang/Optimizer/Dialect/Support/FIRContext.h"
@@ -23,7 +24,13 @@
 /// Log RAW or WAW conflict.
 [[maybe_unused]] static void logConflict(llvm::raw_ostream &os,
                                          mlir::Value writtenOrReadVarA,
-                                         mlir::Value writtenVarB);
+                                         mlir::Value writtenVarB,
+                                         bool isAligned = false);
+/// Log when a region must be retroactively saved.
+[[maybe_unused]] static void
+logRetroactiveSave(llvm::raw_ostream &os, mlir::Region &yieldRegion,
+                   hlfir::Run &modifyingRun,
+                   hlfir::RegionAssignOp currentAssign);
 /// Log when an expression evaluation must be saved.
 [[maybe_unused]] static void logSaveEvaluation(llvm::raw_ostream &os,
                                                unsigned runid,
@@ -39,15 +46,129 @@ logStartScheduling(llvm::raw_ostream &os,
                    hlfir::OrderedAssignmentTreeOpInterface root);
 /// Log op if effect value is not known.
 [[maybe_unused]] static void
-logIfUnkownEffectValue(llvm::raw_ostream &os,
-                       mlir::MemoryEffects::EffectInstance effect,
-                       mlir::Operation &op);
+logIfUnknownEffectValue(llvm::raw_ostream &os,
+                        mlir::MemoryEffects::EffectInstance effect,
+                        mlir::Operation &op);
 
 //===----------------------------------------------------------------------===//
 // Scheduling Implementation
 //===----------------------------------------------------------------------===//
 
+/// Is the apply using all the elemental indices in order?
+static bool isInOrderApply(hlfir::ApplyOp apply,
+                           hlfir::ElementalOpInterface elemental) {
+  mlir::Region::BlockArgListType elementalIndices = elemental.getIndices();
+  if (elementalIndices.size() != apply.getIndices().size())
+    return false;
+  for (auto [elementalIdx, applyIdx] :
+       llvm::zip(elementalIndices, apply.getIndices()))
+    if (elementalIdx != applyIdx)
+      return false;
+  return true;
+}
+
+hlfir::ElementalTree
+hlfir::ElementalTree::buildElementalTree(mlir::Operation &regionTerminator) {
+  ElementalTree tree;
+  if (auto elementalAddr =
+          mlir::dyn_cast<hlfir::ElementalOpInterface>(regionTerminator)) {
+    // Vector subscripted designator (hlfir.elemental_addr terminator).
+    tree.gatherElementalTree(elementalAddr, /*isAppliedInOrder=*/true);
+    return tree;
+  }
+  // Try if elemental expression.
+  if (auto yield = mlir::dyn_cast<hlfir::YieldOp>(regionTerminator)) {
+    mlir::Value entity = yield.getEntity();
+    if (auto maybeElemental =
+            mlir::dyn_cast_or_null<hlfir::ElementalOpInterface>(
+                entity.getDefiningOp()))
+      tree.gatherElementalTree(maybeElemental, /*isAppliedInOrder=*/true);
+  }
+  return tree;
+}
+
+// Check if op is an ElementalOpInterface that is part of this elemental tree.
+bool hlfir::ElementalTree::contains(mlir::Operation *op) const {
+  for (auto &p : tree)
+    if (p.first == op)
+      return true;
+  return false;
+}
+
+std::optional<bool> hlfir::ElementalTree::isOrdered(mlir::Operation *op) const {
+  for (auto &p : tree)
+    if (p.first == op)
+      return p.second;
+  return std::nullopt;
+}
+
+void hlfir::ElementalTree::gatherElementalTree(
+    hlfir::ElementalOpInterface elemental, bool isAppliedInOrder) {
+  if (!elemental)
+    return;
+  // Only inline an applied elemental that must be executed in order if the
+  // applying indices are in order. An hlfir::Elemental may have been created
+  // for a transformational like transpose, and Fortran 2018 standard
+  // section 10.2.3.2, point 10 imply that impure elemental sub-expression
+  // evaluations should not be masked if they are the arguments of
+  // transformational expressions.
+  if (!isAppliedInOrder && elemental.isOrdered())
+    return;
+
+  insert(elemental, isAppliedInOrder);
+  for (mlir::Operation &op : elemental.getElementalRegion().getOps())
+    if (auto apply = mlir::dyn_cast<hlfir::ApplyOp>(op)) {
+      bool isUnorderedApply =
+          !isAppliedInOrder || !isInOrderApply(apply, elemental);
+      auto maybeElemental = mlir::dyn_cast_or_null<hlfir::ElementalOpInterface>(
+          apply.getExpr().getDefiningOp());
+      gatherElementalTree(maybeElemental, !isUnorderedApply);
+    }
+}
+
+void hlfir::ElementalTree::insert(hlfir::ElementalOpInterface elementalOp,
+                                  bool isAppliedInOrder) {
+  tree.push_back({elementalOp.getOperation(), isAppliedInOrder});
+}
+
+static bool isInOrderDesignate(hlfir::DesignateOp designate,
+                               hlfir::ElementalTree *tree) {
+  if (!tree)
+    return false;
+  if (auto elemental =
+          designate->getParentOfType<hlfir::ElementalOpInterface>())
+    if (tree->isOrdered(elemental.getOperation()))
+      return fir::ArraySectionAnalyzer::isDesignatingArrayInOrder(designate,
+                                                                  elemental);
+  return false;
+}
+
+hlfir::DetailedEffectInstance::DetailedEffectInstance(
+    mlir::MemoryEffects::Effect *effect, mlir::OpOperand *value,
+    mlir::Value orderedElementalEffectOn)
+    : effectInstance(effect, value),
+      orderedElementalEffectOn(orderedElementalEffectOn) {}
+
+hlfir::DetailedEffectInstance::DetailedEffectInstance(
+    mlir::MemoryEffects::EffectInstance effectInst,
+    mlir::Value orderedElementalEffectOn)
+    : effectInstance(effectInst),
+      orderedElementalEffectOn(orderedElementalEffectOn) {}
+
+hlfir::DetailedEffectInstance
+hlfir::DetailedEffectInstance::getArrayReadEffect(mlir::OpOperand *array) {
+  return DetailedEffectInstance(mlir::MemoryEffects::Read::get(), array,
+                                array->get());
+}
+
+hlfir::DetailedEffectInstance
+hlfir::DetailedEffectInstance::getArrayWriteEffect(mlir::OpOperand *array) {
+  return DetailedEffectInstance(mlir::MemoryEffects::Write::get(), array,
+                                array->get());
+}
+
 namespace {
+
 /// Structure that is in charge of building the schedule. For each
 /// hlfir.region_assign inside an ordered assignment tree, it is walked through
 /// the parent operations and their "leaf" regions (that contain expression
@@ -99,20 +220,25 @@ class Scheduler {
 
   /// After all the dependent evaluation regions have been analyzed, create the
   /// action to evaluate the assignment that was being analyzed.
-  void finishSchedulingAssignment(hlfir::RegionAssignOp assign);
+  void finishSchedulingAssignment(hlfir::RegionAssignOp assign,
+                                  bool leafRegionsMayOnlyRead);
 
   /// Once all the assignments have been analyzed and scheduled, return the
   /// schedule. The scheduler object should not be used after this call.
   hlfir::Schedule moveSchedule() { return std::move(schedule); }
 
 private:
+  struct EvaluationState {
+    bool saved = false;
+    std::optional<hlfir::Schedule::iterator> modifiedInRun;
+  };
+
   /// Save a conflicting region that is evaluating an expression that is
   /// controlling or masking the current assignment, or is evaluating the
   /// RHS/LHS.
-  void
-  saveEvaluation(mlir::Region &yieldRegion,
-                 llvm::ArrayRef<mlir::MemoryEffects::EffectInstance> effects,
-                 bool anyWrite);
+  void saveEvaluation(mlir::Region &yieldRegion,
+                      llvm::ArrayRef<hlfir::DetailedEffectInstance> effects,
+                      bool anyWrite);
 
   /// Can the current assignment be schedule with the previous run. This is
   /// only possible if the assignment and all of its dependencies have no side
@@ -120,19 +246,17 @@ class Scheduler {
   bool canFuseAssignmentWithPreviousRun();
 
   /// Memory effects of the assignments being lowered.
-  llvm::SmallVector<mlir::MemoryEffects::EffectInstance> assignEffects;
+  llvm::SmallVector<hlfir::DetailedEffectInstance> assignEffects;
   /// Memory effects of the evaluations implied by the assignments
   /// being lowered. They do not include the implicit writes
   /// to the LHS of the assignments.
-  llvm::SmallVector<mlir::MemoryEffects::EffectInstance> assignEvaluateEffects;
+  llvm::SmallVector<hlfir::DetailedEffectInstance> assignEvaluateEffects;
   /// Memory effects of the unsaved evaluation region that are controlling or
   /// masking the current assignments.
-  llvm::SmallVector<mlir::MemoryEffects::EffectInstance>
-      parentEvaluationEffects;
+  llvm::SmallVector<hlfir::DetailedEffectInstance> parentEvaluationEffects;
   /// Same as parentEvaluationEffects, but for the current "leaf group" being
   /// analyzed scheduled.
-  llvm::SmallVector<mlir::MemoryEffects::EffectInstance>
-      independentEvaluationEffects;
+  llvm::SmallVector<hlfir::DetailedEffectInstance> independentEvaluationEffects;
 
   /// Were any region saved for the current assignment?
   bool savedAnyRegionForCurrentAssignment = false;
@@ -140,7 +264,10 @@ class Scheduler {
   // Schedule being built.
   hlfir::Schedule schedule;
   /// Leaf regions that have been saved so far.
-  llvm::SmallPtrSet<mlir::Region *, 16> savedRegions;
+  llvm::DenseMap<mlir::Region *, EvaluationState> regionStates;
+  /// Regions that have an aligned conflict with the current assignment.
+  llvm::SmallVector<mlir::Region *> pendingAlignedRegions;
+
   /// Is schedule.back() a schedule that is only saving region with read
   /// effects?
   bool currentRunIsReadOnly = false;
@@ -171,9 +298,10 @@ static bool isForallIndex(mlir::Value var) {
 /// side effect interface, or that are writing temporary variables that may be
 /// hard to identify as such (one would have to prove the write is "local" to
 /// the region even when the alloca may be outside of the region).
-static void gatherMemoryEffects(
+static void gatherMemoryEffectsImpl(
     mlir::Region &region, bool mayOnlyRead,
-    llvm::SmallVectorImpl<mlir::MemoryEffects::EffectInstance> &effects) {
+    llvm::SmallVectorImpl<hlfir::DetailedEffectInstance> &effects,
+    hlfir::ElementalTree *tree = nullptr) {
   /// This analysis is a simple walk of all the operations of the region that is
   /// evaluating and yielding a value. This is a lot simpler and safer than
   /// trying to walk back the SSA DAG from the yielded value. But if desired,
@@ -181,7 +309,7 @@ static void gatherMemoryEffects(
   for (mlir::Operation &op : region.getOps()) {
     if (op.hasTrait<mlir::OpTrait::HasRecursiveMemoryEffects>()) {
       for (mlir::Region &subRegion : op.getRegions())
-        gatherMemoryEffects(subRegion, mayOnlyRead, effects);
+        gatherMemoryEffectsImpl(subRegion, mayOnlyRead, effects, tree);
       // In MLIR, RecursiveMemoryEffects can be combined with
       // MemoryEffectOpInterface to describe extra effects on top of the
       // effects of the nested operations.  However, the presence of
@@ -214,17 +342,45 @@ static void gatherMemoryEffects(
     interface.getEffects(opEffects);
     for (auto &effect : opEffects)
       if (!isForallIndex(effect.getValue())) {
+        mlir::Value array;
+        if (effect.getValue())
+          if (auto designate =
+                  effect.getValue().getDefiningOp<hlfir::DesignateOp>())
+            if (isInOrderDesignate(designate, tree))
+              array = designate.getMemref();
+
         if (mlir::isa<mlir::MemoryEffects::Read>(effect.getEffect())) {
-          LLVM_DEBUG(logIfUnkownEffectValue(llvm::dbgs(), effect, op););
-          effects.push_back(effect);
+          LLVM_DEBUG(logIfUnknownEffectValue(llvm::dbgs(), effect, op););
+          effects.emplace_back(effect, array);
         } else if (!mayOnlyRead &&
                    mlir::isa<mlir::MemoryEffects::Write>(effect.getEffect())) {
-          LLVM_DEBUG(logIfUnkownEffectValue(llvm::dbgs(), effect, op););
-          effects.push_back(effect);
+          LLVM_DEBUG(logIfUnknownEffectValue(llvm::dbgs(), effect, op););
+          effects.emplace_back(effect, array);
         }
       }
   }
 }
+static void gatherMemoryEffects(
+    mlir::Region &region, bool mayOnlyRead,
+    llvm::SmallVectorImpl<hlfir::DetailedEffectInstance> &effects) {
+  if (!region.getParentOfType<hlfir::ForallOp>()) {
+    // TODO: leverage array access analysis for FORALL.
+    // While FORALL assignments can be array assignments, the iteration space
+    // is also driven by the FORALL indices, so the way ArraySectionAnalyzer
+    // results are used is not adequate for it.
+    // For instance "disjoint" array access cannot be ignored in:
+    // "forall (i=1:10) x(i+1,:) = x(i,:)".
+    // While identical access can probably also be accepted, this would deserve
+    // more thinking, it would probably make sense to also deal with "aligned
+    // scalar" access for them like in "forall (i=1:10) x(i) = x(i) + 1".  For
+    // now this feature is disabled for inside FORALL.
+    hlfir::ElementalTree tree =
+        hlfir::ElementalTree::buildElementalTree(region.back().back());
+    gatherMemoryEffectsImpl(region, mayOnlyRead, effects, &tree);
+    return;
+  }
+  gatherMemoryEffectsImpl(region, mayOnlyRead, effects, /*tree=*/nullptr);
+}
 
 /// Return the entity yielded by a region, or a null value if the region
 /// is not terminated by a yield.
@@ -246,10 +402,14 @@ static mlir::OpOperand *getYieldedEntity(mlir::Region &region) {
 static void gatherAssignEffects(
     hlfir::RegionAssignOp regionAssign,
     bool userDefAssignmentMayOnlyWriteToAssignedVariable,
-    llvm::SmallVectorImpl<mlir::MemoryEffects::EffectInstance> &assignEffects) {
+    llvm::SmallVectorImpl<hlfir::DetailedEffectInstance> &assignEffects) {
   mlir::OpOperand *assignedVar = getYieldedEntity(regionAssign.getLhsRegion());
   assert(assignedVar && "lhs cannot be an empty region");
-  assignEffects.emplace_back(mlir::MemoryEffects::Write::get(), assignedVar);
+  if (regionAssign->getParentOfType<hlfir::ForallOp>())
+    assignEffects.emplace_back(mlir::MemoryEffects::Write::get(), assignedVar);
+  else
+    assignEffects.emplace_back(
+        hlfir::DetailedEffectInstance::getArrayWriteEffect(assignedVar));
 
   if (!regionAssign.getUserDefinedAssignment().empty()) {
     // The write effect on the INTENT(OUT) LHS argument is already taken
@@ -273,7 +433,7 @@ static void gatherAssignEffects(
 static void gatherAssignEvaluationEffects(
     hlfir::RegionAssignOp regionAssign,
     bool userDefAssignmentMayOnlyWriteToAssignedVariable,
-    llvm::SmallVectorImpl<mlir::MemoryEffects::EffectInstance> &assignEffects) {
+    llvm::SmallVectorImpl<hlfir::DetailedEffectInstance> &assignEffects) {
   gatherMemoryEffects(regionAssign.getLhsRegion(),
                       userDefAssignmentMayOnlyWriteToAssignedVariable,
                       assignEffects);
@@ -308,12 +468,57 @@ static mlir::Value getStorageSource(mlir::Value var) {
   return source;
 }
 
+namespace {
+
+/// Class to represent conflicts between several accesses (effects) to a memory
+/// location (read after write, write after write).
+struct ConflictKind {
+  enum Kind {
+    // None: The effects are not affecting the same memory location, or they are
+    // all reads.
+    None,
+    // Aligned: There are both read and write effects affecting the same memory
+    // location, but it is known that these effects are all accessing the memory
+    // location element by element in array order. This means the conflict does
+    // not introduce loop-carried dependencies.
+    Aligned,
+    // Any: There may be both read and write effects affecting the same memory
+    // in any way.
+    Any
+  };
+  Kind kind;
+
+  ConflictKind(Kind k) : kind(k) {}
+
+  static ConflictKind none() { return ConflictKind(None); }
+  static ConflictKind aligned() { return ConflictKind(Aligned); }
+  static ConflictKind any() { return ConflictKind(Any); }
+
+  bool isNone() const { return kind == None; }
+  bool isAligned() const { return kind == Aligned; }
+  bool isAny() const { return kind == Any; }
+
+  // Merge conflicts:
+  // none || none -> none
+  // aligned || <not any> -> aligned
+  // any || _ -> any
+  ConflictKind operator||(const ConflictKind &other) const {
+    if (kind == Any || other.kind == Any)
+      return any();
+    if (kind == Aligned || other.kind == Aligned)
+      return aligned();
+    return none();
+  }
+};
+} // namespace
+
 /// Could there be any read or write in effectsA on a variable written to in
 /// effectsB?
-static bool
-anyRAWorWAW(llvm::ArrayRef<mlir::MemoryEffects::EffectInstance> effectsA,
-            llvm::ArrayRef<mlir::MemoryEffects::EffectInstance> effectsB,
+static ConflictKind
+anyRAWorWAW(llvm::ArrayRef<hlfir::DetailedEffectInstance> effectsA,
+            llvm::ArrayRef<hlfir::DetailedEffectInstance> effectsB,
             fir::AliasAnalysis &aliasAnalysis) {
+  ConflictKind result = ConflictKind::none();
   for (const auto &effectB : effectsB)
     if (mlir::isa<mlir::MemoryEffects::Write>(effectB.getEffect())) {
       mlir::Value writtenVarB = effectB.getValue();
@@ -325,38 +530,66 @@ anyRAWorWAW(llvm::ArrayRef<mlir::MemoryEffects::EffectInstance> effectsA,
           mlir::Value writtenOrReadVarA = effectA.getValue();
           if (!writtenVarB || !writtenOrReadVarA) {
             LLVM_DEBUG(
-                logConflict(llvm::dbgs(), writtenOrReadVarA, writtenVarB););
-            return true; // unknown conflict.
+                logConflict(llvm::dbgs(), writtenOrReadVarA, writtenVarB));
+            return ConflictKind::any(); // unknown conflict.
           }
           writtenOrReadVarA = getStorageSource(writtenOrReadVarA);
           if (!aliasAnalysis.alias(writtenOrReadVarA, writtenVarB).isNo()) {
+            mlir::Value arrayA = effectA.getOrderedElementalEffectOn();
+            mlir::Value arrayB = effectB.getOrderedElementalEffectOn();
+            if (arrayA && arrayB) {
+              if (arrayA == arrayB) {
+                result = result || ConflictKind::aligned();
+                LLVM_DEBUG(logConflict(llvm::dbgs(), writtenOrReadVarA,
+                                       writtenVarB, /*isAligned=*/true));
+                continue;
+              }
+              auto overlap = fir::ArraySectionAnalyzer::analyze(arrayA, arrayB);
+              if (overlap == fir::ArraySectionAnalyzer::SlicesOverlapKind::
+                                 DefinitelyDisjoint)
+                continue;
+              if (overlap == fir::ArraySectionAnalyzer::SlicesOverlapKind::
+                                 DefinitelyIdentical ||
+                  overlap == fir::ArraySectionAnalyzer::SlicesOverlapKind::
+                                 EitherIdenticalOrDisjoint) {
+                result = result || ConflictKind::aligned();
+                LLVM_DEBUG(logConflict(llvm::dbgs(), writtenOrReadVarA,
+                                       writtenVarB, /*isAligned=*/true));
+                continue;
+              }
+              LLVM_DEBUG(llvm::dbgs() << "conflicting arrays:" << arrayA
+                                      << " and " << arrayB << "\n");
+              return ConflictKind::any();
+            }
             LLVM_DEBUG(
-                logConflict(llvm::dbgs(), writtenOrReadVarA, writtenVarB););
-            return true;
+                logConflict(llvm::dbgs(), writtenOrReadVarA, writtenVarB));
+            return ConflictKind::any();
           }
         }
     }
-  return false;
+  return result;
 }
 
 /// Could there be any read or write in effectsA on a variable written to in
 /// effectsB, or any read in effectsB on a variable written to in effectsA?
-static bool
-conflict(llvm::ArrayRef<mlir::MemoryEffects::EffectInstance> effectsA,
-         llvm::ArrayRef<mlir::MemoryEffects::EffectInstance> effectsB) {
+static ConflictKind
+conflict(llvm::ArrayRef<hlfir::DetailedEffectInstance> effectsA,
+         llvm::ArrayRef<hlfir::DetailedEffectInstance> effectsB) {
   fir::AliasAnalysis aliasAnalysis;
   // (RAW || WAW) || (WAR || WAW).
-  return anyRAWorWAW(effectsA, effectsB, aliasAnalysis) ||
-         anyRAWorWAW(effectsB, effectsA, aliasAnalysis);
+  ConflictKind result = anyRAWorWAW(effectsA, effectsB, aliasAnalysis);
+  if (result.isAny())
+    return result;
+  return result || anyRAWorWAW(effectsB, effectsA, aliasAnalysis);
 }
 
 /// Could there be any write effects in "effects" affecting memory storages
 /// that are not local to the current region.
 static bool
-anyNonLocalWrite(llvm::ArrayRef<mlir::MemoryEffects::EffectInstance> effects,
+anyNonLocalWrite(llvm::ArrayRef<hlfir::DetailedEffectInstance> effects,
                  mlir::Region &region) {
   return llvm::any_of(
-      effects, [&region](const mlir::MemoryEffects::EffectInstance &effect) {
+      effects, [&region](const hlfir::DetailedEffectInstance &effect) {
         if (mlir::isa<mlir::MemoryEffects::Write>(effect.getEffect())) {
           if (mlir::Value v = effect.getValue()) {
             v = getStorageSource(v);
@@ -393,9 +626,9 @@ void Scheduler::saveEvaluationIfConflict(mlir::Region &yieldRegion,
   // If the region evaluation was previously executed and saved, the saved
   // value will be used when evaluating the current assignment and this has
   // no effects in the current assignment evaluation.
-  if (savedRegions.contains(&yieldRegion))
+  if (regionStates[&yieldRegion].saved)
     return;
-  llvm::SmallVector<mlir::MemoryEffects::EffectInstance> effects;
+  llvm::SmallVector<hlfir::DetailedEffectInstance> effects;
   gatherMemoryEffects(yieldRegion, leafRegionsMayOnlyRead, effects);
   // Yield has no effect as such, but in the context of order assignments.
   // The order assignments will usually read the yielded entity (except for
@@ -404,8 +637,13 @@ void Scheduler::saveEvaluationIfConflict(mlir::Region &yieldRegion,
   // intent(inout)).
   if (yieldIsImplicitRead) {
     mlir::OpOperand *entity = getYieldedEntity(yieldRegion);
-    if (entity && hlfir::isFortranVariableType(entity->get().getType()))
-      effects.emplace_back(mlir::MemoryEffects::Read::get(), entity);
+    if (entity && hlfir::isFortranVariableType(entity->get().getType())) {
+      if (yieldRegion.getParentOfType<hlfir::ForallOp>())
+        effects.emplace_back(mlir::MemoryEffects::Read::get(), entity);
+      else
+        effects.emplace_back(
+            hlfir::DetailedEffectInstance::getArrayReadEffect(entity));
+    }
   }
   if (!leafRegionsMayOnlyRead && anyNonLocalWrite(effects, yieldRegion)) {
     // Region with write effect must be executed only once (unless all writes
@@ -415,33 +653,58 @@ void Scheduler::saveEvaluationIfConflict(mlir::Region &yieldRegion,
                    << "saving eval because write effect prevents re-evaluation"
                    << "\n";);
     saveEvaluation(yieldRegion, effects, /*anyWrite=*/true);
-  } else if (conflict(effects, assignEffects)) {
-    // Region that conflicts with the current assignments must be fully
-    // evaluated and saved before doing the assignment (Note that it may
-    // have already have been evaluated without saving it before, but this
-    // implies that it never conflicted with a prior assignment, so its value
-    // should be the same.)
-    saveEvaluation(yieldRegion, effects, /*anyWrite=*/false);
-  } else if (evaluationsMayConflict &&
-             conflict(effects, assignEvaluateEffects)) {
-    // If evaluations of the assignment may conflict with the yield
-    // evaluations, we have to save yield evaluation.
-    // For example, a WHERE mask might be written by the masked assignment
-    // evaluations, and it has to be saved in this case:
-    //   where (mask) r = f() ! function f modifies mask
-    saveEvaluation(yieldRegion, effects,
-                   anyNonLocalWrite(effects, yieldRegion));
   } else {
-    // Can be executed while doing the assignment.
-    independentEvaluationEffects.append(effects.begin(), effects.end());
+    ConflictKind conflictKind = conflict(effects, assignEffects);
+    if (conflictKind.isAny()) {
+      // Region that conflicts with the current assignments must be fully
+      // evaluated and saved before doing the assignment (Note that it may
+      // have already been evaluated without saving it before, but this
+      // implies that it never conflicted with a prior assignment, so its value
+      // should be the same.)
+      saveEvaluation(yieldRegion, effects, /*anyWrite=*/false);
+    } else {
+      if (conflictKind.isAligned())
+        pendingAlignedRegions.push_back(&yieldRegion);
+
+      if (evaluationsMayConflict &&
+          !conflict(effects, assignEvaluateEffects).isNone()) {
+        // If evaluations of the assignment may conflict with the yield
+        // evaluations, we have to save yield evaluation.
+        // For example, a WHERE mask might be written by the masked assignment
+        // evaluations, and it has to be saved in this case:
+        //   where (mask) r = f() ! function f modifies mask
+        saveEvaluation(yieldRegion, effects,
+                       anyNonLocalWrite(effects, yieldRegion));
+      } else {
+        // Can be executed while doing the assignment.
+        independentEvaluationEffects.append(effects.begin(), effects.end());
+      }
+    }
   }
 }
 
 void Scheduler::saveEvaluation(
     mlir::Region &yieldRegion,
-    llvm::ArrayRef<mlir::MemoryEffects::EffectInstance> effects,
-    bool anyWrite) {
+    llvm::ArrayRef<hlfir::DetailedEffectInstance> effects, bool anyWrite) {
   savedAnyRegionForCurrentAssignment = true;
+  auto &state = regionStates[&yieldRegion];
+  if (state.modifiedInRun) {
+    // The region was modified in a previous run, but we now realize we need its
+    // value. We must save it before that modification run.
+    auto &newRun = *schedule.emplace(*state.modifiedInRun, hlfir::Run{});
+    newRun.actions.emplace_back(hlfir::SaveEntity{&yieldRegion});
+    // We do not have the parent effects from that time easily available here.
+    // However, since we are saving a parent of the current assignment, its
+    // parents are also parents of the current assignment.
+    newRun.memoryEffects.append(parentEvaluationEffects.begin(),
+                                parentEvaluationEffects.end());
+    newRun.memoryEffects.append(effects.begin(), effects.end());
+    state.saved = true;
+    LLVM_DEBUG(
+        logSaveEvaluation(llvm::dbgs(), /*runid=*/0, yieldRegion, anyWrite););
+    return;
+  }
+
   if (anyWrite) {
     // Create a new run just for regions with side effect. Further analysis
     // could try to prove the effects do not conflict with the previous
@@ -465,7 +728,7 @@ void Scheduler::saveEvaluation(
   schedule.back().memoryEffects.append(parentEvaluationEffects.begin(),
                                        parentEvaluationEffects.end());
   schedule.back().memoryEffects.append(effects.begin(), effects.end());
-  savedRegions.insert(&yieldRegion);
+  state.saved = true;
   LLVM_DEBUG(
       logSaveEvaluation(llvm::dbgs(), schedule.size(), yieldRegion, anyWrite););
 }
@@ -476,18 +739,78 @@ bool Scheduler::canFuseAssignmentWithPreviousRun() {
   if (savedAnyRegionForCurrentAssignment || schedule.empty())
     return false;
   auto &previousRunEffects = schedule.back().memoryEffects;
-  return !conflict(previousRunEffects, assignEffects) &&
-         !conflict(previousRunEffects, parentEvaluationEffects) &&
-         !conflict(previousRunEffects, independentEvaluationEffects);
+  return !conflict(previousRunEffects, assignEffects).isAny() &&
+         !conflict(previousRunEffects, parentEvaluationEffects).isAny() &&
+         !conflict(previousRunEffects, independentEvaluationEffects).isAny();
+}
+
+/// Gather the parents of \p node (not included) in reverse execution order.
+static void gatherParents(
+    hlfir::OrderedAssignmentTreeOpInterface node,
+    llvm::SmallVectorImpl<hlfir::OrderedAssignmentTreeOpInterface> &parents) {
+  while (node) {
+    auto parent =
+        mlir::dyn_cast_or_null<hlfir::OrderedAssignmentTreeOpInterface>(
+            node->getParentOp());
+    if (parent && parent.getSubTreeRegion() == node->getParentRegion()) {
+      parents.push_back(parent);
+      node = parent;
+    } else {
+      break;
+    }
+  }
+}
+
+// Build the list of the parent nodes for this assignment. The list is built
+// from the closest parent until the ordered assignment tree root (this is the
+// reverse of their execution order).
+static void gatherAssignmentParents(
+    hlfir::RegionAssignOp assign,
+    llvm::SmallVectorImpl<hlfir::OrderedAssignmentTreeOpInterface> &parents) {
+  gatherParents(mlir::cast<hlfir::OrderedAssignmentTreeOpInterface>(
+                    assign.getOperation()),
+                parents);
 }
 
-void Scheduler::finishSchedulingAssignment(hlfir::RegionAssignOp assign) {
-  // For now, always schedule each assignment in its own run. They could
-  // be done as part of previous assignment runs if it is proven they have
-  // no conflicting effects.
+void Scheduler::finishSchedulingAssignment(hlfir::RegionAssignOp assign,
+                                           bool leafRegionsMayOnlyRead) {
+  // Schedule the assignment in a new run, unless it can be fused with the
+  // previous run (if enabled and proven safe).
   currentRunIsReadOnly = false;
-  if (!tryFusingAssignments || !canFuseAssignmentWithPreviousRun())
+  bool fuse = tryFusingAssignments && canFuseAssignmentWithPreviousRun();
+  if (!fuse) {
+    // If we cannot fuse, we are about to start a new run.
+    // Check if any parent region was modified in a previous run and needs to be
+    // saved.
+    llvm::SmallVector<hlfir::OrderedAssignmentTreeOpInterface> parents;
+    gatherAssignmentParents(assign, parents);
+    for (auto parent : parents) {
+      llvm::SmallVector<mlir::Region *, 4> yieldRegions;
+      parent.getLeafRegions(yieldRegions);
+      for (mlir::Region *yieldRegion : yieldRegions) {
+        if (regionStates[yieldRegion].modifiedInRun &&
+            !regionStates[yieldRegion].saved) {
+          LLVM_DEBUG(logRetroactiveSave(
+              llvm::dbgs(), *yieldRegion,
+              **regionStates[yieldRegion].modifiedInRun, assign));
+          llvm::SmallVector<hlfir::DetailedEffectInstance> effects;
+          gatherMemoryEffects(*yieldRegion, leafRegionsMayOnlyRead, effects);
+          saveEvaluation(*yieldRegion, effects,
+                         anyNonLocalWrite(effects, *yieldRegion));
+        }
+      }
+    }
     schedule.emplace_back(hlfir::Run{});
+  }
+
+  // Mark pending aligned regions as modified in the current run (which is the
+  // last one).
+  auto runIt = std::prev(schedule.end());
+  for (mlir::Region *region : pendingAlignedRegions)
+    if (!regionStates[region].saved)
+      regionStates[region].modifiedInRun = runIt;
+  pendingAlignedRegions.clear();
+
   schedule.back().actions.emplace_back(assign);
   // TODO: when fusing, it would probably be best to filter the
   // parentEvaluationEffects that already in the previous run effects (since
@@ -530,34 +853,6 @@ gatherAssignments(hlfir::OrderedAssignmentTreeOpInterface root,
   }
 }
 
-/// Gather the parents of (not included) \p node in reverse execution order.
-static void gatherParents(
-    hlfir::OrderedAssignmentTreeOpInterface node,
-    llvm::SmallVectorImpl<hlfir::OrderedAssignmentTreeOpInterface> &parents) {
-  while (node) {
-    auto parent =
-        mlir::dyn_cast_or_null<hlfir::OrderedAssignmentTreeOpInterface>(
-            node->getParentOp());
-    if (parent && parent.getSubTreeRegion() == node->getParentRegion()) {
-      parents.push_back(parent);
-      node = parent;
-    } else {
-      break;
-    }
-  }
-}
-
-// Build the list of the parent nodes for this assignment. The list is built
-// from the closest parent until the ordered assignment tree root (this is the
-// revere of their execution order).
-static void gatherAssignmentParents(
-    hlfir::RegionAssignOp assign,
-    llvm::SmallVectorImpl<hlfir::OrderedAssignmentTreeOpInterface> &parents) {
-  gatherParents(mlir::cast<hlfir::OrderedAssignmentTreeOpInterface>(
-                    assign.getOperation()),
-                parents);
-}
-
 hlfir::Schedule
 hlfir::buildEvaluationSchedule(hlfir::OrderedAssignmentTreeOpInterface root,
                                bool tryFusingAssignments) {
@@ -616,7 +911,7 @@ hlfir::buildEvaluationSchedule(hlfir::OrderedAssignmentTreeOpInterface root,
                                          leafRegionsMayOnlyRead,
                                          /*yieldIsImplicitRead=*/false);
     scheduler.finishIndependentEvaluationGroup();
-    scheduler.finishSchedulingAssignment(assign);
+    scheduler.finishSchedulingAssignment(assign, leafRegionsMayOnlyRead);
   }
   return scheduler.moveSchedule();
 }
@@ -704,6 +999,25 @@ static llvm::raw_ostream &printRegionPath(llvm::raw_ostream &os,
   return printRegionId(os, yieldRegion);
 }
 
+[[maybe_unused]] static void
+logRetroactiveSave(llvm::raw_ostream &os, mlir::Region &yieldRegion,
+                   hlfir::Run &modifyingRun,
+                   hlfir::RegionAssignOp currentAssign) {
+  printRegionPath(os, yieldRegion) << " is modified in order by ";
+  bool first = true;
+  for (auto &action : modifyingRun.actions) {
+    if (auto *assign = std::get_if<hlfir::RegionAssignOp>(&action)) {
+      if (!first)
+        os << ", ";
+      printNodePath(os, assign->getOperation());
+      first = false;
+    }
+  }
+  os << " and is needed by ";
+  printNodePath(os, currentAssign.getOperation());
+  os << " that is scheduled in a later run\n";
+}
+
 [[maybe_unused]] static void logSaveEvaluation(llvm::raw_ostream &os,
                                                unsigned runid,
                                                mlir::Region &yieldRegion,
@@ -721,13 +1035,14 @@ logAssignmentEvaluation(llvm::raw_ostream &os, unsigned runid,
 
 [[maybe_unused]] static void logConflict(llvm::raw_ostream &os,
                                          mlir::Value writtenOrReadVarA,
-                                         mlir::Value writtenVarB) {
+                                         mlir::Value writtenVarB,
+                                         bool isAligned) {
   auto printIfValue = [&](mlir::Value var) -> llvm::raw_ostream & {
     if (!var)
       return os << "<unknown>";
     return os << var;
   };
-  os << "conflict: R/W: ";
+  os << "conflict" << (isAligned ? " (aligned)" : "") << ": R/W: ";
   printIfValue(writtenOrReadVarA) << " W:";
   printIfValue(writtenVarB) << "\n";
 }
@@ -743,9 +1058,9 @@ logStartScheduling(llvm::raw_ostream &os,
 }
 
 [[maybe_unused]] static void
-logIfUnkownEffectValue(llvm::raw_ostream &os,
-                       mlir::MemoryEffects::EffectInstance effect,
-                       mlir::Operation &op) {
+logIfUnknownEffectValue(llvm::raw_ostream &os,
+                        mlir::MemoryEffects::EffectInstance effect,
+                        mlir::Operation &op) {
   if (effect.getValue() != nullptr)
     return;
   os << "unknown effected value (";

diff  --git a/flang/lib/Optimizer/HLFIR/Transforms/ScheduleOrderedAssignments.h b/flang/lib/Optimizer/HLFIR/Transforms/ScheduleOrderedAssignments.h
index 2ed242edc973a..7f479ab166b15 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/ScheduleOrderedAssignments.h
+++ b/flang/lib/Optimizer/HLFIR/Transforms/ScheduleOrderedAssignments.h
@@ -15,9 +15,30 @@
 #define OPTIMIZER_HLFIR_TRANSFORM_SCHEDULEORDEREDASSIGNMENTS_H
 
 #include "flang/Optimizer/HLFIR/HLFIROps.h"
+#include "mlir/Interfaces/SideEffectInterfaces.h"
+#include <list>
 
 namespace hlfir {
 
+struct ElementalTree {
+  // Build an elemental tree given a masked region terminator.
+  static ElementalTree buildElementalTree(mlir::Operation &regionTerminator);
+  // Check if op is an ElementalOpInterface that is part of this elemental tree.
+  bool contains(mlir::Operation *op) const;
+
+  std::optional<bool> isOrdered(mlir::Operation *op) const;
+
+private:
+  void gatherElementalTree(hlfir::ElementalOpInterface elemental,
+                           bool isAppliedInOrder);
+  void insert(hlfir::ElementalOpInterface elementalOp, bool isAppliedInOrder);
+  // List of ElementalOpInterface operations forming this tree, as well as a
+  // Boolean to indicate if they are applied in order (that is, if their
+  // indexing space is the same as the one for the array yielded by the mask
+  // region that owns this tree).
+  llvm::SmallVector<std::pair<mlir::Operation *, bool>> tree;
+};
+
 /// Structure to represent that the value yielded by some region
 /// must be fully evaluated and saved for all index values at
 /// a given point of the ordered assignment tree evaluation.
@@ -29,6 +50,37 @@ struct SaveEntity {
   mlir::Value getSavedValue();
 };
 
+/// Wrapper class around mlir::MemoryEffects::EffectInstance that
+/// allows providing an extra array value that indicates that the
+/// effect is done element by element in array order (one element
+/// accessed at each iteration of the ordered assignment iteration
+/// space).
+class DetailedEffectInstance {
+public:
+  DetailedEffectInstance(mlir::MemoryEffects::Effect *effect,
+                         mlir::OpOperand *value = nullptr,
+                         mlir::Value orderedElementalEffectOn = nullptr);
+  DetailedEffectInstance(mlir::MemoryEffects::EffectInstance effectInstance,
+                         mlir::Value orderedElementalEffectOn = nullptr);
+
+  static DetailedEffectInstance getArrayReadEffect(mlir::OpOperand *array);
+  static DetailedEffectInstance getArrayWriteEffect(mlir::OpOperand *array);
+
+  mlir::Value getValue() const { return effectInstance.getValue(); }
+  mlir::MemoryEffects::Effect *getEffect() const {
+    return effectInstance.getEffect();
+  }
+  mlir::Value getOrderedElementalEffectOn() const {
+    return orderedElementalEffectOn;
+  }
+
+private:
+  mlir::MemoryEffects::EffectInstance effectInstance;
+  // Array whose elements are affected in array order by the
+  // ordered assignment iterations. Null value otherwise.
+  mlir::Value orderedElementalEffectOn;
+};
+
 /// A run is a list of actions required to evaluate an ordered assignment tree
 /// that can be done in the same loop nest.
 /// The actions can evaluate and saves element values into temporary or evaluate
@@ -42,11 +94,11 @@ struct Run {
   /// the assignment part of an hlfir::RegionAssignOp.
   using Action = std::variant<hlfir::RegionAssignOp, SaveEntity>;
   llvm::SmallVector<Action> actions;
-  llvm::SmallVector<mlir::MemoryEffects::EffectInstance> memoryEffects;
+  llvm::SmallVector<DetailedEffectInstance> memoryEffects;
 };
 
 /// List of runs to be executed in order to evaluate an order assignment tree.
-using Schedule = llvm::SmallVector<Run>;
+using Schedule = std::list<Run>;
 
 /// Example of schedules and run, and what they mean:
 ///  Fortran: forall (i=i:10) x(i) = y(i)

diff  --git a/flang/lib/Optimizer/Passes/Pipelines.cpp b/flang/lib/Optimizer/Passes/Pipelines.cpp
index 6812347a8d39b..f15b0fe20bd9b 100644
--- a/flang/lib/Optimizer/Passes/Pipelines.cpp
+++ b/flang/lib/Optimizer/Passes/Pipelines.cpp
@@ -290,7 +290,8 @@ void createHLFIRToFIRPassPipeline(mlir::PassManager &pm,
           pm, hlfir::createInlineHLFIRCopyIn);
     }
   }
-  pm.addPass(hlfir::createLowerHLFIROrderedAssignments());
+  pm.addPass(hlfir::createLowerHLFIROrderedAssignments(
+      {/*tryFusingAssignments=*/optLevel.isOptimizingForSpeed()}));
   pm.addPass(hlfir::createLowerHLFIRIntrinsics());
 
   hlfir::BufferizeHLFIROptions bufferizeOptions;

diff  --git a/flang/test/HLFIR/order_assignments/inlined-stack-temp.fir b/flang/test/HLFIR/order_assignments/inlined-stack-temp.fir
index 0724d019537c0..064b12b9ed812 100644
--- a/flang/test/HLFIR/order_assignments/inlined-stack-temp.fir
+++ b/flang/test/HLFIR/order_assignments/inlined-stack-temp.fir
@@ -198,13 +198,15 @@ func.func @mask_and_rhs_conflict(%arg0: !fir.box<!fir.array<?xi32>>) {
 func.func @test_where_mask_save(%arg0: !fir.box<!fir.array<?xi32>>) {
   %c0 = arith.constant 0 : index
   %c42_i32 = arith.constant 42 : i32
+  %c1 = arith.constant 1 : index
   %0:2 = hlfir.declare %arg0 {uniq_name = "x"} : (!fir.box<!fir.array<?xi32>>) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
   hlfir.where {
     %1:3 = fir.box_dims %0#0, %c0 : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
     %2 = fir.shape %1#1 : (index) -> !fir.shape<1>
     %3 = hlfir.elemental %2 : (!fir.shape<1>) -> !hlfir.expr<?x!fir.logical<4>> {
     ^bb0(%arg1: index):
-      %4 = hlfir.designate %0#0 (%arg1)  : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
+      %add = arith.addi %arg1, %c1 : index
+      %4 = hlfir.designate %0#0 (%add)  : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
       %5 = fir.load %4 : !fir.ref<i32>
       %6 = arith.cmpi sgt, %5, %c42_i32 : i32
       %7 = fir.convert %6 : (i1) -> !fir.logical<4>
@@ -226,12 +228,14 @@ func.func @test_where_mask_save(%arg0: !fir.box<!fir.array<?xi32>>) {
 // CHECK-SAME:                                    %[[VAL_0:.*]]: !fir.box<!fir.array<?xi32>>) {
 // CHECK:           %[[VAL_1:.*]] = arith.constant 0 : index
 // CHECK:           %[[VAL_2:.*]] = arith.constant 42 : i32
+// CHECK:           %[[C1:.*]] = arith.constant 1 : index
 // CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "x"} : (!fir.box<!fir.array<?xi32>>) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
 // CHECK:           %[[VAL_4:.*]]:3 = fir.box_dims %[[VAL_3]]#0, %[[VAL_1]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
 // CHECK:           %[[VAL_5:.*]] = fir.shape %[[VAL_4]]#1 : (index) -> !fir.shape<1>
 // CHECK:           %[[VAL_6:.*]] = hlfir.elemental %[[VAL_5]] : (!fir.shape<1>) -> !hlfir.expr<?x!fir.logical<4>> {
 // CHECK:           ^bb0(%[[VAL_7:.*]]: index):
-// CHECK:             %[[VAL_8:.*]] = hlfir.designate %[[VAL_3]]#0 (%[[VAL_7]])  : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
+// CHECK:             %[[ADD:.*]] = arith.addi %[[VAL_7]], %[[C1]] : index
+// CHECK:             %[[VAL_8:.*]] = hlfir.designate %[[VAL_3]]#0 (%[[ADD]])  : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
 // CHECK:             %[[VAL_9:.*]] = fir.load %[[VAL_8]] : !fir.ref<i32>
 // CHECK:             %[[VAL_10:.*]] = arith.cmpi sgt, %[[VAL_9]], %[[VAL_2]] : i32
 // CHECK:             %[[VAL_11:.*]] = fir.convert %[[VAL_10]] : (i1) -> !fir.logical<4>

diff  --git a/flang/test/HLFIR/order_assignments/where-array-sections.f90 b/flang/test/HLFIR/order_assignments/where-array-sections.f90
new file mode 100644
index 0000000000000..aab264d6e105e
--- /dev/null
+++ b/flang/test/HLFIR/order_assignments/where-array-sections.f90
@@ -0,0 +1,128 @@
+! Test scheduling of WHERE with aligned array sections.
+
+!RUN: bbc -hlfir -o - -pass-pipeline="builtin.module(lower-hlfir-ordered-assignments{fuse-assignments=false})" --debug-only=flang-ordered-assignment -flang-dbg-order-assignment-schedule-only %s 2>&1 | FileCheck %s --check-prefix NOFUSE
+
+!RUN: bbc -hlfir -o - -pass-pipeline="builtin.module(lower-hlfir-ordered-assignments{fuse-assignments=true})" --debug-only=flang-ordered-assignment -flang-dbg-order-assignment-schedule-only %s 2>&1 | FileCheck %s --check-prefix FUSE
+
+!REQUIRES: asserts
+
+subroutine no_temps(var1, var2, var3)
+  implicit none
+  real, contiguous, dimension(:,:) :: var1, var2
+  real, contiguous, dimension(:) :: var3
+
+  where (var2(:,2) < 0.)
+    var2(:,1) = var2(:,1) + var2(:,2)
+    var2(:,1) = var2(:,2)
+    var3(:) = var3(:) - var2(:,2)
+    var2(:,2) = 0.
+  end where
+end
+
+subroutine must_create_mask_temp_if_not_fused(var1, var2, var3)
+  implicit none
+  real, contiguous, dimension(:,:) :: var1, var2
+  real, contiguous, dimension(:) :: var3
+
+  where (var2(:,2) < 0.)
+    var2(:,1) = var2(:,1) + var2(:,2)
+    var2(:,2) = 0. ! -> modifies mask 1-1 
+    var2(:,1) = var2(:,2)
+    var3(:) = var3(:) - var2(:,2)
+  end where
+end
+
+subroutine must_split_and_create_temps(var1, var2, var3)
+  implicit none
+  real, contiguous, dimension(:,:) :: var1, var2
+  real, contiguous, dimension(:) :: var3
+
+  where (var2(:,2) < 0.)
+    var2(:,1) = var2(:,1) + var2(:,2)
+    var2(:,2) = 0. ! -> modifies mask 1-1
+    ! The RHS/LHS overlap requires saving the RHS and splitting the loops, which
+    ! also requires saving the mask before the assignment above.
+    var2(:,1) = var2(2,:) + var2(2,:)
+    var3(:) = var3(:) - var2(:,2)
+  end where
+end
+
+!NOFUSE-LABEL: ------------ scheduling where in _QPno_temps ------------
+!NOFUSE-NEXT: conflict (aligned): R/W: %{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>> W:%{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>>
+!NOFUSE-NEXT: run 1 evaluate: where/region_assign1
+!NOFUSE-NEXT: run 2 evaluate: where/region_assign2
+!NOFUSE-NEXT: conflict (aligned): R/W: %{{.*}} = fir.box_addr %arg2 : (!fir.box<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>> W:%{{.*}} = fir.box_addr %arg2 : (!fir.box<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>>
+!NOFUSE-NEXT: run 3 evaluate: where/region_assign3
+!NOFUSE-NEXT: conflict (aligned): R/W: %{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>> W:%{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>>
+!NOFUSE-NEXT: run 4 evaluate: where/region_assign4
+!NOFUSE-LABEL: ------------ scheduling where in _QPmust_create_mask_temp_if_not_fused ------------
+!NOFUSE-NEXT: conflict (aligned): R/W: %{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>> W:%{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>>
+!NOFUSE-NEXT: run 1 evaluate: where/region_assign1
+!NOFUSE-NEXT: conflict (aligned): R/W: %{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>> W:%{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>>
+!NOFUSE-NEXT: run 2 evaluate: where/region_assign2
+!NOFUSE-NEXT: where/mask is modified in order by where/region_assign2 and is needed by where/region_assign3 that is scheduled in a later run
+!NOFUSE-NEXT: run 0 save    : where/mask
+!NOFUSE-NEXT: run 4 evaluate: where/region_assign3
+!NOFUSE-NEXT: conflict (aligned): R/W: %{{.*}} = fir.box_addr %arg2 : (!fir.box<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>> W:%{{.*}} = fir.box_addr %arg2 : (!fir.box<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>>
+!NOFUSE-NEXT: run 5 evaluate: where/region_assign4
+!NOFUSE-LABEL: ------------ scheduling where in _QPmust_split_and_create_temps ------------
+!NOFUSE-NEXT: conflict (aligned): R/W: %{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>> W:%{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>>
+!NOFUSE-NEXT: run 1 evaluate: where/region_assign1
+!NOFUSE-NEXT: conflict (aligned): R/W: %{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>> W:%{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>>
+!NOFUSE-NEXT: run 2 evaluate: where/region_assign2
+!NOFUSE-NEXT: conflicting arrays:%{{.*}} and %{{.*}}
+!NOFUSE-NEXT: run 3 save    : where/region_assign3/rhs
+!NOFUSE-NEXT: where/mask is modified in order by where/region_assign2 and is needed by where/region_assign3 that is scheduled in a later run
+!NOFUSE-NEXT: run 0 save    : where/mask
+!NOFUSE-NEXT: run 5 evaluate: where/region_assign3
+!NOFUSE-NEXT: conflict (aligned): R/W: %{{.*}} = fir.box_addr %arg2 : (!fir.box<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>> W:%{{.*}} = fir.box_addr %arg2 : (!fir.box<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>>
+!NOFUSE-NEXT: run 6 evaluate: where/region_assign4
+
+!FUSE-LABEL: ------------ scheduling where in _QPno_temps ------------
+!FUSE-NEXT: conflict (aligned): R/W: %{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>> W:%{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>>
+!FUSE-NEXT: run 1 evaluate: where/region_assign1
+!FUSE-NEXT: conflict (aligned): R/W: %{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>> W:%{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>>
+!FUSE-NEXT: conflict (aligned): R/W: %{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>> W:%{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>>
+!FUSE-NEXT: conflict (aligned): R/W: %{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>> W:%{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>>
+!FUSE-NEXT: run 1 evaluate: where/region_assign2
+!FUSE-NEXT: conflict (aligned): R/W: %{{.*}} = fir.box_addr %arg2 : (!fir.box<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>> W:%{{.*}} = fir.box_addr %arg2 : (!fir.box<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>>
+!FUSE-NEXT: run 1 evaluate: where/region_assign3
+!FUSE-NEXT: conflict (aligned): R/W: %{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>> W:%{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>>
+!FUSE-NEXT: conflict (aligned): R/W: %{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>> W:%{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>>
+!FUSE-NEXT: conflict (aligned): R/W: %{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>> W:%{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>>
+!FUSE-NEXT: conflict (aligned): R/W: %{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>> W:%{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>>
+!FUSE-NEXT: conflict (aligned): R/W: %{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>> W:%{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>>
+!FUSE-NEXT: conflict (aligned): R/W: %{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>> W:%{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>>
+!FUSE-NEXT: conflict (aligned): R/W: %{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>> W:%{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>>
+!FUSE-NEXT: run 1 evaluate: where/region_assign4
+!FUSE-LABEL: ------------ scheduling where in _QPmust_create_mask_temp_if_not_fused ------------
+!FUSE-NEXT: conflict (aligned): R/W: %{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>> W:%{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>>
+!FUSE-NEXT: run 1 evaluate: where/region_assign1
+!FUSE-NEXT: conflict (aligned): R/W: %{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>> W:%{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>>
+!FUSE-NEXT: conflict (aligned): R/W: %{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>> W:%{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>>
+!FUSE-NEXT: conflict (aligned): R/W: %{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>> W:%{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>>
+!FUSE-NEXT: run 1 evaluate: where/region_assign2
+!FUSE-NEXT: conflict (aligned): R/W: %{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>> W:%{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>>
+!FUSE-NEXT: conflict (aligned): R/W: %{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>> W:%{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>>
+!FUSE-NEXT: conflict (aligned): R/W: %{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>> W:%{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>>
+!FUSE-NEXT: conflict (aligned): R/W: %{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>> W:%{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>>
+!FUSE-NEXT: conflict (aligned): R/W: %{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>> W:%{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>>
+!FUSE-NEXT: run 1 evaluate: where/region_assign3
+!FUSE-NEXT: conflict (aligned): R/W: %{{.*}} = fir.box_addr %arg2 : (!fir.box<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>> W:%{{.*}} = fir.box_addr %arg2 : (!fir.box<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>>
+!FUSE-NEXT: conflict (aligned): R/W: %{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>> W:%{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>>
+!FUSE-NEXT: conflict (aligned): R/W: %{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>> W:%{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>>
+!FUSE-NEXT: run 1 evaluate: where/region_assign4
+!FUSE-LABEL: ------------ scheduling where in _QPmust_split_and_create_temps ------------
+!FUSE-NEXT: conflict (aligned): R/W: %{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>> W:%{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>>
+!FUSE-NEXT: run 1 evaluate: where/region_assign1
+!FUSE-NEXT: conflict (aligned): R/W: %{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>> W:%{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>>
+!FUSE-NEXT: conflict (aligned): R/W: %{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>> W:%{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>>
+!FUSE-NEXT: conflict (aligned): R/W: %{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>> W:%{{.*}} = fir.box_addr %arg1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>>
+!FUSE-NEXT: run 1 evaluate: where/region_assign2
+!FUSE-NEXT: conflicting arrays:%{{.*}} and %{{.*}}
+!FUSE-NEXT: run 2 save    : where/region_assign3/rhs
+!FUSE-NEXT: where/mask is modified in order by where/region_assign1, where/region_assign2 and is needed by where/region_assign3 that is scheduled in a later run
+!FUSE-NEXT: run 0 save    : where/mask
+!FUSE-NEXT: run 4 evaluate: where/region_assign3
+!FUSE-NEXT: conflict (aligned): R/W: %{{.*}} = fir.box_addr %arg2 : (!fir.box<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>> W:%{{.*}} = fir.box_addr %arg2 : (!fir.box<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>>
+!FUSE-NEXT: run 4 evaluate: where/region_assign4

diff  --git a/flang/test/HLFIR/order_assignments/where-fusing-scheduling.f90 b/flang/test/HLFIR/order_assignments/where-fusing-scheduling.f90
index 08d4092b49aef..1de457f974508 100644
--- a/flang/test/HLFIR/order_assignments/where-fusing-scheduling.f90
+++ b/flang/test/HLFIR/order_assignments/where-fusing-scheduling.f90
@@ -37,5 +37,6 @@ subroutine unfusable(x, y, mask)
 !FUSE-NEXT: run 1 evaluate: where/region_assign2
 !FUSE-LABEL: ------------ scheduling where in _QPunfusable ------------
 !FUSE-NEXT: run 1 evaluate: where/region_assign1
-!FUSE-NEXT: conflict: R/W: <block argument> of type '!fir.box<!fir.array<?xf32>>' at index: 1 W:<block argument> of type '!fir.box<!fir.array<?xf32>>' at index: 1
+!FUSE-NEXT: conflict (aligned): R/W: <block argument> of type '!fir.box<!fir.array<?xf32>>' at index: 1 W:<block argument> of type '!fir.box<!fir.array<?xf32>>' at index: 1
+!FUSE-NEXT: conflicting arrays:{{.*}} and {{.*}}
 !FUSE-NEXT: run 2 evaluate: where/region_assign2

diff  --git a/flang/test/HLFIR/order_assignments/where-scheduling.f90 b/flang/test/HLFIR/order_assignments/where-scheduling.f90
index 6feaba0d3389a..496789334b84e 100644
--- a/flang/test/HLFIR/order_assignments/where-scheduling.f90
+++ b/flang/test/HLFIR/order_assignments/where-scheduling.f90
@@ -127,12 +127,30 @@ end function f
   end where
 end subroutine
 
+subroutine where_construct_need_to_be_split_no_temps(x, y)
+  real :: x(:, :), y(:, :)
+  where (y.gt.0.)
+    x = y
+  elsewhere (x(ubound(x,1):1:-1, :).gt.0)
+    y = x
+  end where
+end subroutine
+
+subroutine where_construct_need_to_be_split_with_temps(x, y)
+  real :: x(:, :), y(:, :)
+  where (y.gt.0.)
+    x = y
+    y = 0.
+  elsewhere (x(ubound(x,1):1:-1, :).gt.0)
+    y = x
+  end where
+end subroutine
+
 !CHECK-LABEL: ------------ scheduling where in _QPno_conflict ------------
 !CHECK-NEXT: run 1 evaluate: where/region_assign1
 !CHECK-LABEL: ------------ scheduling where in _QPfake_conflict ------------
-!CHECK-NEXT: conflict: R/W: <block argument> of type '!fir.box<!fir.array<?xf32>>' at index: 0 W:<block argument> of type '!fir.box<!fir.array<?xf32>>' at index: 0
-!CHECK-NEXT: run 1 save    : where/mask
-!CHECK-NEXT: run 2 evaluate: where/region_assign1
+!CHECK-NEXT: conflict (aligned): R/W: <block argument> of type '!fir.box<!fir.array<?xf32>>' at index: 0 W:<block argument> of type '!fir.box<!fir.array<?xf32>>' at index: 0
+!CHECK-NEXT: run 1 evaluate: where/region_assign1
 !CHECK-LABEL: ------------ scheduling where in _QPonly_once ------------
 !CHECK-NEXT: unknown effect: %11 = fir.call @_QPcall_me_only_once() fastmath<contract> : () -> !fir.array<10x!fir.logical<4>>
 !CHECK-NEXT: saving eval because write effect prevents re-evaluation
@@ -148,24 +166,22 @@ end function f
 !CHECK-NEXT: run 2 evaluate: where/elsewhere1/region_assign1
 !CHECK-LABEL: ------------ scheduling where in _QPwhere_construct_conflict ------------
 !CHECK-NEXT: run 1 evaluate: where/region_assign1
-!CHECK-NEXT: conflict: R/W: <block argument> of type '!fir.box<!fir.array<?x?xf32>>' at index: 1 W:<block argument> of type '!fir.box<!fir.array<?x?xf32>>' at index: 1
-!CHECK-NEXT: run 2 save    : where/mask
-!CHECK-NEXT: run 3 evaluate: where/elsewhere1/region_assign1
+!CHECK-NEXT: conflict (aligned): R/W: <block argument> of type '!fir.box<!fir.array<?x?xf32>>' at index: 1 W:<block argument> of type '!fir.box<!fir.array<?x?xf32>>' at index: 1
+!CHECK-NEXT: run 2 evaluate: where/elsewhere1/region_assign1
 !CHECK-LABEL: ------------ scheduling where in _QPwhere_construct_conflict_2 ------------
-!CHECK-NEXT: conflict: R/W: <block argument> of type '!fir.box<!fir.array<?x?xf32>>' at index: 0 W:<block argument> of type '!fir.box<!fir.array<?x?xf32>>' at index: 0
-!CHECK-NEXT: run 1 save    : where/mask
-!CHECK-NEXT: run 2 evaluate: where/region_assign1
-!CHECK-NEXT: conflict: R/W: <block argument> of type '!fir.box<!fir.array<?x?xf32>>' at index: 1 W:<block argument> of type '!fir.box<!fir.array<?x?xf32>>' at index: 1
-!CHECK-NEXT: run 3 save    : where/elsewhere1/mask
-!CHECK-NEXT: run 4 evaluate: where/elsewhere1/region_assign1
+!CHECK-NEXT: conflict (aligned): R/W: <block argument> of type '!fir.box<!fir.array<?x?xf32>>' at index: 0 W:<block argument> of type '!fir.box<!fir.array<?x?xf32>>' at index: 0
+!CHECK-NEXT: run 1 evaluate: where/region_assign1
+!CHECK-NEXT: conflict (aligned): R/W: <block argument> of type '!fir.box<!fir.array<?x?xf32>>' at index: 1 W:<block argument> of type '!fir.box<!fir.array<?x?xf32>>' at index: 1
+!CHECK-NEXT: where/mask is modified in order by where/region_assign1 and is needed by where/elsewhere1/region_assign1 that is scheduled in a later run
+!CHECK-NEXT: run 0 save    : where/mask
+!CHECK-NEXT: run 3 evaluate: where/elsewhere1/region_assign1
 !CHECK-LABEL: ------------ scheduling where in _QPwhere_vector_subscript_conflict_1 ------------
 !CHECK-NEXT: conflict: R/W: <block argument> of type '!fir.ref<!fir.array<10xf32>>' at index: 0 W:<block argument> of type '!fir.ref<!fir.array<10xf32>>' at index: 0
 !CHECK-NEXT: run 1 save    : where/mask
 !CHECK-NEXT: run 2 evaluate: where/region_assign1
 !CHECK-LABEL: ------------ scheduling where in _QPwhere_vector_subscript_conflict_2 ------------
-!CHECK-NEXT: conflict: R/W: <block argument> of type '!fir.ref<!fir.array<10xi32>>' at index: 0 W:<block argument> of type '!fir.ref<!fir.array<10xi32>>' at index: 0
-!CHECK-NEXT: run 1 save    : where/mask
-!CHECK-NEXT: run 2 evaluate: where/region_assign1
+!CHECK-NEXT: conflict (aligned): R/W: <block argument> of type '!fir.ref<!fir.array<10xi32>>' at index: 0 W:<block argument> of type '!fir.ref<!fir.array<10xi32>>' at index: 0
+!CHECK-NEXT: run 1 evaluate: where/region_assign1
 !CHECK-LABEL: ------------ scheduling forall in _QPwhere_in_forall_conflict ------------
 !CHECK-NEXT: conflict: R/W: <block argument> of type '!fir.box<!fir.array<?x?xf32>>' at index: 0 W:<block argument> of type '!fir.box<!fir.array<?x?xf32>>' at index: 0
 !CHECK-NEXT: run 1 save    : forall/where1/mask
@@ -195,3 +211,15 @@ end function f
 !CHECK-NEXT: saving eval because write effect prevents re-evaluation
 !CHECK-NEXT: run 3 save  (w): where/elsewhere1/region_assign1/rhs
 !CHECK-NEXT: run 4 evaluate: where/elsewhere1/region_assign1
+!CHECK-LABEL: ------------ scheduling where in _QPwhere_construct_need_to_be_split_no_temps ------------
+!CHECK-NEXT: run 1 evaluate: where/region_assign1
+!CHECK-NEXT: conflict (aligned): R/W: <block argument> of type '!fir.box<!fir.array<?x?xf32>>' at index: 1 W:<block argument> of type '!fir.box<!fir.array<?x?xf32>>' at index: 1
+!CHECK-NEXT: run 2 evaluate: where/elsewhere1/region_assign1
+!CHECK-LABEL: ------------ scheduling where in _QPwhere_construct_need_to_be_split_with_temps ------------
+!CHECK-NEXT: run 1 evaluate: where/region_assign1
+!CHECK-NEXT: conflict (aligned): R/W: <block argument> of type '!fir.box<!fir.array<?x?xf32>>' at index: 1 W:<block argument> of type '!fir.box<!fir.array<?x?xf32>>' at index: 1
+!CHECK-NEXT: run 2 evaluate: where/region_assign2
+!CHECK-NEXT: conflict (aligned): R/W: <block argument> of type '!fir.box<!fir.array<?x?xf32>>' at index: 1 W:<block argument> of type '!fir.box<!fir.array<?x?xf32>>' at index: 1
+!CHECK-NEXT: where/mask is modified in order by where/region_assign2 and is needed by where/elsewhere1/region_assign1 that is scheduled in a later run
+!CHECK-NEXT: run 0 save    : where/mask
+!CHECK-NEXT: run 4 evaluate: where/elsewhere1/region_assign1


        


More information about the flang-commits mailing list