[flang-commits] [flang] [flang] avoid introducing iteration dependencies in WHERE and FORALL temporaries (PR #195053)

via flang-commits flang-commits at lists.llvm.org
Thu Apr 30 07:15:14 PDT 2026


https://github.com/jeanPerier updated https://github.com/llvm/llvm-project/pull/195053

>From 9e9b9f9334b9879e86a21e5629e79ed0395d3e8d Mon Sep 17 00:00:00 2001
From: Jean Perier <jperier at nvidia.com>
Date: Thu, 30 Apr 2026 03:19:58 -0700
Subject: [PATCH 1/2] [flang] avoid introducing iteration dependencies in WHERE
 temporaries

---
 .../Optimizer/Builder/TemporaryStorage.h      |  54 +++-
 .../Optimizer/Builder/TemporaryStorage.cpp    |  82 +++++
 .../LowerHLFIROrderedAssignments.cpp          |  92 +++++-
 .../array-temp-many-forall.f90                |  45 +++
 .../HLFIR/order_assignments/array-temp.fir    | 112 +++++++
 .../HLFIR/order_assignments/impure-where.fir  |  16 +-
 .../order_assignments/inlined-stack-temp.fir  | 302 ++++++++----------
 .../order_assignments/saving-mask-and-rhs.fir |  18 +-
 .../user-defined-assignment.fir               |  19 +-
 9 files changed, 514 insertions(+), 226 deletions(-)
 create mode 100644 flang/test/HLFIR/order_assignments/array-temp-many-forall.f90
 create mode 100644 flang/test/HLFIR/order_assignments/array-temp.fir

diff --git a/flang/include/flang/Optimizer/Builder/TemporaryStorage.h b/flang/include/flang/Optimizer/Builder/TemporaryStorage.h
index cdb23a64c5c8a..e1edc5912ae97 100644
--- a/flang/include/flang/Optimizer/Builder/TemporaryStorage.h
+++ b/flang/include/flang/Optimizer/Builder/TemporaryStorage.h
@@ -19,6 +19,7 @@
 #ifndef FORTRAN_OPTIMIZER_BUILDER_TEMPORARYSTORAGE_H
 #define FORTRAN_OPTIMIZER_BUILDER_TEMPORARYSTORAGE_H
 
+#include "flang/Common/idioms.h"
 #include "flang/Optimizer/HLFIR/HLFIROps.h"
 
 namespace fir {
@@ -98,6 +99,34 @@ class HomogeneousScalarStack {
   mlir::Value temp;
 };
 
+/// Multidimensional temporary indexed directly by the enclosing loop induction
+/// variables (innermost loop is the first dimension). The indices passed to
+/// pushValue/fetch are interpreted in the array's domain, which is described
+/// by a fir.shape_shift built from the loop extents and lower bounds. This
+/// avoids the loop-carried counter used by HomogeneousScalarStack, keeping
+/// loop iterations independent. Limited to Fortran::common::maxRank dimensions.
+class ArrayTemp {
+public:
+  ArrayTemp(mlir::Location loc, fir::FirOpBuilder &builder,
+            fir::SequenceType declaredType, llvm::ArrayRef<mlir::Value> extents,
+            llvm::ArrayRef<mlir::Value> lowerBounds,
+            llvm::ArrayRef<mlir::Value> lengths, bool allocateOnHeap,
+            llvm::StringRef name);
+
+  void pushValue(mlir::Location loc, fir::FirOpBuilder &builder,
+                 mlir::Value value, mlir::ValueRange indices);
+  void resetFetchPosition(mlir::Location loc, fir::FirOpBuilder &builder) {}
+  mlir::Value fetch(mlir::Location loc, fir::FirOpBuilder &builder,
+                    mlir::ValueRange indices);
+  void destroy(mlir::Location loc, fir::FirOpBuilder &builder);
+  bool canBeFetchedAfterPush() const { return true; }
+
+private:
+  const bool allocateOnHeap;
+  mlir::Value temp;
+  llvm::SmallVector<mlir::Value> typeParams;
+};
+
 /// Structure to hold the value of a single entity.
 class SimpleCopy {
 public:
@@ -255,16 +284,26 @@ class TemporaryStorage {
   TemporaryStorage(T &&impl) : impl{std::forward<T>(impl)} {}
 
   void pushValue(mlir::Location loc, fir::FirOpBuilder &builder,
-                 mlir::Value value) {
-    std::visit([&](auto &temp) { temp.pushValue(loc, builder, value); }, impl);
+                 mlir::Value value, mlir::ValueRange indices = {}) {
+    // Only ArrayTemp uses the loop indices; other temps don't take them.
+    std::visit(Fortran::common::visitors{
+                   [&](ArrayTemp &temp) {
+                     temp.pushValue(loc, builder, value, indices);
+                   },
+                   [&](auto &temp) { temp.pushValue(loc, builder, value); }},
+               impl);
   }
   void resetFetchPosition(mlir::Location loc, fir::FirOpBuilder &builder) {
     std::visit([&](auto &temp) { temp.resetFetchPosition(loc, builder); },
                impl);
   }
-  mlir::Value fetch(mlir::Location loc, fir::FirOpBuilder &builder) {
-    return std::visit([&](auto &temp) { return temp.fetch(loc, builder); },
-                      impl);
+  mlir::Value fetch(mlir::Location loc, fir::FirOpBuilder &builder,
+                    mlir::ValueRange indices = {}) {
+    return std::visit(
+        Fortran::common::visitors{
+            [&](ArrayTemp &temp) { return temp.fetch(loc, builder, indices); },
+            [&](auto &temp) { return temp.fetch(loc, builder); }},
+        impl);
   }
   void destroy(mlir::Location loc, fir::FirOpBuilder &builder) {
     std::visit([&](auto &temp) { temp.destroy(loc, builder); }, impl);
@@ -282,8 +321,9 @@ class TemporaryStorage {
   }
 
 private:
-  std::variant<HomogeneousScalarStack, SimpleCopy, SSARegister, AnyValueStack,
-               AnyVariableStack, AnyVectorSubscriptStack, AnyAddressStack>
+  std::variant<HomogeneousScalarStack, ArrayTemp, SimpleCopy, SSARegister,
+               AnyValueStack, AnyVariableStack, AnyVectorSubscriptStack,
+               AnyAddressStack>
       impl;
 };
 } // namespace fir::factory
diff --git a/flang/lib/Optimizer/Builder/TemporaryStorage.cpp b/flang/lib/Optimizer/Builder/TemporaryStorage.cpp
index 5db40aff91878..0233fc9f023de 100644
--- a/flang/lib/Optimizer/Builder/TemporaryStorage.cpp
+++ b/flang/lib/Optimizer/Builder/TemporaryStorage.cpp
@@ -134,6 +134,88 @@ hlfir::Entity fir::factory::HomogeneousScalarStack::moveStackAsArrayExpr(
   return hlfir::Entity{hlfirExpr};
 }
 
+//===----------------------------------------------------------------------===//
+// fir::factory::ArrayTemp implementation.
+//===----------------------------------------------------------------------===//
+
+fir::factory::ArrayTemp::ArrayTemp(mlir::Location loc,
+                                   fir::FirOpBuilder &builder,
+                                   fir::SequenceType declaredType,
+                                   llvm::ArrayRef<mlir::Value> extents,
+                                   llvm::ArrayRef<mlir::Value> lowerBounds,
+                                   llvm::ArrayRef<mlir::Value> lengths,
+                                   bool allocateOnHeap, llvm::StringRef name)
+    : allocateOnHeap{allocateOnHeap},
+      typeParams{lengths.begin(), lengths.end()} {
+  assert(extents.size() == lowerBounds.size() &&
+         "extents and lowerBounds must have the same size");
+  assert(extents.size() == declaredType.getDimension() &&
+         "declared type rank must match the number of extents");
+  mlir::Value tempStorage;
+  if (allocateOnHeap)
+    tempStorage =
+        builder.createHeapTemporary(loc, declaredType, name, extents, lengths);
+  else
+    tempStorage =
+        builder.createTemporary(loc, declaredType, name, extents, lengths);
+  // Use a fir.shape_shift so the temp's lower bounds match the loop bounds:
+  // the indices passed to pushValue/fetch can then index it directly.
+  mlir::Value shape = builder.genShape(loc, lowerBounds, extents);
+  temp =
+      hlfir::DeclareOp::create(builder, loc, tempStorage, name, shape, lengths)
+          .getBase();
+}
+
+/// Generate an hlfir.designate on \p temp for the element at \p indices. The
+/// indices are interpreted in the temp's array domain (matching its lower
+/// bounds, which were set from the enclosing loop bounds).
+static mlir::Value genArrayTempElementAddr(mlir::Location loc,
+                                           fir::FirOpBuilder &builder,
+                                           mlir::Value temp,
+                                           mlir::ValueRange indices,
+                                           mlir::ValueRange typeParams) {
+  hlfir::Entity entity{temp};
+  mlir::Type refTy = fir::ReferenceType::get(entity.getFortranElementType());
+  mlir::Type idxTy = builder.getIndexType();
+  llvm::SmallVector<mlir::Value> idxs;
+  idxs.reserve(indices.size());
+  for (mlir::Value idx : indices)
+    idxs.push_back(builder.createConvert(loc, idxTy, idx));
+  return hlfir::DesignateOp::create(builder, loc, refTy, temp, idxs,
+                                    typeParams);
+}
+
+void fir::factory::ArrayTemp::pushValue(mlir::Location loc,
+                                        fir::FirOpBuilder &builder,
+                                        mlir::Value value,
+                                        mlir::ValueRange indices) {
+  hlfir::Entity entity{value};
+  assert(entity.isScalar() && "cannot use ArrayTemp with array");
+  // Match HomogeneousScalarStack: derived types need the runtime path (TODO).
+  if (!entity.hasIntrinsicType())
+    TODO(loc, "creating ArrayTemp for derived types");
+  mlir::Value addr =
+      genArrayTempElementAddr(loc, builder, temp, indices, typeParams);
+  hlfir::AssignOp::create(builder, loc, value, addr);
+}
+
+mlir::Value fir::factory::ArrayTemp::fetch(mlir::Location loc,
+                                           fir::FirOpBuilder &builder,
+                                           mlir::ValueRange indices) {
+  mlir::Value addr =
+      genArrayTempElementAddr(loc, builder, temp, indices, typeParams);
+  return hlfir::loadTrivialScalar(loc, builder, hlfir::Entity{addr});
+}
+
+void fir::factory::ArrayTemp::destroy(mlir::Location loc,
+                                      fir::FirOpBuilder &builder) {
+  if (allocateOnHeap) {
+    auto declare = temp.getDefiningOp<hlfir::DeclareOp>();
+    assert(declare && "temp must have been declared");
+    fir::FreeMemOp::create(builder, loc, declare.getMemref());
+  }
+}
+
 //===----------------------------------------------------------------------===//
 // fir::factory::SimpleCopy implementation.
 //===----------------------------------------------------------------------===//
diff --git a/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIROrderedAssignments.cpp b/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIROrderedAssignments.cpp
index a3fd19d95fbbc..08a43e476c0cd 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIROrderedAssignments.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIROrderedAssignments.cpp
@@ -18,6 +18,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "ScheduleOrderedAssignments.h"
+#include "flang/Common/Fortran-consts.h"
 #include "flang/Optimizer/Builder/FIRBuilder.h"
 #include "flang/Optimizer/Builder/HLFIRTools.h"
 #include "flang/Optimizer/Builder/TemporaryStorage.h"
@@ -257,6 +258,11 @@ class OrderedAssignmentRewriter {
   bool currentLoopNestIterationNumberCanBeComputed(
       llvm::SmallVectorImpl<fir::DoLoopOp> &loopNest);
 
+  /// Return the induction variables of the enclosing fir.do_loop nest at the
+  /// current insertion point, innermost first (same order as
+  /// currentLoopNestIterationNumberCanBeComputed). Used to index ArrayTemp.
+  llvm::SmallVector<mlir::Value> getLoopIndices();
+
   template <typename T>
   fir::factory::TemporaryStorage *insertSavedEntity(mlir::Region &region,
                                                     T &&temp) {
@@ -666,10 +672,12 @@ OrderedAssignmentRewriter::getIfSaved(mlir::Region &region) {
   if (auto savedInSameRun = savedInCurrentRunBeforeUse.find(&region);
       savedInSameRun != savedInCurrentRunBeforeUse.end())
     return savedInSameRun->second;
-  // If the region was saved in a previous run, fetch the saved value.
+  // If the region was saved in a previous run, fetch the saved value. The
+  // loop indices are only used by ArrayTemp; the other temps ignore them.
   if (auto temp = savedEntities.find(&region); temp != savedEntities.end()) {
     doBeforeLoopNest([&]() { temp->second.resetFetchPosition(loc, builder); });
-    return ValueAndCleanUp{temp->second.fetch(loc, builder), std::nullopt};
+    return ValueAndCleanUp{temp->second.fetch(loc, builder, getLoopIndices()),
+                           std::nullopt};
   }
   return std::nullopt;
 }
@@ -1109,6 +1117,41 @@ computeLoopNestIterationNumber(mlir::Location loc, fir::FirOpBuilder &builder,
   return loopExtent;
 }
 
+/// Compute the extents (trip counts) and lower bounds of \p loopNest, in nest
+/// order (innermost first). NOTE(review): confirm non-unit steps are safe.
+static void computeLoopNestExtentsAndLowerBounds(
+    mlir::Location loc, fir::FirOpBuilder &builder,
+    llvm::ArrayRef<fir::DoLoopOp> loopNest,
+    llvm::SmallVectorImpl<mlir::Value> &extents,
+    llvm::SmallVectorImpl<mlir::Value> &lowerBounds) {
+  extents.reserve(extents.size() + loopNest.size());
+  lowerBounds.reserve(lowerBounds.size() + loopNest.size());
+  for (fir::DoLoopOp doLoop : loopNest) {
+    mlir::Value extent = builder.genExtentFromTriplet(
+        loc, doLoop.getLowerBound(), doLoop.getUpperBound(), doLoop.getStep(),
+        builder.getIndexType());
+    extents.push_back(extent);
+    lowerBounds.push_back(doLoop.getLowerBound());
+  }
+}
+
+llvm::SmallVector<mlir::Value> OrderedAssignmentRewriter::getLoopIndices() {
+  llvm::SmallVector<mlir::Value> indices;
+  if (constructStack.empty())
+    return indices;
+  mlir::Operation *outerLoop = constructStack[0];
+  mlir::Operation *currentConstruct = constructStack.back();
+  while (currentConstruct) {
+    if (auto doLoop = mlir::dyn_cast<fir::DoLoopOp>(currentConstruct))
+      indices.push_back(doLoop.getInductionVar());
+    if (currentConstruct == outerLoop)
+      currentConstruct = nullptr;
+    else
+      currentConstruct = currentConstruct->getParentOp();
+  }
+  return indices;
+}
+
 /// Return a name for temporary storage that indicates in which context
 /// the temporary storage was created.
 static llvm::StringRef
@@ -1160,20 +1203,35 @@ void OrderedAssignmentRewriter::generateSaveEntity(
     bool loopShapeCanBePreComputed =
         currentLoopNestIterationNumberCanBeComputed(loopNest);
     doBeforeLoopNest([&] {
-      /// For simple scalars inside loops whose total iteration number can be
-      /// pre-computed, create a rank-1 array outside of the loops. It will be
-      /// assigned/fetched inside the loops like a normal Fortran array given
-      /// the iteration count.
+      // For simple scalars in a precomputable loop nest, prefer the
+      // multidimensional ArrayTemp (indexed by loop induction variables) so
+      // there is no loop-carried counter. Fall back to the 1D counter-based
+      // HomogeneousScalarStack when the nest is deeper than the maximum
+      // fir.array rank.
       if (loopShapeCanBePreComputed && fir::isa_trivial(entityType)) {
-        mlir::Value loopExtent =
-            computeLoopNestIterationNumber(loc, builder, loopNest);
-        auto sequenceType =
-            mlir::cast<fir::SequenceType>(builder.getVarLenSeqTy(entityType));
-        temp = insertSavedEntity(region,
-                                 fir::factory::HomogeneousScalarStack{
-                                     loc, builder, sequenceType, loopExtent,
-                                     /*lenParams=*/{}, allocateOnHeap,
-                                     /*stackThroughLoops=*/true, tempName});
+        if (loopNest.size() <= static_cast<size_t>(Fortran::common::maxRank)) {
+          llvm::SmallVector<mlir::Value> tempExtents;
+          llvm::SmallVector<mlir::Value> tempLowerBounds;
+          computeLoopNestExtentsAndLowerBounds(loc, builder, loopNest,
+                                               tempExtents, tempLowerBounds);
+          auto sequenceType = mlir::cast<fir::SequenceType>(
+              builder.getVarLenSeqTy(entityType, /*rank=*/loopNest.size()));
+          temp = insertSavedEntity(
+              region,
+              fir::factory::ArrayTemp{loc, builder, sequenceType, tempExtents,
+                                      tempLowerBounds, /*lengths=*/{},
+                                      allocateOnHeap, tempName});
+        } else {
+          mlir::Value loopExtent =
+              computeLoopNestIterationNumber(loc, builder, loopNest);
+          auto sequenceType =
+              mlir::cast<fir::SequenceType>(builder.getVarLenSeqTy(entityType));
+          temp = insertSavedEntity(region,
+                                   fir::factory::HomogeneousScalarStack{
+                                       loc, builder, sequenceType, loopExtent,
+                                       /*lenParams=*/{}, allocateOnHeap,
+                                       /*stackThroughLoops=*/true, tempName});
+        }
 
       } else {
         // If the number of iteration is not known, or if the values at each
@@ -1185,8 +1243,8 @@ void OrderedAssignmentRewriter::generateSaveEntity(
       }
     });
     // Inside the loop nest (and any fir.if if there are active masks), copy
-    // the value to the temp and do clean-ups for the value if any.
-    temp->pushValue(loc, builder, entity);
+    // the value to the temp and do clean-ups of the value if any.
+    temp->pushValue(loc, builder, entity, getLoopIndices());
   }
 
   // Delay the clean-up if the entity will be used in the same run (i.e., the
diff --git a/flang/test/HLFIR/order_assignments/array-temp-many-forall.f90 b/flang/test/HLFIR/order_assignments/array-temp-many-forall.f90
new file mode 100644
index 0000000000000..0078cf4e5a446
--- /dev/null
+++ b/flang/test/HLFIR/order_assignments/array-temp-many-forall.f90
@@ -0,0 +1,45 @@
+! Test that the lower-hlfir-ordered-assignments pass falls back to the
+! 1D HomogeneousScalarStack temporary (counter-based) when the FORALL loop
+! nest is deeper than Fortran::common::maxRank (15), because fir.array can
+! only hold up to maxRank dimensions.
+!
+! Up to maxRank levels, ArrayTemp is used and there is no counter; here we
+! verify the opposite: the counter (a fir.alloca index, fir.load/addi/store
+! pattern) is restored when the loop nest has 16 levels.
+!
+! The test uses a rank-8 array of derived type with a rank-8 array component
+! to spread 16 indexable dimensions across the FORALL header.
+!
+! RUN: bbc -emit-hlfir -o - %s | fir-opt --lower-hlfir-ordered-assignments | FileCheck %s
+
+module many_forall_mod
+  type :: t
+    real :: c(2,2,2,2,2,2,2,2)
+  end type
+contains
+  subroutine more_than_15_forall(a)
+    type(t), intent(inout) :: a(2,2,2,2,2,2,2,2)
+    forall (i1=1:2, i2=1:2, i3=1:2, i4=1:2, i5=1:2, i6=1:2, i7=1:2, i8=1:2, &
+            j1=1:2, j2=1:2, j3=1:2, j4=1:2, j5=1:2, j6=1:2, j7=1:2, j8=1:2)
+      a(i1,i2,i3,i4,i5,i6,i7,i8)%c(j1,j2,j3,j4,j5,j6,j7,j8) = &
+        a(3-i1,3-i2,3-i3,3-i4,3-i5,3-i6,3-i7,3-i8)%c(3-j1,3-j2,3-j3,3-j4,3-j5,3-j6,3-j7,3-j8)
+    end forall
+  end subroutine
+end module
+! With 16 nested loops, the temporary must be the 1D counter-based form
+! (HomogeneousScalarStack) instead of a 16D ArrayTemp, since fir.array is
+! limited to Fortran::common::maxRank dimensions.
+!
+! CHECK-LABEL: func.func @_QMmany_forall_modPmore_than_15_forall(
+! There must be a counter in memory (fir.alloca index).
+! CHECK:         %[[CTR:.*]] = fir.alloca index
+! The temporary is a 1D fir.array<?xf32>.
+! CHECK:         %[[ALLOC:.*]] = fir.allocmem !fir.array<?xf32>, %{{.*}} {bindc_name = ".tmp.forall", uniq_name = ""}
+! Plain fir.shape (no shift), since the temp is indexed by the counter.
+! CHECK:         %[[SHAPE:.*]] = fir.shape %{{.*}} : (index) -> !fir.shape<1>
+! CHECK:         hlfir.declare %[[ALLOC]](%[[SHAPE]]) {uniq_name = ".tmp.forall"} : (!fir.heap<!fir.array<?xf32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xf32>>, !fir.heap<!fir.array<?xf32>>)
+! Inside the loop nest the counter is incremented and the temp is indexed
+! through the counter (not directly through the loop induction variables).
+! CHECK:         fir.load %[[CTR]] : !fir.ref<index>
+! CHECK:         arith.addi %{{.*}}, %{{.*}} : index
+! CHECK:         fir.store %{{.*}} to %[[CTR]] : !fir.ref<index>
diff --git a/flang/test/HLFIR/order_assignments/array-temp.fir b/flang/test/HLFIR/order_assignments/array-temp.fir
new file mode 100644
index 0000000000000..5510d0220397a
--- /dev/null
+++ b/flang/test/HLFIR/order_assignments/array-temp.fir
@@ -0,0 +1,112 @@
+// Test that hlfir.where/hlfir.forall temporary storages used for simple
+// scalar values are lowered to a multidimensional ArrayTemp directly indexed
+// by the enclosing loop induction variables (using hlfir.designate on a
+// fir.shape_shift) rather than to a 1D HomogeneousScalarStack with a counter.
+// RUN: fir-opt %s --lower-hlfir-ordered-assignments | FileCheck %s
+
+// Single-dimension case: the saved RHS is stored in a 1D ArrayTemp indexed
+// by the where loop induction variable, with a fir.shape_shift instead of a
+// fir.shape.
+func.func @where_self_overlap(%x: !fir.ref<!fir.array<10xi32>>, %mask: !fir.ref<!fir.array<10x!fir.logical<4>>>) {
+  %c-1 = arith.constant -1 : index
+  %c1 = arith.constant 1 : index
+  %c10 = arith.constant 10 : index
+  %1 = fir.shape %c10 : (index) -> !fir.shape<1>
+  hlfir.where {
+    hlfir.yield %mask : !fir.ref<!fir.array<10x!fir.logical<4>>>
+  } do {
+    hlfir.region_assign {
+      %2 = hlfir.designate %x (%c10:%c1:%c-1)  shape %1 :
+(!fir.ref<!fir.array<10xi32>>, index, index, index, !fir.shape<1>) -> !fir.ref<!fir.array<10xi32>>
+      hlfir.yield %2 : !fir.ref<!fir.array<10xi32>>
+    } to {
+      hlfir.yield %x : !fir.ref<!fir.array<10xi32>>
+    }
+  }
+  return
+}
+// CHECK-LABEL:   func.func @where_self_overlap(
+// CHECK-NOT:       fir.alloca index
+// CHECK:           %[[ALLOC:.*]] = fir.allocmem !fir.array<?xi32>, %{{.*}} {bindc_name = ".tmp.where", uniq_name = ""}
+// CHECK:           %[[SHAPE:.*]] = fir.shape_shift %{{.*}}, %{{.*}} : (index, index) -> !fir.shapeshift<1>
+// CHECK:           %[[DECL:.*]]:2 = hlfir.declare %[[ALLOC]](%[[SHAPE]]) {uniq_name = ".tmp.where"} : (!fir.heap<!fir.array<?xi32>>, !fir.shapeshift<1>) -> (!fir.box<!fir.array<?xi32>>, !fir.heap<!fir.array<?xi32>>)
+// First loop: save the RHS values, addressed by the loop induction variable.
+// CHECK:           fir.do_loop %[[IV0:.*]] = %{{.*}} to %{{.*}} step %{{.*}} {
+// CHECK:             fir.if %{{.*}} {
+// CHECK-NOT:           fir.load %{{.*}} : !fir.ref<index>
+// CHECK:               %[[ADDR0:.*]] = hlfir.designate %[[DECL]]#0 (%[[IV0]])  : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
+// CHECK:               hlfir.assign %{{.*}} to %[[ADDR0]] : i32, !fir.ref<i32>
+// CHECK:             }
+// CHECK:           }
+// Second loop: read back from the temp using the new induction variable.
+// CHECK:           fir.do_loop %[[IV1:.*]] = %{{.*}} to %{{.*}} step %{{.*}} {
+// CHECK:             fir.if %{{.*}} {
+// CHECK:               %[[ADDR1:.*]] = hlfir.designate %[[DECL]]#0 (%[[IV1]])  : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
+// CHECK:               %[[VAL:.*]] = fir.load %[[ADDR1]] : !fir.ref<i32>
+// CHECK:               hlfir.assign %[[VAL]] to %{{.*}} : i32, !fir.ref<i32>
+// CHECK:             }
+// CHECK:           }
+// CHECK:           fir.freemem %[[ALLOC]] : !fir.heap<!fir.array<?xi32>>
+// CHECK:           return
+
+// Two nested forall loops with non-trivial lower bounds: the temp is a 2D
+// ArrayTemp whose first dimension corresponds to the inner forall and second
+// dimension to the outer one. The fir.shape_shift carries the forall lower
+// bounds, and hlfir.designate uses both induction variables (innermost first).
+func.func @nested_forall_2d(%arr: !fir.box<!fir.array<?x?xi32>>) {
+  %c2_i32 = arith.constant 2 : i32
+  %c5_i32 = arith.constant 5 : i32
+  %c3_i32 = arith.constant 3 : i32
+  %c7_i32 = arith.constant 7 : i32
+  %c1_i32 = arith.constant 1 : i32
+  %0:2 = hlfir.declare %arr {uniq_name = "x"} : (!fir.box<!fir.array<?x?xi32>>) -> (!fir.box<!fir.array<?x?xi32>>, !fir.box<!fir.array<?x?xi32>>)
+  hlfir.forall lb {
+    hlfir.yield %c2_i32 : i32
+  } ub {
+    hlfir.yield %c5_i32 : i32
+  }  (%i: i32) {
+    hlfir.forall lb {
+      hlfir.yield %c3_i32 : i32
+    } ub {
+      hlfir.yield %c7_i32 : i32
+    }  (%j: i32) {
+      hlfir.region_assign {
+        %i_idx = fir.convert %i : (i32) -> i64
+        %j_idx = fir.convert %j : (i32) -> i64
+        %addr = hlfir.designate %0#0 (%i_idx, %j_idx)  : (!fir.box<!fir.array<?x?xi32>>, i64, i64) -> !fir.ref<i32>
+        %val = fir.load %addr : !fir.ref<i32>
+        hlfir.yield %val : i32
+      } to {
+        %i_inv = arith.subi %c5_i32, %i : i32
+        %i_inv2 = arith.addi %i_inv, %c2_i32 : i32
+        %j_inv = arith.subi %c7_i32, %j : i32
+        %j_inv2 = arith.addi %j_inv, %c3_i32 : i32
+        %i_idx = fir.convert %i_inv2 : (i32) -> i64
+        %j_idx = fir.convert %j_inv2 : (i32) -> i64
+        %addr = hlfir.designate %0#0 (%i_idx, %j_idx)  : (!fir.box<!fir.array<?x?xi32>>, i64, i64) -> !fir.ref<i32>
+        hlfir.yield %addr : !fir.ref<i32>
+      }
+    }
+  }
+  return
+}
+// CHECK-LABEL:   func.func @nested_forall_2d(
+// CHECK-NOT:       fir.alloca index
+// CHECK:           %[[ALLOC:.*]] = fir.allocmem !fir.array<?x?xi32>, %{{.*}}, %{{.*}} {bindc_name = ".tmp.forall", uniq_name = ""}
+// CHECK:           %[[SHAPE:.*]] = fir.shape_shift %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : (index, index, index, index) -> !fir.shapeshift<2>
+// CHECK:           %[[DECL:.*]]:2 = hlfir.declare %[[ALLOC]](%[[SHAPE]]) {uniq_name = ".tmp.forall"} : (!fir.heap<!fir.array<?x?xi32>>, !fir.shapeshift<2>) -> (!fir.box<!fir.array<?x?xi32>>, !fir.heap<!fir.array<?x?xi32>>)
+// CHECK:           fir.do_loop %[[I0:.*]] = %{{.*}} to %{{.*}} step %{{.*}} {
+// CHECK:             fir.do_loop %[[J0:.*]] = %{{.*}} to %{{.*}} step %{{.*}} {
+// CHECK:               %[[ADDR0:.*]] = hlfir.designate %[[DECL]]#0 (%[[J0]], %[[I0]])  : (!fir.box<!fir.array<?x?xi32>>, index, index) -> !fir.ref<i32>
+// CHECK:               hlfir.assign %{{.*}} to %[[ADDR0]] : i32, !fir.ref<i32>
+// CHECK:             }
+// CHECK:           }
+// CHECK:           fir.do_loop %[[I1:.*]] = %{{.*}} to %{{.*}} step %{{.*}} {
+// CHECK:             fir.do_loop %[[J1:.*]] = %{{.*}} to %{{.*}} step %{{.*}} {
+// CHECK:               %[[ADDR1:.*]] = hlfir.designate %[[DECL]]#0 (%[[J1]], %[[I1]])  : (!fir.box<!fir.array<?x?xi32>>, index, index) -> !fir.ref<i32>
+// CHECK:               %[[V:.*]] = fir.load %[[ADDR1]] : !fir.ref<i32>
+// CHECK:               hlfir.assign %[[V]] to %{{.*}} : i32, !fir.ref<i32>
+// CHECK:             }
+// CHECK:           }
+// CHECK:           fir.freemem %[[ALLOC]] : !fir.heap<!fir.array<?x?xi32>>
+// CHECK:           return
diff --git a/flang/test/HLFIR/order_assignments/impure-where.fir b/flang/test/HLFIR/order_assignments/impure-where.fir
index 011a486b2baf7..c26b007e1973f 100644
--- a/flang/test/HLFIR/order_assignments/impure-where.fir
+++ b/flang/test/HLFIR/order_assignments/impure-where.fir
@@ -38,22 +38,22 @@ func.func @test_elsewhere_impure_mask(%x: !fir.ref<!fir.array<10xi32>>, %y: !fir
 // CHECK-LABEL:   func.func @test_elsewhere_impure_mask(
 // CHECK:           %[[VAL_12:.*]] = fir.call @impure() : () -> !fir.heap<!fir.array<10x!fir.logical<4>>>
 // CHECK:           %[[VAL_21:.*]] = fir.allocmem !fir.array<?x!fir.logical<4>>, %[[extent:[^ ]*]]
-// CHECK:           %[[VAL_22:.*]] = fir.shape %[[extent]] : (index) -> !fir.shape<1>
-// CHECK:           %[[VAL_23:.*]]:2 = hlfir.declare %[[VAL_21]](%{{.*}}) {uniq_name = ".tmp.where"}
-// CHECK:           fir.do_loop
+// CHECK:           %[[SHAPE:.*]] = fir.shape_shift %{{.*}}, %[[extent]] : (index, index) -> !fir.shapeshift<1>
+// CHECK:           %[[VAL_23:.*]]:2 = hlfir.declare %[[VAL_21]](%[[SHAPE]]) {uniq_name = ".tmp.where"}
+// CHECK:           fir.do_loop %[[IV0:.*]] = %{{.*}} to %{{.*}} step %{{.*}} {
 // CHECK:             fir.if {{.*}} {
 // CHECK:             } else {
 // CHECK:               %[[VAL_28:.*]] = hlfir.designate %[[VAL_12]] (%{{.*}})
 // CHECK:               %[[VAL_29:.*]] = fir.load %[[VAL_28]] : !fir.ref<!fir.logical<4>>
-// CHECK:               %[[VAL_32:.*]] = hlfir.designate %[[VAL_23]]#0 (%{{.*}})  : (!fir.box<!fir.array<?x!fir.logical<4>>>, index) -> !fir.ref<!fir.logical<4>>
+// CHECK:               %[[VAL_32:.*]] = hlfir.designate %[[VAL_23]]#0 (%[[IV0]])  : (!fir.box<!fir.array<?x!fir.logical<4>>>, index) -> !fir.ref<!fir.logical<4>>
 // CHECK:               hlfir.assign %[[VAL_29]] to %[[VAL_32]] : !fir.logical<4>, !fir.ref<!fir.logical<4>>
 // CHECK:             }
 // CHECK:           }
 // CHECK-NOT:       fir.call @impure
-// CHECK:           fir.do_loop
+// CHECK:           fir.do_loop %[[IV1:.*]] = %{{.*}} to %{{.*}} step %{{.*}} {
 // CHECK:             fir.if {{.*}} {
 // CHECK:             } else {
-// CHECK:               %[[VAL_42:.*]] = hlfir.designate %[[VAL_23]]#0 (%{{.*}})  : (!fir.box<!fir.array<?x!fir.logical<4>>>, index) -> !fir.ref<!fir.logical<4>>
+// CHECK:               %[[VAL_42:.*]] = hlfir.designate %[[VAL_23]]#0 (%[[IV1]])  : (!fir.box<!fir.array<?x!fir.logical<4>>>, index) -> !fir.ref<!fir.logical<4>>
 // CHECK:               %[[VAL_43:.*]] = fir.load %[[VAL_42]] : !fir.ref<!fir.logical<4>>
 // CHECK:               %[[VAL_44:.*]] = fir.convert %[[VAL_43]] : (!fir.logical<4>) -> i1
 // CHECK:               fir.if %[[VAL_44]] {
@@ -61,10 +61,10 @@ func.func @test_elsewhere_impure_mask(%x: !fir.ref<!fir.array<10xi32>>, %y: !fir
 // CHECK:             }
 // CHECK:           }
 // CHECK-NOT:       fir.call @impure
-// CHECK:           fir.do_loop
+// CHECK:           fir.do_loop %[[IV2:.*]] = %{{.*}} to %{{.*}} step %{{.*}} {
 // CHECK:             fir.if {{.*}} {
 // CHECK:             } else {
-// CHECK:               %[[VAL_52:.*]] = hlfir.designate %[[VAL_23]]#0 (%{{.*}})  : (!fir.box<!fir.array<?x!fir.logical<4>>>, index) -> !fir.ref<!fir.logical<4>>
+// CHECK:               %[[VAL_52:.*]] = hlfir.designate %[[VAL_23]]#0 (%[[IV2]])  : (!fir.box<!fir.array<?x!fir.logical<4>>>, index) -> !fir.ref<!fir.logical<4>>
 // CHECK:               %[[VAL_53:.*]] = fir.load %[[VAL_52]] : !fir.ref<!fir.logical<4>>
 // CHECK:               %[[VAL_54:.*]] = fir.convert %[[VAL_53]] : (!fir.logical<4>) -> i1
 // CHECK:               fir.if %[[VAL_54]] {
diff --git a/flang/test/HLFIR/order_assignments/inlined-stack-temp.fir b/flang/test/HLFIR/order_assignments/inlined-stack-temp.fir
index 064b12b9ed812..6eac74e23053e 100644
--- a/flang/test/HLFIR/order_assignments/inlined-stack-temp.fir
+++ b/flang/test/HLFIR/order_assignments/inlined-stack-temp.fir
@@ -27,53 +27,42 @@ func.func @test_scalar_save(%arg0: !fir.box<!fir.array<?xi32>>) {
 }
 // CHECK-LABEL:   func.func @test_scalar_save(
 // CHECK-SAME:                                %[[VAL_0:.*]]: !fir.box<!fir.array<?xi32>>) {
-// CHECK:           %[[VAL_1:.*]] = fir.alloca index
-// CHECK:           %[[VAL_2:.*]] = arith.constant 10 : i32
-// CHECK:           %[[VAL_3:.*]] = arith.constant 1 : i32
-// CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "x"} : (!fir.box<!fir.array<?xi32>>) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
-// CHECK:           %[[VAL_5:.*]] = fir.convert %[[VAL_3]] : (i32) -> index
-// CHECK:           %[[VAL_6:.*]] = fir.convert %[[VAL_2]] : (i32) -> index
-// CHECK:           %[[VAL_7:.*]] = arith.constant 1 : index
-// CHECK:           %[[VAL_8:.*]] = arith.constant 0 : index
-// CHECK:           %[[VAL_9:.*]] = arith.subi %[[VAL_6]], %[[VAL_5]] : index
-// CHECK:           %[[VAL_10:.*]] = arith.addi %[[VAL_9]], %[[VAL_7]] : index
-// CHECK:           %[[VAL_11:.*]] = arith.divsi %[[VAL_10]], %[[VAL_7]] : index
-// CHECK:           %[[VAL_12:.*]] = arith.cmpi sgt, %[[VAL_11]], %[[VAL_8]] : index
-// CHECK:           %[[VAL_13:.*]] = arith.select %[[VAL_12]], %[[VAL_11]], %[[VAL_8]] : index
-// CHECK:           %[[VAL_14:.*]] = arith.constant 1 : index
-// CHECK:           %[[VAL_15:.*]] = arith.constant 1 : index
-// CHECK:           fir.store %[[VAL_14]] to %[[VAL_1]] : !fir.ref<index>
-// CHECK:           %[[VAL_16:.*]] = fir.allocmem !fir.array<?xi32>, %[[VAL_13]] {bindc_name = ".tmp.forall", uniq_name = ""}
-// CHECK:           %[[VAL_17:.*]] = fir.shape %[[VAL_13]] : (index) -> !fir.shape<1>
-// CHECK:           %[[VAL_18:.*]]:2 = hlfir.declare %[[VAL_16]](%[[VAL_17]]) {uniq_name = ".tmp.forall"} : (!fir.heap<!fir.array<?xi32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xi32>>, !fir.heap<!fir.array<?xi32>>)
-// CHECK:           fir.do_loop %[[VAL_19:.*]] = %[[VAL_5]] to %[[VAL_6]] step %[[VAL_7]] {
-// CHECK:             %[[VAL_20:.*]] = fir.convert %[[VAL_19]] : (index) -> i32
-// CHECK:             %[[VAL_21:.*]] = fir.convert %[[VAL_20]] : (i32) -> i64
-// CHECK:             %[[VAL_22:.*]] = hlfir.designate %[[VAL_4]]#0 (%[[VAL_21]])  : (!fir.box<!fir.array<?xi32>>, i64) -> !fir.ref<i32>
-// CHECK:             %[[VAL_23:.*]] = fir.load %[[VAL_22]] : !fir.ref<i32>
-// CHECK:             %[[VAL_24:.*]] = fir.load %[[VAL_1]] : !fir.ref<index>
-// CHECK:             %[[VAL_25:.*]] = arith.addi %[[VAL_24]], %[[VAL_15]] : index
-// CHECK:             fir.store %[[VAL_25]] to %[[VAL_1]] : !fir.ref<index>
-// CHECK:             %[[VAL_26:.*]] = hlfir.designate %[[VAL_18]]#0 (%[[VAL_24]])  : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
-// CHECK:             hlfir.assign %[[VAL_23]] to %[[VAL_26]] : i32, !fir.ref<i32>
+// CHECK:           %[[VAL_1:.*]] = arith.constant 10 : i32
+// CHECK:           %[[VAL_2:.*]] = arith.constant 1 : i32
+// CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "x"} : (!fir.box<!fir.array<?xi32>>) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
+// CHECK:           %[[VAL_4:.*]] = fir.convert %[[VAL_2]] : (i32) -> index
+// CHECK:           %[[VAL_5:.*]] = fir.convert %[[VAL_1]] : (i32) -> index
+// CHECK:           %[[VAL_6:.*]] = arith.constant 1 : index
+// CHECK:           %[[VAL_7:.*]] = arith.constant 0 : index
+// CHECK:           %[[VAL_8:.*]] = arith.subi %[[VAL_5]], %[[VAL_4]] : index
+// CHECK:           %[[VAL_9:.*]] = arith.addi %[[VAL_8]], %[[VAL_6]] : index
+// CHECK:           %[[VAL_10:.*]] = arith.divsi %[[VAL_9]], %[[VAL_6]] : index
+// CHECK:           %[[VAL_11:.*]] = arith.cmpi sgt, %[[VAL_10]], %[[VAL_7]] : index
+// CHECK:           %[[VAL_12:.*]] = arith.select %[[VAL_11]], %[[VAL_10]], %[[VAL_7]] : index
+// CHECK:           %[[VAL_13:.*]] = fir.allocmem !fir.array<?xi32>, %[[VAL_12]] {bindc_name = ".tmp.forall", uniq_name = ""}
+// CHECK:           %[[VAL_14:.*]] = fir.shape_shift %[[VAL_4]], %[[VAL_12]] : (index, index) -> !fir.shapeshift<1>
+// CHECK:           %[[VAL_15:.*]]:2 = hlfir.declare %[[VAL_13]](%[[VAL_14]]) {uniq_name = ".tmp.forall"} : (!fir.heap<!fir.array<?xi32>>, !fir.shapeshift<1>) -> (!fir.box<!fir.array<?xi32>>, !fir.heap<!fir.array<?xi32>>)
+// CHECK:           fir.do_loop %[[VAL_16:.*]] = %[[VAL_4]] to %[[VAL_5]] step %[[VAL_6]] {
+// CHECK:             %[[VAL_17:.*]] = fir.convert %[[VAL_16]] : (index) -> i32
+// CHECK:             %[[VAL_18:.*]] = fir.convert %[[VAL_17]] : (i32) -> i64
+// CHECK:             %[[VAL_19:.*]] = hlfir.designate %[[VAL_3]]#0 (%[[VAL_18]])  : (!fir.box<!fir.array<?xi32>>, i64) -> !fir.ref<i32>
+// CHECK:             %[[VAL_20:.*]] = fir.load %[[VAL_19]] : !fir.ref<i32>
+// CHECK:             %[[VAL_21:.*]] = hlfir.designate %[[VAL_15]]#0 (%[[VAL_16]])  : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
+// CHECK:             hlfir.assign %[[VAL_20]] to %[[VAL_21]] : i32, !fir.ref<i32>
 // CHECK:           }
-// CHECK:           %[[VAL_27:.*]] = fir.convert %[[VAL_3]] : (i32) -> index
-// CHECK:           %[[VAL_28:.*]] = fir.convert %[[VAL_2]] : (i32) -> index
-// CHECK:           %[[VAL_29:.*]] = arith.constant 1 : index
-// CHECK:           fir.store %[[VAL_14]] to %[[VAL_1]] : !fir.ref<index>
-// CHECK:           fir.do_loop %[[VAL_30:.*]] = %[[VAL_27]] to %[[VAL_28]] step %[[VAL_29]] {
-// CHECK:             %[[VAL_31:.*]] = fir.convert %[[VAL_30]] : (index) -> i32
-// CHECK:             %[[VAL_32:.*]] = fir.load %[[VAL_1]] : !fir.ref<index>
-// CHECK:             %[[VAL_33:.*]] = arith.addi %[[VAL_32]], %[[VAL_15]] : index
-// CHECK:             fir.store %[[VAL_33]] to %[[VAL_1]] : !fir.ref<index>
-// CHECK:             %[[VAL_34:.*]] = hlfir.designate %[[VAL_18]]#0 (%[[VAL_32]])  : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
-// CHECK:             %[[VAL_35:.*]] = fir.load %[[VAL_34]] : !fir.ref<i32>
-// CHECK:             %[[VAL_36:.*]] = arith.addi %[[VAL_31]], %[[VAL_3]] : i32
-// CHECK:             %[[VAL_37:.*]] = fir.convert %[[VAL_36]] : (i32) -> i64
-// CHECK:             %[[VAL_38:.*]] = hlfir.designate %[[VAL_4]]#0 (%[[VAL_37]])  : (!fir.box<!fir.array<?xi32>>, i64) -> !fir.ref<i32>
-// CHECK:             hlfir.assign %[[VAL_35]] to %[[VAL_38]] : i32, !fir.ref<i32>
+// CHECK:           %[[VAL_22:.*]] = fir.convert %[[VAL_2]] : (i32) -> index
+// CHECK:           %[[VAL_23:.*]] = fir.convert %[[VAL_1]] : (i32) -> index
+// CHECK:           %[[VAL_24:.*]] = arith.constant 1 : index
+// CHECK:           fir.do_loop %[[VAL_25:.*]] = %[[VAL_22]] to %[[VAL_23]] step %[[VAL_24]] {
+// CHECK:             %[[VAL_26:.*]] = fir.convert %[[VAL_25]] : (index) -> i32
+// CHECK:             %[[VAL_27:.*]] = hlfir.designate %[[VAL_15]]#0 (%[[VAL_25]])  : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
+// CHECK:             %[[VAL_28:.*]] = fir.load %[[VAL_27]] : !fir.ref<i32>
+// CHECK:             %[[VAL_29:.*]] = arith.addi %[[VAL_26]], %[[VAL_2]] : i32
+// CHECK:             %[[VAL_30:.*]] = fir.convert %[[VAL_29]] : (i32) -> i64
+// CHECK:             %[[VAL_31:.*]] = hlfir.designate %[[VAL_3]]#0 (%[[VAL_30]])  : (!fir.box<!fir.array<?xi32>>, i64) -> !fir.ref<i32>
+// CHECK:             hlfir.assign %[[VAL_28]] to %[[VAL_31]] : i32, !fir.ref<i32>
 // CHECK:           }
-// CHECK:           fir.freemem %[[VAL_16]] : !fir.heap<!fir.array<?xi32>>
+// CHECK:           fir.freemem %[[VAL_13]] : !fir.heap<!fir.array<?xi32>>
 // CHECK:           return
 // CHECK:         }
 
@@ -111,87 +100,65 @@ func.func @mask_and_rhs_conflict(%arg0: !fir.box<!fir.array<?xi32>>) {
 }
 // CHECK-LABEL:   func.func @mask_and_rhs_conflict(
 // CHECK-SAME:                   %[[VAL_0:.*]]: !fir.box<!fir.array<?xi32>>) {
-// CHECK:           %[[VAL_1:.*]] = fir.alloca index
-// CHECK:           %[[VAL_2:.*]] = fir.alloca index
-// CHECK:           %[[VAL_3:.*]] = arith.constant 42 : i32
-// CHECK:           %[[VAL_4:.*]] = arith.constant 10 : i32
-// CHECK:           %[[VAL_5:.*]] = arith.constant 1 : i32
-// CHECK:           %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "x"} : (!fir.box<!fir.array<?xi32>>) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
-// CHECK:           %[[VAL_7:.*]] = fir.convert %[[VAL_5]] : (i32) -> index
-// CHECK:           %[[VAL_8:.*]] = fir.convert %[[VAL_4]] : (i32) -> index
-// CHECK:           %[[VAL_9:.*]] = arith.constant 1 : index
-// CHECK:           %[[VAL_10:.*]] = arith.constant 0 : index
-// CHECK:           %[[VAL_11:.*]] = arith.subi %[[VAL_8]], %[[VAL_7]] : index
-// CHECK:           %[[VAL_12:.*]] = arith.addi %[[VAL_11]], %[[VAL_9]] : index
-// CHECK:           %[[VAL_13:.*]] = arith.divsi %[[VAL_12]], %[[VAL_9]] : index
-// CHECK:           %[[VAL_14:.*]] = arith.cmpi sgt, %[[VAL_13]], %[[VAL_10]] : index
-// CHECK:           %[[VAL_15:.*]] = arith.select %[[VAL_14]], %[[VAL_13]], %[[VAL_10]] : index
-// CHECK:           %[[VAL_16:.*]] = arith.constant 1 : index
-// CHECK:           %[[VAL_17:.*]] = arith.constant 1 : index
-// CHECK:           fir.store %[[VAL_16]] to %[[VAL_2]] : !fir.ref<index>
-// CHECK:           %[[VAL_18:.*]] = fir.allocmem !fir.array<?xi1>, %[[VAL_15]] {bindc_name = ".tmp.forall", uniq_name = ""}
-// CHECK:           %[[VAL_19:.*]] = fir.shape %[[VAL_15]] : (index) -> !fir.shape<1>
-// CHECK:           %[[VAL_20:.*]]:2 = hlfir.declare %[[VAL_18]](%[[VAL_19]]) {uniq_name = ".tmp.forall"} : (!fir.heap<!fir.array<?xi1>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xi1>>, !fir.heap<!fir.array<?xi1>>)
-// CHECK:           %[[VAL_21:.*]] = arith.constant 0 : index
-// CHECK:           %[[VAL_22:.*]] = arith.subi %[[VAL_8]], %[[VAL_7]] : index
-// CHECK:           %[[VAL_23:.*]] = arith.addi %[[VAL_22]], %[[VAL_9]] : index
-// CHECK:           %[[VAL_24:.*]] = arith.divsi %[[VAL_23]], %[[VAL_9]] : index
-// CHECK:           %[[VAL_25:.*]] = arith.cmpi sgt, %[[VAL_24]], %[[VAL_21]] : index
-// CHECK:           %[[VAL_26:.*]] = arith.select %[[VAL_25]], %[[VAL_24]], %[[VAL_21]] : index
-// CHECK:           %[[VAL_27:.*]] = arith.constant 1 : index
-// CHECK:           %[[VAL_28:.*]] = arith.constant 1 : index
-// CHECK:           fir.store %[[VAL_27]] to %[[VAL_1]] : !fir.ref<index>
-// CHECK:           %[[VAL_29:.*]] = fir.allocmem !fir.array<?xi32>, %[[VAL_26]] {bindc_name = ".tmp.forall", uniq_name = ""}
-// CHECK:           %[[VAL_30:.*]] = fir.shape %[[VAL_26]] : (index) -> !fir.shape<1>
-// CHECK:           %[[VAL_31:.*]]:2 = hlfir.declare %[[VAL_29]](%[[VAL_30]]) {uniq_name = ".tmp.forall"} : (!fir.heap<!fir.array<?xi32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xi32>>, !fir.heap<!fir.array<?xi32>>)
-// CHECK:           fir.do_loop %[[VAL_32:.*]] = %[[VAL_7]] to %[[VAL_8]] step %[[VAL_9]] {
-// CHECK:             %[[VAL_33:.*]] = fir.convert %[[VAL_32]] : (index) -> i32
-// CHECK:             %[[VAL_34:.*]] = fir.convert %[[VAL_33]] : (i32) -> i64
-// CHECK:             %[[VAL_35:.*]] = hlfir.designate %[[VAL_6]]#0 (%[[VAL_34]])  : (!fir.box<!fir.array<?xi32>>, i64) -> !fir.ref<i32>
-// CHECK:             %[[VAL_36:.*]] = fir.load %[[VAL_35]] : !fir.ref<i32>
-// CHECK:             %[[VAL_37:.*]] = arith.cmpi sgt, %[[VAL_36]], %[[VAL_3]] : i32
-// CHECK:             %[[VAL_38:.*]] = fir.load %[[VAL_2]] : !fir.ref<index>
-// CHECK:             %[[VAL_39:.*]] = arith.addi %[[VAL_38]], %[[VAL_17]] : index
-// CHECK:             fir.store %[[VAL_39]] to %[[VAL_2]] : !fir.ref<index>
-// CHECK:             %[[VAL_40:.*]] = hlfir.designate %[[VAL_20]]#0 (%[[VAL_38]])  : (!fir.box<!fir.array<?xi1>>, index) -> !fir.ref<i1>
-// CHECK:             hlfir.assign %[[VAL_37]] to %[[VAL_40]] : i1, !fir.ref<i1>
-// CHECK:             fir.if %[[VAL_37]] {
-// CHECK:               %[[VAL_41:.*]] = fir.convert %[[VAL_33]] : (i32) -> i64
-// CHECK:               %[[VAL_42:.*]] = hlfir.designate %[[VAL_6]]#0 (%[[VAL_41]])  : (!fir.box<!fir.array<?xi32>>, i64) -> !fir.ref<i32>
-// CHECK:               %[[VAL_43:.*]] = fir.load %[[VAL_42]] : !fir.ref<i32>
-// CHECK:               %[[VAL_44:.*]] = fir.load %[[VAL_1]] : !fir.ref<index>
-// CHECK:               %[[VAL_45:.*]] = arith.addi %[[VAL_44]], %[[VAL_28]] : index
-// CHECK:               fir.store %[[VAL_45]] to %[[VAL_1]] : !fir.ref<index>
-// CHECK:               %[[VAL_46:.*]] = hlfir.designate %[[VAL_31]]#0 (%[[VAL_44]])  : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
-// CHECK:               hlfir.assign %[[VAL_43]] to %[[VAL_46]] : i32, !fir.ref<i32>
+// CHECK:           %[[VAL_1:.*]] = arith.constant 42 : i32
+// CHECK:           %[[VAL_2:.*]] = arith.constant 10 : i32
+// CHECK:           %[[VAL_3:.*]] = arith.constant 1 : i32
+// CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "x"} : (!fir.box<!fir.array<?xi32>>) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
+// CHECK:           %[[VAL_5:.*]] = fir.convert %[[VAL_3]] : (i32) -> index
+// CHECK:           %[[VAL_6:.*]] = fir.convert %[[VAL_2]] : (i32) -> index
+// CHECK:           %[[VAL_7:.*]] = arith.constant 1 : index
+// CHECK:           %[[VAL_8:.*]] = arith.constant 0 : index
+// CHECK:           %[[VAL_9:.*]] = arith.subi %[[VAL_6]], %[[VAL_5]] : index
+// CHECK:           %[[VAL_10:.*]] = arith.addi %[[VAL_9]], %[[VAL_7]] : index
+// CHECK:           %[[VAL_11:.*]] = arith.divsi %[[VAL_10]], %[[VAL_7]] : index
+// CHECK:           %[[VAL_12:.*]] = arith.cmpi sgt, %[[VAL_11]], %[[VAL_8]] : index
+// CHECK:           %[[VAL_13:.*]] = arith.select %[[VAL_12]], %[[VAL_11]], %[[VAL_8]] : index
+// CHECK:           %[[MASK_TEMP:.*]] = fir.allocmem !fir.array<?xi1>, %[[VAL_13]] {bindc_name = ".tmp.forall", uniq_name = ""}
+// CHECK:           %[[MASK_SHAPE:.*]] = fir.shape_shift %[[VAL_5]], %[[VAL_13]] : (index, index) -> !fir.shapeshift<1>
+// CHECK:           %[[MASK_DECL:.*]]:2 = hlfir.declare %[[MASK_TEMP]](%[[MASK_SHAPE]]) {uniq_name = ".tmp.forall"} : (!fir.heap<!fir.array<?xi1>>, !fir.shapeshift<1>) -> (!fir.box<!fir.array<?xi1>>, !fir.heap<!fir.array<?xi1>>)
+// CHECK:           %[[VAL_16:.*]] = arith.constant 0 : index
+// CHECK:           %[[VAL_17:.*]] = arith.subi %[[VAL_6]], %[[VAL_5]] : index
+// CHECK:           %[[VAL_18:.*]] = arith.addi %[[VAL_17]], %[[VAL_7]] : index
+// CHECK:           %[[VAL_19:.*]] = arith.divsi %[[VAL_18]], %[[VAL_7]] : index
+// CHECK:           %[[VAL_20:.*]] = arith.cmpi sgt, %[[VAL_19]], %[[VAL_16]] : index
+// CHECK:           %[[VAL_21:.*]] = arith.select %[[VAL_20]], %[[VAL_19]], %[[VAL_16]] : index
+// CHECK:           %[[RHS_TEMP:.*]] = fir.allocmem !fir.array<?xi32>, %[[VAL_21]] {bindc_name = ".tmp.forall", uniq_name = ""}
+// CHECK:           %[[RHS_SHAPE:.*]] = fir.shape_shift %[[VAL_5]], %[[VAL_21]] : (index, index) -> !fir.shapeshift<1>
+// CHECK:           %[[RHS_DECL:.*]]:2 = hlfir.declare %[[RHS_TEMP]](%[[RHS_SHAPE]]) {uniq_name = ".tmp.forall"} : (!fir.heap<!fir.array<?xi32>>, !fir.shapeshift<1>) -> (!fir.box<!fir.array<?xi32>>, !fir.heap<!fir.array<?xi32>>)
+// CHECK:           fir.do_loop %[[VAL_24:.*]] = %[[VAL_5]] to %[[VAL_6]] step %[[VAL_7]] {
+// CHECK:             %[[VAL_25:.*]] = fir.convert %[[VAL_24]] : (index) -> i32
+// CHECK:             %[[VAL_26:.*]] = fir.convert %[[VAL_25]] : (i32) -> i64
+// CHECK:             %[[VAL_27:.*]] = hlfir.designate %[[VAL_4]]#0 (%[[VAL_26]])  : (!fir.box<!fir.array<?xi32>>, i64) -> !fir.ref<i32>
+// CHECK:             %[[VAL_28:.*]] = fir.load %[[VAL_27]] : !fir.ref<i32>
+// CHECK:             %[[VAL_29:.*]] = arith.cmpi sgt, %[[VAL_28]], %[[VAL_1]] : i32
+// CHECK:             %[[VAL_30:.*]] = hlfir.designate %[[MASK_DECL]]#0 (%[[VAL_24]])  : (!fir.box<!fir.array<?xi1>>, index) -> !fir.ref<i1>
+// CHECK:             hlfir.assign %[[VAL_29]] to %[[VAL_30]] : i1, !fir.ref<i1>
+// CHECK:             fir.if %[[VAL_29]] {
+// CHECK:               %[[VAL_31:.*]] = fir.convert %[[VAL_25]] : (i32) -> i64
+// CHECK:               %[[VAL_32:.*]] = hlfir.designate %[[VAL_4]]#0 (%[[VAL_31]])  : (!fir.box<!fir.array<?xi32>>, i64) -> !fir.ref<i32>
+// CHECK:               %[[VAL_33:.*]] = fir.load %[[VAL_32]] : !fir.ref<i32>
+// CHECK:               %[[VAL_34:.*]] = hlfir.designate %[[RHS_DECL]]#0 (%[[VAL_24]])  : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
+// CHECK:               hlfir.assign %[[VAL_33]] to %[[VAL_34]] : i32, !fir.ref<i32>
 // CHECK:             }
 // CHECK:           }
-// CHECK:           %[[VAL_47:.*]] = fir.convert %[[VAL_5]] : (i32) -> index
-// CHECK:           %[[VAL_48:.*]] = fir.convert %[[VAL_4]] : (i32) -> index
-// CHECK:           %[[VAL_49:.*]] = arith.constant 1 : index
-// CHECK:           fir.store %[[VAL_16]] to %[[VAL_2]] : !fir.ref<index>
-// CHECK:           fir.store %[[VAL_27]] to %[[VAL_1]] : !fir.ref<index>
-// CHECK:           fir.do_loop %[[VAL_50:.*]] = %[[VAL_47]] to %[[VAL_48]] step %[[VAL_49]] {
-// CHECK:             %[[VAL_51:.*]] = fir.convert %[[VAL_50]] : (index) -> i32
-// CHECK:             %[[VAL_52:.*]] = fir.load %[[VAL_2]] : !fir.ref<index>
-// CHECK:             %[[VAL_53:.*]] = arith.addi %[[VAL_52]], %[[VAL_17]] : index
-// CHECK:             fir.store %[[VAL_53]] to %[[VAL_2]] : !fir.ref<index>
-// CHECK:             %[[VAL_54:.*]] = hlfir.designate %[[VAL_20]]#0 (%[[VAL_52]])  : (!fir.box<!fir.array<?xi1>>, index) -> !fir.ref<i1>
-// CHECK:             %[[VAL_55:.*]] = fir.load %[[VAL_54]] : !fir.ref<i1>
-// CHECK:             fir.if %[[VAL_55]] {
-// CHECK:               %[[VAL_56:.*]] = fir.load %[[VAL_1]] : !fir.ref<index>
-// CHECK:               %[[VAL_57:.*]] = arith.addi %[[VAL_56]], %[[VAL_28]] : index
-// CHECK:               fir.store %[[VAL_57]] to %[[VAL_1]] : !fir.ref<index>
-// CHECK:               %[[VAL_58:.*]] = hlfir.designate %[[VAL_31]]#0 (%[[VAL_56]])  : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
-// CHECK:               %[[VAL_59:.*]] = fir.load %[[VAL_58]] : !fir.ref<i32>
-// CHECK:               %[[VAL_60:.*]] = arith.addi %[[VAL_51]], %[[VAL_5]] : i32
-// CHECK:               %[[VAL_61:.*]] = fir.convert %[[VAL_60]] : (i32) -> i64
-// CHECK:               %[[VAL_62:.*]] = hlfir.designate %[[VAL_6]]#0 (%[[VAL_61]])  : (!fir.box<!fir.array<?xi32>>, i64) -> !fir.ref<i32>
-// CHECK:               hlfir.assign %[[VAL_59]] to %[[VAL_62]] : i32, !fir.ref<i32>
+// CHECK:           %[[VAL_35:.*]] = fir.convert %[[VAL_3]] : (i32) -> index
+// CHECK:           %[[VAL_36:.*]] = fir.convert %[[VAL_2]] : (i32) -> index
+// CHECK:           %[[VAL_37:.*]] = arith.constant 1 : index
+// CHECK:           fir.do_loop %[[VAL_38:.*]] = %[[VAL_35]] to %[[VAL_36]] step %[[VAL_37]] {
+// CHECK:             %[[VAL_39:.*]] = fir.convert %[[VAL_38]] : (index) -> i32
+// CHECK:             %[[VAL_40:.*]] = hlfir.designate %[[MASK_DECL]]#0 (%[[VAL_38]])  : (!fir.box<!fir.array<?xi1>>, index) -> !fir.ref<i1>
+// CHECK:             %[[VAL_41:.*]] = fir.load %[[VAL_40]] : !fir.ref<i1>
+// CHECK:             fir.if %[[VAL_41]] {
+// CHECK:               %[[VAL_42:.*]] = hlfir.designate %[[RHS_DECL]]#0 (%[[VAL_38]])  : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
+// CHECK:               %[[VAL_43:.*]] = fir.load %[[VAL_42]] : !fir.ref<i32>
+// CHECK:               %[[VAL_44:.*]] = arith.addi %[[VAL_39]], %[[VAL_3]] : i32
+// CHECK:               %[[VAL_45:.*]] = fir.convert %[[VAL_44]] : (i32) -> i64
+// CHECK:               %[[VAL_46:.*]] = hlfir.designate %[[VAL_4]]#0 (%[[VAL_45]])  : (!fir.box<!fir.array<?xi32>>, i64) -> !fir.ref<i32>
+// CHECK:               hlfir.assign %[[VAL_43]] to %[[VAL_46]] : i32, !fir.ref<i32>
 // CHECK:             }
 // CHECK:           }
-// CHECK-DAG:       fir.freemem %[[VAL_18]] : !fir.heap<!fir.array<?xi1>>
-// CHECK-DAG:       fir.freemem %[[VAL_29]] : !fir.heap<!fir.array<?xi32>>
+// CHECK-DAG:       fir.freemem %[[MASK_TEMP]] : !fir.heap<!fir.array<?xi1>>
+// CHECK-DAG:       fir.freemem %[[RHS_TEMP]] : !fir.heap<!fir.array<?xi32>>
 // CHECK:           return
 // CHECK:         }
 
@@ -224,6 +191,8 @@ func.func @test_where_mask_save(%arg0: !fir.box<!fir.array<?xi32>>) {
   }
   return
 }
+// This case saves the whole mask outside of the where loop nest with
+// hlfir.associate (SimpleCopy); it does not exercise the new ArrayTemp path.
 // CHECK-LABEL:   func.func @test_where_mask_save(
 // CHECK-SAME:                                    %[[VAL_0:.*]]: !fir.box<!fir.array<?xi32>>) {
 // CHECK:           %[[VAL_1:.*]] = arith.constant 0 : index
@@ -278,59 +247,48 @@ func.func @test_where_rhs_save(%x: !fir.ref<!fir.array<10xi32>>, %mask: !fir.ref
 // CHECK-LABEL:   func.func @test_where_rhs_save(
 // CHECK-SAME:                                   %[[VAL_0:.*]]: !fir.ref<!fir.array<10xi32>>,
 // CHECK-SAME:                                   %[[VAL_1:.*]]: !fir.ref<!fir.array<10x!fir.logical<4>>>) {
-// CHECK:           %[[VAL_2:.*]] = fir.alloca index
-// CHECK:           %[[VAL_3:.*]] = arith.constant -1 : index
-// CHECK:           %[[VAL_4:.*]] = arith.constant 1 : index
-// CHECK:           %[[VAL_5:.*]] = arith.constant 10 : index
-// CHECK:           %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1>
-// CHECK:           %[[VAL_7:.*]] = arith.constant 10 : index
-// CHECK:           %[[VAL_8:.*]] = fir.shape %[[VAL_7]] : (index) -> !fir.shape<1>
-// CHECK:           %[[VAL_9:.*]] = arith.constant 1 : index
-// CHECK:           %[[VAL_11:.*]] = arith.constant 0 : index
-// CHECK:           %[[VAL_12:.*]] = arith.subi %[[VAL_7]], %[[VAL_9]] : index
-// CHECK:           %[[VAL_13:.*]] = arith.addi %[[VAL_12]], %[[VAL_9]] : index
-// CHECK:           %[[VAL_14:.*]] = arith.divsi %[[VAL_13]], %[[VAL_9]] : index
-// CHECK:           %[[VAL_15:.*]] = arith.cmpi sgt, %[[VAL_14]], %[[VAL_11]] : index
-// CHECK:           %[[VAL_16:.*]] = arith.select %[[VAL_15]], %[[VAL_14]], %[[VAL_11]] : index
-// CHECK:           %[[VAL_17:.*]] = arith.constant 1 : index
-// CHECK:           %[[VAL_18:.*]] = arith.constant 1 : index
-// CHECK:           fir.store %[[VAL_17]] to %[[VAL_2]] : !fir.ref<index>
-// CHECK:           %[[VAL_19:.*]] = fir.allocmem !fir.array<?xi32>, %[[VAL_16]] {bindc_name = ".tmp.where", uniq_name = ""}
-// CHECK:           %[[VAL_20:.*]] = fir.shape %[[VAL_16]] : (index) -> !fir.shape<1>
-// CHECK:           %[[VAL_21:.*]]:2 = hlfir.declare %[[VAL_19]](%[[VAL_20]]) {uniq_name = ".tmp.where"} : (!fir.heap<!fir.array<?xi32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xi32>>, !fir.heap<!fir.array<?xi32>>)
-// CHECK:           fir.do_loop %[[VAL_22:.*]] = %[[VAL_9]] to %[[VAL_7]] step %[[VAL_9]] {
-// CHECK:             %[[VAL_23:.*]] = hlfir.designate %[[VAL_1]] (%[[VAL_22]])  : (!fir.ref<!fir.array<10x!fir.logical<4>>>, index) -> !fir.ref<!fir.logical<4>>
-// CHECK:             %[[VAL_24:.*]] = fir.load %[[VAL_23]] : !fir.ref<!fir.logical<4>>
-// CHECK:             %[[VAL_25:.*]] = fir.convert %[[VAL_24]] : (!fir.logical<4>) -> i1
-// CHECK:             fir.if %[[VAL_25]] {
-// CHECK:           %[[VAL_10:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_5]]:%[[VAL_4]]:%[[VAL_3]])  shape %[[VAL_6]] : (!fir.ref<!fir.array<10xi32>>, index, index, index, !fir.shape<1>) -> !fir.ref<!fir.array<10xi32>>
-// CHECK:               %[[VAL_26:.*]] = hlfir.designate %[[VAL_10]] (%[[VAL_22]])  : (!fir.ref<!fir.array<10xi32>>, index) -> !fir.ref<i32>
-// CHECK:               %[[VAL_27:.*]] = fir.load %[[VAL_26]] : !fir.ref<i32>
-// CHECK:               %[[VAL_28:.*]] = fir.load %[[VAL_2]] : !fir.ref<index>
-// CHECK:               %[[VAL_29:.*]] = arith.addi %[[VAL_28]], %[[VAL_18]] : index
-// CHECK:               fir.store %[[VAL_29]] to %[[VAL_2]] : !fir.ref<index>
-// CHECK:               %[[VAL_30:.*]] = hlfir.designate %[[VAL_21]]#0 (%[[VAL_28]])  : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
-// CHECK:               hlfir.assign %[[VAL_27]] to %[[VAL_30]] : i32, !fir.ref<i32>
+// CHECK:           %[[VAL_2:.*]] = arith.constant -1 : index
+// CHECK:           %[[VAL_3:.*]] = arith.constant 1 : index
+// CHECK:           %[[VAL_4:.*]] = arith.constant 10 : index
+// CHECK:           %[[VAL_5:.*]] = fir.shape %[[VAL_4]] : (index) -> !fir.shape<1>
+// CHECK:           %[[VAL_6:.*]] = arith.constant 10 : index
+// CHECK:           %[[VAL_7:.*]] = fir.shape %[[VAL_6]] : (index) -> !fir.shape<1>
+// CHECK:           %[[VAL_8:.*]] = arith.constant 1 : index
+// CHECK:           %[[VAL_9:.*]] = arith.constant 0 : index
+// CHECK:           %[[VAL_10:.*]] = arith.subi %[[VAL_6]], %[[VAL_8]] : index
+// CHECK:           %[[VAL_11:.*]] = arith.addi %[[VAL_10]], %[[VAL_8]] : index
+// CHECK:           %[[VAL_12:.*]] = arith.divsi %[[VAL_11]], %[[VAL_8]] : index
+// CHECK:           %[[VAL_13:.*]] = arith.cmpi sgt, %[[VAL_12]], %[[VAL_9]] : index
+// CHECK:           %[[VAL_14:.*]] = arith.select %[[VAL_13]], %[[VAL_12]], %[[VAL_9]] : index
+// CHECK:           %[[VAL_15:.*]] = fir.allocmem !fir.array<?xi32>, %[[VAL_14]] {bindc_name = ".tmp.where", uniq_name = ""}
+// CHECK:           %[[VAL_16:.*]] = fir.shape_shift %[[VAL_8]], %[[VAL_14]] : (index, index) -> !fir.shapeshift<1>
+// CHECK:           %[[VAL_17:.*]]:2 = hlfir.declare %[[VAL_15]](%[[VAL_16]]) {uniq_name = ".tmp.where"} : (!fir.heap<!fir.array<?xi32>>, !fir.shapeshift<1>) -> (!fir.box<!fir.array<?xi32>>, !fir.heap<!fir.array<?xi32>>)
+// CHECK:           fir.do_loop %[[VAL_18:.*]] = %[[VAL_8]] to %[[VAL_6]] step %[[VAL_8]] {
+// CHECK:             %[[VAL_19:.*]] = hlfir.designate %[[VAL_1]] (%[[VAL_18]])  : (!fir.ref<!fir.array<10x!fir.logical<4>>>, index) -> !fir.ref<!fir.logical<4>>
+// CHECK:             %[[VAL_20:.*]] = fir.load %[[VAL_19]] : !fir.ref<!fir.logical<4>>
+// CHECK:             %[[VAL_21:.*]] = fir.convert %[[VAL_20]] : (!fir.logical<4>) -> i1
+// CHECK:             fir.if %[[VAL_21]] {
+// CHECK:               %[[SLICE:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_4]]:%[[VAL_3]]:%[[VAL_2]])  shape %[[VAL_5]] : (!fir.ref<!fir.array<10xi32>>, index, index, index, !fir.shape<1>) -> !fir.ref<!fir.array<10xi32>>
+// CHECK:               %[[VAL_22:.*]] = hlfir.designate %[[SLICE]] (%[[VAL_18]])  : (!fir.ref<!fir.array<10xi32>>, index) -> !fir.ref<i32>
+// CHECK:               %[[VAL_23:.*]] = fir.load %[[VAL_22]] : !fir.ref<i32>
+// CHECK:               %[[VAL_24:.*]] = hlfir.designate %[[VAL_17]]#0 (%[[VAL_18]])  : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
+// CHECK:               hlfir.assign %[[VAL_23]] to %[[VAL_24]] : i32, !fir.ref<i32>
 // CHECK:             }
 // CHECK:           }
-// CHECK:           %[[VAL_31:.*]] = arith.constant 10 : index
-// CHECK:           %[[VAL_32:.*]] = fir.shape %[[VAL_31]] : (index) -> !fir.shape<1>
-// CHECK:           %[[VAL_33:.*]] = arith.constant 1 : index
-// CHECK:           fir.store %[[VAL_17]] to %[[VAL_2]] : !fir.ref<index>
-// CHECK:           fir.do_loop %[[VAL_34:.*]] = %[[VAL_33]] to %[[VAL_31]] step %[[VAL_33]] {
-// CHECK:             %[[VAL_35:.*]] = hlfir.designate %[[VAL_1]] (%[[VAL_34]])  : (!fir.ref<!fir.array<10x!fir.logical<4>>>, index) -> !fir.ref<!fir.logical<4>>
-// CHECK:             %[[VAL_36:.*]] = fir.load %[[VAL_35]] : !fir.ref<!fir.logical<4>>
-// CHECK:             %[[VAL_37:.*]] = fir.convert %[[VAL_36]] : (!fir.logical<4>) -> i1
-// CHECK:             fir.if %[[VAL_37]] {
-// CHECK:               %[[VAL_38:.*]] = fir.load %[[VAL_2]] : !fir.ref<index>
-// CHECK:               %[[VAL_39:.*]] = arith.addi %[[VAL_38]], %[[VAL_18]] : index
-// CHECK:               fir.store %[[VAL_39]] to %[[VAL_2]] : !fir.ref<index>
-// CHECK:               %[[VAL_40:.*]] = hlfir.designate %[[VAL_21]]#0 (%[[VAL_38]])  : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
-// CHECK:               %[[VAL_41:.*]] = fir.load %[[VAL_40]] : !fir.ref<i32>
-// CHECK:               %[[VAL_42:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_34]])  : (!fir.ref<!fir.array<10xi32>>, index) -> !fir.ref<i32>
-// CHECK:               hlfir.assign %[[VAL_41]] to %[[VAL_42]] : i32, !fir.ref<i32>
+// CHECK:           %[[VAL_25:.*]] = arith.constant 10 : index
+// CHECK:           %[[VAL_26:.*]] = fir.shape %[[VAL_25]] : (index) -> !fir.shape<1>
+// CHECK:           %[[VAL_27:.*]] = arith.constant 1 : index
+// CHECK:           fir.do_loop %[[VAL_28:.*]] = %[[VAL_27]] to %[[VAL_25]] step %[[VAL_27]] {
+// CHECK:             %[[VAL_29:.*]] = hlfir.designate %[[VAL_1]] (%[[VAL_28]])  : (!fir.ref<!fir.array<10x!fir.logical<4>>>, index) -> !fir.ref<!fir.logical<4>>
+// CHECK:             %[[VAL_30:.*]] = fir.load %[[VAL_29]] : !fir.ref<!fir.logical<4>>
+// CHECK:             %[[VAL_31:.*]] = fir.convert %[[VAL_30]] : (!fir.logical<4>) -> i1
+// CHECK:             fir.if %[[VAL_31]] {
+// CHECK:               %[[VAL_32:.*]] = hlfir.designate %[[VAL_17]]#0 (%[[VAL_28]])  : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
+// CHECK:               %[[VAL_33:.*]] = fir.load %[[VAL_32]] : !fir.ref<i32>
+// CHECK:               %[[VAL_34:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_28]])  : (!fir.ref<!fir.array<10xi32>>, index) -> !fir.ref<i32>
+// CHECK:               hlfir.assign %[[VAL_33]] to %[[VAL_34]] : i32, !fir.ref<i32>
 // CHECK:             }
 // CHECK:           }
-// CHECK:           fir.freemem %[[VAL_19]] : !fir.heap<!fir.array<?xi32>>
+// CHECK:           fir.freemem %[[VAL_15]] : !fir.heap<!fir.array<?xi32>>
 // CHECK:           return
 // CHECK:         }
diff --git a/flang/test/HLFIR/order_assignments/saving-mask-and-rhs.fir b/flang/test/HLFIR/order_assignments/saving-mask-and-rhs.fir
index 1eb86d09a39c1..3fe6ab7e087ad 100644
--- a/flang/test/HLFIR/order_assignments/saving-mask-and-rhs.fir
+++ b/flang/test/HLFIR/order_assignments/saving-mask-and-rhs.fir
@@ -42,13 +42,14 @@ func.func @saving_mask_and_rhs(%arg0: !fir.ref<!fir.array<10xi32>>) {
 // Creating RHS temporary using the mask temporary (and not the hlfir.elemental)
 
 // CHECK:  %[[VAL_25:.*]] = fir.allocmem !fir.array<?xi32>, %{{.*}} {bindc_name = ".tmp.where", uniq_name = ""}
-// CHECK:  %[[VAL_27:.*]]:2 = hlfir.declare %[[VAL_25]]({{.*}}) {uniq_name = ".tmp.where"}
-// CHECK:  fir.do_loop
+// CHECK:  %[[RHS_SHAPE:.*]] = fir.shape_shift %{{.*}}, %{{.*}} : (index, index) -> !fir.shapeshift<1>
+// CHECK:  %[[VAL_27:.*]]:2 = hlfir.declare %[[VAL_25]](%[[RHS_SHAPE]]) {uniq_name = ".tmp.where"}
+// CHECK:  fir.do_loop %[[IV:.*]] = %{{.*}} to %{{.*}} step %{{.*}} {
 // CHECK:    %[[VAL_29:.*]] = hlfir.designate %[[VAL_14]]#0 ({{.*}})
 // CHECK:    %[[VAL_30:.*]] = fir.load %[[VAL_29]] : !fir.ref<!fir.logical<4>>
 // CHECK:    %[[VAL_31:.*]] = fir.convert %[[VAL_30]] : (!fir.logical<4>) -> i1
 // CHECK:    fir.if %[[VAL_31]] {
-// CHECK:      %[[VAL_36:.*]] = hlfir.designate %[[VAL_27]]#0 ({{.*}})
+// CHECK:      %[[VAL_36:.*]] = hlfir.designate %[[VAL_27]]#0 (%[[IV]])  : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
 // CHECK:      hlfir.assign %{{.*}} to %[[VAL_36]] : i32, !fir.ref<i32>
 // CHECK:    }
 // CHECK:  }
@@ -91,14 +92,17 @@ func.func @forall_mask_and_rhs(%arg0: !fir.ref<!fir.array<10xi32>>) {
 
 // CHECK-LABEL:   func.func @forall_mask_and_rhs(
 // CHECK:           %[[VAL_18:.*]] = fir.allocmem !fir.array<?xi1>, %{{.*}} {bindc_name = ".tmp.forall", uniq_name = ""}
-// CHECK:           %[[VAL_20:.*]]:2 = hlfir.declare %[[VAL_18]](%{{.*}}) {uniq_name = ".tmp.forall"}
+// CHECK:           %[[MASK_SHAPE:.*]] = fir.shape_shift %{{.*}}, %{{.*}} : (index, index) -> !fir.shapeshift<1>
+// CHECK:           %[[VAL_20:.*]]:2 = hlfir.declare %[[VAL_18]](%[[MASK_SHAPE]]) {uniq_name = ".tmp.forall"}
 // CHECK:           %[[VAL_29:.*]] = fir.allocmem !fir.array<?xi32>, %{{.*}} {bindc_name = ".tmp.forall", uniq_name = ""}
-// CHECK:           %[[VAL_31:.*]]:2 = hlfir.declare %[[VAL_29]](%{{.*}}) {uniq_name = ".tmp.forall"}
+// CHECK:           %[[RHS_SHAPE:.*]] = fir.shape_shift %{{.*}}, %{{.*}} : (index, index) -> !fir.shapeshift<1>
+// CHECK:           %[[VAL_31:.*]]:2 = hlfir.declare %[[VAL_29]](%[[RHS_SHAPE]]) {uniq_name = ".tmp.forall"}
+// CHECK:           fir.do_loop %[[IV:.*]] = %{{.*}} to %{{.*}} step %{{.*}} {
 // CHECK:             %[[VAL_36:.*]] = arith.cmpi sgt, %{{.*}}, %{{.*}} : i32
-// CHECK:             %[[VAL_39:.*]] = hlfir.designate %[[VAL_20]]#0 (%{{.*}})
+// CHECK:             %[[VAL_39:.*]] = hlfir.designate %[[VAL_20]]#0 (%[[IV]])  : (!fir.box<!fir.array<?xi1>>, index) -> !fir.ref<i1>
 // CHECK:             hlfir.assign %[[VAL_36]] to %[[VAL_39]] : i1, !fir.ref<i1>
 // CHECK:             fir.if %[[VAL_36]] {
-// CHECK:               %[[VAL_45:.*]] = hlfir.designate %[[VAL_31]]#0 (%{{.*}})
+// CHECK:               %[[VAL_45:.*]] = hlfir.designate %[[VAL_31]]#0 (%[[IV]])  : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
 // CHECK:               hlfir.assign %{{.*}} to %[[VAL_45]] : i32, !fir.ref<i32>
 // CHECK:             }
 // CHECK:           }
diff --git a/flang/test/HLFIR/order_assignments/user-defined-assignment.fir b/flang/test/HLFIR/order_assignments/user-defined-assignment.fir
index e64b3ef362cee..83411279d0731 100644
--- a/flang/test/HLFIR/order_assignments/user-defined-assignment.fir
+++ b/flang/test/HLFIR/order_assignments/user-defined-assignment.fir
@@ -138,7 +138,6 @@ func.func @test_scalar_forall_overlap(%i: !fir.ref<!fir.array<10xi32>>) {
 }
 // CHECK-LABEL:   func.func @test_scalar_forall_overlap(
 // CHECK-SAME:    %[[VAL_0:.*]]: !fir.ref<!fir.array<10xi32>>) {
-// CHECK:  %[[VAL_1:.*]] = fir.alloca index
 // CHECK:  %[[VAL_2:.*]] = arith.constant 0 : i32
 // CHECK:  %[[VAL_3:.*]] = arith.constant 1 : index
 // CHECK:  %[[VAL_4:.*]] = arith.constant 10 : index
@@ -150,30 +149,20 @@ func.func @test_scalar_forall_overlap(%i: !fir.ref<!fir.array<10xi32>>) {
 // CHECK:  %[[VAL_10:.*]] = arith.divsi %[[VAL_9]], %[[VAL_6]] : index
 // CHECK:  %[[VAL_11:.*]] = arith.cmpi sgt, %[[VAL_10]], %[[VAL_7]] : index
 // CHECK:  %[[VAL_12:.*]] = arith.select %[[VAL_11]], %[[VAL_10]], %[[VAL_7]] : index
-// CHECK:  %[[VAL_13:.*]] = arith.constant 1 : index
-// CHECK:  %[[VAL_14:.*]] = arith.constant 1 : index
-// CHECK:  fir.store %[[VAL_13]] to %[[VAL_1]] : !fir.ref<index>
 // CHECK:  %[[VAL_15:.*]] = fir.allocmem !fir.array<?xi1>, %[[VAL_12]] {bindc_name = ".tmp.forall", uniq_name = ""}
-// CHECK:  %[[VAL_16:.*]] = fir.shape %[[VAL_12]] : (index) -> !fir.shape<1>
-// CHECK:  %[[VAL_17:.*]]:2 = hlfir.declare %[[VAL_15]](%[[VAL_16]]) {uniq_name = ".tmp.forall"} : (!fir.heap<!fir.array<?xi1>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xi1>>, !fir.heap<!fir.array<?xi1>>)
+// CHECK:  %[[VAL_16:.*]] = fir.shape_shift %[[VAL_3]], %[[VAL_12]] : (index, index) -> !fir.shapeshift<1>
+// CHECK:  %[[VAL_17:.*]]:2 = hlfir.declare %[[VAL_15]](%[[VAL_16]]) {uniq_name = ".tmp.forall"} : (!fir.heap<!fir.array<?xi1>>, !fir.shapeshift<1>) -> (!fir.box<!fir.array<?xi1>>, !fir.heap<!fir.array<?xi1>>)
 // CHECK:  fir.do_loop %[[VAL_18:.*]] = %[[VAL_3]] to %[[VAL_4]] step %[[VAL_6]] {
 // CHECK:    %[[VAL_19:.*]] = arith.subi %[[VAL_5]], %[[VAL_18]] : index
 // CHECK:    %[[VAL_20:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_19]])  : (!fir.ref<!fir.array<10xi32>>, index) -> !fir.ref<i32>
 // CHECK:    %[[VAL_21:.*]] = fir.load %[[VAL_20]] : !fir.ref<i32>
 // CHECK:    %[[VAL_22:.*]] = arith.cmpi slt, %[[VAL_21]], %[[VAL_2]] : i32
-// CHECK:    %[[VAL_23:.*]] = fir.load %[[VAL_1]] : !fir.ref<index>
-// CHECK:    %[[VAL_24:.*]] = arith.addi %[[VAL_23]], %[[VAL_14]] : index
-// CHECK:    fir.store %[[VAL_24]] to %[[VAL_1]] : !fir.ref<index>
-// CHECK:    %[[VAL_25:.*]] = hlfir.designate %[[VAL_17]]#0 (%[[VAL_23]])  : (!fir.box<!fir.array<?xi1>>, index) -> !fir.ref<i1>
+// CHECK:    %[[VAL_25:.*]] = hlfir.designate %[[VAL_17]]#0 (%[[VAL_18]])  : (!fir.box<!fir.array<?xi1>>, index) -> !fir.ref<i1>
 // CHECK:    hlfir.assign %[[VAL_22]] to %[[VAL_25]] : i1, !fir.ref<i1>
 // CHECK:  }
 // CHECK:  %[[VAL_26:.*]] = arith.constant 1 : index
-// CHECK:  fir.store %[[VAL_13]] to %[[VAL_1]] : !fir.ref<index>
 // CHECK:  fir.do_loop %[[VAL_27:.*]] = %[[VAL_3]] to %[[VAL_4]] step %[[VAL_26]] {
-// CHECK:    %[[VAL_28:.*]] = fir.load %[[VAL_1]] : !fir.ref<index>
-// CHECK:    %[[VAL_29:.*]] = arith.addi %[[VAL_28]], %[[VAL_14]] : index
-// CHECK:    fir.store %[[VAL_29]] to %[[VAL_1]] : !fir.ref<index>
-// CHECK:    %[[VAL_30:.*]] = hlfir.designate %[[VAL_17]]#0 (%[[VAL_28]])  : (!fir.box<!fir.array<?xi1>>, index) -> !fir.ref<i1>
+// CHECK:    %[[VAL_30:.*]] = hlfir.designate %[[VAL_17]]#0 (%[[VAL_27]])  : (!fir.box<!fir.array<?xi1>>, index) -> !fir.ref<i1>
 // CHECK:    %[[VAL_31:.*]] = fir.load %[[VAL_30]] : !fir.ref<i1>
 // CHECK:    %[[VAL_32:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_27]])  : (!fir.ref<!fir.array<10xi32>>, index) -> !fir.ref<i32>
 // CHECK:    %[[VAL_33:.*]] = fir.convert %[[VAL_31]] : (i1) -> !fir.logical<4>

>From 5ae60ab61a93cf2ae18046a98c349eccdbdad9ba Mon Sep 17 00:00:00 2001
From: Jean Perier <jperier at nvidia.com>
Date: Thu, 30 Apr 2026 07:13:34 -0700
Subject: [PATCH 2/2] do not use new temps with forall with non-unit strides

---
 .../LowerHLFIROrderedAssignments.cpp          | 85 ++++++++++-------
 .../HLFIR/order_assignments/array-temp.fir    | 95 +++++++++++++++++++
 2 files changed, 147 insertions(+), 33 deletions(-)

diff --git a/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIROrderedAssignments.cpp b/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIROrderedAssignments.cpp
index 08a43e476c0cd..5ad69c2b5cafe 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIROrderedAssignments.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIROrderedAssignments.cpp
@@ -260,7 +260,7 @@ class OrderedAssignmentRewriter {
 
   /// Return the induction variables of the enclosing fir.do_loop nest at the
   /// current insertion point, innermost first (same order as
-  /// currentLoopNestIterationNumberCanBeComputed). Used to index ArrayTemp.
+  /// currentLoopNestIterationNumberCanBeComputed).
   llvm::SmallVector<mlir::Value> getLoopIndices();
 
   template <typename T>
@@ -672,8 +672,7 @@ OrderedAssignmentRewriter::getIfSaved(mlir::Region &region) {
   if (auto savedInSameRun = savedInCurrentRunBeforeUse.find(&region);
       savedInSameRun != savedInCurrentRunBeforeUse.end())
     return savedInSameRun->second;
-  // If the region was saved in a previous run, fetch the saved value. The
-  // loop indices are only used by ArrayTemp; the other temps ignore them.
+  // If the region was saved in a previous run, fetch the saved value.
   if (auto temp = savedEntities.find(&region); temp != savedEntities.end()) {
     doBeforeLoopNest([&]() { temp->second.resetFetchPosition(loc, builder); });
     return ValueAndCleanUp{temp->second.fetch(loc, builder, getLoopIndices()),
@@ -1117,22 +1116,42 @@ computeLoopNestIterationNumber(mlir::Location loc, fir::FirOpBuilder &builder,
   return loopExtent;
 }
 
+/// If \p value is a compile-time integer constant (possibly hidden behind
+/// fir.convert ops), return its value. Otherwise return std::nullopt.
+static std::optional<int64_t> unwrapConstantInt(mlir::Value value) {
+  while (auto convert = value.getDefiningOp<fir::ConvertOp>())
+    value = convert.getValue();
+  return fir::getIntIfConstant(value);
+}
+
 /// Compute the extents and lower bounds of \p loopNest, in the same order as
-/// \p loopNest (innermost first).
-static void computeLoopNestExtentsAndLowerBounds(
+/// \p loopNest (innermost first). The lower bound of each dimension is the
+/// smallest induction variable value, so that the loop induction variable
+/// can directly index the temp via fir.shape_shift. This only works when
+/// every loop has a unit step: for step +1 the smallest iv is the loop's
+/// lower bound; for step -1 it is the loop's upper bound. Returns false
+/// (with \p extents and \p lowerBounds left in an unspecified state) when
+/// any loop has a non-unit or non-constant step, signalling that the caller
+/// should fall back to a counter-based temp.
+static bool computeLoopNestExtentsAndLowerBounds(
     mlir::Location loc, fir::FirOpBuilder &builder,
     llvm::ArrayRef<fir::DoLoopOp> loopNest,
     llvm::SmallVectorImpl<mlir::Value> &extents,
     llvm::SmallVectorImpl<mlir::Value> &lowerBounds) {
-  extents.reserve(extents.size() + loopNest.size());
-  lowerBounds.reserve(lowerBounds.size() + loopNest.size());
+  extents.reserve(loopNest.size());
+  lowerBounds.reserve(loopNest.size());
   for (fir::DoLoopOp doLoop : loopNest) {
+    auto step = unwrapConstantInt(doLoop.getStep());
+    if (!step || std::abs(*step) != 1)
+      return false;
     mlir::Value extent = builder.genExtentFromTriplet(
         loc, doLoop.getLowerBound(), doLoop.getUpperBound(), doLoop.getStep(),
         builder.getIndexType());
     extents.push_back(extent);
-    lowerBounds.push_back(doLoop.getLowerBound());
+    lowerBounds.push_back(*step == 1 ? doLoop.getLowerBound()
+                                     : doLoop.getUpperBound());
   }
+  return true;
 }
 
 llvm::SmallVector<mlir::Value> OrderedAssignmentRewriter::getLoopIndices() {
@@ -1207,32 +1226,32 @@ void OrderedAssignmentRewriter::generateSaveEntity(
       // multidimensional ArrayTemp (indexed by loop induction variables) so
       // there is no loop-carried counter. Fall back to the 1D counter-based
       // HomogeneousScalarStack when the nest is deeper than the maximum
-      // fir.array rank.
-      if (loopShapeCanBePreComputed && fir::isa_trivial(entityType)) {
-        if (loopNest.size() <= static_cast<size_t>(Fortran::common::maxRank)) {
-          llvm::SmallVector<mlir::Value> tempExtents;
-          llvm::SmallVector<mlir::Value> tempLowerBounds;
+      // fir.array rank or when any loop has a non-unit/non-constant step
+      // (in which case the loop induction variable cannot index the temp
+      // directly).
+      llvm::SmallVector<mlir::Value> tempExtents;
+      llvm::SmallVector<mlir::Value> tempLowerBounds;
+      if (loopShapeCanBePreComputed && fir::isa_trivial(entityType) &&
+          loopNest.size() <= static_cast<size_t>(Fortran::common::maxRank) &&
           computeLoopNestExtentsAndLowerBounds(loc, builder, loopNest,
-                                               tempExtents, tempLowerBounds);
-          auto sequenceType = mlir::cast<fir::SequenceType>(
-              builder.getVarLenSeqTy(entityType, /*rank=*/loopNest.size()));
-          temp = insertSavedEntity(
-              region,
-              fir::factory::ArrayTemp{loc, builder, sequenceType, tempExtents,
-                                      tempLowerBounds, /*lengths=*/{},
-                                      allocateOnHeap, tempName});
-        } else {
-          mlir::Value loopExtent =
-              computeLoopNestIterationNumber(loc, builder, loopNest);
-          auto sequenceType =
-              mlir::cast<fir::SequenceType>(builder.getVarLenSeqTy(entityType));
-          temp = insertSavedEntity(region,
-                                   fir::factory::HomogeneousScalarStack{
-                                       loc, builder, sequenceType, loopExtent,
-                                       /*lenParams=*/{}, allocateOnHeap,
-                                       /*stackThroughLoops=*/true, tempName});
-        }
-
+                                               tempExtents, tempLowerBounds)) {
+        auto sequenceType = mlir::cast<fir::SequenceType>(
+            builder.getVarLenSeqTy(entityType, /*rank=*/loopNest.size()));
+        temp = insertSavedEntity(
+            region,
+            fir::factory::ArrayTemp{loc, builder, sequenceType, tempExtents,
+                                    tempLowerBounds,
+                                    /*lengths=*/{}, allocateOnHeap, tempName});
+      } else if (loopShapeCanBePreComputed && fir::isa_trivial(entityType)) {
+        mlir::Value loopExtent =
+            computeLoopNestIterationNumber(loc, builder, loopNest);
+        auto sequenceType =
+            mlir::cast<fir::SequenceType>(builder.getVarLenSeqTy(entityType));
+        temp = insertSavedEntity(region,
+                                 fir::factory::HomogeneousScalarStack{
+                                     loc, builder, sequenceType, loopExtent,
+                                     /*lenParams=*/{}, allocateOnHeap,
+                                     /*stackThroughLoops=*/true, tempName});
       } else {
         // If the number of iteration is not known, or if the values at each
         // iterations are values that may have different shape, type parameters
diff --git a/flang/test/HLFIR/order_assignments/array-temp.fir b/flang/test/HLFIR/order_assignments/array-temp.fir
index 5510d0220397a..75d7b7a781127 100644
--- a/flang/test/HLFIR/order_assignments/array-temp.fir
+++ b/flang/test/HLFIR/order_assignments/array-temp.fir
@@ -110,3 +110,98 @@ func.func @nested_forall_2d(%arr: !fir.box<!fir.array<?x?xi32>>) {
 // CHECK:           }
 // CHECK:           fir.freemem %[[ALLOC]] : !fir.heap<!fir.array<?x?xi32>>
 // CHECK:           return
+
+// Forall with a negative unit step. The temp's lower bound must be the
+// loop's upper bound (the smaller end of the iv range), not the loop's
+// lower bound (which is the start, i.e. the largest iv value); otherwise
+// the loop induction variable falls below the temp's domain on every
+// iteration but the first.
+func.func @forall_negative_step(%arg0: !fir.ref<!fir.array<10xi32>>) {
+  %c-1 = arith.constant -1 : index
+  %c1 = arith.constant 1 : index
+  %c10 = arith.constant 10 : index
+  %c11 = arith.constant 11 : index
+  %0 = fir.shape %c10 : (index) -> !fir.shape<1>
+  %1:2 = hlfir.declare %arg0(%0) {uniq_name = "x"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+  hlfir.forall lb {
+    hlfir.yield %c10 : index
+  } ub {
+    hlfir.yield %c1 : index
+  } step {
+    hlfir.yield %c-1 : index
+  }  (%i: index) {
+    hlfir.region_assign {
+      %rev = arith.subi %c11, %i : index
+      %addr = hlfir.designate %1#0 (%rev)  : (!fir.ref<!fir.array<10xi32>>, index) -> !fir.ref<i32>
+      %val = fir.load %addr : !fir.ref<i32>
+      hlfir.yield %val : i32
+    } to {
+      %addr = hlfir.designate %1#0 (%i)  : (!fir.ref<!fir.array<10xi32>>, index) -> !fir.ref<i32>
+      hlfir.yield %addr : !fir.ref<i32>
+    }
+  }
+  return
+}
+// CHECK-LABEL:   func.func @forall_negative_step(
+// CHECK-NOT:       fir.alloca index
+// The do_loop runs from the loop's lb (c10) down to its ub (c1) with step
+// c-1. The temp's lower bound is the loop's upper bound (c1), not its
+// lower bound, so all iv values [1..10] fit within the temp's domain.
+// CHECK:           %[[C_NEG1:.*]] = arith.constant -1 : index
+// CHECK:           %[[C1:.*]] = arith.constant 1 : index
+// CHECK:           %[[C10:.*]] = arith.constant 10 : index
+// CHECK:           %[[ALLOC:.*]] = fir.allocmem !fir.array<?xi32>, %{{.*}} {bindc_name = ".tmp.forall", uniq_name = ""}
+// CHECK:           %[[SHAPE:.*]] = fir.shape_shift %[[C1]], %{{.*}} : (index, index) -> !fir.shapeshift<1>
+// CHECK:           %[[DECL:.*]]:2 = hlfir.declare %[[ALLOC]](%[[SHAPE]]) {uniq_name = ".tmp.forall"} : (!fir.heap<!fir.array<?xi32>>, !fir.shapeshift<1>) -> (!fir.box<!fir.array<?xi32>>, !fir.heap<!fir.array<?xi32>>)
+// CHECK:           fir.do_loop %[[IV0:.*]] = %[[C10]] to %[[C1]] step %[[C_NEG1]] {
+// CHECK:             %[[ADDR0:.*]] = hlfir.designate %[[DECL]]#0 (%[[IV0]])  : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
+// CHECK:             hlfir.assign %{{.*}} to %[[ADDR0]] : i32, !fir.ref<i32>
+// CHECK:           }
+// CHECK:           fir.do_loop %[[IV1:.*]] = %{{.*}} to %{{.*}} step %{{.*}} {
+// CHECK:             %[[ADDR1:.*]] = hlfir.designate %[[DECL]]#0 (%[[IV1]])  : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
+// CHECK:             %[[V:.*]] = fir.load %[[ADDR1]] : !fir.ref<i32>
+// CHECK:             hlfir.assign %[[V]] to %{{.*}} : i32, !fir.ref<i32>
+// CHECK:           }
+// CHECK:           fir.freemem %[[ALLOC]] : !fir.heap<!fir.array<?xi32>>
+// CHECK:           return
+
+// Forall with a non-unit step (step=2): the loop iv values are
+// non-contiguous (1, 3, 5, 7, 9), so the loop induction variable cannot
+// directly index a contiguous fir.shape_shift temp. We must fall back to
+// the 1D counter-based HomogeneousScalarStack.
+func.func @forall_non_unit_step(%arg0: !fir.ref<!fir.array<10xi32>>) {
+  %c1 = arith.constant 1 : index
+  %c2 = arith.constant 2 : index
+  %c9 = arith.constant 9 : index
+  %c11 = arith.constant 11 : index
+  %0 = fir.shape %c9 : (index) -> !fir.shape<1>
+  %1:2 = hlfir.declare %arg0(%0) {uniq_name = "x"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+  hlfir.forall lb {
+    hlfir.yield %c1 : index
+  } ub {
+    hlfir.yield %c9 : index
+  } step {
+    hlfir.yield %c2 : index
+  }  (%i: index) {
+    hlfir.region_assign {
+      %rev = arith.subi %c11, %i : index
+      %addr = hlfir.designate %1#0 (%rev)  : (!fir.ref<!fir.array<10xi32>>, index) -> !fir.ref<i32>
+      %val = fir.load %addr : !fir.ref<i32>
+      hlfir.yield %val : i32
+    } to {
+      %addr = hlfir.designate %1#0 (%i)  : (!fir.ref<!fir.array<10xi32>>, index) -> !fir.ref<i32>
+      hlfir.yield %addr : !fir.ref<i32>
+    }
+  }
+  return
+}
+// CHECK-LABEL:   func.func @forall_non_unit_step(
+// Counter-based HomogeneousScalarStack: a fir.alloca index counter and a
+// plain fir.shape (no shift), with the temp indexed through the counter.
+// CHECK:           %[[CTR:.*]] = fir.alloca index
+// CHECK:           %[[ALLOC:.*]] = fir.allocmem !fir.array<?xi32>, %{{.*}} {bindc_name = ".tmp.forall", uniq_name = ""}
+// CHECK:           %[[SHAPE:.*]] = fir.shape %{{.*}} : (index) -> !fir.shape<1>
+// CHECK:           hlfir.declare %[[ALLOC]](%[[SHAPE]]) {uniq_name = ".tmp.forall"} : (!fir.heap<!fir.array<?xi32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xi32>>, !fir.heap<!fir.array<?xi32>>)
+// CHECK:           fir.load %[[CTR]] : !fir.ref<index>
+// CHECK:           arith.addi %{{.*}}, %{{.*}} : index
+// CHECK:           fir.store %{{.*}} to %[[CTR]] : !fir.ref<index>



More information about the flang-commits mailing list