[flang-commits] [flang] d757811 - [flang][hlfir] Generate temporary storage in Forall/Where [1/2]

Jean Perier via flang-commits flang-commits at lists.llvm.org
Thu May 25 02:55:20 PDT 2023


Author: Jean Perier
Date: 2023-05-25T11:51:32+02:00
New Revision: d7578116b89fcffe8db4b2512ceda8c6fbf1ea7f

URL: https://github.com/llvm/llvm-project/commit/d7578116b89fcffe8db4b2512ceda8c6fbf1ea7f
DIFF: https://github.com/llvm/llvm-project/commit/d7578116b89fcffe8db4b2512ceda8c6fbf1ea7f.diff

LOG: [flang][hlfir] Generate temporary storage in Forall/Where [1/2]

Generate temporary storage inline inside WHERE and FORALL when possible.
A following patch will use the runtime to cover the generic cases.
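
A minimal Fortran sketch (not part of this patch) of the kind of construct
that needs such a temporary: every right-hand side x(i) must be saved before
any assignment to x(i+1) is performed, because later iterations read elements
that earlier iterations write.

  subroutine forall_needs_temp(x)
    integer :: x(11)
    integer :: i
    ! FORALL semantics require the right-hand sides to behave as if they were
    ! all evaluated before any assignment; the lowering saves them in an
    ! inline temporary when it detects the dependency.
    forall (i = 1:10)
      x(i+1) = x(i)
    end forall
  end subroutine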

Reviewed By: vzakhari

Differential Revision: https://reviews.llvm.org/D151247

Added: 
    flang/test/HLFIR/order_assignments/impure-where.fir
    flang/test/HLFIR/order_assignments/inlined-stack-temp.fir

Modified: 
    flang/include/flang/Optimizer/Builder/TemporaryStorage.h
    flang/lib/Optimizer/Builder/TemporaryStorage.cpp
    flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIROrderedAssignments.cpp

Removed: 
    flang/test/HLFIR/ordered-assignments-codegen-todo.fir


################################################################################
diff  --git a/flang/include/flang/Optimizer/Builder/TemporaryStorage.h b/flang/include/flang/Optimizer/Builder/TemporaryStorage.h
index 4a96b11d44804..88bf4af382724 100644
--- a/flang/include/flang/Optimizer/Builder/TemporaryStorage.h
+++ b/flang/include/flang/Optimizer/Builder/TemporaryStorage.h
@@ -93,5 +93,52 @@ class HomogeneousScalarStack {
   /// Temporary storage.
   mlir::Value temp;
 };
+
+/// Structure to hold the value of a single entity.
+class SimpleCopy {
+public:
+  SimpleCopy(mlir::Location loc, fir::FirOpBuilder &builder,
+             hlfir::Entity source, llvm::StringRef tempName);
+
+  void pushValue(mlir::Location loc, fir::FirOpBuilder &builder,
+                 mlir::Value value) {
+    assert(false && "must not be called: value already set");
+  }
+  void resetFetchPosition(mlir::Location loc, fir::FirOpBuilder &builder){};
+  mlir::Value fetch(mlir::Location loc, fir::FirOpBuilder &builder) {
+    return copy.getBase();
+  }
+  void destroy(mlir::Location loc, fir::FirOpBuilder &builder);
+
+public:
+  /// Temporary storage for the copy.
+  hlfir::AssociateOp copy;
+};
+
+/// Generic wrapper over the different sorts of temporary storages.
+class TemporaryStorage {
+public:
+  template <typename T>
+  TemporaryStorage(T &&impl) : impl{std::forward<T>(impl)} {}
+
+  void pushValue(mlir::Location loc, fir::FirOpBuilder &builder,
+                 mlir::Value value) {
+    std::visit([&](auto &temp) { temp.pushValue(loc, builder, value); }, impl);
+  }
+  void resetFetchPosition(mlir::Location loc, fir::FirOpBuilder &builder) {
+    std::visit([&](auto &temp) { temp.resetFetchPosition(loc, builder); },
+               impl);
+  }
+  mlir::Value fetch(mlir::Location loc, fir::FirOpBuilder &builder) {
+    return std::visit([&](auto &temp) { return temp.fetch(loc, builder); },
+                      impl);
+  }
+  void destroy(mlir::Location loc, fir::FirOpBuilder &builder) {
+    std::visit([&](auto &temp) { temp.destroy(loc, builder); }, impl);
+  }
+
+private:
+  std::variant<HomogeneousScalarStack, SimpleCopy> impl;
+};
 } // namespace fir::factory
 #endif // FORTRAN_OPTIMIZER_BUILDER_TEMPORARYSTORAGE_H

diff  --git a/flang/lib/Optimizer/Builder/TemporaryStorage.cpp b/flang/lib/Optimizer/Builder/TemporaryStorage.cpp
index d707d623bc9c8..b4e01556af086 100644
--- a/flang/lib/Optimizer/Builder/TemporaryStorage.cpp
+++ b/flang/lib/Optimizer/Builder/TemporaryStorage.cpp
@@ -10,8 +10,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "flang/Optimizer/Builder/TemporaryStorage.h"
+#include "flang/Optimizer/Builder/FIRBuilder.h"
 #include "flang/Optimizer/Builder/HLFIRTools.h"
-#include "flang/Optimizer/Builder/Runtime/RTBuilder.h"
 #include "flang/Optimizer/Builder/Todo.h"
 #include "flang/Optimizer/HLFIR/HLFIROps.h"
 
@@ -133,3 +133,24 @@ hlfir::Entity fir::factory::HomogeneousScalarStack::moveStackAsArrayExpr(
   auto hlfirExpr = builder.create<hlfir::AsExprOp>(loc, temp, mustFree);
   return hlfir::Entity{hlfirExpr};
 }
+
+//===----------------------------------------------------------------------===//
+// fir::factory::SimpleCopy implementation.
+//===----------------------------------------------------------------------===//
+
+fir::factory::SimpleCopy::SimpleCopy(mlir::Location loc,
+                                     fir::FirOpBuilder &builder,
+                                     hlfir::Entity source,
+                                     llvm::StringRef tempName) {
+  // Use hlfir.as_expr and hlfir.associate to create a copy and let
+  // bufferization decide how best to make the copy.
+  if (source.isVariable())
+    source = hlfir::Entity{builder.create<hlfir::AsExprOp>(loc, source)};
+  copy = hlfir::genAssociateExpr(loc, builder, source,
+                                 source.getFortranElementType(), tempName);
+}
+
+void fir::factory::SimpleCopy::destroy(mlir::Location loc,
+                                       fir::FirOpBuilder &builder) {
+  builder.create<hlfir::EndAssociateOp>(loc, copy);
+}

diff  --git a/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIROrderedAssignments.cpp b/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIROrderedAssignments.cpp
index 0317f83063f5e..1ec3aca640cb4 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIROrderedAssignments.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIROrderedAssignments.cpp
@@ -20,9 +20,11 @@
 #include "ScheduleOrderedAssignments.h"
 #include "flang/Optimizer/Builder/FIRBuilder.h"
 #include "flang/Optimizer/Builder/HLFIRTools.h"
+#include "flang/Optimizer/Builder/TemporaryStorage.h"
 #include "flang/Optimizer/Builder/Todo.h"
 #include "flang/Optimizer/Dialect/Support/FIRContext.h"
 #include "flang/Optimizer/HLFIR/Passes.h"
+#include "mlir/IR/Dominance.h"
 #include "mlir/IR/IRMapping.h"
 #include "mlir/Transforms/DialectConversion.h"
 #include "llvm/ADT/SmallSet.h"
@@ -106,8 +108,20 @@ class OrderedAssignmentRewriter {
     currentRun = nullptr;
     assert(constructStack.empty() && "must exit constructs after a run");
     mapper.clear();
+    savedInCurrentRunBeforeUse.clear();
   }
 
+  /// After all runs have been lowered, clean up all the temporary
+  /// storage that was created (do not call final routines).
+  void cleanupSavedEntities() {
+    for (auto &temp : savedEntities)
+      temp.second.destroy(root.getLoc(), builder);
+  }
+
+  /// Lowered value for an expression, and the original hlfir.yield if any
+  /// clean-up needs to be cloned after usage.
+  using ValueAndCleanUp = std::pair<mlir::Value, std::optional<hlfir::YieldOp>>;
+
 private:
   /// Walk the part of an order assignment tree node that needs
   /// to be evaluated in the current run.
@@ -126,11 +140,16 @@ class OrderedAssignmentRewriter {
   void post(hlfir::ForallMaskOp);
   void post(hlfir::WhereOp);
   void post(hlfir::ElseWhereOp);
+  /// Enter (and maybe create) the fir.if else block of an ElseWhereOp,
+  /// but do not generate the elsewhere mask or the new fir.if.
+  void enterElsewhere(hlfir::ElseWhereOp);
 
   /// Is this an assignment to a vector subscripted entity?
   static bool hasVectorSubscriptedLhs(hlfir::RegionAssignOp regionAssignOp);
   /// Are there any leaf regions in the node that must be saved in the current run?
-  bool mustSavedRegionIn(hlfir::OrderedAssignmentTreeOpInterface node) const;
+  bool mustSaveRegionIn(
+      hlfir::OrderedAssignmentTreeOpInterface node,
+      llvm::SmallVectorImpl<hlfir::SaveEntity> &saveEntities) const;
   /// Should this node be evaluated in the current run? Saving a region in a
   /// node does not imply the node needs to be evaluated.
   bool
@@ -154,7 +173,7 @@ class OrderedAssignmentRewriter {
   /// should be done after using the entity. Like, generateYieldedScalarValue,
   /// this will return the saved value if the region was saved in a previous
   /// run.
-  std::pair<mlir::Value, std::optional<hlfir::YieldOp>>
+  ValueAndCleanUp
   generateYieldedEntity(mlir::Region &region,
                         std::optional<mlir::Type> castToType = std::nullopt);
 
@@ -173,8 +192,43 @@ class OrderedAssignmentRewriter {
   mlir::Value generateMaskedEntity(MaskedArrayExpr &maskedExpr);
 
   /// Create a fir.if at the current position inside the where loop nest
-  /// given a mask expression.
-  void generateMaskIfOp(MaskedArrayExpr &mask);
+  /// given the element value of a mask.
+  void generateMaskIfOp(mlir::Value cdt);
+
+  /// Save a value for subsequent runs.
+  void generateSaveEntity(hlfir::SaveEntity savedEntity,
+                          bool willUseSavedEntityInSameRun);
+
+  /// Get a value if it was saved in this run or a previous run. Returns
+  /// nullopt if it has not been saved.
+  std::optional<ValueAndCleanUp> getIfSaved(mlir::Region &region);
+
+  /// Generate code before the loop nest for the current run, if any.
+  void doBeforeLoopNest(const std::function<void()> &callback) {
+    if (constructStack.empty()) {
+      callback();
+      return;
+    }
+    auto insertionPoint = builder.saveInsertionPoint();
+    builder.setInsertionPoint(constructStack[0]);
+    callback();
+    builder.restoreInsertionPoint(insertionPoint);
+  }
+
+  /// Can the current loop nest iteration number be computed? For simplicity,
+  /// this is true if and only if all the bounds and steps of the fir.do_loop
+  /// nest dominate the outer loop. The argument is filled with the current
+  /// loop nest on success.
+  bool currentLoopNestIterationNumberCanBeComputed(
+      llvm::SmallVectorImpl<fir::DoLoopOp> &loopNest);
+
+  template <typename T>
+  fir::factory::TemporaryStorage *insertSavedEntity(mlir::Region &region,
+                                                    T &&temp) {
+    auto inserted = savedEntities.try_emplace(&region, std::forward<T>(temp));
+    assert(inserted.second && "temp must have been emplaced");
+    return &inserted.first->second;
+  }
 
   fir::FirOpBuilder &builder;
 
@@ -182,6 +236,10 @@ class OrderedAssignmentRewriter {
   /// operations and the operations that have been cloned in the current run.
   /// It is reset between two runs.
   mlir::IRMapping mapper;
+  /// Dominance info is used to determine if inner loop bounds are all computed
+  /// before the outer loop of the current loop nest. It does not need to be
+  /// reset
+  /// between runs.
+  mlir::DominanceInfo dominanceInfo;
   /// Construct stack in the current run. This allows setting back the insertion
   /// point correctly when leaving a node that requires a fir.do_loop or fir.if
   /// operation.
@@ -189,20 +247,50 @@ class OrderedAssignmentRewriter {
   /// Current where loop nest, if any.
   std::optional<hlfir::LoopNest> whereLoopNest;
 
+  /// Map of temporary storage used to keep track of saved entities once the
+  /// run that saved them has been lowered. It is kept in-between runs.
+  llvm::DenseMap<mlir::Region *, fir::factory::TemporaryStorage> savedEntities;
+  /// Map holding the values that were saved in the current run and that also
+  /// need to be used (because their construct will be visited). It is reset
+  /// after each run. It avoids having to store and fetch in the temporary
+  /// during the same run, which would require the temporary to have different
+  /// fetching and storing counters.
+  llvm::DenseMap<mlir::Region *, ValueAndCleanUp> savedInCurrentRunBeforeUse;
+
   /// Root of the order assignment tree being lowered.
   hlfir::OrderedAssignmentTreeOpInterface root;
   /// Pointer to the current run of the schedule being lowered.
   hlfir::Run *currentRun = nullptr;
+
+  /// When allocating temporary storage inline, indicate whether the storage
+  /// should be heap or stack allocated. Temporaries allocated with the runtime
+  /// are heap allocated by the runtime.
+  bool allocateOnHeap = true;
 };
 } // namespace
 
 void OrderedAssignmentRewriter::walk(
     hlfir::OrderedAssignmentTreeOpInterface node) {
-  if (mustSavedRegionIn(node))
-    TODO(node.getLoc(),
-         "creating temporary storage in FORALL or WHERE constructs");
-  if (isRequiredInCurrentRun(node) || mlir::isa<hlfir::ForallIndexOp>(node)) {
-    llvm::TypeSwitch<mlir::Operation *, void>(node.getOperation())
+  bool mustVisit =
+      isRequiredInCurrentRun(node) || mlir::isa<hlfir::ForallIndexOp>(node);
+  llvm::SmallVector<hlfir::SaveEntity> saveEntities;
+  mlir::Operation *nodeOp = node.getOperation();
+  if (mustSaveRegionIn(node, saveEntities)) {
+    mlir::IRRewriter::InsertPoint insertionPoint;
+    if (auto elseWhereOp = mlir::dyn_cast<hlfir::ElseWhereOp>(nodeOp)) {
+      // An elsewhere mask that must be saved has to be evaluated inside the
+      // fir.if else block of the previous where/elsewhere (its evaluation
+      // must be masked by the "pending control mask").
+      insertionPoint = builder.saveInsertionPoint();
+      enterElsewhere(elseWhereOp);
+    }
+    for (hlfir::SaveEntity saveEntity : saveEntities)
+      generateSaveEntity(saveEntity, mustVisit);
+    if (insertionPoint.isSet())
+      builder.restoreInsertionPoint(insertionPoint);
+  }
+  if (mustVisit) {
+    llvm::TypeSwitch<mlir::Operation *, void>(nodeOp)
         .Case<hlfir::ForallOp, hlfir::ForallIndexOp, hlfir::ForallMaskOp,
               hlfir::RegionAssignOp, hlfir::WhereOp, hlfir::ElseWhereOp>(
             [&](auto concreteOp) { pre(concreteOp); })
@@ -212,7 +300,7 @@ void OrderedAssignmentRewriter::walk(
         if (auto subNode =
                 mlir::dyn_cast<hlfir::OrderedAssignmentTreeOpInterface>(op))
           walk(subNode);
-      llvm::TypeSwitch<mlir::Operation *, void>(node.getOperation())
+      llvm::TypeSwitch<mlir::Operation *, void>(nodeOp)
           .Case<hlfir::ForallOp, hlfir::ForallMaskOp, hlfir::WhereOp,
                 hlfir::ElseWhereOp>([&](auto concreteOp) { post(concreteOp); })
           .Default([](auto) {});
@@ -292,14 +380,11 @@ void OrderedAssignmentRewriter::pre(hlfir::RegionAssignOp regionAssignOp) {
   generateCleanupIfAny(oldLhsYield);
 }
 
-void OrderedAssignmentRewriter::generateMaskIfOp(MaskedArrayExpr &mask) {
-  assert(whereLoopNest.has_value() && "must be inside a WHERE");
-  mlir::Location loc = mask.loc;
-  hlfir::Entity maskVal{generateMaskedEntity(mask)};
-  maskVal = hlfir::loadTrivialScalar(loc, builder, maskVal);
-  mlir::Value cdt = builder.createConvert(loc, builder.getI1Type(), maskVal);
-  // Else region is added when visiting nested hlfir.elseWhereOp, if any.
-  auto ifOp = builder.create<fir::IfOp>(loc, std::nullopt, cdt,
+void OrderedAssignmentRewriter::generateMaskIfOp(mlir::Value cdt) {
+  mlir::Location loc = cdt.getLoc();
+  cdt = hlfir::loadTrivialScalar(loc, builder, hlfir::Entity{cdt});
+  cdt = builder.createConvert(loc, builder.getI1Type(), cdt);
+  auto ifOp = builder.create<fir::IfOp>(cdt.getLoc(), std::nullopt, cdt,
                                         /*withElseRegion=*/false);
   constructStack.push_back(ifOp.getOperation());
   builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
@@ -307,18 +392,46 @@ void OrderedAssignmentRewriter::generateMaskIfOp(MaskedArrayExpr &mask) {
 
 void OrderedAssignmentRewriter::pre(hlfir::WhereOp whereOp) {
   mlir::Location loc = whereOp.getLoc();
-  MaskedArrayExpr mask(loc, whereOp.getMaskRegion());
   if (!whereLoopNest) {
-    // Start a loop nest iterating on the shape of the where mask.
+    // This is the top-level WHERE. Start a loop nest iterating on the shape of
+    // the where mask.
+    if (auto maybeSaved = getIfSaved(whereOp.getMaskRegion())) {
+      // Use the saved value to get the shape and condition element.
+      hlfir::Entity savedMask{maybeSaved->first};
+      mlir::Value shape = hlfir::genShape(loc, builder, savedMask);
+      whereLoopNest = hlfir::genLoopNest(loc, builder, shape);
+      constructStack.push_back(whereLoopNest->outerLoop.getOperation());
+      builder.setInsertionPointToStart(whereLoopNest->innerLoop.getBody());
+      mlir::Value cdt = hlfir::getElementAt(loc, builder, savedMask,
+                                            whereLoopNest->oneBasedIndices);
+      generateMaskIfOp(cdt);
+      if (maybeSaved->second) {
+        // If this is the same run as the one that saved the value, the
+        // clean-up was deferred and must be done now.
+        auto insertionPoint = builder.saveInsertionPoint();
+        builder.setInsertionPointAfter(whereLoopNest->outerLoop);
+        generateCleanupIfAny(maybeSaved->second);
+        builder.restoreInsertionPoint(insertionPoint);
+      }
+      return;
+    }
+    // The mask was not evaluated yet or can be safely re-evaluated.
+    MaskedArrayExpr mask(loc, whereOp.getMaskRegion());
     mask.generateNoneElementalPart(builder, mapper);
     mlir::Value shape = mask.generateShape(builder, mapper);
     whereLoopNest = hlfir::genLoopNest(loc, builder, shape);
     constructStack.push_back(whereLoopNest->outerLoop.getOperation());
     builder.setInsertionPointToStart(whereLoopNest->innerLoop.getBody());
+    mlir::Value cdt = generateMaskedEntity(mask);
+    generateMaskIfOp(cdt);
+    return;
   }
+  // Where loops have already been created by a parent WHERE.
   // Generate a fir.if with the value of the current element of the mask
-  // inside the loops.
-  generateMaskIfOp(mask);
+  // inside the loops. The case where the mask was saved is handled in the
+  // generateYieldedScalarValue call.
+  mlir::Value cdt = generateYieldedScalarValue(whereOp.getMaskRegion());
+  generateMaskIfOp(cdt);
 }
 
 void OrderedAssignmentRewriter::post(hlfir::WhereOp whereOp) {
@@ -333,20 +446,27 @@ void OrderedAssignmentRewriter::post(hlfir::WhereOp whereOp) {
   }
 }
 
-void OrderedAssignmentRewriter::pre(hlfir::ElseWhereOp elseWhereOp) {
-  assert(!constructStack.empty() && "cannot be empty inside a where");
-  mlir::Location loc = elseWhereOp.getLoc();
+void OrderedAssignmentRewriter::enterElsewhere(hlfir::ElseWhereOp elseWhereOp) {
   // Create an "else" region for the current where/elsewhere fir.if.
   auto ifOp = mlir::dyn_cast<fir::IfOp>(constructStack.back());
-  assert(ifOp && ifOp.getElseRegion().empty() && "must be an if without else");
-  builder.createBlock(&ifOp.getElseRegion());
-  auto end = builder.create<fir::ResultOp>(loc);
-  builder.setInsertionPoint(end);
+  assert(ifOp && "must be an if");
+  if (ifOp.getElseRegion().empty()) {
+    mlir::Location loc = elseWhereOp.getLoc();
+    builder.createBlock(&ifOp.getElseRegion());
+    auto end = builder.create<fir::ResultOp>(loc);
+    builder.setInsertionPoint(end);
+  } else {
+    builder.setInsertionPoint(&ifOp.getElseRegion().back().back());
+  }
+}
+
+void OrderedAssignmentRewriter::pre(hlfir::ElseWhereOp elseWhereOp) {
+  enterElsewhere(elseWhereOp);
   if (elseWhereOp.getMaskRegion().empty())
     return;
   // Create new nested fir.if with elsewhere mask if any.
-  MaskedArrayExpr mask(loc, elseWhereOp.getMaskRegion());
-  generateMaskIfOp(mask);
+  mlir::Value cdt = generateYieldedScalarValue(elseWhereOp.getMaskRegion());
+  generateMaskIfOp(cdt);
 }
 
 void OrderedAssignmentRewriter::post(hlfir::ElseWhereOp elseWhereOp) {
@@ -370,14 +490,51 @@ static bool isForallIndex(mlir::Value value) {
   return value.getDefiningOp<hlfir::ForallIndexOp>();
 }
 
-std::pair<mlir::Value, std::optional<hlfir::YieldOp>>
+static OrderedAssignmentRewriter::ValueAndCleanUp
+castIfNeeded(mlir::Location loc, fir::FirOpBuilder &builder,
+             OrderedAssignmentRewriter::ValueAndCleanUp valueAndCleanUp,
+             std::optional<mlir::Type> castToType) {
+  if (!castToType.has_value())
+    return valueAndCleanUp;
+  mlir::Value cast =
+      builder.createConvert(loc, *castToType, valueAndCleanUp.first);
+  return {cast, valueAndCleanUp.second};
+}
+
+std::optional<OrderedAssignmentRewriter::ValueAndCleanUp>
+OrderedAssignmentRewriter::getIfSaved(mlir::Region &region) {
+  mlir::Location loc = region.getParentOp()->getLoc();
+  // If the region was saved in the same run, use the value that was evaluated
+  // instead of fetching the temp, and do the clean-ups, if any, that were
+  // delayed. This is done to avoid requiring the temporary stack to have
+  // different fetching and storing counters, and also because it produces
+  // slightly better
+  // code.
+  if (auto savedInSameRun = savedInCurrentRunBeforeUse.find(&region);
+      savedInSameRun != savedInCurrentRunBeforeUse.end())
+    return savedInSameRun->second;
+  // If the region was saved in a previous run, fetch the saved value.
+  if (auto temp = savedEntities.find(&region); temp != savedEntities.end()) {
+    doBeforeLoopNest([&]() { temp->second.resetFetchPosition(loc, builder); });
+    return ValueAndCleanUp{temp->second.fetch(loc, builder), std::nullopt};
+  }
+  return std::nullopt;
+}
+
+OrderedAssignmentRewriter::ValueAndCleanUp
 OrderedAssignmentRewriter::generateYieldedEntity(
     mlir::Region &region, std::optional<mlir::Type> castToType) {
-  // TODO: if the region was saved, use that instead of generating code again.
+  mlir::Location loc = region.getParentOp()->getLoc();
+  if (auto maybeValueAndCleanUp = getIfSaved(region))
+    return castIfNeeded(loc, builder, *maybeValueAndCleanUp, castToType);
+  // Otherwise, evaluate the region now.
+
+  // Masked expressions must not evaluate the elemental parts that are masked;
+  // they have custom code generation.
   if (whereLoopNest.has_value()) {
-    mlir::Location loc = region.getParentOp()->getLoc();
-    return {generateMaskedEntity(loc, region), std::nullopt};
+    mlir::Value maskedValue = generateMaskedEntity(loc, region);
+    return castIfNeeded(loc, builder, {maskedValue, std::nullopt}, castToType);
   }
+
   assert(region.hasOneBlock() && "region must contain one block");
   auto oldYield = mlir::dyn_cast_or_null<hlfir::YieldOp>(
       region.back().getOperations().back());
@@ -434,7 +591,9 @@ OrderedAssignmentRewriter::generateYieldedEntity(
 
 mlir::Value OrderedAssignmentRewriter::generateYieldedScalarValue(
     mlir::Region &region, std::optional<mlir::Type> castToType) {
+  mlir::Location loc = region.getParentOp()->getLoc();
   auto [value, maybeYield] = generateYieldedEntity(region, castToType);
+  value = hlfir::loadTrivialScalar(loc, builder, hlfir::Entity{value});
   assert(fir::isa_trivial(value.getType()) && "not a trivial scalar value");
   generateCleanupIfAny(maybeYield);
   return value;
@@ -468,7 +627,8 @@ void OrderedAssignmentRewriter::generateCleanupIfAny(
       assert(maybeYield->getCleanup().hasOneBlock() &&
              "region must contain one block");
       for (auto &op : maybeYield->getCleanup().back().getOperations())
-        builder.clone(op, mapper);
+        if (!mlir::isa<fir::FirEndOp>(op))
+          builder.clone(op, mapper);
     }
 }
 
@@ -478,14 +638,15 @@ bool OrderedAssignmentRewriter::hasVectorSubscriptedLhs(
       regionAssignOp.getLhsRegion().back().back());
 }
 
-bool OrderedAssignmentRewriter::mustSavedRegionIn(
-    hlfir::OrderedAssignmentTreeOpInterface node) const {
+bool OrderedAssignmentRewriter::mustSaveRegionIn(
+    hlfir::OrderedAssignmentTreeOpInterface node,
+    llvm::SmallVectorImpl<hlfir::SaveEntity> &saveEntities) const {
   for (auto &action : currentRun->actions)
     if (hlfir::SaveEntity *savedEntity =
             std::get_if<hlfir::SaveEntity>(&action))
       if (node.getOperation() == savedEntity->yieldRegion->getParentOp())
-        return true;
-  return false;
+        saveEntities.push_back(*savedEntity);
+  return !saveEntities.empty();
 }
 
 bool OrderedAssignmentRewriter::isRequiredInCurrentRun(
@@ -634,6 +795,125 @@ void MaskedArrayExpr::generateNoneElementalCleanupIfAny(
   }
 }
 
+static bool isLeftHandSide(mlir::Region &region) {
+  auto assign = mlir::dyn_cast<hlfir::RegionAssignOp>(region.getParentOp());
+  return assign && (&assign.getLhsRegion() == &region);
+}
+
+bool OrderedAssignmentRewriter::currentLoopNestIterationNumberCanBeComputed(
+    llvm::SmallVectorImpl<fir::DoLoopOp> &loopNest) {
+  if (constructStack.empty())
+    return true;
+  mlir::Operation *outerLoop = constructStack[0];
+  mlir::Operation *currentConstruct = constructStack.back();
+  // Loop through the loops until the outer construct is met, and test if the
+  // loop operands dominate the outer construct.
+  while (currentConstruct) {
+    if (auto doLoop = mlir::dyn_cast<fir::DoLoopOp>(currentConstruct)) {
+      if (llvm::any_of(doLoop->getOperands(), [&](mlir::Value value) {
+            return !dominanceInfo.properlyDominates(value, outerLoop);
+          })) {
+        return false;
+      }
+      loopNest.push_back(doLoop);
+    }
+    if (currentConstruct == outerLoop)
+      currentConstruct = nullptr;
+    else
+      currentConstruct = currentConstruct->getParentOp();
+  }
+  return true;
+}
+
+static mlir::Value
+computeLoopNestIterationNumber(mlir::Location loc, fir::FirOpBuilder &builder,
+                               llvm::ArrayRef<fir::DoLoopOp> loopNest) {
+  mlir::Value loopExtent;
+  for (fir::DoLoopOp doLoop : loopNest) {
+    mlir::Value extent = builder.genExtentFromTriplet(
+        loc, doLoop.getLowerBound(), doLoop.getUpperBound(), doLoop.getStep(),
+        builder.getIndexType());
+    if (!loopExtent)
+      loopExtent = extent;
+    else
+      loopExtent = builder.create<mlir::arith::MulIOp>(loc, loopExtent, extent);
+  }
+  assert(loopExtent && "loopNest must not be empty");
+  return loopExtent;
+}
+
+void OrderedAssignmentRewriter::generateSaveEntity(
+    hlfir::SaveEntity savedEntity, bool willUseSavedEntityInSameRun) {
+  mlir::Region &region = *savedEntity.yieldRegion;
+  mlir::Location loc = region.getParentOp()->getLoc();
+
+  if (!mlir::isa<hlfir::YieldOp>(region.back().back()))
+    TODO(loc, "creating temporary storage for vector subscripted LHS");
+
+  // Evaluate the region inside the loop nest (if any).
+  auto [clonedValue, oldYield] = generateYieldedEntity(region);
+  hlfir::Entity entity{clonedValue};
+  if (isLeftHandSide(region)) // Need to save the address, not the values.
+    TODO(loc, "creating temporary storage for LHS");
+  else
+    entity = hlfir::loadTrivialScalar(loc, builder, entity);
+  mlir::Type entityType = entity.getType();
+
+  static constexpr char tempName[] = ".tmp.forall";
+  if (constructStack.empty()) {
+    // Value evaluated outside of any loops (this may be the first MASK of a
+    // WHERE construct, or an LHS/RHS temp of hlfir.region_assign outside of
+    // WHERE/FORALL).
+    insertSavedEntity(region,
+                      fir::factory::SimpleCopy(loc, builder, entity, tempName));
+  } else {
+    // Need to create a temporary for values computed inside loops.
+    // Create temporary storage outside of the loop nest given the entity
+    // type (and the loop context).
+    fir::factory::TemporaryStorage *temp;
+    llvm::SmallVector<fir::DoLoopOp> loopNest;
+    bool loopShapeCanBePreComputed =
+        currentLoopNestIterationNumberCanBeComputed(loopNest);
+    doBeforeLoopNest([&] {
+      /// For simple scalars inside loops whose total iteration number can be
+      /// pre-computed, create a rank-1 array outside of the loops. It will be
+      /// assigned/fetched inside the loops like a normal Fortran array given
+      /// the iteration count.
+      if (loopShapeCanBePreComputed && fir::isa_trivial(entityType)) {
+        mlir::Value loopExtent =
+            computeLoopNestIterationNumber(loc, builder, loopNest);
+        auto sequenceType =
+            builder.getVarLenSeqTy(entityType).cast<fir::SequenceType>();
+        temp = insertSavedEntity(region,
+                                 fir::factory::HomogeneousScalarStack{
+                                     loc, builder, sequenceType, loopExtent,
+                                     /*lenParams=*/{}, allocateOnHeap,
+                                     /*stackThroughLoops=*/true, tempName});
+
+      } else {
+        // If the number of iterations is not known, or if the values at each
+        // iteration may have different shapes, type parameters,
+        // or dynamic types, use the runtime to create and manage a stack-like
+        // temporary.
+        TODO(loc, "use runtime to create temporary storage in FORALL or WHERE");
+      }
+    });
+    // Inside the loop nest (and any fir.if if there are active masks), copy
+    // the value to the temp and do clean-ups for the value if any.
+    temp->pushValue(loc, builder, entity);
+  }
+
+  // Delay the clean-up if the entity will be used in the same run (i.e., the
+  // parent construct will be visited and needs to be lowered).
+  if (willUseSavedEntityInSameRun) {
+    auto inserted =
+        savedInCurrentRunBeforeUse.try_emplace(&region, entity, oldYield);
+    assert(inserted.second && "entity must have been emplaced");
+  } else {
+    generateCleanupIfAny(oldYield);
+  }
+}
+
 /// Lower an ordered assignment tree to fir.do_loop and hlfir.assign given
 /// a schedule.
 static void lower(hlfir::OrderedAssignmentTreeOpInterface root,
@@ -643,6 +923,7 @@ static void lower(hlfir::OrderedAssignmentTreeOpInterface root,
   OrderedAssignmentRewriter assignmentRewriter(builder, root);
   for (auto &run : schedule)
     assignmentRewriter.lowerRun(run);
+  assignmentRewriter.cleanupSavedEntities();
 }
 
 /// Shared rewrite entry point for all the ordered assignment tree root

diff  --git a/flang/test/HLFIR/order_assignments/impure-where.fir b/flang/test/HLFIR/order_assignments/impure-where.fir
new file mode 100644
index 0000000000000..537fd48282cf8
--- /dev/null
+++ b/flang/test/HLFIR/order_assignments/impure-where.fir
@@ -0,0 +1,73 @@
+// Test code generation of hlfir.where/hlfir.elsewhere when an
+// "impure" mask is used and several runs are needed. The mask
+// must be saved so that the impure function is only evaluated once.
+// RUN: fir-opt %s --lower-hlfir-ordered-assignments | FileCheck %s
+
+func.func private @impure() -> !fir.heap<!fir.array<10x!fir.logical<4>>>
+func.func @test_elsewhere_impure_mask(%x: !fir.ref<!fir.array<10xi32>>, %y: !fir.ref<!fir.array<10xi32>>, %z: !fir.ref<!fir.array<10xi32>>, %mask: !fir.ref<!fir.array<10x!fir.logical<4>>>) {
+  %c-1 = arith.constant -1 : index
+  %c1 = arith.constant 1 : index
+  %c10 = arith.constant 10 : index
+  %1 = fir.shape %c10 : (index) -> !fir.shape<1>
+  hlfir.where {
+    hlfir.yield %mask : !fir.ref<!fir.array<10x!fir.logical<4>>>
+  } do {
+    hlfir.elsewhere mask {
+      %mask2 = fir.call @impure() : () -> !fir.heap<!fir.array<10x!fir.logical<4>>>
+      hlfir.yield %mask2 : !fir.heap<!fir.array<10x!fir.logical<4>>> cleanup {
+        fir.freemem %mask2 : !fir.heap<!fir.array<10x!fir.logical<4>>>
+      }
+    } do {
+      hlfir.region_assign {
+        hlfir.yield %y : !fir.ref<!fir.array<10xi32>>
+      } to {
+        hlfir.yield %x : !fir.ref<!fir.array<10xi32>>
+      }
+      hlfir.region_assign {
+        hlfir.yield %x : !fir.ref<!fir.array<10xi32>>
+      } to {
+        hlfir.yield %z : !fir.ref<!fir.array<10xi32>>
+      }
+    }
+  }
+  return
+}
+// CHECK-LABEL:   func.func @test_elsewhere_impure_mask(
+// CHECK:           %[[VAL_12:.*]] = fir.call @impure() : () -> !fir.heap<!fir.array<10x!fir.logical<4>>>
+// CHECK:           %[[VAL_21:.*]] = fir.allocmem !fir.array<?x!fir.logical<4>>
+// CHECK:           %[[VAL_23:.*]]:2 = hlfir.declare %[[VAL_21]](%{{.*}}) {uniq_name = ".tmp.forall"}
+// CHECK:           fir.do_loop
+// CHECK:             fir.if {{.*}} {
+// CHECK:             } else {
+// CHECK:               %[[VAL_28:.*]] = hlfir.designate %[[VAL_12]] (%{{.*}})
+// CHECK:               %[[VAL_29:.*]] = fir.load %[[VAL_28]] : !fir.ref<!fir.logical<4>>
+// CHECK:               %[[VAL_32:.*]] = hlfir.designate %[[VAL_23]]#0 (%{{.*}})  : (!fir.box<!fir.array<?x!fir.logical<4>>>, index) -> !fir.ref<!fir.logical<4>>
+// CHECK:               hlfir.assign %[[VAL_29]] to %[[VAL_32]] : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+// CHECK:             }
+// CHECK:           }
+// CHECK-NOT:       fir.call @impure
+// CHECK:           fir.do_loop
+// CHECK:             fir.if {{.*}} {
+// CHECK:             } else {
+// CHECK:               %[[VAL_42:.*]] = hlfir.designate %[[VAL_23]]#0 (%{{.*}})  : (!fir.box<!fir.array<?x!fir.logical<4>>>, index) -> !fir.ref<!fir.logical<4>>
+// CHECK:               %[[VAL_43:.*]] = fir.load %[[VAL_42]] : !fir.ref<!fir.logical<4>>
+// CHECK:               %[[VAL_44:.*]] = fir.convert %[[VAL_43]] : (!fir.logical<4>) -> i1
+// CHECK:               fir.if %[[VAL_44]] {
+// CHECK:               }
+// CHECK:             }
+// CHECK:           }
+// CHECK-NOT:       fir.call @impure
+// CHECK:           fir.do_loop
+// CHECK:             fir.if {{.*}} {
+// CHECK:             } else {
+// CHECK:               %[[VAL_52:.*]] = hlfir.designate %[[VAL_23]]#0 (%{{.*}})  : (!fir.box<!fir.array<?x!fir.logical<4>>>, index) -> !fir.ref<!fir.logical<4>>
+// CHECK:               %[[VAL_53:.*]] = fir.load %[[VAL_52]] : !fir.ref<!fir.logical<4>>
+// CHECK:               %[[VAL_54:.*]] = fir.convert %[[VAL_53]] : (!fir.logical<4>) -> i1
+// CHECK:               fir.if %[[VAL_54]] {
+// CHECK:               }
+// CHECK:             }
+// CHECK:           }
+// CHECK:           fir.freemem %[[VAL_21]] : !fir.heap<!fir.array<?x!fir.logical<4>>>
+// CHECK:           fir.freemem %[[VAL_12]] : !fir.heap<!fir.array<10x!fir.logical<4>>>
+// CHECK:           return
+// CHECK:         }
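
A hedged Fortran sketch (hypothetical source, not taken from this patch) of
roughly what the FIR above models: the elsewhere mask comes from an impure
function, so it must be evaluated only once, saved, and reused by the later
runs that perform the two assignments.

  subroutine elsewhere_impure_mask(x, y, z, mask)
    integer :: x(10), y(10), z(10)
    logical :: mask(10)
    interface
      impure function impure_mask()
        logical :: impure_mask(10)
      end function
    end interface
    where (mask)
    elsewhere (impure_mask())
      x = y
      ! The second assignment reads x, which the first one writes, forcing a
      ! second pass over the saved elsewhere mask.
      z = x
    end where
  end subroutine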

diff  --git a/flang/test/HLFIR/order_assignments/inlined-stack-temp.fir b/flang/test/HLFIR/order_assignments/inlined-stack-temp.fir
new file mode 100644
index 0000000000000..6566620a51bfc
--- /dev/null
+++ b/flang/test/HLFIR/order_assignments/inlined-stack-temp.fir
@@ -0,0 +1,332 @@
+// Test code generation of hlfir.forall and hlfir.where when temporary
+// storage is needed and can be allocated inline.
+// RUN: fir-opt %s --lower-hlfir-ordered-assignments | FileCheck %s
+
+func.func @test_scalar_save(%arg0: !fir.box<!fir.array<?xi32>>) {
+  %c10_i32 = arith.constant 10 : i32
+  %c1_i32 = arith.constant 1 : i32
+  %0:2 = hlfir.declare %arg0 {uniq_name = "x"} : (!fir.box<!fir.array<?xi32>>) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
+  hlfir.forall lb {
+    hlfir.yield %c1_i32 : i32
+  } ub {
+    hlfir.yield %c10_i32 : i32
+  }  (%arg1: i32) {
+    hlfir.region_assign {
+      %1 = fir.convert %arg1 : (i32) -> i64
+      %2 = hlfir.designate %0#0 (%1)  : (!fir.box<!fir.array<?xi32>>, i64) -> !fir.ref<i32>
+      %3 = fir.load %2 : !fir.ref<i32>
+      hlfir.yield %3 : i32
+    } to {
+      %1 = arith.addi %arg1, %c1_i32 : i32
+      %2 = fir.convert %1 : (i32) -> i64
+      %3 = hlfir.designate %0#0 (%2)  : (!fir.box<!fir.array<?xi32>>, i64) -> !fir.ref<i32>
+      hlfir.yield %3 : !fir.ref<i32>
+    }
+  }
+  return
+}
+// CHECK-LABEL:   func.func @test_scalar_save(
+// CHECK-SAME:                                %[[VAL_0:.*]]: !fir.box<!fir.array<?xi32>>) {
+// CHECK:           %[[VAL_1:.*]] = fir.alloca index
+// CHECK:           %[[VAL_2:.*]] = arith.constant 10 : i32
+// CHECK:           %[[VAL_3:.*]] = arith.constant 1 : i32
+// CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "x"} : (!fir.box<!fir.array<?xi32>>) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
+// CHECK:           %[[VAL_5:.*]] = fir.convert %[[VAL_3]] : (i32) -> index
+// CHECK:           %[[VAL_6:.*]] = fir.convert %[[VAL_2]] : (i32) -> index
+// CHECK:           %[[VAL_7:.*]] = arith.constant 1 : index
+// CHECK:           %[[VAL_8:.*]] = arith.constant 0 : index
+// CHECK:           %[[VAL_9:.*]] = arith.subi %[[VAL_6]], %[[VAL_5]] : index
+// CHECK:           %[[VAL_10:.*]] = arith.addi %[[VAL_9]], %[[VAL_7]] : index
+// CHECK:           %[[VAL_11:.*]] = arith.divsi %[[VAL_10]], %[[VAL_7]] : index
+// CHECK:           %[[VAL_12:.*]] = arith.cmpi sgt, %[[VAL_11]], %[[VAL_8]] : index
+// CHECK:           %[[VAL_13:.*]] = arith.select %[[VAL_12]], %[[VAL_11]], %[[VAL_8]] : index
+// CHECK:           %[[VAL_14:.*]] = arith.constant 1 : index
+// CHECK:           %[[VAL_15:.*]] = arith.constant 1 : index
+// CHECK:           fir.store %[[VAL_14]] to %[[VAL_1]] : !fir.ref<index>
+// CHECK:           %[[VAL_16:.*]] = fir.allocmem !fir.array<?xi32>, %[[VAL_13]] {bindc_name = ".tmp.forall", uniq_name = ""}
+// CHECK:           %[[VAL_17:.*]] = fir.shape %[[VAL_13]] : (index) -> !fir.shape<1>
+// CHECK:           %[[VAL_18:.*]]:2 = hlfir.declare %[[VAL_16]](%[[VAL_17]]) {uniq_name = ".tmp.forall"} : (!fir.heap<!fir.array<?xi32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xi32>>, !fir.heap<!fir.array<?xi32>>)
+// CHECK:           fir.do_loop %[[VAL_19:.*]] = %[[VAL_5]] to %[[VAL_6]] step %[[VAL_7]] {
+// CHECK:             %[[VAL_20:.*]] = fir.convert %[[VAL_19]] : (index) -> i32
+// CHECK:             %[[VAL_21:.*]] = fir.convert %[[VAL_20]] : (i32) -> i64
+// CHECK:             %[[VAL_22:.*]] = hlfir.designate %[[VAL_4]]#0 (%[[VAL_21]])  : (!fir.box<!fir.array<?xi32>>, i64) -> !fir.ref<i32>
+// CHECK:             %[[VAL_23:.*]] = fir.load %[[VAL_22]] : !fir.ref<i32>
+// CHECK:             %[[VAL_24:.*]] = fir.load %[[VAL_1]] : !fir.ref<index>
+// CHECK:             %[[VAL_25:.*]] = arith.addi %[[VAL_24]], %[[VAL_15]] : index
+// CHECK:             fir.store %[[VAL_25]] to %[[VAL_1]] : !fir.ref<index>
+// CHECK:             %[[VAL_26:.*]] = hlfir.designate %[[VAL_18]]#0 (%[[VAL_24]])  : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
+// CHECK:             hlfir.assign %[[VAL_23]] to %[[VAL_26]] : i32, !fir.ref<i32>
+// CHECK:           }
+// CHECK:           %[[VAL_27:.*]] = fir.convert %[[VAL_3]] : (i32) -> index
+// CHECK:           %[[VAL_28:.*]] = fir.convert %[[VAL_2]] : (i32) -> index
+// CHECK:           %[[VAL_29:.*]] = arith.constant 1 : index
+// CHECK:           fir.store %[[VAL_14]] to %[[VAL_1]] : !fir.ref<index>
+// CHECK:           fir.do_loop %[[VAL_30:.*]] = %[[VAL_27]] to %[[VAL_28]] step %[[VAL_29]] {
+// CHECK:             %[[VAL_31:.*]] = fir.convert %[[VAL_30]] : (index) -> i32
+// CHECK:             %[[VAL_32:.*]] = fir.load %[[VAL_1]] : !fir.ref<index>
+// CHECK:             %[[VAL_33:.*]] = arith.addi %[[VAL_32]], %[[VAL_15]] : index
+// CHECK:             fir.store %[[VAL_33]] to %[[VAL_1]] : !fir.ref<index>
+// CHECK:             %[[VAL_34:.*]] = hlfir.designate %[[VAL_18]]#0 (%[[VAL_32]])  : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
+// CHECK:             %[[VAL_35:.*]] = fir.load %[[VAL_34]] : !fir.ref<i32>
+// CHECK:             %[[VAL_36:.*]] = arith.addi %[[VAL_31]], %[[VAL_3]] : i32
+// CHECK:             %[[VAL_37:.*]] = fir.convert %[[VAL_36]] : (i32) -> i64
+// CHECK:             %[[VAL_38:.*]] = hlfir.designate %[[VAL_4]]#0 (%[[VAL_37]])  : (!fir.box<!fir.array<?xi32>>, i64) -> !fir.ref<i32>
+// CHECK:             hlfir.assign %[[VAL_35]] to %[[VAL_38]] : i32, !fir.ref<i32>
+// CHECK:           }
+// CHECK:           fir.freemem %[[VAL_16]] : !fir.heap<!fir.array<?xi32>>
+// CHECK:           return
+// CHECK:         }
+
+func.func @mask_and_rhs_conflict(%arg0: !fir.box<!fir.array<?xi32>>) {
+  %c42_i32 = arith.constant 42 : i32
+  %c10_i32 = arith.constant 10 : i32
+  %c1_i32 = arith.constant 1 : i32
+  %0:2 = hlfir.declare %arg0 {uniq_name = "x"} : (!fir.box<!fir.array<?xi32>>) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
+  hlfir.forall lb {
+    hlfir.yield %c1_i32 : i32
+  } ub {
+    hlfir.yield %c10_i32 : i32
+  }  (%arg1: i32) {
+    hlfir.forall_mask {
+      %1 = fir.convert %arg1 : (i32) -> i64
+      %2 = hlfir.designate %0#0 (%1)  : (!fir.box<!fir.array<?xi32>>, i64) -> !fir.ref<i32>
+      %3 = fir.load %2 : !fir.ref<i32>
+      %4 = arith.cmpi sgt, %3, %c42_i32 : i32
+      hlfir.yield %4 : i1
+    } do {
+      hlfir.region_assign {
+        %1 = fir.convert %arg1 : (i32) -> i64
+        %2 = hlfir.designate %0#0 (%1)  : (!fir.box<!fir.array<?xi32>>, i64) -> !fir.ref<i32>
+        %3 = fir.load %2 : !fir.ref<i32>
+        hlfir.yield %3 : i32
+      } to {
+        %1 = arith.addi %arg1, %c1_i32 : i32
+        %2 = fir.convert %1 : (i32) -> i64
+        %3 = hlfir.designate %0#0 (%2)  : (!fir.box<!fir.array<?xi32>>, i64) -> !fir.ref<i32>
+        hlfir.yield %3 : !fir.ref<i32>
+      }
+    }
+  }
+  return
+}
+// CHECK-LABEL:   func.func @mask_and_rhs_conflict(
+// CHECK-SAME:                   %[[VAL_0:.*]]: !fir.box<!fir.array<?xi32>>) {
+// CHECK:           %[[VAL_1:.*]] = fir.alloca index
+// CHECK:           %[[VAL_2:.*]] = fir.alloca index
+// CHECK:           %[[VAL_3:.*]] = arith.constant 42 : i32
+// CHECK:           %[[VAL_4:.*]] = arith.constant 10 : i32
+// CHECK:           %[[VAL_5:.*]] = arith.constant 1 : i32
+// CHECK:           %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "x"} : (!fir.box<!fir.array<?xi32>>) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
+// CHECK:           %[[VAL_7:.*]] = fir.convert %[[VAL_5]] : (i32) -> index
+// CHECK:           %[[VAL_8:.*]] = fir.convert %[[VAL_4]] : (i32) -> index
+// CHECK:           %[[VAL_9:.*]] = arith.constant 1 : index
+// CHECK:           %[[VAL_10:.*]] = arith.constant 0 : index
+// CHECK:           %[[VAL_11:.*]] = arith.subi %[[VAL_8]], %[[VAL_7]] : index
+// CHECK:           %[[VAL_12:.*]] = arith.addi %[[VAL_11]], %[[VAL_9]] : index
+// CHECK:           %[[VAL_13:.*]] = arith.divsi %[[VAL_12]], %[[VAL_9]] : index
+// CHECK:           %[[VAL_14:.*]] = arith.cmpi sgt, %[[VAL_13]], %[[VAL_10]] : index
+// CHECK:           %[[VAL_15:.*]] = arith.select %[[VAL_14]], %[[VAL_13]], %[[VAL_10]] : index
+// CHECK:           %[[VAL_16:.*]] = arith.constant 1 : index
+// CHECK:           %[[VAL_17:.*]] = arith.constant 1 : index
+// CHECK:           fir.store %[[VAL_16]] to %[[VAL_2]] : !fir.ref<index>
+// CHECK:           %[[VAL_18:.*]] = fir.allocmem !fir.array<?xi1>, %[[VAL_15]] {bindc_name = ".tmp.forall", uniq_name = ""}
+// CHECK:           %[[VAL_19:.*]] = fir.shape %[[VAL_15]] : (index) -> !fir.shape<1>
+// CHECK:           %[[VAL_20:.*]]:2 = hlfir.declare %[[VAL_18]](%[[VAL_19]]) {uniq_name = ".tmp.forall"} : (!fir.heap<!fir.array<?xi1>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xi1>>, !fir.heap<!fir.array<?xi1>>)
+// CHECK:           %[[VAL_21:.*]] = arith.constant 0 : index
+// CHECK:           %[[VAL_22:.*]] = arith.subi %[[VAL_8]], %[[VAL_7]] : index
+// CHECK:           %[[VAL_23:.*]] = arith.addi %[[VAL_22]], %[[VAL_9]] : index
+// CHECK:           %[[VAL_24:.*]] = arith.divsi %[[VAL_23]], %[[VAL_9]] : index
+// CHECK:           %[[VAL_25:.*]] = arith.cmpi sgt, %[[VAL_24]], %[[VAL_21]] : index
+// CHECK:           %[[VAL_26:.*]] = arith.select %[[VAL_25]], %[[VAL_24]], %[[VAL_21]] : index
+// CHECK:           %[[VAL_27:.*]] = arith.constant 1 : index
+// CHECK:           %[[VAL_28:.*]] = arith.constant 1 : index
+// CHECK:           fir.store %[[VAL_27]] to %[[VAL_1]] : !fir.ref<index>
+// CHECK:           %[[VAL_29:.*]] = fir.allocmem !fir.array<?xi32>, %[[VAL_26]] {bindc_name = ".tmp.forall", uniq_name = ""}
+// CHECK:           %[[VAL_30:.*]] = fir.shape %[[VAL_26]] : (index) -> !fir.shape<1>
+// CHECK:           %[[VAL_31:.*]]:2 = hlfir.declare %[[VAL_29]](%[[VAL_30]]) {uniq_name = ".tmp.forall"} : (!fir.heap<!fir.array<?xi32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xi32>>, !fir.heap<!fir.array<?xi32>>)
+// CHECK:           fir.do_loop %[[VAL_32:.*]] = %[[VAL_7]] to %[[VAL_8]] step %[[VAL_9]] {
+// CHECK:             %[[VAL_33:.*]] = fir.convert %[[VAL_32]] : (index) -> i32
+// CHECK:             %[[VAL_34:.*]] = fir.convert %[[VAL_33]] : (i32) -> i64
+// CHECK:             %[[VAL_35:.*]] = hlfir.designate %[[VAL_6]]#0 (%[[VAL_34]])  : (!fir.box<!fir.array<?xi32>>, i64) -> !fir.ref<i32>
+// CHECK:             %[[VAL_36:.*]] = fir.load %[[VAL_35]] : !fir.ref<i32>
+// CHECK:             %[[VAL_37:.*]] = arith.cmpi sgt, %[[VAL_36]], %[[VAL_3]] : i32
+// CHECK:             %[[VAL_38:.*]] = fir.load %[[VAL_2]] : !fir.ref<index>
+// CHECK:             %[[VAL_39:.*]] = arith.addi %[[VAL_38]], %[[VAL_17]] : index
+// CHECK:             fir.store %[[VAL_39]] to %[[VAL_2]] : !fir.ref<index>
+// CHECK:             %[[VAL_40:.*]] = hlfir.designate %[[VAL_20]]#0 (%[[VAL_38]])  : (!fir.box<!fir.array<?xi1>>, index) -> !fir.ref<i1>
+// CHECK:             hlfir.assign %[[VAL_37]] to %[[VAL_40]] : i1, !fir.ref<i1>
+// CHECK:             fir.if %[[VAL_37]] {
+// CHECK:               %[[VAL_41:.*]] = fir.convert %[[VAL_33]] : (i32) -> i64
+// CHECK:               %[[VAL_42:.*]] = hlfir.designate %[[VAL_6]]#0 (%[[VAL_41]])  : (!fir.box<!fir.array<?xi32>>, i64) -> !fir.ref<i32>
+// CHECK:               %[[VAL_43:.*]] = fir.load %[[VAL_42]] : !fir.ref<i32>
+// CHECK:               %[[VAL_44:.*]] = fir.load %[[VAL_1]] : !fir.ref<index>
+// CHECK:               %[[VAL_45:.*]] = arith.addi %[[VAL_44]], %[[VAL_28]] : index
+// CHECK:               fir.store %[[VAL_45]] to %[[VAL_1]] : !fir.ref<index>
+// CHECK:               %[[VAL_46:.*]] = hlfir.designate %[[VAL_31]]#0 (%[[VAL_44]])  : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
+// CHECK:               hlfir.assign %[[VAL_43]] to %[[VAL_46]] : i32, !fir.ref<i32>
+// CHECK:             }
+// CHECK:           }
+// CHECK:           %[[VAL_47:.*]] = fir.convert %[[VAL_5]] : (i32) -> index
+// CHECK:           %[[VAL_48:.*]] = fir.convert %[[VAL_4]] : (i32) -> index
+// CHECK:           %[[VAL_49:.*]] = arith.constant 1 : index
+// CHECK:           fir.store %[[VAL_16]] to %[[VAL_2]] : !fir.ref<index>
+// CHECK:           fir.store %[[VAL_27]] to %[[VAL_1]] : !fir.ref<index>
+// CHECK:           fir.do_loop %[[VAL_50:.*]] = %[[VAL_47]] to %[[VAL_48]] step %[[VAL_49]] {
+// CHECK:             %[[VAL_51:.*]] = fir.convert %[[VAL_50]] : (index) -> i32
+// CHECK:             %[[VAL_52:.*]] = fir.load %[[VAL_2]] : !fir.ref<index>
+// CHECK:             %[[VAL_53:.*]] = arith.addi %[[VAL_52]], %[[VAL_17]] : index
+// CHECK:             fir.store %[[VAL_53]] to %[[VAL_2]] : !fir.ref<index>
+// CHECK:             %[[VAL_54:.*]] = hlfir.designate %[[VAL_20]]#0 (%[[VAL_52]])  : (!fir.box<!fir.array<?xi1>>, index) -> !fir.ref<i1>
+// CHECK:             %[[VAL_55:.*]] = fir.load %[[VAL_54]] : !fir.ref<i1>
+// CHECK:             fir.if %[[VAL_55]] {
+// CHECK:               %[[VAL_56:.*]] = fir.load %[[VAL_1]] : !fir.ref<index>
+// CHECK:               %[[VAL_57:.*]] = arith.addi %[[VAL_56]], %[[VAL_28]] : index
+// CHECK:               fir.store %[[VAL_57]] to %[[VAL_1]] : !fir.ref<index>
+// CHECK:               %[[VAL_58:.*]] = hlfir.designate %[[VAL_31]]#0 (%[[VAL_56]])  : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
+// CHECK:               %[[VAL_59:.*]] = fir.load %[[VAL_58]] : !fir.ref<i32>
+// CHECK:               %[[VAL_60:.*]] = arith.addi %[[VAL_51]], %[[VAL_5]] : i32
+// CHECK:               %[[VAL_61:.*]] = fir.convert %[[VAL_60]] : (i32) -> i64
+// CHECK:               %[[VAL_62:.*]] = hlfir.designate %[[VAL_6]]#0 (%[[VAL_61]])  : (!fir.box<!fir.array<?xi32>>, i64) -> !fir.ref<i32>
+// CHECK:               hlfir.assign %[[VAL_59]] to %[[VAL_62]] : i32, !fir.ref<i32>
+// CHECK:             }
+// CHECK:           }
+// CHECK-DAG:       fir.freemem %[[VAL_18]] : !fir.heap<!fir.array<?xi1>>
+// CHECK-DAG:       fir.freemem %[[VAL_29]] : !fir.heap<!fir.array<?xi32>>
+// CHECK:           return
+// CHECK:         }
+
+func.func @test_where_mask_save(%arg0: !fir.box<!fir.array<?xi32>>) {
+  %c0 = arith.constant 0 : index
+  %c42_i32 = arith.constant 42 : i32
+  %0:2 = hlfir.declare %arg0 {uniq_name = "x"} : (!fir.box<!fir.array<?xi32>>) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
+  hlfir.where {
+    %1:3 = fir.box_dims %0#0, %c0 : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
+    %2 = fir.shape %1#1 : (index) -> !fir.shape<1>
+    %3 = hlfir.elemental %2 : (!fir.shape<1>) -> !hlfir.expr<?x!fir.logical<4>> {
+    ^bb0(%arg1: index):
+      %4 = hlfir.designate %0#0 (%arg1)  : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
+      %5 = fir.load %4 : !fir.ref<i32>
+      %6 = arith.cmpi sgt, %5, %c42_i32 : i32
+      %7 = fir.convert %6 : (i1) -> !fir.logical<4>
+      hlfir.yield_element %7 : !fir.logical<4>
+    }
+    hlfir.yield %3 : !hlfir.expr<?x!fir.logical<4>> cleanup {
+      hlfir.destroy %3 : !hlfir.expr<?x!fir.logical<4>>
+    }
+  } do {
+    hlfir.region_assign {
+      hlfir.yield %c42_i32 : i32
+    } to {
+      hlfir.yield %0#0 : !fir.box<!fir.array<?xi32>>
+    }
+  }
+  return
+}
+// CHECK-LABEL:   func.func @test_where_mask_save(
+// CHECK-SAME:                                    %[[VAL_0:.*]]: !fir.box<!fir.array<?xi32>>) {
+// CHECK:           %[[VAL_1:.*]] = arith.constant 0 : index
+// CHECK:           %[[VAL_2:.*]] = arith.constant 42 : i32
+// CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "x"} : (!fir.box<!fir.array<?xi32>>) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
+// CHECK:           %[[VAL_4:.*]]:3 = fir.box_dims %[[VAL_3]]#0, %[[VAL_1]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
+// CHECK:           %[[VAL_5:.*]] = fir.shape %[[VAL_4]]#1 : (index) -> !fir.shape<1>
+// CHECK:           %[[VAL_6:.*]] = hlfir.elemental %[[VAL_5]] : (!fir.shape<1>) -> !hlfir.expr<?x!fir.logical<4>> {
+// CHECK:           ^bb0(%[[VAL_7:.*]]: index):
+// CHECK:             %[[VAL_8:.*]] = hlfir.designate %[[VAL_3]]#0 (%[[VAL_7]])  : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
+// CHECK:             %[[VAL_9:.*]] = fir.load %[[VAL_8]] : !fir.ref<i32>
+// CHECK:             %[[VAL_10:.*]] = arith.cmpi sgt, %[[VAL_9]], %[[VAL_2]] : i32
+// CHECK:             %[[VAL_11:.*]] = fir.convert %[[VAL_10]] : (i1) -> !fir.logical<4>
+// CHECK:             hlfir.yield_element %[[VAL_11]] : !fir.logical<4>
+// CHECK:           }
+// CHECK:           %[[VAL_12:.*]]:3 = hlfir.associate %[[VAL_13:.*]](%[[VAL_5]]) {uniq_name = ".tmp.forall"} : (!hlfir.expr<?x!fir.logical<4>>, !fir.shape<1>) -> (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.ref<!fir.array<?x!fir.logical<4>>>, i1)
+// CHECK:           hlfir.destroy %[[VAL_13]] : !hlfir.expr<?x!fir.logical<4>>
+// CHECK:           %[[VAL_14:.*]] = arith.constant 1 : index
+// CHECK:           fir.do_loop %[[VAL_15:.*]] = %[[VAL_14]] to %[[VAL_4]]#1 step %[[VAL_14]] {
+// CHECK:             %[[VAL_16:.*]] = hlfir.designate %[[VAL_12]]#0 (%[[VAL_15]])  : (!fir.box<!fir.array<?x!fir.logical<4>>>, index) -> !fir.ref<!fir.logical<4>>
+// CHECK:             %[[VAL_17:.*]] = fir.load %[[VAL_16]] : !fir.ref<!fir.logical<4>>
+// CHECK:             %[[VAL_18:.*]] = fir.convert %[[VAL_17]] : (!fir.logical<4>) -> i1
+// CHECK:             fir.if %[[VAL_18]] {
+// CHECK:               %[[VAL_19:.*]] = hlfir.designate %[[VAL_3]]#0 (%[[VAL_15]])  : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
+// CHECK:               hlfir.assign %[[VAL_2]] to %[[VAL_19]] : i32, !fir.ref<i32>
+// CHECK:             }
+// CHECK:           }
+// CHECK:           hlfir.end_associate %[[VAL_12]]#1, %[[VAL_12]]#2 : !fir.ref<!fir.array<?x!fir.logical<4>>>, i1
+// CHECK:           return
+// CHECK:         }
+
+func.func @test_where_rhs_save(%x: !fir.ref<!fir.array<10xi32>>, %mask: !fir.ref<!fir.array<10x!fir.logical<4>>>) {
+  %c-1 = arith.constant -1 : index
+  %c1 = arith.constant 1 : index
+  %c10 = arith.constant 10 : index
+  %1 = fir.shape %c10 : (index) -> !fir.shape<1>
+  hlfir.where {
+    hlfir.yield %mask : !fir.ref<!fir.array<10x!fir.logical<4>>>
+  } do {
+    hlfir.region_assign {
+      %2 = hlfir.designate %x (%c10:%c1:%c-1)  shape %1 :
+(!fir.ref<!fir.array<10xi32>>, index, index, index, !fir.shape<1>) -> !fir.ref<!fir.array<10xi32>>
+      hlfir.yield %2 : !fir.ref<!fir.array<10xi32>>
+    } to {
+      hlfir.yield %x : !fir.ref<!fir.array<10xi32>>
+    }
+  }
+  return
+}
+// CHECK-LABEL:   func.func @test_where_rhs_save(
+// CHECK-SAME:                                   %[[VAL_0:.*]]: !fir.ref<!fir.array<10xi32>>,
+// CHECK-SAME:                                   %[[VAL_1:.*]]: !fir.ref<!fir.array<10x!fir.logical<4>>>) {
+// CHECK:           %[[VAL_2:.*]] = fir.alloca index
+// CHECK:           %[[VAL_3:.*]] = arith.constant -1 : index
+// CHECK:           %[[VAL_4:.*]] = arith.constant 1 : index
+// CHECK:           %[[VAL_5:.*]] = arith.constant 10 : index
+// CHECK:           %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1>
+// CHECK:           %[[VAL_7:.*]] = arith.constant 10 : index
+// CHECK:           %[[VAL_8:.*]] = fir.shape %[[VAL_7]] : (index) -> !fir.shape<1>
+// CHECK:           %[[VAL_9:.*]] = arith.constant 1 : index
+// CHECK:           %[[VAL_10:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_5]]:%[[VAL_4]]:%[[VAL_3]])  shape %[[VAL_6]] : (!fir.ref<!fir.array<10xi32>>, index, index, index, !fir.shape<1>) -> !fir.ref<!fir.array<10xi32>>
+// CHECK:           %[[VAL_11:.*]] = arith.constant 0 : index
+// CHECK:           %[[VAL_12:.*]] = arith.subi %[[VAL_7]], %[[VAL_9]] : index
+// CHECK:           %[[VAL_13:.*]] = arith.addi %[[VAL_12]], %[[VAL_9]] : index
+// CHECK:           %[[VAL_14:.*]] = arith.divsi %[[VAL_13]], %[[VAL_9]] : index
+// CHECK:           %[[VAL_15:.*]] = arith.cmpi sgt, %[[VAL_14]], %[[VAL_11]] : index
+// CHECK:           %[[VAL_16:.*]] = arith.select %[[VAL_15]], %[[VAL_14]], %[[VAL_11]] : index
+// CHECK:           %[[VAL_17:.*]] = arith.constant 1 : index
+// CHECK:           %[[VAL_18:.*]] = arith.constant 1 : index
+// CHECK:           fir.store %[[VAL_17]] to %[[VAL_2]] : !fir.ref<index>
+// CHECK:           %[[VAL_19:.*]] = fir.allocmem !fir.array<?xi32>, %[[VAL_16]] {bindc_name = ".tmp.forall", uniq_name = ""}
+// CHECK:           %[[VAL_20:.*]] = fir.shape %[[VAL_16]] : (index) -> !fir.shape<1>
+// CHECK:           %[[VAL_21:.*]]:2 = hlfir.declare %[[VAL_19]](%[[VAL_20]]) {uniq_name = ".tmp.forall"} : (!fir.heap<!fir.array<?xi32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xi32>>, !fir.heap<!fir.array<?xi32>>)
+// CHECK:           fir.do_loop %[[VAL_22:.*]] = %[[VAL_9]] to %[[VAL_7]] step %[[VAL_9]] {
+// CHECK:             %[[VAL_23:.*]] = hlfir.designate %[[VAL_1]] (%[[VAL_22]])  : (!fir.ref<!fir.array<10x!fir.logical<4>>>, index) -> !fir.ref<!fir.logical<4>>
+// CHECK:             %[[VAL_24:.*]] = fir.load %[[VAL_23]] : !fir.ref<!fir.logical<4>>
+// CHECK:             %[[VAL_25:.*]] = fir.convert %[[VAL_24]] : (!fir.logical<4>) -> i1
+// CHECK:             fir.if %[[VAL_25]] {
+// CHECK:               %[[VAL_26:.*]] = hlfir.designate %[[VAL_10]] (%[[VAL_22]])  : (!fir.ref<!fir.array<10xi32>>, index) -> !fir.ref<i32>
+// CHECK:               %[[VAL_27:.*]] = fir.load %[[VAL_26]] : !fir.ref<i32>
+// CHECK:               %[[VAL_28:.*]] = fir.load %[[VAL_2]] : !fir.ref<index>
+// CHECK:               %[[VAL_29:.*]] = arith.addi %[[VAL_28]], %[[VAL_18]] : index
+// CHECK:               fir.store %[[VAL_29]] to %[[VAL_2]] : !fir.ref<index>
+// CHECK:               %[[VAL_30:.*]] = hlfir.designate %[[VAL_21]]#0 (%[[VAL_28]])  : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
+// CHECK:               hlfir.assign %[[VAL_27]] to %[[VAL_30]] : i32, !fir.ref<i32>
+// CHECK:             }
+// CHECK:           }
+// CHECK:           %[[VAL_31:.*]] = arith.constant 10 : index
+// CHECK:           %[[VAL_32:.*]] = fir.shape %[[VAL_31]] : (index) -> !fir.shape<1>
+// CHECK:           %[[VAL_33:.*]] = arith.constant 1 : index
+// CHECK:           fir.store %[[VAL_17]] to %[[VAL_2]] : !fir.ref<index>
+// CHECK:           fir.do_loop %[[VAL_34:.*]] = %[[VAL_33]] to %[[VAL_31]] step %[[VAL_33]] {
+// CHECK:             %[[VAL_35:.*]] = hlfir.designate %[[VAL_1]] (%[[VAL_34]])  : (!fir.ref<!fir.array<10x!fir.logical<4>>>, index) -> !fir.ref<!fir.logical<4>>
+// CHECK:             %[[VAL_36:.*]] = fir.load %[[VAL_35]] : !fir.ref<!fir.logical<4>>
+// CHECK:             %[[VAL_37:.*]] = fir.convert %[[VAL_36]] : (!fir.logical<4>) -> i1
+// CHECK:             fir.if %[[VAL_37]] {
+// CHECK:               %[[VAL_38:.*]] = fir.load %[[VAL_2]] : !fir.ref<index>
+// CHECK:               %[[VAL_39:.*]] = arith.addi %[[VAL_38]], %[[VAL_18]] : index
+// CHECK:               fir.store %[[VAL_39]] to %[[VAL_2]] : !fir.ref<index>
+// CHECK:               %[[VAL_40:.*]] = hlfir.designate %[[VAL_21]]#0 (%[[VAL_38]])  : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
+// CHECK:               %[[VAL_41:.*]] = fir.load %[[VAL_40]] : !fir.ref<i32>
+// CHECK:               %[[VAL_42:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_34]])  : (!fir.ref<!fir.array<10xi32>>, index) -> !fir.ref<i32>
+// CHECK:               hlfir.assign %[[VAL_41]] to %[[VAL_42]] : i32, !fir.ref<i32>
+// CHECK:             }
+// CHECK:           }
+// CHECK:           fir.freemem %[[VAL_19]] : !fir.heap<!fir.array<?xi32>>
+// CHECK:           return
+// CHECK:         }
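
As a hedged illustration (hypothetical source, not part of the patch),
test_where_rhs_save above corresponds to a masked assignment whose right-hand
side overlaps its left-hand side, so the selected RHS values are stacked in a
temporary during a first pass before the assignment itself runs:

  subroutine where_rhs_overlap(x, mask)
    integer :: x(10)
    logical :: mask(10)
    ! x(10:1:-1) reads elements that the assignment also writes.
    where (mask) x = x(10:1:-1)
  end subroutine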

diff  --git a/flang/test/HLFIR/ordered-assignments-codegen-todo.fir b/flang/test/HLFIR/ordered-assignments-codegen-todo.fir
deleted file mode 100644
index 6557a03219fb3..0000000000000
--- a/flang/test/HLFIR/ordered-assignments-codegen-todo.fir
+++ /dev/null
@@ -1,24 +0,0 @@
-// Just test that Ordered assignment pass TODOs are properly reported.
-// RUN: %not_todo_cmd fir-opt --lower-hlfir-ordered-assignments %s 2>&1 | FileCheck %s
-
-
-// CHECK: not yet implemented: creating temporary storage in FORALL or WHERE constructs
-
-func.func @forall_todo(%arg0: !fir.ref<!fir.array<10xf32>>) {
-  %c1 = arith.constant 1 : index
-  %c10 = arith.constant 10 : index
-  hlfir.forall lb {
-    hlfir.yield %c1 : index
-  } ub {
-    hlfir.yield %c10 : index
-  }  (%arg2: i64) {
-    hlfir.region_assign {
-      %1 = hlfir.designate %arg0 (%arg2)  : (!fir.ref<!fir.array<10xf32>>, i64) -> !fir.ref<f32>
-      hlfir.yield %1 : !fir.ref<f32>
-    } to {
-      %1 = hlfir.designate %arg0 (%arg2)  : (!fir.ref<!fir.array<10xf32>>, i64) -> !fir.ref<f32>
-      hlfir.yield %1 : !fir.ref<f32>
-    }
-  }
-  return
-}


        

