[clang] cc14bf2 - [flang] add a pass to move array temporaries to the stack

Tue Feb 7 02:28:29 PST 2023

Author: Tom Eccles
Date: 2023-02-07T10:27:52Z
New Revision: cc14bf22bddf7abb1f13099c2d6ace5cdb8b7a5f

URL: https://github.com/llvm/llvm-project/commit/cc14bf22bddf7abb1f13099c2d6ace5cdb8b7a5f
DIFF: https://github.com/llvm/llvm-project/commit/cc14bf22bddf7abb1f13099c2d6ace5cdb8b7a5f.diff

LOG: [flang] add a pass to move array temporaries to the stack

This pass implements the `-fstack-arrays` flag. See the RFC in
`flang/docs/fstack-arrays.md` for more information.

Differential revision: https://reviews.llvm.org/D140415

Added: 
    flang/lib/Optimizer/Transforms/StackArrays.cpp
    flang/test/Transforms/stack-arrays.f90
    flang/test/Transforms/stack-arrays.fir

Modified: 
    clang/docs/tools/clang-formatted-files.txt
    flang/include/flang/Optimizer/Builder/MutableBox.h
    flang/include/flang/Optimizer/Dialect/FIRAttr.h
    flang/include/flang/Optimizer/Transforms/Passes.h
    flang/include/flang/Optimizer/Transforms/Passes.td
    flang/lib/Lower/Allocatable.cpp
    flang/lib/Optimizer/Builder/MutableBox.cpp
    flang/lib/Optimizer/Transforms/CMakeLists.txt
    flang/test/Lower/HLFIR/allocatable-and-pointer-status-change.f90
    flang/test/Lower/Intrinsics/c_loc.f90
    flang/test/Lower/Intrinsics/system_clock.f90

Removed: 
    


################################################################################
diff --git a/clang/docs/tools/clang-formatted-files.txt b/clang/docs/tools/clang-formatted-files.txt
index 0cea49221d949..45970232d0c0f 100644

--- a/clang/docs/tools/clang-formatted-files.txt
+++ b/clang/docs/tools/clang-formatted-files.txt
@@ -2299,6 +2299,7 @@ flang/lib/Optimizer/Transforms/MemoryAllocation.cpp
 flang/lib/Optimizer/Transforms/MemRefDataFlowOpt.cpp
 flang/lib/Optimizer/Transforms/PassDetail.h
 flang/lib/Optimizer/Transforms/RewriteLoop.cpp
+flang/lib/Optimizer/Transforms/StackArrays.cpp
 flang/lib/Parser/basic-parsers.h
 flang/lib/Parser/char-block.cpp
 flang/lib/Parser/char-buffer.cpp

diff --git a/flang/include/flang/Optimizer/Builder/MutableBox.h b/flang/include/flang/Optimizer/Builder/MutableBox.h
index d49d2e7ae8119..3f3354d93530e 100644
--- a/flang/include/flang/Optimizer/Builder/MutableBox.h
+++ b/flang/include/flang/Optimizer/Builder/MutableBox.h
@@ -127,8 +127,8 @@ void genFinalization(fir::FirOpBuilder &builder, mlir::Location loc,
 void genInlinedAllocation(fir::FirOpBuilder &builder, mlir::Location loc,
                           const fir::MutableBoxValue &box,
                           mlir::ValueRange lbounds, mlir::ValueRange extents,
-                          mlir::ValueRange lenParams,
-                          llvm::StringRef allocName);
+                          mlir::ValueRange lenParams, llvm::StringRef allocName,
+                          bool mustBeHeap = false);
 
 void genInlinedDeallocate(fir::FirOpBuilder &builder, mlir::Location loc,
                           const fir::MutableBoxValue &box);

diff --git a/flang/include/flang/Optimizer/Dialect/FIRAttr.h b/flang/include/flang/Optimizer/Dialect/FIRAttr.h
index f88d6c6a4f97f..2b14e15c906c3 100644
--- a/flang/include/flang/Optimizer/Dialect/FIRAttr.h
+++ b/flang/include/flang/Optimizer/Dialect/FIRAttr.h
@@ -57,6 +57,15 @@ class SubclassAttr
   mlir::Type getType() const;
 };
 
+/// Attribute which can be applied to a fir.allocmem operation, specifying that
+/// the allocation may not be moved to the heap by passes
+class MustBeHeapAttr : public mlir::BoolAttr {
+public:
+  using BoolAttr::BoolAttr;
+
+  static constexpr llvm::StringRef getAttrName() { return "fir.must_be_heap"; }
+};
+
 // Attributes for building SELECT CASE multiway branches
 
 /// A closed interval (including the bound values) is an interval with both an

diff --git a/flang/include/flang/Optimizer/Transforms/Passes.h b/flang/include/flang/Optimizer/Transforms/Passes.h
index 54a5d958d7315..efe55cecce30b 100644
--- a/flang/include/flang/Optimizer/Transforms/Passes.h
+++ b/flang/include/flang/Optimizer/Transforms/Passes.h
@@ -55,6 +55,7 @@ std::unique_ptr<mlir::Pass> createExternalNameConversionPass();
 std::unique_ptr<mlir::Pass> createMemDataFlowOptPass();
 std::unique_ptr<mlir::Pass> createPromoteToAffinePass();
 std::unique_ptr<mlir::Pass> createMemoryAllocationPass();
+std::unique_ptr<mlir::Pass> createStackArraysPass();
 std::unique_ptr<mlir::Pass> createSimplifyIntrinsicsPass();
 std::unique_ptr<mlir::Pass> createAddDebugFoundationPass();
 

diff --git a/flang/include/flang/Optimizer/Transforms/Passes.td b/flang/include/flang/Optimizer/Transforms/Passes.td
index 85a412ab046d6..13a73667965e5 100644
--- a/flang/include/flang/Optimizer/Transforms/Passes.td
+++ b/flang/include/flang/Optimizer/Transforms/Passes.td
@@ -235,6 +235,16 @@ def MemoryAllocationOpt : Pass<"memory-allocation-opt", "mlir::func::FuncOp"> {
   let constructor = "::fir::createMemoryAllocationPass()";
 }
 
+def StackArrays : Pass<"stack-arrays", "mlir::ModuleOp"> {
+  let summary = "Move local array allocations from heap memory into stack memory";
+  let description = [{
+    Convert heap allocations for arrays, even those of unknown size, into stack
+    allocations.
+  }];
+  let dependentDialects = [ "fir::FIROpsDialect" ];
+  let constructor = "::fir::createStackArraysPass()";
+}
+
 def SimplifyRegionLite : Pass<"simplify-region-lite", "mlir::ModuleOp"> {
   let summary = "Region simplification";
   let description = [{

diff --git a/flang/lib/Lower/Allocatable.cpp b/flang/lib/Lower/Allocatable.cpp
index 610b43df7ce99..36bade3ea87fa 100644
--- a/flang/lib/Lower/Allocatable.cpp
+++ b/flang/lib/Lower/Allocatable.cpp
@@ -428,7 +428,8 @@ class AllocateStmtHelper {
       }
     }
     fir::factory::genInlinedAllocation(builder, loc, box, lbounds, extents,
-                                       lenParams, mangleAlloc(alloc));
+                                       lenParams, mangleAlloc(alloc),
+                                       /*mustBeHeap=*/true);
   }
 
   void genSimpleAllocation(const Allocation &alloc,

diff --git a/flang/lib/Optimizer/Builder/MutableBox.cpp b/flang/lib/Optimizer/Builder/MutableBox.cpp
index 44b04c0b20516..1dcb8acd3f8a3 100644
--- a/flang/lib/Optimizer/Builder/MutableBox.cpp
+++ b/flang/lib/Optimizer/Builder/MutableBox.cpp
@@ -16,6 +16,7 @@
 #include "flang/Optimizer/Builder/Runtime/Derived.h"
 #include "flang/Optimizer/Builder/Runtime/Stop.h"
 #include "flang/Optimizer/Builder/Todo.h"
+#include "flang/Optimizer/Dialect/FIRAttr.h"
 #include "flang/Optimizer/Dialect/FIROps.h"
 #include "flang/Optimizer/Dialect/FIROpsSupport.h"
 #include "flang/Optimizer/Support/FatalError.h"
@@ -719,13 +720,11 @@ static mlir::Value allocateAndInitNewStorage(fir::FirOpBuilder &builder,
   return newStorage;
 }
 
-void fir::factory::genInlinedAllocation(fir::FirOpBuilder &builder,
-                                        mlir::Location loc,
-                                        const fir::MutableBoxValue &box,
-                                        mlir::ValueRange lbounds,
-                                        mlir::ValueRange extents,
-                                        mlir::ValueRange lenParams,
-                                        llvm::StringRef allocName) {
+void fir::factory::genInlinedAllocation(
+    fir::FirOpBuilder &builder, mlir::Location loc,
+    const fir::MutableBoxValue &box, mlir::ValueRange lbounds,
+    mlir::ValueRange extents, mlir::ValueRange lenParams,
+    llvm::StringRef allocName, bool mustBeHeap) {
   auto lengths = getNewLengths(builder, loc, box, lenParams);
   llvm::SmallVector<mlir::Value> safeExtents;
   for (mlir::Value extent : extents)
@@ -742,6 +741,9 @@ void fir::factory::genInlinedAllocation(fir::FirOpBuilder &builder,
     mlir::Value irBox = fir::factory::getMutableIRBox(builder, loc, box);
     fir::runtime::genDerivedTypeInitialize(builder, loc, irBox);
   }
+
+  heap->setAttr(fir::MustBeHeapAttr::getAttrName(),
+                fir::MustBeHeapAttr::get(builder.getContext(), mustBeHeap));
 }
 
 void fir::factory::genInlinedDeallocate(fir::FirOpBuilder &builder,

diff --git a/flang/lib/Optimizer/Transforms/CMakeLists.txt b/flang/lib/Optimizer/Transforms/CMakeLists.txt
index 8ec683aac9480..c283821573395 100644
--- a/flang/lib/Optimizer/Transforms/CMakeLists.txt
+++ b/flang/lib/Optimizer/Transforms/CMakeLists.txt
@@ -8,6 +8,7 @@ add_flang_library(FIRTransforms
   ArrayValueCopy.cpp
   ExternalNameConversion.cpp
   MemoryAllocation.cpp
+  StackArrays.cpp
   MemRefDataFlowOpt.cpp
   SimplifyRegionLite.cpp
   AlgebraicSimplification.cpp

diff --git a/flang/lib/Optimizer/Transforms/StackArrays.cpp b/flang/lib/Optimizer/Transforms/StackArrays.cpp
new file mode 100644
index 0000000000000..876760c45f551
--- /dev/null
+++ b/flang/lib/Optimizer/Transforms/StackArrays.cpp
@@ -0,0 +1,773 @@
+//===- StackArrays.cpp ----------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang/Optimizer/Builder/FIRBuilder.h"
+#include "flang/Optimizer/Builder/LowLevelIntrinsics.h"
+#include "flang/Optimizer/Dialect/FIRAttr.h"
+#include "flang/Optimizer/Dialect/FIRDialect.h"
+#include "flang/Optimizer/Dialect/FIROps.h"
+#include "flang/Optimizer/Dialect/FIRType.h"
+#include "flang/Optimizer/Support/FIRContext.h"
+#include "flang/Optimizer/Transforms/Passes.h"
+#include "mlir/Analysis/DataFlow/ConstantPropagationAnalysis.h"
+#include "mlir/Analysis/DataFlow/DeadCodeAnalysis.h"
+#include "mlir/Analysis/DataFlow/DenseAnalysis.h"
+#include "mlir/Analysis/DataFlowFramework.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/OpenMP/OpenMPDialect.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/Diagnostics.h"
+#include "mlir/IR/Value.h"
+#include "mlir/Interfaces/LoopLikeInterface.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Support/LogicalResult.h"
+#include "mlir/Transforms/DialectConversion.h"
+#include "mlir/Transforms/Passes.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/PointerUnion.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/raw_ostream.h"
+#include <optional>
+
+namespace fir {
+#define GEN_PASS_DEF_STACKARRAYS
+#include "flang/Optimizer/Transforms/Passes.h.inc"
+} // namespace fir
+
+#define DEBUG_TYPE "stack-arrays"
+
+namespace {
+
+/// The state of an SSA value at each program point
+enum class AllocationState {
+  /// This means that the allocation state of a variable cannot be determined
+  /// at this program point, e.g. because one route through a conditional freed
+  /// the variable and the other route didn't.
+  /// This asserts a known-unknown: different from the unknown-unknown of having
+  /// no AllocationState stored for a particular SSA value
+  Unknown,
+  /// Means this SSA value was allocated on the heap in this function and has
+  /// now been freed
+  Freed,
+  /// Means this SSA value was allocated on the heap in this function and is a
+  /// candidate for moving to the stack
+  Allocated,
+};
+
+/// Stores where an alloca should be inserted. If the PointerUnion is an
+/// Operation the alloca should be inserted /after/ the operation. If it is a
+/// block, the alloca can be placed anywhere in that block.
+class InsertionPoint {
+  llvm::PointerUnion<mlir::Operation *, mlir::Block *> location;
+  bool saveRestoreStack;
+
+  /// Get contained pointer type or nullptr
+  template <class T>
+  T *tryGetPtr() const {
+    if (location.is<T *>())
+      return location.get<T *>();
+    return nullptr;
+  }
+
+public:
+  template <class T>
+  InsertionPoint(T *ptr, bool saveRestoreStack = false)
+      : location(ptr), saveRestoreStack{saveRestoreStack} {}
+  InsertionPoint(std::nullptr_t null)
+      : location(null), saveRestoreStack{false} {}
+
+  /// Get contained operation, or nullptr
+  mlir::Operation *tryGetOperation() const {
+    return tryGetPtr<mlir::Operation>();
+  }
+
+  /// Get contained block, or nullptr
+  mlir::Block *tryGetBlock() const { return tryGetPtr<mlir::Block>(); }
+
+  /// Get whether the stack should be saved/restored. If yes, an llvm.stacksave
+  /// intrinsic should be added before the alloca, and an llvm.stackrestore
+  /// intrinsic should be added where the freemem is
+  bool shouldSaveRestoreStack() const { return saveRestoreStack; }
+
+  operator bool() const { return tryGetOperation() || tryGetBlock(); }
+
+  bool operator==(const InsertionPoint &rhs) const {
+    return (location == rhs.location) &&
+           (saveRestoreStack == rhs.saveRestoreStack);
+  }
+
+  bool operator!=(const InsertionPoint &rhs) const { return !(*this == rhs); }
+};
+
+/// Maps SSA values to their AllocationState at a particular program point.
+/// Also caches the insertion points for the new alloca operations
+class LatticePoint : public mlir::dataflow::AbstractDenseLattice {
+  // Maps all values we are interested in to states
+  llvm::SmallDenseMap<mlir::Value, AllocationState, 1> stateMap;
+
+public:
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(LatticePoint)
+  using AbstractDenseLattice::AbstractDenseLattice;
+
+  bool operator==(const LatticePoint &rhs) const {
+    return stateMap == rhs.stateMap;
+  }
+
+  /// Join the lattice accross control-flow edges
+  mlir::ChangeResult join(const AbstractDenseLattice &lattice) override;
+
+  void print(llvm::raw_ostream &os) const override;
+
+  /// Clear all modifications
+  mlir::ChangeResult reset();
+
+  /// Set the state of an SSA value
+  mlir::ChangeResult set(mlir::Value value, AllocationState state);
+
+  /// Get fir.allocmem ops which were allocated in this function and always
+  /// freed before the function returns, plus whre to insert replacement
+  /// fir.alloca ops
+  void appendFreedValues(llvm::DenseSet<mlir::Value> &out) const;
+
+  std::optional<AllocationState> get(mlir::Value val) const;
+};
+
+class AllocationAnalysis
+    : public mlir::dataflow::DenseDataFlowAnalysis<LatticePoint> {
+public:
+  using DenseDataFlowAnalysis::DenseDataFlowAnalysis;
+
+  void visitOperation(mlir::Operation *op, const LatticePoint &before,
+                      LatticePoint *after) override;
+
+  /// At an entry point, the last modifications of all memory resources are
+  /// yet to be determined
+  void setToEntryState(LatticePoint *lattice) override;
+
+protected:
+  /// Visit control flow operations and decide whether to call visitOperation
+  /// to apply the transfer function
+  void processOperation(mlir::Operation *op) override;
+};
+
+/// Drives analysis to find candidate fir.allocmem operations which could be
+/// moved to the stack. Intended to be used with mlir::Pass::getAnalysis
+class StackArraysAnalysisWrapper {
+public:
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(StackArraysAnalysisWrapper)
+
+  // Maps fir.allocmem -> place to insert alloca
+  using AllocMemMap = llvm::DenseMap<mlir::Operation *, InsertionPoint>;
+
+  StackArraysAnalysisWrapper(mlir::Operation *op) {}
+
+  bool hasErrors() const;
+
+  const AllocMemMap &getCandidateOps(mlir::Operation *func);
+
+private:
+  llvm::DenseMap<mlir::Operation *, AllocMemMap> funcMaps;
+  bool gotError = false;
+
+  void analyseFunction(mlir::Operation *func);
+};
+
+/// Converts a fir.allocmem to a fir.alloca
+class AllocMemConversion : public mlir::OpRewritePattern<fir::AllocMemOp> {
+public:
+  using OpRewritePattern::OpRewritePattern;
+
+  AllocMemConversion(
+      mlir::MLIRContext *ctx,
+      const llvm::DenseMap<mlir::Operation *, InsertionPoint> &candidateOps);
+
+  mlir::LogicalResult
+  matchAndRewrite(fir::AllocMemOp allocmem,
+                  mlir::PatternRewriter &rewriter) const override;
+
+  /// Determine where to insert the alloca operation. The returned value should
+  /// be checked to see if it is inside a loop
+  static InsertionPoint findAllocaInsertionPoint(fir::AllocMemOp &oldAlloc);
+
+private:
+  /// allocmem operations that DFA has determined are safe to move to the stack
+  /// mapping to where to insert replacement freemem operations
+  const llvm::DenseMap<mlir::Operation *, InsertionPoint> &candidateOps;
+
+  /// If we failed to find an insertion point not inside a loop, see if it would
+  /// be safe to use an llvm.stacksave/llvm.stackrestore inside the loop
+  static InsertionPoint findAllocaLoopInsertionPoint(fir::AllocMemOp &oldAlloc);
+
+  /// Returns the alloca if it was successfully inserted, otherwise {}
+  std::optional<fir::AllocaOp>
+  insertAlloca(fir::AllocMemOp &oldAlloc,
+               mlir::PatternRewriter &rewriter) const;
+
+  /// Inserts a stacksave before oldAlloc and a stackrestore after each freemem
+  void insertStackSaveRestore(fir::AllocMemOp &oldAlloc,
+                              mlir::PatternRewriter &rewriter) const;
+};
+
+class StackArraysPass : public fir::impl::StackArraysBase<StackArraysPass> {
+public:
+  StackArraysPass() = default;
+  StackArraysPass(const StackArraysPass &pass);
+
+  llvm::StringRef getDescription() const override;
+
+  void runOnOperation() override;
+  void runOnFunc(mlir::Operation *func);
+
+private:
+  Statistic runCount{this, "stackArraysRunCount",
+                     "Number of heap allocations moved to the stack"};
+};
+
+} // namespace
+
+static void print(llvm::raw_ostream &os, AllocationState state) {
+  switch (state) {
+  case AllocationState::Unknown:
+    os << "Unknown";
+    break;
+  case AllocationState::Freed:
+    os << "Freed";
+    break;
+  case AllocationState::Allocated:
+    os << "Allocated";
+    break;
+  }
+}
+
+/// Join two AllocationStates for the same value coming from different CFG
+/// blocks
+static AllocationState join(AllocationState lhs, AllocationState rhs) {
+  //           | Allocated | Freed     | Unknown
+  // ========= | ========= | ========= | =========
+  // Allocated | Allocated | Unknown   | Unknown
+  // Freed     | Unknown   | Freed     | Unknown
+  // Unknown   | Unknown   | Unknown   | Unknown
+  if (lhs == rhs)
+    return lhs;
+  return AllocationState::Unknown;
+}
+
+mlir::ChangeResult LatticePoint::join(const AbstractDenseLattice &lattice) {
+  const auto &rhs = static_cast<const LatticePoint &>(lattice);
+  mlir::ChangeResult changed = mlir::ChangeResult::NoChange;
+
+  // add everything from rhs to map, handling cases where values are in both
+  for (const auto &[value, rhsState] : rhs.stateMap) {
+    auto it = stateMap.find(value);
+    if (it != stateMap.end()) {
+      // value is present in both maps
+      AllocationState myState = it->second;
+      AllocationState newState = ::join(myState, rhsState);
+      if (newState != myState) {
+        changed = mlir::ChangeResult::Change;
+        it->getSecond() = newState;
+      }
+    } else {
+      // value not present in current map: add it
+      stateMap.insert({value, rhsState});
+      changed = mlir::ChangeResult::Change;
+    }
+  }
+
+  return changed;
+}
+
+void LatticePoint::print(llvm::raw_ostream &os) const {
+  for (const auto &[value, state] : stateMap) {
+    os << value << ": ";
+    ::print(os, state);
+  }
+}
+
+mlir::ChangeResult LatticePoint::reset() {
+  if (stateMap.empty())
+    return mlir::ChangeResult::NoChange;
+  stateMap.clear();
+  return mlir::ChangeResult::Change;
+}
+
+mlir::ChangeResult LatticePoint::set(mlir::Value value, AllocationState state) {
+  if (stateMap.count(value)) {
+    // already in map
+    AllocationState &oldState = stateMap[value];
+    if (oldState != state) {
+      stateMap[value] = state;
+      return mlir::ChangeResult::Change;
+    }
+    return mlir::ChangeResult::NoChange;
+  }
+  stateMap.insert({value, state});
+  return mlir::ChangeResult::Change;
+}
+
+/// Get values which were allocated in this function and always freed before
+/// the function returns
+void LatticePoint::appendFreedValues(llvm::DenseSet<mlir::Value> &out) const {
+  for (auto &[value, state] : stateMap) {
+    if (state == AllocationState::Freed)
+      out.insert(value);
+  }
+}
+
+std::optional<AllocationState> LatticePoint::get(mlir::Value val) const {
+  auto it = stateMap.find(val);
+  if (it == stateMap.end())
+    return {};
+  return it->second;
+}
+
+void AllocationAnalysis::visitOperation(mlir::Operation *op,
+                                        const LatticePoint &before,
+                                        LatticePoint *after) {
+  LLVM_DEBUG(llvm::dbgs() << "StackArrays: Visiting operation: " << *op
+                          << "\n");
+  LLVM_DEBUG(llvm::dbgs() << "--Lattice in: " << before << "\n");
+
+  // propagate before -> after
+  mlir::ChangeResult changed = after->join(before);
+
+  if (auto allocmem = mlir::dyn_cast<fir::AllocMemOp>(op)) {
+    assert(op->getNumResults() == 1 && "fir.allocmem has one result");
+    auto attr = op->getAttrOfType<fir::MustBeHeapAttr>(
+        fir::MustBeHeapAttr::getAttrName());
+    if (attr && attr.getValue()) {
+      LLVM_DEBUG(llvm::dbgs() << "--Found fir.must_be_heap: skipping\n");
+      // skip allocation marked not to be moved
+      return;
+    }
+
+    auto retTy = allocmem.getAllocatedType();
+    if (!retTy.isa<fir::SequenceType>()) {
+      LLVM_DEBUG(llvm::dbgs()
+                 << "--Allocation is not for an array: skipping\n");
+      return;
+    }
+
+    mlir::Value result = op->getResult(0);
+    changed |= after->set(result, AllocationState::Allocated);
+  } else if (mlir::isa<fir::FreeMemOp>(op)) {
+    assert(op->getNumOperands() == 1 && "fir.freemem has one operand");
+    mlir::Value operand = op->getOperand(0);
+    std::optional<AllocationState> operandState = before.get(operand);
+    if (operandState && *operandState == AllocationState::Allocated) {
+      // don't tag things not allocated in this function as freed, so that we
+      // don't think they are candidates for moving to the stack
+      changed |= after->set(operand, AllocationState::Freed);
+    }
+  } else if (mlir::isa<fir::ResultOp>(op)) {
+    mlir::Operation *parent = op->getParentOp();
+    LatticePoint *parentLattice = getLattice(parent);
+    assert(parentLattice);
+    mlir::ChangeResult parentChanged = parentLattice->join(*after);
+    propagateIfChanged(parentLattice, parentChanged);
+  }
+
+  // we pass lattices straight through fir.call because called functions should
+  // not deallocate flang-generated array temporaries
+
+  LLVM_DEBUG(llvm::dbgs() << "--Lattice out: " << *after << "\n");
+  propagateIfChanged(after, changed);
+}
+
+void AllocationAnalysis::setToEntryState(LatticePoint *lattice) {
+  propagateIfChanged(lattice, lattice->reset());
+}
+
+/// Mostly a copy of AbstractDenseLattice::processOperation - the difference
+/// being that call operations are passed through to the transfer function
+void AllocationAnalysis::processOperation(mlir::Operation *op) {
+  // If the containing block is not executable, bail out.
+  if (!getOrCreateFor<mlir::dataflow::Executable>(op, op->getBlock())->isLive())
+    return;
+
+  // Get the dense lattice to update
+  mlir::dataflow::AbstractDenseLattice *after = getLattice(op);
+
+  // If this op implements region control-flow, then control-flow dictates its
+  // transfer function.
+  if (auto branch = mlir::dyn_cast<mlir::RegionBranchOpInterface>(op))
+    return visitRegionBranchOperation(op, branch, after);
+
+  // pass call operations through to the transfer function
+
+  // Get the dense state before the execution of the op.
+  const mlir::dataflow::AbstractDenseLattice *before;
+  if (mlir::Operation *prev = op->getPrevNode())
+    before = getLatticeFor(op, prev);
+  else
+    before = getLatticeFor(op, op->getBlock());
+
+  /// Invoke the operation transfer function
+  visitOperationImpl(op, *before, after);
+}
+
+void StackArraysAnalysisWrapper::analyseFunction(mlir::Operation *func) {
+  assert(mlir::isa<mlir::func::FuncOp>(func));
+  mlir::DataFlowSolver solver;
+  // constant propagation is required for dead code analysis, dead code analysis
+  // is required to mark blocks live (required for mlir dense dfa)
+  solver.load<mlir::dataflow::SparseConstantPropagation>();
+  solver.load<mlir::dataflow::DeadCodeAnalysis>();
+
+  auto [it, inserted] = funcMaps.try_emplace(func);
+  AllocMemMap &candidateOps = it->second;
+
+  solver.load<AllocationAnalysis>();
+  if (failed(solver.initializeAndRun(func))) {
+    llvm::errs() << "DataFlowSolver failed!";
+    gotError = true;
+    return;
+  }
+
+  LatticePoint point{func};
+  func->walk([&](mlir::func::ReturnOp child) {
+    const LatticePoint *lattice = solver.lookupState<LatticePoint>(child);
+    // there will be no lattice for an unreachable block
+    if (lattice)
+      point.join(*lattice);
+  });
+  llvm::DenseSet<mlir::Value> freedValues;
+  point.appendFreedValues(freedValues);
+
+  // We only replace allocations which are definately freed on all routes
+  // through the function because otherwise the allocation may have an intende
+  // lifetime longer than the current stack frame (e.g. a heap allocation which
+  // is then freed by another function).
+  for (mlir::Value freedValue : freedValues) {
+    fir::AllocMemOp allocmem = freedValue.getDefiningOp<fir::AllocMemOp>();
+    InsertionPoint insertionPoint =
+        AllocMemConversion::findAllocaInsertionPoint(allocmem);
+    if (insertionPoint)
+      candidateOps.insert({allocmem, insertionPoint});
+  }
+
+  LLVM_DEBUG(for (auto [allocMemOp, _]
+                  : candidateOps) {
+    llvm::dbgs() << "StackArrays: Found candidate op: " << *allocMemOp << '\n';
+  });
+}
+
+bool StackArraysAnalysisWrapper::hasErrors() const { return gotError; }
+
+const StackArraysAnalysisWrapper::AllocMemMap &
+StackArraysAnalysisWrapper::getCandidateOps(mlir::Operation *func) {
+  if (!funcMaps.count(func))
+    analyseFunction(func);
+  return funcMaps[func];
+}
+
+AllocMemConversion::AllocMemConversion(
+    mlir::MLIRContext *ctx,
+    const llvm::DenseMap<mlir::Operation *, InsertionPoint> &candidateOps)
+    : OpRewritePattern(ctx), candidateOps(candidateOps) {}
+
+mlir::LogicalResult
+AllocMemConversion::matchAndRewrite(fir::AllocMemOp allocmem,
+                                    mlir::PatternRewriter &rewriter) const {
+  auto oldInsertionPt = rewriter.saveInsertionPoint();
+  // add alloca operation
+  std::optional<fir::AllocaOp> alloca = insertAlloca(allocmem, rewriter);
+  rewriter.restoreInsertionPoint(oldInsertionPt);
+  if (!alloca)
+    return mlir::failure();
+
+  // remove freemem operations
+  for (mlir::Operation *user : allocmem.getOperation()->getUsers())
+    if (mlir::isa<fir::FreeMemOp>(user))
+      rewriter.eraseOp(user);
+
+  // replace references to heap allocation with references to stack allocation
+  rewriter.replaceAllUsesWith(allocmem.getResult(), alloca->getResult());
+
+  // remove allocmem operation
+  rewriter.eraseOp(allocmem.getOperation());
+
+  return mlir::success();
+}
+
+// TODO: use mlir::blockIsInLoop once D141401 is merged
+static bool isInLoop(mlir::Block *block) {
+  mlir::Operation *parent = block->getParentOp();
+
+  // The block could be inside a loop-like operation
+  if (mlir::isa<mlir::LoopLikeOpInterface>(parent) ||
+      parent->getParentOfType<mlir::LoopLikeOpInterface>())
+    return true;
+
+  // This block might be nested inside another block, which is in a loop
+  if (!mlir::isa<mlir::FunctionOpInterface>(parent))
+    if (isInLoop(parent->getBlock()))
+      return true;
+
+  // Or the block could be inside a control flow graph loop:
+  // A block is in a control flow graph loop if it can reach itself in a graph
+  // traversal
+  llvm::DenseSet<mlir::Block *> visited;
+  llvm::SmallVector<mlir::Block *> stack;
+  stack.push_back(block);
+  while (!stack.empty()) {
+    mlir::Block *current = stack.pop_back_val();
+    auto [it, inserted] = visited.insert(current);
+    if (!inserted) {
+      // loop detected
+      if (current == block)
+        return true;
+      continue;
+    }
+
+    stack.reserve(stack.size() + current->getNumSuccessors());
+    for (mlir::Block *successor : current->getSuccessors())
+      stack.push_back(successor);
+  }
+  return false;
+}
+
+static bool isInLoop(mlir::Operation *op) {
+  return isInLoop(op->getBlock()) ||
+         op->getParentOfType<mlir::LoopLikeOpInterface>();
+}
+
+InsertionPoint
+AllocMemConversion::findAllocaInsertionPoint(fir::AllocMemOp &oldAlloc) {
+  // Ideally the alloca should be inserted at the end of the function entry
+  // block so that we do not allocate stack space in a loop. However,
+  // the operands to the alloca may not be available that early, so insert it
+  // after the last operand becomes available
+  // If the old allocmem op was in an openmp region then it should not be moved
+  // outside of that
+  LLVM_DEBUG(llvm::dbgs() << "StackArrays: findAllocaInsertionPoint: "
+                          << oldAlloc << "\n");
+
+  // check that an Operation or Block we are about to return is not in a loop
+  auto checkReturn = [&](auto *point) -> InsertionPoint {
+    if (isInLoop(point)) {
+      mlir::Operation *oldAllocOp = oldAlloc.getOperation();
+      if (isInLoop(oldAllocOp)) {
+        // where we want to put it is in a loop, and even the old location is in
+        // a loop. Give up.
+        return findAllocaLoopInsertionPoint(oldAlloc);
+      }
+      return {oldAllocOp};
+    }
+    return {point};
+  };
+
+  auto oldOmpRegion =
+      oldAlloc->getParentOfType<mlir::omp::OutlineableOpenMPOpInterface>();
+
+  // Find when the last operand value becomes available
+  mlir::Block *operandsBlock = nullptr;
+  mlir::Operation *lastOperand = nullptr;
+  for (mlir::Value operand : oldAlloc.getOperands()) {
+    LLVM_DEBUG(llvm::dbgs() << "--considering operand " << operand << "\n");
+    mlir::Operation *op = operand.getDefiningOp();
+    if (!op)
+      return checkReturn(oldAlloc.getOperation());
+    if (!operandsBlock)
+      operandsBlock = op->getBlock();
+    else if (operandsBlock != op->getBlock()) {
+      LLVM_DEBUG(llvm::dbgs()
+                 << "----operand declared in a different block!\n");
+      // Operation::isBeforeInBlock requires the operations to be in the same
+      // block. The best we can do is the location of the allocmem.
+      return checkReturn(oldAlloc.getOperation());
+    }
+    if (!lastOperand || lastOperand->isBeforeInBlock(op))
+      lastOperand = op;
+  }
+
+  if (lastOperand) {
+    // there were value operands to the allocmem so insert after the last one
+    LLVM_DEBUG(llvm::dbgs()
+               << "--Placing after last operand: " << *lastOperand << "\n");
+    // check we aren't moving out of an omp region
+    auto lastOpOmpRegion =
+        lastOperand->getParentOfType<mlir::omp::OutlineableOpenMPOpInterface>();
+    if (lastOpOmpRegion == oldOmpRegion)
+      return checkReturn(lastOperand);
+    // Presumably this happened because the operands became ready before the
+    // start of this openmp region. (lastOpOmpRegion != oldOmpRegion) should
+    // imply that oldOmpRegion comes after lastOpOmpRegion.
+    return checkReturn(oldOmpRegion.getAllocaBlock());
+  }
+
+  // There were no value operands to the allocmem so we are safe to insert it
+  // as early as we want
+
+  // handle openmp case
+  if (oldOmpRegion)
+    return checkReturn(oldOmpRegion.getAllocaBlock());
+
+  // fall back to the function entry block
+  mlir::func::FuncOp func = oldAlloc->getParentOfType<mlir::func::FuncOp>();
+  assert(func && "This analysis is run on func.func");
+  mlir::Block &entryBlock = func.getBlocks().front();
+  LLVM_DEBUG(llvm::dbgs() << "--Placing at the start of func entry block\n");
+  return checkReturn(&entryBlock);
+}
+
+InsertionPoint
+AllocMemConversion::findAllocaLoopInsertionPoint(fir::AllocMemOp &oldAlloc) {
+  mlir::Operation *oldAllocOp = oldAlloc;
+  // This is only called as a last resort. We should try to insert at the
+  // location of the old allocation, which is inside of a loop, using
+  // llvm.stacksave/llvm.stackrestore
+
+  // find freemem ops
+  llvm::SmallVector<mlir::Operation *, 1> freeOps;
+  for (mlir::Operation *user : oldAllocOp->getUsers())
+    if (mlir::isa<fir::FreeMemOp>(user))
+      freeOps.push_back(user);
+  assert(freeOps.size() && "DFA should only return freed memory");
+
+  // Don't attempt to reason about a stacksave/stackrestore between different
+  // blocks
+  for (mlir::Operation *free : freeOps)
+    if (free->getBlock() != oldAllocOp->getBlock())
+      return {nullptr};
+
+  // Check that there aren't any other stack allocations in between the
+  // stack save and stack restore
+  // note: for flang generated temporaries there should only be one free op
+  for (mlir::Operation *free : freeOps) {
+    for (mlir::Operation *op = oldAlloc; op && op != free;
+         op = op->getNextNode()) {
+      if (mlir::isa<fir::AllocaOp>(op))
+        return {nullptr};
+    }
+  }
+
+  return InsertionPoint{oldAllocOp, /*shouldStackSaveRestore=*/true};
+}
+
+std::optional<fir::AllocaOp>
+AllocMemConversion::insertAlloca(fir::AllocMemOp &oldAlloc,
+                                 mlir::PatternRewriter &rewriter) const {
+  auto it = candidateOps.find(oldAlloc.getOperation());
+  if (it == candidateOps.end())
+    return {};
+  InsertionPoint insertionPoint = it->second;
+  if (!insertionPoint)
+    return {};
+
+  if (insertionPoint.shouldSaveRestoreStack())
+    insertStackSaveRestore(oldAlloc, rewriter);
+
+  mlir::Location loc = oldAlloc.getLoc();
+  mlir::Type varTy = oldAlloc.getInType();
+  if (mlir::Operation *op = insertionPoint.tryGetOperation()) {
+    rewriter.setInsertionPointAfter(op);
+  } else {
+    mlir::Block *block = insertionPoint.tryGetBlock();
+    assert(block && "There must be a valid insertion point");
+    rewriter.setInsertionPointToStart(block);
+  }
+
+  auto unpackName = [](std::optional<llvm::StringRef> opt) -> llvm::StringRef {
+    if (opt)
+      return *opt;
+    return {};
+  };
+
+  llvm::StringRef uniqName = unpackName(oldAlloc.getUniqName());
+  llvm::StringRef bindcName = unpackName(oldAlloc.getBindcName());
+  return rewriter.create<fir::AllocaOp>(loc, varTy, uniqName, bindcName,
+                                        oldAlloc.getTypeparams(),
+                                        oldAlloc.getShape());
+}
+
+void AllocMemConversion::insertStackSaveRestore(
+    fir::AllocMemOp &oldAlloc, mlir::PatternRewriter &rewriter) const {
+  auto oldPoint = rewriter.saveInsertionPoint();
+  auto mod = oldAlloc->getParentOfType<mlir::ModuleOp>();
+  fir::KindMapping kindMap = fir::getKindMapping(mod);
+  fir::FirOpBuilder builder{rewriter, kindMap};
+
+  mlir::func::FuncOp stackSaveFn = fir::factory::getLlvmStackSave(builder);
+  mlir::SymbolRefAttr stackSaveSym =
+      builder.getSymbolRefAttr(stackSaveFn.getName());
+
+  builder.setInsertionPoint(oldAlloc);
+  mlir::Value sp =
+      builder
+          .create<fir::CallOp>(oldAlloc.getLoc(),
+                               stackSaveFn.getFunctionType().getResults(),
+                               stackSaveSym, mlir::ValueRange{})
+          .getResult(0);
+
+  mlir::func::FuncOp stackRestoreFn =
+      fir::factory::getLlvmStackRestore(builder);
+  mlir::SymbolRefAttr stackRestoreSym =
+      builder.getSymbolRefAttr(stackRestoreFn.getName());
+
+  for (mlir::Operation *user : oldAlloc->getUsers()) {
+    if (mlir::isa<fir::FreeMemOp>(user)) {
+      builder.setInsertionPoint(user);
+      builder.create<fir::CallOp>(user->getLoc(),
+                                  stackRestoreFn.getFunctionType().getResults(),
+                                  stackRestoreSym, mlir::ValueRange{sp});
+    }
+  }
+
+  rewriter.restoreInsertionPoint(oldPoint);
+}
+
+StackArraysPass::StackArraysPass(const StackArraysPass &pass)
+    : fir::impl::StackArraysBase<StackArraysPass>(pass) {}
+
+llvm::StringRef StackArraysPass::getDescription() const {
+  return "Move heap allocated array temporaries to the stack";
+}
+
+void StackArraysPass::runOnOperation() {
+  mlir::ModuleOp mod = getOperation();
+
+  mod.walk([this](mlir::func::FuncOp func) { runOnFunc(func); });
+}
+
+void StackArraysPass::runOnFunc(mlir::Operation *func) {
+  assert(mlir::isa<mlir::func::FuncOp>(func));
+
+  auto &analysis = getAnalysis<StackArraysAnalysisWrapper>();
+  const auto &candidateOps = analysis.getCandidateOps(func);
+  if (analysis.hasErrors()) {
+    signalPassFailure();
+    return;
+  }
+
+  if (candidateOps.empty())
+    return;
+  runCount += candidateOps.size();
+
+  mlir::MLIRContext &context = getContext();
+  mlir::RewritePatternSet patterns(&context);
+  mlir::ConversionTarget target(context);
+
+  target.addLegalDialect<fir::FIROpsDialect, mlir::arith::ArithDialect,
+                         mlir::func::FuncDialect>();
+  target.addDynamicallyLegalOp<fir::AllocMemOp>([&](fir::AllocMemOp alloc) {
+    return !candidateOps.count(alloc.getOperation());
+  });
+
+  patterns.insert<AllocMemConversion>(&context, candidateOps);
+  if (mlir::failed(
+          mlir::applyPartialConversion(func, target, std::move(patterns)))) {
+    mlir::emitError(func->getLoc(), "error in stack arrays optimization\n");
+    signalPassFailure();
+  }
+}
+
+std::unique_ptr<mlir::Pass> fir::createStackArraysPass() {
+  return std::make_unique<StackArraysPass>();
+}

diff  --git a/flang/test/Lower/HLFIR/allocatable-and-pointer-status-change.f90 b/flang/test/Lower/HLFIR/allocatable-and-pointer-status-change.f90
index 0c353a3ea730f..361d72277cfea 100644
--- a/flang/test/Lower/HLFIR/allocatable-and-pointer-status-change.f90
+++ b/flang/test/Lower/HLFIR/allocatable-and-pointer-status-change.f90
@@ -20,7 +20,7 @@ subroutine allocation(x)
 ! CHECK:  %[[VAL_12:.*]] = arith.constant 0 : index
 ! CHECK:  %[[VAL_13:.*]] = arith.cmpi sgt, %[[VAL_11]], %[[VAL_12]] : index
 ! CHECK:  %[[VAL_14:.*]] = arith.select %[[VAL_13]], %[[VAL_11]], %[[VAL_12]] : index
-! CHECK:  %[[VAL_15:.*]] = fir.allocmem !fir.array<?x!fir.char<1,?>>(%[[VAL_2]] : index), %[[VAL_14]] {uniq_name = "_QFallocationEx.alloc"}
+! CHECK:  %[[VAL_15:.*]] = fir.allocmem !fir.array<?x!fir.char<1,?>>(%[[VAL_2]] : index), %[[VAL_14]] {fir.must_be_heap = true, uniq_name = "_QFallocationEx.alloc"}
 ! CHECK:  %[[VAL_16:.*]] = fir.shape %[[VAL_14]] : (index) -> !fir.shape<1>
 ! CHECK:  %[[VAL_17:.*]] = fir.embox %[[VAL_15]](%[[VAL_16]]) typeparams %[[VAL_2]] : (!fir.heap<!fir.array<?x!fir.char<1,?>>>, !fir.shape<1>, index) -> !fir.box<!fir.heap<!fir.array<?x!fir.char<1,?>>>>
 ! CHECK:  fir.store %[[VAL_17]] to %[[VAL_3]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,?>>>>>
@@ -84,7 +84,7 @@ subroutine alloc_comp(x)
 ! CHECK:  %[[VAL_9:.*]] = arith.constant 0 : index
 ! CHECK:  %[[VAL_10:.*]] = arith.cmpi sgt, %[[VAL_8]], %[[VAL_9]] : index
 ! CHECK:  %[[VAL_11:.*]] = arith.select %[[VAL_10]], %[[VAL_8]], %[[VAL_9]] : index
-! CHECK:  %[[VAL_12:.*]] = fir.allocmem !fir.array<?xf32>, %[[VAL_11]] {uniq_name = "_QEa.alloc"}
+! CHECK:  %[[VAL_12:.*]] = fir.allocmem !fir.array<?xf32>, %[[VAL_11]] {fir.must_be_heap = true, uniq_name = "_QEa.alloc"}
 ! CHECK:  %[[VAL_13:.*]] = fir.shape %[[VAL_11]] : (index) -> !fir.shape<1>
 ! CHECK:  %[[VAL_14:.*]] = fir.embox %[[VAL_12]](%[[VAL_13]]) : (!fir.heap<!fir.array<?xf32>>, !fir.shape<1>) -> !fir.box<!fir.heap<!fir.array<?xf32>>>
 ! CHECK:  fir.store %[[VAL_14]] to %[[VAL_6]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>

diff  --git a/flang/test/Lower/Intrinsics/c_loc.f90 b/flang/test/Lower/Intrinsics/c_loc.f90
index 34b0fb2fd833b..e800fe9529d56 100644
--- a/flang/test/Lower/Intrinsics/c_loc.f90
+++ b/flang/test/Lower/Intrinsics/c_loc.f90
@@ -177,7 +177,7 @@ subroutine c_loc_arraysection()
 ! CHECK:         %[[VAL_2:.*]] = fir.zero_bits !fir.ptr<i32>
 ! CHECK:         fir.store %[[VAL_2]] to %[[VAL_1]] : !fir.ref<!fir.ptr<i32>>
 ! CHECK:         %[[VAL_3:.*]] = fir.alloca !fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}> {bindc_name = "ptr", uniq_name = "_QFc_loc_non_save_pointer_scalarEptr"}
-! CHECK:         %[[VAL_4:.*]] = fir.allocmem i32 {uniq_name = "_QFc_loc_non_save_pointer_scalarEi.alloc"}
+! CHECK:         %[[VAL_4:.*]] = fir.allocmem i32 {fir.must_be_heap = true, uniq_name = "_QFc_loc_non_save_pointer_scalarEi.alloc"}
 ! CHECK:         %[[VAL_5:.*]] = fir.convert %[[VAL_4]] : (!fir.heap<i32>) -> !fir.ptr<i32>
 ! CHECK:         fir.store %[[VAL_5]] to %[[VAL_1]] : !fir.ref<!fir.ptr<i32>>
 ! CHECK:         %[[VAL_6:.*]] = arith.constant 10 : i32

diff  --git a/flang/test/Lower/Intrinsics/system_clock.f90 b/flang/test/Lower/Intrinsics/system_clock.f90
index 0e8b61f6c5a6d..ade035501ea43 100644
--- a/flang/test/Lower/Intrinsics/system_clock.f90
+++ b/flang/test/Lower/Intrinsics/system_clock.f90
@@ -43,7 +43,7 @@ subroutine ss(count)
   ! CHECK:   %[[V_6:[0-9]+]] = fir.alloca i64 {bindc_name = "count_rate_", fir.target, uniq_name = "_QFssEcount_rate_"}
   ! CHECK:   %[[V_7:[0-9]+]] = fir.convert %[[V_6]] : (!fir.ref<i64>) -> !fir.ptr<i64>
   ! CHECK:   fir.store %[[V_7]] to %[[V_4]] : !fir.ref<!fir.ptr<i64>>
-  ! CHECK:   %[[V_8:[0-9]+]] = fir.allocmem i64 {uniq_name = "_QFssEcount_max.alloc"}
+  ! CHECK:   %[[V_8:[0-9]+]] = fir.allocmem i64 {fir.must_be_heap = true, uniq_name = "_QFssEcount_max.alloc"}
   ! CHECK:   fir.store %[[V_8]] to %[[V_1]] : !fir.ref<!fir.heap<i64>>
   ! CHECK:   %[[V_9:[0-9]+]] = fir.load %[[V_4]] : !fir.ref<!fir.ptr<i64>>
   ! CHECK:   %[[V_10:[0-9]+]] = fir.load %[[V_1]] : !fir.ref<!fir.heap<i64>>

diff  --git a/flang/test/Transforms/stack-arrays.f90 b/flang/test/Transforms/stack-arrays.f90
new file mode 100644
index 0000000000000..7622e679f9196
--- /dev/null
+++ b/flang/test/Transforms/stack-arrays.f90
@@ -0,0 +1,140 @@
+! RUN: %flang_fc1 -emit-fir %s -o - | fir-opt --array-value-copy | fir-opt --stack-arrays | FileCheck %s
+
+! check simple array value copy case
+subroutine array_value_copy_simple(arr)
+  integer, intent(inout) :: arr(4)
+  arr(3:4) = arr(1:2)
+end subroutine
+! CHECK-LABEL: func.func @_QParray_value_copy_simple(%arg0: !fir.ref<!fir.array<4xi32>>
+! CHECK-NOT: fir.allocmem
+! CHECK-NOT: fir.freemem
+! CHECK: fir.alloca !fir.array<4xi32>
+! CHECK-NOT: fir.allocmem
+! CHECK-NOT: fir.freemem
+! CHECK: return
+! CHECK-NEXT: }
+
+! check complex array value copy case
+module stuff
+  type DerivedWithAllocatable
+    integer, dimension(:), allocatable :: dat
+  end type
+
+  contains
+  subroutine array_value_copy_complex(arr)
+    type(DerivedWithAllocatable), intent(inout) :: arr(:)
+    arr(3:4) = arr(1:2)
+  end subroutine
+end module
+! CHECK: func.func
+! CHECK-SAME: array_value_copy_complex
+! CHECK-NOT: fir.allocmem
+! CHECK-NOT: fir.freemem
+! CHECK: fir.alloca !fir.array<?x!fir.type<_QMstuffTderivedwithallocatable
+! CHECK-NOT: fir.allocmem
+! CHECK-NOT: fir.freemem
+! CHECK: return
+! CHECK-NEXT: }
+
+subroutine parameter_array_init
+  integer, parameter :: p(100) = 42
+  call use_p(p)
+end subroutine
+! CHECK: func.func
+! CHECK-SAME: parameter_array_init
+! CHECK-NOT: fir.allocmem
+! CHECK-NOT: fir.freemem
+! CHECK: fir.alloca !fir.array<100xi32>
+! CHECK-NOT: fir.allocmem
+! CHECK-NOT: fir.freemem
+! CHECK: return
+! CHECK-NEXT: }
+
+subroutine test_vector_subscripted_section_to_box(v, x)
+  interface
+    subroutine takes_box(y)
+      real :: y(:)
+    end subroutine
+  end interface
+
+  integer :: v(:)
+  real :: x(:)
+  call takes_box(x(v))
+end subroutine
+! CHECK: func.func
+! CHECK-SAME: test_vector_subscripted_section_to_box
+! CHECK-NOT: fir.allocmem
+! CHECK: fir.alloca !fir.array<?xf32>
+! CHECK-NOT: fir.allocmem
+! CHECK: fir.call @_QPtakes_box
+! CHECK-NOT: fir.freemem
+! CHECK: return
+! CHECK-NEXT: }
+
+subroutine call_parenthesized_arg(x)
+  integer :: x(100)
+  call bar((x))
+end subroutine
+! CHECK: func.func
+! CHECK-SAME: call_parenthesized_arg
+! CHECK-NOT: fir.allocmem
+! CHECK: fir.alloca !fir.array<100xi32>
+! CHECK-NOT: fir.allocmem
+! CHECK: fir.call @_QPbar
+! CHECK-NOT: fir.freemem
+! CHECK: return
+! CHECK-NEXT: }
+
+subroutine where_allocatable_assignments(a, b)
+  integer :: a(:)
+  integer, allocatable :: b(:)
+  where(b > 0)
+    b = a
+  elsewhere
+    b(:) = 0
+  end where
+end subroutine
+! TODO: broken: passing allocation through fir.result
+! CHECK: func.func
+! CHECK-SAME: where_allocatable_assignments
+! CHECK: return
+! CHECK-NEXT: }
+
+subroutine array_constructor(a, b)
+  real :: a(5), b
+  real, external :: f
+  a = [f(b), f(b+1), f(b+2), f(b+5), f(b+11)]
+end subroutine
+! TODO: broken: realloc
+! CHECK: func.func
+! CHECK-SAME: array_constructor
+! CHECK: return
+! CHECK-NEXT: }
+
+subroutine sequence(seq, n)
+  integer :: n, seq(n)
+  seq = [(i,i=1,n)]
+end subroutine
+! TODO: broken: realloc
+! CHECK: func.func
+! CHECK-SAME: sequence
+! CHECK: return
+! CHECK-NEXT: }
+
+subroutine CFGLoop(x)
+  integer, parameter :: k = 100, m=1000000, n = k*m
+  integer :: x(n)
+  logical :: has_error
+
+  do i=0,m-1
+    x(k*i+1:k*(i+1)) = x(k*(i+1):k*i+1:-1)
+    if (has_error(x, k)) stop
+  end do
+end subroutine
+! CHECK: func.func
+! CHECK-SAME: cfgloop
+! CHECK-NEXT: %[[MEM:.*]] = fir.alloca !fir.array<100000000xi32>
+! CHECK-NOT: fir.allocmem
+! CHECK-NOT: fir.freemem
+! CHECK: return
+! CHECK-NEXT: }

diff  --git a/flang/test/Transforms/stack-arrays.fir b/flang/test/Transforms/stack-arrays.fir
new file mode 100644
index 0000000000000..d34d662ca2c43
--- /dev/null
+++ b/flang/test/Transforms/stack-arrays.fir
@@ -0,0 +1,309 @@
+// RUN: fir-opt --stack-arrays %s | FileCheck %s
+
+// Simplest transformation
+func.func @simple() {
+  %0 = fir.allocmem !fir.array<42xi32>
+  fir.freemem %0 : !fir.heap<!fir.array<42xi32>>
+  return
+}
+// CHECK: func.func @simple() {
+// CHECK-NEXT: fir.alloca !fir.array<42xi32>
+// CHECK-NEXT: return
+// CHECK-NEXT: }
+
+// Check fir.must_be_heap allocations are not moved
+func.func @must_be_heap() {
+  %0 = fir.allocmem !fir.array<42xi32> {fir.must_be_heap = true}
+  fir.freemem %0 : !fir.heap<!fir.array<42xi32>>
+  return
+}
+// CHECK:      func.func @must_be_heap() {
+// CHECK-NEXT:   %[[ALLOC:.*]] = fir.allocmem !fir.array<42xi32> {fir.must_be_heap = true}
+// CHECK-NEXT:   fir.freemem %[[ALLOC]] : !fir.heap<!fir.array<42xi32>>
+// CHECK-NEXT:   return
+// CHECK-NEXT: }
+
+// Check the data-flow-analysis can detect cases where we aren't sure if memory
+// is freed by the end of the function
+func.func @dfa1(%arg0: !fir.ref<!fir.logical<4>> {fir.bindc_name = "cond"}) {
+  %7 = arith.constant 42 : index
+  %8 = fir.allocmem !fir.array<?xi32>, %7 {uniq_name = "_QFdfa1Earr.alloc"}
+  %9 = fir.load %arg0 : !fir.ref<!fir.logical<4>>
+  %10 = fir.convert %9 : (!fir.logical<4>) -> i1
+  fir.if %10 {
+    fir.freemem %8 : !fir.heap<!fir.array<?xi32>>
+  } else {
+  }
+  return
+}
+// CHECK:      func.func @dfa1(%arg0: !fir.ref<!fir.logical<4>> {fir.bindc_name = "cond"}) {
+// CHECK-NEXT:   %[[C42:.*]] = arith.constant 42 : index
+// CHECK-NEXT:   %[[MEM:.*]] = fir.allocmem !fir.array<?xi32>, %[[C42]] {uniq_name = "_QFdfa1Earr.alloc"}
+// CHECK-NEXT:   %[[LOGICAL:.*]] = fir.load %arg0 : !fir.ref<!fir.logical<4>>
+// CHECK-NEXT:   %[[BOOL:.*]] = fir.convert %[[LOGICAL]] : (!fir.logical<4>) -> i1
+// CHECK-NEXT:   fir.if %[[BOOL]] {
+// CHECK-NEXT:     fir.freemem %[[MEM]] : !fir.heap<!fir.array<?xi32>>
+// CHECK-NEXT:   } else {
+// CHECK-NEXT:   }
+// CHECK-NEXT:   return
+// CHECK-NEXT: }
+
+// Check scf.if (fir.if is not considered a branch operation)
+func.func @dfa2(%arg0: i1) {
+  %a = fir.allocmem !fir.array<1xi8>
+  scf.if %arg0 {
+    fir.freemem %a : !fir.heap<!fir.array<1xi8>>
+  } else {
+  }
+  return
+}
+// CHECK:     func.func @dfa2(%arg0: i1) {
+// CHECK-NEXT:  %[[MEM:.*]] = fir.allocmem !fir.array<1xi8>
+// CHECK-NEXT:  scf.if %arg0 {
+// CHECK-NEXT:    fir.freemem %[[MEM]] : !fir.heap<!fir.array<1xi8>>
+// CHECK-NEXT:  } else {
+// CHECK-NEXT:  }
+// CHECK-NEXT:  return
+// CHECK-NEXT:  }
+
+// check the alloca is placed after all operands become available
+func.func @placement1() {
+  // do some stuff with other ssa values
+  %1 = arith.constant 1 : index
+  %2 = arith.constant 2 : index
+  %3 = arith.addi %1, %2 : index
+  // operand is now available
+  %4 = fir.allocmem !fir.array<?xi32>, %3
+  // ...
+  fir.freemem %4 : !fir.heap<!fir.array<?xi32>>
+  return
+}
+// CHECK:      func.func @placement1() {
+// CHECK-NEXT:   %[[ONE:.*]] = arith.constant 1 : index
+// CHECK-NEXT:   %[[TWO:.*]] = arith.constant 2 : index
+// CHECK-NEXT:   %[[ARG:.*]] = arith.addi %[[ONE]], %[[TWO]] : index
+// CHECK-NEXT:   %[[MEM:.*]] = fir.alloca !fir.array<?xi32>, %[[ARG]]
+// CHECK-NEXT:   return
+// CHECK-NEXT: }
+
+// check that if there are no operands, then the alloca is placed early
+func.func @placement2() {
+  // do some stuff with other ssa values
+  %1 = arith.constant 1 : index
+  %2 = arith.constant 2 : index
+  %3 = arith.addi %1, %2 : index
+  %4 = fir.allocmem !fir.array<42xi32>
+  // ...
+  fir.freemem %4 : !fir.heap<!fir.array<42xi32>>
+  return
+}
+// CHECK:      func.func @placement2() {
+// CHECK-NEXT:   %[[MEM:.*]] = fir.alloca !fir.array<42xi32>
+// CHECK-NEXT:   %[[ONE:.*]] = arith.constant 1 : index
+// CHECK-NEXT:   %[[TWO:.*]] = arith.constant 2 : index
+// CHECK-NEXT:   %[[SUM:.*]] = arith.addi %[[ONE]], %[[TWO]] : index
+// CHECK-NEXT:   return
+// CHECK-NEXT: }
+
+// check that stack allocations which must be placed in loops use stacksave
+func.func @placement3() {
+  %c1 = arith.constant 1 : index
+  %c1_i32 = fir.convert %c1 : (index) -> i32
+  %c2 = arith.constant 2 : index
+  %c10 = arith.constant 10 : index
+  %0:2 = fir.do_loop %arg0 = %c1 to %c10 step %c1 iter_args(%arg1 = %c1_i32) -> (index, i32) {
+    %3 = arith.addi %c1, %c2 : index
+    // operand is now available
+    %4 = fir.allocmem !fir.array<?xi32>, %3
+    // ...
+    fir.freemem %4 : !fir.heap<!fir.array<?xi32>>
+    fir.result %3, %c1_i32 : index, i32
+  }
+  return
+}
+// CHECK:      func.func @placement3() {
+// CHECK-NEXT:   %[[C1:.*]] = arith.constant 1 : index
+// CHECK-NEXT:   %[[C1_I32:.*]] = fir.convert %[[C1]] : (index) -> i32
+// CHECK-NEXT:   %[[C2:.*]] = arith.constant 2 : index
+// CHECK-NEXT:   %[[C10:.*]] = arith.constant 10 : index
+// CHECK-NEXT:   fir.do_loop
+// CHECK-NEXT:     %[[SUM:.*]] = arith.addi %[[C1]], %[[C2]] : index
+// CHECK-NEXT:     %[[SP:.*]] = fir.call @llvm.stacksave() : () -> !fir.ref<i8>
+// CHECK-NEXT:     %[[MEM:.*]] = fir.alloca !fir.array<?xi32>, %[[SUM]]
+// CHECK-NEXT:     fir.call @llvm.stackrestore(%[[SP]])
+// CHECK-NEXT:     fir.result
+// CHECK-NEXT:   }
+// CHECK-NEXT:   return
+// CHECK-NEXT: }
+
+// check that stack save/restore are used in CFG loops
+func.func @placement4(%arg0 : i1) {
+  %c1 = arith.constant 1 : index
+  %c1_i32 = fir.convert %c1 : (index) -> i32
+  %c2 = arith.constant 2 : index
+  %c10 = arith.constant 10 : index
+  cf.br ^bb1
+^bb1:
+  %3 = arith.addi %c1, %c2 : index
+  // operand is now available
+  %4 = fir.allocmem !fir.array<?xi32>, %3
+  // ...
+  fir.freemem %4 : !fir.heap<!fir.array<?xi32>>
+  cf.cond_br %arg0, ^bb1, ^bb2
+^bb2:
+  return
+}
+// CHECK:      func.func @placement4(%arg0: i1) {
+// CHECK-NEXT:   %[[C1:.*]] = arith.constant 1 : index
+// CHECK-NEXT:   %[[C1_I32:.*]] = fir.convert %[[C1]] : (index) -> i32
+// CHECK-NEXT:   %[[C2:.*]] = arith.constant 2 : index
+// CHECK-NEXT:   %[[C10:.*]] = arith.constant 10 : index
+// CHECK-NEXT:   cf.br ^bb1
+// CHECK-NEXT: ^bb1:
+// CHECK-NEXT:   %[[SUM:.*]] = arith.addi %[[C1]], %[[C2]] : index
+// CHECK-NEXT:   %[[SP:.*]] = fir.call @llvm.stacksave() : () -> !fir.ref<i8>
+// CHECK-NEXT:   %[[MEM:.*]] = fir.alloca !fir.array<?xi32>, %[[SUM]]
+// CHECK-NEXT:   fir.call @llvm.stackrestore(%[[SP]]) : (!fir.ref<i8>) -> ()
+// CHECK-NEXT:   cf.cond_br %arg0, ^bb1, ^bb2
+// CHECK-NEXT: ^bb2:
+// CHECK-NEXT:   return
+// CHECK-NEXT: }
+
+// check that stacksave is not used when there is an intervening alloca
+func.func @placement5() {
+  %c1 = arith.constant 1 : index
+  %c1_i32 = fir.convert %c1 : (index) -> i32
+  %c2 = arith.constant 2 : index
+  %c10 = arith.constant 10 : index
+  %0:2 = fir.do_loop %arg0 = %c1 to %c10 step %c1 iter_args(%arg1 = %c1_i32) -> (index, i32) {
+    %3 = arith.addi %c1, %c2 : index
+    // operand is now available
+    %4 = fir.allocmem !fir.array<?xi32>, %3
+    %5 = fir.alloca i32
+    fir.freemem %4 : !fir.heap<!fir.array<?xi32>>
+    fir.result %3, %c1_i32 : index, i32
+  }
+  return
+}
+// CHECK:      func.func @placement5() {
+// CHECK-NEXT:   %[[C1:.*]] = arith.constant 1 : index
+// CHECK-NEXT:   %[[C1_I32:.*]] = fir.convert %[[C1]] : (index) -> i32
+// CHECK-NEXT:   %[[C2:.*]] = arith.constant 2 : index
+// CHECK-NEXT:   %[[C10:.*]] = arith.constant 10 : index
+// CHECK-NEXT:   fir.do_loop
+// CHECK-NEXT:     %[[SUM:.*]] = arith.addi %[[C1]], %[[C2]] : index
+// CHECK-NEXT:     %[[MEM:.*]] = fir.allocmem !fir.array<?xi32>, %[[SUM]]
+// CHECK-NEXT:     %[[IDX:.*]] = fir.alloca i32
+// CHECK-NEXT:     fir.freemem %[[MEM]] : !fir.heap<!fir.array<?xi32>>
+// CHECK-NEXT:     fir.result
+// CHECK-NEXT:   }
+// CHECK-NEXT:   return
+// CHECK-NEXT: }
+
+// check that stack save/restore are not used when the memalloc and freemem are
+// in different blocks
+func.func @placement6(%arg0: i1) {
+  %c1 = arith.constant 1 : index
+  %c1_i32 = fir.convert %c1 : (index) -> i32
+  %c2 = arith.constant 2 : index
+  %c10 = arith.constant 10 : index
+  cf.br ^bb1
+^bb1:
+  %3 = arith.addi %c1, %c2 : index
+  // operand is now available
+  %4 = fir.allocmem !fir.array<?xi32>, %3
+  // ...
+  cf.cond_br %arg0, ^bb2, ^bb3
+^bb2:
+  // ...
+  fir.freemem %4 : !fir.heap<!fir.array<?xi32>>
+  cf.br ^bb1
+^bb3:
+  // ...
+  fir.freemem %4 : !fir.heap<!fir.array<?xi32>>
+  cf.br ^bb1
+}
+// CHECK:      func.func @placement6(%arg0: i1) {
+// CHECK-NEXT:   %[[c1:.*]] = arith.constant 1 : index
+// CHECK-NEXT:   %[[c1_i32:.*]] = fir.convert %[[c1]] : (index) -> i32
+// CHECK-NEXT:   %[[c2:.*]] = arith.constant 2 : index
+// CHECK-NEXT:   %[[c10:.*]] = arith.constant 10 : index
+// CHECK-NEXT:   cf.br ^bb1
+// CHECK-NEXT: ^bb1:
+// CHECK-NEXT:   %[[ADD:.*]] = arith.addi %[[c1]], %[[c2]] : index
+// CHECK-NEXT:   %[[MEM:.*]] = fir.allocmem !fir.array<?xi32>, %[[ADD]]
+// CHECK-NEXT:   cf.cond_br %arg0, ^bb2, ^bb3
+// CHECK-NEXT: ^bb2:
+// CHECK-NEXT:   fir.freemem %[[MEM]] : !fir.heap<!fir.array<?xi32>>
+// CHECK-NEXT:   cf.br ^bb1
+// CHECK-NEXT: ^bb3:
+// CHECK-NEXT:   fir.freemem %[[MEM]] : !fir.heap<!fir.array<?xi32>>
+// CHECK-NEXT:   cf.br ^bb1
+// CHECK-NEXT: }
+
+// Check multiple returns, where the memory is always freed
+func.func @returns(%arg0: i1) {
+  %0 = fir.allocmem !fir.array<42xi32>
+  cf.cond_br %arg0, ^bb1, ^bb2
+^bb1:
+  fir.freemem %0 : !fir.heap<!fir.array<42xi32>>
+  return
+^bb2:
+  fir.freemem %0 : !fir.heap<!fir.array<42xi32>>
+  return
+}
+// CHECK:      func.func @returns(%[[COND:.*]]: i1) {
+// CHECK-NEXT:   %[[ALLOC:.*]] = fir.alloca !fir.array<42xi32>
+// CHECK-NEXT:   cf.cond_br %[[COND]], ^bb1, ^bb2
+// CHECK-NEXT: ^bb1:
+// CHECK-NEXT:   return
+// CHECK-NEXT: ^bb2:
+// CHECK-NEXT:   return
+// CHECK-NEXT: }
+
+// Check multiple returns, where the memory is not freed on one branch
+func.func @returns2(%arg0: i1) {
+  %0 = fir.allocmem !fir.array<42xi32>
+  cf.cond_br %arg0, ^bb1, ^bb2
+^bb1:
+  fir.freemem %0 : !fir.heap<!fir.array<42xi32>>
+  return
+^bb2:
+  return
+}
+// CHECK:      func.func @returns2(%[[COND:.*]]: i1) {
+// CHECK-NEXT:   %[[ALLOC:.*]] = fir.allocmem !fir.array<42xi32>
+// CHECK-NEXT:   cf.cond_br %[[COND]], ^bb1, ^bb2
+// CHECK-NEXT: ^bb1:
+// CHECK-NEXT:   fir.freemem %[[ALLOC]] : !fir.heap<!fir.array<42xi32>>
+// CHECK-NEXT:   return
+// CHECK-NEXT: ^bb2:
+// CHECK-NEXT:   return
+// CHECK-NEXT: }
+
+// Check allocations are not moved outside of an omp region
+func.func @omp_placement1() {
+  omp.sections {
+    omp.section {
+      %mem = fir.allocmem !fir.array<42xi32>
+      fir.freemem %mem : !fir.heap<!fir.array<42xi32>>
+      omp.terminator
+    }
+    omp.terminator
+  }
+  return
+}
+// CHECK:      func.func @omp_placement1() {
+// CHECK-NEXT:   omp.sections {
+// CHECK-NEXT:     omp.section {
+// CHECK-NEXT:       %[[MEM:.*]] = fir.allocmem !fir.array<42xi32>
+// TODO: this allocation should be moved to the stack. Unfortunately, the data
+// flow analysis fails to propagate the lattice out of the omp region to the
+// return statement.
+// CHECK-NEXT:       fir.freemem %[[MEM]] : !fir.heap<!fir.array<42xi32>>
+// CHECK-NEXT:       omp.terminator
+// CHECK-NEXT:     }
+// CHECK-NEXT:     omp.terminator
+// CHECK-NEXT:   }
+// CHECK-NEXT:   return
+// CHECK-NEXT: }