[Mlir-commits] [mlir] 1b1c61f - [mlir] Refactored BufferPlacement transformation.

Marcel Koester llvmlistbot at llvm.org
Mon Oct 19 03:54:50 PDT 2020


Author: Marcel Koester
Date: 2020-10-19T12:52:16+02:00
New Revision: 1b1c61ff47f8263680cbbf48f7d95f8bddd2386e

URL: https://github.com/llvm/llvm-project/commit/1b1c61ff47f8263680cbbf48f7d95f8bddd2386e
DIFF: https://github.com/llvm/llvm-project/commit/1b1c61ff47f8263680cbbf48f7d95f8bddd2386e.diff

LOG: [mlir] Refactored BufferPlacement transformation.

The current BufferPlacement transformation contains several concepts for
hoisting allocations. However, more advanced hoisting techniques should not be
integrated into the BufferPlacement transformation. Hence, this CL refactors the
current BufferPlacement pass into three separate pieces: BufferDeallocation and
BufferAllocation(Loop)Hoisting. Moreover, it extends the hoisting functionality
by allowing allocations to be moved out of loops.
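
As a rough, illustrative sketch (not part of this commit), the three new passes
can be chained to recover the behavior of the old BufferPlacement pass. The
helper name and the FuncOp nesting below are assumptions; only the three
factory functions come from this change:

    #include "mlir/IR/Function.h"
    #include "mlir/Pass/PassManager.h"
    #include "mlir/Transforms/Passes.h"

    // Hypothetical pipeline setup chaining the passes introduced here. All
    // three are function passes, so they are nested under FuncOp.
    static void addBufferPlacementPasses(mlir::PassManager &pm) {
      // Move allocations into common dominators and out of nested regions.
      pm.addNestedPass<mlir::FuncOp>(mlir::createBufferHoistingPass());
      // Additionally move allocations out of loop nests.
      pm.addNestedPass<mlir::FuncOp>(mlir::createBufferLoopHoistingPass());
      // Insert the deallocations (and copies) required to free all buffers.
      pm.addNestedPass<mlir::FuncOp>(mlir::createBufferDeallocationPass());
    }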

Differential Revision: https://reviews.llvm.org/D87756

Added: 
    mlir/lib/Transforms/BufferDeallocation.cpp
    mlir/lib/Transforms/BufferOptimizations.cpp
    mlir/test/Transforms/buffer-deallocation.mlir
    mlir/test/Transforms/buffer-hoisting.mlir
    mlir/test/Transforms/buffer-loop-hoisting.mlir

Modified: 
    mlir/include/mlir/Transforms/Bufferize.h
    mlir/include/mlir/Transforms/Passes.h
    mlir/include/mlir/Transforms/Passes.td
    mlir/lib/Transforms/CMakeLists.txt
    mlir/test/Dialect/Linalg/bufferize.mlir

Removed: 
    mlir/lib/Transforms/BufferPlacement.cpp
    mlir/test/Transforms/buffer-placement.mlir


################################################################################
diff --git a/mlir/include/mlir/Transforms/Bufferize.h b/mlir/include/mlir/Transforms/Bufferize.h
index eebebaae54e3..ddc00893cc47 100644
--- a/mlir/include/mlir/Transforms/Bufferize.h
+++ b/mlir/include/mlir/Transforms/Bufferize.h
@@ -28,8 +28,10 @@
 #ifndef MLIR_TRANSFORMS_BUFFERIZE_H
 #define MLIR_TRANSFORMS_BUFFERIZE_H
 
+#include "mlir/Analysis/Liveness.h"
 #include "mlir/Dialect/StandardOps/IR/Ops.h"
 #include "mlir/IR/Builders.h"
+#include "mlir/IR/Dominance.h"
 #include "mlir/IR/Function.h"
 #include "mlir/IR/Operation.h"
 #include "mlir/Transforms/DialectConversion.h"
@@ -276,6 +278,142 @@ populateWithBufferizeOpConversionPatterns(MLIRContext *context,
   >(context, converter);
   // clang-format on
 }
+
+/// A straight-forward alias analysis which ensures that all aliases of all
+/// values will be determined. This is a requirement for the BufferPlacement
+/// class since you need to determine safe positions to place alloc and
+/// deallocs.
+class BufferPlacementAliasAnalysis {
+public:
+  using ValueSetT = SmallPtrSet<Value, 16>;
+  using ValueMapT = llvm::DenseMap<Value, ValueSetT>;
+
+public:
+  /// Constructs a new alias analysis using the op provided.
+  BufferPlacementAliasAnalysis(Operation *op);
+
+  /// Find all immediate aliases this value could potentially have.
+  ValueMapT::const_iterator find(Value value) const {
+    return aliases.find(value);
+  }
+
+  /// Returns the begin iterator to iterate over all aliases.
+  ValueMapT::const_iterator begin() const { return aliases.begin(); }
+
+  /// Returns the end iterator that can be used in combination with find.
+  ValueMapT::const_iterator end() const { return aliases.end(); }
+
+  /// Find all immediate and indirect aliases this value could potentially
+  /// have. Note that the resulting set will also contain the value provided as
+  /// it is an alias of itself.
+  ValueSetT resolve(Value value) const;
+
+  /// Removes the given values from all alias sets.
+  void remove(const SmallPtrSetImpl<Value> &aliasValues);
+
+private:
+  /// This function constructs a mapping from values to their immediate aliases.
+  void build(Operation *op);
+
+  /// Maps values to all immediate aliases this value can have.
+  ValueMapT aliases;
+};
+
+/// A simple analysis that detects allocation operations.
+class BufferPlacementAllocs {
+public:
+  /// Represents a tuple of allocValue and deallocOperation.
+  using AllocEntry = std::tuple<Value, Operation *>;
+
+  /// Represents a list containing all alloc entries.
+  using AllocEntryList = SmallVector<AllocEntry, 8>;
+
+  /// Get the start operation to place the given alloc value within the
+  /// specified placement block.
+  static Operation *getStartOperation(Value allocValue, Block *placementBlock,
+                                      const Liveness &liveness);
+
+  /// Find an associated dealloc operation that is linked to the given
+  /// allocation node (if any).
+  static Operation *findDealloc(Value allocValue);
+
+public:
+  /// Initializes the internal list by discovering all supported allocation
+  /// nodes.
+  BufferPlacementAllocs(Operation *op);
+
+  /// Returns the begin iterator to iterate over all allocations.
+  AllocEntryList::const_iterator begin() const { return allocs.begin(); }
+
+  /// Returns the end iterator that can be used in combination with begin.
+  AllocEntryList::const_iterator end() const { return allocs.end(); }
+
+  /// Returns the begin iterator to iterate over all allocations.
+  AllocEntryList::iterator begin() { return allocs.begin(); }
+
+  /// Returns the end iterator that can be used in combination with begin.
+  AllocEntryList::iterator end() { return allocs.end(); }
+
+  /// Registers a new allocation entry.
+  void registerAlloc(const AllocEntry &entry) { allocs.push_back(entry); }
+
+private:
+  /// Searches for and registers all supported allocation entries.
+  void build(Operation *op);
+
+private:
+  /// Stores all discovered allocation entries.
+  AllocEntryList allocs;
+};
+
+/// The base class for all BufferPlacement transformations.
+class BufferPlacementTransformationBase {
+public:
+  using ValueSetT = BufferPlacementAliasAnalysis::ValueSetT;
+
+  /// Finds a common dominator for the given value while taking the positions
+  /// of the values in the value set into account. It supports dominator and
+  /// post-dominator analyses via template arguments.
+  template <typename DominatorT>
+  static Block *findCommonDominator(Value value, const ValueSetT &values,
+                                    const DominatorT &doms) {
+    // Start with the current block the value is defined in.
+    Block *dom = value.getParentBlock();
+    // Iterate over all aliases and their uses to find a safe placement block
+    // according to the given dominator information.
+    for (Value childValue : values) {
+      for (Operation *user : childValue.getUsers()) {
+        // Move upwards in the dominator tree to find an appropriate
+        // dominator block that takes the current use into account.
+        dom = doms.findNearestCommonDominator(dom, user->getBlock());
+      }
+      // Take values without any users into account.
+      dom = doms.findNearestCommonDominator(dom, childValue.getParentBlock());
+    }
+    return dom;
+  }
+
+  /// Returns true if the given operation represents a loop by testing whether
+  /// it implements the `LoopLikeOpInterface` or the `RegionBranchOpInterface`.
+  /// In the case of a `RegionBranchOpInterface`, it checks all region-based
+  /// control-flow edges for cycles.
+  static bool isLoop(Operation *op);
+
+  /// Constructs a new transformation base using the given root operation.
+  BufferPlacementTransformationBase(Operation *op);
+
+protected:
+  /// Alias information that can be updated during the insertion of copies.
+  BufferPlacementAliasAnalysis aliases;
+
+  /// Stores all internally managed allocations.
+  BufferPlacementAllocs allocs;
+
+  /// The underlying liveness analysis to compute fine grained information
+  /// about alloc and dealloc positions.
+  Liveness liveness;
+};
+
 } // end namespace mlir
 
 #endif // MLIR_TRANSFORMS_BUFFERIZE_H
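
The classes added to Bufferize.h above form a small reusable toolkit:
BufferPlacementAliasAnalysis computes alias sets, BufferPlacementAllocs
collects allocation/deallocation pairs, and BufferPlacementTransformationBase
combines them with a Liveness analysis. Below is a minimal sketch of a derived
transformation, assuming a hypothetical class name, that mirrors how
BufferDeallocation and the hoisting passes later in this diff use the
interface:

    #include "mlir/IR/Dominance.h"
    #include "mlir/Transforms/Bufferize.h"

    namespace {
    /// Hypothetical transformation that inspects each discovered allocation
    /// together with all of its aliases.
    struct ExampleTransformation
        : public mlir::BufferPlacementTransformationBase {
      ExampleTransformation(mlir::Operation *op)
          : BufferPlacementTransformationBase(op), dominators(op) {}

      void run() {
        for (const mlir::BufferPlacementAllocs::AllocEntry &entry : allocs) {
          mlir::Value allocValue = std::get<0>(entry);
          // Resolve all direct and transitive aliases of the buffer.
          ValueSetT aliasSet = aliases.resolve(allocValue);
          // Find a block dominating the buffer and every use of its aliases;
          // such a block is a safe anchor for placing allocs or deallocs.
          mlir::Block *dom =
              findCommonDominator(allocValue, aliasSet, dominators);
          (void)dom;
        }
      }

      mlir::DominanceInfo dominators;
    };
    } // end anonymous namespace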

diff --git a/mlir/include/mlir/Transforms/Passes.h b/mlir/include/mlir/Transforms/Passes.h
index ef8524ebd28a..54d348fd3a63 100644
--- a/mlir/include/mlir/Transforms/Passes.h
+++ b/mlir/include/mlir/Transforms/Passes.h
@@ -28,8 +28,17 @@ class AffineForOp;
 // Passes
 //===----------------------------------------------------------------------===//
 
-/// Creates an instance of the BufferPlacement pass.
-std::unique_ptr<Pass> createBufferPlacementPass();
+/// Creates an instance of the BufferDeallocation pass to free all allocated
+/// buffers.
+std::unique_ptr<Pass> createBufferDeallocationPass();
+
+/// Creates a pass that moves allocations upwards to reduce the number of
+/// required copies that are inserted during the BufferDeallocation pass.
+std::unique_ptr<Pass> createBufferHoistingPass();
+
+/// Creates a pass that moves allocations upwards out of loops. This avoids
+/// reallocations inside of loops.
+std::unique_ptr<Pass> createBufferLoopHoistingPass();
 
 /// Creates an instance of the Canonicalizer pass.
 std::unique_ptr<Pass> createCanonicalizerPass();

diff --git a/mlir/include/mlir/Transforms/Passes.td b/mlir/include/mlir/Transforms/Passes.td
index 367e19cfcd55..8174ecd2021a 100644
--- a/mlir/include/mlir/Transforms/Passes.td
+++ b/mlir/include/mlir/Transforms/Passes.td
@@ -102,12 +102,13 @@ def AffinePipelineDataTransfer
   let constructor = "mlir::createPipelineDataTransferPass()";
 }
 
-def BufferPlacement : FunctionPass<"buffer-placement"> {
-  let summary = "Optimizes placement of alloc and dealloc operations";
+def BufferDeallocation : FunctionPass<"buffer-deallocation"> {
+  let summary = "Adds all required dealloc operations for all allocations in the "
+                "input program";
   let description = [{
-    This pass implements an algorithm to optimize the placement of alloc and
-    dealloc operations. This pass also inserts missing dealloc operations
-    automatically to reclaim memory.
+    This pass implements an algorithm to automatically introduce all required
+    deallocation operations for all buffers in the input program. This ensures that
+    the resulting program does not have any memory leaks.
 
 
     Input
@@ -121,7 +122,11 @@ def BufferPlacement : FunctionPass<"buffer-placement"> {
         br ^bb3(%arg1 : memref<2xf32>)
       ^bb2:
         %0 = alloc() : memref<2xf32>
-        linalg.generic {args_in = 1 : i64, args_out = 1 : i64, indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} %arg1, %0 {
+        linalg.generic {
+          args_in = 1 : i64,
+          args_out = 1 : i64,
+          indexing_maps = [#map0, #map0],
+          iterator_types = ["parallel"]} %arg1, %0 {
         ^bb0(%gen1_arg0: f32, %gen1_arg1: f32):
           %tmp1 = exp %gen1_arg0 : f32
           linalg.yield %tmp1 : f32
@@ -141,31 +146,61 @@ def BufferPlacement : FunctionPass<"buffer-placement"> {
     #map0 = affine_map<(d0) -> (d0)>
     module {
       func @condBranch(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) {
-        %0 = alloc() : memref<2xf32>
         cond_br %arg0, ^bb1, ^bb2
-      ^bb1: // pred: ^bb0
-        br ^bb3(%arg1 : memref<2xf32>)
-      ^bb2: // pred: ^bb0
-        linalg.generic {args_in = 1 : i64, args_out = 1 : i64, indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} %arg1, %0 {
-        ^bb0(%arg3: f32, %arg4: f32):       // no predecessors
-          %2 = exp %arg3 : f32
-          linalg.yield %2 : f32
-        }: memref<2xf32>, memref<2xf32>
+      ^bb1:  // pred: ^bb0
+        %0 = alloc() : memref<2xf32>
+        linalg.copy(%arg1, %0) : memref<2xf32>, memref<2xf32>
         br ^bb3(%0 : memref<2xf32>)
-      ^bb3(%1: memref<2xf32>):      // 2 preds: ^bb1, ^bb2
-        linalg.copy(%1, %arg2) : memref<2xf32>, memref<2xf32>
-        dealloc %0 : memref<2xf32>
+      ^bb2:  // pred: ^bb0
+        %1 = alloc() : memref<2xf32>
+        linalg.generic {
+          args_in = 1 : i64,
+          args_out = 1 : i64,
+          indexing_maps = [#map0, #map0],
+          iterator_types = ["parallel"]} %arg1, %1 {
+        ^bb0(%arg3: f32, %arg4: f32):  // no predecessors
+          %4 = exp %arg3 : f32
+          linalg.yield %4 : f32
+        }: memref<2xf32>, memref<2xf32>
+        %2 = alloc() : memref<2xf32>
+        linalg.copy(%1, %2) : memref<2xf32>, memref<2xf32>
+        dealloc %1 : memref<2xf32>
+        br ^bb3(%2 : memref<2xf32>)
+      ^bb3(%3: memref<2xf32>):  // 2 preds: ^bb1, ^bb2
+        linalg.copy(%3, %arg2) : memref<2xf32>, memref<2xf32>
+        dealloc %3 : memref<2xf32>
         return
       }
+
     }
     ```
 
   }];
-  let constructor = "mlir::createBufferPlacementPass()";
+  let constructor = "mlir::createBufferDeallocationPass()";
   // TODO: this pass likely shouldn't depend on Linalg?
   let dependentDialects = ["linalg::LinalgDialect"];
 }
 
+def BufferHoisting : FunctionPass<"buffer-hoisting"> {
+  let summary = "Optimizes placement of allocation operations by moving them "
+                "into common dominators and out of nested regions";
+  let description = [{
+    This pass implements an approach to aggressively move allocations upwards
+    into common dominators and out of nested regions.
+  }];
+  let constructor = "mlir::createBufferHoistingPass()";
+}
+
+def BufferLoopHoisting : FunctionPass<"buffer-loop-hoisting"> {
+  let summary = "Optimizes placement of allocation operations by moving them "
+                "out of loop nests";
+  let description = [{
+    This pass implements an approach to aggressively move allocations upwards
+    out of loop nests. It does not move allocations into common dominators.
+  }];
+  let constructor = "mlir::createBufferLoopHoistingPass()";
+}
+
 def Canonicalizer : Pass<"canonicalize"> {
   let summary = "Canonicalize operations";
   let description = [{

diff --git a/mlir/lib/Transforms/BufferPlacement.cpp b/mlir/lib/Transforms/BufferDeallocation.cpp
similarity index 58%
rename from mlir/lib/Transforms/BufferPlacement.cpp
rename to mlir/lib/Transforms/BufferDeallocation.cpp
index e05169f99aea..5461f27ff275 100644
--- a/mlir/lib/Transforms/BufferPlacement.cpp
+++ b/mlir/lib/Transforms/BufferDeallocation.cpp
@@ -1,4 +1,4 @@
-//===- BufferPlacement.cpp - the impl for buffer placement ---------------===//
+//===- BufferDeallocation.cpp - the impl for buffer deallocation ----------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -8,14 +8,15 @@
 //
 // This file implements logic for computing correct alloc and dealloc positions.
 // Furthermore, buffer placement also adds required new alloc and copy
-// operations to ensure that all buffers are deallocated.The main class is the
-// BufferPlacementPass class that implements the underlying algorithm. In order
-// to put allocations and deallocations at safe positions, it is significantly
-// important to put them into the correct blocks. However, the liveness analysis
-// does not pay attention to aliases, which can occur due to branches (and their
-// associated block arguments) in general. For this purpose, BufferPlacement
-// firstly finds all possible aliases for a single value (using the
-// BufferPlacementAliasAnalysis class). Consider the following example:
+// operations to ensure that all buffers are deallocated. The main class is the
+// BufferDeallocationPass class that implements the underlying algorithm. In
+// order to put allocations and deallocations at safe positions, it is
+// crucial to place them in the correct blocks. However, the
+// liveness analysis does not pay attention to aliases, which can occur due to
+// branches (and their associated block arguments) in general. For this purpose,
+// BufferDeallocation firstly finds all possible aliases for a single value
+// (using the BufferPlacementAliasAnalysis class). Consider the following
+// example:
 //
 // ^bb0(%arg0):
 //   cond_br %cond, ^bb1, ^bb2
@@ -27,13 +28,9 @@
 // ^exit(%arg1):
 //   return %arg1;
 //
-// Using liveness information on its own would cause us to place the allocs and
-// deallocs in the wrong block. This is due to the fact that %new_value will not
-// be liveOut of its block. Instead, we can place the alloc for %new_value
-// in bb0 and its associated dealloc in exit. Alternatively, the alloc can stay
-// (or even has to stay due to additional dependencies) at this location and we
-// have to free the buffer in the same block, because it cannot be freed in the
-// post dominator. However, this requires a new copy buffer for %arg1 that will
+// We should place the dealloc for %new_value in exit. However, we have to free
+// the buffer in the same block, because it cannot be freed in the post
+// dominator. This, however, requires a new copy buffer for %arg1 that will
 // contain the actual contents. Using the class BufferPlacementAliasAnalysis, we
 // will find out that %new_value has a potential alias %arg1. In order to find
 // the dealloc position we have to find all potential aliases, iterate over
@@ -42,10 +39,9 @@
 // of the deallocs). In all cases, the computed block can be safely used to free
 // the %new_value buffer (may be exit or bb2) as it will die and we can use
 // liveness information to determine the exact operation after which we have to
-// insert the dealloc. Finding the alloc position is similar and non-obvious.
-// However, the algorithm supports moving allocs to other places and introducing
-// copy buffers and placing deallocs in safe places to ensure that all buffers
-// will be freed in the end.
+// insert the dealloc. However, the algorithm supports introducing copy buffers
+// and placing deallocs in safe locations to ensure that all buffers will be
+// freed in the end.
 //
 // TODO:
 // The current implementation does not support explicit-control-flow loops and
@@ -56,13 +52,13 @@
 //===----------------------------------------------------------------------===//
 
 #include "PassDetail.h"
-#include "mlir/Analysis/Liveness.h"
 #include "mlir/Dialect/Linalg/IR/LinalgOps.h"
 #include "mlir/Dialect/StandardOps/IR/Ops.h"
-#include "mlir/IR/Dominance.h"
 #include "mlir/IR/Operation.h"
 #include "mlir/Interfaces/ControlFlowInterfaces.h"
+#include "mlir/Interfaces/LoopLikeInterface.h"
 #include "mlir/Pass/Pass.h"
+#include "mlir/Transforms/Bufferize.h"
 #include "mlir/Transforms/Passes.h"
 #include "llvm/ADT/SetOperations.h"
 
@@ -95,134 +91,249 @@ static void getSuccessorRegions(RegionBranchOpInterface regionInterface,
   regionInterface.getSuccessorRegions(index, operandAttributes, successors);
 }
 
-namespace {
 //===----------------------------------------------------------------------===//
 // BufferPlacementAliasAnalysis
 //===----------------------------------------------------------------------===//
 
-/// A straight-forward alias analysis which ensures that all aliases of all
-/// values will be determined. This is a requirement for the BufferPlacement
-/// class since you need to determine safe positions to place alloc and
-/// deallocs.
-class BufferPlacementAliasAnalysis {
-public:
-  using ValueSetT = SmallPtrSet<Value, 16>;
-  using ValueMapT = llvm::DenseMap<Value, ValueSetT>;
-
-public:
-  /// Constructs a new alias analysis using the op provided.
-  BufferPlacementAliasAnalysis(Operation *op) { build(op); }
-
-  /// Find all immediate aliases this value could potentially have.
-  ValueMapT::const_iterator find(Value value) const {
-    return aliases.find(value);
-  }
-
-  /// Returns the end iterator that can be used in combination with find.
-  ValueMapT::const_iterator end() const { return aliases.end(); }
-
-  /// Find all immediate and indirect aliases this value could potentially
-  /// have. Note that the resulting set will also contain the value provided as
-  /// it is an alias of itself.
-  ValueSetT resolve(Value value) const {
-    ValueSetT result;
-    resolveRecursive(value, result);
-    return result;
-  }
+/// Constructs a new alias analysis using the op provided.
+BufferPlacementAliasAnalysis::BufferPlacementAliasAnalysis(Operation *op) {
+  build(op);
+}
 
-  /// Removes the given values from all alias sets.
-  void remove(const SmallPtrSetImpl<Value> &aliasValues) {
-    for (auto &entry : aliases)
-      llvm::set_subtract(entry.second, aliasValues);
-  }
+/// Find all immediate and indirect aliases this value could potentially
+/// have. Note that the resulting set will also contain the value provided as
+/// it is an alias of itself.
+BufferPlacementAliasAnalysis::ValueSetT
+BufferPlacementAliasAnalysis::resolve(Value value) const {
+  ValueSetT result;
 
-private:
   /// Recursively determines alias information for the given value. It stores
   /// all newly found potential aliases in the given result set.
-  void resolveRecursive(Value value, ValueSetT &result) const {
-    if (!result.insert(value).second)
+  std::function<void(Value)> resolveRecursive = [&](Value current) {
+    if (!result.insert(current).second)
       return;
-    auto it = aliases.find(value);
+    auto it = aliases.find(current);
     if (it == aliases.end())
       return;
     for (Value alias : it->second)
-      resolveRecursive(alias, result);
-  }
+      resolveRecursive(alias);
+  };
 
-  /// This function constructs a mapping from values to its immediate aliases.
-  /// It iterates over all blocks, gets their predecessors, determines the
-  /// values that will be passed to the corresponding block arguments and
-  /// inserts them into the underlying map. Furthermore, it wires successor
-  /// regions and branch-like return operations from nested regions.
-  void build(Operation *op) {
-    // Registers all aliases of the given values.
-    auto registerAliases = [&](auto values, auto aliases) {
-      for (auto entry : llvm::zip(values, aliases))
-        this->aliases[std::get<0>(entry)].insert(std::get<1>(entry));
-    };
+  resolveRecursive(value);
+  return result;
+}
 
-    // Add additional aliases created by view changes to the alias list.
-    op->walk([&](ViewLikeOpInterface viewInterface) {
-      aliases[viewInterface.getViewSource()].insert(
-          viewInterface.getOperation()->getResult(0));
-    });
+/// Removes the given values from all alias sets.
+void BufferPlacementAliasAnalysis::remove(
+    const SmallPtrSetImpl<Value> &aliasValues) {
+  for (auto &entry : aliases)
+    llvm::set_subtract(entry.second, aliasValues);
+}
 
-    // Query all branch interfaces to link block argument aliases.
-    op->walk([&](BranchOpInterface branchInterface) {
-      Block *parentBlock = branchInterface.getOperation()->getBlock();
-      for (auto it = parentBlock->succ_begin(), e = parentBlock->succ_end();
-           it != e; ++it) {
-        // Query the branch op interface to get the successor operands.
-        auto successorOperands =
-            branchInterface.getSuccessorOperands(it.getIndex());
-        if (!successorOperands.hasValue())
-          continue;
-        // Build the actual mapping of values to their immediate aliases.
-        registerAliases(successorOperands.getValue(), (*it)->getArguments());
-      }
-    });
+/// This function constructs a mapping from values to their immediate aliases.
+/// It iterates over all blocks, gets their predecessors, determines the
+/// values that will be passed to the corresponding block arguments and
+/// inserts them into the underlying map. Furthermore, it wires successor
+/// regions and branch-like return operations from nested regions.
+void BufferPlacementAliasAnalysis::build(Operation *op) {
+  // Registers all aliases of the given values.
+  auto registerAliases = [&](auto values, auto aliases) {
+    for (auto entry : llvm::zip(values, aliases))
+      this->aliases[std::get<0>(entry)].insert(std::get<1>(entry));
+  };
 
-    // Query the RegionBranchOpInterface to find potential successor regions.
-    op->walk([&](RegionBranchOpInterface regionInterface) {
-      // Extract all entry regions and wire all initial entry successor inputs.
-      SmallVector<RegionSuccessor, 2> entrySuccessors;
-      getSuccessorRegions(regionInterface, /*index=*/llvm::None,
-                          entrySuccessors);
-      for (RegionSuccessor &entrySuccessor : entrySuccessors) {
-        // Wire the entry region's successor arguments with the initial
-        // successor inputs.
-        assert(entrySuccessor.getSuccessor() &&
-               "Invalid entry region without an attached successor region");
-        registerAliases(regionInterface.getSuccessorEntryOperands(
-                            entrySuccessor.getSuccessor()->getRegionNumber()),
-                        entrySuccessor.getSuccessorInputs());
-      }
+  // Add additional aliases created by view changes to the alias list.
+  op->walk([&](ViewLikeOpInterface viewInterface) {
+    aliases[viewInterface.getViewSource()].insert(
+        viewInterface.getOperation()->getResult(0));
+  });
+
+  // Query all branch interfaces to link block argument aliases.
+  op->walk([&](BranchOpInterface branchInterface) {
+    Block *parentBlock = branchInterface.getOperation()->getBlock();
+    for (auto it = parentBlock->succ_begin(), e = parentBlock->succ_end();
+         it != e; ++it) {
+      // Query the branch op interface to get the successor operands.
+      auto successorOperands =
+          branchInterface.getSuccessorOperands(it.getIndex());
+      if (!successorOperands.hasValue())
+        continue;
+      // Build the actual mapping of values to their immediate aliases.
+      registerAliases(successorOperands.getValue(), (*it)->getArguments());
+    }
+  });
+
+  // Query the RegionBranchOpInterface to find potential successor regions.
+  op->walk([&](RegionBranchOpInterface regionInterface) {
+    // Extract all entry regions and wire all initial entry successor inputs.
+    SmallVector<RegionSuccessor, 2> entrySuccessors;
+    getSuccessorRegions(regionInterface, /*index=*/llvm::None, entrySuccessors);
+    for (RegionSuccessor &entrySuccessor : entrySuccessors) {
+      // Wire the entry region's successor arguments with the initial
+      // successor inputs.
+      assert(entrySuccessor.getSuccessor() &&
+             "Invalid entry region without an attached successor region");
+      registerAliases(regionInterface.getSuccessorEntryOperands(
+                          entrySuccessor.getSuccessor()->getRegionNumber()),
+                      entrySuccessor.getSuccessorInputs());
+    }
 
-      // Wire flow between regions and from region exits.
-      for (Region &region : regionInterface.getOperation()->getRegions()) {
-        // Iterate over all successor region entries that are reachable from the
-        // current region.
-        SmallVector<RegionSuccessor, 2> successorRegions;
-        getSuccessorRegions(regionInterface, region.getRegionNumber(),
-                            successorRegions);
-        for (RegionSuccessor &successorRegion : successorRegions) {
-          // Iterate over all immediate terminator operations and wire the
-          // successor inputs with the operands of each terminator.
-          walkReturnOperations(&region, [&](Operation *terminator) {
-            registerAliases(terminator->getOperands(),
-                            successorRegion.getSuccessorInputs());
-          });
-        }
+    // Wire flow between regions and from region exits.
+    for (Region &region : regionInterface.getOperation()->getRegions()) {
+      // Iterate over all successor region entries that are reachable from the
+      // current region.
+      SmallVector<RegionSuccessor, 2> successorRegions;
+      getSuccessorRegions(regionInterface, region.getRegionNumber(),
+                          successorRegions);
+      for (RegionSuccessor &successorRegion : successorRegions) {
+        // Iterate over all immediate terminator operations and wire the
+        // successor inputs with the operands of each terminator.
+        walkReturnOperations(&region, [&](Operation *terminator) {
+          registerAliases(terminator->getOperands(),
+                          successorRegion.getSuccessorInputs());
+        });
       }
+    }
+  });
+}
+
+//===----------------------------------------------------------------------===//
+// BufferPlacementAllocs
+//===----------------------------------------------------------------------===//
+
+/// Get the start operation to place the given alloc value within the
+/// specified placement block.
+Operation *BufferPlacementAllocs::getStartOperation(Value allocValue,
+                                                    Block *placementBlock,
+                                                    const Liveness &liveness) {
+  // We have to ensure that we place the alloc before its first use in this
+  // block.
+  const LivenessBlockInfo &livenessInfo = *liveness.getLiveness(placementBlock);
+  Operation *startOperation = livenessInfo.getStartOperation(allocValue);
+  // Check whether the start operation lies in the desired placement block.
+  // If not, we will use the terminator as this is the last operation in
+  // this block.
+  if (startOperation->getBlock() != placementBlock) {
+    Operation *opInPlacementBlock =
+        placementBlock->findAncestorOpInBlock(*startOperation);
+    startOperation = opInPlacementBlock ? opInPlacementBlock
+                                        : placementBlock->getTerminator();
+  }
+
+  return startOperation;
+}
+
+/// Find an associated dealloc operation that is linked to the given
+/// allocation node (if any).
+Operation *BufferPlacementAllocs::findDealloc(Value allocValue) {
+  auto userIt = llvm::find_if(allocValue.getUsers(), [&](Operation *user) {
+    auto effectInterface = dyn_cast<MemoryEffectOpInterface>(user);
+    if (!effectInterface)
+      return false;
+    // Try to find a free effect that is applied to one of our values
+    // that will be automatically freed by our pass.
+    SmallVector<MemoryEffects::EffectInstance, 2> effects;
+    effectInterface.getEffectsOnValue(allocValue, effects);
+    return llvm::any_of(effects, [&](MemoryEffects::EffectInstance &it) {
+      return isa<MemoryEffects::Free>(it.getEffect());
     });
+  });
+  // Assign the associated dealloc operation (if any).
+  return userIt != allocValue.user_end() ? *userIt : nullptr;
+}
+
+/// Initializes the internal list by discovering all supported allocation
+/// nodes.
+BufferPlacementAllocs::BufferPlacementAllocs(Operation *op) { build(op); }
+
+/// Searches for and registers all supported allocation entries.
+void BufferPlacementAllocs::build(Operation *op) {
+  op->walk([&](MemoryEffectOpInterface opInterface) {
+    // Try to find a single allocation result.
+    SmallVector<MemoryEffects::EffectInstance, 2> effects;
+    opInterface.getEffects(effects);
+
+    SmallVector<MemoryEffects::EffectInstance, 2> allocateResultEffects;
+    llvm::copy_if(
+        effects, std::back_inserter(allocateResultEffects),
+        [=](MemoryEffects::EffectInstance &it) {
+          Value value = it.getValue();
+          return isa<MemoryEffects::Allocate>(it.getEffect()) && value &&
+                 value.isa<OpResult>() &&
+                 it.getResource() !=
+                     SideEffects::AutomaticAllocationScopeResource::get();
+        });
+    // If there is one result only, we will be able to move the allocation and
+    // (possibly existing) deallocation ops.
+    if (allocateResultEffects.size() != 1)
+      return;
+    // Get allocation result.
+    Value allocValue = allocateResultEffects[0].getValue();
+    // Find the associated dealloc value and register the allocation entry.
+    allocs.push_back(std::make_tuple(allocValue, findDealloc(allocValue)));
+  });
+}
+
+//===----------------------------------------------------------------------===//
+// BufferPlacementTransformationBase
+//===----------------------------------------------------------------------===//
+
+/// Constructs a new transformation base using the given root operation.
+BufferPlacementTransformationBase::BufferPlacementTransformationBase(
+    Operation *op)
+    : aliases(op), allocs(op), liveness(op) {}
+
+/// Returns true if the given operation represents a loop by testing whether it
+/// implements the `LoopLikeOpInterface` or the `RegionBranchOpInterface`. In
+/// the case of a `RegionBranchOpInterface`, it checks all region-based control-
+/// flow edges for cycles.
+bool BufferPlacementTransformationBase::isLoop(Operation *op) {
+  // If the operation implements the `LoopLikeOpInterface` it can be considered
+  // a loop.
+  if (isa<LoopLikeOpInterface>(op))
+    return true;
+
+  // If the operation does not implement the `RegionBranchOpInterface`, it is
+  // (currently) not possible to detect a loop.
+  RegionBranchOpInterface regionInterface;
+  if (!(regionInterface = dyn_cast<RegionBranchOpInterface>(op)))
+    return false;
+
+  // Recurses into a region using the current region interface to find potential
+  // cycles.
+  SmallPtrSet<Region *, 4> visitedRegions;
+  std::function<bool(Region *)> recurse = [&](Region *current) {
+    if (!current)
+      return false;
+    // If we have found a back edge, the parent operation induces a loop.
+    if (!visitedRegions.insert(current).second)
+      return true;
+    // Recurses into all region successors.
+    SmallVector<RegionSuccessor, 2> successors;
+    getSuccessorRegions(regionInterface, current->getRegionNumber(),
+                        successors);
+    for (RegionSuccessor &regionEntry : successors)
+      if (recurse(regionEntry.getSuccessor()))
+        return true;
+    return false;
+  };
+
+  // Start with all entry regions and test whether they induce a loop.
+  SmallVector<RegionSuccessor, 2> successorRegions;
+  getSuccessorRegions(regionInterface, /*index=*/llvm::None, successorRegions);
+  for (RegionSuccessor &regionEntry : successorRegions) {
+    if (recurse(regionEntry.getSuccessor()))
+      return true;
+    visitedRegions.clear();
   }
 
-  /// Maps values to all immediate aliases this value can have.
-  ValueMapT aliases;
-};
+  return false;
+}
+
+namespace {
 
 //===----------------------------------------------------------------------===//
-// Backedges
+// Backedges analysis
 //===----------------------------------------------------------------------===//
 
 /// A straight-forward program analysis which detects loop backedges induced by
@@ -284,7 +395,7 @@ class Backedges {
       return;
 
     // Recurse into all operations and successor blocks.
-    for (auto &op : block.getOperations())
+    for (Operation &op : block.getOperations())
       recurse(&op, predecessor);
 
     // Leave the current block.
@@ -299,133 +410,28 @@ class Backedges {
 };
 
 //===----------------------------------------------------------------------===//
-// BufferPlacement
+// BufferDeallocation
 //===----------------------------------------------------------------------===//
 
-// The main buffer placement analysis used to place allocs, copies and deallocs.
-class BufferPlacement {
-public:
-  using ValueSetT = BufferPlacementAliasAnalysis::ValueSetT;
-
-  /// An intermediate representation of a single allocation node.
-  struct AllocEntry {
-    /// A reference to the associated allocation node.
-    Value allocValue;
-
-    /// The associated placement block in which the allocation should be
-    /// performed.
-    Block *placementBlock;
-
-    /// The associated dealloc operation (if any).
-    Operation *deallocOperation;
-  };
-
-  using AllocEntryList = SmallVector<AllocEntry, 8>;
-
+/// The buffer deallocation transformation which ensures that all allocs in the
+/// program have a corresponding de-allocation. As a side-effect, it might also
+/// introduce copies that in turn lead to additional allocs and de-allocations.
+class BufferDeallocation : BufferPlacementTransformationBase {
 public:
-  BufferPlacement(Operation *op)
-      : operation(op), aliases(op), liveness(op), dominators(op),
-        postDominators(op) {
-    // Gather all allocation nodes
-    initBlockMapping();
-  }
+  BufferDeallocation(Operation *op)
+      : BufferPlacementTransformationBase(op), dominators(op),
+        postDominators(op) {}
 
-  /// Performs the actual placement/creation of all alloc, copy and dealloc
-  /// nodes.
-  void place() {
-    // Place all allocations.
-    placeAllocs();
+  /// Performs the actual placement/creation of all temporary alloc, copy and
+  /// dealloc nodes.
+  void deallocate() {
     // Add additional allocations and copies that are required.
     introduceCopies();
-    // Find all associated dealloc nodes.
-    findDeallocs();
     // Place deallocations for all allocation entries.
     placeDeallocs();
   }
 
 private:
-  /// Initializes the internal block mapping by discovering allocation nodes. It
-  /// maps all allocation nodes to their initial block in which they can be
-  /// safely allocated.
-  void initBlockMapping() {
-    operation->walk([&](MemoryEffectOpInterface opInterface) {
-      // Try to find a single allocation result.
-      SmallVector<MemoryEffects::EffectInstance, 2> effects;
-      opInterface.getEffects(effects);
-
-      SmallVector<MemoryEffects::EffectInstance, 2> allocateResultEffects;
-      llvm::copy_if(
-          effects, std::back_inserter(allocateResultEffects),
-          [=](MemoryEffects::EffectInstance &it) {
-            Value value = it.getValue();
-            return isa<MemoryEffects::Allocate>(it.getEffect()) && value &&
-                   value.isa<OpResult>() &&
-                   it.getResource() !=
-                       SideEffects::AutomaticAllocationScopeResource::get();
-          });
-      // If there is one result only, we will be able to move the allocation and
-      // (possibly existing) deallocation ops.
-      if (allocateResultEffects.size() != 1)
-        return;
-      // Get allocation result.
-      auto allocResult = allocateResultEffects[0].getValue().cast<OpResult>();
-      // Find the initial allocation block and register this result.
-      allocs.push_back(
-          {allocResult, getInitialAllocBlock(allocResult), nullptr});
-    });
-  }
-
-  /// Computes a valid allocation position in a dominator (if possible) for the
-  /// given allocation result.
-  Block *getInitialAllocBlock(OpResult result) {
-    // Get all allocation operands as these operands are important for the
-    // allocation operation.
-    Operation *owner = result.getOwner();
-    auto operands = owner->getOperands();
-    Block *dominator;
-    if (operands.size() < 1)
-      dominator =
-          findCommonDominator(result, aliases.resolve(result), dominators);
-    else {
-      // If this node has dependencies, check all dependent nodes with respect
-      // to a common post dominator in which all values are available.
-      ValueSetT dependencies(++operands.begin(), operands.end());
-      dominator =
-          findCommonDominator(*operands.begin(), dependencies, postDominators);
-    }
-
-    // Do not move allocs out of their parent regions to keep them local.
-    if (dominator->getParent() != owner->getParentRegion())
-      return &owner->getParentRegion()->front();
-    return dominator;
-  }
-
-  /// Finds correct alloc positions according to the algorithm described at
-  /// the top of the file for all alloc nodes that can be handled by this
-  /// analysis.
-  void placeAllocs() const {
-    for (const AllocEntry &entry : allocs) {
-      Value alloc = entry.allocValue;
-      // Get the actual block to place the alloc and get liveness information
-      // for the placement block.
-      Block *placementBlock = entry.placementBlock;
-      // We have to ensure that we place the alloc before its first use in this
-      // block.
-      const LivenessBlockInfo *livenessInfo =
-          liveness.getLiveness(placementBlock);
-      Operation *startOperation = livenessInfo->getStartOperation(alloc);
-      // Check whether the start operation lies in the desired placement block.
-      // If not, we will use the terminator as this is the last operation in
-      // this block.
-      if (startOperation->getBlock() != placementBlock)
-        startOperation = placementBlock->getTerminator();
-
-      // Move the alloc in front of the start operation.
-      Operation *allocOperation = alloc.getDefiningOp();
-      allocOperation->moveBefore(startOperation);
-    }
-  }
-
   /// Introduces required allocs and copy operations to avoid memory leaks.
   void introduceCopies() {
     // Initialize the set of values that require a dedicated memory free
@@ -462,9 +468,10 @@ class BufferPlacement {
     };
 
     // Detect possibly unsafe aliases starting from all allocations.
-    for (auto &entry : allocs)
-      findUnsafeValues(entry.allocValue, entry.placementBlock);
-
+    for (BufferPlacementAllocs::AllocEntry &entry : allocs) {
+      Value allocValue = std::get<0>(entry);
+      findUnsafeValues(allocValue, allocValue.getDefiningOp()->getBlock());
+    }
     // Try to find block arguments that require an explicit free operation
     // until we reach a fix point.
     while (!toProcess.empty()) {
@@ -486,7 +493,7 @@ class BufferPlacement {
       // Register the value to require a final dealloc. Note that we do not have
       // to assign a block here since we do not want to move the allocation node
       // to another location.
-      allocs.push_back({value, nullptr, nullptr});
+      allocs.registerAlloc(std::make_tuple(value, nullptr));
     }
   }
 
@@ -542,7 +549,8 @@ class BufferPlacement {
     // parent operation. In this case, we have to introduce an additional copy
     // for buffer that is passed to the argument.
     SmallVector<RegionSuccessor, 2> successorRegions;
-    getSuccessorRegions(regionInterface, llvm::None, successorRegions);
+    getSuccessorRegions(regionInterface, /*index=*/llvm::None,
+                        successorRegions);
     auto *it =
         llvm::find_if(successorRegions, [&](RegionSuccessor &successorRegion) {
           return successorRegion.getSuccessor() == argRegion;
@@ -664,30 +672,6 @@ class BufferPlacement {
     return alloc;
   }
 
-  /// Finds associated deallocs that can be linked to our allocation nodes (if
-  /// any).
-  void findDeallocs() {
-    for (auto &entry : allocs) {
-      auto userIt =
-          llvm::find_if(entry.allocValue.getUsers(), [&](Operation *user) {
-            auto effectInterface = dyn_cast<MemoryEffectOpInterface>(user);
-            if (!effectInterface)
-              return false;
-            // Try to find a free effect that is applied to one of our values
-            // that will be automatically freed by our pass.
-            SmallVector<MemoryEffects::EffectInstance, 2> effects;
-            effectInterface.getEffectsOnValue(entry.allocValue, effects);
-            return llvm::any_of(
-                effects, [&](MemoryEffects::EffectInstance &it) {
-                  return isa<MemoryEffects::Free>(it.getEffect());
-                });
-          });
-      // Assign the associated dealloc operation (if any).
-      if (userIt != entry.allocValue.user_end())
-        entry.deallocOperation = *userIt;
-    }
-  }
-
   /// Finds correct dealloc positions according to the algorithm described at
   /// the top of the file for all alloc nodes and block arguments that can be
   /// handled by this analysis.
@@ -696,8 +680,8 @@ class BufferPlacement {
     // These deallocations will be linked to their associated allocation nodes
     // since they don't have any aliases that can (potentially) increase their
     // liveness.
-    for (const AllocEntry &entry : allocs) {
-      Value alloc = entry.allocValue;
+    for (const BufferPlacementAllocs::AllocEntry &entry : allocs) {
+      Value alloc = std::get<0>(entry);
       auto aliasesSet = aliases.resolve(alloc);
       assert(aliasesSet.size() > 0 && "must contain at least one alias");
 
@@ -713,11 +697,21 @@ class BufferPlacement {
       // the placementBlock and that we can safely place the dealloc at the
       // beginning.
       Operation *endOperation = &placementBlock->front();
+
       // Iterate over all aliases and ensure that the endOperation will point
       // to the last operation of all potential aliases in the placementBlock.
       for (Value alias : aliasesSet) {
+        // Ensure that the start operation is at least the defining operation of
+        // the current alias to avoid invalid placement of deallocs for aliases
+        // without any uses.
+        Operation *beforeOp = endOperation;
+        if (alias.getDefiningOp() &&
+            !(beforeOp = placementBlock->findAncestorOpInBlock(
+                  *alias.getDefiningOp())))
+          continue;
+
         Operation *aliasEndOperation =
-            livenessInfo->getEndOperation(alias, endOperation);
+            livenessInfo->getEndOperation(alias, beforeOp);
         // Check whether the aliasEndOperation lies in the desired block and
         // whether it is behind the current endOperation. If yes, this will be
         // the new endOperation.
@@ -729,8 +723,9 @@ class BufferPlacement {
       // the dealloc taking all potential aliases into account.
 
       // If there is an existing dealloc, move it to the right place.
-      if (entry.deallocOperation) {
-        entry.deallocOperation->moveAfter(endOperation);
+      Operation *deallocOperation = std::get<1>(entry);
+      if (deallocOperation) {
+        deallocOperation->moveAfter(endOperation);
       } else {
         // If the Dealloc position is at the terminator operation of the
         // block, then the value should escape from a deallocation.
@@ -744,56 +739,26 @@ class BufferPlacement {
     }
   }
 
-  /// Finds a common dominator for the given value while taking the positions
-  /// of the values in the value set into account. It supports dominator and
-  /// post-dominator analyses via template arguments.
-  template <typename DominatorT>
-  Block *findCommonDominator(Value value, const ValueSetT &values,
-                             const DominatorT &doms) const {
-    // Start with the current block the value is defined in.
-    Block *dom = value.getParentBlock();
-    // Iterate over all aliases and their uses to find a safe placement block
-    // according to the given dominator information.
-    for (Value childValue : values)
-      for (Operation *user : childValue.getUsers()) {
-        // Move upwards in the dominator tree to find an appropriate
-        // dominator block that takes the current use into account.
-        dom = doms.findNearestCommonDominator(dom, user->getBlock());
-      }
-    return dom;
-  }
-
-  /// The operation this transformation was constructed from.
-  Operation *operation;
-
-  /// Alias information that can be updated during the insertion of copies.
-  BufferPlacementAliasAnalysis aliases;
-
-  /// Maps allocation nodes to their associated blocks.
-  AllocEntryList allocs;
-
-  // Stores already copied allocations to avoid additional copies of copies.
-  ValueSetT copiedValues;
-
-  /// The underlying liveness analysis to compute fine grained information
-  /// about alloc and dealloc positions.
-  Liveness liveness;
-
-  /// The dominator analysis to place deallocs in the appropriate blocks.
+  /// The dominator analysis to place deallocs in the appropriate blocks.
   DominanceInfo dominators;
 
-  /// The post dominator analysis to place deallocs in the appropriate blocks.
+  /// The post dominator analysis to place deallocs in the appropriate blocks.
   PostDominanceInfo postDominators;
+
+  /// Stores already copied allocations to avoid additional copies of copies.
+  ValueSetT copiedValues;
 };
 
 //===----------------------------------------------------------------------===//
-// BufferPlacementPass
+// BufferDeallocationPass
 //===----------------------------------------------------------------------===//
 
-/// The actual buffer placement pass that moves alloc and dealloc nodes into
-/// the right positions. It uses the algorithm described at the top of the
-/// file.
-struct BufferPlacementPass : BufferPlacementBase<BufferPlacementPass> {
+/// The actual buffer deallocation pass that inserts and moves dealloc nodes
+/// into the right positions. Furthermore, it inserts additional allocs and
+/// copies if necessary. It uses the algorithm described at the top of the file.
+struct BufferDeallocationPass : BufferDeallocationBase<BufferDeallocationPass> {
 
   void runOnFunction() override {
     // Ensure that there are supported loops only.
@@ -804,18 +769,18 @@ struct BufferPlacementPass : BufferPlacementBase<BufferPlacementPass> {
       return;
     }
 
-    // Place all required alloc, copy and dealloc nodes.
-    BufferPlacement placement(getFunction());
-    placement.place();
+    // Place all required temporary alloc, copy and dealloc nodes.
+    BufferDeallocation deallocation(getFunction());
+    deallocation.deallocate();
   }
 };
 
 } // end anonymous namespace
 
 //===----------------------------------------------------------------------===//
-// BufferPlacementPass construction
+// BufferDeallocationPass construction
 //===----------------------------------------------------------------------===//
 
-std::unique_ptr<Pass> mlir::createBufferPlacementPass() {
-  return std::make_unique<BufferPlacementPass>();
+std::unique_ptr<Pass> mlir::createBufferDeallocationPass() {
+  return std::make_unique<BufferDeallocationPass>();
 }
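
A shared helper worth calling out is BufferPlacementTransformationBase::isLoop,
implemented above: an operation counts as a loop if it implements
LoopLikeOpInterface, or if it implements RegionBranchOpInterface and its
region-based control-flow edges contain a cycle. A minimal, illustrative use,
assuming a hypothetical function name that mirrors the hoisting states defined
in BufferOptimizations.cpp below:

    #include "mlir/Transforms/Bufferize.h"

    // Hoisting an allocation above a loop-like operation would change how
    // often the buffer is allocated, so treat loops as a hoisting barrier.
    static bool mayHoistAllocAbove(mlir::Operation *op) {
      return !mlir::BufferPlacementTransformationBase::isLoop(op);
    }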

diff --git a/mlir/lib/Transforms/BufferOptimizations.cpp b/mlir/lib/Transforms/BufferOptimizations.cpp
new file mode 100644
index 000000000000..2686d35ee323
--- /dev/null
+++ b/mlir/lib/Transforms/BufferOptimizations.cpp
@@ -0,0 +1,258 @@
+//===- BufferOptimizations.cpp - pre-pass optimizations for bufferization -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements logic for two optimization passes. These passes try to
+// hoist alloc nodes to reduce the number of allocations and copies during
+// buffer deallocation.
+
+#include "PassDetail.h"
+#include "mlir/IR/Operation.h"
+#include "mlir/Interfaces/LoopLikeInterface.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Transforms/Bufferize.h"
+#include "mlir/Transforms/Passes.h"
+
+using namespace mlir;
+
+namespace {
+
+//===----------------------------------------------------------------------===//
+// BufferAllocationHoisting
+//===----------------------------------------------------------------------===//
+
+/// A base implementation compatible with the `BufferAllocationHoisting` class.
+struct BufferAllocationHoistingStateBase {
+  /// A pointer to the current dominance info.
+  DominanceInfo *dominators;
+
+  /// The current allocation value.
+  Value allocValue;
+
+  /// The current placement block (if any).
+  Block *placementBlock;
+
+  /// Initializes the state base.
+  BufferAllocationHoistingStateBase(DominanceInfo *dominators, Value allocValue,
+                                    Block *placementBlock)
+      : dominators(dominators), allocValue(allocValue),
+        placementBlock(placementBlock) {}
+};
+
+/// Implements the actual hoisting logic for allocation nodes.
+template <typename StateT>
+class BufferAllocationHoisting : public BufferPlacementTransformationBase {
+private:
+  /// Returns true if the given operation implements a known high-level region-
+  /// based control-flow interface.
+  static bool isKnownControlFlowInterface(Operation *op) {
+    return isa<LoopLikeOpInterface, RegionBranchOpInterface>(op);
+  }
+
+public:
+  BufferAllocationHoisting(Operation *op)
+      : BufferPlacementTransformationBase(op), dominators(op),
+        postDominators(op) {}
+
+  /// Moves allocations upwards.
+  void hoist() {
+    for (BufferPlacementAllocs::AllocEntry &entry : allocs) {
+      Value allocValue = std::get<0>(entry);
+      Operation *definingOp = allocValue.getDefiningOp();
+      assert(definingOp && "No defining op");
+      auto operands = definingOp->getOperands();
+      auto resultAliases = aliases.resolve(allocValue);
+      // Determine the common dominator block of all aliases.
+      Block *dominatorBlock =
+          findCommonDominator(allocValue, resultAliases, dominators);
+      // Initialize the hoisting state.
+      StateT state(&dominators, allocValue, allocValue.getParentBlock());
+      // Check for additional allocation dependencies to compute an upper bound
+      // for hoisting.
+      Block *dependencyBlock = nullptr;
+      if (!operands.empty()) {
+        // If this node has dependencies, check all dependent nodes with respect
+        // to a common post dominator. This ensures that all dependency values
+        // have been computed before allocating the buffer.
+        ValueSetT dependencies(std::next(operands.begin()), operands.end());
+        dependencyBlock = findCommonDominator(*operands.begin(), dependencies,
+                                              postDominators);
+      }
+
+      // Find the actual placement block and determine the start operation using
+      // an upper placement-block boundary. The idea is that the placement
+      // block cannot be moved any further upwards than the given upper bound.
+      Block *placementBlock = findPlacementBlock(
+          state, state.computeUpperBound(dominatorBlock, dependencyBlock));
+      Operation *startOperation = BufferPlacementAllocs::getStartOperation(
+          allocValue, placementBlock, liveness);
+
+      // Move the alloc in front of the start operation.
+      Operation *allocOperation = allocValue.getDefiningOp();
+      allocOperation->moveBefore(startOperation);
+    }
+  }
+
+private:
+  /// Finds a valid placement block by walking upwards in the CFG until we
+  /// either cannot continue our walk due to constraints (given by the StateT
+  /// implementation) or we have reached the upper-most dominator block.
+  Block *findPlacementBlock(StateT &state, Block *upperBound) {
+    Block *currentBlock = state.placementBlock;
+    // Walk from the innermost regions/loops to the outermost regions/loops and
+    // find an appropriate placement block that satisfies the constraint of the
+    // current StateT implementation. Walk until we reach the upperBound block
+    // (if any).
+
+    // If we are not able to find a valid parent operation or an associated
+    // parent block, break the walk loop.
+    Operation *parentOp;
+    Block *parentBlock;
+    while ((parentOp = currentBlock->getParentOp()) &&
+           (parentBlock = parentOp->getBlock()) &&
+           (!upperBound ||
+            dominators.properlyDominates(upperBound, currentBlock))) {
+      // Try to find an immediate dominator and check whether the parent block
+      // is above the immediate dominator (if any).
+      DominanceInfoNode *idom = dominators.getNode(currentBlock)->getIDom();
+      if (idom && dominators.properlyDominates(parentBlock, idom->getBlock())) {
+        // If the current immediate dominator is below the placement block, move
+        // to the immediate dominator block.
+        currentBlock = idom->getBlock();
+        state.recordMoveToDominator(currentBlock);
+      } else {
+        // We have to move to our parent block since an immediate dominator
+        // either does not exist or is above our parent block. If we cannot
+        // move to our parent operation due to constraints given by the StateT
+        // implementation, break the walk loop. Furthermore, we should not move
+        // allocations out of unknown region-based control-flow operations.
+        if (!isKnownControlFlowInterface(parentOp) ||
+            !state.isLegalPlacement(parentOp))
+          break;
+        // Move to our parent block by notifying the current StateT
+        // implementation.
+        currentBlock = parentBlock;
+        state.recordMoveToParent(currentBlock);
+      }
+    }
+    // Return the finally determined placement block.
+    return state.placementBlock;
+  }
+
+  /// The dominator info to find the appropriate start operation to move the
+  /// allocs.
+  DominanceInfo dominators;
+
+  /// The post dominator info to move the dependent allocs in the right
+  /// position.
+  PostDominanceInfo postDominators;
+
+  /// The map storing the final placement blocks of a given alloc value.
+  llvm::DenseMap<Value, Block *> placementBlocks;
+};
+
+/// A state implementation compatible with the `BufferAllocationHoisting` class
+/// that hoists allocations into dominator blocks while keeping them inside of
+/// loops.
+struct BufferAllocationHoistingState : BufferAllocationHoistingStateBase {
+  using BufferAllocationHoistingStateBase::BufferAllocationHoistingStateBase;
+
+  /// Computes the upper bound for the placement block search.
+  Block *computeUpperBound(Block *dominatorBlock, Block *dependencyBlock) {
+    // If we do not have a dependency block, the upper bound is given by the
+    // dominator block.
+    if (!dependencyBlock)
+      return dominatorBlock;
+
+    // Find the "lower" block of the dominator and the dependency block to
+    // ensure that we do not move allocations above this block.
+    return dominators->properlyDominates(dominatorBlock, dependencyBlock)
+               ? dependencyBlock
+               : dominatorBlock;
+  }
+
+  /// Returns true if the given operation does not represent a loop.
+  bool isLegalPlacement(Operation *op) {
+    return !BufferPlacementTransformationBase::isLoop(op);
+  }
+
+  /// Sets the current placement block to the given block.
+  void recordMoveToDominator(Block *block) { placementBlock = block; }
+
+  /// Sets the current placement block to the given block.
+  void recordMoveToParent(Block *block) { recordMoveToDominator(block); }
+};
+
+/// A state implementation compatible with the `BufferAllocationHoisting` class
+/// that hoists allocations out of loops.
+struct BufferAllocationLoopHoistingState : BufferAllocationHoistingStateBase {
+  using BufferAllocationHoistingStateBase::BufferAllocationHoistingStateBase;
+
+  /// Remembers the dominator block of all aliases.
+  Block *aliasDominatorBlock;
+
+  /// Computes the upper bound for the placement block search.
+  Block *computeUpperBound(Block *dominatorBlock, Block *dependencyBlock) {
+    aliasDominatorBlock = dominatorBlock;
+    // If there is a dependency block, we have to use this block as an upper
+    // bound to satisfy all allocation value dependencies.
+    return dependencyBlock ? dependencyBlock : nullptr;
+  }
+
+  /// Returns true if the given operation represents a loop and one of the
+  /// aliases caused the `aliasDominatorBlock` to be "above" the block of the
+  /// given loop operation. If this is the case, it indicates that the
+  /// allocation is passed via a back edge.
+  bool isLegalPlacement(Operation *op) {
+    return BufferPlacementTransformationBase::isLoop(op) &&
+           !dominators->dominates(aliasDominatorBlock, op->getBlock());
+  }
+
+  /// Does not change the internal placement block, as we want to move
+  /// operations out of loops only.
+  void recordMoveToDominator(Block *block) {}
+
+  /// Sets the current placement block to the given block.
+  void recordMoveToParent(Block *block) { placementBlock = block; }
+};
+
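(Editorial aside, not part of the patch: any helper type that exposes the same
four hooks -- computeUpperBound, isLegalPlacement, recordMoveToDominator and
recordMoveToParent -- plugs into the BufferAllocationHoisting template above.
As an illustration only, a hypothetical state that never hoists an allocation
out of its original block could look like the following sketch; it reuses only
the interface visible in this diff and ships nowhere.)

  /// Hypothetical example only (not part of this commit): a state that keeps
  /// every allocation in its original block.
  struct NoHoistingState : BufferAllocationHoistingStateBase {
    using BufferAllocationHoistingStateBase::BufferAllocationHoistingStateBase;

    /// The walk may stop at the dominator block at the latest.
    Block *computeUpperBound(Block *dominatorBlock, Block *dependencyBlock) {
      return dominatorBlock;
    }

    /// Never allow the walk to leave the defining parent operation.
    bool isLegalPlacement(Operation *op) { return false; }

    /// Keep the placement block untouched so the alloc stays where it is.
    void recordMoveToDominator(Block *block) {}
    void recordMoveToParent(Block *block) {}
  };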
+//===----------------------------------------------------------------------===//
+// BufferOptimizationPasses
+//===----------------------------------------------------------------------===//
+
+/// The buffer hoisting pass that hoists allocation nodes into dominating
+/// blocks.
+struct BufferHoistingPass : BufferHoistingBase<BufferHoistingPass> {
+
+  void runOnFunction() override {
+    // Hoist all allocations into dominator blocks.
+    BufferAllocationHoisting<BufferAllocationHoistingState> optimizer(
+        getFunction());
+    optimizer.hoist();
+  }
+};
+
+/// The buffer loop hoisting pass that hoists allocation nodes out of loops.
+struct BufferLoopHoistingPass : BufferLoopHoistingBase<BufferLoopHoistingPass> {
+
+  void runOnFunction() override {
+    // Hoist all allocations out of loops.
+    BufferAllocationHoisting<BufferAllocationLoopHoistingState> optimizer(
+        getFunction());
+    optimizer.hoist();
+  }
+};
+
+} // end anonymous namespace
+
+std::unique_ptr<Pass> mlir::createBufferHoistingPass() {
+  return std::make_unique<BufferHoistingPass>();
+}
+
+std::unique_ptr<Pass> mlir::createBufferLoopHoistingPass() {
+  return std::make_unique<BufferLoopHoistingPass>();
+}
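(Editorial aside, not part of the patch: a minimal sketch of how the two new
entry points above could be scheduled from C++. The pass-manager setup and the
helper name are illustrative assumptions; only createBufferHoistingPass and
createBufferLoopHoistingPass come from this change.)

  #include "mlir/IR/Function.h"
  #include "mlir/Pass/PassManager.h"
  #include "mlir/Transforms/Passes.h"

  // Illustrative pipeline: run both hoisting passes on every function.
  static void addBufferHoistingPasses(mlir::PassManager &pm) {
    pm.addNestedPass<mlir::FuncOp>(mlir::createBufferHoistingPass());
    pm.addNestedPass<mlir::FuncOp>(mlir::createBufferLoopHoistingPass());
  }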

diff  --git a/mlir/lib/Transforms/CMakeLists.txt b/mlir/lib/Transforms/CMakeLists.txt
index 927673fcbb4b..a2b96b90b543 100644
--- a/mlir/lib/Transforms/CMakeLists.txt
+++ b/mlir/lib/Transforms/CMakeLists.txt
@@ -1,7 +1,8 @@
 add_subdirectory(Utils)
 
 add_mlir_library(MLIRTransforms
-  BufferPlacement.cpp
+  BufferDeallocation.cpp
+  BufferOptimizations.cpp
   Bufferize.cpp
   Canonicalizer.cpp
   CopyRemoval.cpp

diff  --git a/mlir/test/Dialect/Linalg/bufferize.mlir b/mlir/test/Dialect/Linalg/bufferize.mlir
index ed78365b1937..f0b7c3d5b071 100644
--- a/mlir/test/Dialect/Linalg/bufferize.mlir
+++ b/mlir/test/Dialect/Linalg/bufferize.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt -linalg-bufferize -buffer-placement -split-input-file %s | FileCheck %s
+// RUN: mlir-opt -linalg-bufferize -buffer-hoisting -buffer-deallocation -split-input-file %s | FileCheck %s
 
 #map0 = affine_map<(d0) -> (d0)>
 

diff  --git a/mlir/test/Transforms/buffer-placement.mlir b/mlir/test/Transforms/buffer-deallocation.mlir
similarity index 78%
rename from mlir/test/Transforms/buffer-placement.mlir
rename to mlir/test/Transforms/buffer-deallocation.mlir
index d8b7de3bde84..709ee4f27663 100644
--- a/mlir/test/Transforms/buffer-placement.mlir
+++ b/mlir/test/Transforms/buffer-deallocation.mlir
@@ -1,8 +1,8 @@
-// RUN: mlir-opt -buffer-placement -split-input-file %s | FileCheck %s
+// RUN: mlir-opt -buffer-deallocation -split-input-file %s | FileCheck %s
 
-// This file checks the behaviour of BufferPlacement pass for moving Alloc and
-// Dealloc operations and inserting the missing the DeallocOps in their correct
-// positions.
+// This file checks the behaviour of the BufferDeallocation pass for moving
+// and inserting missing DeallocOps in their correct positions. Furthermore,
+// copies and their corresponding AllocOps are inserted.
 
 // Test Case:
 //    bb0
@@ -10,9 +10,10 @@
 //  bb1  bb2 <- Initial position of AllocOp
 //   \   /
 //    bb3
-// BufferPlacement Expected Behaviour: It should move the existing AllocOp to
-// the entry block, and insert a DeallocOp at the exit block after CopyOp since
-// %1 is an alias for %0 and %arg1.
+// BufferDeallocation expected behavior: bb2 contains an AllocOp which is
+// passed to bb3. In the latter block, there should be a deallocation.
+// Since bb1 does not contain an adequate alloc and the alloc in bb2 is not
+// moved to bb0, we need to insert allocs and copies.
 
 #map0 = affine_map<(d0) -> (d0)>
 
@@ -38,10 +39,18 @@ func @condBranch(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) {
   return
 }
 
-// CHECK-NEXT: %[[ALLOC:.*]] = alloc()
 // CHECK-NEXT: cond_br
+//      CHECK: %[[ALLOC0:.*]] = alloc()
+// CHECK-NEXT: linalg.copy
+// CHECK-NEXT: br ^bb3(%[[ALLOC0]]
+//      CHECK: %[[ALLOC1:.*]] = alloc()
+// CHECK-NEXT: linalg.generic
+//      CHECK: %[[ALLOC2:.*]] = alloc()
+// CHECK-NEXT: linalg.copy
+// CHECK-NEXT: dealloc %[[ALLOC1]]
+// CHECK-NEXT: br ^bb3(%[[ALLOC2]]
 //      CHECK: linalg.copy
-// CHECK-NEXT: dealloc %[[ALLOC]]
+// CHECK-NEXT: dealloc
 // CHECK-NEXT: return
 
 // -----
@@ -52,12 +61,12 @@ func @condBranch(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) {
 //  bb1  bb2 <- Initial position of AllocOp
 //   \   /
 //    bb3
-// BufferPlacement Expected Behaviour: It should not move the existing AllocOp
-// to any other block since the alloc has a dynamic dependency to block argument
-// %0 in bb2. Since the dynamic type is passed to bb3 via the block argument %2,
-// it is currently required to allocate a temporary buffer for %2 that gets
-// copies of %arg0 and %1 with their appropriate shape dimensions. The copy
-// buffer deallocation will be applied to %2 in block bb3.
+// BufferDeallocation expected behavior: The existing AllocOp has a dynamic
+// dependency to block argument %0 in bb2. Since the dynamic type is passed
+// to bb3 via the block argument %2, it is currently required to allocate a
+// temporary buffer for %2 that gets copies of %arg0 and %1 with their
+// appropriate shape dimensions. The copy buffer deallocation will be applied
+// to %2 in block bb3.
 
 #map0 = affine_map<(d0) -> (d0)>
 
@@ -91,6 +100,7 @@ func @condBranchDynamicType(
 //      CHECK: %[[DIM0:.*]] = dim
 // CHECK-NEXT: %[[ALLOC0:.*]] = alloc(%[[DIM0]])
 // CHECK-NEXT: linalg.copy(%{{.*}}, %[[ALLOC0]])
+// CHECK-NEXT: br ^bb3(%[[ALLOC0]]
 //      CHECK: ^bb2(%[[IDX:.*]]:{{.*}})
 // CHECK-NEXT: %[[ALLOC1:.*]] = alloc(%[[IDX]])
 // CHECK-NEXT: linalg.generic
@@ -118,18 +128,17 @@ func @condBranchDynamicType(
 //       bb6
 //        |
 //       bb7
-// BufferPlacement Expected Behaviour: It should not move the existing AllocOp
-// to any other block since the alloc has a dynamic dependency to block argument
-// %0 in bb2. Since the dynamic type is passed to bb5 via the block argument %2
-// and to bb6 via block argument %3, it is currently required to allocate
-// temporary buffers for %2 and %3 that gets copies of %1 and %arg0 1 with their
-// appropriate shape dimensions. The copy buffer deallocations will be applied
-// to %2 in block bb5 and to %3 in block bb6. Furthermore, there should be no
-// copy inserted for %4.
+// BufferDeallocation expected behavior: The existing AllocOp has a dynamic
+// dependency to block argument %0 in bb2. Since the dynamic type is passed to
+// bb5 via the block argument %2 and to bb6 via block argument %3, it is
+// currently required to allocate temporary buffers for %2 and %3 that get
+// copies of %1 and %arg0 with their appropriate shape dimensions. The copy
+// buffer deallocations will be applied to %2 in block bb5 and to %3 in block
+// bb6. Furthermore, there should be no copy inserted for %4.
 
 #map0 = affine_map<(d0) -> (d0)>
 
-// CHECK-LABEL: func @condBranchDynamicType
+// CHECK-LABEL: func @condBranchDynamicTypeNested
 func @condBranchDynamicTypeNested(
   %arg0: i1,
   %arg1: memref<?xf32>,
@@ -168,6 +177,7 @@ func @condBranchDynamicTypeNested(
 //      CHECK: %[[DIM0:.*]] = dim
 // CHECK-NEXT: %[[ALLOC0:.*]] = alloc(%[[DIM0]])
 // CHECK-NEXT: linalg.copy(%{{.*}}, %[[ALLOC0]])
+// CHECK-NEXT: br ^bb6
 //      CHECK: ^bb2(%[[IDX:.*]]:{{.*}})
 // CHECK-NEXT: %[[ALLOC1:.*]] = alloc(%[[IDX]])
 // CHECK-NEXT: linalg.generic
@@ -192,8 +202,8 @@ func @condBranchDynamicTypeNested(
 // -----
 
 // Test Case: Existing AllocOp with no users.
-// BufferPlacement Expected Behaviour: It should insert a DeallocOp right before
-// ReturnOp.
+// BufferDeallocation expected behavior: It should insert a DeallocOp right
+// before ReturnOp.
 
 // CHECK-LABEL: func @emptyUsesValue
 func @emptyUsesValue(%arg0: memref<4xf32>) {
@@ -212,9 +222,9 @@ func @emptyUsesValue(%arg0: memref<4xf32>) {
 //  |    bb1 <- Initial position of AllocOp
 //   \   /
 //    bb2
-// BufferPlacement Expected Behaviour: It should move the existing AllocOp to
-// the entry block and insert a DeallocOp at the exit block after CopyOp since
-// %1 is an alias for %0 and %arg1.
+// BufferDeallocation expected behavior: It should insert a DeallocOp at the
+// exit block after CopyOp since %1 is an alias for %0 and %arg1. Furthermore,
+// we have to insert a copy and an alloc at the beginning of the function.
 
 #map0 = affine_map<(d0) -> (d0)>
 
@@ -238,10 +248,16 @@ func @criticalEdge(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) {
   return
 }
 
-// CHECK-NEXT: %[[ALLOC:.*]] = alloc()
+// CHECK-NEXT: %[[ALLOC0:.*]] = alloc()
+// CHECK-NEXT: linalg.copy
 // CHECK-NEXT: cond_br
+//      CHECK: %[[ALLOC1:.*]] = alloc()
+// CHECK-NEXT: linalg.generic
+//      CHECK: %[[ALLOC2:.*]] = alloc()
+// CHECK-NEXT: linalg.copy
+// CHECK-NEXT: dealloc %[[ALLOC1]]
 //      CHECK: linalg.copy
-// CHECK-NEXT: dealloc %[[ALLOC]]
+// CHECK-NEXT: dealloc
 // CHECK-NEXT: return
 
 // -----
@@ -252,9 +268,8 @@ func @criticalEdge(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) {
 //  |    bb1
 //   \   /
 //    bb2
-// BufferPlacement Expected Behaviour: It shouldn't move the alloc position. It
-// only inserts a DeallocOp at the exit block after CopyOp since %1 is an alias
-// for %0 and %arg1.
+// BufferDeallocation expected behavior: It only inserts a DeallocOp at the
+// exit block after CopyOp since %1 is an alias for %0 and %arg1.
 
 #map0 = affine_map<(d0) -> (d0)>
 
@@ -289,10 +304,10 @@ func @invCriticalEdge(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) {
 //  bb1  bb2
 //   \   /
 //    bb3 <- Initial position of the second AllocOp
-// BufferPlacement Expected Behaviour: It shouldn't move the AllocOps. It only
-// inserts two missing DeallocOps in the exit block. %5 is an alias for %0.
-// Therefore, the DeallocOp for %0 should occur after the last GenericOp. The
-// Dealloc for %7 should happen after the CopyOp.
+// BufferDeallocation expected behavior: It only inserts two missing
+// DeallocOps in the exit block. %5 is an alias for %0. Therefore, the
+// DeallocOp for %0 should occur after the last GenericOp. The Dealloc for %7
+// should happen after the CopyOp.
 
 #map0 = affine_map<(d0) -> (d0)>
 
@@ -347,9 +362,8 @@ func @ifElse(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) {
 //  bb1  bb2
 //   \   /
 //    bb3
-// BufferPlacement Expected Behaviour: It shouldn't move the AllocOp. It only
-// inserts a missing DeallocOp in the exit block since %5 or %6 are the latest
-// aliases of %0.
+// BufferDeallocation expected behavior: It only inserts a missing DeallocOp
+// in the exit block since %5 or %6 are the latest aliases of %0.
 
 #map0 = affine_map<(d0) -> (d0)>
 
@@ -378,7 +392,8 @@ func @ifElseNoUsers(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) {
 }
 
 // CHECK-NEXT: %[[FIRST_ALLOC:.*]] = alloc()
-//      CHECK: dealloc %[[FIRST_ALLOC]]
+//      CHECK: linalg.copy
+// CHECK-NEXT: dealloc %[[FIRST_ALLOC]]
 // CHECK-NEXT: return
 
 // -----
@@ -392,8 +407,8 @@ func @ifElseNoUsers(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) {
 //    \     \  /
 //     \     /
 //       bb5 <- Initial position of the second AllocOp
-// BufferPlacement Expected Behaviour: AllocOps shouldn't be moved.
-// Two missing DeallocOps should be inserted in the exit block.
+// BufferDeallocation expected behavior: Two missing DeallocOps should be
+// inserted in the exit block.
 
 #map0 = affine_map<(d0) -> (d0)>
 
@@ -447,8 +462,8 @@ func @ifElseNested(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) {
 // -----
 
 // Test Case: Dead operations in a single block.
-// BufferPlacement Expected Behaviour: It shouldn't move the AllocOps. It only
-// inserts the two missing DeallocOps after the last GenericOp.
+// BufferDeallocation expected behavior: It only inserts the two missing
+// DeallocOps after the last GenericOp.
 
 #map0 = affine_map<(d0) -> (d0)>
 
@@ -481,7 +496,8 @@ func @redundantOperations(%arg0: memref<2xf32>) {
 // CHECK-NEXT: %[[FIRST_ALLOC:.*]] = alloc()
 // CHECK-NEXT: linalg.generic {{.*}} ins(%[[ARG0]]{{.*}}outs(%[[FIRST_ALLOC]]
 //      CHECK: %[[SECOND_ALLOC:.*]] = alloc()
-// CHECK-NEXT: linalg.generic {{.*}} ins(%[[FIRST_ALLOC]]{{.*}}outs(%[[SECOND_ALLOC]]
+// CHECK-NEXT: linalg.generic {{.*}} ins
+// CHECK-SAME: (%[[FIRST_ALLOC]]{{.*}}outs(%[[SECOND_ALLOC]]
 //      CHECK: dealloc
 // CHECK-NEXT: dealloc
 // CHECK-NEXT: return
@@ -494,9 +510,10 @@ func @redundantOperations(%arg0: memref<2xf32>) {
 // Initial pos of the 1st AllocOp -> bb1  bb2 <- Initial pos of the 2nd AllocOp
 //                                    \   /
 //                                     bb3
-// BufferPlacement Expected Behaviour: Both AllocOps should be moved to the
-// entry block. Both missing DeallocOps should be moved to the exit block after
-// CopyOp since %arg2 is an alias for %0 and %1.
+// BufferDeallocation expected behavior: We need to introduce a copy for each
+// buffer since the buffers are passed to bb3. Both missing DeallocOps are
+// inserted in the respective blocks of the allocs. The copy is freed in the
+// exit block.
 
 #map0 = affine_map<(d0) -> (d0)>
 
@@ -535,11 +552,25 @@ func @moving_alloc_and_inserting_missing_dealloc(
   return
 }
 
-// CHECK-NEXT: %{{.*}} = alloc()
-// CHECK-NEXT: %{{.*}} = alloc()
+// CHECK-NEXT: cond_br
+//      CHECK: ^bb1
+//      CHECK: ^bb1
+//      CHECK: %[[ALLOC0:.*]] = alloc()
+// CHECK-NEXT: linalg.generic
+//      CHECK: %[[ALLOC1:.*]] = alloc()
+// CHECK-NEXT: linalg.copy
+// CHECK-NEXT: dealloc %[[ALLOC0]]
+// CHECK-NEXT: br ^bb3(%[[ALLOC1]]
+// CHECK-NEXT: ^bb2
+// CHECK-NEXT: %[[ALLOC2:.*]] = alloc()
+// CHECK-NEXT: linalg.generic
+//      CHECK: %[[ALLOC3:.*]] = alloc()
+// CHECK-NEXT: linalg.copy
+// CHECK-NEXT: dealloc %[[ALLOC2]]
+// CHECK-NEXT: br ^bb3(%[[ALLOC3]]
+// CHECK-NEXT: ^bb3(%[[ALLOC4:.*]]:{{.*}})
 //      CHECK: linalg.copy
-// CHECK-NEXT: dealloc
-// CHECK-NEXT: dealloc
+// CHECK-NEXT: dealloc %[[ALLOC4]]
 // CHECK-NEXT: return
 
 // -----
@@ -551,8 +582,8 @@ func @moving_alloc_and_inserting_missing_dealloc(
 // bb1  bb2 <- Initial position of AllocOp
 //  \   /
 //   bb3
-// BufferPlacement Expected Behaviour: It should move the AllocOp to the entry
-// block. The existing DeallocOp should be moved to exit block.
+// BufferDeallocation expected behavior: The existing DeallocOp should be
+// moved to exit block.
 
 #map0 = affine_map<(d0) -> (d0)>
 
@@ -561,11 +592,11 @@ func @moving_invalid_dealloc_op_complex(
   %cond: i1,
     %arg0: memref<2xf32>,
     %arg1: memref<2xf32>) {
+  %1 = alloc() : memref<2xf32>
   cond_br %cond, ^bb1, ^bb2
 ^bb1:
   br ^exit(%arg0 : memref<2xf32>)
 ^bb2:
-  %1 = alloc() : memref<2xf32>
   linalg.generic {
     indexing_maps = [#map0, #map0],
     iterator_types = ["parallel"]}
@@ -582,9 +613,10 @@ func @moving_invalid_dealloc_op_complex(
   return
 }
 
-// CHECK-NEXT: %{{.*}} = alloc()
+// CHECK-NEXT: %[[ALLOC0:.*]] = alloc()
+// CHECK-NEXT: cond_br
 //      CHECK: linalg.copy
-// CHECK-NEXT: dealloc
+// CHECK-NEXT: dealloc %[[ALLOC0]]
 // CHECK-NEXT: return
 
 // -----
@@ -611,8 +643,9 @@ func @inserting_missing_dealloc_simple(
   return
 }
 
+// CHECK-NEXT: %[[ALLOC0:.*]] = alloc()
 //      CHECK: linalg.copy
-// CHECK-NEXT: dealloc
+// CHECK-NEXT: dealloc %[[ALLOC0]]
 
 // -----
 
@@ -638,33 +671,41 @@ func @moving_invalid_dealloc_op(%arg0 : memref<2xf32>, %arg1: memref<2xf32>) {
   return
 }
 
+// CHECK-NEXT: %[[ALLOC0:.*]] = alloc()
 //      CHECK: linalg.copy
-// CHECK-NEXT: dealloc
+// CHECK-NEXT: dealloc %[[ALLOC0]]
 
 // -----
 
-// Test Case: Nested regions - This test defines a GenericOp inside the region of
-// another GenericOp.
-// BufferPlacement Expected Behaviour: The AllocOp of inner GenericOp should remain
-// inside the region of outer GenericOp and it should insert the missing DeallocOp
-// in the same region. The AllocOp of the outer GenericOp should be moved to the
-// entry block and its missing DeallocOp should be inserted after Linalg.Copy.
+// Test Case: Nested regions - This test defines a GenericOp inside the region
+// of another GenericOp.
+// BufferDeallocation expected behavior: The AllocOp of inner GenericOp should
+// remain inside the region of outer GenericOp and it should insert the missing
+// DeallocOp in the same region. The missing DeallocOp should be inserted after
+// Linalg.Copy.
 
 #map0 = affine_map<(d0) -> (d0)>
 
 // CHECK-LABEL: func @nested_regions_and_cond_branch
-func @nested_regions_and_cond_branch(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) {
+func @nested_regions_and_cond_branch(
+  %arg0: i1,
+  %arg1: memref<2xf32>,
+  %arg2: memref<2xf32>) {
   cond_br %arg0, ^bb1, ^bb2
 ^bb1:
   br ^bb3(%arg1 : memref<2xf32>)
 ^bb2:
   %0 = alloc() : memref<2xf32>
-  linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]}
+  linalg.generic {
+    indexing_maps = [#map0, #map0],
+    iterator_types = ["parallel"]}
     ins(%arg1: memref<2xf32>)
    outs(%0: memref<2xf32>) {
   ^bb0(%gen1_arg0: f32, %gen1_arg1: f32):
     %1 = alloc() : memref<2xf32>
-    linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]}
+    linalg.generic {
+      indexing_maps = [#map0, #map0],
+      iterator_types = ["parallel"]}
       ins(%arg1: memref<2xf32>)
      outs(%1: memref<2xf32>) {
     ^bb0(%gen2_arg0: f32, %gen2_arg1: f32):
@@ -680,29 +721,37 @@ func @nested_regions_and_cond_branch(%arg0: i1, %arg1: memref<2xf32>, %arg2: mem
   return
 }
 //      CHECK: (%[[cond:.*]]: {{.*}}, %[[ARG1:.*]]: {{.*}}, %{{.*}}: {{.*}})
-// CHECK-NEXT:   %[[GENERIC1_ALLOC:.*]] = alloc()
 // CHECK-NEXT:   cond_br %[[cond]], ^[[BB1:.*]], ^[[BB2:.*]]
+//      CHECK:   %[[ALLOC0:.*]] = alloc()
+// CHECK-NEXT:   linalg.copy(%[[ARG1]], %[[ALLOC0]])
 //      CHECK: ^[[BB2]]:
-// CHECK-NEXT:   linalg.generic {{{.*}}} ins(%[[ARG1]]{{.*}}outs(%[[GENERIC1_ALLOC]]
-//      CHECK:     %[[GENERIC2_ALLOC:.*]] = alloc()
-// CHECK-NEXT:     linalg.generic {{{.*}}} ins(%[[ARG1]]{{.*}}outs(%[[GENERIC2_ALLOC]]
-//      CHECK:     dealloc %[[GENERIC2_ALLOC]]
+//      CHECK:   %[[ALLOC1:.*]] = alloc()
+// CHECK-NEXT:   linalg.generic {{{.*}}} ins(%[[ARG1]]{{.*}}outs(%[[ALLOC1]]
+//      CHECK:     %[[ALLOC2:.*]] = alloc()
+// CHECK-NEXT:     linalg.generic {{{.*}}} ins(%[[ARG1]]{{.*}}outs(%[[ALLOC2]]
+//      CHECK:     dealloc %[[ALLOC2]]
 // CHECK-NEXT:     %{{.*}} = exp
+//      CHECK:   %[[ALLOC3:.*]] = alloc()
+// CHECK-NEXT:   linalg.copy(%[[ALLOC1]], %[[ALLOC3]])
+// CHECK-NEXT:   dealloc %[[ALLOC1]]
 //      CHECK:  ^[[BB3:.*]]({{.*}}):
 //      CHECK:  linalg.copy
-// CHECK-NEXT:  dealloc %[[GENERIC1_ALLOC]]
+// CHECK-NEXT:  dealloc
 
 // -----
 
 // Test Case: buffer deallocation escaping
-// BufferPlacement Expected Behaviour: It must not dealloc %arg1 and %x
+// BufferDeallocation expected behavior: It must not dealloc %arg1 and %x
 // since they are operands of return operation and should escape from
 // deallocating. It should dealloc %y after linalg.copy.
 
 #map0 = affine_map<(d0) -> (d0)>
 
 // CHECK-LABEL: func @memref_in_function_results
-func @memref_in_function_results(%arg0: memref<5xf32>, %arg1: memref<10xf32>, %arg2: memref<5xf32>) -> (memref<10xf32>, memref<15xf32>) {
+func @memref_in_function_results(
+  %arg0: memref<5xf32>,
+  %arg1: memref<10xf32>,
+  %arg2: memref<5xf32>) -> (memref<10xf32>, memref<15xf32>) {
   %x = alloc() : memref<15xf32>
   %y = alloc() : memref<5xf32>
   linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]}
@@ -715,19 +764,20 @@ func @memref_in_function_results(%arg0: memref<5xf32>, %arg1: memref<10xf32>, %a
   linalg.copy(%y, %arg2) : memref<5xf32>, memref<5xf32>
   return %arg1, %x : memref<10xf32>, memref<15xf32>
 }
-// CHECK: (%[[ARG0:.*]]: memref<5xf32>, %[[ARG1:.*]]: memref<10xf32>, %[[RESULT:.*]]: memref<5xf32>)
-// CHECK: %[[X:.*]] = alloc()
-// CHECK: %[[Y:.*]] = alloc()
-// CHECK: linalg.copy
-// CHECK: dealloc %[[Y]]
-// CHECK: return %[[ARG1]], %[[X]]
+//      CHECK: (%[[ARG0:.*]]: memref<5xf32>, %[[ARG1:.*]]: memref<10xf32>,
+// CHECK-SAME: %[[RESULT:.*]]: memref<5xf32>)
+//      CHECK: %[[X:.*]] = alloc()
+//      CHECK: %[[Y:.*]] = alloc()
+//      CHECK: linalg.copy
+//      CHECK: dealloc %[[Y]]
+//      CHECK: return %[[ARG1]], %[[X]]
 
 // -----
 
 // Test Case: nested region control flow
-// The alloc position of %1 does not need to be changed and flows through
-// both if branches until it is finally returned. Hence, it does not
-// require a specific dealloc operation. However, %3 requires a dealloc.
+// The alloc %1 flows through both if branches until it is finally returned.
+// Hence, it does not require a specific dealloc operation. However, %3
+// requires a dealloc.
 
 // CHECK-LABEL: func @nested_region_control_flow
 func @nested_region_control_flow(
@@ -756,10 +806,8 @@ func @nested_region_control_flow(
 
 // Test Case: nested region control flow with a nested buffer allocation in a
 // divergent branch.
-// The alloc positions of %1, %3 does not need to be changed since
-// BufferPlacement does not move allocs out of nested regions at the moment.
-// However, since %3 is allocated and "returned" in a divergent branch, we have
-// to allocate a temporary buffer (like in condBranchDynamicTypeNested).
+// Buffer deallocation places a copy for both %1 and %3, since they are
+// returned in the end.
 
 // CHECK-LABEL: func @nested_region_control_flow_div
 func @nested_region_control_flow_div(
@@ -791,61 +839,9 @@ func @nested_region_control_flow_div(
 
 // -----
 
-// Test Case: deeply nested region control flow with a nested buffer allocation
-// in a divergent branch.
-// The alloc positions of %1, %4 and %5 does not need to be changed since
-// BufferPlacement does not move allocs out of nested regions at the moment.
-// However, since %4 is allocated and "returned" in a divergent branch, we have
-// to allocate several temporary buffers (like in condBranchDynamicTypeNested).
-
-// CHECK-LABEL: func @nested_region_control_flow_div_nested
-func @nested_region_control_flow_div_nested(
-  %arg0 : index,
-  %arg1 : index) -> memref<?x?xf32> {
-  %0 = cmpi "eq", %arg0, %arg1 : index
-  %1 = alloc(%arg0, %arg0) : memref<?x?xf32>
-  %2 = scf.if %0 -> (memref<?x?xf32>) {
-    %3 = scf.if %0 -> (memref<?x?xf32>) {
-      scf.yield %1 : memref<?x?xf32>
-    } else {
-      %4 = alloc(%arg0, %arg1) : memref<?x?xf32>
-      scf.yield %4 : memref<?x?xf32>
-    }
-    scf.yield %3 : memref<?x?xf32>
-  } else {
-    %5 = alloc(%arg1, %arg1) : memref<?x?xf32>
-    scf.yield %5 : memref<?x?xf32>
-  }
-  return %2 : memref<?x?xf32>
-}
-//      CHECK: %[[ALLOC0:.*]] = alloc(%arg0, %arg0)
-// CHECK-NEXT: %[[ALLOC1:.*]] = scf.if
-// CHECK-NEXT: %[[ALLOC2:.*]] = scf.if
-//      CHECK: %[[ALLOC3:.*]] = alloc
-// CHECK-NEXT: linalg.copy(%[[ALLOC0]], %[[ALLOC3]])
-//      CHECK: scf.yield %[[ALLOC3]]
-//      CHECK: %[[ALLOC4:.*]] = alloc(%arg0, %arg1)
-//      CHECK: %[[ALLOC5:.*]] = alloc
-// CHECK-NEXT: linalg.copy(%[[ALLOC4]], %[[ALLOC5]])
-//      CHECK: dealloc %[[ALLOC4]]
-//      CHECK: scf.yield %[[ALLOC5]]
-//      CHECK: %[[ALLOC6:.*]] = alloc
-// CHECK-NEXT: linalg.copy(%[[ALLOC2]], %[[ALLOC6]])
-//      CHECK: dealloc %[[ALLOC2]]
-//      CHECK: scf.yield %[[ALLOC6]]
-//      CHECK: %[[ALLOC7:.*]] = alloc(%arg1, %arg1)
-//      CHECK: %[[ALLOC8:.*]] = alloc
-// CHECK-NEXT: linalg.copy(%[[ALLOC7]], %[[ALLOC8]])
-//      CHECK: dealloc %[[ALLOC7]]
-//      CHECK: scf.yield %[[ALLOC8]]
-//      CHECK: dealloc %[[ALLOC0]]
-// CHECK-NEXT: return %[[ALLOC1]]
-
-// -----
-
 // Test Case: nested region control flow within a region interface.
-// The alloc positions of %0 does not need to be changed and no copies are
-// required in this case since the allocation finally escapes the method.
+// No copies are required in this case since the allocation finally escapes
+// the method.
 
 // CHECK-LABEL: func @inner_region_control_flow
 func @inner_region_control_flow(%arg0 : index) -> memref<?x?xf32> {
@@ -875,54 +871,6 @@ func @inner_region_control_flow(%arg0 : index) -> memref<?x?xf32> {
 
 // -----
 
-// Test Case: nested region control flow within a region interface including an
-// allocation in a divergent branch.
-// The alloc positions of %1 and %2 does not need to be changed since
-// BufferPlacement does not move allocs out of nested regions at the moment.
-// However, since %2 is allocated and yielded in a divergent branch, we have
-// to allocate several temporary buffers (like in condBranchDynamicTypeNested).
-
-// CHECK-LABEL: func @inner_region_control_flow_div
-func @inner_region_control_flow_div(
-  %arg0 : index,
-  %arg1 : index) -> memref<?x?xf32> {
-  %0 = alloc(%arg0, %arg0) : memref<?x?xf32>
-  %1 = test.region_if %0 : memref<?x?xf32> -> (memref<?x?xf32>) then {
-    ^bb0(%arg2 : memref<?x?xf32>):
-      test.region_if_yield %arg2 : memref<?x?xf32>
-  } else {
-    ^bb0(%arg2 : memref<?x?xf32>):
-      %2 = alloc(%arg0, %arg1) : memref<?x?xf32>
-      test.region_if_yield %2 : memref<?x?xf32>
-  } join {
-    ^bb0(%arg2 : memref<?x?xf32>):
-      test.region_if_yield %arg2 : memref<?x?xf32>
-  }
-  return %1 : memref<?x?xf32>
-}
-
-//      CHECK: %[[ALLOC0:.*]] = alloc(%arg0, %arg0)
-// CHECK-NEXT: %[[ALLOC1:.*]] = test.region_if
-// CHECK-NEXT: ^bb0(%[[ALLOC2:.*]]:{{.*}}):
-//      CHECK: %[[ALLOC3:.*]] = alloc
-// CHECK-NEXT: linalg.copy(%[[ALLOC2]], %[[ALLOC3]])
-// CHECK-NEXT: test.region_if_yield %[[ALLOC3]]
-//      CHECK: ^bb0(%[[ALLOC4:.*]]:{{.*}}):
-//      CHECK: %[[ALLOC5:.*]] = alloc
-//      CHECK: %[[ALLOC6:.*]] = alloc
-// CHECK-NEXT: linalg.copy(%[[ALLOC5]], %[[ALLOC6]])
-// CHECK-NEXT: dealloc %[[ALLOC5]]
-// CHECK-NEXT: test.region_if_yield %[[ALLOC6]]
-//      CHECK: ^bb0(%[[ALLOC7:.*]]:{{.*}}):
-//      CHECK: %[[ALLOC8:.*]] = alloc
-// CHECK-NEXT: linalg.copy(%[[ALLOC7]], %[[ALLOC8]])
-// CHECK-NEXT: dealloc %[[ALLOC7]]
-// CHECK-NEXT: test.region_if_yield %[[ALLOC8]]
-//      CHECK: dealloc %[[ALLOC0]]
-// CHECK-NEXT: return %[[ALLOC1]]
-
-// -----
-
 // CHECK-LABEL: func @subview
 func @subview(%arg0 : index, %arg1 : index, %arg2 : memref<?x?xf32>) {
   %0 = alloc() : memref<64x4xf32, offset: 0, strides: [4, 1]>
@@ -944,6 +892,9 @@ func @subview(%arg0 : index, %arg1 : index, %arg2 : memref<?x?xf32>) {
 
 #map0 = affine_map<(d0) -> (d0)>
 
+// Test Case: In the presence of AllocaOps, only the AllocOps have to be freed.
+// Therefore, the allocas are not handled.
+
 // CHECK-LABEL: func @condBranchAlloca
 func @condBranchAlloca(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) {
   cond_br %arg0, ^bb1, ^bb2
@@ -977,6 +928,10 @@ func @condBranchAlloca(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) {
 
 #map0 = affine_map<(d0) -> (d0)>
 
+// Test Case: In the presence of AllocaOps, only the AllocOps have to be freed.
+// Therefore, the allocas are not handled. In this case, only alloc %0 has a
+// dealloc.
+
 // CHECK-LABEL: func @ifElseAlloca
 func @ifElseAlloca(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) {
   %0 = alloc() : memref<2xf32>
@@ -1024,7 +979,10 @@ func @ifElseAlloca(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) {
 #map0 = affine_map<(d0) -> (d0)>
 
 // CHECK-LABEL: func @ifElseNestedAlloca
-func @ifElseNestedAlloca(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) {
+func @ifElseNestedAlloca(
+  %arg0: i1,
+  %arg1: memref<2xf32>,
+  %arg2: memref<2xf32>) {
   %0 = alloca() : memref<2xf32>
   linalg.generic {
     indexing_maps = [#map0, #map0],
@@ -1074,18 +1032,25 @@ func @ifElseNestedAlloca(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>)
 #map0 = affine_map<(d0) -> (d0)>
 
 // CHECK-LABEL: func @nestedRegionsAndCondBranchAlloca
-func @nestedRegionsAndCondBranchAlloca(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) {
+func @nestedRegionsAndCondBranchAlloca(
+  %arg0: i1,
+  %arg1: memref<2xf32>,
+  %arg2: memref<2xf32>) {
   cond_br %arg0, ^bb1, ^bb2
 ^bb1:
   br ^bb3(%arg1 : memref<2xf32>)
 ^bb2:
   %0 = alloc() : memref<2xf32>
-  linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]}
+  linalg.generic {
+    indexing_maps = [#map0, #map0],
+    iterator_types = ["parallel"]}
     ins(%arg1: memref<2xf32>)
    outs(%0: memref<2xf32>) {
   ^bb0(%gen1_arg0: f32, %gen1_arg1: f32):
     %1 = alloca() : memref<2xf32>
-    linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]}
+    linalg.generic {
+      indexing_maps = [#map0, #map0],
+      iterator_types = ["parallel"]}
     ins(%arg1: memref<2xf32>)
    outs(%1: memref<2xf32>) {
     ^bb0(%gen2_arg0: f32, %gen2_arg1: f32):
@@ -1101,16 +1066,22 @@ func @nestedRegionsAndCondBranchAlloca(%arg0: i1, %arg1: memref<2xf32>, %arg2: m
   return
 }
 //      CHECK: (%[[cond:.*]]: {{.*}}, %[[ARG1:.*]]: {{.*}}, %{{.*}}: {{.*}})
-// CHECK-NEXT:   %[[ALLOC:.*]] = alloc()
 // CHECK-NEXT:   cond_br %[[cond]], ^[[BB1:.*]], ^[[BB2:.*]]
+//      CHECK: ^[[BB1]]:
+//      CHECK: %[[ALLOC0:.*]] = alloc()
+// CHECK-NEXT: linalg.copy
 //      CHECK: ^[[BB2]]:
-// CHECK-NEXT:   linalg.generic {{{.*}}} ins(%[[ARG1]]{{.*}}outs(%[[ALLOC]]
+//      CHECK:   %[[ALLOC1:.*]] = alloc()
+// CHECK-NEXT:   linalg.generic {{{.*}}} ins(%[[ARG1]]{{.*}}outs(%[[ALLOC1]]
 //      CHECK:     %[[ALLOCA:.*]] = alloca()
 // CHECK-NEXT:     linalg.generic {{{.*}}} ins(%[[ARG1]]{{.*}}outs(%[[ALLOCA]]
 //      CHECK:     %{{.*}} = exp
+//      CHECK:  %[[ALLOC2:.*]] = alloc()
+// CHECK-NEXT:  linalg.copy
+// CHECK-NEXT:  dealloc %[[ALLOC1]]
 //      CHECK:  ^[[BB3:.*]]({{.*}}):
 //      CHECK:  linalg.copy
-// CHECK-NEXT:  dealloc %[[ALLOC]]
+// CHECK-NEXT:  dealloc
 
 // -----
 
@@ -1139,10 +1110,8 @@ func @nestedRegionControlFlowAlloca(
 // -----
 
 // Test Case: structured control-flow loop using a nested alloc.
-// The alloc positions of %3 will not be changed, but the iteration argument
-// %iterBuf has to be freed before yielding %3 to avoid memory leaks.
-
-// -----
+// The iteration argument %iterBuf has to be freed before yielding %3 to avoid
+// memory leaks.
 
 // CHECK-LABEL: func @loop_alloc
 func @loop_alloc(
@@ -1166,7 +1135,8 @@ func @loop_alloc(
 // CHECK-NEXT: dealloc %[[ALLOC0]]
 // CHECK-NEXT: %[[ALLOC1:.*]] = alloc()
 //      CHECK: linalg.copy(%arg3, %[[ALLOC1]])
-//      CHECK: %[[ALLOC2:.*]] = scf.for {{.*}} iter_args(%[[IALLOC:.*]] = %[[ALLOC1]]
+//      CHECK: %[[ALLOC2:.*]] = scf.for {{.*}} iter_args
+// CHECK-SAME: (%[[IALLOC:.*]] = %[[ALLOC1]]
 //      CHECK:    cmpi
 //      CHECK:    dealloc %[[IALLOC]]
 //      CHECK:    %[[ALLOC3:.*]] = alloc()
@@ -1251,7 +1221,8 @@ func @loop_nested_if_alloc(
 //      CHECK: %[[ALLOC0:.*]] = alloc()
 //      CHECK: %[[ALLOC1:.*]] = alloc()
 // CHECK-NEXT: linalg.copy(%arg3, %[[ALLOC1]])
-// CHECK-NEXT: %[[ALLOC2:.*]] = scf.for {{.*}} iter_args(%[[IALLOC:.*]] = %[[ALLOC1]]
+// CHECK-NEXT: %[[ALLOC2:.*]] = scf.for {{.*}} iter_args
+// CHECK-SAME: (%[[IALLOC:.*]] = %[[ALLOC1]]
 //      CHECK: dealloc %[[IALLOC]]
 //      CHECK: %[[ALLOC3:.*]] = scf.if
 
@@ -1362,7 +1333,7 @@ func @loop_nested_alloc(
 // -----
 
 // Test Case: explicit control-flow loop with a dynamically allocated buffer.
-// The BufferPlacement transformation should fail on this explicit
+// The BufferDeallocation transformation should fail on this explicit
 // control-flow loop since they are not supported.
 
 // CHECK-LABEL: func @loop_dynalloc
@@ -1397,7 +1368,7 @@ func @loop_dynalloc(
 // -----
 
 // Test Case: explicit control-flow loop with a dynamically allocated buffer.
-// The BufferPlacement transformation should fail on this explicit
+// The BufferDeallocation transformation should fail on this explicit
 // control-flow loop since they are not supported.
 
 // CHECK-LABEL: func @do_loop_alloc

diff  --git a/mlir/test/Transforms/buffer-hoisting.mlir b/mlir/test/Transforms/buffer-hoisting.mlir
new file mode 100644
index 000000000000..bce5f4370ed3
--- /dev/null
+++ b/mlir/test/Transforms/buffer-hoisting.mlir
@@ -0,0 +1,905 @@
+// RUN: mlir-opt -buffer-hoisting -split-input-file %s | FileCheck %s
+
+// This file checks the behaviour of the BufferHoisting pass for moving Alloc
+// operations to their correct positions.
+
+// Test Case:
+//    bb0
+//   /   \
+//  bb1  bb2 <- Initial position of AllocOp
+//   \   /
+//    bb3
+// BufferHoisting expected behavior: It should move the existing AllocOp to
+// the entry block.
+
+#map0 = affine_map<(d0) -> (d0)>
+
+// CHECK-LABEL: func @condBranch
+func @condBranch(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) {
+  cond_br %arg0, ^bb1, ^bb2
+^bb1:
+  br ^bb3(%arg1 : memref<2xf32>)
+^bb2:
+  %0 = alloc() : memref<2xf32>
+  linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]}
+      ins(%arg1: memref<2xf32>)
+     outs(%0: memref<2xf32>) {
+  ^bb0(%gen1_arg0: f32, %gen1_arg1: f32):
+    %tmp1 = exp %gen1_arg0 : f32
+    linalg.yield %tmp1 : f32
+  }
+  br ^bb3(%0 : memref<2xf32>)
+^bb3(%1: memref<2xf32>):
+  "linalg.copy"(%1, %arg2) : (memref<2xf32>, memref<2xf32>) -> ()
+  return
+}
+
+// CHECK-NEXT: %[[ALLOC:.*]] = alloc()
+// CHECK-NEXT: cond_br
+
+// -----
+
+// Test Case:
+//    bb0
+//   /   \
+//  bb1  bb2 <- Initial position of AllocOp
+//   \   /
+//    bb3
+// BufferHoisting expected behavior: It should not move the existing AllocOp
+// to any other block since the alloc has a dynamic dependency to block argument
+// %0 in bb2.
+
+#map0 = affine_map<(d0) -> (d0)>
+
+// CHECK-LABEL: func @condBranchDynamicType
+func @condBranchDynamicType(
+  %arg0: i1,
+  %arg1: memref<?xf32>,
+  %arg2: memref<?xf32>,
+  %arg3: index) {
+  cond_br %arg0, ^bb1, ^bb2(%arg3: index)
+^bb1:
+  br ^bb3(%arg1 : memref<?xf32>)
+^bb2(%0: index):
+  %1 = alloc(%0) : memref<?xf32>
+  linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]}
+      ins(%arg1: memref<?xf32>)
+     outs(%1: memref<?xf32>) {
+  ^bb0(%gen1_arg0: f32, %gen1_arg1: f32):
+    %tmp1 = exp %gen1_arg0 : f32
+    linalg.yield %tmp1 : f32
+  }
+  br ^bb3(%1 : memref<?xf32>)
+^bb3(%2: memref<?xf32>):
+  "linalg.copy"(%2, %arg2) : (memref<?xf32>, memref<?xf32>) -> ()
+  return
+}
+
+// CHECK-NEXT: cond_br
+//      CHECK: ^bb2
+//      CHECK: ^bb2(%[[IDX:.*]]:{{.*}})
+// CHECK-NEXT: %[[ALLOC0:.*]] = alloc(%[[IDX]])
+// CHECK-NEXT: linalg.generic
+
+// -----
+
+// Test Case:
+//      bb0
+//     /    \
+//   bb1    bb2 <- Initial position of AllocOp
+//    |     /  \
+//    |   bb3  bb4
+//    |     \  /
+//    \     bb5
+//     \    /
+//       bb6
+//        |
+//       bb7
+// BufferHoisting expected behavior: It should not move the existing AllocOp
+// to any other block since the alloc has a dynamic dependency to block argument
+// %0 in bb2.
+
+#map0 = affine_map<(d0) -> (d0)>
+
+// CHECK-LABEL: func @condBranchDynamicTypeNested
+func @condBranchDynamicTypeNested(
+  %arg0: i1,
+  %arg1: memref<?xf32>,
+  %arg2: memref<?xf32>,
+  %arg3: index) {
+  cond_br %arg0, ^bb1, ^bb2(%arg3: index)
+^bb1:
+  br ^bb6(%arg1 : memref<?xf32>)
+^bb2(%0: index):
+  %1 = alloc(%0) : memref<?xf32>
+  linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]}
+      ins(%arg1: memref<?xf32>)
+     outs(%1: memref<?xf32>) {
+  ^bb0(%gen1_arg0: f32, %gen1_arg1: f32):
+    %tmp1 = exp %gen1_arg0 : f32
+    linalg.yield %tmp1 : f32
+  }
+  cond_br %arg0, ^bb3, ^bb4
+^bb3:
+  br ^bb5(%1 : memref<?xf32>)
+^bb4:
+  br ^bb5(%1 : memref<?xf32>)
+^bb5(%2: memref<?xf32>):
+  br ^bb6(%2 : memref<?xf32>)
+^bb6(%3: memref<?xf32>):
+  br ^bb7(%3 : memref<?xf32>)
+^bb7(%4: memref<?xf32>):
+  "linalg.copy"(%4, %arg2) : (memref<?xf32>, memref<?xf32>) -> ()
+  return
+}
+
+// CHECK-NEXT: cond_br
+//      CHECK: ^bb2
+//      CHECK: ^bb2(%[[IDX:.*]]:{{.*}})
+// CHECK-NEXT: %[[ALLOC0:.*]] = alloc(%[[IDX]])
+// CHECK-NEXT: linalg.generic
+
+// -----
+
+// Test Case:
+//    bb0
+//   /   \
+//  |    bb1 <- Initial position of AllocOp
+//   \   /
+//    bb2
+// BufferHoisting expected behavior: It should move the existing AllocOp to
+// the entry block.
+
+#map0 = affine_map<(d0) -> (d0)>
+
+// CHECK-LABEL: func @criticalEdge
+func @criticalEdge(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) {
+  cond_br %arg0, ^bb1, ^bb2(%arg1 : memref<2xf32>)
+^bb1:
+  %0 = alloc() : memref<2xf32>
+  linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]}
+      ins(%arg1: memref<2xf32>)
+     outs(%0: memref<2xf32>) {
+  ^bb0(%gen1_arg0: f32, %gen1_arg1: f32):
+    %tmp1 = exp %gen1_arg0 : f32
+    linalg.yield %tmp1 : f32
+  }
+  br ^bb2(%0 : memref<2xf32>)
+^bb2(%1: memref<2xf32>):
+  "linalg.copy"(%1, %arg2) : (memref<2xf32>, memref<2xf32>) -> ()
+  return
+}
+
+// CHECK-NEXT: %[[ALLOC:.*]] = alloc()
+// CHECK-NEXT: cond_br
+
+// -----
+
+// Test Case:
+//    bb0 <- Initial position of the first AllocOp
+//   /   \
+//  bb1  bb2
+//   \   /
+//    bb3 <- Initial position of the second AllocOp
+// BufferHoisting expected behavior: It shouldn't move the AllocOps.
+
+#map0 = affine_map<(d0) -> (d0)>
+
+// CHECK-LABEL: func @ifElse
+func @ifElse(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) {
+  %0 = alloc() : memref<2xf32>
+  linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]}
+      ins(%arg1: memref<2xf32>)
+     outs(%0: memref<2xf32>) {
+  ^bb0(%gen1_arg0: f32, %gen1_arg1: f32):
+    %tmp1 = exp %gen1_arg0 : f32
+    linalg.yield %tmp1 : f32
+  }
+  cond_br %arg0,
+    ^bb1(%arg1, %0 : memref<2xf32>, memref<2xf32>),
+    ^bb2(%0, %arg1 : memref<2xf32>, memref<2xf32>)
+^bb1(%1: memref<2xf32>, %2: memref<2xf32>):
+  br ^bb3(%1, %2 : memref<2xf32>, memref<2xf32>)
+^bb2(%3: memref<2xf32>, %4: memref<2xf32>):
+  br ^bb3(%3, %4 : memref<2xf32>, memref<2xf32>)
+^bb3(%5: memref<2xf32>, %6: memref<2xf32>):
+  %7 = alloc() : memref<2xf32>
+  linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]}
+      ins(%7: memref<2xf32>)
+     outs(%7: memref<2xf32>) {
+  ^bb0(%gen2_arg0: f32, %gen2_arg1: f32):
+    %tmp2 = exp %gen2_arg0 : f32
+    linalg.yield %tmp2 : f32
+  }
+  "linalg.copy"(%7, %arg2) : (memref<2xf32>, memref<2xf32>) -> ()
+  return
+}
+
+// CHECK-NEXT: %[[ALLOC0:.*]] = alloc()
+// CHECK-NEXT: linalg.generic
+//      CHECK: br ^bb3
+//      CHECK: br ^bb3
+// CHECK-NEXT: ^bb3
+//      CHECK: %[[ALLOC1:.*]] = alloc()
+// CHECK-NEXT: linalg.generic
+//      CHECK: linalg.copy(%[[ALLOC1]]
+// CHECK-NEXT: return
+
+// -----
+
+// Test Case: No users for buffer in if-else CFG
+//    bb0 <- Initial position of AllocOp
+//   /   \
+//  bb1  bb2
+//   \   /
+//    bb3
+// BufferHoisting expected behavior: It shouldn't move the AllocOp.
+
+#map0 = affine_map<(d0) -> (d0)>
+
+// CHECK-LABEL: func @ifElseNoUsers
+func @ifElseNoUsers(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) {
+  %0 = alloc() : memref<2xf32>
+  linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]}
+      ins(%arg1: memref<2xf32>)
+     outs(%0: memref<2xf32>) {
+  ^bb0(%gen1_arg0: f32, %gen1_arg1: f32):
+    %tmp1 = exp %gen1_arg0 : f32
+    linalg.yield %tmp1 : f32
+  }
+  cond_br %arg0,
+    ^bb1(%arg1, %0 : memref<2xf32>, memref<2xf32>),
+    ^bb2(%0, %arg1 : memref<2xf32>, memref<2xf32>)
+^bb1(%1: memref<2xf32>, %2: memref<2xf32>):
+  br ^bb3(%1, %2 : memref<2xf32>, memref<2xf32>)
+^bb2(%3: memref<2xf32>, %4: memref<2xf32>):
+  br ^bb3(%3, %4 : memref<2xf32>, memref<2xf32>)
+^bb3(%5: memref<2xf32>, %6: memref<2xf32>):
+  "linalg.copy"(%arg1, %arg2) : (memref<2xf32>, memref<2xf32>) -> ()
+  return
+}
+
+// CHECK-NEXT: %[[ALLOC0:.*]] = alloc()
+// CHECK-NEXT: linalg.generic
+
+// -----
+
+// Test Case:
+//      bb0 <- Initial position of the first AllocOp
+//     /    \
+//   bb1    bb2
+//    |     /  \
+//    |   bb3  bb4
+//    \     \  /
+//     \     /
+//       bb5 <- Initial position of the second AllocOp
+// BufferHoisting expected behavior: AllocOps shouldn't be moved.
+
+#map0 = affine_map<(d0) -> (d0)>
+
+// CHECK-LABEL: func @ifElseNested
+func @ifElseNested(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) {
+  %0 = alloc() : memref<2xf32>
+  linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]}
+      ins(%arg1: memref<2xf32>)
+     outs(%0: memref<2xf32>) {
+  ^bb0(%gen1_arg0: f32, %gen1_arg1: f32):
+    %tmp1 = exp %gen1_arg0 : f32
+    linalg.yield %tmp1 : f32
+  }
+  cond_br %arg0,
+    ^bb1(%arg1, %0 : memref<2xf32>, memref<2xf32>),
+    ^bb2(%0, %arg1 : memref<2xf32>, memref<2xf32>)
+^bb1(%1: memref<2xf32>, %2: memref<2xf32>):
+  br ^bb5(%1, %2 : memref<2xf32>, memref<2xf32>)
+^bb2(%3: memref<2xf32>, %4: memref<2xf32>):
+  cond_br %arg0, ^bb3(%3 : memref<2xf32>), ^bb4(%4 : memref<2xf32>)
+^bb3(%5: memref<2xf32>):
+  br ^bb5(%5, %3 : memref<2xf32>, memref<2xf32>)
+^bb4(%6: memref<2xf32>):
+  br ^bb5(%3, %6 : memref<2xf32>, memref<2xf32>)
+^bb5(%7: memref<2xf32>, %8: memref<2xf32>):
+  %9 = alloc() : memref<2xf32>
+  linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]}
+      ins(%7: memref<2xf32>)
+     outs(%9: memref<2xf32>) {
+  ^bb0(%gen2_arg0: f32, %gen2_arg1: f32):
+    %tmp2 = exp %gen2_arg0 : f32
+    linalg.yield %tmp2 : f32
+  }
+  "linalg.copy"(%9, %arg2) : (memref<2xf32>, memref<2xf32>) -> ()
+  return
+}
+
+// CHECK-NEXT: %[[ALLOC0:.*]] = alloc()
+// CHECK-NEXT: linalg.generic
+//      CHECK: br ^bb5
+//      CHECK: br ^bb5
+//      CHECK: br ^bb5
+// CHECK-NEXT: ^bb5
+//      CHECK: %[[ALLOC1:.*]] = alloc()
+// CHECK-NEXT: linalg.generic
+
+// -----
+
+// Test Case: Dead operations in a single block.
+// BufferHoisting expected behavior: It shouldn't move the AllocOps.
+
+#map0 = affine_map<(d0) -> (d0)>
+
+// CHECK-LABEL: func @redundantOperations
+func @redundantOperations(%arg0: memref<2xf32>) {
+  %0 = alloc() : memref<2xf32>
+  linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]}
+      ins(%arg0: memref<2xf32>)
+     outs(%0: memref<2xf32>) {
+  ^bb0(%gen1_arg0: f32, %gen1_arg1: f32):
+    %tmp1 = exp %gen1_arg0 : f32
+    linalg.yield %tmp1 : f32
+  }
+  %1 = alloc() : memref<2xf32>
+  linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]}
+      ins(%0: memref<2xf32>)
+     outs(%1: memref<2xf32>) {
+  ^bb0(%gen2_arg0: f32, %gen2_arg1: f32):
+    %tmp2 = exp %gen2_arg0 : f32
+    linalg.yield %tmp2 : f32
+  }
+  return
+}
+
+// CHECK-NEXT: %[[ALLOC0:.*]] = alloc()
+// CHECK-NEXT: linalg.generic
+//      CHECK: %[[ALLOC1:.*]] = alloc()
+// CHECK-NEXT: linalg.generic
+
+// -----
+
+// Test Case:
+//                                     bb0
+//                                    /   \
+// Initial pos of the 1st AllocOp -> bb1  bb2 <- Initial pos of the 2nd AllocOp
+//                                    \   /
+//                                     bb3
+// BufferHoisting expected behavior: Both AllocOps should be moved to the
+// entry block.
+
+#map0 = affine_map<(d0) -> (d0)>
+
+// CHECK-LABEL: func @moving_alloc_and_inserting_missing_dealloc
+func @moving_alloc_and_inserting_missing_dealloc(
+  %cond: i1,
+    %arg0: memref<2xf32>,
+    %arg1: memref<2xf32>) {
+  cond_br %cond, ^bb1, ^bb2
+^bb1:
+  %0 = alloc() : memref<2xf32>
+  linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]}
+      ins(%arg0: memref<2xf32>)
+     outs(%0: memref<2xf32>) {
+  ^bb0(%gen1_arg0: f32, %gen1_arg1: f32):
+    %tmp1 = exp %gen1_arg0 : f32
+    linalg.yield %tmp1 : f32
+  }
+  br ^exit(%0 : memref<2xf32>)
+^bb2:
+  %1 = alloc() : memref<2xf32>
+  linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]}
+      ins(%arg0: memref<2xf32>)
+     outs(%1: memref<2xf32>) {
+  ^bb0(%gen2_arg0: f32, %gen2_arg1: f32):
+    %tmp2 = exp %gen2_arg0 : f32
+    linalg.yield %tmp2 : f32
+  }
+  br ^exit(%1 : memref<2xf32>)
+^exit(%arg2: memref<2xf32>):
+  "linalg.copy"(%arg2, %arg1) : (memref<2xf32>, memref<2xf32>) -> ()
+  return
+}
+
+// CHECK-NEXT: %{{.*}} = alloc()
+// CHECK-NEXT: %{{.*}} = alloc()
+// CHECK-NEXT: cond_br
+
+// -----
+
+// Test Case: Invalid position of the DeallocOp. There is a user after
+// deallocation.
+//   bb0
+//  /   \
+// bb1  bb2 <- Initial position of AllocOp
+//  \   /
+//   bb3
+// BufferHoisting expected behavior: It should move the AllocOp to the entry
+// block.
+
+#map0 = affine_map<(d0) -> (d0)>
+
+// CHECK-LABEL: func @moving_invalid_dealloc_op_complex
+func @moving_invalid_dealloc_op_complex(
+  %cond: i1,
+    %arg0: memref<2xf32>,
+    %arg1: memref<2xf32>) {
+  cond_br %cond, ^bb1, ^bb2
+^bb1:
+  br ^exit(%arg0 : memref<2xf32>)
+^bb2:
+  %1 = alloc() : memref<2xf32>
+  linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]}
+      ins(%arg0: memref<2xf32>)
+     outs(%1: memref<2xf32>) {
+  ^bb0(%gen1_arg0: f32, %gen1_arg1: f32):
+    %tmp1 = exp %gen1_arg0 : f32
+    linalg.yield %tmp1 : f32
+  }
+  dealloc %1 : memref<2xf32>
+  br ^exit(%1 : memref<2xf32>)
+^exit(%arg2: memref<2xf32>):
+  "linalg.copy"(%arg2, %arg1) : (memref<2xf32>, memref<2xf32>) -> ()
+  return
+}
+
+// CHECK-NEXT: %{{.*}} = alloc()
+// CHECK-NEXT: cond_br
+
+// -----
+
+// Test Case: Nested regions - This test defines a GenericOp inside the region
+// of another GenericOp.
+// BufferHoisting expected behavior: The AllocOp of inner GenericOp should
+// remain inside the region of outer GenericOp. The AllocOp of the outer
+// GenericOp should be moved to the entry block.
+
+#map0 = affine_map<(d0) -> (d0)>
+
+// CHECK-LABEL: func @nested_regions_and_cond_branch
+func @nested_regions_and_cond_branch(
+  %arg0: i1,
+  %arg1: memref<2xf32>,
+  %arg2: memref<2xf32>) {
+  cond_br %arg0, ^bb1, ^bb2
+^bb1:
+  br ^bb3(%arg1 : memref<2xf32>)
+^bb2:
+  %0 = alloc() : memref<2xf32>
+  linalg.generic {
+    indexing_maps = [#map0, #map0],
+    iterator_types = ["parallel"]}
+      ins(%arg1: memref<2xf32>)
+     outs(%0: memref<2xf32>) {
+  ^bb0(%gen1_arg0: f32, %gen1_arg1: f32):
+    %1 = alloc() : memref<2xf32>
+    linalg.generic {
+      indexing_maps = [#map0, #map0],
+      iterator_types = ["parallel"]}
+        ins(%arg1: memref<2xf32>)
+      outs(%1: memref<2xf32>) {
+    ^bb0(%gen2_arg0: f32, %gen2_arg1: f32):
+      %tmp2 = exp %gen2_arg0 : f32
+      linalg.yield %tmp2 : f32
+    }
+    %tmp1 = exp %gen1_arg0 : f32
+    linalg.yield %tmp1 : f32
+  }
+  br ^bb3(%0 : memref<2xf32>)
+^bb3(%1: memref<2xf32>):
+  "linalg.copy"(%1, %arg2) : (memref<2xf32>, memref<2xf32>) -> ()
+  return
+}
+// CHECK-NEXT:   %[[ALLOC0:.*]] = alloc()
+// CHECK-NEXT:   cond_br
+//      CHECK:   linalg.generic
+//      CHECK:     %[[ALLOC1:.*]] = alloc()
+// CHECK-NEXT:     linalg.generic
+
+// -----
+
+// Test Case: nested region control flow
+// The position of alloc %1 does not need to be changed; the buffer flows
+// through both if branches until it is finally returned.
+
+// CHECK-LABEL: func @nested_region_control_flow
+func @nested_region_control_flow(
+  %arg0 : index,
+  %arg1 : index) -> memref<?x?xf32> {
+  %0 = cmpi "eq", %arg0, %arg1 : index
+  %1 = alloc(%arg0, %arg0) : memref<?x?xf32>
+  %2 = scf.if %0 -> (memref<?x?xf32>) {
+    scf.yield %1 : memref<?x?xf32>
+  } else {
+    %3 = alloc(%arg0, %arg1) : memref<?x?xf32>
+    scf.yield %1 : memref<?x?xf32>
+  }
+  return %2 : memref<?x?xf32>
+}
+
+//      CHECK: %[[ALLOC0:.*]] = alloc(%arg0, %arg0)
+// CHECK-NEXT: %{{.*}} = scf.if
+//      CHECK: else
+// CHECK-NEXT: %[[ALLOC1:.*]] = alloc(%arg0, %arg1)
+
+// -----
+
+// Test Case: nested region control flow with a nested buffer allocation in a
+// divergent branch.
+// The alloc position of %1 does not need to be changed. %3 is moved upwards.
+
+// CHECK-LABEL: func @nested_region_control_flow_div
+func @nested_region_control_flow_div(
+  %arg0 : index,
+  %arg1 : index) -> memref<?x?xf32> {
+  %0 = cmpi "eq", %arg0, %arg1 : index
+  %1 = alloc(%arg0, %arg0) : memref<?x?xf32>
+  %2 = scf.if %0 -> (memref<?x?xf32>) {
+    scf.yield %1 : memref<?x?xf32>
+  } else {
+    %3 = alloc(%arg0, %arg1) : memref<?x?xf32>
+    scf.yield %3 : memref<?x?xf32>
+  }
+  return %2 : memref<?x?xf32>
+}
+
+//      CHECK: %[[ALLOC0:.*]] = alloc(%arg0, %arg0)
+// CHECK-NEXT: %[[ALLOC1:.*]] = alloc(%arg0, %arg1)
+// CHECK-NEXT: %{{.*}} = scf.if
+
+// -----
+
+// Test Case: deeply nested region control flow with a nested buffer allocation
+// in a divergent branch.
+// The alloc position of %1 does not need to be changed. Allocs %4 and %5 are
+// moved upwards.
+
+// CHECK-LABEL: func @nested_region_control_flow_div_nested
+func @nested_region_control_flow_div_nested(
+  %arg0 : index,
+  %arg1 : index) -> memref<?x?xf32> {
+  %0 = cmpi "eq", %arg0, %arg1 : index
+  %1 = alloc(%arg0, %arg0) : memref<?x?xf32>
+  %2 = scf.if %0 -> (memref<?x?xf32>) {
+    %3 = scf.if %0 -> (memref<?x?xf32>) {
+      scf.yield %1 : memref<?x?xf32>
+    } else {
+      %4 = alloc(%arg0, %arg1) : memref<?x?xf32>
+      scf.yield %4 : memref<?x?xf32>
+    }
+    scf.yield %3 : memref<?x?xf32>
+  } else {
+    %5 = alloc(%arg1, %arg1) : memref<?x?xf32>
+    scf.yield %5 : memref<?x?xf32>
+  }
+  return %2 : memref<?x?xf32>
+}
+//      CHECK: %[[ALLOC0:.*]] = alloc(%arg0, %arg0)
+// CHECK-NEXT: %[[ALLOC1:.*]] = alloc(%arg0, %arg1)
+// CHECK-NEXT: %[[ALLOC2:.*]] = alloc(%arg1, %arg1)
+// CHECK-NEXT: %{{.*}} = scf.if
+
+// -----
+
+// Test Case: nested region control flow within a region interface.
+// The alloc position of %0 does not need to be changed.
+
+// CHECK-LABEL: func @inner_region_control_flow
+func @inner_region_control_flow(%arg0 : index) -> memref<?x?xf32> {
+  %0 = alloc(%arg0, %arg0) : memref<?x?xf32>
+  %1 = test.region_if %0 : memref<?x?xf32> -> (memref<?x?xf32>) then {
+    ^bb0(%arg1 : memref<?x?xf32>):
+      test.region_if_yield %arg1 : memref<?x?xf32>
+  } else {
+    ^bb0(%arg1 : memref<?x?xf32>):
+      test.region_if_yield %arg1 : memref<?x?xf32>
+  } join {
+    ^bb0(%arg1 : memref<?x?xf32>):
+      test.region_if_yield %arg1 : memref<?x?xf32>
+  }
+  return %1 : memref<?x?xf32>
+}
+
+//      CHECK: %[[ALLOC0:.*]] = alloc(%arg0, %arg0)
+// CHECK-NEXT: {{.*}} test.region_if
+
+// -----
+
+// Test Case: nested region control flow within a region interface including an
+// allocation in a divergent branch.
+// The alloc position of %0 does not need to be changed. %2 is moved upwards.
+
+// CHECK-LABEL: func @inner_region_control_flow_div
+func @inner_region_control_flow_div(
+  %arg0 : index,
+  %arg1 : index) -> memref<?x?xf32> {
+  %0 = alloc(%arg0, %arg0) : memref<?x?xf32>
+  %1 = test.region_if %0 : memref<?x?xf32> -> (memref<?x?xf32>) then {
+    ^bb0(%arg2 : memref<?x?xf32>):
+      test.region_if_yield %arg2 : memref<?x?xf32>
+  } else {
+    ^bb0(%arg2 : memref<?x?xf32>):
+      %2 = alloc(%arg0, %arg1) : memref<?x?xf32>
+      test.region_if_yield %2 : memref<?x?xf32>
+  } join {
+    ^bb0(%arg2 : memref<?x?xf32>):
+      test.region_if_yield %arg2 : memref<?x?xf32>
+  }
+  return %1 : memref<?x?xf32>
+}
+
+//      CHECK: %[[ALLOC0:.*]] = alloc(%arg0, %arg0)
+// CHECK-NEXT: %[[ALLOC1:.*]] = alloc(%arg0, %arg1)
+// CHECK-NEXT: {{.*}} test.region_if
+
+// -----
+
+#map0 = affine_map<(d0) -> (d0)>
+
+// Test Case: Alloca operations shouldn't be moved.
+
+// CHECK-LABEL: func @condBranchAlloca
+func @condBranchAlloca(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) {
+  cond_br %arg0, ^bb1, ^bb2
+^bb1:
+  br ^bb3(%arg1 : memref<2xf32>)
+^bb2:
+  %0 = alloca() : memref<2xf32>
+  linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]}
+      ins(%arg1: memref<2xf32>)
+     outs(%0: memref<2xf32>) {
+  ^bb0(%gen1_arg0: f32, %gen1_arg1: f32):
+    %tmp1 = exp %gen1_arg0 : f32
+    linalg.yield %tmp1 : f32
+  }
+  br ^bb3(%0 : memref<2xf32>)
+^bb3(%1: memref<2xf32>):
+  "linalg.copy"(%1, %arg2) : (memref<2xf32>, memref<2xf32>) -> ()
+  return
+}
+
+// CHECK-NEXT: cond_br
+//      CHECK: ^bb2
+//      CHECK: ^bb2
+// CHECK-NEXT: %[[ALLOCA:.*]] = alloca()
+// CHECK-NEXT: linalg.generic
+
+// -----
+
+#map0 = affine_map<(d0) -> (d0)>
+
+// Test Case: Alloca operations shouldn't be moved. The alloc operation also
+// shouldn't be moved, analogously to the ifElseNested test.
+
+// CHECK-LABEL: func @ifElseNestedAlloca
+func @ifElseNestedAlloca(
+  %arg0: i1,
+  %arg1: memref<2xf32>,
+  %arg2: memref<2xf32>) {
+  %0 = alloca() : memref<2xf32>
+  linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]}
+      ins(%arg1: memref<2xf32>)
+     outs(%0: memref<2xf32>) {
+  ^bb0(%gen1_arg0: f32, %gen1_arg1: f32):
+    %tmp1 = exp %gen1_arg0 : f32
+    linalg.yield %tmp1 : f32
+  }
+  cond_br %arg0,
+    ^bb1(%arg1, %0 : memref<2xf32>, memref<2xf32>),
+    ^bb2(%0, %arg1 : memref<2xf32>, memref<2xf32>)
+^bb1(%1: memref<2xf32>, %2: memref<2xf32>):
+  br ^bb5(%1, %2 : memref<2xf32>, memref<2xf32>)
+^bb2(%3: memref<2xf32>, %4: memref<2xf32>):
+  cond_br %arg0, ^bb3(%3 : memref<2xf32>), ^bb4(%4 : memref<2xf32>)
+^bb3(%5: memref<2xf32>):
+  br ^bb5(%5, %3 : memref<2xf32>, memref<2xf32>)
+^bb4(%6: memref<2xf32>):
+  br ^bb5(%3, %6 : memref<2xf32>, memref<2xf32>)
+^bb5(%7: memref<2xf32>, %8: memref<2xf32>):
+  %9 = alloc() : memref<2xf32>
+  linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]}
+      ins(%7: memref<2xf32>)
+     outs(%9: memref<2xf32>) {
+  ^bb0(%gen2_arg0: f32, %gen2_arg1: f32):
+    %tmp2 = exp %gen2_arg0 : f32
+    linalg.yield %tmp2 : f32
+  }
+  "linalg.copy"(%9, %arg2) : (memref<2xf32>, memref<2xf32>) -> ()
+  return
+}
+
+// CHECK-NEXT: %[[ALLOCA:.*]] = alloca()
+// CHECK-NEXT: linalg.generic
+//      CHECK: ^bb5
+//      CHECK: ^bb5
+//      CHECK: ^bb5
+// CHECK-NEXT: ^bb5
+// CHECK-NEXT: %[[ALLOC:.*]] = alloc()
+// CHECK-NEXT: linalg.generic
+
+// -----
+
+#map0 = affine_map<(d0) -> (d0)>
+
+// Test Case: Alloca operations shouldn't be moved. The alloc operation should
+// be moved to the entry block, as in the nestedRegionsAndCondBranch test.
+
+// CHECK-LABEL: func @nestedRegionsAndCondBranchAlloca
+func @nestedRegionsAndCondBranchAlloca(
+  %arg0: i1,
+  %arg1: memref<2xf32>,
+  %arg2: memref<2xf32>) {
+  cond_br %arg0, ^bb1, ^bb2
+^bb1:
+  br ^bb3(%arg1 : memref<2xf32>)
+^bb2:
+  %0 = alloc() : memref<2xf32>
+  linalg.generic {
+    indexing_maps = [#map0, #map0],
+    iterator_types = ["parallel"]}
+      ins(%arg1: memref<2xf32>)
+     outs(%0: memref<2xf32>) {
+  ^bb0(%gen1_arg0: f32, %gen1_arg1: f32):
+    %1 = alloca() : memref<2xf32>
+    linalg.generic {
+      indexing_maps = [#map0, #map0],
+      iterator_types = ["parallel"]}
+        ins(%arg1: memref<2xf32>)
+      outs(%1: memref<2xf32>) {
+    ^bb0(%gen2_arg0: f32, %gen2_arg1: f32):
+      %tmp2 = exp %gen2_arg0 : f32
+      linalg.yield %tmp2 : f32
+    }
+    %tmp1 = exp %gen1_arg0 : f32
+    linalg.yield %tmp1 : f32
+  }
+  br ^bb3(%0 : memref<2xf32>)
+^bb3(%1: memref<2xf32>):
+  "linalg.copy"(%1, %arg2) : (memref<2xf32>, memref<2xf32>) -> ()
+  return
+}
+// CHECK-NEXT:   %[[ALLOC:.*]] = alloc()
+// CHECK-NEXT:   cond_br
+//      CHECK:   linalg.generic
+//      CHECK:     %[[ALLOCA:.*]] = alloca()
+// CHECK-NEXT:     linalg.generic
+
+// -----
+
+// Test Case: structured control-flow loop using a nested alloc.
+// The allocation %3 will be moved upwards.
+
+// CHECK-LABEL: func @loop_alloc
+func @loop_alloc(
+  %lb: index,
+  %ub: index,
+  %step: index,
+  %buf: memref<2xf32>,
+  %res: memref<2xf32>) {
+  %0 = alloc() : memref<2xf32>
+  %1 = scf.for %i = %lb to %ub step %step
+    iter_args(%iterBuf = %buf) -> memref<2xf32> {
+    %2 = cmpi "eq", %i, %ub : index
+    %3 = alloc() : memref<2xf32>
+    scf.yield %3 : memref<2xf32>
+  }
+  "linalg.copy"(%1, %res) : (memref<2xf32>, memref<2xf32>) -> ()
+  return
+}
+
+//      CHECK: %[[ALLOC0:.*]] = alloc()
+// CHECK-NEXT: {{.*}} scf.for
+//      CHECK: %[[ALLOC1:.*]] = alloc()
+
+// -----
+
+// Test Case: structured control-flow loop with a nested if operation using
+// a deeply nested buffer allocation.
+// The allocation %4 is not moved upwards.
+
+// CHECK-LABEL: func @loop_nested_if_alloc
+func @loop_nested_if_alloc(
+  %lb: index,
+  %ub: index,
+  %step: index,
+  %buf: memref<2xf32>) -> memref<2xf32> {
+  %0 = alloc() : memref<2xf32>
+  %1 = scf.for %i = %lb to %ub step %step
+    iter_args(%iterBuf = %buf) -> memref<2xf32> {
+    %2 = cmpi "eq", %i, %ub : index
+    %3 = scf.if %2 -> (memref<2xf32>) {
+      %4 = alloc() : memref<2xf32>
+      scf.yield %4 : memref<2xf32>
+    } else {
+      scf.yield %0 : memref<2xf32>
+    }
+    scf.yield %3 : memref<2xf32>
+  }
+  return %1 : memref<2xf32>
+}
+
+//      CHECK: %[[ALLOC0:.*]] = alloc()
+// CHECK-NEXT: {{.*}} scf.for
+//      CHECK: %[[ALLOC1:.*]] = alloc()
+
+// -----
+
+// Test Case: several nested structured control-flow loops with a deeply nested
+// buffer allocation inside an if operation.
+// Same behavior as in loop_nested_if_alloc: the allocs are not moved upwards.
+
+// CHECK-LABEL: func @loop_nested_alloc
+func @loop_nested_alloc(
+  %lb: index,
+  %ub: index,
+  %step: index,
+  %buf: memref<2xf32>,
+  %res: memref<2xf32>) {
+  %0 = alloc() : memref<2xf32>
+  %1 = scf.for %i = %lb to %ub step %step
+    iter_args(%iterBuf = %buf) -> memref<2xf32> {
+    %2 = scf.for %i2 = %lb to %ub step %step
+      iter_args(%iterBuf2 = %iterBuf) -> memref<2xf32> {
+      %3 = scf.for %i3 = %lb to %ub step %step
+        iter_args(%iterBuf3 = %iterBuf2) -> memref<2xf32> {
+        %4 = alloc() : memref<2xf32>
+        %5 = cmpi "eq", %i, %ub : index
+        %6 = scf.if %5 -> (memref<2xf32>) {
+          %7 = alloc() : memref<2xf32>
+          scf.yield %7 : memref<2xf32>
+        } else {
+          scf.yield %iterBuf3 : memref<2xf32>
+        }
+        scf.yield %6 : memref<2xf32>
+      }
+      scf.yield %3 : memref<2xf32>
+    }
+    scf.yield %2 : memref<2xf32>
+  }
+  "linalg.copy"(%1, %res) : (memref<2xf32>, memref<2xf32>) -> ()
+  return
+}
+
+//      CHECK: %[[ALLOC0:.*]] = alloc()
+// CHECK-NEXT: {{.*}} = scf.for
+// CHECK-NEXT: {{.*}} = scf.for
+// CHECK-NEXT: {{.*}} = scf.for
+// CHECK-NEXT: %[[ALLOC1:.*]] = alloc()
+//      CHECK: %[[ALLOC2:.*]] = alloc()
+
+// -----
+
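+// Test Case: deeply nested allocation with a dynamic dependency.
+// The allocation %7 depends on the induction variable %i3 and can therefore
+// not be hoisted past the innermost loop.
+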
+// CHECK-LABEL: func @loop_nested_alloc_dyn_dependency
+func @loop_nested_alloc_dyn_dependency(
+  %lb: index,
+  %ub: index,
+  %step: index,
+  %arg0: index,
+  %buf: memref<?xf32>,
+  %res: memref<?xf32>) {
+  %0 = alloc(%arg0) : memref<?xf32>
+  %1 = scf.for %i = %lb to %ub step %step
+    iter_args(%iterBuf = %buf) -> memref<?xf32> {
+    %2 = scf.for %i2 = %lb to %ub step %step
+      iter_args(%iterBuf2 = %iterBuf) -> memref<?xf32> {
+      %3 = scf.for %i3 = %lb to %ub step %step
+        iter_args(%iterBuf3 = %iterBuf2) -> memref<?xf32> {
+        %5 = cmpi "eq", %i, %ub : index
+        %6 = scf.if %5 -> (memref<?xf32>) {
+          %7 = alloc(%i3) : memref<?xf32>
+          scf.yield %7 : memref<?xf32>
+        } else {
+          scf.yield %iterBuf3 : memref<?xf32>
+        }
+        scf.yield %6 : memref<?xf32>
+      }
+      scf.yield %3 : memref<?xf32>
+    }
+    scf.yield %0 : memref<?xf32>
+  }
+  "linalg.copy"(%1, %res) : (memref<?xf32>, memref<?xf32>) -> ()
+  return
+}
+
+//      CHECK: %[[ALLOC0:.*]] = alloc({{.*}})
+// CHECK-NEXT: {{.*}} = scf.for
+// CHECK-NEXT: {{.*}} = scf.for
+// CHECK-NEXT: {{.*}} = scf.for
+//      CHECK: %[[ALLOC1:.*]] = alloc({{.*}})

diff  --git a/mlir/test/Transforms/buffer-loop-hoisting.mlir b/mlir/test/Transforms/buffer-loop-hoisting.mlir
new file mode 100644
index 000000000000..bcb44efb5fea
--- /dev/null
+++ b/mlir/test/Transforms/buffer-loop-hoisting.mlir
@@ -0,0 +1,490 @@
+// RUN: mlir-opt -buffer-loop-hoisting -split-input-file %s | FileCheck %s
+
+// This file checks the behavior of the BufferLoopHoisting pass for moving
+// Alloc operations to their correct positions.
+
+// Test Case:
+//    bb0
+//   /   \
+//  bb1  bb2 <- Initial position of AllocOp
+//   \   /
+//    bb3
+// BufferLoopHoisting expected behavior: It should not move the AllocOp.
+
+#map0 = affine_map<(d0) -> (d0)>
+
+// CHECK-LABEL: func @condBranch
+func @condBranch(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) {
+  cond_br %arg0, ^bb1, ^bb2
+^bb1:
+  br ^bb3(%arg1 : memref<2xf32>)
+^bb2:
+  %0 = alloc() : memref<2xf32>
+  linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]}
+    ins(%arg1: memref<2xf32>)
+    outs(%0: memref<2xf32>) {
+  ^bb0(%gen1_arg0: f32, %gen1_arg1: f32):
+    %tmp1 = exp %gen1_arg0 : f32
+    linalg.yield %tmp1 : f32
+  }
+  br ^bb3(%0 : memref<2xf32>)
+^bb3(%1: memref<2xf32>):
+  "linalg.copy"(%1, %arg2) : (memref<2xf32>, memref<2xf32>) -> ()
+  return
+}
+
+// CHECK-NEXT: cond_br
+//      CHECK: %[[ALLOC:.*]] = alloc()
+
+// -----
+
+// Test Case:
+//    bb0
+//   /   \
+//  bb1  bb2 <- Initial position of AllocOp
+//   \   /
+//    bb3
+// BufferLoopHoisting expected behavior: It should not move the existing AllocOp
+// to any other block since the alloc has a dynamic dependency on block argument
+// %0 in bb2.
+
+#map0 = affine_map<(d0) -> (d0)>
+
+// CHECK-LABEL: func @condBranchDynamicType
+func @condBranchDynamicType(
+  %arg0: i1,
+  %arg1: memref<?xf32>,
+  %arg2: memref<?xf32>,
+  %arg3: index) {
+  cond_br %arg0, ^bb1, ^bb2(%arg3: index)
+^bb1:
+  br ^bb3(%arg1 : memref<?xf32>)
+^bb2(%0: index):
+  %1 = alloc(%0) : memref<?xf32>
+  linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]}
+    ins(%arg1: memref<?xf32>)
+    outs(%1: memref<?xf32>) {
+  ^bb0(%gen1_arg0: f32, %gen1_arg1: f32):
+    %tmp1 = exp %gen1_arg0 : f32
+    linalg.yield %tmp1 : f32
+  }
+  br ^bb3(%1 : memref<?xf32>)
+^bb3(%2: memref<?xf32>):
+  "linalg.copy"(%2, %arg2) : (memref<?xf32>, memref<?xf32>) -> ()
+  return
+}
+
+// CHECK-NEXT: cond_br
+//      CHECK: ^bb2
+//      CHECK: ^bb2(%[[IDX:.*]]:{{.*}})
+// CHECK-NEXT: %[[ALLOC0:.*]] = alloc(%[[IDX]])
+// CHECK-NEXT: linalg.generic
+
+// -----
+
+// Test Case: Nested regions - This test defines a GenericOp inside the region
+// of another GenericOp.
+// BufferLoopHoisting expected behavior: The AllocOp of the inner GenericOp
+// should remain inside the region of the outer GenericOp. The AllocOp of the
+// outer GenericOp should not be moved by this pass.
+
+#map0 = affine_map<(d0) -> (d0)>
+
+// CHECK-LABEL: func @nested_regions_and_cond_branch
+func @nested_regions_and_cond_branch(
+  %arg0: i1,
+  %arg1: memref<2xf32>,
+  %arg2: memref<2xf32>) {
+  cond_br %arg0, ^bb1, ^bb2
+^bb1:
+  br ^bb3(%arg1 : memref<2xf32>)
+^bb2:
+  %0 = alloc() : memref<2xf32>
+  linalg.generic {
+    indexing_maps = [#map0, #map0],
+    iterator_types = ["parallel"]}
+    ins(%arg1: memref<2xf32>)
+    outs(%0: memref<2xf32>) {
+  ^bb0(%gen1_arg0: f32, %gen1_arg1: f32):
+    %1 = alloc() : memref<2xf32>
+    linalg.generic {
+      indexing_maps = [#map0, #map0],
+      iterator_types = ["parallel"]}
+      ins(%arg1: memref<2xf32>)
+      outs(%1: memref<2xf32>) {
+    ^bb0(%gen2_arg0: f32, %gen2_arg1: f32):
+      %tmp2 = exp %gen2_arg0 : f32
+      linalg.yield %tmp2 : f32
+    }
+    %tmp1 = exp %gen1_arg0 : f32
+    linalg.yield %tmp1 : f32
+  }
+  br ^bb3(%0 : memref<2xf32>)
+^bb3(%1: memref<2xf32>):
+  "linalg.copy"(%1, %arg2) : (memref<2xf32>, memref<2xf32>) -> ()
+  return
+}
+// CHECK-NEXT:   cond_br
+//      CHECK:   %[[ALLOC0:.*]] = alloc()
+//      CHECK:   linalg.generic
+//      CHECK:     %[[ALLOC1:.*]] = alloc()
+// CHECK-NEXT:     linalg.generic
+
+// -----
+
+// Test Case: nested region control flow
+// The alloc position of %1 does not need to be changed; the buffer flows
+// through both if branches until it is finally returned.
+
+// CHECK-LABEL: func @nested_region_control_flow
+func @nested_region_control_flow(
+  %arg0 : index,
+  %arg1 : index) -> memref<?x?xf32> {
+  %0 = cmpi "eq", %arg0, %arg1 : index
+  %1 = alloc(%arg0, %arg0) : memref<?x?xf32>
+  %2 = scf.if %0 -> (memref<?x?xf32>) {
+    scf.yield %1 : memref<?x?xf32>
+  } else {
+    %3 = alloc(%arg0, %arg1) : memref<?x?xf32>
+    scf.yield %1 : memref<?x?xf32>
+  }
+  return %2 : memref<?x?xf32>
+}
+
+//      CHECK: %[[ALLOC0:.*]] = alloc(%arg0, %arg0)
+// CHECK-NEXT: %{{.*}} = scf.if
+//      CHECK: else
+// CHECK-NEXT: %[[ALLOC1:.*]] = alloc(%arg0, %arg1)
+
+// -----
+
+// Test Case: structured control-flow loop using a nested alloc.
+// The position of allocation %3 should not be changed.
+
+// CHECK-LABEL: func @loop_alloc
+func @loop_alloc(
+  %lb: index,
+  %ub: index,
+  %step: index,
+  %buf: memref<2xf32>,
+  %res: memref<2xf32>) {
+  %0 = alloc() : memref<2xf32>
+  %1 = scf.for %i = %lb to %ub step %step
+    iter_args(%iterBuf = %buf) -> memref<2xf32> {
+    %2 = cmpi "eq", %i, %ub : index
+    %3 = alloc() : memref<2xf32>
+    scf.yield %3 : memref<2xf32>
+  }
+  "linalg.copy"(%1, %res) : (memref<2xf32>, memref<2xf32>) -> ()
+  return
+}
+
+//      CHECK: %[[ALLOC0:.*]] = alloc()
+// CHECK-NEXT: {{.*}} scf.for
+//      CHECK: %[[ALLOC1:.*]] = alloc()
+
+// -----
+
+// Test Case: structured control-flow loop with a nested if operation using
+// a deeply nested buffer allocation.
+// The allocation %4 should not be moved upwards due to a back-edge dependency.
+
+// CHECK-LABEL: func @loop_nested_if_alloc
+func @loop_nested_if_alloc(
+  %lb: index,
+  %ub: index,
+  %step: index,
+  %buf: memref<2xf32>) -> memref<2xf32> {
+  %0 = alloc() : memref<2xf32>
+  %1 = scf.for %i = %lb to %ub step %step
+    iter_args(%iterBuf = %buf) -> memref<2xf32> {
+    %2 = cmpi "eq", %i, %ub : index
+    %3 = scf.if %2 -> (memref<2xf32>) {
+      %4 = alloc() : memref<2xf32>
+      scf.yield %4 : memref<2xf32>
+    } else {
+      scf.yield %0 : memref<2xf32>
+    }
+    scf.yield %3 : memref<2xf32>
+  }
+  return %1 : memref<2xf32>
+}
+
+//      CHECK: %[[ALLOC0:.*]] = alloc()
+// CHECK-NEXT: {{.*}} scf.for
+//      CHECK: %[[ALLOC1:.*]] = alloc()
+
+// -----
+
+// Test Case: several nested structured control-flow loops with deeply nested
+// buffer allocations inside an if operation.
+// Behavior: The allocs %4 and %9 are moved upwards to join %0 at the top,
+// while %7 and %8 stay in their positions.
+
+// CHECK-LABEL: func @loop_nested_alloc
+func @loop_nested_alloc(
+  %lb: index,
+  %ub: index,
+  %step: index,
+  %buf: memref<2xf32>,
+  %res: memref<2xf32>) {
+  %0 = alloc() : memref<2xf32>
+  %1 = scf.for %i = %lb to %ub step %step
+    iter_args(%iterBuf = %buf) -> memref<2xf32> {
+    %2 = scf.for %i2 = %lb to %ub step %step
+      iter_args(%iterBuf2 = %iterBuf) -> memref<2xf32> {
+      %3 = scf.for %i3 = %lb to %ub step %step
+        iter_args(%iterBuf3 = %iterBuf2) -> memref<2xf32> {
+        %4 = alloc() : memref<2xf32>
+        %5 = cmpi "eq", %i, %ub : index
+        %6 = scf.if %5 -> (memref<2xf32>) {
+          %7 = alloc() : memref<2xf32>
+          %8 = alloc() : memref<2xf32>
+          scf.yield %8 : memref<2xf32>
+        } else {
+          scf.yield %iterBuf3 : memref<2xf32>
+        }
+        %9 = alloc() : memref<2xf32>
+        scf.yield %6 : memref<2xf32>
+      }
+      scf.yield %3 : memref<2xf32>
+    }
+    scf.yield %2 : memref<2xf32>
+  }
+  "linalg.copy"(%1, %res) : (memref<2xf32>, memref<2xf32>) -> ()
+  return
+}
+
+//      CHECK: %[[ALLOC0:.*]] = alloc()
+// CHECK-NEXT: %[[ALLOC1:.*]] = alloc()
+// CHECK-NEXT: %[[ALLOC2:.*]] = alloc()
+// CHECK-NEXT: {{.*}} = scf.for
+// CHECK-NEXT: {{.*}} = scf.for
+// CHECK-NEXT: {{.*}} = scf.for
+//      CHECK: {{.*}} = scf.if
+//      CHECK: %[[ALLOC3:.*]] = alloc()
+//      CHECK: %[[ALLOC4:.*]] = alloc()
+
+// -----
+
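+// Test Case: deeply nested allocations with dynamic dependencies.
+// The allocations %4, %7 and %8 depend on the induction variable %i3 and
+// should therefore not be hoisted out of the innermost loop.
+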
+// CHECK-LABEL: func @loop_nested_alloc_dyn_dependency
+func @loop_nested_alloc_dyn_dependency(
+  %lb: index,
+  %ub: index,
+  %step: index,
+  %arg0: index,
+  %buf: memref<?xf32>,
+  %res: memref<?xf32>) {
+  %0 = alloc(%arg0) : memref<?xf32>
+  %1 = scf.for %i = %lb to %ub step %step
+    iter_args(%iterBuf = %buf) -> memref<?xf32> {
+    %2 = scf.for %i2 = %lb to %ub step %step
+      iter_args(%iterBuf2 = %iterBuf) -> memref<?xf32> {
+      %3 = scf.for %i3 = %lb to %ub step %step
+        iter_args(%iterBuf3 = %iterBuf2) -> memref<?xf32> {
+        %4 = alloc(%i3) : memref<?xf32>
+        %5 = cmpi "eq", %i, %ub : index
+        %6 = scf.if %5 -> (memref<?xf32>) {
+          %7 = alloc(%i3) : memref<?xf32>
+          scf.yield %7 : memref<?xf32>
+        } else {
+          scf.yield %iterBuf3 : memref<?xf32>
+        }
+        %8 = alloc(%i3) : memref<?xf32>
+        scf.yield %6 : memref<?xf32>
+      }
+      scf.yield %3 : memref<?xf32>
+    }
+    scf.yield %0 : memref<?xf32>
+  }
+  "linalg.copy"(%1, %res) : (memref<?xf32>, memref<?xf32>) -> ()
+  return
+}
+
+//      CHECK: %[[ALLOC0:.*]] = alloc({{.*}})
+// CHECK-NEXT: {{.*}} = scf.for
+// CHECK-NEXT: {{.*}} = scf.for
+// CHECK-NEXT: {{.*}} = scf.for
+//      CHECK: %[[ALLOC1:.*]] = alloc({{.*}})
+//      CHECK: %[[ALLOC2:.*]] = alloc({{.*}})
+
+// -----
+
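+// Test Case: the loop-invariant allocation %2 should be hoisted out of the
+// loop.
+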
+// CHECK-LABEL: func @hoist_one_loop
+func @hoist_one_loop(
+  %lb: index,
+  %ub: index,
+  %step: index,
+  %buf: memref<2xf32>,
+  %res: memref<2xf32>) {
+  %0 = alloc() : memref<2xf32>
+  %1 = scf.for %i = %lb to %ub step %step
+    iter_args(%iterBuf = %buf) -> memref<2xf32> {
+      %2 = alloc() : memref<2xf32>
+      scf.yield %0 : memref<2xf32>
+  }
+  "linalg.copy"(%1, %res) : (memref<2xf32>, memref<2xf32>) -> ()
+  return
+}
+
+//      CHECK: %[[ALLOC0:.*]] = alloc({{.*}})
+// CHECK-NEXT: %[[ALLOC1:.*]] = alloc({{.*}})
+// CHECK-NEXT: {{.*}} = scf.for
+
+// -----
+
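+// Test Case: the allocation %1 is yielded by the loop (back-edge dependency)
+// and should therefore not be hoisted.
+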
+// CHECK-LABEL: func @no_hoist_one_loop
+func @no_hoist_one_loop(
+  %lb: index,
+  %ub: index,
+  %step: index,
+  %buf: memref<2xf32>,
+  %res: memref<2xf32>) {
+  %0 = scf.for %i = %lb to %ub step %step
+    iter_args(%iterBuf = %buf) -> memref<2xf32> {
+      %1 = alloc() : memref<2xf32>
+      scf.yield %1 : memref<2xf32>
+  }
+  "linalg.copy"(%0, %res) : (memref<2xf32>, memref<2xf32>) -> ()
+  return
+}
+
+//      CHECK: {{.*}} = scf.for
+// CHECK-NEXT: %[[ALLOC0:.*]] = alloc({{.*}})
+
+// -----
+
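+// Test Case: the loop-invariant allocation %3 should be hoisted out of both
+// nested loops.
+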
+// CHECK-LABEL: func @hoist_multiple_loop
+func @hoist_multiple_loop(
+  %lb: index,
+  %ub: index,
+  %step: index,
+  %buf: memref<2xf32>,
+  %res: memref<2xf32>) {
+  %0 = alloc() : memref<2xf32>
+  %1 = scf.for %i = %lb to %ub step %step
+    iter_args(%iterBuf = %buf) -> memref<2xf32> {
+    %2 = scf.for %i2 = %lb to %ub step %step
+      iter_args(%iterBuf2 = %iterBuf) -> memref<2xf32> {
+        %3 = alloc() : memref<2xf32>
+        scf.yield %0 : memref<2xf32>
+    }
+    scf.yield %0 : memref<2xf32>
+  }
+  "linalg.copy"(%1, %res) : (memref<2xf32>, memref<2xf32>) -> ()
+  return
+}
+
+//      CHECK: %[[ALLOC0:.*]] = alloc({{.*}})
+// CHECK-NEXT: %[[ALLOC1:.*]] = alloc({{.*}})
+// CHECK-NEXT: {{.*}} = scf.for
+
+// -----
+
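+// Test Case: the allocation %3 inside the conditional should not be hoisted
+// out of the loop.
+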
+// CHECK-LABEL: func @no_hoist_one_loop_conditional
+func @no_hoist_one_loop_conditional(
+  %lb: index,
+  %ub: index,
+  %step: index,
+  %buf: memref<2xf32>,
+  %res: memref<2xf32>) {
+  %0 = scf.for %i = %lb to %ub step %step
+    iter_args(%iterBuf = %buf) -> memref<2xf32> {
+      %1 = cmpi "eq", %i, %ub : index
+      %2 = scf.if %1 -> (memref<2xf32>) {
+        %3 = alloc() : memref<2xf32>
+        scf.yield %3 : memref<2xf32>
+      } else {
+        scf.yield %iterBuf : memref<2xf32>
+      }
+    scf.yield %2 : memref<2xf32>
+  }
+  "linalg.copy"(%0, %res) : (memref<2xf32>, memref<2xf32>) -> ()
+  return
+}
+
+//      CHECK: {{.*}} = scf.for
+//      CHECK: {{.*}} = scf.if
+// CHECK-NEXT: %[[ALLOC0:.*]] = alloc({{.*}})
+
+// -----
+
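+// Test Case: the allocation %4 should be hoisted out of the loop but remain
+// inside the surrounding scf.if.
+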
+// CHECK-LABEL: func @hoist_one_loop_conditional
+func @hoist_one_loop_conditional(
+  %lb: index,
+  %ub: index,
+  %step: index,
+  %buf: memref<2xf32>,
+  %res: memref<2xf32>) {
+  %0 = alloc() : memref<2xf32>
+  %1 = cmpi "eq", %lb, %ub : index
+  %2 = scf.if %1 -> (memref<2xf32>) {
+    %3 = scf.for %i = %lb to %ub step %step
+    iter_args(%iterBuf = %buf) -> memref<2xf32> {
+      %4 = alloc() : memref<2xf32>
+      scf.yield %0 : memref<2xf32>
+    }
+    scf.yield %0 : memref<2xf32>
+  }
+  else
+  {
+    scf.yield %0 : memref<2xf32>
+  }
+  "linalg.copy"(%2, %res) : (memref<2xf32>, memref<2xf32>) -> ()
+  return
+}
+
+//      CHECK: {{.*}} = scf.if
+// CHECK-NEXT: %[[ALLOC0:.*]] = alloc({{.*}})
+//      CHECK: {{.*}} = scf.for
+
+// -----
+
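+// Test Case: the allocation %2 depends on the induction variable %i and should
+// therefore not be hoisted out of the loop.
+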
+// CHECK-LABEL: func @no_hoist_one_loop_dependency
+func @no_hoist_one_loop_dependency(
+  %lb: index,
+  %ub: index,
+  %step: index,
+  %buf: memref<2xf32>,
+  %res: memref<2xf32>) {
+  %0 = alloc() : memref<2xf32>
+  %1 = scf.for %i = %lb to %ub step %step
+    iter_args(%iterBuf = %buf) -> memref<2xf32> {
+      %2 = alloc(%i) : memref<?xf32>
+      scf.yield %0 : memref<2xf32>
+  }
+  "linalg.copy"(%1, %res) : (memref<2xf32>, memref<2xf32>) -> ()
+  return
+}
+
+//      CHECK: %[[ALLOC0:.*]] = alloc({{.*}})
+// CHECK-NEXT: {{.*}} = scf.for
+// CHECK-NEXT: %[[ALLOC1:.*]] = alloc({{.*}})
+
+// -----
+
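+// Test Case: the allocation %3 depends on the outer induction variable %i and
+// should therefore only be hoisted out of the inner loop.
+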
+// CHECK-LABEL: func @partial_hoist_multiple_loop_dependency
+func @partial_hoist_multiple_loop_dependency(
+  %lb: index,
+  %ub: index,
+  %step: index,
+  %buf: memref<2xf32>,
+  %res: memref<2xf32>) {
+  %0 = alloc() : memref<2xf32>
+  %1 = scf.for %i = %lb to %ub step %step
+    iter_args(%iterBuf = %buf) -> memref<2xf32> {
+    %2 = scf.for %i2 = %lb to %ub step %step
+      iter_args(%iterBuf2 = %iterBuf) -> memref<2xf32> {
+        %3 = alloc(%i) : memref<?xf32>
+        scf.yield %0 : memref<2xf32>
+    }
+    scf.yield %0 : memref<2xf32>
+  }
+  "linalg.copy"(%1, %res) : (memref<2xf32>, memref<2xf32>) -> ()
+  return
+}
+
+//      CHECK: %[[ALLOC0:.*]] = alloc({{.*}})
+// CHECK-NEXT: {{.*}} = scf.for
+// CHECK-NEXT: %[[ALLOC1:.*]] = alloc({{.*}})
+// CHECK-NEXT: {{.*}} = scf.for

More information about the Mlir-commits mailing list