[Mlir-commits] [mlir] 0d1d363 - [MLIR] Added PromoteBuffersToStackPass to convert heap- to stack-based allocations.

Julian Gross llvmlistbot at llvm.org
Fri Oct 23 03:11:49 PDT 2020


Author: Julian Gross
Date: 2020-10-23T12:02:25+02:00
New Revision: 0d1d363c51c93614783755426cb58c819c164fab

URL: https://github.com/llvm/llvm-project/commit/0d1d363c51c93614783755426cb58c819c164fab
DIFF: https://github.com/llvm/llvm-project/commit/0d1d363c51c93614783755426cb58c819c164fab.diff

LOG: [MLIR] Added PromoteBuffersToStackPass to convert heap- to stack-based allocations.

Added optimization pass to convert heap-based allocs to stack-based allocas in
buffer placement. Added the corresponding test file.

Differential Revision: https://reviews.llvm.org/D89688
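
For reference, the new test drives the pass through mlir-opt (see the RUN line in the test file below):

    mlir-opt -promote-buffers-to-stack -split-input-file input.mlir

Programmatically, the pass can be added to a pipeline roughly as follows. This is a minimal sketch, not part of this commit: it assumes the usual nesting of function passes under FuncOp, and the 512-byte threshold is only an illustrative value (the default is 1024 bytes; see the max-alloc-size-in-bytes option in the Passes.td diff below).

    #include "mlir/IR/Function.h"        // FuncOp (header location may differ in newer trees)
    #include "mlir/Pass/PassManager.h"
    #include "mlir/Transforms/Passes.h"

    // Run the new promotion pass on every function in the module, converting
    // small, non-escaping heap allocations into stack allocations.
    static void addBufferPromotion(mlir::PassManager &pm) {
      pm.addNestedPass<mlir::FuncOp>(
          mlir::createPromoteBuffersToStackPass(/*maxAllocSizeInBytes=*/512));
    }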

Added: 
    mlir/test/Transforms/promote-buffers-to-stack.mlir

Modified: 
    mlir/include/mlir/Transforms/Passes.h
    mlir/include/mlir/Transforms/Passes.td
    mlir/lib/Transforms/BufferOptimizations.cpp

Removed: 
    


################################################################################
diff  --git a/mlir/include/mlir/Transforms/Passes.h b/mlir/include/mlir/Transforms/Passes.h
index 54d348fd3a63..b02915e5ac75 100644
--- a/mlir/include/mlir/Transforms/Passes.h
+++ b/mlir/include/mlir/Transforms/Passes.h
@@ -40,6 +40,10 @@ std::unique_ptr<Pass> createBufferHoistingPass();
 /// reallocations inside of loops.
 std::unique_ptr<Pass> createBufferLoopHoistingPass();
 
+/// Creates a pass that promotes heap-based allocations to stack-based ones.
+std::unique_ptr<Pass>
+createPromoteBuffersToStackPass(unsigned maxAllocSizeInBytes = 1024);
+
 /// Creates an instance of the Canonicalizer pass.
 std::unique_ptr<Pass> createCanonicalizerPass();
 

diff  --git a/mlir/include/mlir/Transforms/Passes.td b/mlir/include/mlir/Transforms/Passes.td
index 8174ecd2021a..f63290fb264a 100644
--- a/mlir/include/mlir/Transforms/Passes.td
+++ b/mlir/include/mlir/Transforms/Passes.td
@@ -201,6 +201,22 @@ def BufferLoopHoisting : FunctionPass<"buffer-loop-hoisting"> {
   let constructor = "mlir::createBufferLoopHoistingPass()";
 }
 
+def PromoteBuffersToStack : FunctionPass<"promote-buffers-to-stack"> {
+  let summary = "Promotes heap-based allocations to automatically managed "
+                "stack-based allocations";
+  let description = [{
+    This pass implements a simple algorithm to convert heap-based memory
+    allocations to stack-based ones. It uses a built-in heuristic to decide
+    whether it makes sense to convert an allocation.
+  }];
+  let constructor = "mlir::createPromoteBuffersToStackPass()";
+  let options = [
+    Option<"maxAllocSizeInBytes", "max-alloc-size-in-bytes", "unsigned",
+           /*default=*/"1024",
+           "Define the maximum size in bytes to promote allocations to stack.">,
+  ];
+}
+
 def Canonicalizer : Pass<"canonicalize"> {
   let summary = "Canonicalize operations";
   let description = [{

diff  --git a/mlir/lib/Transforms/BufferOptimizations.cpp b/mlir/lib/Transforms/BufferOptimizations.cpp
index 2686d35ee323..cd69a7cc9f13 100644
--- a/mlir/lib/Transforms/BufferOptimizations.cpp
+++ b/mlir/lib/Transforms/BufferOptimizations.cpp
@@ -6,9 +6,10 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file implements logic for two optimization passes. These passes try to
-// hoist alloc nodes to reduce the number of allocations and copies during
-// buffer deallocation.
+// This file implements logic for three optimization passes. The first two
+// passes try to move alloc nodes out of blocks to reduce the number of
+// allocations and copies during buffer deallocation. The third pass tries to
+// convert heap-based allocations to stack-based allocations, if possible.
 
 #include "PassDetail.h"
 #include "mlir/IR/Operation.h"
@@ -19,6 +20,63 @@
 
 using namespace mlir;
 
+/// Returns true if the given operation implements a known high-level region-
+/// based control-flow interface.
+static bool isKnownControlFlowInterface(Operation *op) {
+  return isa<LoopLikeOpInterface, RegionBranchOpInterface>(op);
+}
+
+/// Checks whether the given allocation is smaller than the maximum allowed
+/// size. The transformation is only applied to small buffers since large
+/// buffers could exceed the stack space.
+static bool isSmallAlloc(Value alloc, unsigned maximumSizeInBytes) {
+  auto type = alloc.getType().dyn_cast<ShapedType>();
+  if (!type || !type.hasStaticShape())
+    return false;
+  return type.getSizeInBits() < maximumSizeInBytes * 8;
+}
+
+/// Checks whether the given aliases leave the allocation scope.
+static bool
+leavesAllocationScope(Region *parentRegion,
+                      const BufferPlacementAliasAnalysis::ValueSetT &aliases) {
+  for (Value alias : aliases) {
+    for (auto *use : alias.getUsers()) {
+      // If at least one alias is used by a return-like operation within the
+      // parent region, the alias escapes the region and hence the associated
+      // allocation leaves the allocation scope.
+      if (use->hasTrait<OpTrait::ReturnLike>() &&
+          use->getParentRegion() == parentRegion)
+        return true;
+    }
+  }
+  return false;
+}
+
+/// Checks if an automatic allocation scope exists for the given alloc value.
+static bool
+hasAllocationScope(Value alloc,
+                   const BufferPlacementAliasAnalysis &aliasAnalysis) {
+  Region *region = alloc.getParentRegion();
+  do {
+    if (Operation *parentOp = region->getParentOp()) {
+      // Check if the operation is an automatic allocation scope and whether
+      // any alias leaves the scope. If an alias escapes the scope, the
+      // allocation cannot be converted into a stack-based allocation.
+      if (parentOp->hasTrait<OpTrait::AutomaticAllocationScope>() &&
+          !leavesAllocationScope(region, aliasAnalysis.resolve(alloc)))
+        return true;
+      // Stop the walk if the operation is a loop, to avoid promoting
+      // allocations inside loops, or if it does not implement a known
+      // control-flow interface such as RegionBranchOpInterface.
+      if (BufferPlacementTransformationBase::isLoop(parentOp) ||
+          !isKnownControlFlowInterface(parentOp))
+        break;
+    }
+  } while ((region = region->getParentRegion()));
+  return false;
+}
+
 namespace {
 
 //===----------------------------------------------------------------------===//
@@ -46,13 +104,6 @@ struct BufferAllocationHoistingStateBase {
 /// Implements the actual hoisting logic for allocation nodes.
 template <typename StateT>
 class BufferAllocationHoisting : public BufferPlacementTransformationBase {
-private:
-  /// Returns true if the given operation implements a known high-level region-
-  /// based control-flow interface.
-  static bool isKnownControlFlowInterface(Operation *op) {
-    return isa<LoopLikeOpInterface, RegionBranchOpInterface>(op);
-  }
-
 public:
   BufferAllocationHoisting(Operation *op)
       : BufferPlacementTransformationBase(op), dominators(op),
@@ -220,6 +271,44 @@ struct BufferAllocationLoopHoistingState : BufferAllocationHoistingStateBase {
   void recordMoveToParent(Block *block) { placementBlock = block; }
 };
 
+//===----------------------------------------------------------------------===//
+// BufferPlacementPromotion
+//===----------------------------------------------------------------------===//
+
+/// Promotes heap-based allocations to stack-based allocations (if possible).
+class BufferPlacementPromotion : BufferPlacementTransformationBase {
+public:
+  BufferPlacementPromotion(Operation *op)
+      : BufferPlacementTransformationBase(op) {}
+
+  /// Promote buffers to stack-based allocations.
+  void promote(unsigned maximumSize) {
+    for (BufferPlacementAllocs::AllocEntry &entry : allocs) {
+      Value alloc = std::get<0>(entry);
+      // Check several requirements to transform an AllocOp into an AllocaOp:
+      // the allocation must not exceed the given maximum size, no
+      // deallocation may be associated with this allocation entry, and a
+      // parent allocation scope must exist.
+      if (!isSmallAlloc(alloc, maximumSize) || std::get<1>(entry) ||
+          !hasAllocationScope(alloc, aliases))
+        continue;
+
+      Operation *startOperation = BufferPlacementAllocs::getStartOperation(
+          alloc, alloc.getParentBlock(), liveness);
+      // Build a new alloca that is associated with its parent
+      // `AutomaticAllocationScope` determined during the initialization phase.
+      OpBuilder builder(startOperation);
+      auto alloca = builder.create<AllocaOp>(
+          alloc.getLoc(), alloc.getType().cast<MemRefType>());
+
+      // Replace the original alloc by a newly created alloca.
+      Operation *allocOp = alloc.getDefiningOp();
+      allocOp->replaceAllUsesWith(alloca.getOperation());
+      allocOp->erase();
+    }
+  }
+};
+
 //===----------------------------------------------------------------------===//
 // BufferOptimizationPasses
 //===----------------------------------------------------------------------===//
@@ -247,6 +336,24 @@ struct BufferLoopHoistingPass : BufferLoopHoistingBase<BufferLoopHoistingPass> {
   }
 };
 
+/// The promote-buffers-to-stack pass that tries to convert AllocOps into
+/// AllocaOps.
+struct PromoteBuffersToStackPass
+    : PromoteBuffersToStackBase<PromoteBuffersToStackPass> {
+
+  PromoteBuffersToStackPass(unsigned maxAllocSizeInBytes)
+      : maximumSize(maxAllocSizeInBytes) {}
+
+  void runOnFunction() override {
+    // Convert all suitable allocation nodes into allocas.
+    BufferPlacementPromotion optimizer(getFunction());
+    optimizer.promote(maximumSize);
+  }
+
+private:
+  const unsigned maximumSize;
+};
+
 } // end anonymous namespace
 
 std::unique_ptr<Pass> mlir::createBufferHoistingPass() {
@@ -256,3 +363,8 @@ std::unique_ptr<Pass> mlir::createBufferHoistingPass() {
 std::unique_ptr<Pass> mlir::createBufferLoopHoistingPass() {
   return std::make_unique<BufferLoopHoistingPass>();
 }
+
+std::unique_ptr<Pass>
+mlir::createPromoteBuffersToStackPass(unsigned maxAllocSizeInBytes) {
+  return std::make_unique<PromoteBuffersToStackPass>(maxAllocSizeInBytes);
+}

diff  --git a/mlir/test/Transforms/promote-buffers-to-stack.mlir b/mlir/test/Transforms/promote-buffers-to-stack.mlir
new file mode 100644
index 000000000000..2f195943b35a
--- /dev/null
+++ b/mlir/test/Transforms/promote-buffers-to-stack.mlir
@@ -0,0 +1,664 @@
+// RUN: mlir-opt -promote-buffers-to-stack -split-input-file %s | FileCheck %s
+
+// This file checks the behavior of the PromoteBuffersToStack pass, which
+// converts AllocOps into AllocaOps where possible.
+
+// Test Case:
+//    bb0
+//   /   \
+//  bb1  bb2 <- Initial position of AllocOp
+//   \   /
+//    bb3
+// PromoteBuffersToStack expected behavior: It should convert %0 into an
+// AllocaOp.
+
+#map0 = affine_map<(d0) -> (d0)>
+
+// CHECK-LABEL: func @condBranch
+func @condBranch(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) {
+  cond_br %arg0, ^bb1, ^bb2
+^bb1:
+  br ^bb3(%arg1 : memref<2xf32>)
+^bb2:
+  %0 = alloc() : memref<2xf32>
+  linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]}
+    ins(%arg1: memref<2xf32>)
+   outs(%0: memref<2xf32>) {
+  ^bb0(%gen1_arg0: f32, %gen1_arg1: f32):
+    %tmp1 = exp %gen1_arg0 : f32
+    linalg.yield %tmp1 : f32
+  }
+  br ^bb3(%0 : memref<2xf32>)
+^bb3(%1: memref<2xf32>):
+  "linalg.copy"(%1, %arg2) : (memref<2xf32>, memref<2xf32>) -> ()
+  return
+}
+
+// CHECK-NEXT: cond_br {{.*}}
+//      CHECK: ^bb2
+// CHECK-NEXT: %[[ALLOCA:.*]] = alloca()
+//      CHECK: linalg.copy
+// CHECK-NEXT: return
+
+// -----
+
+// Test Case:
+//    bb0
+//   /   \
+//  bb1  bb2 <- Initial position of AllocOp
+//   \   /
+//    bb3
+// PromoteBuffersToStack expected behavior:
+// Since the alloc has dynamic type, it is not converted into an alloca.
+
+#map0 = affine_map<(d0) -> (d0)>
+
+// CHECK-LABEL: func @condBranchDynamicType
+func @condBranchDynamicType(
+  %arg0: i1,
+  %arg1: memref<?xf32>,
+  %arg2: memref<?xf32>,
+  %arg3: index) {
+  cond_br %arg0, ^bb1, ^bb2(%arg3: index)
+^bb1:
+  br ^bb3(%arg1 : memref<?xf32>)
+^bb2(%0: index):
+  %1 = alloc(%0) : memref<?xf32>
+  linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]}
+    ins(%arg1: memref<?xf32>)
+   outs(%1: memref<?xf32>) {
+  ^bb0(%gen1_arg0: f32, %gen1_arg1: f32):
+    %tmp1 = exp %gen1_arg0 : f32
+    linalg.yield %tmp1 : f32
+  }
+  br ^bb3(%1 : memref<?xf32>)
+^bb3(%2: memref<?xf32>):
+  "linalg.copy"(%2, %arg2) : (memref<?xf32>, memref<?xf32>) -> ()
+  return
+}
+
+// CHECK-NEXT: cond_br
+//      CHECK: ^bb2
+//      CHECK: ^bb2(%[[IDX:.*]]:{{.*}})
+// CHECK-NEXT: %[[ALLOC0:.*]] = alloc(%[[IDX]])
+// CHECK-NEXT: linalg.generic
+//      CHECK: br ^bb3
+// CHECK-NEXT: ^bb3(%[[ALLOC0:.*]]:{{.*}})
+//      CHECK: linalg.copy(%[[ALLOC0]],
+// CHECK-NEXT: return
+
+// -----
+
+// Test Case: Existing AllocOp with no users.
+// PromoteBuffersToStack expected behavior: It should convert it to an
+// AllocaOp.
+
+// CHECK-LABEL: func @emptyUsesValue
+func @emptyUsesValue(%arg0: memref<4xf32>) {
+  %0 = alloc() : memref<4xf32>
+  return
+}
+// CHECK-NEXT: %[[ALLOCA:.*]] = alloca()
+// CHECK-NEXT: return
+
+// -----
+
+// Test Case:
+//    bb0
+//   /   \
+//  |    bb1 <- Initial position of AllocOp
+//   \   /
+//    bb2
+// PromoteBuffersToStack expected behavior: It should convert it into an
+// AllocaOp.
+
+#map0 = affine_map<(d0) -> (d0)>
+
+// CHECK-LABEL: func @criticalEdge
+func @criticalEdge(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) {
+  cond_br %arg0, ^bb1, ^bb2(%arg1 : memref<2xf32>)
+^bb1:
+  %0 = alloc() : memref<2xf32>
+  linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]}
+    ins(%arg1: memref<2xf32>)
+   outs(%0: memref<2xf32>) {
+  ^bb0(%gen1_arg0: f32, %gen1_arg1: f32):
+    %tmp1 = exp %gen1_arg0 : f32
+    linalg.yield %tmp1 : f32
+  }
+  br ^bb2(%0 : memref<2xf32>)
+^bb2(%1: memref<2xf32>):
+  "linalg.copy"(%1, %arg2) : (memref<2xf32>, memref<2xf32>) -> ()
+  return
+}
+
+// CHECK-NEXT: cond_br {{.*}}
+//      CHECK: ^bb1
+// CHECK-NEXT: %[[ALLOCA:.*]] = alloca()
+//      CHECK: linalg.copy
+// CHECK-NEXT: return
+
+// -----
+
+// Test Case:
+//    bb0 <- Initial position of AllocOp
+//   /   \
+//  |    bb1
+//   \   /
+//    bb2
+// PromoteBuffersToStack expected behavior: It converts the alloc into an alloca.
+
+#map0 = affine_map<(d0) -> (d0)>
+
+// CHECK-LABEL: func @invCriticalEdge
+func @invCriticalEdge(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) {
+  %0 = alloc() : memref<2xf32>
+  linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]}
+    ins(%arg1: memref<2xf32>)
+   outs(%0: memref<2xf32>) {
+  ^bb0(%gen1_arg0: f32, %gen1_arg1: f32):
+    %tmp1 = exp %gen1_arg0 : f32
+    linalg.yield %tmp1 : f32
+  }
+  cond_br %arg0, ^bb1, ^bb2(%arg1 : memref<2xf32>)
+^bb1:
+  br ^bb2(%0 : memref<2xf32>)
+^bb2(%1: memref<2xf32>):
+  "linalg.copy"(%1, %arg2) : (memref<2xf32>, memref<2xf32>) -> ()
+  return
+}
+
+// CHECK-NEXT: %[[ALLOCA:.*]] = alloca()
+//      CHECK: cond_br
+//      CHECK: linalg.copy
+// CHECK-NEXT: return
+
+// -----
+
+// Test Case:
+//    bb0 <- Initial position of the first AllocOp
+//   /   \
+//  bb1  bb2
+//   \   /
+//    bb3 <- Initial position of the second AllocOp
+// PromoteBuffersToStack expected behavior: It converts the allocs into allocas.
+
+#map0 = affine_map<(d0) -> (d0)>
+
+// CHECK-LABEL: func @ifElse
+func @ifElse(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) {
+  %0 = alloc() : memref<2xf32>
+  linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]}
+    ins(%arg1: memref<2xf32>)
+   outs(%0: memref<2xf32>) {
+  ^bb0(%gen1_arg0: f32, %gen1_arg1: f32):
+    %tmp1 = exp %gen1_arg0 : f32
+    linalg.yield %tmp1 : f32
+  }
+  cond_br %arg0,
+    ^bb1(%arg1, %0 : memref<2xf32>, memref<2xf32>),
+    ^bb2(%0, %arg1 : memref<2xf32>, memref<2xf32>)
+^bb1(%1: memref<2xf32>, %2: memref<2xf32>):
+  br ^bb3(%1, %2 : memref<2xf32>, memref<2xf32>)
+^bb2(%3: memref<2xf32>, %4: memref<2xf32>):
+  br ^bb3(%3, %4 : memref<2xf32>, memref<2xf32>)
+^bb3(%5: memref<2xf32>, %6: memref<2xf32>):
+  %7 = alloc() : memref<2xf32>
+  linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]}
+    ins(%5: memref<2xf32>)
+   outs(%7: memref<2xf32>) {
+  ^bb0(%gen2_arg0: f32, %gen2_arg1: f32):
+    %tmp2 = exp %gen2_arg0 : f32
+    linalg.yield %tmp2 : f32
+  }
+  "linalg.copy"(%7, %arg2) : (memref<2xf32>, memref<2xf32>) -> ()
+  return
+}
+
+// CHECK-NEXT: %[[ALLOCA0:.*]] = alloca()
+// CHECK-NEXT: linalg.generic
+//      CHECK: %[[ALLOCA1:.*]] = alloca()
+//      CHECK: linalg.generic
+//      CHECK: linalg.copy(%[[ALLOCA1]]
+// CHECK-NEXT: return
+
+// -----
+
+// Test Case: No users for buffer in if-else CFG
+//    bb0 <- Initial position of AllocOp
+//   /   \
+//  bb1  bb2
+//   \   /
+//    bb3
+// PromoteBuffersToStack expected behavior: It converts the alloc into an alloca.
+
+#map0 = affine_map<(d0) -> (d0)>
+
+// CHECK-LABEL: func @ifElseNoUsers
+func @ifElseNoUsers(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) {
+  %0 = alloc() : memref<2xf32>
+  linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]}
+    ins(%arg1: memref<2xf32>)
+   outs(%0: memref<2xf32>) {
+  ^bb0(%gen1_arg0: f32, %gen1_arg1: f32):
+    %tmp1 = exp %gen1_arg0 : f32
+    linalg.yield %tmp1 : f32
+  }
+  cond_br %arg0,
+    ^bb1(%arg1, %0 : memref<2xf32>, memref<2xf32>),
+    ^bb2(%0, %arg1 : memref<2xf32>, memref<2xf32>)
+^bb1(%1: memref<2xf32>, %2: memref<2xf32>):
+  br ^bb3(%1, %2 : memref<2xf32>, memref<2xf32>)
+^bb2(%3: memref<2xf32>, %4: memref<2xf32>):
+  br ^bb3(%3, %4 : memref<2xf32>, memref<2xf32>)
+^bb3(%5: memref<2xf32>, %6: memref<2xf32>):
+  "linalg.copy"(%arg1, %arg2) : (memref<2xf32>, memref<2xf32>) -> ()
+  return
+}
+
+// CHECK-NEXT: %[[ALLOCA:.*]] = alloca()
+//      CHECK: return
+
+// -----
+
+// Test Case:
+//      bb0 <- Initial position of the first AllocOp
+//     /    \
+//   bb1    bb2
+//    |     /  \
+//    |   bb3  bb4
+//    \     \  /
+//     \     /
+//       bb5 <- Initial position of the second AllocOp
+// PromoteBuffersToStack expected behavior: The two allocs should be converted
+// into allocas.
+
+#map0 = affine_map<(d0) -> (d0)>
+
+// CHECK-LABEL: func @ifElseNested
+func @ifElseNested(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) {
+  %0 = alloc() : memref<2xf32>
+  linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]}
+    ins(%arg1: memref<2xf32>)
+   outs(%0: memref<2xf32>) {
+  ^bb0(%gen1_arg0: f32, %gen1_arg1: f32):
+    %tmp1 = exp %gen1_arg0 : f32
+    linalg.yield %tmp1 : f32
+  }
+  cond_br %arg0,
+    ^bb1(%arg1, %0 : memref<2xf32>, memref<2xf32>),
+    ^bb2(%0, %arg1 : memref<2xf32>, memref<2xf32>)
+^bb1(%1: memref<2xf32>, %2: memref<2xf32>):
+  br ^bb5(%1, %2 : memref<2xf32>, memref<2xf32>)
+^bb2(%3: memref<2xf32>, %4: memref<2xf32>):
+  cond_br %arg0, ^bb3(%3 : memref<2xf32>), ^bb4(%4 : memref<2xf32>)
+^bb3(%5: memref<2xf32>):
+  br ^bb5(%5, %3 : memref<2xf32>, memref<2xf32>)
+^bb4(%6: memref<2xf32>):
+  br ^bb5(%3, %6 : memref<2xf32>, memref<2xf32>)
+^bb5(%7: memref<2xf32>, %8: memref<2xf32>):
+  %9 = alloc() : memref<2xf32>
+  linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]}
+    ins(%7: memref<2xf32>)
+   outs(%9: memref<2xf32>) {
+  ^bb0(%gen2_arg0: f32, %gen2_arg1: f32):
+    %tmp2 = exp %gen2_arg0 : f32
+    linalg.yield %tmp2 : f32
+  }
+  "linalg.copy"(%9, %arg2) : (memref<2xf32>, memref<2xf32>) -> ()
+  return
+}
+
+// CHECK-NEXT: %[[ALLOCA0:.*]] = alloca()
+// CHECK-NEXT: linalg.generic
+//      CHECK: %[[ALLOCA1:.*]] = alloca()
+//      CHECK: linalg.generic
+//      CHECK: linalg.copy(%[[ALLOCA1]]
+// CHECK-NEXT: return
+
+// -----
+
+// Test Case: Dead operations in a single block.
+// PromoteBuffersToStack expected behavior: It converts the two AllocOps into
+// allocas.
+
+#map0 = affine_map<(d0) -> (d0)>
+
+// CHECK-LABEL: func @redundantOperations
+func @redundantOperations(%arg0: memref<2xf32>) {
+  %0 = alloc() : memref<2xf32>
+  linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]}
+    ins(%arg0: memref<2xf32>)
+   outs(%0: memref<2xf32>) {
+  ^bb0(%gen1_arg0: f32, %gen1_arg1: f32):
+    %tmp1 = exp %gen1_arg0 : f32
+    linalg.yield %tmp1 : f32
+  }
+  %1 = alloc() : memref<2xf32>
+  linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]}
+    ins(%0: memref<2xf32>)
+   outs(%1: memref<2xf32>) {
+  ^bb0(%gen2_arg0: f32, %gen2_arg1: f32):
+    %tmp2 = exp %gen2_arg0 : f32
+    linalg.yield %tmp2 : f32
+  }
+  return
+}
+
+//      CHECK: (%[[ARG0:.*]]: {{.*}})
+// CHECK-NEXT: %[[ALLOCA0:.*]] = alloca()
+// CHECK-NEXT: linalg.generic {{{.*}}} ins(%[[ARG0]]{{.*}} outs(%[[ALLOCA0]]
+//      CHECK: %[[ALLOCA1:.*]] = alloca()
+// CHECK-NEXT: linalg.generic {{{.*}}} ins(%[[ALLOCA0]]{{.*}} outs(%[[ALLOCA1]]
+//      CHECK: return
+
+// -----
+
+// Test Case:
+//                                     bb0
+//                                    /   \
+// Initial pos of the 1st AllocOp -> bb1  bb2 <- Initial pos of the 2nd AllocOp
+//                                    \   /
+//                                     bb3
+// PromoteBuffersToStack expected behavior: Both AllocOps are converted into
+// allocas.
+
+#map0 = affine_map<(d0) -> (d0)>
+
+// CHECK-LABEL: func @moving_alloc_and_inserting_missing_dealloc
+func @moving_alloc_and_inserting_missing_dealloc(
+  %cond: i1,
+    %arg0: memref<2xf32>,
+    %arg1: memref<2xf32>) {
+  cond_br %cond, ^bb1, ^bb2
+^bb1:
+  %0 = alloc() : memref<2xf32>
+  linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]}
+    ins(%arg0: memref<2xf32>)
+   outs(%0: memref<2xf32>) {
+  ^bb0(%gen1_arg0: f32, %gen1_arg1: f32):
+    %tmp1 = exp %gen1_arg0 : f32
+    linalg.yield %tmp1 : f32
+  }
+  br ^exit(%0 : memref<2xf32>)
+^bb2:
+  %1 = alloc() : memref<2xf32>
+  linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]}
+    ins(%arg0: memref<2xf32>)
+   outs(%1: memref<2xf32>) {
+  ^bb0(%gen2_arg0: f32, %gen2_arg1: f32):
+    %tmp2 = exp %gen2_arg0 : f32
+    linalg.yield %tmp2 : f32
+  }
+  br ^exit(%1 : memref<2xf32>)
+^exit(%arg2: memref<2xf32>):
+  "linalg.copy"(%arg2, %arg1) : (memref<2xf32>, memref<2xf32>) -> ()
+  return
+}
+
+// CHECK-NEXT: cond_br {{.*}}
+//      CHECK: ^bb1
+// CHECK-NEXT: %{{.*}} = alloca()
+//      CHECK: ^bb2
+// CHECK-NEXT: %{{.*}} = alloca()
+//      CHECK: linalg.copy
+// CHECK-NEXT: return
+
+// -----
+
+// Test Case: Nested regions - This test defines a GenericOp inside the region
+// of another GenericOp.
+// PromoteBuffersToStack expected behavior: The AllocOps are converted into
+// allocas.
+
+#map0 = affine_map<(d0) -> (d0)>
+
+// CHECK-LABEL: func @nested_regions_and_cond_branch
+func @nested_regions_and_cond_branch(
+  %arg0: i1,
+  %arg1: memref<2xf32>,
+  %arg2: memref<2xf32>) {
+  cond_br %arg0, ^bb1, ^bb2
+^bb1:
+  br ^bb3(%arg1 : memref<2xf32>)
+^bb2:
+  %0 = alloc() : memref<2xf32>
+  linalg.generic {
+    indexing_maps = [#map0, #map0],
+    iterator_types = ["parallel"]}
+    ins(%arg1: memref<2xf32>)
+   outs(%0: memref<2xf32>) {
+  ^bb0(%gen1_arg0: f32, %gen1_arg1: f32):
+    %1 = alloc() : memref<2xf32>
+    linalg.generic {
+      indexing_maps = [#map0, #map0],
+      iterator_types = ["parallel"]}
+      ins(%arg1: memref<2xf32>)
+    outs(%1: memref<2xf32>) {
+    ^bb0(%gen2_arg0: f32, %gen2_arg1: f32):
+      %tmp2 = exp %gen2_arg0 : f32
+      linalg.yield %tmp2 : f32
+    }
+    %tmp1 = exp %gen1_arg0 : f32
+    linalg.yield %tmp1 : f32
+  }
+  br ^bb3(%0 : memref<2xf32>)
+^bb3(%1: memref<2xf32>):
+  "linalg.copy"(%1, %arg2) : (memref<2xf32>, memref<2xf32>) -> ()
+  return
+}
+
+// CHECK-NEXT:   cond_br {{.*}}
+//      CHECK:   ^bb2
+// CHECK-NEXT:   %[[ALLOCA0:.*]] = alloca()
+//      CHECK:   ^bb0
+// CHECK-NEXT:   %[[ALLOCA1:.*]] = alloc()
+
+// -----
+
+// Test Case: buffer allocation escaping via the function results
+// PromoteBuffersToStack expected behavior: The first alloc is returned, so
+// it must not be converted. The second alloc is converted, since it only
+// remains within the scope of the function.
+
+#map0 = affine_map<(d0) -> (d0)>
+
+// CHECK-LABEL: func @memref_in_function_results
+func @memref_in_function_results(
+  %arg0: memref<5xf32>,
+  %arg1: memref<10xf32>,
+  %arg2: memref<5xf32>) -> (memref<10xf32>, memref<15xf32>) {
+  %x = alloc() : memref<15xf32>
+  %y = alloc() : memref<5xf32>
+  linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]}
+    ins(%arg0: memref<5xf32>)
+   outs(%y: memref<5xf32>) {
+  ^bb0(%arg3: f32, %arg4: f32):
+    %2 = exp %arg3 : f32
+    linalg.yield %2 : f32
+  }
+  linalg.copy(%y, %arg2) : memref<5xf32>, memref<5xf32>
+  return %arg1, %x : memref<10xf32>, memref<15xf32>
+}
+//      CHECK: (%[[ARG0:.*]]: memref<5xf32>, %[[ARG1:.*]]: memref<10xf32>,
+// CHECK-SAME: %[[RESULT:.*]]: memref<5xf32>)
+//      CHECK: %[[ALLOC:.*]] = alloc()
+//      CHECK: %[[ALLOCA:.*]] = alloca()
+//      CHECK: linalg.copy
+//      CHECK: return %[[ARG1]], %[[ALLOC]]
+
+// -----
+
+// Test Case: nested region control flow
+// The allocation in the nested if branch cannot be converted to an alloca
+// due to its dynamic memory allocation behavior.
+
+// CHECK-LABEL: func @nested_region_control_flow
+func @nested_region_control_flow(
+  %arg0 : index,
+  %arg1 : index) -> memref<?x?xf32> {
+  %0 = cmpi "eq", %arg0, %arg1 : index
+  %1 = alloc(%arg0, %arg0) : memref<?x?xf32>
+  %2 = scf.if %0 -> (memref<?x?xf32>) {
+    scf.yield %1 : memref<?x?xf32>
+  } else {
+    %3 = alloc(%arg0, %arg1) : memref<?x?xf32>
+    scf.yield %1 : memref<?x?xf32>
+  }
+  return %2 : memref<?x?xf32>
+}
+
+//      CHECK: %[[ALLOC0:.*]] = alloc(%arg0, %arg0)
+// CHECK-NEXT: %[[ALLOC1:.*]] = scf.if
+//      CHECK: scf.yield %[[ALLOC0]]
+//      CHECK: %[[ALLOC2:.*]] = alloc(%arg0, %arg1)
+// CHECK-NEXT: scf.yield %[[ALLOC0]]
+//      CHECK: return %[[ALLOC1]]
+
+// -----
+
+// Test Case: nested region control flow within a region interface.
+// The alloc %0 must not be converted in this case since the allocation
+// eventually escapes the function via the returned value.
+
+// CHECK-LABEL: func @inner_region_control_flow
+func @inner_region_control_flow(%arg0 : index) -> memref<2x2xf32> {
+  %0 = alloc() : memref<2x2xf32>
+  %1 = test.region_if %0 : memref<2x2xf32> -> (memref<2x2xf32>) then {
+    ^bb0(%arg1 : memref<2x2xf32>):
+      test.region_if_yield %arg1 : memref<2x2xf32>
+  } else {
+    ^bb0(%arg1 : memref<2x2xf32>):
+      test.region_if_yield %arg1 : memref<2x2xf32>
+  } join {
+    ^bb0(%arg1 : memref<2x2xf32>):
+      test.region_if_yield %arg1 : memref<2x2xf32>
+  }
+  return %1 : memref<2x2xf32>
+}
+
+//      CHECK: %[[ALLOC0:.*]] = alloc()
+// CHECK-NEXT: %[[ALLOC1:.*]] = test.region_if
+// CHECK-NEXT: ^bb0(%[[ALLOC2:.*]]:{{.*}}):
+// CHECK-NEXT: test.region_if_yield %[[ALLOC2]]
+//      CHECK: ^bb0(%[[ALLOC3:.*]]:{{.*}}):
+// CHECK-NEXT: test.region_if_yield %[[ALLOC3]]
+//      CHECK: ^bb0(%[[ALLOC4:.*]]:{{.*}}):
+// CHECK-NEXT: test.region_if_yield %[[ALLOC4]]
+//      CHECK: return %[[ALLOC1]]
+
+// -----
+
+// Test Case: structured control-flow loop using a nested alloc.
+// Alloc %0 will be converted to an alloca. %3 is not transformed.
+
+// CHECK-LABEL: func @loop_alloc
+func @loop_alloc(
+  %lb: index,
+  %ub: index,
+  %step: index,
+  %buf: memref<2xf32>,
+  %res: memref<2xf32>) {
+  %0 = alloc() : memref<2xf32>
+  %1 = scf.for %i = %lb to %ub step %step
+    iter_args(%iterBuf = %buf) -> memref<2xf32> {
+    %2 = cmpi "eq", %i, %ub : index
+    %3 = alloc() : memref<2xf32>
+    scf.yield %3 : memref<2xf32>
+  }
+  "linalg.copy"(%1, %res) : (memref<2xf32>, memref<2xf32>) -> ()
+  return
+}
+
+// CHECK-NEXT: %[[ALLOCA:.*]] = alloca()
+// CHECK-NEXT: scf.for
+//      CHECK: %[[ALLOC:.*]] = alloc()
+
+// -----
+
+// Test Case: structured control-flow loop with a nested if operation.
+// The loop yields buffers that have been defined outside of the loop and the
+// back edges only use the iteration arguments (or their aliases).
+// Therefore, no buffer that is passed via a back edge has to be freed.
+// The alloc is converted to an AllocaOp.
+
+// CHECK-LABEL: func @loop_nested_if_no_alloc
+func @loop_nested_if_no_alloc(
+  %lb: index,
+  %ub: index,
+  %step: index,
+  %buf: memref<2xf32>,
+  %res: memref<2xf32>) {
+  %0 = alloc() : memref<2xf32>
+  %1 = scf.for %i = %lb to %ub step %step
+    iter_args(%iterBuf = %buf) -> memref<2xf32> {
+    %2 = cmpi "eq", %i, %ub : index
+    %3 = scf.if %2 -> (memref<2xf32>) {
+      scf.yield %0 : memref<2xf32>
+    } else {
+      scf.yield %iterBuf : memref<2xf32>
+    }
+    scf.yield %3 : memref<2xf32>
+  }
+  "linalg.copy"(%1, %res) : (memref<2xf32>, memref<2xf32>) -> ()
+  return
+}
+
+//      CHECK: %[[ALLOCA0:.*]] = alloca()
+// CHECK-NEXT: %[[ALLOCA1:.*]] = scf.for {{.*}} iter_args(%[[IALLOCA:.*]] =
+//      CHECK: %[[ALLOCA2:.*]] = scf.if
+//      CHECK: scf.yield %[[ALLOCA0]]
+//      CHECK: scf.yield %[[IALLOCA]]
+//      CHECK: scf.yield %[[ALLOCA2]]
+//      CHECK: linalg.copy(%[[ALLOCA1]], %arg4)
+
+// -----
+
+// Test Case: structured control-flow loop with a nested if operation using
+// a deeply nested buffer allocation.
+// The allocs are not converted in this case.
+
+// CHECK-LABEL: func @loop_nested_if_alloc
+func @loop_nested_if_alloc(
+  %lb: index,
+  %ub: index,
+  %step: index,
+  %buf: memref<2xf32>) -> memref<2xf32> {
+  %0 = alloc() : memref<2xf32>
+  %1 = scf.for %i = %lb to %ub step %step
+    iter_args(%iterBuf = %buf) -> memref<2xf32> {
+    %2 = cmpi "eq", %i, %ub : index
+    %3 = scf.if %2 -> (memref<2xf32>) {
+      %4 = alloc() : memref<2xf32>
+      scf.yield %4 : memref<2xf32>
+    } else {
+      scf.yield %0 : memref<2xf32>
+    }
+    scf.yield %3 : memref<2xf32>
+  }
+  return %1 : memref<2xf32>
+}
+
+//      CHECK: %[[ALLOC0:.*]] = alloc()
+// CHECK-NEXT: %[[ALLOC1:.*]] = scf.for {{.*}}
+//      CHECK: %[[ALLOC2:.*]] = scf.if
+//      CHECK: %[[ALLOC3:.*]] = alloc()
+// CHECK-NEXT: scf.yield %[[ALLOC3]]
+//      CHECK: scf.yield %[[ALLOC0]]
+//      CHECK: scf.yield %[[ALLOC2]]
+//      CHECK: return %[[ALLOC1]]
+
+// -----
+
+// Test Case: The allocated buffer is too large and, hence, it is not
+// converted. The default maximum allocation size is 1024 bytes (1 KB).
+
+// CHECK-LABEL: func @large_buffer_allocation
+func @large_buffer_allocation(%arg0: memref<2048xf32>) {
+  %0 = alloc() : memref<2048xf32>
+  "linalg.copy"(%0, %arg0) : (memref<2048xf32>, memref<2048xf32>) -> ()
+  return
+}
+
+// CHECK-NEXT: %[[ALLOC:.*]] = alloc()
+// CHECK-NEXT: linalg.copy
