[Mlir-commits] [mlir] feb0b9c - [mlir] Added support for loops to BufferPlacement transformation.

Marcel Koester llvmlistbot at llvm.org
Wed Sep 9 01:55:05 PDT 2020


Author: Marcel Koester
Date: 2020-09-09T10:53:35+02:00
New Revision: feb0b9c3bba7db6d547b552c3cdaa838559da664

URL: https://github.com/llvm/llvm-project/commit/feb0b9c3bba7db6d547b552c3cdaa838559da664
DIFF: https://github.com/llvm/llvm-project/commit/feb0b9c3bba7db6d547b552c3cdaa838559da664.diff

LOG: [mlir] Added support for loops to BufferPlacement transformation.

The current BufferPlacement transformation cannot handle loops properly. Buffers
that are passed via backedges are not freed automatically, which introduces
memory leaks. This CL adds support for structured control-flow loops to
overcome these limitations.
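
For illustration, here is a minimal structured loop of the kind this change
handles (a sketch mirroring the new @loop_alloc test case below; the names are
illustrative). A fresh buffer is passed around the backedge on every
iteration, so the incoming iteration argument has to be freed inside the loop
body to avoid a leak:

  func @loop(%lb: index, %ub: index, %step: index,
             %buf: memref<2xf32>, %res: memref<2xf32>) {
    %0 = scf.for %i = %lb to %ub step %step
        iter_args(%iterBuf = %buf) -> memref<2xf32> {
      // A new buffer is yielded on each iteration; the transformation now
      // inserts a dealloc for the incoming %iterBuf inside the loop body.
      %1 = alloc() : memref<2xf32>
      scf.yield %1 : memref<2xf32>
    }
    "linalg.copy"(%0, %res) : (memref<2xf32>, memref<2xf32>) -> ()
    return
  }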

Differential Revision: https://reviews.llvm.org/D85513

Added: 
    

Modified: 
    mlir/lib/Transforms/BufferPlacement.cpp
    mlir/test/Transforms/buffer-placement.mlir

Removed: 
    


################################################################################
diff --git a/mlir/lib/Transforms/BufferPlacement.cpp b/mlir/lib/Transforms/BufferPlacement.cpp
index 0279129758ab..9f2c254f91e5 100644
--- a/mlir/lib/Transforms/BufferPlacement.cpp
+++ b/mlir/lib/Transforms/BufferPlacement.cpp
@@ -48,11 +48,10 @@
 // will be freed in the end.
 //
 // TODO:
-// The current implementation does not support loops and the resulting code will
-// be invalid with respect to program semantics. The only thing that is
-// currently missing is a high-level loop analysis that allows us to move allocs
-// and deallocs outside of the loop blocks. Furthermore, it doesn't also accept
-// functions which return buffers already.
+// The current implementation does not support explicit-control-flow loops and
+// the resulting code will be invalid with respect to program semantics.
+// However, structured control-flow loops are fully supported. Furthermore, it
+// does not yet accept functions that return buffers.
 //
 //===----------------------------------------------------------------------===//
 
@@ -77,6 +76,22 @@ static void walkReturnOperations(Region *region, const FuncT &func) {
     }
 }
 
+/// Wrapper for the actual `RegionBranchOpInterface.getSuccessorRegions`
+/// function that initializes the required `operandAttributes` array.
+static void getSuccessorRegions(RegionBranchOpInterface regionInterface,
+                                llvm::Optional<unsigned> index,
+                                SmallVectorImpl<RegionSuccessor> &successors) {
+  // Create a list of null attributes for each operand to comply with the
+  // `getSuccessorRegions` interface definition that requires a single
+  // attribute per operand.
+  SmallVector<Attribute, 2> operandAttributes(
+      regionInterface.getOperation()->getNumOperands());
+
+  // Get all successor regions using the temporarily allocated
+  // `operandAttributes`.
+  regionInterface.getSuccessorRegions(index, operandAttributes, successors);
+}
+
 namespace {
 //===----------------------------------------------------------------------===//
 // BufferPlacementAliasAnalysis
@@ -166,16 +181,10 @@ class BufferPlacementAliasAnalysis {
 
     // Query the RegionBranchOpInterface to find potential successor regions.
     op->walk([&](RegionBranchOpInterface regionInterface) {
-      // Create an empty attribute for each operand to comply with the
-      // `getSuccessorRegions` interface definition that requires a single
-      // attribute per operand.
-      SmallVector<Attribute, 2> operandAttributes(
-          regionInterface.getOperation()->getNumOperands());
-
       // Extract all entry regions and wire all initial entry successor inputs.
       SmallVector<RegionSuccessor, 2> entrySuccessors;
-      regionInterface.getSuccessorRegions(/*index=*/llvm::None,
-                                          operandAttributes, entrySuccessors);
+      getSuccessorRegions(regionInterface, /*index=*/llvm::None,
+                          entrySuccessors);
       for (RegionSuccessor &entrySuccessor : entrySuccessors) {
         // Wire the entry region's successor arguments with the initial
         // successor inputs.
@@ -191,8 +200,8 @@ class BufferPlacementAliasAnalysis {
         // Iterate over all successor region entries that are reachable from the
         // current region.
         SmallVector<RegionSuccessor, 2> successorRegions;
-        regionInterface.getSuccessorRegions(
-            region.getRegionNumber(), operandAttributes, successorRegions);
+        getSuccessorRegions(regionInterface, region.getRegionNumber(),
+                            successorRegions);
         for (RegionSuccessor &successorRegion : successorRegions) {
           // Iterate over all immediate terminator operations and wire the
           // successor inputs with the operands of each terminator.
@@ -209,6 +218,83 @@ class BufferPlacementAliasAnalysis {
   ValueMapT aliases;
 };
 
+//===----------------------------------------------------------------------===//
+// Backedges
+//===----------------------------------------------------------------------===//
+
+/// A straightforward program analysis that detects loop backedges induced by
+/// explicit control flow.
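+/// For example (illustrative): given two blocks ^bb0 and ^bb1 that branch to
+/// each other explicitly, the analysis records the single backedge
+/// (^bb1, ^bb0), i.e. the edge that closes the cycle during the traversal.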
+class Backedges {
+public:
+  using BlockSetT = SmallPtrSet<Block *, 16>;
+  using BackedgeSetT = llvm::DenseSet<std::pair<Block *, Block *>>;
+
+public:
+  /// Constructs a new backedges analysis using the op provided.
+  Backedges(Operation *op) { recurse(op, op->getBlock()); }
+
+  /// Returns the number of backedges formed by explicit control flow.
+  size_t size() const { return edgeSet.size(); }
+
+  /// Returns the start iterator to loop over all backedges.
+  BackedgeSetT::const_iterator begin() const { return edgeSet.begin(); }
+
+  /// Returns the end iterator to loop over all backedges.
+  BackedgeSetT::const_iterator end() const { return edgeSet.end(); }
+
+private:
+  /// Enters the current block and inserts a backedge into the `edgeSet` if we
+  /// have already visited the current block. The inserted edge links the given
+  /// `predecessor` with the `current` block.
+  bool enter(Block &current, Block *predecessor) {
+    bool inserted = visited.insert(&current).second;
+    if (!inserted)
+      edgeSet.insert(std::make_pair(predecessor, &current));
+    return inserted;
+  }
+
+  /// Leaves the current block.
+  void exit(Block &current) { visited.erase(&current); }
+
+  /// Recurses into the given operation while taking all attached regions into
+  /// account.
+  void recurse(Operation *op, Block *predecessor) {
+    Block *current = op->getBlock();
+    // If the current op implements the `BranchOpInterface`, its successor
+    // blocks can form cycles that we have to detect.
+    if (isa<BranchOpInterface>(op)) {
+      for (Block *succ : current->getSuccessors())
+        recurse(*succ, current);
+    }
+    // Recurse into all distinct regions and check for explicit control-flow
+    // loops.
+    for (Region &region : op->getRegions())
+      recurse(region.front(), current);
+  }
+
+  /// Recurses into explicit control-flow structures that are given by
+  /// the successor relation defined on the block level.
+  void recurse(Block &block, Block *predecessor) {
+    // Try to enter the current block. If this is not possible, we are
+    // currently processing this block and can safely return here.
+    if (!enter(block, predecessor))
+      return;
+
+    // Recurse into all operations and successor blocks.
+    for (auto &op : block.getOperations())
+      recurse(&op, predecessor);
+
+    // Leave the current block.
+    exit(block);
+  }
+
+  /// Stores all blocks that are currently visited and on the processing stack.
+  BlockSetT visited;
+
+  /// Stores all backedges in the format (source, target).
+  BackedgeSetT edgeSet;
+};
+
 //===----------------------------------------------------------------------===//
 // BufferPlacement
 //===----------------------------------------------------------------------===//
@@ -357,9 +443,14 @@ class BufferPlacement {
       for (Value value : it->second) {
         if (valuesToFree.count(value) > 0)
           continue;
-        // Check whether we have to free this particular block argument.
-        if (!dominators.dominates(definingBlock, value.getParentBlock())) {
-          toProcess.emplace_back(value, value.getParentBlock());
+        Block *parentBlock = value.getParentBlock();
+        // Check whether we have to free this particular block argument or
+        // generic value. We have to free the current alias if it is either
+        // defined in a non-dominated block or it is defined in the same block
+        // but the current value is not dominated by the source value.
+        if (!dominators.dominates(definingBlock, parentBlock) ||
+            (definingBlock == parentBlock && value.isa<BlockArgument>())) {
+          toProcess.emplace_back(value, parentBlock);
           valuesToFree.insert(value);
         } else if (visitedValues.insert(std::make_tuple(value, definingBlock))
                        .second)
@@ -431,22 +522,42 @@ class BufferPlacement {
     // argument belongs to the first block in a region and the parent operation
     // implements the RegionBranchOpInterface.
     Region *argRegion = block->getParent();
+    Operation *parentOp = argRegion->getParentOp();
     RegionBranchOpInterface regionInterface;
     if (!argRegion || &argRegion->front() != block ||
-        !(regionInterface =
-              dyn_cast<RegionBranchOpInterface>(argRegion->getParentOp())))
+        !(regionInterface = dyn_cast<RegionBranchOpInterface>(parentOp)))
       return;
 
     introduceCopiesForRegionSuccessors(
-        regionInterface, argRegion->getParentOp()->getRegions(),
+        regionInterface, argRegion->getParentOp()->getRegions(), blockArg,
         [&](RegionSuccessor &successorRegion) {
           // Find a predecessor of our argRegion.
           return successorRegion.getSuccessor() == argRegion;
-        },
-        [&](RegionSuccessor &successorRegion) {
-          // The operand index will be the argument number.
-          return blockArg.getArgNumber();
         });
+
+    // Check whether the block argument belongs to an entry region of the
+    // parent operation. In this case, we have to introduce an additional copy
+    // for the buffer that is passed to the argument.
+    SmallVector<RegionSuccessor, 2> successorRegions;
+    getSuccessorRegions(regionInterface, llvm::None, successorRegions);
+    auto *it =
+        llvm::find_if(successorRegions, [&](RegionSuccessor &successorRegion) {
+          return successorRegion.getSuccessor() == argRegion;
+        });
+    if (it == successorRegions.end())
+      return;
+
+    // Determine the actual operand to introduce a copy for and rewire the
+    // operand to point to the copy instead.
+    Value operand =
+        regionInterface.getSuccessorEntryOperands(argRegion->getRegionNumber())
+            [llvm::find(it->getSuccessorInputs(), blockArg).getIndex()];
+    Value copy = introduceBufferCopy(operand, parentOp);
+
+    auto op = llvm::find(parentOp->getOperands(), operand);
+    assert(op != parentOp->getOperands().end() &&
+           "parentOp does not contain operand");
+    parentOp->setOperand(op.getIndex(), copy);
   }
 
   /// Introduces temporary allocs in front of all associated nested-region
@@ -455,42 +566,34 @@ class BufferPlacement {
     // Get the actual result index in the scope of the parent terminator.
     Operation *operation = value.getDefiningOp();
     auto regionInterface = cast<RegionBranchOpInterface>(operation);
-    introduceCopiesForRegionSuccessors(
-        regionInterface, operation->getRegions(),
-        [&](RegionSuccessor &successorRegion) {
-          // Determine whether this region has a successor entry that leaves
-          // this region by returning to its parent operation.
-          return !successorRegion.getSuccessor();
-        },
-        [&](RegionSuccessor &successorRegion) {
-          // Find the associated success input index.
-          return llvm::find(successorRegion.getSuccessorInputs(), value)
-              .getIndex();
-        });
+    // Filter successors that return to the parent operation.
+    auto regionPredicate = [&](RegionSuccessor &successorRegion) {
+      // If the RegionSuccessor has no associated successor, it will return to
+      // its parent operation.
+      return !successorRegion.getSuccessor();
+    };
+    // Introduce a copy for all region "results" that are returned to the parent
+    // operation. This is required since the parent's result value has been
+    // considered critical. Therefore, the algorithm assumes that a copy of a
+    // previously allocated buffer is returned by the operation (like in the
+    // case of a block argument).
+    introduceCopiesForRegionSuccessors(regionInterface, operation->getRegions(),
+                                       value, regionPredicate);
   }
 
   /// Introduces buffer copies for all terminators in the given regions. The
   /// regionPredicate is applied to every successor region in order to restrict
-  /// the copies to specific regions. Thereby, the operandProvider is invoked
-  /// for each matching region successor and determines the operand index that
-  /// requires a buffer copy.
-  template <typename TPredicate, typename TOperandProvider>
-  void
-  introduceCopiesForRegionSuccessors(RegionBranchOpInterface regionInterface,
-                                     MutableArrayRef<Region> regions,
-                                     const TPredicate &regionPredicate,
-                                     const TOperandProvider &operandProvider) {
-    // Create an empty attribute for each operand to comply with the
-    // `getSuccessorRegions` interface definition that requires a single
-    // attribute per operand.
-    SmallVector<Attribute, 2> operandAttributes(
-        regionInterface.getOperation()->getNumOperands());
+  /// the copies to specific regions.
+  template <typename TPredicate>
+  void introduceCopiesForRegionSuccessors(
+      RegionBranchOpInterface regionInterface, MutableArrayRef<Region> regions,
+      Value argValue, const TPredicate &regionPredicate) {
     for (Region &region : regions) {
       // Query the regionInterface to get all successor regions of the current
       // one.
       SmallVector<RegionSuccessor, 2> successorRegions;
-      regionInterface.getSuccessorRegions(region.getRegionNumber(),
-                                          operandAttributes, successorRegions);
+      getSuccessorRegions(regionInterface, region.getRegionNumber(),
+                          successorRegions);
       // Try to find a matching region successor.
       RegionSuccessor *regionSuccessor =
           llvm::find_if(successorRegions, regionPredicate);
@@ -498,7 +601,9 @@ class BufferPlacement {
         continue;
       // Get the operand index in the context of the current successor input
       // bindings.
-      auto operandIndex = operandProvider(*regionSuccessor);
+      size_t operandIndex =
+          llvm::find(regionSuccessor->getSuccessorInputs(), argValue)
+              .getIndex();
 
       // Iterate over all immediate terminator operations to introduce
       // new buffer allocations. Thereby, the appropriate terminator operand
@@ -518,6 +623,16 @@ class BufferPlacement {
   /// its content into the newly allocated buffer. The terminator operation is
   /// used to insert the alloc and copy operations at the right places.
   Value introduceBufferCopy(Value sourceValue, Operation *terminator) {
+    // Avoid multiple copies of the same source value. This can happen in the
+    // presence of loops when a branch acts as a backedge while also having
+    // another successor that returns to its parent operation. Note that
+    // copying copied buffers can introduce memory leaks since the invariant of
+    // BufferPlacement assumes that a buffer will only be copied once into a
+    // temporary buffer. Hence, the construction of copy chains introduces
+    // additional allocations that are not tracked automatically by the
+    // algorithm.
+    if (copiedValues.contains(sourceValue))
+      return sourceValue;
     // Create a new alloc at the current location of the terminator.
     auto memRefType = sourceValue.getType().cast<MemRefType>();
     OpBuilder builder(terminator);
@@ -541,6 +656,8 @@ class BufferPlacement {
     // allocation to the new one.
     builder.create<linalg::CopyOp>(terminator->getLoc(), sourceValue, alloc);
 
+    // Remember the copy of the original source value.
+    copiedValues.insert(alloc);
     return alloc;
   }
 
@@ -652,6 +769,9 @@ class BufferPlacement {
   /// Maps allocation nodes to their associated blocks.
   AllocEntryList allocs;
 
+  /// Stores already copied allocations to avoid additional copies of copies.
+  ValueSetT copiedValues;
+
   /// The underlying liveness analysis to compute fine grained information
   /// about alloc and dealloc positions.
   Liveness liveness;
@@ -673,6 +793,14 @@ class BufferPlacement {
 struct BufferPlacementPass : BufferPlacementBase<BufferPlacementPass> {
 
   void runOnFunction() override {
+    // Ensure that only supported loops (structured control flow) are present.
+    Backedges backedges(getFunction());
+    if (backedges.size()) {
+      getFunction().emitError(
+          "Only structured control-flow loops are supported.");
+      return;
+    }
+
     // Place all required alloc, copy and dealloc nodes.
     BufferPlacement placement(getFunction());
     placement.place();

diff --git a/mlir/test/Transforms/buffer-placement.mlir b/mlir/test/Transforms/buffer-placement.mlir
index e1ed2c4309c3..dc9ff44bf483 100644
--- a/mlir/test/Transforms/buffer-placement.mlir
+++ b/mlir/test/Transforms/buffer-placement.mlir
@@ -1125,3 +1125,295 @@ func @nestedRegionControlFlowAlloca(
 //      CHECK: %[[ALLOCA:.*]] = alloca(%arg0, %arg1)
 // CHECK-NEXT: scf.yield %[[ALLOC0]]
 //      CHECK: return %[[ALLOC1]]
+
+// -----
+
+// Test Case: structured control-flow loop using a nested alloc.
+// The alloc position of %3 will not be changed, but the iteration argument
+// %iterBuf has to be freed before yielding %3 to avoid memory leaks.
+
+// CHECK-LABEL: func @loop_alloc
+func @loop_alloc(
+  %lb: index,
+  %ub: index,
+  %step: index,
+  %buf: memref<2xf32>,
+  %res: memref<2xf32>) {
+  %0 = alloc() : memref<2xf32>
+  %1 = scf.for %i = %lb to %ub step %step
+    iter_args(%iterBuf = %buf) -> memref<2xf32> {
+    %2 = cmpi "eq", %i, %ub : index
+    %3 = alloc() : memref<2xf32>
+    scf.yield %3 : memref<2xf32>
+  }
+  "linalg.copy"(%1, %res) : (memref<2xf32>, memref<2xf32>) -> ()
+  return
+}
+
+//      CHECK: %[[ALLOC0:.*]] = alloc()
+// CHECK-NEXT: dealloc %[[ALLOC0]]
+// CHECK-NEXT: %[[ALLOC1:.*]] = alloc()
+//      CHECK: linalg.copy(%arg3, %[[ALLOC1]])
+//      CHECK: %[[ALLOC2:.*]] = scf.for {{.*}} iter_args(%[[IALLOC:.*]] = %[[ALLOC1]]
+//      CHECK:    cmpi
+//      CHECK:    dealloc %[[IALLOC]]
+//      CHECK:    %[[ALLOC3:.*]] = alloc()
+//      CHECK:    %[[ALLOC4:.*]] = alloc()
+//      CHECK:    linalg.copy(%[[ALLOC3]], %[[ALLOC4]])
+//      CHECK:    dealloc %[[ALLOC3]]
+//      CHECK:    scf.yield %[[ALLOC4]]
+//      CHECK: }
+//      CHECK: linalg.copy(%[[ALLOC2]], %arg4)
+// CHECK-NEXT: dealloc %[[ALLOC2]]
+
+// -----
+
+// Test Case: structured control-flow loop with a nested if operation.
+// The loop yields buffers that have been defined outside of the loop and the
+// backedges only use the iteration arguments (or one of their aliases).
+// Therefore, we do not have to (and are not allowed to) free any buffers
+// that are passed via the backedges.
+
+// CHECK-LABEL: func @loop_nested_if_no_alloc
+func @loop_nested_if_no_alloc(
+  %lb: index,
+  %ub: index,
+  %step: index,
+  %buf: memref<2xf32>,
+  %res: memref<2xf32>) {
+  %0 = alloc() : memref<2xf32>
+  %1 = scf.for %i = %lb to %ub step %step
+    iter_args(%iterBuf = %buf) -> memref<2xf32> {
+    %2 = cmpi "eq", %i, %ub : index
+    %3 = scf.if %2 -> (memref<2xf32>) {
+      scf.yield %0 : memref<2xf32>
+    } else {
+      scf.yield %iterBuf : memref<2xf32>
+    }
+    scf.yield %3 : memref<2xf32>
+  }
+  "linalg.copy"(%1, %res) : (memref<2xf32>, memref<2xf32>) -> ()
+  return
+}
+
+//      CHECK: %[[ALLOC0:.*]] = alloc()
+// CHECK-NEXT: %[[ALLOC1:.*]] = scf.for {{.*}} iter_args(%[[IALLOC:.*]] =
+//      CHECK: %[[ALLOC2:.*]] = scf.if
+//      CHECK: scf.yield %[[ALLOC0]]
+//      CHECK: scf.yield %[[IALLOC]]
+//      CHECK: scf.yield %[[ALLOC2]]
+//      CHECK: linalg.copy(%[[ALLOC1]], %arg4)
+//      CHECK: dealloc %[[ALLOC0]]
+
+// -----
+
+// Test Case: structured control-flow loop with a nested if operation using
+// a deeply nested buffer allocation.
+// Since the innermost allocation happens in a divergent branch, we have to
+// introduce additional copies for the nested if operation. Since the loop's
+// yield operation "returns" %3, it will return a newly allocated buffer.
+// Therefore, we have to free the iteration argument %iterBuf before
+// "returning" %3.
+
+// CHECK-LABEL: func @loop_nested_if_alloc
+func @loop_nested_if_alloc(
+  %lb: index,
+  %ub: index,
+  %step: index,
+  %buf: memref<2xf32>) -> memref<2xf32> {
+  %0 = alloc() : memref<2xf32>
+  %1 = scf.for %i = %lb to %ub step %step
+    iter_args(%iterBuf = %buf) -> memref<2xf32> {
+    %2 = cmpi "eq", %i, %ub : index
+    %3 = scf.if %2 -> (memref<2xf32>) {
+      %4 = alloc() : memref<2xf32>
+      scf.yield %4 : memref<2xf32>
+    } else {
+      scf.yield %0 : memref<2xf32>
+    }
+    scf.yield %3 : memref<2xf32>
+  }
+  return %1 : memref<2xf32>
+}
+
+//      CHECK: %[[ALLOC0:.*]] = alloc()
+//      CHECK: %[[ALLOC1:.*]] = alloc()
+// CHECK-NEXT: linalg.copy(%arg3, %[[ALLOC1]])
+// CHECK-NEXT: %[[ALLOC2:.*]] = scf.for {{.*}} iter_args(%[[IALLOC:.*]] = %[[ALLOC1]]
+//      CHECK: dealloc %[[IALLOC]]
+//      CHECK: %[[ALLOC3:.*]] = scf.if
+
+//      CHECK: %[[ALLOC4:.*]] = alloc()
+// CHECK-NEXT: %[[ALLOC5:.*]] = alloc()
+// CHECK-NEXT: linalg.copy(%[[ALLOC4]], %[[ALLOC5]])
+// CHECK-NEXT: dealloc %[[ALLOC4]]
+// CHECK-NEXT: scf.yield %[[ALLOC5]]
+
+//      CHECK: %[[ALLOC6:.*]] = alloc()
+// CHECK-NEXT: linalg.copy(%[[ALLOC0]], %[[ALLOC6]])
+// CHECK-NEXT: scf.yield %[[ALLOC6]]
+
+//      CHECK: %[[ALLOC7:.*]] = alloc()
+// CHECK-NEXT: linalg.copy(%[[ALLOC3:.*]], %[[ALLOC7]])
+// CHECK-NEXT: dealloc %[[ALLOC3]]
+// CHECK-NEXT: scf.yield %[[ALLOC7]]
+
+//      CHECK: dealloc %[[ALLOC0]]
+// CHECK-NEXT: return %[[ALLOC2]]
+
+// -----
+
+// Test Case: several nested structured control-flow loops with a deeply nested
+// buffer allocation inside an if operation.
+// Same behavior as in loop_nested_if_alloc: we have to insert deallocations
+// before each yield in all loops recursively.
+
+// CHECK-LABEL: func @loop_nested_alloc
+func @loop_nested_alloc(
+  %lb: index,
+  %ub: index,
+  %step: index,
+  %buf: memref<2xf32>,
+  %res: memref<2xf32>) {
+  %0 = alloc() : memref<2xf32>
+  %1 = scf.for %i = %lb to %ub step %step
+    iter_args(%iterBuf = %buf) -> memref<2xf32> {
+    %2 = scf.for %i2 = %lb to %ub step %step
+      iter_args(%iterBuf2 = %iterBuf) -> memref<2xf32> {
+      %3 = scf.for %i3 = %lb to %ub step %step
+        iter_args(%iterBuf3 = %iterBuf2) -> memref<2xf32> {
+        %4 = alloc() : memref<2xf32>
+        %5 = cmpi "eq", %i, %ub : index
+        %6 = scf.if %5 -> (memref<2xf32>) {
+          %7 = alloc() : memref<2xf32>
+          scf.yield %7 : memref<2xf32>
+        } else {
+          scf.yield %iterBuf3 : memref<2xf32>
+        }
+        scf.yield %6 : memref<2xf32>
+      }
+      scf.yield %3 : memref<2xf32>
+    }
+    scf.yield %2 : memref<2xf32>
+  }
+  "linalg.copy"(%1, %res) : (memref<2xf32>, memref<2xf32>) -> ()
+  return
+}
+
+//      CHECK: %[[ALLOC0:.*]] = alloc()
+// CHECK-NEXT: dealloc %[[ALLOC0]]
+// CHECK-NEXT: %[[ALLOC1:.*]] = alloc()
+// CHECK-NEXT: linalg.copy(%arg3, %[[ALLOC1]])
+// CHECK-NEXT: %[[VAL_7:.*]] = scf.for {{.*}} iter_args(%[[IALLOC0:.*]] = %[[ALLOC1]])
+//      CHECK: %[[ALLOC2:.*]] = alloc()
+// CHECK-NEXT: linalg.copy(%[[IALLOC0]], %[[ALLOC2]])
+// CHECK-NEXT: dealloc %[[IALLOC0]]
+// CHECK-NEXT: %[[ALLOC3:.*]] = scf.for {{.*}} iter_args(%[[IALLOC1:.*]] = %[[ALLOC2]])
+//      CHECK: %[[ALLOC5:.*]] = alloc()
+// CHECK-NEXT: linalg.copy(%[[IALLOC1]], %[[ALLOC5]])
+// CHECK-NEXT: dealloc %[[IALLOC1]]
+
+//      CHECK: %[[ALLOC6:.*]] = scf.for {{.*}} iter_args(%[[IALLOC2:.*]] = %[[ALLOC5]])
+//      CHECK: %[[ALLOC8:.*]] = alloc()
+// CHECK-NEXT: dealloc %[[ALLOC8]]
+//      CHECK: %[[ALLOC9:.*]] = scf.if
+
+//      CHECK: %[[ALLOC11:.*]] = alloc()
+// CHECK-NEXT: %[[ALLOC12:.*]] = alloc()
+// CHECK-NEXT: linalg.copy(%[[ALLOC11]], %[[ALLOC12]])
+// CHECK-NEXT: dealloc %[[ALLOC11]]
+// CHECK-NEXT: scf.yield %[[ALLOC12]]
+
+//      CHECK: %[[ALLOC13:.*]] = alloc()
+// CHECK-NEXT: linalg.copy(%[[IALLOC2]], %[[ALLOC13]])
+// CHECK-NEXT: scf.yield %[[ALLOC13]]
+
+//      CHECK: dealloc %[[IALLOC2]]
+// CHECK-NEXT: %[[ALLOC10:.*]] = alloc()
+// CHECK-NEXT: linalg.copy(%[[ALLOC9]], %[[ALLOC10]])
+// CHECK-NEXT: dealloc %[[ALLOC9]]
+// CHECK-NEXT: scf.yield %[[ALLOC10]]
+
+//      CHECK: %[[ALLOC7:.*]] = alloc()
+// CHECK-NEXT: linalg.copy(%[[ALLOC6]], %[[ALLOC7]])
+// CHECK-NEXT: dealloc %[[ALLOC6]]
+// CHECK-NEXT: scf.yield %[[ALLOC7]]
+
+//      CHECK: %[[ALLOC4:.*]] = alloc()
+// CHECK-NEXT: linalg.copy(%[[ALLOC3]], %[[ALLOC4]])
+// CHECK-NEXT: dealloc %[[ALLOC3]]
+// CHECK-NEXT: scf.yield %[[ALLOC4]]
+
+//      CHECK: linalg.copy(%[[VAL_7]], %arg4)
+// CHECK-NEXT: dealloc %[[VAL_7]]
+
+// -----
+
+// Test Case: explicit control-flow loop with a dynamically allocated buffer.
+// The BufferPlacement transformation should fail on this explicit
+// control-flow loop since such loops are not supported.
+
+// CHECK-LABEL: func @loop_dynalloc
+func @loop_dynalloc(
+  %arg0 : i32,
+  %arg1 : i32,
+  %arg2: memref<?xf32>,
+  %arg3: memref<?xf32>) {
+  %const0 = constant 0 : i32
+  br ^loopHeader(%const0, %arg2 : i32, memref<?xf32>)
+
+^loopHeader(%i : i32, %buff : memref<?xf32>):
+  %lessThan = cmpi "slt", %i, %arg1 : i32
+  cond_br %lessThan,
+    ^loopBody(%i, %buff : i32, memref<?xf32>),
+    ^exit(%buff : memref<?xf32>)
+
+^loopBody(%val : i32, %buff2: memref<?xf32>):
+  %const1 = constant 1 : i32
+  %inc = addi %val, %const1 : i32
+  %size = std.index_cast %inc : i32 to index
+  %alloc1 = alloc(%size) : memref<?xf32>
+  br ^loopHeader(%inc, %alloc1 : i32, memref<?xf32>)
+
+^exit(%buff3 : memref<?xf32>):
+  "linalg.copy"(%buff3, %arg3) : (memref<?xf32>, memref<?xf32>) -> ()
+  return
+}
+
+// expected-error @+1 {{Only structured control-flow loops are supported}}
+
+// -----
+
+// Test Case: explicit control-flow "do-while" loop with a statically
+// allocated buffer.
+// The BufferPlacement transformation should fail on this explicit
+// control-flow loop since such loops are not supported.
+
+// CHECK-LABEL: func @do_loop_alloc
+func @do_loop_alloc(
+  %arg0 : i32,
+  %arg1 : i32,
+  %arg2: memref<2xf32>,
+  %arg3: memref<2xf32>) {
+  %const0 = constant 0 : i32
+  br ^loopBody(%const0, %arg2 : i32, memref<2xf32>)
+
+^loopBody(%val : i32, %buff2: memref<2xf32>):
+  %const1 = constant 1 : i32
+  %inc = addi %val, %const1 : i32
+  %alloc1 = alloc() : memref<2xf32>
+  br ^loopHeader(%inc, %alloc1 : i32, memref<2xf32>)
+
+^loopHeader(%i : i32, %buff : memref<2xf32>):
+  %lessThan = cmpi "slt", %i, %arg1 : i32
+  cond_br %lessThan,
+    ^loopBody(%i, %buff : i32, memref<2xf32>),
+    ^exit(%buff : memref<2xf32>)
+
+^exit(%buff3 : memref<2xf32>):
+  "linalg.copy"(%buff3, %arg3) : (memref<2xf32>, memref<2xf32>) -> ()
+  return
+}
+
+// expected-error @+1 {{Only structured control-flow loops are supported}}

