[Mlir-commits] [mlir] 465b9a4 - Revert "Revert "[mlir] Introduce CloneOp and adapt test cases in BufferDeallocation.""
Alexander Belyaev
llvmlistbot at llvm.org
Wed Mar 31 00:55:21 PDT 2021
Author: Alexander Belyaev
Date: 2021-03-31T09:49:09+02:00
New Revision: 465b9a4a3303727df1584ca52bdced964a34efe9
URL: https://github.com/llvm/llvm-project/commit/465b9a4a3303727df1584ca52bdced964a34efe9
DIFF: https://github.com/llvm/llvm-project/commit/465b9a4a3303727df1584ca52bdced964a34efe9.diff
LOG: Revert "Revert "[mlir] Introduce CloneOp and adapt test cases in BufferDeallocation.""
This reverts commit 883912abe669ef246ada0adc9cf1c9748b742400.
Added:
mlir/include/mlir/Dialect/MemRef/Utils/MemRefUtils.h
mlir/lib/Dialect/MemRef/Utils/MemRefUtils.cpp
Modified:
mlir/docs/BufferDeallocationInternals.md
mlir/include/mlir/Dialect/MemRef/IR/MemRef.h
mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td
mlir/include/mlir/Transforms/BufferUtils.h
mlir/include/mlir/Transforms/Passes.h
mlir/include/mlir/Transforms/Passes.td
mlir/lib/Dialect/MemRef/CMakeLists.txt
mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp
mlir/lib/Transforms/BufferDeallocation.cpp
mlir/lib/Transforms/BufferUtils.cpp
mlir/lib/Transforms/CMakeLists.txt
mlir/test/Transforms/buffer-deallocation.mlir
mlir/test/Transforms/canonicalize.mlir
Removed:
mlir/lib/Transforms/CopyRemoval.cpp
mlir/test/Transforms/copy-removal.mlir
################################################################################
diff --git a/mlir/docs/BufferDeallocationInternals.md b/mlir/docs/BufferDeallocationInternals.md
index dee37493512d8..7c731066d31e8 100644
--- a/mlir/docs/BufferDeallocationInternals.md
+++ b/mlir/docs/BufferDeallocationInternals.md
@@ -48,7 +48,7 @@ func @condBranch(%arg0: i1, %arg1: memref<2xf32>) {
partial_write(%0, %0)
br ^bb3()
^bb3():
- "linalg.copy"(%0, %arg1) : (memref<2xf32>, memref<2xf32>) -> ()
+ test.copy(%0, %arg1) : (memref<2xf32>, memref<2xf32>) -> ()
return
}
```
@@ -133,11 +133,11 @@ func @condBranch(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) {
^bb1:
br ^bb3(%arg1 : memref<2xf32>)
^bb2:
- %0 = alloc() : memref<2xf32> // aliases: %1
+ %0 = memref.alloc() : memref<2xf32> // aliases: %1
use(%0)
br ^bb3(%0 : memref<2xf32>)
^bb3(%1: memref<2xf32>): // %1 could be %0 or %arg1
- "linalg.copy"(%1, %arg2) : (memref<2xf32>, memref<2xf32>) -> ()
+ test.copy(%1, %arg2) : (memref<2xf32>, memref<2xf32>) -> ()
return
}
```
@@ -149,7 +149,7 @@ of code:
```mlir
func @condBranch(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) {
- %0 = alloc() : memref<2xf32> // moved to bb0
+ %0 = memref.alloc() : memref<2xf32> // moved to bb0
cond_br %arg0, ^bb1, ^bb2
^bb1:
br ^bb3(%arg1 : memref<2xf32>)
@@ -157,7 +157,7 @@ func @condBranch(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) {
use(%0)
br ^bb3(%0 : memref<2xf32>)
^bb3(%1: memref<2xf32>):
- "linalg.copy"(%1, %arg2) : (memref<2xf32>, memref<2xf32>) -> ()
+ test.copy(%1, %arg2) : (memref<2xf32>, memref<2xf32>) -> ()
return
}
```
@@ -179,17 +179,17 @@ func @condBranchDynamicType(
^bb1:
br ^bb3(%arg1 : memref<?xf32>)
^bb2(%0: index):
- %1 = alloc(%0) : memref<?xf32> // cannot be moved upwards to the data
+ %1 = memref.alloc(%0) : memref<?xf32> // cannot be moved upwards to the data
// dependency to %0
use(%1)
br ^bb3(%1 : memref<?xf32>)
^bb3(%2: memref<?xf32>):
- "linalg.copy"(%2, %arg2) : (memref<?xf32>, memref<?xf32>) -> ()
+ test.copy(%2, %arg2) : (memref<?xf32>, memref<?xf32>) -> ()
return
}
```
-## Introduction of Copies
+## Introduction of Clones
In order to guarantee that all allocated buffers are freed properly, we have to
pay attention to the control flow and all potential aliases a buffer allocation
@@ -200,10 +200,10 @@ allocations have already been placed:
```mlir
func @branch(%arg0: i1) {
- %0 = alloc() : memref<2xf32> // aliases: %2
+ %0 = memref.alloc() : memref<2xf32> // aliases: %2
cond_br %arg0, ^bb1, ^bb2
^bb1:
- %1 = alloc() : memref<2xf32> // resides here for demonstration purposes
+ %1 = memref.alloc() : memref<2xf32> // resides here for demonstration purposes
// aliases: %2
br ^bb3(%1 : memref<2xf32>)
^bb2:
@@ -232,88 +232,31 @@ result:
```mlir
func @branch(%arg0: i1) {
- %0 = alloc() : memref<2xf32>
+ %0 = memref.alloc() : memref<2xf32>
cond_br %arg0, ^bb1, ^bb2
^bb1:
- %1 = alloc() : memref<2xf32>
- %3 = alloc() : memref<2xf32> // temp copy for %1
- "linalg.copy"(%1, %3) : (memref<2xf32>, memref<2xf32>) -> ()
- dealloc %1 : memref<2xf32> // %1 can be safely freed here
+ %1 = memref.alloc() : memref<2xf32>
+    %3 = memref.clone %1 : memref<2xf32> to memref<2xf32>
+ memref.dealloc %1 : memref<2xf32> // %1 can be safely freed here
br ^bb3(%3 : memref<2xf32>)
^bb2:
use(%0)
- %4 = alloc() : memref<2xf32> // temp copy for %0
- "linalg.copy"(%0, %4) : (memref<2xf32>, memref<2xf32>) -> ()
+    %4 = memref.clone %0 : memref<2xf32> to memref<2xf32>
br ^bb3(%4 : memref<2xf32>)
^bb3(%2: memref<2xf32>):
…
- dealloc %2 : memref<2xf32> // free temp buffer %2
- dealloc %0 : memref<2xf32> // %0 can be safely freed here
+ memref.dealloc %2 : memref<2xf32> // free temp buffer %2
+ memref.dealloc %0 : memref<2xf32> // %0 can be safely freed here
return
}
```
Note that a temporary buffer for %2 was introduced to free all allocations
properly. Note further that the unnecessary allocation of %3 can be easily
-removed using one of the post-pass transformations.
-
-Reconsider the previously introduced sample demonstrating dynamically shaped
-types:
-
-```mlir
-func @condBranchDynamicType(
- %arg0: i1,
- %arg1: memref<?xf32>,
- %arg2: memref<?xf32>,
- %arg3: index) {
- cond_br %arg0, ^bb1, ^bb2(%arg3: index)
-^bb1:
- br ^bb3(%arg1 : memref<?xf32>)
-^bb2(%0: index):
- %1 = alloc(%0) : memref<?xf32> // aliases: %2
- use(%1)
- br ^bb3(%1 : memref<?xf32>)
-^bb3(%2: memref<?xf32>):
- "linalg.copy"(%2, %arg2) : (memref<?xf32>, memref<?xf32>) -> ()
- return
-}
-```
+removed using one of the post-pass transformations or the canonicalization
+pass.
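A minimal sketch of this cleanup on the example above: the clone of %1 and the
dealloc of %1 reside in the same block, so the canonicalizer can replace the
clone with its source.

```mlir
// Before canonicalization (^bb1 of the example above):
%1 = memref.alloc() : memref<2xf32>
%3 = memref.clone %1 : memref<2xf32> to memref<2xf32>
memref.dealloc %1 : memref<2xf32>
br ^bb3(%3 : memref<2xf32>)

// After canonicalization: %3 is replaced by %1 and the dealloc is erased.
%1 = memref.alloc() : memref<2xf32>
br ^bb3(%1 : memref<2xf32>)
```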
-In the presence of DSTs, we have to parameterize the allocations with
-additional dimension information of the source buffers, we want to copy from.
-BufferDeallocation automatically introduces all required operations to extract
-dimension specifications and wires them with the associated allocations:
-
-```mlir
-func @condBranchDynamicType(
- %arg0: i1,
- %arg1: memref<?xf32>,
- %arg2: memref<?xf32>,
- %arg3: index) {
- cond_br %arg0, ^bb1, ^bb2(%arg3 : index)
-^bb1:
- %c0 = constant 0 : index
- %0 = dim %arg1, %c0 : memref<?xf32> // dimension operation to parameterize
- // the following temp allocation
- %1 = alloc(%0) : memref<?xf32>
- "linalg.copy"(%arg1, %1) : (memref<?xf32>, memref<?xf32>) -> ()
- br ^bb3(%1 : memref<?xf32>)
-^bb2(%2: index):
- %3 = alloc(%2) : memref<?xf32>
- use(%3)
- %c0_0 = constant 0 : index
- %4 = dim %3, %c0_0 : memref<?xf32> // dimension operation to parameterize
- // the following temp allocation
- %5 = alloc(%4) : memref<?xf32>
- "linalg.copy"(%3, %5) : (memref<?xf32>, memref<?xf32>) -> ()
- dealloc %3 : memref<?xf32> // %3 can be safely freed here
- br ^bb3(%5 : memref<?xf32>)
-^bb3(%6: memref<?xf32>):
- "linalg.copy"(%6, %arg2) : (memref<?xf32>, memref<?xf32>) -> ()
- dealloc %6 : memref<?xf32> // %6 can be safely freed here
- return
-}
-```
+The presented example also works with dynamically shaped types.
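Since the clone operation allocates its result buffer itself, no dim
operations are needed to parameterize a temporary allocation. A minimal
sketch for a dynamically shaped buffer:

```mlir
%1 = memref.clone %arg1 : memref<?xf32> to memref<?xf32>
```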
BufferDeallocation performs a fix-point iteration taking all aliases of all
tracked allocations into account. We initialize the general iteration process
@@ -335,7 +278,7 @@ func @condBranchDynamicTypeNested(
^bb1:
br ^bb6(%arg1 : memref<?xf32>)
^bb2(%0: index):
- %1 = alloc(%0) : memref<?xf32> // cannot be moved upwards due to the data
+ %1 = memref.alloc(%0) : memref<?xf32> // cannot be moved upwards due to the data
// dependency to %0
// aliases: %2, %3, %4
use(%1)
@@ -349,7 +292,7 @@ func @condBranchDynamicTypeNested(
^bb6(%3: memref<?xf32>): // crit. alias of %arg1 and %2 (in other words %1)
br ^bb7(%3 : memref<?xf32>)
^bb7(%4: memref<?xf32>): // non-crit. alias of %3, since %3 dominates %4
- "linalg.copy"(%4, %arg2) : (memref<?xf32>, memref<?xf32>) -> ()
+ test.copy(%4, %arg2) : (memref<?xf32>, memref<?xf32>) -> ()
return
}
```
@@ -366,13 +309,11 @@ func @condBranchDynamicTypeNested(
%arg3: index) {
cond_br %arg0, ^bb1, ^bb2(%arg3 : index)
^bb1:
- %c0 = constant 0 : index
- %d0 = dim %arg1, %c0 : memref<?xf32>
- %5 = alloc(%d0) : memref<?xf32> // temp buffer required due to alias %3
- "linalg.copy"(%arg1, %5) : (memref<?xf32>, memref<?xf32>) -> ()
+ // temp buffer required due to alias %3
+  %5 = memref.clone %arg1 : memref<?xf32> to memref<?xf32>
br ^bb6(%5 : memref<?xf32>)
^bb2(%0: index):
- %1 = alloc(%0) : memref<?xf32>
+ %1 = memref.alloc(%0) : memref<?xf32>
use(%1)
cond_br %arg0, ^bb3, ^bb4
^bb3:
@@ -380,17 +321,14 @@ func @condBranchDynamicTypeNested(
^bb4:
br ^bb5(%1 : memref<?xf32>)
^bb5(%2: memref<?xf32>):
- %c0_0 = constant 0 : index
- %d1 = dim %2, %c0_0 : memref<?xf32>
- %6 = alloc(%d1) : memref<?xf32> // temp buffer required due to alias %3
- "linalg.copy"(%1, %6) : (memref<?xf32>, memref<?xf32>) -> ()
- dealloc %1 : memref<?xf32>
+  %6 = memref.clone %1 : memref<?xf32> to memref<?xf32>
+ memref.dealloc %1 : memref<?xf32>
br ^bb6(%6 : memref<?xf32>)
^bb6(%3: memref<?xf32>):
br ^bb7(%3 : memref<?xf32>)
^bb7(%4: memref<?xf32>):
- "linalg.copy"(%4, %arg2) : (memref<?xf32>, memref<?xf32>) -> ()
- dealloc %3 : memref<?xf32> // free %3, since %4 is a non-crit. alias of %3
+ test.copy(%4, %arg2) : (memref<?xf32>, memref<?xf32>) -> ()
+ memref.dealloc %3 : memref<?xf32> // free %3, since %4 is a non-crit. alias of %3
return
}
```
@@ -399,7 +337,7 @@ Since %3 is a critical alias, BufferDeallocation introduces an additional
temporary copy in all predecessor blocks. %3 has an additional (non-critical)
alias %4 that extends the live range until the end of bb7. Therefore, we can
free %3 after its last use, while taking all aliases into account. Note that %4
- does not need to be freed, since we did not introduce a copy for it.
+does not need to be freed, since we did not introduce a clone for it.
The actual introduction of buffer copies is done after the fix-point iteration
has been terminated and all critical aliases have been detected. A critical
@@ -445,7 +383,7 @@ infer the high-level control flow:
func @inner_region_control_flow(
%arg0 : index,
%arg1 : index) -> memref<?x?xf32> {
- %0 = alloc(%arg0, %arg0) : memref<?x?xf32>
+ %0 = memref.alloc(%arg0, %arg0) : memref<?x?xf32>
%1 = custom.region_if %0 : memref<?x?xf32> -> (memref<?x?xf32>)
then(%arg2 : memref<?x?xf32>) { // aliases: %arg4, %1
custom.region_if_yield %arg2 : memref<?x?xf32>
@@ -468,11 +406,11 @@ operation to determine the value of %2 at runtime which creates an alias:
```mlir
func @nested_region_control_flow(%arg0 : index, %arg1 : index) -> memref<?x?xf32> {
%0 = cmpi "eq", %arg0, %arg1 : index
- %1 = alloc(%arg0, %arg0) : memref<?x?xf32>
+ %1 = memref.alloc(%arg0, %arg0) : memref<?x?xf32>
%2 = scf.if %0 -> (memref<?x?xf32>) {
scf.yield %1 : memref<?x?xf32> // %2 will be an alias of %1
} else {
- %3 = alloc(%arg0, %arg1) : memref<?x?xf32> // nested allocation in a div.
+ %3 = memref.alloc(%arg0, %arg1) : memref<?x?xf32> // nested allocation in a div.
// branch
use(%3)
scf.yield %1 : memref<?x?xf32> // %2 will be an alias of %1
@@ -489,13 +427,13 @@ alias of %1 which does not need to be tracked.
```mlir
func @nested_region_control_flow(%arg0: index, %arg1: index) -> memref<?x?xf32> {
%0 = cmpi "eq", %arg0, %arg1 : index
- %1 = alloc(%arg0, %arg0) : memref<?x?xf32>
+ %1 = memref.alloc(%arg0, %arg0) : memref<?x?xf32>
%2 = scf.if %0 -> (memref<?x?xf32>) {
scf.yield %1 : memref<?x?xf32>
} else {
- %3 = alloc(%arg0, %arg1) : memref<?x?xf32>
+ %3 = memref.alloc(%arg0, %arg1) : memref<?x?xf32>
use(%3)
- dealloc %3 : memref<?x?xf32> // %3 can be safely freed here
+ memref.dealloc %3 : memref<?x?xf32> // %3 can be safely freed here
scf.yield %1 : memref<?x?xf32>
}
return %2 : memref<?x?xf32>
@@ -514,12 +452,12 @@ above that uses a nested allocation:
func @inner_region_control_flow_div(
%arg0 : index,
%arg1 : index) -> memref<?x?xf32> {
- %0 = alloc(%arg0, %arg0) : memref<?x?xf32>
+ %0 = memref.alloc(%arg0, %arg0) : memref<?x?xf32>
%1 = custom.region_if %0 : memref<?x?xf32> -> (memref<?x?xf32>)
then(%arg2 : memref<?x?xf32>) { // aliases: %arg4, %1
custom.region_if_yield %arg2 : memref<?x?xf32>
} else(%arg3 : memref<?x?xf32>) {
- %2 = alloc(%arg0, %arg1) : memref<?x?xf32> // aliases: %arg4, %1
+ %2 = memref.alloc(%arg0, %arg1) : memref<?x?xf32> // aliases: %arg4, %1
custom.region_if_yield %2 : memref<?x?xf32>
} join(%arg4 : memref<?x?xf32>) { // aliases: %1
custom.region_if_yield %arg4 : memref<?x?xf32>
@@ -537,40 +475,22 @@ This causes BufferDeallocation to introduce additional copies:
func @inner_region_control_flow_div(
%arg0 : index,
%arg1 : index) -> memref<?x?xf32> {
- %0 = alloc(%arg0, %arg0) : memref<?x?xf32>
+ %0 = memref.alloc(%arg0, %arg0) : memref<?x?xf32>
%1 = custom.region_if %0 : memref<?x?xf32> -> (memref<?x?xf32>)
then(%arg2 : memref<?x?xf32>) {
- %c0 = constant 0 : index // determine dimension extents for temp allocation
- %2 = dim %arg2, %c0 : memref<?x?xf32>
- %c1 = constant 1 : index
- %3 = dim %arg2, %c1 : memref<?x?xf32>
- %4 = alloc(%2, %3) : memref<?x?xf32> // temp buffer required due to critic.
- // alias %arg4
- linalg.copy(%arg2, %4) : memref<?x?xf32>, memref<?x?xf32>
+    %4 = memref.clone %arg2 : memref<?x?xf32> to memref<?x?xf32>
custom.region_if_yield %4 : memref<?x?xf32>
} else(%arg3 : memref<?x?xf32>) {
- %2 = alloc(%arg0, %arg1) : memref<?x?xf32>
- %c0 = constant 0 : index // determine dimension extents for temp allocation
- %3 = dim %2, %c0 : memref<?x?xf32>
- %c1 = constant 1 : index
- %4 = dim %2, %c1 : memref<?x?xf32>
- %5 = alloc(%3, %4) : memref<?x?xf32> // temp buffer required due to critic.
- // alias %arg4
- linalg.copy(%2, %5) : memref<?x?xf32>, memref<?x?xf32>
- dealloc %2 : memref<?x?xf32>
+ %2 = memref.alloc(%arg0, %arg1) : memref<?x?xf32>
+    %5 = memref.clone %2 : memref<?x?xf32> to memref<?x?xf32>
+ memref.dealloc %2 : memref<?x?xf32>
custom.region_if_yield %5 : memref<?x?xf32>
} join(%arg4: memref<?x?xf32>) {
- %c0 = constant 0 : index // determine dimension extents for temp allocation
- %2 = dim %arg4, %c0 : memref<?x?xf32>
- %c1 = constant 1 : index
- %3 = dim %arg4, %c1 : memref<?x?xf32>
- %4 = alloc(%2, %3) : memref<?x?xf32> // this allocation will be removed by
- // applying the copy removal pass
- linalg.copy(%arg4, %4) : memref<?x?xf32>, memref<?x?xf32>
- dealloc %arg4 : memref<?x?xf32>
+    %4 = memref.clone %arg4 : memref<?x?xf32> to memref<?x?xf32>
+ memref.dealloc %arg4 : memref<?x?xf32>
custom.region_if_yield %4 : memref<?x?xf32>
}
- dealloc %0 : memref<?x?xf32> // %0 can be safely freed here
+ memref.dealloc %0 : memref<?x?xf32> // %0 can be safely freed here
return %1 : memref<?x?xf32>
}
```
@@ -600,7 +520,7 @@ func @loop_nested_if(
iter_args(%iterBuf = %buf) -> memref<2xf32> {
%1 = cmpi "eq", %i, %ub : index
%2 = scf.if %1 -> (memref<2xf32>) {
- %3 = alloc() : memref<2xf32> // makes %2 a critical alias due to a
+ %3 = memref.alloc() : memref<2xf32> // makes %2 a critical alias due to a
// divergent allocation
use(%3)
scf.yield %3 : memref<2xf32>
@@ -609,7 +529,7 @@ func @loop_nested_if(
}
scf.yield %2 : memref<2xf32>
}
- "linalg.copy"(%0, %res) : (memref<2xf32>, memref<2xf32>) -> ()
+ test.copy(%0, %res) : (memref<2xf32>, memref<2xf32>) -> ()
return
}
```
@@ -634,31 +554,27 @@ func @loop_nested_if(
%step: index,
%buf: memref<2xf32>,
%res: memref<2xf32>) {
- %4 = alloc() : memref<2xf32>
- "linalg.copy"(%buf, %4) : (memref<2xf32>, memref<2xf32>) -> ()
+  %4 = memref.clone %buf : memref<2xf32> to memref<2xf32>
%0 = scf.for %i = %lb to %ub step %step
iter_args(%iterBuf = %4) -> memref<2xf32> {
%1 = cmpi "eq", %i, %ub : index
%2 = scf.if %1 -> (memref<2xf32>) {
- %3 = alloc() : memref<2xf32> // makes %2 a critical alias
+ %3 = memref.alloc() : memref<2xf32> // makes %2 a critical alias
use(%3)
- %5 = alloc() : memref<2xf32> // temp copy due to crit. alias %2
- "linalg.copy"(%3, %5) : memref<2xf32>, memref<2xf32>
- dealloc %3 : memref<2xf32>
+      %5 = memref.clone %3 : memref<2xf32> to memref<2xf32>
+ memref.dealloc %3 : memref<2xf32>
scf.yield %5 : memref<2xf32>
} else {
- %6 = alloc() : memref<2xf32> // temp copy due to crit. alias %2
- "linalg.copy"(%iterBuf, %6) : memref<2xf32>, memref<2xf32>
+      %6 = memref.clone %iterBuf : memref<2xf32> to memref<2xf32>
scf.yield %6 : memref<2xf32>
}
- %7 = alloc() : memref<2xf32> // temp copy due to crit. alias %iterBuf
- "linalg.copy"(%2, %7) : memref<2xf32>, memref<2xf32>
- dealloc %2 : memref<2xf32>
- dealloc %iterBuf : memref<2xf32> // free backedge iteration variable
+    %7 = memref.clone %2 : memref<2xf32> to memref<2xf32>
+ memref.dealloc %2 : memref<2xf32>
+ memref.dealloc %iterBuf : memref<2xf32> // free backedge iteration variable
scf.yield %7 : memref<2xf32>
}
- "linalg.copy"(%0, %res) : (memref<2xf32>, memref<2xf32>) -> ()
- dealloc %0 : memref<2xf32> // free temp copy %0
+ test.copy(%0, %res) : (memref<2xf32>, memref<2xf32>) -> ()
+ memref.dealloc %0 : memref<2xf32> // free temp copy %0
return
}
```
@@ -684,46 +600,37 @@ deallocations.
In order to limit the complexity of the BufferDeallocation transformation, some
tiny code-polishing/optimization transformations are not applied on-the-fly
-during placement. Currently, there is only the CopyRemoval transformation to
-remove unnecessary copy and allocation operations.
+during placement. Currently, a canonicalization pattern is added to the clone
+operation to remove unnecessary clones.
Note: further transformations might be added to the post-pass phase in the
future.
-## CopyRemoval Pass
-
-A common pattern that arises during placement is the introduction of
-unnecessary temporary copies that are used instead of the original source
-buffer. For this reason, there is a post-pass transformation that removes these
-allocations and copies via `-copy-removal`. This pass, besides removing
-unnecessary copy operations, will also remove the dead allocations and their
-corresponding deallocation operations. The CopyRemoval pass can currently be
-applied to operations that implement the `CopyOpInterface` in any of these two
-situations which are
+## Clone Canonicalization
-* reusing the source buffer of the copy operation.
-* reusing the target buffer of the copy operation.
+During the placement of clones it may happen that unnecessary clones are
+inserted. If these clones appear with their corresponding dealloc operation
+within the same block, we can use the canonicalizer to remove these
+unnecessary operations. Note that this step needs to take place after the
+insertion of clones and deallocs in the buffer deallocation step. The
+canonicalization covers both the newly created target value of the clone
+operation and its source operation.
-## Reusing the Source Buffer of the Copy Operation
+## Canonicalization of the Source Buffer of the Clone Operation
-In this case, the source of the copy operation can be used instead of target.
-The unused allocation and deallocation operations that are defined for this
-copy operation are also removed. Here is a working example generated by the
-BufferDeallocation pass that allocates a buffer with dynamic size. A deeper
+In this case, the source of the clone operation can be used instead of its
+target. The unused allocation and deallocation operations that are defined for
+this clone operation are also removed. Here is a working example generated by
+the BufferDeallocation pass that allocates a buffer with dynamic size. A deeper
analysis of this sample reveals that the highlighted operations are redundant
and can be removed.
```mlir
func @dynamic_allocation(%arg0: index, %arg1: index) -> memref<?x?xf32> {
- %7 = alloc(%arg0, %arg1) : memref<?x?xf32>
- %c0_0 = constant 0 : index
- %8 = dim %7, %c0_0 : memref<?x?xf32>
- %c1_1 = constant 1 : index
- %9 = dim %7, %c1_1 : memref<?x?xf32>
- %10 = alloc(%8, %9) : memref<?x?xf32>
- linalg.copy(%7, %10) : memref<?x?xf32>, memref<?x?xf32>
- dealloc %7 : memref<?x?xf32>
- return %10 : memref<?x?xf32>
+ %1 = memref.alloc(%arg0, %arg1) : memref<?x?xf32>
+  %2 = memref.clone %1 : memref<?x?xf32> to memref<?x?xf32>
+ memref.dealloc %1 : memref<?x?xf32>
+ return %2 : memref<?x?xf32>
}
```
@@ -731,53 +638,39 @@ Will be transformed to:
```mlir
func @dynamic_allocation(%arg0: index, %arg1: index) -> memref<?x?xf32> {
- %7 = alloc(%arg0, %arg1) : memref<?x?xf32>
- %c0_0 = constant 0 : index
- %8 = dim %7, %c0_0 : memref<?x?xf32>
- %c1_1 = constant 1 : index
- %9 = dim %7, %c1_1 : memref<?x?xf32>
- return %7 : memref<?x?xf32>
+ %1 = memref.alloc(%arg0, %arg1) : memref<?x?xf32>
+ return %1 : memref<?x?xf32>
}
```
-In this case, the additional copy %10 can be replaced with its original source
-buffer %7. This also applies to the associated dealloc operation of %7.
+In this case, the additional clone %2 can be replaced with its original source
+buffer %1. This also applies to the associated dealloc operation of %1.
-To limit the complexity of this transformation, it only removes copy operations
-when the following constraints are met:
+## Canonicalization of the Target Buffer of the Clone Operation
-* The copy operation, the defining operation for the target value, and the
-deallocation of the source value lie in the same block.
-* There are no users/aliases of the target value between the defining operation
-of the target value and its copy operation.
-* There are no users/aliases of the source value between its associated copy
-operation and the deallocation of the source value.
+In this case, the target buffer of the clone operation can be used instead of
+its source. The unused deallocation operation that is defined for this clone
+operation is also removed.
-## Reusing the Target Buffer of the Copy Operation
-
-In this case, the target buffer of the copy operation can be used instead of
-its source. The unused allocation and deallocation operations that are defined
-for this copy operation are also removed.
-
-Consider the following example where a generic linalg operation writes the
-result to %temp and then copies %temp to %result. However, these two operations
-can be merged into a single step. Copy removal removes the copy operation and
-%temp, and replaces the uses of %temp with %result:
+Consider the following example where a generic test operation writes the result
+to %temp and then copies %temp to %result. However, these two operations
+can be merged into a single step. Canonicalization removes the clone operation
+and %temp, and replaces the uses of %temp with %result:
```mlir
func @reuseTarget(%arg0: memref<2xf32>, %result: memref<2xf32>){
- %temp = alloc() : memref<2xf32>
- linalg.generic {
+ %temp = memref.alloc() : memref<2xf32>
+ test.generic {
args_in = 1 : i64,
args_out = 1 : i64,
indexing_maps = [#map0, #map0],
iterator_types = ["parallel"]} %arg0, %temp {
^bb0(%gen2_arg0: f32, %gen2_arg1: f32):
%tmp2 = exp %gen2_arg0 : f32
- linalg.yield %tmp2 : f32
+ test.yield %tmp2 : f32
}: memref<2xf32>, memref<2xf32>
- "linalg.copy"(%temp, %result) : (memref<2xf32>, memref<2xf32>) -> ()
- dealloc %temp : memref<2xf32>
+  %result = memref.clone %temp : memref<2xf32> to memref<2xf32>
+ memref.dealloc %temp : memref<2xf32>
return
}
```
@@ -786,33 +679,24 @@ Will be transformed to:
```mlir
func @reuseTarget(%arg0: memref<2xf32>, %result: memref<2xf32>){
- linalg.generic {
+ test.generic {
args_in = 1 : i64,
args_out = 1 : i64,
indexing_maps = [#map0, #map0],
iterator_types = ["parallel"]} %arg0, %result {
^bb0(%gen2_arg0: f32, %gen2_arg1: f32):
%tmp2 = exp %gen2_arg0 : f32
- linalg.yield %tmp2 : f32
+ test.yield %tmp2 : f32
}: memref<2xf32>, memref<2xf32>
return
}
```
-Like before, several constraints to use the transformation apply:
-
-* The copy operation, the defining operation of the source value, and the
-deallocation of the source value lie in the same block.
-* There are no users/aliases of the target value between the defining operation
-of the source value and the copy operation.
-* There are no users/aliases of the source value between the copy operation and
-the deallocation of the source value.
-
## Known Limitations
-BufferDeallocation introduces additional copies using allocations from the
-“memref” dialect (“memref.alloc”). Analogous, all deallocations use the
-“memref” dialect-free operation “memref.dealloc”. The actual copy process is
-realized using “linalg.copy”. Furthermore, buffers are essentially immutable
-after their creation in a block. Another limitations are known in the case
-using unstructered control flow.
+BufferDeallocation introduces additional clones from the “memref” dialect
+(“memref.clone”). Analogously, all deallocations use the “memref” dialect
+operation “memref.dealloc”. The actual copy process is realized using
+“test.copy”. Furthermore, buffers are essentially immutable after their
+creation in a block. Further limitations are known in the case of
+unstructured control flow.
diff --git a/mlir/include/mlir/Dialect/MemRef/IR/MemRef.h b/mlir/include/mlir/Dialect/MemRef/IR/MemRef.h
index 9c2b912c0df15..0542423977835 100644
--- a/mlir/include/mlir/Dialect/MemRef/IR/MemRef.h
+++ b/mlir/include/mlir/Dialect/MemRef/IR/MemRef.h
@@ -12,6 +12,7 @@
#include "mlir/IR/Dialect.h"
#include "mlir/Interfaces/CallInterfaces.h"
#include "mlir/Interfaces/CastInterfaces.h"
+#include "mlir/Interfaces/CopyOpInterface.h"
#include "mlir/Interfaces/SideEffectInterfaces.h"
#include "mlir/Interfaces/ViewLikeInterface.h"
diff --git a/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td b/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td
index b3f5257df782a..fe0fd7d0ff363 100644
--- a/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td
+++ b/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td
@@ -12,6 +12,7 @@
include "mlir/Dialect/MemRef/IR/MemRefBase.td"
include "mlir/IR/OpBase.td"
include "mlir/Interfaces/CastInterfaces.td"
+include "mlir/Interfaces/CopyOpInterface.td"
include "mlir/Interfaces/SideEffectInterfaces.td"
include "mlir/Interfaces/ViewLikeInterface.td"
include "mlir/IR/SymbolInterfaces.td"
@@ -214,6 +215,9 @@ def MemRef_BufferCastOp : MemRef_Op<"buffer_cast",
// Result type is tensor<4x?xf32>
%12 = memref.buffer_cast %10 : memref<4x?xf32, #map0, 42>
```
+
+    Note that mutating the result of the buffer cast operation leads to
+ undefined behavior.
}];
let arguments = (ins AnyTensor:$tensor);
@@ -312,6 +316,46 @@ def MemRef_CastOp : MemRef_Op<"cast", [
let hasFolder = 1;
}
+//===----------------------------------------------------------------------===//
+// CloneOp
+//===----------------------------------------------------------------------===//
+
+def CloneOp : MemRef_Op<"clone", [
+ CopyOpInterface,
+ DeclareOpInterfaceMethods<MemoryEffectsOpInterface>
+ ]> {
+ let builders = [
+ OpBuilder<(ins "Value":$value), [{
+ return build($_builder, $_state, value.getType(), value);
+ }]>];
+
+ let description = [{
+ Clones the data in the input view into an implicitly defined output view.
+
+ Usage:
+
+ ```mlir
+ %arg1 = memref.clone %arg0 : memref<?xf32> to memref<?xf32>
+ ```
+
+    Note that mutating the source or result of the clone operation leads to
+ undefined behavior.
+ }];
+
+ let arguments = (ins Arg<AnyMemRef, "", []>:$input);
+ let results = (outs Arg<AnyMemRef, "", []>:$output);
+
+ let extraClassDeclaration = [{
+    Value getSource() { return input(); }
+ Value getTarget() { return output(); }
+ }];
+
+ let assemblyFormat = "$input attr-dict `:` type($input) `to` type($output)";
+
+ let hasFolder = 1;
+ let hasCanonicalizer = 1;
+}
+
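Since the definition sets `hasFolder` and the implementation below routes the
fold through `foldMemRefCast`, a producing `memref.cast` can be absorbed into
the clone. A minimal sketch with hypothetical values:

```mlir
// Before folding:
%0 = memref.cast %arg : memref<4xf32> to memref<?xf32>
%1 = memref.clone %0 : memref<?xf32> to memref<?xf32>

// After folding, the cast's source feeds the clone directly:
%1 = memref.clone %arg : memref<4xf32> to memref<?xf32>
```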
//===----------------------------------------------------------------------===//
// DeallocOp
//===----------------------------------------------------------------------===//
@@ -1090,6 +1134,9 @@ def TensorLoadOp : MemRef_Op<"tensor_load",
// Produces a value of tensor<4x?xf32> type.
%12 = memref.tensor_load %10 : memref<4x?xf32, #layout, memspace0>
```
+
+ If tensor load is used in the bufferization steps, mutating the source
+ buffer after loading leads to undefined behavior.
}];
let arguments = (ins Arg<AnyRankedOrUnrankedMemRef,
diff --git a/mlir/include/mlir/Dialect/MemRef/Utils/MemRefUtils.h b/mlir/include/mlir/Dialect/MemRef/Utils/MemRefUtils.h
new file mode 100644
index 0000000000000..024fe5ebfbc35
--- /dev/null
+++ b/mlir/include/mlir/Dialect/MemRef/Utils/MemRefUtils.h
@@ -0,0 +1,29 @@
+//===- MemRefUtils.h - MemRef transformation utilities ----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This header file defines prototypes for various transformation utilities for
+// the MemRefOps dialect. These are not passes by themselves but are used
+// either by passes, optimization sequences, or in turn by other transformation
+// utilities.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_DIALECT_MEMREF_UTILS_MEMREFUTILS_H
+#define MLIR_DIALECT_MEMREF_UTILS_MEMREFUTILS_H
+
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+
+namespace mlir {
+
+/// Finds the associated dealloc that can be linked to our allocation nodes (if
+/// any).
+Operation *findDealloc(Value allocValue);
+
+} // end namespace mlir
+
+#endif // MLIR_DIALECT_MEMREF_UTILS_MEMREFUTILS_H
diff --git a/mlir/include/mlir/Transforms/BufferUtils.h b/mlir/include/mlir/Transforms/BufferUtils.h
index 33edffa372a37..e432fb8f53f55 100644
--- a/mlir/include/mlir/Transforms/BufferUtils.h
+++ b/mlir/include/mlir/Transforms/BufferUtils.h
@@ -39,10 +39,6 @@ class BufferPlacementAllocs {
static Operation *getStartOperation(Value allocValue, Block *placementBlock,
const Liveness &liveness);
- /// Find an associated dealloc operation that is linked to the given
- /// allocation node (if any).
- static Operation *findDealloc(Value allocValue);
-
public:
/// Initializes the internal list by discovering all supported allocation
/// nodes.
diff --git a/mlir/include/mlir/Transforms/Passes.h b/mlir/include/mlir/Transforms/Passes.h
index 60ea4b188ae16..1d4234b38efca 100644
--- a/mlir/include/mlir/Transforms/Passes.h
+++ b/mlir/include/mlir/Transforms/Passes.h
@@ -63,9 +63,6 @@ std::unique_ptr<Pass> createBufferResultsToOutParamsPass();
/// Creates an instance of the Canonicalizer pass.
std::unique_ptr<Pass> createCanonicalizerPass();
-/// Create a pass that removes unnecessary Copy operations.
-std::unique_ptr<Pass> createCopyRemovalPass();
-
/// Creates a pass to perform common sub expression elimination.
std::unique_ptr<Pass> createCSEPass();
diff --git a/mlir/include/mlir/Transforms/Passes.td b/mlir/include/mlir/Transforms/Passes.td
index 2305c4a391912..0e14dcb873e73 100644
--- a/mlir/include/mlir/Transforms/Passes.td
+++ b/mlir/include/mlir/Transforms/Passes.td
@@ -282,8 +282,6 @@ def BufferDeallocation : FunctionPass<"buffer-deallocation"> {
}];
let constructor = "mlir::createBufferDeallocationPass()";
- // TODO: this pass likely shouldn't depend on Linalg?
- let dependentDialects = ["linalg::LinalgDialect"];
}
def BufferHoisting : FunctionPass<"buffer-hoisting"> {
@@ -366,11 +364,6 @@ def Canonicalizer : Pass<"canonicalize"> {
let dependentDialects = ["memref::MemRefDialect"];
}
-def CopyRemoval : FunctionPass<"copy-removal"> {
- let summary = "Remove the redundant copies from input IR";
- let constructor = "mlir::createCopyRemovalPass()";
-}
-
def CSE : Pass<"cse"> {
let summary = "Eliminate common sub-expressions";
let description = [{
diff --git a/mlir/lib/Dialect/MemRef/CMakeLists.txt b/mlir/lib/Dialect/MemRef/CMakeLists.txt
index f33061b2d87cf..dc79a5087f8ec 100644
--- a/mlir/lib/Dialect/MemRef/CMakeLists.txt
+++ b/mlir/lib/Dialect/MemRef/CMakeLists.txt
@@ -1 +1,23 @@
-add_subdirectory(IR)
+add_mlir_dialect_library(MLIRMemRef
+ IR/MemRefDialect.cpp
+ IR/MemRefOps.cpp
+ Utils/MemRefUtils.cpp
+
+ ADDITIONAL_HEADER_DIRS
+  ${PROJECT_SOURCE_DIR}/include/mlir/Dialect/MemRef
+
+ DEPENDS
+ MLIRStandardOpsIncGen
+ MLIRMemRefOpsIncGen
+
+ LINK_COMPONENTS
+ Core
+
+ LINK_LIBS PUBLIC
+ MLIRDialect
+ MLIRInferTypeOpInterface
+ MLIRIR
+ MLIRStandard
+ MLIRTensor
+ MLIRViewLikeInterface
+)
diff --git a/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp b/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp
index f71ba8f39b61a..ffb5bcf59b7dc 100644
--- a/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp
+++ b/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp
@@ -7,6 +7,7 @@
//===----------------------------------------------------------------------===//
#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/Dialect/MemRef/Utils/MemRefUtils.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/Dialect/StandardOps/Utils/Utils.h"
#include "mlir/Dialect/Tensor/IR/Tensor.h"
@@ -463,6 +464,76 @@ OpFoldResult CastOp::fold(ArrayRef<Attribute> operands) {
return succeeded(foldMemRefCast(*this)) ? getResult() : Value();
}
+//===----------------------------------------------------------------------===//
+// CloneOp
+//===----------------------------------------------------------------------===//
+
+static LogicalResult verify(CloneOp op) { return success(); }
+
+void CloneOp::getEffects(
+ SmallVectorImpl<SideEffects::EffectInstance<MemoryEffects::Effect>>
+ &effects) {
+ effects.emplace_back(MemoryEffects::Read::get(), input(),
+ SideEffects::DefaultResource::get());
+ effects.emplace_back(MemoryEffects::Write::get(), output(),
+ SideEffects::DefaultResource::get());
+}
+
+namespace {
+/// Removes clone operations without any uses, and folds a clone together with
+/// its corresponding dealloc when both the allocation and the deallocation
+/// reside in the same block, replacing the clone with its source value.
+struct SimplifyClones : public OpRewritePattern<CloneOp> {
+ using OpRewritePattern<CloneOp>::OpRewritePattern;
+
+ LogicalResult matchAndRewrite(CloneOp cloneOp,
+ PatternRewriter &rewriter) const override {
+ if (cloneOp.use_empty()) {
+ rewriter.eraseOp(cloneOp);
+ return success();
+ }
+
+ Value source = cloneOp.input();
+
+ // Removes the clone operation and the corresponding dealloc and alloc
+ // operation (if any).
+ auto tryRemoveClone = [&](Operation *sourceOp, Operation *dealloc,
+ Operation *alloc) {
+ if (!sourceOp || !dealloc || !alloc ||
+ alloc->getBlock() != dealloc->getBlock())
+ return false;
+ rewriter.replaceOp(cloneOp, source);
+ rewriter.eraseOp(dealloc);
+ return true;
+ };
+
+ // Removes unnecessary clones that are derived from the result of the clone
+ // op.
+ Operation *deallocOp = findDealloc(cloneOp.output());
+ Operation *sourceOp = source.getDefiningOp();
+ if (tryRemoveClone(sourceOp, deallocOp, sourceOp))
+ return success();
+
+ // Removes unnecessary clones that are derived from the source of the clone
+ // op.
+ deallocOp = findDealloc(source);
+ if (tryRemoveClone(sourceOp, deallocOp, cloneOp))
+ return success();
+
+ return failure();
+ }
+};
+
+} // end anonymous namespace.
+
+void CloneOp::getCanonicalizationPatterns(OwningRewritePatternList &results,
+ MLIRContext *context) {
+ results.insert<SimplifyClones>(context);
+}
+
+OpFoldResult CloneOp::fold(ArrayRef<Attribute> operands) {
+ return succeeded(foldMemRefCast(*this)) ? getResult() : Value();
+}
+
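Besides erasing clones without uses, SimplifyClones folds a clone whose source
(or result) is deallocated in the same block as the corresponding allocation,
replacing the clone with its source. A minimal sketch mirroring the
documentation example above:

```mlir
// Before: the clone's source %1 is deallocated in the clone's block.
%1 = memref.alloc(%arg0, %arg1) : memref<?x?xf32>
%2 = memref.clone %1 : memref<?x?xf32> to memref<?x?xf32>
memref.dealloc %1 : memref<?x?xf32>
return %2 : memref<?x?xf32>

// After: %2 is replaced by %1 and the dealloc is erased.
%1 = memref.alloc(%arg0, %arg1) : memref<?x?xf32>
return %1 : memref<?x?xf32>
```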
//===----------------------------------------------------------------------===//
// DeallocOp
//===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/MemRef/Utils/MemRefUtils.cpp b/mlir/lib/Dialect/MemRef/Utils/MemRefUtils.cpp
new file mode 100644
index 0000000000000..26a9a217134e2
--- /dev/null
+++ b/mlir/lib/Dialect/MemRef/Utils/MemRefUtils.cpp
@@ -0,0 +1,35 @@
+//===- MemRefUtils.cpp - Utilities to support the MemRef dialect ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements utilities for the MemRef dialect.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/MemRef/Utils/MemRefUtils.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+
+using namespace mlir;
+
+/// Finds associated deallocs that can be linked to our allocation nodes (if
+/// any).
+Operation *mlir::findDealloc(Value allocValue) {
+ auto userIt = llvm::find_if(allocValue.getUsers(), [&](Operation *user) {
+ auto effectInterface = dyn_cast<MemoryEffectOpInterface>(user);
+ if (!effectInterface)
+ return false;
+ // Try to find a free effect that is applied to one of our values
+ // that will be automatically freed by our pass.
+ SmallVector<MemoryEffects::EffectInstance, 2> effects;
+ effectInterface.getEffectsOnValue(allocValue, effects);
+ return llvm::any_of(effects, [&](MemoryEffects::EffectInstance &it) {
+ return isa<MemoryEffects::Free>(it.getEffect());
+ });
+ });
+ // Assign the associated dealloc operation (if any).
+ return userIt != allocValue.user_end() ? *userIt : nullptr;
+}
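As a minimal sketch, for IR like the following, `findDealloc(%0)` returns the
`memref.dealloc` user, because that operation declares a `Free` effect on %0:

```mlir
%0 = memref.alloc() : memref<2xf32>
"use"(%0) : (memref<2xf32>) -> ()
memref.dealloc %0 : memref<2xf32>
```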
diff --git a/mlir/lib/Transforms/BufferDeallocation.cpp b/mlir/lib/Transforms/BufferDeallocation.cpp
index aa837cb0e77c5..3ba744d8e6efb 100644
--- a/mlir/lib/Transforms/BufferDeallocation.cpp
+++ b/mlir/lib/Transforms/BufferDeallocation.cpp
@@ -7,16 +7,15 @@
//===----------------------------------------------------------------------===//
//
// This file implements logic for computing correct alloc and dealloc positions.
-// Furthermore, buffer placement also adds required new alloc and copy
-// operations to ensure that all buffers are deallocated. The main class is the
+// Furthermore, buffer deallocation also adds required new clone operations to
+// ensure that all buffers are deallocated. The main class is the
// BufferDeallocationPass class that implements the underlying algorithm. In
// order to put allocations and deallocations at safe positions, it is
// significantly important to put them into the correct blocks. However, the
// liveness analysis does not pay attention to aliases, which can occur due to
// branches (and their associated block arguments) in general. For this purpose,
// BufferDeallocation firstly finds all possible aliases for a single value
-// (using the BufferAliasAnalysis class). Consider the following
-// example:
+// (using the BufferAliasAnalysis class). Consider the following example:
//
// ^bb0(%arg0):
// cond_br %cond, ^bb1, ^bb2
@@ -30,16 +29,16 @@
//
// We should place the dealloc for %new_value in exit. However, we have to free
// the buffer in the same block, because it cannot be freed in the post
-// dominator. However, this requires a new copy buffer for %arg1 that will
+// dominator. However, this requires a new clone buffer for %arg1 that will
// contain the actual contents. Using the class BufferAliasAnalysis, we
// will find out that %new_value has a potential alias %arg1. In order to find
// the dealloc position we have to find all potential aliases, iterate over
// their uses and find the common post-dominator block (note that additional
-// copies and buffers remove potential aliases and will influence the placement
+// clones and buffers remove potential aliases and will influence the placement
// of the deallocs). In all cases, the computed block can be safely used to free
// the %new_value buffer (may be exit or bb2) as it will die and we can use
// liveness information to determine the exact operation after which we have to
-// insert the dealloc. However, the algorithm supports introducing copy buffers
+// insert the dealloc. However, the algorithm supports introducing clone buffers
// and placing deallocs in safe locations to ensure that all buffers will be
// freed in the end.
//
@@ -52,10 +51,8 @@
//===----------------------------------------------------------------------===//
#include "PassDetail.h"
-#include "mlir/Dialect/Linalg/IR/LinalgOps.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
-#include "mlir/Dialect/StandardOps/Utils/Utils.h"
#include "mlir/IR/Operation.h"
#include "mlir/Interfaces/ControlFlowInterfaces.h"
#include "mlir/Interfaces/LoopLikeInterface.h"
@@ -187,25 +184,25 @@ class Backedges {
/// The buffer deallocation transformation which ensures that all allocs in the
/// program have a corresponding de-allocation. As a side-effect, it might also
-/// introduce copies that in turn leads to additional allocs and de-allocations.
+/// introduce clones that in turn lead to additional deallocations.
class BufferDeallocation : BufferPlacementTransformationBase {
public:
BufferDeallocation(Operation *op)
: BufferPlacementTransformationBase(op), dominators(op),
postDominators(op) {}
- /// Performs the actual placement/creation of all temporary alloc, copy and
- /// dealloc nodes.
+ /// Performs the actual placement/creation of all temporary clone and dealloc
+ /// nodes.
void deallocate() {
- // Add additional allocations and copies that are required.
- introduceCopies();
+ // Add additional clones that are required.
+ introduceClones();
// Place deallocations for all allocation entries.
placeDeallocs();
}
private:
- /// Introduces required allocs and copy operations to avoid memory leaks.
- void introduceCopies() {
+ /// Introduces required clone operations to avoid memory leaks.
+ void introduceClones() {
// Initialize the set of values that require a dedicated memory free
// operation since their operands cannot be safely deallocated in a post
// dominator.
@@ -214,7 +211,7 @@ class BufferDeallocation : BufferPlacementTransformationBase {
SmallVector<std::tuple<Value, Block *>, 8> toProcess;
// Check dominance relation for proper dominance properties. If the given
- // value node does not dominate an alias, we will have to create a copy in
+ // value node does not dominate an alias, we will have to create a clone in
// order to free all buffers that can potentially leak into a post
// dominator.
auto findUnsafeValues = [&](Value source, Block *definingBlock) {
@@ -255,7 +252,7 @@ class BufferDeallocation : BufferPlacementTransformationBase {
// arguments at the correct locations.
aliases.remove(valuesToFree);
- // Add new allocs and additional copy operations.
+ // Add new allocs and additional clone operations.
for (Value value : valuesToFree) {
if (auto blockArg = value.dyn_cast<BlockArgument>())
introduceBlockArgCopy(blockArg);
@@ -269,7 +266,7 @@ class BufferDeallocation : BufferPlacementTransformationBase {
}
}
- /// Introduces temporary allocs in all predecessors and copies the source
+ /// Introduces temporary clones in all predecessors and copies the source
/// values into the newly allocated buffers.
void introduceBlockArgCopy(BlockArgument blockArg) {
// Allocate a buffer for the current block argument in the block of
@@ -285,9 +282,9 @@ class BufferDeallocation : BufferPlacementTransformationBase {
Value sourceValue =
branchInterface.getSuccessorOperands(it.getSuccessorIndex())
.getValue()[blockArg.getArgNumber()];
- // Create a new alloc and copy at the current location of the terminator.
- Value alloc = introduceBufferCopy(sourceValue, terminator);
- // Wire new alloc and successor operand.
+ // Create a new clone at the current location of the terminator.
+ Value clone = introduceCloneBuffers(sourceValue, terminator);
+ // Wire new clone and successor operand.
auto mutableOperands =
branchInterface.getMutableSuccessorOperands(it.getSuccessorIndex());
if (!mutableOperands.hasValue())
@@ -296,7 +293,7 @@ class BufferDeallocation : BufferPlacementTransformationBase {
else
mutableOperands.getValue()
.slice(blockArg.getArgNumber(), 1)
- .assign(alloc);
+ .assign(clone);
}
// Check whether the block argument has implicitly defined predecessors via
@@ -310,7 +307,7 @@ class BufferDeallocation : BufferPlacementTransformationBase {
!(regionInterface = dyn_cast<RegionBranchOpInterface>(parentOp)))
return;
- introduceCopiesForRegionSuccessors(
+ introduceClonesForRegionSuccessors(
regionInterface, argRegion->getParentOp()->getRegions(), blockArg,
[&](RegionSuccessor &successorRegion) {
// Find a predecessor of our argRegion.
@@ -318,7 +315,7 @@ class BufferDeallocation : BufferPlacementTransformationBase {
});
// Check whether the block argument belongs to an entry region of the
- // parent operation. In this case, we have to introduce an additional copy
+ // parent operation. In this case, we have to introduce an additional clone
// for buffer that is passed to the argument.
SmallVector<RegionSuccessor, 2> successorRegions;
regionInterface.getSuccessorRegions(/*index=*/llvm::None, successorRegions);
@@ -329,20 +326,20 @@ class BufferDeallocation : BufferPlacementTransformationBase {
if (it == successorRegions.end())
return;
- // Determine the actual operand to introduce a copy for and rewire the
- // operand to point to the copy instead.
+ // Determine the actual operand to introduce a clone for and rewire the
+ // operand to point to the clone instead.
Value operand =
regionInterface.getSuccessorEntryOperands(argRegion->getRegionNumber())
[llvm::find(it->getSuccessorInputs(), blockArg).getIndex()];
- Value copy = introduceBufferCopy(operand, parentOp);
+ Value clone = introduceCloneBuffers(operand, parentOp);
auto op = llvm::find(parentOp->getOperands(), operand);
assert(op != parentOp->getOperands().end() &&
"parentOp does not contain operand");
- parentOp->setOperand(op.getIndex(), copy);
+ parentOp->setOperand(op.getIndex(), clone);
}
- /// Introduces temporary allocs in front of all associated nested-region
+ /// Introduces temporary clones in front of all associated nested-region
/// terminators and copies the source values into the newly allocated buffers.
void introduceValueCopyForRegionResult(Value value) {
// Get the actual result index in the scope of the parent terminator.
@@ -354,20 +351,20 @@ class BufferDeallocation : BufferPlacementTransformationBase {
// its parent operation.
return !successorRegion.getSuccessor();
};
- // Introduce a copy for all region "results" that are returned to the parent
- // operation. This is required since the parent's result value has been
- // considered critical. Therefore, the algorithm assumes that a copy of a
- // previously allocated buffer is returned by the operation (like in the
- // case of a block argument).
- introduceCopiesForRegionSuccessors(regionInterface, operation->getRegions(),
+ // Introduce a clone for all region "results" that are returned to the
+ // parent operation. This is required since the parent's result value has
+ // been considered critical. Therefore, the algorithm assumes that a clone
+ // of a previously allocated buffer is returned by the operation (like in
+ // the case of a block argument).
+ introduceClonesForRegionSuccessors(regionInterface, operation->getRegions(),
value, regionPredicate);
}
- /// Introduces buffer copies for all terminators in the given regions. The
+ /// Introduces buffer clones for all terminators in the given regions. The
/// regionPredicate is applied to every successor region in order to restrict
- /// the copies to specific regions.
+ /// the clones to specific regions.
template <typename TPredicate>
- void introduceCopiesForRegionSuccessors(
+ void introduceClonesForRegionSuccessors(
RegionBranchOpInterface regionInterface, MutableArrayRef<Region> regions,
Value argValue, const TPredicate ®ionPredicate) {
for (Region ®ion : regions) {
@@ -393,49 +390,37 @@ class BufferDeallocation : BufferPlacementTransformationBase {
walkReturnOperations(®ion, [&](Operation *terminator) {
// Extract the source value from the current terminator.
Value sourceValue = terminator->getOperand(operandIndex);
- // Create a new alloc at the current location of the terminator.
- Value alloc = introduceBufferCopy(sourceValue, terminator);
- // Wire alloc and terminator operand.
- terminator->setOperand(operandIndex, alloc);
+ // Create a new clone at the current location of the terminator.
+ Value clone = introduceCloneBuffers(sourceValue, terminator);
+ // Wire clone and terminator operand.
+ terminator->setOperand(operandIndex, clone);
});
}
}
- /// Creates a new memory allocation for the given source value and copies
+ /// Creates a new memory allocation for the given source value and clones
/// its content into the newly allocated buffer. The terminator operation is
- /// used to insert the alloc and copy operations at the right places.
- Value introduceBufferCopy(Value sourceValue, Operation *terminator) {
- // Avoid multiple copies of the same source value. This can happen in the
+ /// used to insert the clone operation at the right place.
+ Value introduceCloneBuffers(Value sourceValue, Operation *terminator) {
+ // Avoid multiple clones of the same source value. This can happen in the
// presence of loops when a branch acts as a backedge while also having
// another successor that returns to its parent operation. Note: that
// copying copied buffers can introduce memory leaks since the invariant of
- // BufferPlacement assumes that a buffer will be only copied once into a
- // temporary buffer. Hence, the construction of copy chains introduces
+ // BufferDeallocation assumes that a buffer will be only cloned once into a
+ // temporary buffer. Hence, the construction of clone chains introduces
// additional allocations that are not tracked automatically by the
// algorithm.
- if (copiedValues.contains(sourceValue))
+ if (clonedValues.contains(sourceValue))
return sourceValue;
- // Create a new alloc at the current location of the terminator.
- auto memRefType = sourceValue.getType().cast<MemRefType>();
+ // Create a new clone operation that copies the contents of the old
+ // buffer to the new one.
OpBuilder builder(terminator);
+ auto cloneOp =
+ builder.create<memref::CloneOp>(terminator->getLoc(), sourceValue);
- // Extract information about dynamically shaped types by
- // extracting their dynamic dimensions.
- auto dynamicOperands =
- getDynOperands(terminator->getLoc(), sourceValue, builder);
-
- // TODO: provide a generic interface to create dialect-specific
- // Alloc and CopyOp nodes.
- auto alloc = builder.create<memref::AllocOp>(terminator->getLoc(),
- memRefType, dynamicOperands);
-
- // Create a new copy operation that copies to contents of the old
- // allocation to the new one.
- builder.create<linalg::CopyOp>(terminator->getLoc(), sourceValue, alloc);
-
- // Remember the copy of original source value.
- copiedValues.insert(alloc);
- return alloc;
+ // Remember the clone of original source value.
+ clonedValues.insert(cloneOp);
+ return cloneOp;
}
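In IR terms, the helper now emits a single clone right in front of the
terminator instead of the former alloc/linalg.copy pair. A minimal sketch with
hypothetical values:

```mlir
%clone = memref.clone %source : memref<2xf32> to memref<2xf32>
br ^bb3(%clone : memref<2xf32>)
```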
/// Finds correct dealloc positions according to the algorithm described at
@@ -513,8 +498,8 @@ class BufferDeallocation : BufferPlacementTransformationBase {
/// position.
PostDominanceInfo postDominators;
- /// Stores already copied allocations to avoid additional copies of copies.
- ValueSetT copiedValues;
+ /// Stores already cloned buffers to avoid additional clones of clones.
+ ValueSetT clonedValues;
};
//===----------------------------------------------------------------------===//
@@ -522,8 +507,8 @@ class BufferDeallocation : BufferPlacementTransformationBase {
//===----------------------------------------------------------------------===//
/// The actual buffer deallocation pass that inserts and moves dealloc nodes
-/// into the right positions. Furthermore, it inserts additional allocs and
-/// copies if necessary. It uses the algorithm described at the top of the file.
+/// into the right positions. Furthermore, it inserts additional clones if
+/// necessary. It uses the algorithm described at the top of the file.
struct BufferDeallocationPass : BufferDeallocationBase<BufferDeallocationPass> {
void runOnFunction() override {
@@ -540,7 +525,7 @@ struct BufferDeallocationPass : BufferDeallocationBase<BufferDeallocationPass> {
return signalPassFailure();
}
- // Place all required temporary alloc, copy and dealloc nodes.
+ // Place all required temporary clone and dealloc nodes.
BufferDeallocation deallocation(getFunction());
deallocation.deallocate();
}
diff --git a/mlir/lib/Transforms/BufferUtils.cpp b/mlir/lib/Transforms/BufferUtils.cpp
index ab39f57b3fcc3..0cefd53d2d347 100644
--- a/mlir/lib/Transforms/BufferUtils.cpp
+++ b/mlir/lib/Transforms/BufferUtils.cpp
@@ -12,7 +12,7 @@
#include "mlir/Transforms/BufferUtils.h"
#include "PassDetail.h"
-#include "mlir/Dialect/Linalg/IR/LinalgOps.h"
+#include "mlir/Dialect/MemRef/Utils/MemRefUtils.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/IR/Operation.h"
#include "mlir/Interfaces/ControlFlowInterfaces.h"
@@ -49,25 +49,6 @@ Operation *BufferPlacementAllocs::getStartOperation(Value allocValue,
return startOperation;
}
-/// Finds associated deallocs that can be linked to our allocation nodes (if
-/// any).
-Operation *BufferPlacementAllocs::findDealloc(Value allocValue) {
- auto userIt = llvm::find_if(allocValue.getUsers(), [&](Operation *user) {
- auto effectInterface = dyn_cast<MemoryEffectOpInterface>(user);
- if (!effectInterface)
- return false;
- // Try to find a free effect that is applied to one of our values
- // that will be automatically freed by our pass.
- SmallVector<MemoryEffects::EffectInstance, 2> effects;
- effectInterface.getEffectsOnValue(allocValue, effects);
- return llvm::any_of(effects, [&](MemoryEffects::EffectInstance &it) {
- return isa<MemoryEffects::Free>(it.getEffect());
- });
- });
- // Assign the associated dealloc operation (if any).
- return userIt != allocValue.user_end() ? *userIt : nullptr;
-}
-
/// Initializes the internal list by discovering all supported allocation
/// nodes.
BufferPlacementAllocs::BufferPlacementAllocs(Operation *op) { build(op); }
diff --git a/mlir/lib/Transforms/CMakeLists.txt b/mlir/lib/Transforms/CMakeLists.txt
index 36f9e5b832be9..2b185fcf0b7ee 100644
--- a/mlir/lib/Transforms/CMakeLists.txt
+++ b/mlir/lib/Transforms/CMakeLists.txt
@@ -7,7 +7,6 @@ add_mlir_library(MLIRTransforms
BufferUtils.cpp
Bufferize.cpp
Canonicalizer.cpp
- CopyRemoval.cpp
CSE.cpp
Inliner.cpp
LocationSnapshot.cpp
diff --git a/mlir/lib/Transforms/CopyRemoval.cpp b/mlir/lib/Transforms/CopyRemoval.cpp
deleted file mode 100644
index c5a8da6329568..0000000000000
--- a/mlir/lib/Transforms/CopyRemoval.cpp
+++ /dev/null
@@ -1,217 +0,0 @@
-//===- CopyRemoval.cpp - Removing the redundant copies --------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "mlir/Interfaces/CopyOpInterface.h"
-#include "mlir/Interfaces/SideEffectInterfaces.h"
-#include "mlir/Pass/Pass.h"
-#include "mlir/Transforms/Passes.h"
-
-using namespace mlir;
-using namespace MemoryEffects;
-
-namespace {
-
-//===----------------------------------------------------------------------===//
-// CopyRemovalPass
-//===----------------------------------------------------------------------===//
-
-/// This pass removes the redundant Copy operations. Additionally, it
-/// removes the leftover definition and deallocation operations by erasing the
-/// copy operation.
-class CopyRemovalPass : public PassWrapper<CopyRemovalPass, OperationPass<>> {
-public:
- void runOnOperation() override {
- getOperation()->walk([&](CopyOpInterface copyOp) {
- reuseCopySourceAsTarget(copyOp);
- reuseCopyTargetAsSource(copyOp);
- });
- for (std::pair<Value, Value> &pair : replaceList)
- pair.first.replaceAllUsesWith(pair.second);
- for (Operation *op : eraseList)
- op->erase();
- }
-
-private:
- /// List of operations that need to be removed.
- llvm::SmallPtrSet<Operation *, 4> eraseList;
-
- /// List of values that need to be replaced with their counterparts.
- llvm::SmallDenseSet<std::pair<Value, Value>, 4> replaceList;
-
- /// Returns the allocation operation for `value` in `block` if it exists.
- /// nullptr otherwise.
- Operation *getAllocationOpInBlock(Value value, Block *block) {
- assert(block && "Block cannot be null");
- Operation *op = value.getDefiningOp();
- if (op && op->getBlock() == block) {
- auto effects = dyn_cast<MemoryEffectOpInterface>(op);
- if (effects && effects.hasEffect<Allocate>())
- return op;
- }
- return nullptr;
- }
-
- /// Returns the deallocation operation for `value` in `block` if it exists.
- /// nullptr otherwise.
- Operation *getDeallocationOpInBlock(Value value, Block *block) {
- assert(block && "Block cannot be null");
- auto valueUsers = value.getUsers();
- auto it = llvm::find_if(valueUsers, [&](Operation *op) {
- auto effects = dyn_cast<MemoryEffectOpInterface>(op);
- return effects && op->getBlock() == block && effects.hasEffect<Free>();
- });
- return (it == valueUsers.end() ? nullptr : *it);
- }
-
- /// Returns true if any operation between `start` and `end` (exclusive) has
- /// a memory effect.
- bool hasMemoryEffectOpBetween(Operation *start, Operation *end) {
- assert(start && end && "Start and end operations cannot be null");
- assert(start->getBlock() == end->getBlock() &&
- "Start and end operations should be in the same block.");
- Operation *op = start->getNextNode();
- while (op->isBeforeInBlock(end)) {
- if (isa<MemoryEffectOpInterface>(op))
- return true;
- op = op->getNextNode();
- }
- return false;
- };
-
- /// Returns true if `val` has at least one user between the `start` and
- /// `end` operations.
- bool hasUsersBetween(Value val, Operation *start, Operation *end) {
- assert(start && end && "Start and end operations cannot be null");
- Block *block = start->getBlock();
- assert(block == end->getBlock() &&
- "Start and end operations should be in the same block.");
- return llvm::any_of(val.getUsers(), [&](Operation *op) {
- return op->getBlock() == block && start->isBeforeInBlock(op) &&
- op->isBeforeInBlock(end);
- });
- };
-
- bool areOpsInTheSameBlock(ArrayRef<Operation *> operations) {
- assert(!operations.empty() &&
- "The operations list should contain at least a single operation");
- Block *block = operations.front()->getBlock();
- return llvm::none_of(
- operations, [&](Operation *op) { return block != op->getBlock(); });
- }
-
- /// Input:
- /// func(){
- /// %from = alloc()
- /// write_to(%from)
- /// %to = alloc()
- /// copy(%from,%to)
- /// dealloc(%from)
- /// return %to
- /// }
- ///
- /// Output:
- /// func(){
- /// %from = alloc()
- /// write_to(%from)
- /// return %from
- /// }
- /// Constraints:
- /// 1) %to, copy and dealloc must all be defined and lie in the same block.
- /// 2) This transformation cannot be applied if there is any user/alias of
- /// the `to` value between the defining operation of `to` and the copy
- /// operation.
- /// 3) This transformation cannot be applied if there is any user/alias of
- /// the `from` value between the copy operation and the deallocation of `from`.
- /// TODO: Alias analysis is not available at the moment. Currently, we check
- /// if there are any operations with memory effects between copy and
- /// deallocation operations.
- void reuseCopySourceAsTarget(CopyOpInterface copyOp) {
- if (eraseList.count(copyOp))
- return;
-
- Value from = copyOp.getSource();
- Value to = copyOp.getTarget();
-
- Operation *copy = copyOp.getOperation();
- Block *copyBlock = copy->getBlock();
- Operation *fromDefiningOp = from.getDefiningOp();
- Operation *fromFreeingOp = getDeallocationOpInBlock(from, copyBlock);
- Operation *toDefiningOp = getAllocationOpInBlock(to, copyBlock);
- if (!fromDefiningOp || !fromFreeingOp || !toDefiningOp ||
- !areOpsInTheSameBlock({fromFreeingOp, toDefiningOp, copy}) ||
- hasUsersBetween(to, toDefiningOp, copy) ||
- hasUsersBetween(from, copy, fromFreeingOp) ||
- hasMemoryEffectOpBetween(copy, fromFreeingOp))
- return;
-
- replaceList.insert({to, from});
- eraseList.insert(copy);
- eraseList.insert(toDefiningOp);
- eraseList.insert(fromFreeingOp);
- }
-
- /// Input:
- /// func(){
- /// %to = alloc()
- /// %from = alloc()
- /// write_to(%from)
- /// copy(%from,%to)
- /// dealloc(%from)
- /// return %to
- /// }
- ///
- /// Output:
- /// func(){
- /// %to = alloc()
- /// write_to(%to)
- /// return %to
- /// }
- /// Constraints:
- /// 1) %from, copy and dealloc must all be defined and lie in the same block.
- /// 2) This transformation cannot be applied if there is any user/alias of
- /// the `to` value between the defining operation of `from` and the copy
- /// operation.
- /// 3) This transformation cannot be applied if there is any user/alias of
- /// the `from` value between the copy operation and the deallocation of `from`.
- /// TODO: Alias analysis is not available at the moment. Currently, we check
- /// if there are any operations with memory effects between copy and
- /// deallocation operations.
- void reuseCopyTargetAsSource(CopyOpInterface copyOp) {
- if (eraseList.count(copyOp))
- return;
-
- Value from = copyOp.getSource();
- Value to = copyOp.getTarget();
-
- Operation *copy = copyOp.getOperation();
- Block *copyBlock = copy->getBlock();
- Operation *fromDefiningOp = getAllocationOpInBlock(from, copyBlock);
- Operation *fromFreeingOp = getDeallocationOpInBlock(from, copyBlock);
- if (!fromDefiningOp || !fromFreeingOp ||
- !areOpsInTheSameBlock({fromFreeingOp, fromDefiningOp, copy}) ||
- hasUsersBetween(to, fromDefiningOp, copy) ||
- hasUsersBetween(from, copy, fromFreeingOp) ||
- hasMemoryEffectOpBetween(copy, fromFreeingOp))
- return;
-
- replaceList.insert({from, to});
- eraseList.insert(copy);
- eraseList.insert(fromDefiningOp);
- eraseList.insert(fromFreeingOp);
- }
-};
-
-} // end anonymous namespace
-
-//===----------------------------------------------------------------------===//
-// CopyRemovalPass construction
-//===----------------------------------------------------------------------===//
-
-std::unique_ptr<Pass> mlir::createCopyRemovalPass() {
- return std::make_unique<CopyRemovalPass>();
-}
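For context, here is the core redundancy the deleted pass eliminated, distilled into MLIR from the `@simple_test` case in the removed test file further down (the function and value names here are illustrative):

```mlir
// Before copy removal: %temp receives its contents only through the copy.
func @simple() -> memref<5xf32> {
  %temp = memref.alloc() : memref<5xf32>
  %ret = memref.alloc() : memref<5xf32>
  linalg.copy(%ret, %temp) : memref<5xf32>, memref<5xf32>
  memref.dealloc %ret : memref<5xf32>
  return %temp : memref<5xf32>
}

// After copy removal: the copy, the extra alloc, and the dealloc are erased,
// and all uses of %temp are replaced by %ret.
func @simple() -> memref<5xf32> {
  %ret = memref.alloc() : memref<5xf32>
  return %ret : memref<5xf32>
}
```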
diff --git a/mlir/test/Transforms/buffer-deallocation.mlir b/mlir/test/Transforms/buffer-deallocation.mlir
index 25197d14fba77..35f7bbf79c8f5 100644
--- a/mlir/test/Transforms/buffer-deallocation.mlir
+++ b/mlir/test/Transforms/buffer-deallocation.mlir
@@ -30,13 +30,11 @@ func @condBranch(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) {
}
// CHECK-NEXT: cond_br
-// CHECK: %[[ALLOC0:.*]] = memref.alloc()
-// CHECK-NEXT: linalg.copy
+// CHECK: %[[ALLOC0:.*]] = memref.clone
// CHECK-NEXT: br ^bb3(%[[ALLOC0]]
-// CHECK: %[[ALLOC1:.*]] = memref.alloc()
+// CHECK: %[[ALLOC1:.*]] = memref.alloc
// CHECK-NEXT: test.buffer_based
-// CHECK: %[[ALLOC2:.*]] = memref.alloc()
-// CHECK-NEXT: linalg.copy
+// CHECK-NEXT: %[[ALLOC2:.*]] = memref.clone %[[ALLOC1]]
// CHECK-NEXT: memref.dealloc %[[ALLOC1]]
// CHECK-NEXT: br ^bb3(%[[ALLOC2]]
// CHECK: test.copy
@@ -77,16 +75,12 @@ func @condBranchDynamicType(
}
// CHECK-NEXT: cond_br
-// CHECK: %[[DIM0:.*]] = memref.dim
-// CHECK-NEXT: %[[ALLOC0:.*]] = memref.alloc(%[[DIM0]])
-// CHECK-NEXT: linalg.copy(%{{.*}}, %[[ALLOC0]])
+// CHECK: %[[ALLOC0:.*]] = memref.clone
// CHECK-NEXT: br ^bb3(%[[ALLOC0]]
// CHECK: ^bb2(%[[IDX:.*]]:{{.*}})
// CHECK-NEXT: %[[ALLOC1:.*]] = memref.alloc(%[[IDX]])
// CHECK-NEXT: test.buffer_based
-// CHECK: %[[DIM1:.*]] = memref.dim %[[ALLOC1]]
-// CHECK-NEXT: %[[ALLOC2:.*]] = memref.alloc(%[[DIM1]])
-// CHECK-NEXT: linalg.copy(%[[ALLOC1]], %[[ALLOC2]])
+// CHECK-NEXT: %[[ALLOC2:.*]] = memref.clone
// CHECK-NEXT: memref.dealloc %[[ALLOC1]]
// CHECK-NEXT: br ^bb3
// CHECK-NEXT: ^bb3(%[[ALLOC3:.*]]:{{.*}})
@@ -142,12 +136,10 @@ func @condBranchDynamicTypeNested(
return
}
-// CHECK-NEXT: cond_br
-// CHECK: ^bb1
-// CHECK: %[[DIM0:.*]] = memref.dim
-// CHECK-NEXT: %[[ALLOC0:.*]] = memref.alloc(%[[DIM0]])
-// CHECK-NEXT: linalg.copy(%{{.*}}, %[[ALLOC0]])
-// CHECK-NEXT: br ^bb6
+// CHECK-NEXT: cond_br{{.*}}
+// CHECK-NEXT: ^bb1
+// CHECK-NEXT: %[[ALLOC0:.*]] = memref.clone
+// CHECK-NEXT: br ^bb6(%[[ALLOC0]]
// CHECK: ^bb2(%[[IDX:.*]]:{{.*}})
// CHECK-NEXT: %[[ALLOC1:.*]] = memref.alloc(%[[IDX]])
// CHECK-NEXT: test.buffer_based
@@ -157,9 +149,7 @@ func @condBranchDynamicTypeNested(
// CHECK: ^bb4:
// CHECK-NEXT: br ^bb5(%[[ALLOC1]]{{.*}})
// CHECK-NEXT: ^bb5(%[[ALLOC2:.*]]:{{.*}})
-// CHECK: %[[DIM2:.*]] = memref.dim %[[ALLOC2]]
-// CHECK-NEXT: %[[ALLOC3:.*]] = memref.alloc(%[[DIM2]])
-// CHECK-NEXT: linalg.copy(%[[ALLOC2]], %[[ALLOC3]])
+// CHECK-NEXT: %[[ALLOC3:.*]] = memref.clone %[[ALLOC2]]
// CHECK-NEXT: memref.dealloc %[[ALLOC1]]
// CHECK-NEXT: br ^bb6(%[[ALLOC3]]{{.*}})
// CHECK-NEXT: ^bb6(%[[ALLOC4:.*]]:{{.*}})
@@ -208,13 +198,11 @@ func @criticalEdge(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) {
return
}
-// CHECK-NEXT: %[[ALLOC0:.*]] = memref.alloc()
-// CHECK-NEXT: linalg.copy
+// CHECK-NEXT: %[[ALLOC0:.*]] = memref.clone
// CHECK-NEXT: cond_br
// CHECK: %[[ALLOC1:.*]] = memref.alloc()
// CHECK-NEXT: test.buffer_based
-// CHECK: %[[ALLOC2:.*]] = memref.alloc()
-// CHECK-NEXT: linalg.copy
+// CHECK-NEXT: %[[ALLOC2:.*]] = memref.clone %[[ALLOC1]]
// CHECK-NEXT: memref.dealloc %[[ALLOC1]]
// CHECK: test.copy
// CHECK-NEXT: memref.dealloc
@@ -419,20 +407,17 @@ func @moving_alloc_and_inserting_missing_dealloc(
return
}
-// CHECK-NEXT: cond_br
-// CHECK: ^bb1
-// CHECK: ^bb1
+// CHECK-NEXT: cond_br{{.*}}
+// CHECK-NEXT: ^bb1
// CHECK: %[[ALLOC0:.*]] = memref.alloc()
// CHECK-NEXT: test.buffer_based
-// CHECK: %[[ALLOC1:.*]] = memref.alloc()
-// CHECK-NEXT: linalg.copy
+// CHECK-NEXT: %[[ALLOC1:.*]] = memref.clone %[[ALLOC0]]
// CHECK-NEXT: memref.dealloc %[[ALLOC0]]
// CHECK-NEXT: br ^bb3(%[[ALLOC1]]
// CHECK-NEXT: ^bb2
// CHECK-NEXT: %[[ALLOC2:.*]] = memref.alloc()
// CHECK-NEXT: test.buffer_based
-// CHECK: %[[ALLOC3:.*]] = memref.alloc()
-// CHECK-NEXT: linalg.copy
+// CHECK-NEXT: %[[ALLOC3:.*]] = memref.clone %[[ALLOC2]]
// CHECK-NEXT: memref.dealloc %[[ALLOC2]]
// CHECK-NEXT: br ^bb3(%[[ALLOC3]]
// CHECK-NEXT: ^bb3(%[[ALLOC4:.*]]:{{.*}})
@@ -545,8 +530,7 @@ func @nested_regions_and_cond_branch(
}
// CHECK: (%[[cond:.*]]: {{.*}}, %[[ARG1:.*]]: {{.*}}, %{{.*}}: {{.*}})
// CHECK-NEXT: cond_br %[[cond]], ^[[BB1:.*]], ^[[BB2:.*]]
-// CHECK: %[[ALLOC0:.*]] = memref.alloc()
-// CHECK-NEXT: linalg.copy(%[[ARG1]], %[[ALLOC0]])
+// CHECK: %[[ALLOC0:.*]] = memref.clone %[[ARG1]]
// CHECK: ^[[BB2]]:
// CHECK: %[[ALLOC1:.*]] = memref.alloc()
// CHECK-NEXT: test.region_buffer_based in(%[[ARG1]]{{.*}}out(%[[ALLOC1]]
@@ -554,12 +538,11 @@ func @nested_regions_and_cond_branch(
// CHECK-NEXT: test.buffer_based in(%[[ARG1]]{{.*}}out(%[[ALLOC2]]
// CHECK: memref.dealloc %[[ALLOC2]]
// CHECK-NEXT: %{{.*}} = math.exp
-// CHECK: %[[ALLOC3:.*]] = memref.alloc()
-// CHECK-NEXT: linalg.copy(%[[ALLOC1]], %[[ALLOC3]])
+// CHECK: %[[ALLOC3:.*]] = memref.clone %[[ALLOC1]]
// CHECK-NEXT: memref.dealloc %[[ALLOC1]]
// CHECK: ^[[BB3:.*]]({{.*}}):
// CHECK: test.copy
-// CHECK-NEXT: dealloc
+// CHECK-NEXT: memref.dealloc
// -----
@@ -641,12 +624,10 @@ func @nested_region_control_flow_div(
// CHECK: %[[ALLOC0:.*]] = memref.alloc(%arg0, %arg0)
// CHECK-NEXT: %[[ALLOC1:.*]] = scf.if
-// CHECK: %[[ALLOC2:.*]] = memref.alloc
-// CHECK-NEXT: linalg.copy(%[[ALLOC0]], %[[ALLOC2]])
+// CHECK-NEXT: %[[ALLOC2:.*]] = memref.clone %[[ALLOC0]]
// CHECK: scf.yield %[[ALLOC2]]
// CHECK: %[[ALLOC3:.*]] = memref.alloc(%arg0, %arg1)
-// CHECK: %[[ALLOC4:.*]] = memref.alloc
-// CHECK-NEXT: linalg.copy(%[[ALLOC3]], %[[ALLOC4]])
+// CHECK-NEXT: %[[ALLOC4:.*]] = memref.clone %[[ALLOC3]]
// CHECK: memref.dealloc %[[ALLOC3]]
// CHECK: scf.yield %[[ALLOC4]]
// CHECK: memref.dealloc %[[ALLOC0]]
@@ -823,20 +804,18 @@ func @nestedRegionsAndCondBranchAlloca(
// CHECK: (%[[cond:.*]]: {{.*}}, %[[ARG1:.*]]: {{.*}}, %{{.*}}: {{.*}})
// CHECK-NEXT: cond_br %[[cond]], ^[[BB1:.*]], ^[[BB2:.*]]
// CHECK: ^[[BB1]]:
-// CHECK: %[[ALLOC0:.*]] = memref.alloc()
-// CHECK-NEXT: linalg.copy
+// CHECK: %[[ALLOC0:.*]] = memref.clone
// CHECK: ^[[BB2]]:
// CHECK: %[[ALLOC1:.*]] = memref.alloc()
// CHECK-NEXT: test.region_buffer_based in(%[[ARG1]]{{.*}}out(%[[ALLOC1]]
// CHECK: %[[ALLOCA:.*]] = memref.alloca()
// CHECK-NEXT: test.buffer_based in(%[[ARG1]]{{.*}}out(%[[ALLOCA]]
// CHECK: %{{.*}} = math.exp
-// CHECK: %[[ALLOC2:.*]] = memref.alloc()
-// CHECK-NEXT: linalg.copy
+// CHECK: %[[ALLOC2:.*]] = memref.clone %[[ALLOC1]]
// CHECK-NEXT: memref.dealloc %[[ALLOC1]]
// CHECK: ^[[BB3:.*]]({{.*}}):
// CHECK: test.copy
-// CHECK-NEXT: dealloc
+// CHECK-NEXT: memref.dealloc
// -----
@@ -888,15 +867,13 @@ func @loop_alloc(
// CHECK: %[[ALLOC0:.*]] = memref.alloc()
// CHECK-NEXT: memref.dealloc %[[ALLOC0]]
-// CHECK-NEXT: %[[ALLOC1:.*]] = memref.alloc()
-// CHECK: linalg.copy(%arg3, %[[ALLOC1]])
+// CHECK-NEXT: %[[ALLOC1:.*]] = memref.clone %arg3
// CHECK: %[[ALLOC2:.*]] = scf.for {{.*}} iter_args
// CHECK-SAME: (%[[IALLOC:.*]] = %[[ALLOC1]]
// CHECK: cmpi
// CHECK: memref.dealloc %[[IALLOC]]
// CHECK: %[[ALLOC3:.*]] = memref.alloc()
-// CHECK: %[[ALLOC4:.*]] = memref.alloc()
-// CHECK: linalg.copy(%[[ALLOC3]], %[[ALLOC4]])
+// CHECK: %[[ALLOC4:.*]] = memref.clone %[[ALLOC3]]
// CHECK: memref.dealloc %[[ALLOC3]]
// CHECK: scf.yield %[[ALLOC4]]
// CHECK: }
@@ -974,25 +951,21 @@ func @loop_nested_if_alloc(
}
// CHECK: %[[ALLOC0:.*]] = memref.alloc()
-// CHECK: %[[ALLOC1:.*]] = memref.alloc()
-// CHECK-NEXT: linalg.copy(%arg3, %[[ALLOC1]])
+// CHECK-NEXT: %[[ALLOC1:.*]] = memref.clone %arg3
// CHECK-NEXT: %[[ALLOC2:.*]] = scf.for {{.*}} iter_args
// CHECK-SAME: (%[[IALLOC:.*]] = %[[ALLOC1]]
// CHECK: memref.dealloc %[[IALLOC]]
// CHECK: %[[ALLOC3:.*]] = scf.if
// CHECK: %[[ALLOC4:.*]] = memref.alloc()
-// CHECK-NEXT: %[[ALLOC5:.*]] = memref.alloc()
-// CHECK-NEXT: linalg.copy(%[[ALLOC4]], %[[ALLOC5]])
+// CHECK-NEXT: %[[ALLOC5:.*]] = memref.clone %[[ALLOC4]]
// CHECK-NEXT: memref.dealloc %[[ALLOC4]]
// CHECK-NEXT: scf.yield %[[ALLOC5]]
-// CHECK: %[[ALLOC6:.*]] = memref.alloc()
-// CHECK-NEXT: linalg.copy(%[[ALLOC0]], %[[ALLOC6]])
+// CHECK: %[[ALLOC6:.*]] = memref.clone %[[ALLOC0]]
// CHECK-NEXT: scf.yield %[[ALLOC6]]
-// CHECK: %[[ALLOC7:.*]] = memref.alloc()
-// CHECK-NEXT: linalg.copy(%[[ALLOC3:.*]], %[[ALLOC7]])
+// CHECK: %[[ALLOC7:.*]] = memref.clone %[[ALLOC3]]
// CHECK-NEXT: memref.dealloc %[[ALLOC3]]
// CHECK-NEXT: scf.yield %[[ALLOC7]]
@@ -1040,17 +1013,14 @@ func @loop_nested_alloc(
// CHECK: %[[ALLOC0:.*]] = memref.alloc()
// CHECK-NEXT: memref.dealloc %[[ALLOC0]]
-// CHECK-NEXT: %[[ALLOC1:.*]] = memref.alloc()
-// CHECK-NEXT: linalg.copy(%arg3, %[[ALLOC1]])
+// CHECK-NEXT: %[[ALLOC1:.*]] = memref.clone %arg3
// CHECK-NEXT: %[[VAL_7:.*]] = scf.for {{.*}} iter_args
// CHECK-SAME: (%[[IALLOC0:.*]] = %[[ALLOC1]])
-// CHECK: %[[ALLOC2:.*]] = memref.alloc()
-// CHECK-NEXT: linalg.copy(%[[IALLOC0]], %[[ALLOC2]])
+// CHECK-NEXT: %[[ALLOC2:.*]] = memref.clone %[[IALLOC0]]
// CHECK-NEXT: memref.dealloc %[[IALLOC0]]
// CHECK-NEXT: %[[ALLOC3:.*]] = scf.for {{.*}} iter_args
// CHECK-SAME: (%[[IALLOC1:.*]] = %[[ALLOC2]])
-// CHECK: %[[ALLOC5:.*]] = memref.alloc()
-// CHECK-NEXT: linalg.copy(%[[IALLOC1]], %[[ALLOC5]])
+// CHECK-NEXT: %[[ALLOC5:.*]] = memref.clone %[[IALLOC1]]
// CHECK-NEXT: memref.dealloc %[[IALLOC1]]
// CHECK: %[[ALLOC6:.*]] = scf.for {{.*}} iter_args
@@ -1060,28 +1030,23 @@ func @loop_nested_alloc(
// CHECK: %[[ALLOC9:.*]] = scf.if
// CHECK: %[[ALLOC11:.*]] = memref.alloc()
-// CHECK-NEXT: %[[ALLOC12:.*]] = memref.alloc()
-// CHECK-NEXT: linalg.copy(%[[ALLOC11]], %[[ALLOC12]])
+// CHECK-NEXT: %[[ALLOC12:.*]] = memref.clone %[[ALLOC11]]
// CHECK-NEXT: memref.dealloc %[[ALLOC11]]
// CHECK-NEXT: scf.yield %[[ALLOC12]]
-// CHECK: %[[ALLOC13:.*]] = memref.alloc()
-// CHECK-NEXT: linalg.copy(%[[IALLOC2]], %[[ALLOC13]])
+// CHECK: %[[ALLOC13:.*]] = memref.clone %[[IALLOC2]]
// CHECK-NEXT: scf.yield %[[ALLOC13]]
// CHECK: memref.dealloc %[[IALLOC2]]
-// CHECK-NEXT: %[[ALLOC10:.*]] = memref.alloc()
-// CHECK-NEXT: linalg.copy(%[[ALLOC9]], %[[ALLOC10]])
+// CHECK-NEXT: %[[ALLOC10:.*]] = memref.clone %[[ALLOC9]]
// CHECK-NEXT: memref.dealloc %[[ALLOC9]]
// CHECK-NEXT: scf.yield %[[ALLOC10]]
-// CHECK: %[[ALLOC7:.*]] = memref.alloc()
-// CHECK-NEXT: linalg.copy(%[[ALLOC6]], %[[ALLOC7]])
+// CHECK: %[[ALLOC7:.*]] = memref.clone %[[ALLOC6]]
// CHECK-NEXT: memref.dealloc %[[ALLOC6]]
// CHECK-NEXT: scf.yield %[[ALLOC7]]
-// CHECK: %[[ALLOC4:.*]] = memref.alloc()
-// CHECK-NEXT: linalg.copy(%[[ALLOC3]], %[[ALLOC4]])
+// CHECK: %[[ALLOC4:.*]] = memref.clone %[[ALLOC3]]
// CHECK-NEXT: memref.dealloc %[[ALLOC3]]
// CHECK-NEXT: scf.yield %[[ALLOC4]]
@@ -1183,8 +1148,7 @@ func @assumingOp(
// CHECK-NEXT: shape.assuming_yield %[[ARG1]]
// CHECK: %[[ASSUMING_RESULT:.*]] = shape.assuming %[[ARG0]]
// CHECK-NEXT: %[[TMP_ALLOC:.*]] = memref.alloc()
-// CHECK-NEXT: %[[RETURNING_ALLOC:.*]] = memref.alloc()
-// CHECK-NEXT: linalg.copy(%[[TMP_ALLOC]], %[[RETURNING_ALLOC]])
+// CHECK-NEXT: %[[RETURNING_ALLOC:.*]] = memref.clone %[[TMP_ALLOC]]
// CHECK-NEXT: memref.dealloc %[[TMP_ALLOC]]
// CHECK-NEXT: shape.assuming_yield %[[RETURNING_ALLOC]]
// CHECK: test.copy(%[[ASSUMING_RESULT:.*]], %[[ARG2]])
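The test updates above all follow one pattern: where BufferDeallocation previously materialized a fresh allocation plus a linalg.copy (preceded by memref.dim operations for dynamically shaped buffers), it now emits a single memref.clone. A sketch of the rewrite, with `%src` standing for an arbitrary source buffer and `%c0` an index constant:

```mlir
// Previously emitted:
%d = memref.dim %src, %c0 : memref<?xf32>
%copy = memref.alloc(%d) : memref<?xf32>
linalg.copy(%src, %copy) : memref<?xf32>, memref<?xf32>

// Now emitted instead:
%copy = memref.clone %src : memref<?xf32> to memref<?xf32>
```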
diff --git a/mlir/test/Transforms/canonicalize.mlir b/mlir/test/Transforms/canonicalize.mlir
index e1869ac58f524..e54135f21b4bd 100644
--- a/mlir/test/Transforms/canonicalize.mlir
+++ b/mlir/test/Transforms/canonicalize.mlir
@@ -1120,3 +1120,87 @@ func @fold_trunci_sexti(%arg0: i1) -> i1 attributes {} {
%1 = trunci %0 : i8 to i1
return %1 : i1
}
+
+// CHECK-LABEL: func @simple_clone_elimination
+func @simple_clone_elimination() -> memref<5xf32> {
+ %ret = memref.alloc() : memref<5xf32>
+ %temp = memref.clone %ret : memref<5xf32> to memref<5xf32>
+ memref.dealloc %temp : memref<5xf32>
+ return %ret : memref<5xf32>
+}
+// CHECK-NEXT: %[[ret:.*]] = memref.alloc()
+// CHECK-NOT: %[[temp:.*]] = memref.clone
+// CHECK-NOT: memref.dealloc %[[temp]]
+// CHECK: return %[[ret]]
+
+// -----
+
+// CHECK-LABEL: func @clone_loop_alloc
+func @clone_loop_alloc(%arg0: index, %arg1: index, %arg2: index, %arg3: memref<2xf32>, %arg4: memref<2xf32>) {
+ %0 = memref.alloc() : memref<2xf32>
+ memref.dealloc %0 : memref<2xf32>
+ %1 = memref.clone %arg3 : memref<2xf32> to memref<2xf32>
+ %2 = scf.for %arg5 = %arg0 to %arg1 step %arg2 iter_args(%arg6 = %1) -> (memref<2xf32>) {
+ %3 = cmpi eq, %arg5, %arg1 : index
+ memref.dealloc %arg6 : memref<2xf32>
+ %4 = memref.alloc() : memref<2xf32>
+ %5 = memref.clone %4 : memref<2xf32> to memref<2xf32>
+ memref.dealloc %4 : memref<2xf32>
+ %6 = memref.clone %5 : memref<2xf32> to memref<2xf32>
+ memref.dealloc %5 : memref<2xf32>
+ scf.yield %6 : memref<2xf32>
+ }
+ linalg.copy(%2, %arg4) : memref<2xf32>, memref<2xf32>
+ memref.dealloc %2 : memref<2xf32>
+ return
+}
+
+// CHECK-NEXT: %[[ALLOC0:.*]] = memref.clone
+// CHECK-NEXT: %[[ALLOC1:.*]] = scf.for
+// CHECK-NEXT: memref.dealloc
+// CHECK-NEXT: %[[ALLOC2:.*]] = memref.alloc
+// CHECK-NEXT: scf.yield %[[ALLOC2]]
+// CHECK: linalg.copy(%[[ALLOC1]]
+// CHECK-NEXT: memref.dealloc %[[ALLOC1]]
+
+// -----
+
+// CHECK-LABEL: func @clone_nested_region
+func @clone_nested_region(%arg0: index, %arg1: index) -> memref<?x?xf32> {
+ %0 = cmpi eq, %arg0, %arg1 : index
+ %1 = memref.alloc(%arg0, %arg0) : memref<?x?xf32>
+ %2 = scf.if %0 -> (memref<?x?xf32>) {
+ %3 = scf.if %0 -> (memref<?x?xf32>) {
+ %9 = memref.clone %1 : memref<?x?xf32> to memref<?x?xf32>
+ scf.yield %9 : memref<?x?xf32>
+ } else {
+ %7 = memref.alloc(%arg0, %arg1) : memref<?x?xf32>
+ %10 = memref.clone %7 : memref<?x?xf32> to memref<?x?xf32>
+ memref.dealloc %7 : memref<?x?xf32>
+ scf.yield %10 : memref<?x?xf32>
+ }
+ %6 = memref.clone %3 : memref<?x?xf32> to memref<?x?xf32>
+ memref.dealloc %3 : memref<?x?xf32>
+ scf.yield %6 : memref<?x?xf32>
+ } else {
+ %3 = memref.alloc(%arg1, %arg1) : memref<?x?xf32>
+ %6 = memref.clone %3 : memref<?x?xf32> to memref<?x?xf32>
+ memref.dealloc %3 : memref<?x?xf32>
+ scf.yield %6 : memref<?x?xf32>
+ }
+ memref.dealloc %1 : memref<?x?xf32>
+ return %2 : memref<?x?xf32>
+}
+
+// CHECK: %[[ALLOC1:.*]] = memref.alloc
+// CHECK-NEXT: %[[ALLOC2:.*]] = scf.if
+// CHECK-NEXT: %[[ALLOC3_1:.*]] = scf.if
+// CHECK-NEXT: %[[ALLOC4_1:.*]] = memref.clone %[[ALLOC1]]
+// CHECK-NEXT: scf.yield %[[ALLOC4_1]]
+// CHECK: %[[ALLOC4_2:.*]] = memref.alloc
+// CHECK-NEXT: scf.yield %[[ALLOC4_2]]
+// CHECK: scf.yield %[[ALLOC3_1]]
+// CHECK: %[[ALLOC3_2:.*]] = memref.alloc
+// CHECK-NEXT: scf.yield %[[ALLOC3_2]]
+// CHECK: memref.dealloc %[[ALLOC1]]
+// CHECK-NEXT: return %[[ALLOC2]]
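The canonicalization these new tests exercise folds a clone that is immediately deallocated: the clone/dealloc pair disappears and any uses of the clone are redirected to the source, as in this minimal form taken from `@simple_clone_elimination`:

```mlir
%ret = memref.alloc() : memref<5xf32>
%temp = memref.clone %ret : memref<5xf32> to memref<5xf32>
memref.dealloc %temp : memref<5xf32>
// Canonicalizes to just the memref.alloc; %temp and its dealloc vanish.
```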
diff --git a/mlir/test/Transforms/copy-removal.mlir b/mlir/test/Transforms/copy-removal.mlir
deleted file mode 100644
index a91c5c2b95287..0000000000000
--- a/mlir/test/Transforms/copy-removal.mlir
+++ /dev/null
@@ -1,361 +0,0 @@
-// RUN: mlir-opt -copy-removal -split-input-file %s | FileCheck %s
-
-// All linalg copies except linalg.copy(%1, %9) must be removed; that one stays
-// because the defining operation of %1 and its DeallocOp are in another block.
-
-// CHECK-LABEL: func @nested_region_control_flow_div_nested
-func @nested_region_control_flow_div_nested(%arg0: index, %arg1: index) -> memref<?x?xf32> {
- %0 = cmpi eq, %arg0, %arg1 : index
- %1 = memref.alloc(%arg0, %arg0) : memref<?x?xf32>
- // CHECK: %{{.*}} = scf.if
- %2 = scf.if %0 -> (memref<?x?xf32>) {
- // CHECK: %[[PERCENT3:.*]] = scf.if
- %3 = scf.if %0 -> (memref<?x?xf32>) {
- %c0_0 = constant 0 : index
- %7 = memref.dim %1, %c0_0 : memref<?x?xf32>
- %c1_1 = constant 1 : index
- %8 = memref.dim %1, %c1_1 : memref<?x?xf32>
- %9 = memref.alloc(%7, %8) : memref<?x?xf32>
- // CHECK: linalg.copy({{.*}}, %[[PERCENT9:.*]])
- linalg.copy(%1, %9) : memref<?x?xf32>, memref<?x?xf32>
- // CHECK: scf.yield %[[PERCENT9]]
- scf.yield %9 : memref<?x?xf32>
- } else {
- // CHECK: %[[PERCENT7:.*]] = memref.alloc
- %7 = memref.alloc(%arg0, %arg1) : memref<?x?xf32>
- %c0_0 = constant 0 : index
- %8 = memref.dim %7, %c0_0 : memref<?x?xf32>
- %c1_1 = constant 1 : index
- %9 = memref.dim %7, %c1_1 : memref<?x?xf32>
- // CHECK-NOT: %{{.*}} = memref.alloc
- // CHECK-NOT: linalg.copy(%[[PERCENT7]], %{{.*}})
- // CHECK-NOT: memref.dealloc %[[PERCENT7]]
- %10 = memref.alloc(%8, %9) : memref<?x?xf32>
- linalg.copy(%7, %10) : memref<?x?xf32>, memref<?x?xf32>
- memref.dealloc %7 : memref<?x?xf32>
- // CHECK: scf.yield %[[PERCENT7]]
- scf.yield %10 : memref<?x?xf32>
- }
- %c0 = constant 0 : index
- %4 = memref.dim %3, %c0 : memref<?x?xf32>
- %c1 = constant 1 : index
- %5 = memref.dim %3, %c1 : memref<?x?xf32>
- // CHECK-NOT: %{{.*}} = memref.alloc
- // CHECK-NOT: linalg.copy(%[[PERCENT3]], %{{.*}})
- // CHECK-NOT: memref.dealloc %[[PERCENT3]]
- %6 = memref.alloc(%4, %5) : memref<?x?xf32>
- linalg.copy(%3, %6) : memref<?x?xf32>, memref<?x?xf32>
- memref.dealloc %3 : memref<?x?xf32>
- // CHECK: scf.yield %[[PERCENT3]]
- scf.yield %6 : memref<?x?xf32>
- } else {
- // CHECK: %[[PERCENT3:.*]] = memref.alloc
- %3 = memref.alloc(%arg1, %arg1) : memref<?x?xf32>
- %c0 = constant 0 : index
- %4 = memref.dim %3, %c0 : memref<?x?xf32>
- %c1 = constant 1 : index
- %5 = memref.dim %3, %c1 : memref<?x?xf32>
- // CHECK-NOT: %{{.*}} = memref.alloc
- // CHECK-NOT: linalg.copy(%[[PERCENT3]], %{{.*}})
- // CHECK-NOT: memref.dealloc %[[PERCENT3]]
- %6 = memref.alloc(%4, %5) : memref<?x?xf32>
- linalg.copy(%3, %6) : memref<?x?xf32>, memref<?x?xf32>
- memref.dealloc %3 : memref<?x?xf32>
- // CHECK: scf.yield %[[PERCENT3]]
- scf.yield %6 : memref<?x?xf32>
- }
- memref.dealloc %1 : memref<?x?xf32>
- return %2 : memref<?x?xf32>
-}
-
-// -----
-
-// CHECK-LABEL: func @simple_test
-func @simple_test() -> memref<5xf32> {
- %temp = memref.alloc() : memref<5xf32>
- %ret = memref.alloc() : memref<5xf32>
- linalg.copy(%ret, %temp) : memref<5xf32>, memref<5xf32>
- memref.dealloc %ret : memref<5xf32>
- return %temp : memref<5xf32>
-}
-// CHECK-SAME: () -> memref<5xf32>
-// CHECK-NEXT: %[[ret:.*]] = memref.alloc()
-// CHECK-NOT: linalg.copy(%[[ret]], %{{.*}})
-// CHECK-NOT: memref.dealloc %[[ret]]
-// CHECK: return %[[ret]]
-
-// -----
-
-// It is legal to remove the copy operation even though %ret has a use before
-// the copy operation. The allocation of %temp and the deallocation of %ret
-// should also be removed.
-
-// CHECK-LABEL: func @test_with_ret_usage_before_copy
-func @test_with_ret_usage_before_copy() -> memref<5xf32> {
- %ret = memref.alloc() : memref<5xf32>
- %temp = memref.alloc() : memref<5xf32>
- %c0 = constant 0 : index
- %dimension = memref.dim %ret, %c0 : memref<5xf32>
- linalg.copy(%ret, %temp) : memref<5xf32>, memref<5xf32>
- memref.dealloc %ret : memref<5xf32>
- return %temp : memref<5xf32>
-}
-// CHECK-NEXT: %[[ret:.*]] = memref.alloc()
-// CHECK-NOT: %{{.*}} = memref.alloc
-// CHECK-NEXT: %{{.*}} = constant
-// CHECK-NEXT: %[[DIM:.*]] = memref.dim %[[ret]]
-// CHECK-NOT: linalg.copy(%[[ret]], %{{.*}})
-// CHECK-NOT: memref.dealloc %[[ret]]
-// CHECK: return %[[ret]]
-
-// -----
-
-// It is illegal to remove the copy operation because %ret has a use after the
-// copy operation.
-
-// CHECK-LABEL: func @test_with_ret_usage_after_copy
-func @test_with_ret_usage_after_copy() -> memref<5xf32> {
- %ret = memref.alloc() : memref<5xf32>
- %temp = memref.alloc() : memref<5xf32>
- // CHECK: linalg.copy
- linalg.copy(%ret, %temp) : memref<5xf32>, memref<5xf32>
- %c0 = constant 0 : index
- %dimension = memref.dim %ret, %c0 : memref<5xf32>
- memref.dealloc %ret : memref<5xf32>
- return %temp : memref<5xf32>
-}
-
-// -----
-
-// It is illegal to remove the copy operation because %temp has a use before
-// the copy operation.
-
-// CHECK-LABEL: func @test_with_temp_usage_before_copy
-func @test_with_temp_usage_before_copy() -> memref<5xf32> {
- %ret = memref.alloc() : memref<5xf32>
- %temp = memref.alloc() : memref<5xf32>
- %c0 = constant 0 : index
- %dimension = memref.dim %temp, %c0 : memref<5xf32>
- // CHECK: linalg.copy
- linalg.copy(%ret, %temp) : memref<5xf32>, memref<5xf32>
- memref.dealloc %ret : memref<5xf32>
- return %temp : memref<5xf32>
-}
-
-// -----
-
-// It would be legal to remove the copy operation even though %temp has a use
-// after the copy operation. The allocation of %temp and the deallocation of
-// %ret could then also be removed.
-
-// However the following pattern is not handled by copy removal.
-// %from = memref.alloc()
-// %to = memref.alloc()
-// copy(%from, %to)
-// read_from(%from) + write_to(%something_else)
-// memref.dealloc(%from)
-// return %to
-// In particular, linalg.generic is an op with memory effects sitting between
-// the copy and the dealloc. Since no alias analysis is performed and no
-// distinction is made between reads and writes, it blocks copy removal.
-
-#map0 = affine_map<(d0) -> (d0)>
-
-// CHECK-LABEL: func @test_with_temp_usage_after_copy
-func @test_with_temp_usage_after_copy() -> memref<5xf32> {
- %ret = memref.alloc() : memref<5xf32>
- %res = memref.alloc() : memref<5xf32>
- %temp = memref.alloc() : memref<5xf32>
- linalg.copy(%ret, %temp) : memref<5xf32>, memref<5xf32>
- linalg.generic {
- indexing_maps = [#map0, #map0],
- iterator_types = ["parallel"]}
- ins(%temp : memref<5xf32>)
- outs(%res : memref<5xf32>) {
- ^bb0(%gen1_arg0: f32, %gen1_arg1: f32):
- %tmp1 = math.exp %gen1_arg0 : f32
- linalg.yield %tmp1 : f32
- }
- memref.dealloc %ret : memref<5xf32>
- return %temp : memref<5xf32>
-}
-// CHECK-NEXT: %[[ret:.*]] = memref.alloc()
-// CHECK-NEXT: %[[res:.*]] = memref.alloc()
-// CHECK-NEXT: %[[temp:.*]] = memref.alloc()
-// CHECK-NEXT: linalg.copy(%[[ret]], %[[temp]])
-// CHECK-NEXT: linalg.generic
-// CHECK: memref.dealloc %[[ret]]
-// CHECK: return %[[temp]]
-
-// -----
-
-// CHECK-LABEL: func @make_allocation
-func @make_allocation() -> memref<5xf32> {
- %mem = memref.alloc() : memref<5xf32>
- return %mem : memref<5xf32>
-}
-
-// CHECK-LABEL: func @test_with_function_call
-func @test_with_function_call() -> memref<5xf32> {
- // CHECK-NEXT: %[[ret:.*]] = call @make_allocation() : () -> memref<5xf32>
- %ret = call @make_allocation() : () -> (memref<5xf32>)
- // CHECK-NOT: %{{.*}} = memref.alloc
- // CHECK-NOT: linalg.copy(%[[ret]], %{{.*}})
- // CHECK-NOT: memref.dealloc %[[ret]]
- %temp = memref.alloc() : memref<5xf32>
- linalg.copy(%ret, %temp) : memref<5xf32>, memref<5xf32>
- memref.dealloc %ret : memref<5xf32>
- // CHECK: return %[[ret]]
- return %temp : memref<5xf32>
-}
-
-// -----
-
-// CHECK-LABEL: func @multiple_deallocs_in_different_blocks
-func @multiple_deallocs_in_different_blocks(%cond : i1) -> memref<5xf32> {
- // CHECK-NEXT: %[[PERCENT0:.*]] = memref.alloc()
- %0 = memref.alloc() : memref<5xf32>
- cond_br %cond, ^bb1, ^bb2
-^bb1:
- memref.dealloc %0 : memref<5xf32>
- // CHECK: br ^[[BB3:.*]](%[[PERCENT0]]
- br ^bb3(%0 : memref<5xf32>)
-^bb2:
- // CHECK-NOT: %{{.*}} = memref.alloc
- // CHECK-NOT: linalg.copy(%[[PERCENT0]], %{{.*}})
- // CHECK-NOT: memref.dealloc %[[PERCENT0]]
- %temp = memref.alloc() : memref<5xf32>
- linalg.copy(%0, %temp) : memref<5xf32>, memref<5xf32>
- memref.dealloc %0 : memref<5xf32>
- // CHECK: br ^[[BB3]](%[[PERCENT0]]
- br ^bb3(%temp : memref<5xf32>)
-^bb3(%res : memref<5xf32>):
- return %res : memref<5xf32>
-}
-
-// -----
-
-#map0 = affine_map<(d0) -> (d0)>
-
-// CHECK-LABEL: func @test_ReuseCopyTargetAsSource
-func @test_ReuseCopyTargetAsSource(%arg0: memref<2xf32>, %result: memref<2xf32>){
- // CHECK-SAME: (%[[ARG0:.*]]: memref<2xf32>, %[[RES:.*]]: memref<2xf32>)
- // CHECK-NOT: %{{.*}} = memref.alloc
- %temp = memref.alloc() : memref<2xf32>
- // CHECK-NEXT: linalg.generic
- // CHECK-SAME: ins(%[[ARG0]]{{.*}}outs(%[[RES]]
- // CHECK-NOT: linalg.copy(%{{.*}}, %[[RES]])
- // CHECK-NOT: memref.dealloc %{{.*}}
- linalg.generic {
- indexing_maps = [#map0, #map0],
- iterator_types = ["parallel"]}
- ins(%arg0 : memref<2xf32>)
- outs(%temp : memref<2xf32>) {
- ^bb0(%gen2_arg0: f32, %gen2_arg1: f32):
- %tmp2 = math.exp %gen2_arg0 : f32
- linalg.yield %tmp2 : f32
- }
- linalg.copy(%temp, %result) : memref<2xf32>, memref<2xf32>
- memref.dealloc %temp : memref<2xf32>
- // CHECK: return
- return
-}
-
-// -----
-
-// Copy operation must not be removed since an operation writes to %to value
-// before copy.
-
-#map0 = affine_map<(d0) -> (d0)>
-
-// CHECK-LABEL: func @test_ReuseCopyTargetAsSource
-func @test_ReuseCopyTargetAsSource(%arg0: memref<2xf32>){
- %to = memref.alloc() : memref<2xf32>
- %temp = memref.alloc() : memref<2xf32>
- linalg.generic {
- indexing_maps = [#map0, #map0],
- iterator_types = ["parallel"]}
- ins(%arg0 : memref<2xf32>)
- outs(%temp : memref<2xf32>) {
- ^bb0(%gen1_arg0: f32, %gen1_arg1: f32):
- %tmp1 = math.exp %gen1_arg0 : f32
- linalg.yield %tmp1 : f32
- }
- linalg.generic {
- indexing_maps = [#map0, #map0],
- iterator_types = ["parallel"]}
- ins(%arg0 : memref<2xf32>)
- outs(%to : memref<2xf32>) {
- ^bb0(%gen2_arg0: f32, %gen2_arg1: f32):
- %tmp2 = math.exp %gen2_arg0 : f32
- linalg.yield %tmp2 : f32
- }
- // CHECK: linalg.copy
- linalg.copy(%temp, %to) : memref<2xf32>, memref<2xf32>
- memref.dealloc %temp : memref<2xf32>
- return
-}
-
-// -----
-
-// The only redundant copy is linalg.copy(%4, %5).
-
-// CHECK-LABEL: func @loop_alloc
-func @loop_alloc(%arg0: index, %arg1: index, %arg2: index, %arg3: memref<2xf32>, %arg4: memref<2xf32>) {
- // CHECK: %{{.*}} = memref.alloc()
- %0 = memref.alloc() : memref<2xf32>
- memref.dealloc %0 : memref<2xf32>
- // CHECK: %{{.*}} = memref.alloc()
- %1 = memref.alloc() : memref<2xf32>
- // CHECK: linalg.copy
- linalg.copy(%arg3, %1) : memref<2xf32>, memref<2xf32>
- %2 = scf.for %arg5 = %arg0 to %arg1 step %arg2 iter_args(%arg6 = %1) -> (memref<2xf32>) {
- %3 = cmpi eq, %arg5, %arg1 : index
- // CHECK: memref.dealloc
- memref.dealloc %arg6 : memref<2xf32>
- // CHECK: %[[PERCENT4:.*]] = memref.alloc()
- %4 = memref.alloc() : memref<2xf32>
- // CHECK-NOT: memref.alloc
- // CHECK-NOT: linalg.copy
- // CHECK-NOT: memref.dealloc
- %5 = memref.alloc() : memref<2xf32>
- linalg.copy(%4, %5) : memref<2xf32>, memref<2xf32>
- memref.dealloc %4 : memref<2xf32>
- // CHECK: %[[PERCENT6:.*]] = memref.alloc()
- %6 = memref.alloc() : memref<2xf32>
- // CHECK: linalg.copy(%[[PERCENT4]], %[[PERCENT6]])
- linalg.copy(%5, %6) : memref<2xf32>, memref<2xf32>
- scf.yield %6 : memref<2xf32>
- }
- // CHECK: linalg.copy
- linalg.copy(%2, %arg4) : memref<2xf32>, memref<2xf32>
- memref.dealloc %2 : memref<2xf32>
- return
-}
-
-// -----
-
-// The linalg.copy operation can be removed, along with the corresponding
-// alloc and dealloc operations. All uses of %0 are then replaced with %arg2.
-
-// CHECK-LABEL: func @check_with_affine_dialect
-func @check_with_affine_dialect(%arg0: memref<4xf32>, %arg1: memref<4xf32>, %arg2: memref<4xf32>) {
- // CHECK-SAME: (%[[ARG0:.*]]: memref<4xf32>, %[[ARG1:.*]]: memref<4xf32>, %[[RES:.*]]: memref<4xf32>)
- // CHECK-NOT: memref.alloc
- %0 = memref.alloc() : memref<4xf32>
- affine.for %arg3 = 0 to 4 {
- %5 = affine.load %arg0[%arg3] : memref<4xf32>
- %6 = affine.load %arg1[%arg3] : memref<4xf32>
- %7 = cmpf ogt, %5, %6 : f32
- // CHECK: %[[SELECT_RES:.*]] = select
- %8 = select %7, %5, %6 : f32
- // CHECK-NEXT: affine.store %[[SELECT_RES]], %[[RES]]
- affine.store %8, %0[%arg3] : memref<4xf32>
- }
- // CHECK-NOT: linalg.copy
- // CHECK-NOT: dealloc
- linalg.copy(%0, %arg2) : memref<4xf32>, memref<4xf32>
- memref.dealloc %0 : memref<4xf32>
- // CHECK: return
- return
-}