[Mlir-commits] [mlir] 76b1601 - [mlir][bufferize] Fix config not passed to greedy rewriter
Matthias Springer
llvmlistbot at llvm.org
Tue Mar 15 01:32:46 PDT 2022
Author: Matthias Springer
Date: 2022-03-15T17:32:38+09:00
New Revision: 76b1601001b5262686bde364407d0dc9ffd52287
URL: https://github.com/llvm/llvm-project/commit/76b1601001b5262686bde364407d0dc9ffd52287
DIFF: https://github.com/llvm/llvm-project/commit/76b1601001b5262686bde364407d0dc9ffd52287.diff
LOG: [mlir][bufferize] Fix config not passed to greedy rewriter
Also add a TODO to switch to a custom walk instead of the GreedyPatternRewriter, which should be more efficient. (Each bufferization pattern is guaranteed to apply only once per op, so a simple walk should suffice.)
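For illustration only (not part of this change), such a walk-based driver could look roughly as follows; bufferizeSingleOp and state are hypothetical placeholders, and erasing ops during a pre-order walk would need extra care, as the TODO in the patch notes:

    // Hypothetical sketch, not what this commit implements:
    WalkResult result = op->walk<WalkOrder::PreOrder>([&](Operation *nestedOp) {
      // Each op would be bufferized at most once.
      if (failed(bufferizeSingleOp(nestedOp, state)))  // hypothetical helper
        return WalkResult::interrupt();
      return WalkResult::advance();
    });
    if (result.wasInterrupted())
      return failure();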
We currently specify a top-to-bottom walk order. This is important because other walk orders could introduce additional casts and/or buffer copies. These would canonicalize away again, but it is more efficient never to generate them in the first place.
Note: A few of these canonicalizations are not yet implemented.
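For reference, the core of the fix is simply threading the already-constructed config into the driver call (mirroring the Bufferize.cpp hunk below):

    GreedyRewriteConfig config;
    config.useTopDownTraversal = true;
    // Without the third argument, the driver used a default config and
    // silently ignored the requested top-down traversal order.
    if (failed(applyPatternsAndFoldGreedily(op, std::move(patterns), config)))
      return failure();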
Differential Revision: https://reviews.llvm.org/D121518
Added:
Modified:
mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp
mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-partial.mlir
mlir/test/Dialect/Linalg/comprehensive-module-bufferize-alloca.mlir
mlir/test/Dialect/Linalg/comprehensive-module-bufferize-init-tensor-elimination.mlir
mlir/test/Dialect/Linalg/comprehensive-module-bufferize.mlir
Removed:
################################################################################
diff --git a/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp b/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp
index f4fed424391fa..510d1e13ecc72 100644
--- a/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp
+++ b/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp
@@ -306,14 +306,21 @@ LogicalResult bufferization::bufferizeOp(Operation *op,
// Bufferize ops top-to-bottom. When creating a new op, we should ideally
// know the exact memref type of all operands. Otherwise, we have to use a
// memref type with a fully dynamic layout map, which has to canonicalize
- // away.
- // Moreover, if "fullyDynamicLayoutMaps = false", we may otherwise have to
- // insert buffer copies to fold ("finalize") to_memref(to_tensor(x)) ops with
- // non-cast-compatible layout maps.
+ // away. This is less efficient.
+ //
+ // Note: If "fullyDynamicLayoutMaps = false", we may have to insert buffer
+ // copies to fold ("finalize") to_memref(to_tensor(x)) ops with non-cast-
+ // compatible layout maps when doing a traversal other than top-to-bottom.
+ // There are currently no canonicalization patterns to fold these away.
GreedyRewriteConfig config;
config.useTopDownTraversal = true;
- if (failed(applyPatternsAndFoldGreedily(op, std::move(patterns))))
+ // TODO: Perform a preorder walk instead of the greedy pattern rewriter. This
+ // would be more efficient because every bufferization pattern is guaranteed
+ // to apply only a single time (otherwise, an assertion would be triggered).
+ // However, there are restrictions wrt. erasing ops during a preorder walk,
+ // which would likely require a larger refactoring.
+ if (failed(applyPatternsAndFoldGreedily(op, std::move(patterns), config)))
return failure();
return checkBufferizationResult(op, state.getOptions());
diff --git a/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-partial.mlir b/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-partial.mlir
index fc0ac2d384626..b49ef6edcfca2 100644
--- a/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-partial.mlir
+++ b/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-partial.mlir
@@ -132,8 +132,6 @@ func @unused_unknown_op(%t1 : tensor<?xf32>) -> vector<5xf32> {
// -----
-// CHECK: #[[$MAP3:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>
-
// CHECK-LABEL: func @unknown_op_may_read(
func @unknown_op_may_read(%v: vector<5xf32>)
-> (tensor<10xf32>, tensor<10xf32>) {
@@ -142,27 +140,25 @@ func @unknown_op_may_read(%v: vector<5xf32>)
// One alloc for the init_tensor, another one because the transfer_write
// bufferizes out-of-place.
- // CHECK: %[[m1:.*]] = memref.alloc() {{.*}} : memref<10xf32>
// CHECK: %[[alloc:.*]] = memref.alloc() {{.*}} : memref<10xf32>
- // CHECK: %[[alloc_casted:.*]] = memref.cast %[[alloc]] : memref<10xf32> to memref<10xf32, #[[$MAP3]]>
- // CHECK: %[[m1_casted:.*]] = memref.cast %[[m1]] : memref<10xf32> to memref<10xf32, #[[$MAP3]]>
+ // CHECK: %[[m1:.*]] = memref.alloc() {{.*}} : memref<10xf32>
%t1 = linalg.init_tensor [10] : tensor<10xf32>
// CHECK: linalg.fill ins(%{{.*}}{{.*}}outs(%[[m1]]
- // CHECK: %[[filled_tensor:.*]] = bufferization.to_tensor %[[m1_casted]]
+ // CHECK: %[[filled_tensor:.*]] = bufferization.to_tensor %[[m1]]
%filled = linalg.fill ins(%cst : f32) outs(%t1 : tensor<10xf32>) -> tensor<10xf32>
// The transfer_write is out-of-place because "dummy_op" may read.
// CHECK: memref.copy %[[m1]], %[[alloc]]
// CHECK: vector.transfer_write %{{.*}}, %[[alloc]]
- // CHECK: %[[alloc_tensor:.*]] = bufferization.to_tensor %[[alloc_casted]]
+ // CHECK: %[[alloc_tensor:.*]] = bufferization.to_tensor %[[alloc]]
%1 = vector.transfer_write %v, %filled[%idx] : vector<5xf32>, tensor<10xf32>
// CHECK: %[[dummy:.*]] = "test.dummy_op"(%[[filled_tensor]])
%2 = "test.dummy_op"(%filled) : (tensor<10xf32>) -> (tensor<10xf32>)
- // CHECK: memref.dealloc %[[alloc]]
- // CHECK: memref.dealloc %[[m1]]
+ // CHECK-DAG: memref.dealloc %[[alloc]]
+ // CHECK-DAG: memref.dealloc %[[m1]]
// CHECK: return %[[alloc_tensor]], %[[dummy]]
return %1, %2 : tensor<10xf32>, tensor<10xf32>
}
diff --git a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-alloca.mlir b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-alloca.mlir
index ec20cd89c52e6..5aab59311305b 100644
--- a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-alloca.mlir
+++ b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-alloca.mlir
@@ -31,19 +31,19 @@ func @main() {
%v1 = arith.constant 1.0 : f32
%v2 = arith.constant 2.0 : f32
- // CHECK-NEXT: %[[A:.*]] = memref.alloca() {alignment = 128 : i64} : memref<64xf32>
- // CHECK-NEXT: %[[B:.*]] = memref.alloca() {alignment = 128 : i64} : memref<64xf32>
// CHECK-NEXT: %[[C:.*]] = memref.alloca() {alignment = 128 : i64} : memref<f32>
- // CHECK-NEXT: %[[cA:.*]] = memref.cast %[[A]] : memref<64xf32> to memref<64xf32, #[[$DYN_1D_MAP]]>
- // CHECK-NEXT: %[[cB:.*]] = memref.cast %[[B]] : memref<64xf32> to memref<64xf32, #[[$DYN_1D_MAP]]>
- // CHECK-NEXT: %[[cC:.*]] = memref.cast %[[C]] : memref<f32> to memref<f32, #[[$DYN_0D_MAP]]>
+ // CHECK-NEXT: %[[B:.*]] = memref.alloca() {alignment = 128 : i64} : memref<64xf32>
+ // CHECK-NEXT: %[[A:.*]] = memref.alloca() {alignment = 128 : i64} : memref<64xf32>
+ // CHECK-DAG: %[[cA:.*]] = memref.cast %[[A]] : memref<64xf32> to memref<64xf32, #[[$DYN_1D_MAP]]>
+ // CHECK-DAG: %[[cB:.*]] = memref.cast %[[B]] : memref<64xf32> to memref<64xf32, #[[$DYN_1D_MAP]]>
+ // CHECK-DAG: %[[cC:.*]] = memref.cast %[[C]] : memref<f32> to memref<f32, #[[$DYN_0D_MAP]]>
%A = linalg.init_tensor [64] : tensor<64xf32>
%B = linalg.init_tensor [64] : tensor<64xf32>
%C = linalg.init_tensor [] : tensor<f32>
- // CHECK-NEXT: linalg.fill ins(%[[C1]] : f32) outs(%[[A]] : memref<64xf32>)
- // CHECK-NEXT: linalg.fill ins(%[[C2]] : f32) outs(%[[B]] : memref<64xf32>)
- // CHECK-NEXT: linalg.fill ins(%[[C0]] : f32) outs(%[[C]] : memref<f32>)
+ // CHECK-DAG: linalg.fill ins(%[[C1]] : f32) outs(%[[A]] : memref<64xf32>)
+ // CHECK-DAG: linalg.fill ins(%[[C2]] : f32) outs(%[[B]] : memref<64xf32>)
+ // CHECK-DAG: linalg.fill ins(%[[C0]] : f32) outs(%[[C]] : memref<f32>)
%AA = linalg.fill ins(%v1 : f32) outs(%A : tensor<64xf32>) -> tensor<64xf32>
%BB = linalg.fill ins(%v2 : f32) outs(%B : tensor<64xf32>) -> tensor<64xf32>
%CC = linalg.fill ins(%v0 : f32) outs(%C : tensor<f32>) -> tensor<f32>
diff --git a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-init-tensor-elimination.mlir b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-init-tensor-elimination.mlir
index 7b510b48c9cfd..84fbca8b95b60 100644
--- a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-init-tensor-elimination.mlir
+++ b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-init-tensor-elimination.mlir
@@ -20,7 +20,6 @@ func @buffer_forwarding_conflict(
// insert_slice. InitTensorOp replaces the init_tensor with an out-of-place
// extract_slice.
// CHECK: %[[EXTRACT_SLICE_ALLOC:.*]] = memref.alloc(%[[sz]])
- // CHECK: %[[T_SUBVIEW:.*]] = memref.subview %[[FUNC_ARG]][42] [%[[sz]]] [1]
%a = linalg.init_tensor[%sz] : tensor<?xf32>
// CHECK: linalg.fill ins({{.*}} : f32) outs(%[[EXTRACT_SLICE_ALLOC]] : memref<?xf32>)
@@ -31,6 +30,7 @@ func @buffer_forwarding_conflict(
// CHECK: memref.copy %[[EXTRACT_SLICE_ALLOC]], %[[SV0_ALLOC]] : memref<?xf32> to memref<?xf32>
%r0 = tensor.insert_slice %f into %t[0][%sz][1]: tensor<?xf32> into tensor<?xf32>
+ // CHECK: %[[T_SUBVIEW:.*]] = memref.subview %[[FUNC_ARG]][42] [%[[sz]]] [1]
// CHECK: memref.copy %[[EXTRACT_SLICE_ALLOC]], %[[T_SUBVIEW]]
%r1 = tensor.insert_slice %f into %t[42][%sz][1]: tensor<?xf32> into tensor<?xf32>
diff --git a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize.mlir b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize.mlir
index 0baf61ff49773..5d0087ff29910 100644
--- a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize.mlir
+++ b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize.mlir
@@ -187,9 +187,9 @@ func @insert_slice_fun(%A0 : tensor<?xf32> {linalg.inplaceable = false},
-> (tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>)
{
// Hoisted allocs.
- // CHECK: %[[REALLOC3:.*]] = memref.alloc
- // CHECK: %[[REALLOC2:.*]] = memref.alloc
// CHECK: %[[REALLOC1:.*]] = memref.alloc
+ // CHECK: %[[REALLOC2:.*]] = memref.alloc
+ // CHECK: %[[REALLOC3:.*]] = memref.alloc
// Alloc and copy the whole result tensor. Copy the tensor.extract_slice.
// CHECK: memref.copy %[[A0]], %[[REALLOC3]]
@@ -414,12 +414,12 @@ func private @some_external_func(tensor<4xi32>)
// CHECK: func @main()
func @main() {
-// CHECK: %[[A:.*]] = memref.get_global @__constant_4xi32 : memref<4xi32>
+// CHECK-DAG: %[[A:.*]] = memref.get_global @__constant_4xi32 : memref<4xi32>
%A = arith.constant dense<[1, 2, 3, 4]> : tensor<4xi32>
-// CHECK: %[[alloc:.*]] = memref.alloc
-// CHECK: %[[B:.*]] = memref.cast %[[alloc]] : memref<4xi32> to memref<4xi32, #[[$DYN_1D_MAP]]>
-// CHECK: memref.copy %[[A]], %[[alloc]]
+// CHECK-DAG: %[[alloc:.*]] = memref.alloc
+// CHECK-DAG: %[[B:.*]] = memref.cast %[[alloc]] : memref<4xi32> to memref<4xi32, #[[$DYN_1D_MAP]]>
+// CHECK-DAG: memref.copy %[[A]], %[[alloc]]
// CHECK: call @some_external_func(%[[B]]) : (memref<4xi32, #[[$DYN_1D_MAP]]>) -> ()
call @some_external_func(%A) : (tensor<4xi32>) -> ()
@@ -436,12 +436,12 @@ func private @some_external_func_within_scf_execute(tensor<4xi32>)
// CHECK: func @main()
func @main() {
-// CHECK: %[[A:.*]] = memref.get_global @__constant_4xi32 : memref<4xi32>
+// CHECK-DAG: %[[A:.*]] = memref.get_global @__constant_4xi32 : memref<4xi32>
%A = arith.constant dense<[1, 2, 3, 4]> : tensor<4xi32>
-// CHECK: %[[alloc:.*]] = memref.alloc
-// CHECK: %[[B:.*]] = memref.cast %[[alloc]] : memref<4xi32> to memref<4xi32, #[[$DYN_1D_MAP]]>
-// CHECK: memref.copy %[[A]], %[[alloc]]
+// CHECK-DAG: %[[alloc:.*]] = memref.alloc
+// CHECK-DAG: %[[B:.*]] = memref.cast %[[alloc]] : memref<4xi32> to memref<4xi32, #[[$DYN_1D_MAP]]>
+// CHECK-DAG: memref.copy %[[A]], %[[alloc]]
// CHECK: call @some_external_func_within_scf_execute(%[[B]]) : (memref<4xi32, #[[$DYN_1D_MAP]]>) -> ()
scf.execute_region {
call @some_external_func_within_scf_execute(%A) : (tensor<4xi32>) -> ()
@@ -552,14 +552,14 @@ func @bar(
%lb : index, %ub : index, %step : index)
-> (tensor<?xf32>, tensor<?xf32>)
{
-// CHECK: call @scf_for_with_tensor_insert_slice(%[[A]], %[[B]], %[[C]]
+// CHECK-DAG: call @scf_for_with_tensor_insert_slice(%[[A]], %[[B]], %[[C]]
%r0:2 = call @scf_for_with_tensor_insert_slice(%A, %B, %C, %lb, %ub, %step) :
(tensor<?xf32>, tensor<?xf32>, tensor<4xf32>, index, index, index)
-> (tensor<?xf32>, tensor<?xf32>)
// %r0#0 requires a copy because we have no idea what the function is doing.
-// CHECK: %[[alloc:.*]] = memref.alloc
-// CHECK: %[[casted:.*]] = memref.cast %[[alloc]]
+// CHECK-DAG: %[[alloc:.*]] = memref.alloc
+// CHECK-DAG: %[[casted:.*]] = memref.cast %[[alloc]]
// CHECK: memref.copy %[[B]], %[[alloc]]
// CHECK-NEXT: call @some_external_func(%[[casted]]) : (memref<?xf32, #[[$DYN_1D_MAP]]>) -> ()
call @some_external_func(%r0#0) : (tensor<?xf32>) -> ()
@@ -601,19 +601,19 @@ func @main() {
%v1 = arith.constant 1.0 : f32
%v2 = arith.constant 2.0 : f32
- // CHECK-NEXT: %[[A:.*]] = memref.alloc() {alignment = 128 : i64} : memref<64xf32>
- // CHECK-NEXT: %[[B:.*]] = memref.alloc() {alignment = 128 : i64} : memref<64xf32>
// CHECK-NEXT: %[[C:.*]] = memref.alloc() {alignment = 128 : i64} : memref<f32>
- // CHECK-NEXT: %[[cA:.*]] = memref.cast %[[A]] : memref<64xf32> to memref<64xf32, #[[$DYN_1D_MAP]]>
- // CHECK-NEXT: %[[cB:.*]] = memref.cast %[[B]] : memref<64xf32> to memref<64xf32, #[[$DYN_1D_MAP]]>
- // CHECK-NEXT: %[[cC:.*]] = memref.cast %[[C]] : memref<f32> to memref<f32, #[[$DYN_0D_MAP]]>
+ // CHECK-NEXT: %[[B:.*]] = memref.alloc() {alignment = 128 : i64} : memref<64xf32>
+ // CHECK-NEXT: %[[A:.*]] = memref.alloc() {alignment = 128 : i64} : memref<64xf32>
+ // CHECK-DAG: %[[cA:.*]] = memref.cast %[[A]] : memref<64xf32> to memref<64xf32, #[[$DYN_1D_MAP]]>
+ // CHECK-DAG: %[[cB:.*]] = memref.cast %[[B]] : memref<64xf32> to memref<64xf32, #[[$DYN_1D_MAP]]>
+ // CHECK-DAG: %[[cC:.*]] = memref.cast %[[C]] : memref<f32> to memref<f32, #[[$DYN_0D_MAP]]>
%A = linalg.init_tensor [64] : tensor<64xf32>
%B = linalg.init_tensor [64] : tensor<64xf32>
%C = linalg.init_tensor [] : tensor<f32>
- // CHECK-NEXT: linalg.fill ins(%[[C1]] : f32) outs(%[[A]] : memref<64xf32>)
- // CHECK-NEXT: linalg.fill ins(%[[C2]] : f32) outs(%[[B]] : memref<64xf32>)
- // CHECK-NEXT: linalg.fill ins(%[[C0]] : f32) outs(%[[C]] : memref<f32>)
+ // CHECK-DAG: linalg.fill ins(%[[C1]] : f32) outs(%[[A]] : memref<64xf32>)
+ // CHECK-DAG: linalg.fill ins(%[[C2]] : f32) outs(%[[B]] : memref<64xf32>)
+ // CHECK-DAG: linalg.fill ins(%[[C0]] : f32) outs(%[[C]] : memref<f32>)
%AA = linalg.fill ins(%v1 : f32) outs(%A : tensor<64xf32>) -> tensor<64xf32>
%BB = linalg.fill ins(%v2 : f32) outs(%B : tensor<64xf32>) -> tensor<64xf32>
%CC = linalg.fill ins(%v0 : f32) outs(%C : tensor<f32>) -> tensor<f32>
@@ -725,7 +725,6 @@ func @matmul(
tensor<256x192xf32> to tensor<256x16xf32>
// %4 does not match an insert_slice, it cannot be bufferized inplace and needs to alloc.
- // CHECK: %[[T:.*]] = memref.subview %[[C]][%[[I]], %[[J]]] [8, 16] [1, 1]
%4 = tensor.extract_slice %C[%arg3, %arg5] [8, 16] [1, 1] :
tensor<128x192xf32> to tensor<8x16xf32>
@@ -751,6 +750,7 @@ func @matmul(
// insert_slice is inplace but its source comes from an equivalent buffer
// that is not in place. So we must insert a copy of the small buffer into
// the bigger buffer.
+ // CHECK: %[[T:.*]] = memref.subview %[[C]][%[[I]], %[[J]]] [8, 16] [1, 1]
// CHECK: memref.copy %[[ALLOC]], %[[T]]
%7 = tensor.insert_slice %6 into %arg6[%arg3, %arg5] [8, 16] [1, 1] :
tensor<8x16xf32> into tensor<128x192xf32>