[Mlir-commits] [mlir] bf1b952 - [mlir][bufferize] Fix missing copy when bufferizing loops
Matthias Springer
llvmlistbot at llvm.org
Fri Aug 12 01:50:04 PDT 2022
Author: Matthias Springer
Date: 2022-08-12T10:44:55+02:00
New Revision: bf1b9528ff3ba1661ce4fd004ac30200312439fc
URL: https://github.com/llvm/llvm-project/commit/bf1b9528ff3ba1661ce4fd004ac30200312439fc
DIFF: https://github.com/llvm/llvm-project/commit/bf1b9528ff3ba1661ce4fd004ac30200312439fc.diff
LOG: [mlir][bufferize] Fix missing copy when bufferizing loops
Using a loop init_arg inside of the loop body is not supported by One-Shot Bufferize. This change adds a pre-processing step that resolves such IR by inserting tensor copies.
Differential Revision: https://reviews.llvm.org/D131689
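For illustration, a minimal sketch of the unsupported pattern (hypothetical IR; the test case added to mlir/test/Dialect/SCF/one-shot-bufferize.mlir below exercises the same shape):

  %r = scf.for %iv = %c0 to %sz step %c1 iter_args(%t = %0) -> tensor<?xf32> {
    // %0 is the init_arg backing %t, but it is read again here, inside the
    // loop. Bufferizing the loop in place on %0's buffer would clobber this
    // read.
    %e = tensor.extract %0[%iv] : tensor<?xf32>
    %s = tensor.insert %e into %t[%iv] : tensor<?xf32>
    scf.yield %s : tensor<?xf32>
  }

The pre-processing copies %0 with bufferization.alloc_tensor and redirects the use inside the loop to that copy.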
Added:
Modified:
mlir/lib/Dialect/Bufferization/Transforms/TensorCopyInsertion.cpp
mlir/test/Dialect/Linalg/one-shot-bufferize.mlir
mlir/test/Dialect/SCF/one-shot-bufferize.mlir
Removed:
################################################################################
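The fix is exercised through the One-Shot Bufferize test pipeline below. The authoritative RUN lines live in the modified test files; an invocation would look roughly like the following (the exact option string is an assumption, not copied from the tests):

  // RUN: mlir-opt %s -one-shot-bufferize="allow-return-allocs bufferize-function-boundaries" -split-input-file | FileCheck %s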
diff --git a/mlir/lib/Dialect/Bufferization/Transforms/TensorCopyInsertion.cpp b/mlir/lib/Dialect/Bufferization/Transforms/TensorCopyInsertion.cpp
index 786f12f389f4b..275238a776a1f 100644
--- a/mlir/lib/Dialect/Bufferization/Transforms/TensorCopyInsertion.cpp
+++ b/mlir/lib/Dialect/Bufferization/Transforms/TensorCopyInsertion.cpp
@@ -20,8 +20,82 @@
using namespace mlir;
using namespace mlir::bufferization;
+/// Resolve all operands that are also used inside of repetitive regions of the
+/// same op. Such cases are not fully supported by One-Shot Bufferize.
+///
+/// E.g.:
+/// %r = scf.for ... iter_args(%t = %tensor) -> tensor<?xf32> {
+///   "some_use"(%tensor)
+///   ...
+/// }
+///
+/// Is converted to:
+/// %tensor_copy = bufferization.alloc_tensor copy(%tensor)
+/// %r = scf.for ... iter_args(%t = %tensor) -> tensor<?xf32> {
+///   "some_use"(%tensor_copy)
+///   ...
+/// }
+static void
+resolveUsesInRepetitiveRegions(Operation *op,
+                               const BufferizationOptions &options) {
+  IRRewriter rewriter(op->getContext());
+  AnalysisState state(options);
+
+  // Look for repetitive ops (loops).
+  op->walk([&](RegionBranchOpInterface regionBranchOp) {
+    // Skip non-bufferizable ops.
+    auto bufferizableOp = options.dynCastBufferizableOp(regionBranchOp);
+    if (!bufferizableOp)
+      return WalkResult::advance();
+
+    // Find all operands that are also used inside of a repetitive region of
+    // this op.
+    for (OpOperand &opOperand : regionBranchOp->getOpOperands()) {
+      Value operand = opOperand.get();
+      // Skip non-tensor operands.
+      if (!operand.getType().isa<TensorType>())
+        continue;
+      // Skip operands that do not bufferize to memory writes.
+      if (!bufferizableOp.bufferizesToMemoryWrite(opOperand, state))
+        continue;
+
+      // Gather all uses inside repetitive regions.
+      SmallVector<OpOperand *> usesInsideRegion;
+      for (OpOperand &use : operand.getUses()) {
+        Operation *owner = use.getOwner();
+        if (!regionBranchOp->isProperAncestor(owner))
+          continue;
+        for (Region &r : regionBranchOp->getRegions()) {
+          if (r.findAncestorOpInRegion(*owner) &&
+              regionBranchOp.isRepetitiveRegion(r.getRegionNumber())) {
+            usesInsideRegion.push_back(&use);
+            break;
+          }
+        }
+      }
+      // Nothing to do if the operand is not used inside a repetitive region.
+      if (usesInsideRegion.empty())
+        continue;
+
+      // Insert a tensor copy and replace all uses inside of repetitive
+      // regions.
+      rewriter.setInsertionPoint(regionBranchOp);
+      auto tensorCopy = rewriter.create<AllocTensorOp>(
+          regionBranchOp->getLoc(), operand.getType().cast<TensorType>(),
+          /*dynamicSizes=*/ValueRange(),
+          /*copy=*/operand, /*memory_space=*/IntegerAttr());
+      for (OpOperand *use : usesInsideRegion)
+        use->set(tensorCopy);
+    }
+
+    return WalkResult::advance();
+  });
+}
+
LogicalResult mlir::bufferization::insertTensorCopies(
    Operation *op, const OneShotBufferizationOptions &options) {
+  // Preprocessing: Resolve currently unsupported bufferization cases.
+  resolveUsesInRepetitiveRegions(op, options);
+
  OneShotAnalysisState state(op, options);
  // Run normal One-Shot Bufferize analysis or One-Shot Module Bufferize
  // analysis depending on whether function boundary bufferization is enabled or
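Concretely, the rewrite above materializes the copy as an alloc_tensor with a copy operand; for the scf.for example in the new comment, the printed form is (a sketch):

  %tensor_copy = bufferization.alloc_tensor() copy(%tensor) : tensor<?xf32>

With a copy operand, no dynamic sizes are needed; they are taken from %tensor. During bufferization the op lowers to a memref.alloc followed by a memref.copy, which is what the updated CHECK lines in the tests below expect.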
diff --git a/mlir/test/Dialect/Linalg/one-shot-bufferize.mlir b/mlir/test/Dialect/Linalg/one-shot-bufferize.mlir
index dbf1ccca2979b..6b9a805c7ab11 100644
--- a/mlir/test/Dialect/Linalg/one-shot-bufferize.mlir
+++ b/mlir/test/Dialect/Linalg/one-shot-bufferize.mlir
@@ -168,7 +168,8 @@ func.func @matmul(
  %c16 = arith.constant 16 : index
  // Hoisted alloc.
-  // CHECK: %[[ALLOC:.*]] = memref.alloc() {alignment = 128 : i64} : memref<8x16xf32>
+  // CHECK: %[[ALLOC:.*]] = memref.alloc() {alignment = 128 : i64} : memref<128x192xf32>
+  // CHECK: memref.copy %[[C]], %[[ALLOC]]
  // CHECK: scf.for %[[I:.*]] =
  %0 = scf.for %arg3 = %c0 to %c128 step %c8 iter_args(%arg4 = %C) -> (tensor<128x192xf32>) {
@@ -180,12 +181,14 @@ func.func @matmul(
      %3 = tensor.extract_slice %B[0, %arg5] [256, 16] [1, 1] :
        tensor<256x192xf32> to tensor<256x16xf32>
-      // %4 does not match an insert_slice, it cannot be bufferized inplace and needs to alloc.
+      // C was already replaced with a copy by preprocessing, so no copy is
+      // needed here.
+      // CHECK: %[[C_SLICE:.*]] = memref.subview %[[ALLOC]]
      %4 = tensor.extract_slice %C[%arg3, %arg5] [8, 16] [1, 1] :
        tensor<128x192xf32> to tensor<8x16xf32>
      // linalg.fill is inplace.
-      // CHECK: linalg.fill ins(%{{.*}} : f32) outs(%[[ALLOC]] : memref<8x16xf32>)
+      // CHECK: linalg.fill ins(%{{.*}} : f32) outs(%[[C_SLICE]]
      %5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<8x16xf32>) -> tensor<8x16xf32>
      // CHECK: scf.for %[[K:.*]] =
@@ -196,7 +199,7 @@ func.func @matmul(
          tensor<256x16xf32> to tensor<32x16xf32>
        // linalg.matmul is inplace as well as the enclosing scf.for.
-        // CHECK: linalg.matmul ins({{.*}} outs(%[[ALLOC]]
+        // CHECK: linalg.matmul ins({{.*}} outs(%[[C_SLICE]]
        %10 = linalg.matmul ins(%8, %9 : tensor<8x32xf32>, tensor<32x16xf32>)
                            outs(%arg8 : tensor<8x16xf32>)
          -> tensor<8x16xf32>
@@ -207,15 +210,16 @@ func.func @matmul(
      // that is not in place. So we must insert a copy of the small buffer into
      // the bigger buffer.
      // CHECK: %[[T:.*]] = memref.subview %[[C]][%[[I]], %[[J]]] [8, 16] [1, 1]
-      // CHECK: memref.copy %[[ALLOC]], %[[T]]
+      // CHECK: memref.copy %[[C_SLICE]], %[[T]]
      %7 = tensor.insert_slice %6 into %arg6[%arg3, %arg5] [8, 16] [1, 1] :
        tensor<8x16xf32> into tensor<128x192xf32>
-      // CHECK: memref.dealloc %[[ALLOC]]
      scf.yield %7 : tensor<128x192xf32>
    }
    scf.yield %2 : tensor<128x192xf32>
  }
+
+  // CHECK: memref.dealloc %[[ALLOC]]
  return %0 : tensor<128x192xf32>
}
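Why the CHECK lines above changed: %C is both the init_arg of the loop nest and read again via tensor.extract_slice inside of it, so the new pre-processing copies %C up front. A simplified sketch of the pre-processed input (names illustrative, not the verbatim test IR):

  %C_copy = bufferization.alloc_tensor() copy(%C) : tensor<128x192xf32>
  %0 = scf.for %arg3 = %c0 to %c128 step %c8 iter_args(%arg4 = %C) -> (tensor<128x192xf32>) {
    ...
    // The in-loop read of %C is redirected to the copy:
    %4 = tensor.extract_slice %C_copy[%arg3, %arg5] [8, 16] [1, 1] :
      tensor<128x192xf32> to tensor<8x16xf32>
    ...
  }

After bufferization, the copy becomes the hoisted memref<128x192xf32> alloc plus memref.copy, and the per-iteration 8x16 alloc from before this patch turns into a subview (%[[C_SLICE]]).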
diff --git a/mlir/test/Dialect/SCF/one-shot-bufferize.mlir b/mlir/test/Dialect/SCF/one-shot-bufferize.mlir
index ab1db2d678d90..71ee9922f8f52 100644
--- a/mlir/test/Dialect/SCF/one-shot-bufferize.mlir
+++ b/mlir/test/Dialect/SCF/one-shot-bufferize.mlir
@@ -233,15 +233,17 @@ func.func @scf_execute_region_yield_non_equivalent(%i: index, %j: index) -> f32
// CHECK-LABEL: func @scf_for_yield_non_equivalent(
// CHECK-SAME: %[[t:.*]]: memref<?xf32
// CHECK: %[[alloc:.*]] = memref.alloc(%{{.*}})
-// CHECK: %[[cloned:.*]] = bufferization.clone %[[alloc]]
-// CHECK: memref.dealloc %[[alloc]]
+// CHECK: memref.copy %[[t]], %[[alloc]]
+// CHECK: %[[cloned:.*]] = bufferization.clone %[[t]]
// CHECK: %[[for:.*]] = scf.for {{.*}} iter_args(%[[iter:.*]] = %[[cloned]])
// CHECK-DAG: memref.dealloc %[[iter]]
// CHECK-DAG: %[[alloc2:.*]] = memref.alloc(%{{.*}})
-// CHECK: memref.copy %[[t]], %[[alloc2]]
-// CHECK: %[[cloned2:.*]] = bufferization.clone %[[alloc2]]
+// CHECK: memref.copy %[[alloc]], %[[alloc2]]
+// CHECK: %[[alloc2_casted:.*]] = memref.cast %[[alloc2]]
+// CHECK: %[[cloned2:.*]] = bufferization.clone %[[alloc2_casted]]
// CHECK: memref.dealloc %[[alloc2]]
// CHECK: scf.yield %[[cloned2]]
+// CHECK: memref.dealloc %[[alloc]]
// CHECK: return %[[for]]
func.func @scf_for_yield_non_equivalent(
    %t: tensor<?xf32>, %lb : index, %ub : index, %step : index) -> tensor<?xf32> {
@@ -709,3 +711,34 @@ func.func @scf_for_swapping_yields_memory_space(
  %f1 = tensor.extract %r0#1[%step] : tensor<?xf32>
  return %f0, %f1: f32, f32
}
+
+// -----
+
+// CHECK-LABEL: func @scf_for_yield_alias_of_non_equivalent(
+func.func @scf_for_yield_alias_of_non_equivalent(%sz: index) -> tensor<?xf32> {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %cst = arith.constant 5.0 : f32
+
+  // CHECK: %[[generate:.*]] = memref.alloc
+  %0 = tensor.generate %sz {
+  ^bb0(%i: index):
+    tensor.yield %cst : f32
+  } : tensor<?xf32>
+
+  // A copy is inserted because %0 is also used inside the loop.
+  // CHECK: %[[generate_copy:.*]] = memref.alloc
+  // CHECK: memref.copy %[[generate]], %[[generate_copy]]
+  // CHECK: scf.for
+  %r = scf.for %iv = %c0 to %sz step %c1 iter_args(%t = %0) -> tensor<?xf32> {
+    %iv_sub = arith.subi %iv, %c1 : index
+    // CHECK: memref.subview %[[generate_copy]]
+    %ll = tensor.extract_slice %0[%iv_sub][%sz][1] : tensor<?xf32> to tensor<?xf32>
+    %l = tensor.extract %ll[%c0] : tensor<?xf32>
+    %double = arith.mulf %cst, %l : f32
+    // CHECK: memref.store %{{.*}}, %[[generate]]
+    %s = tensor.insert %double into %t[%iv] : tensor<?xf32>
+    scf.yield %s : tensor<?xf32>
+  }
+  return %r : tensor<?xf32>
+}
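For reference, a sketch of what the pre-processing produces for this test before One-Shot Bufferize runs (hypothetical intermediate IR, derived from the transformation above):

  %copy = bufferization.alloc_tensor() copy(%0) : tensor<?xf32>
  %r = scf.for %iv = %c0 to %sz step %c1 iter_args(%t = %0) -> tensor<?xf32> {
    ...
    // The use of %0 inside the loop body now goes through the copy:
    %ll = tensor.extract_slice %copy[%iv_sub][%sz][1] : tensor<?xf32> to tensor<?xf32>
    ...
  }

Note that the iter_args operand itself still reads %0 directly; it is an operand of the scf.for op, not a use inside the repetitive region, which is why the loop can still bufferize in place on %[[generate]].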