[Mlir-commits] [mlir] 0d2c54e - [mlir][Linalg] Revisit RAW dependence interference in comprehensive bufferize.

Nicolas Vasilache llvmlistbot at llvm.org
Tue Sep 21 07:22:27 PDT 2021


Author: Nicolas Vasilache
Date: 2021-09-21T14:22:22Z
New Revision: 0d2c54e851f12594b38f45e76ced03e3f5cc5443

URL: https://github.com/llvm/llvm-project/commit/0d2c54e851f12594b38f45e76ced03e3f5cc5443
DIFF: https://github.com/llvm/llvm-project/commit/0d2c54e851f12594b38f45e76ced03e3f5cc5443.diff

LOG: [mlir][Linalg] Revisit RAW dependence interference in comprehensive bufferize.

Previously, comprehensive bufferize would consider all aliasing reads and writes to
the result buffer and matching operand. This caused spurious dependences
to be considered and resulted in too many unnecessary copies.

Instead, this revision revisits the gathering of read and write alias sets.
This results in fewer allocs and copies.
An exhaustive test case is added that considers all possible permutations of
`matmul(extract_slice(fill), extract_slice(fill), ...)`.

Added: 
    mlir/test/Dialect/Linalg/comprehensive-bufferize-analysis-2fill-extract-matmul-all-perms.mlir

Modified: 
    mlir/lib/Dialect/Linalg/Transforms/ComprehensiveBufferize.cpp
    mlir/test/Dialect/Linalg/comprehensive-module-bufferize-analysis.mlir

Removed: 
    


################################################################################
diff  --git a/mlir/lib/Dialect/Linalg/Transforms/ComprehensiveBufferize.cpp b/mlir/lib/Dialect/Linalg/Transforms/ComprehensiveBufferize.cpp
index c4a66a4b10c54..de1b9e63e8b26 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/ComprehensiveBufferize.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/ComprehensiveBufferize.cpp
@@ -743,16 +743,39 @@ class BufferizationAliasInfo {
   /// Set the inPlace bufferization spec to false.
   void bufferizeOutOfPlace(OpResult result);
 
-  /// Return true if it is possible to find an inplace write W among the uses of
-  /// aliasInfo[result], and a read R among the uses of aliasInfo[result],
-  /// such that W and R interfere.
+  /// Return true if it is possible to find an inplace write W among `usesWrite`
+  /// and a read R among `usesRead`, such that W and R interfere.
   /// Such a (W, R) pair is an interference to the inplace bufferization of
-  /// rootWrite when:
+  /// opResult when:
   ///   1. R is not known properly dominate W (i.e. the effects of the write may
   ///      be visible from R).
   ///   2. one cannot find an intermediate clobbering write `C` to W, such that
   ///      C interleaved between W and R (i.e. W -> C -> R where -> denotes
   ///      dominance).
+  bool wouldCreateReadAfterWriteInterference(
+      Operation *opToBufferize, DenseSet<OpOperand *> &usesRead,
+      DenseSet<OpOperand *> &usesWrite, const DominanceInfo &domInfo) const;
+
+  /// Assume that result bufferizes in-place with one of the operation's
+  /// operands. Return true if it is possible to find an inplace write W (resp.
+  /// a read R) among the uses of `aliasInfo[result]`, and a read R (resp. an
+  /// inplace write W) among the uses of
+  /// `aliasInfo[getAliasingOpOperand(result)]`, such that W and R interfere.
+  /// Interference detection is needed to determine which cases may bufferize
+  /// inplace without interferences. Such cases comprise:
+  ///
+  /// ```
+  ///  %0 = op_to_bufferize(%1)
+  ///  read(%1)
+  ///
+  ///  %0 = op_to_bufferize(%1)
+  ///  write(%0)
+  ///  read(%1)
+  ///
+  ///  %0 = op_to_bufferize(%1)
+  ///  write(%1)
+  ///  read(%0)
+  /// ```
   bool
   wouldCreateReadAfterWriteInterference(OpResult result,
                                         const DominanceInfo &domInfo) const;
@@ -828,29 +851,29 @@ class BufferizationAliasInfo {
   ///
   /// Case discussion:
   /// ================
-  /// Case 1: rootRead is produced by opToBufferize,
-  /// Case 2: rootWrite is produced by opToBufferize,
+  /// Case 1: opOperand is produced by opToBufferize,
+  /// Case 2: opResult is produced by opToBufferize,
   /// Common case:
-  ///   - aliasingReadOp is a read to an alias of rootRead.
-  ///   - aliasingWriteOp is an inplace write to an alias of rootWrite.
+  ///   - aliasingReadOp is a read to an alias of opOperand.
+  ///   - aliasingWriteOp is an inplace write to an alias of opResult.
   ///   - aliasingWriteOp dominates aliasingReadOp.
   ///
   /// ```
   ///    // Either case 1:
-  ///    %rootRead = opToBufferize(%rootWrite)
-  ///    aliasingWriteOp(%aliasingWrite = alias(%rootWrite)) // inplace
-  ///     aliasingReadOp( %aliasingRead = alias(%rootRead))
+  ///    %opOperand = opToBufferize(%opResult)
+  ///    aliasingWriteOp(%aliasingWrite = alias(%opResult)) // inplace
+  ///     aliasingReadOp( %aliasingRead = alias(%opOperand))
   /// ```
   ///
   /// ```
   ///    // Or case 2:
-  ///    %rootWrite = opToBufferize(%rootRead)
-  ///    aliasingWriteOp(%aliasingWrite = alias(%rootWrite)) // inplace
-  ///     aliasingReadOp( %aliasingRead = alias(%rootRead))
+  ///    %opResult = opToBufferize(%opOperand)
+  ///    aliasingWriteOp(%aliasingWrite = alias(%opResult)) // inplace
+  ///     aliasingReadOp( %aliasingRead = alias(%opOperand))
   /// ```
   ///
-  /// Capture possible cases where `aliasingWriteOp(alias(%rootWrite))` has no
-  /// visible effect on `aliasingReadOp(alias(%rootRead))`.
+  /// Capture possible cases where `aliasingWriteOp(alias(%opResult))` has no
+  /// visible effect on `aliasingReadOp(alias(%opOperand))`.
   bool isClobberedWriteBeforeRead(Operation *opToBufferize,
                                   OpOperand &aliasingRead,
                                   OpOperand &aliasingWrite,
@@ -969,71 +992,11 @@ void BufferizationAliasInfo::bufferizeOutOfPlace(OpResult result) {
   setInPlaceOpResult(result, InPlaceSpec::False);
 }
 
-/// Return true if it is possible to find an inplace write W among the uses of
-/// aliasInfo[result], and a read R among the uses of aliasInfo[result],
-/// such that W and R interfere.
-/// Such a (W, R) pair is an interference to the inplace bufferization of
-/// rootWrite when:
-///   1. R is not known to properly dominate W (i.e. the effects of the write
-///      may be visible from R).
-///   2. one cannot find an intermediate clobbering write `C` to W, such that
-///      C interleaved between W and R (i.e. W -> C -> R where -> denotes
-///      dominance).
+/// Return true if it is possible to find an inplace write W among `usesWrite`
+/// and a read R among `usesRead`, such that W and R interfere.
 bool BufferizationAliasInfo::wouldCreateReadAfterWriteInterference(
-    OpResult result, const DominanceInfo &domInfo) const {
-  Optional<OpOperand *> maybeAliasingOperand = getAliasingOpOperand(result);
-  if (!maybeAliasingOperand)
-    return false;
-
-  Operation *opToBufferize = result.getDefiningOp();
-  Value rootWrite = result;
-  Value rootRead = (*maybeAliasingOperand)->get();
-
-  LDBG("----Start wouldCreateReadAfterWriteInterference\n");
-  LDBG("--------consider all aliases to root read: " << printValueInfo(rootRead)
-                                                     << "\n");
-  LDBG("--------consider all aliases to root write: "
-       << printValueInfo(rootWrite) << "\n");
-
-  // If `result` were to be bufferized in place, all the aliases of `rootRead`
-  // and `rootWrite` would immediately alias with each other and could create
-  // RaW hazards.
-  // Therefore, for each alias of either `rootRead` or `rootWrite`, we collect:
-  //   1. all of the reads of any alias.
-  //   2. all the write uses of any alias that are already known to bufferize
-  //      inplace.
-  //   3. all the write uses of any alias that belong to `opToBufferize`: as if
-  //      `opToBufferize` were bufferized inplace.
-  DenseSet<OpOperand *> usesRead, usesWrite;
-  for (Value v : {rootRead, rootWrite}) {
-    for (Value alias : getAliases(v)) {
-      for (auto &use : alias.getUses()) {
-        // Read to a value that aliases v.
-        if (bufferizesToMemoryRead(use)) {
-          LDBG("------------bufferizesToMemoryRead: "
-               << use.getOwner()->getName().getStringRef() << "\n");
-          usesRead.insert(&use);
-        }
-        // Inplace write to a value that aliases v.
-        if (bufferizesToMemoryWrite(use, InPlaceSpec::True)) {
-          LDBG("------------bufferizesToMemoryWrite: "
-               << use.getOwner()->getName().getStringRef() << "\n");
-          usesWrite.insert(&use);
-        }
-      }
-    }
-  }
-  // Additionally: consider writes to a value that aliases rootRead and belongs
-  // to opToBufferize. This simulates that opToBufferize bufferizes inplace.
-  for (OpOperand &use : opToBufferize->getOpOperands()) {
-    if (aliasInfo.isEquivalent(rootRead, use.get()) &&
-        bufferizesToMemoryWrite(use)) {
-      LDBG("------------bufferizesToMemoryWrite: "
-           << use.getOwner()->getName().getStringRef() << "\n");
-      usesWrite.insert(&use);
-    }
-  }
-
+    Operation *opToBufferize, DenseSet<OpOperand *> &usesRead,
+    DenseSet<OpOperand *> &usesWrite, const DominanceInfo &domInfo) const {
   for (OpOperand *uRead : usesRead) {
     Operation *aliasingReadOp = uRead->getOwner();
     LDBG("----++++aliasRead -> #"
@@ -1061,7 +1024,8 @@ bool BufferizationAliasInfo::wouldCreateReadAfterWriteInterference(
       // At this point, aliasingWriteOp properly dominates aliasingReadOp or
       // there is no clear dominance and we need to be conservative.
       LDBG("---->found RaW interference between:\n");
-      LDBG("       Source value -> " << printValueInfo(rootRead) << '\n');
+      LDBG("       OpToBufferize -> " << printOperationInfo(opToBufferize)
+                                      << '\n');
       LDBG("       Interfering write -> #"
            << uWrite->getOperandNumber() << ":"
            << printOperationInfo(aliasingWriteOp) << '\n');
@@ -1073,7 +1037,6 @@ bool BufferizationAliasInfo::wouldCreateReadAfterWriteInterference(
         LDBG("---->clobbered! -> skip\n");
         continue;
       }
-
       LDBG("---->not clobbered -> found an interference\n");
       return true;
     }
@@ -1082,6 +1045,111 @@ bool BufferizationAliasInfo::wouldCreateReadAfterWriteInterference(
   return false;
 }
 
+/// Return true if it is possible to find an inplace write W among the uses of
+/// aliasInfo[result], and a read R among the uses of aliasInfo[result],
+/// such that W and R interfere.
+/// Such a (W, R) pair is an interference to the inplace bufferization of
+/// opResult when:
+///   1. R is not known to properly dominate W (i.e. the effects of the write
+///      may be visible from R).
+///   2. one cannot find an intermediate clobbering write `C` to W, such that
+///      C interleaved between W and R (i.e. W -> C -> R where -> denotes
+///      dominance).
+bool BufferizationAliasInfo::wouldCreateReadAfterWriteInterference(
+    OpResult result, const DominanceInfo &domInfo) const {
+  Optional<OpOperand *> maybeAliasingOperand = getAliasingOpOperand(result);
+  if (!maybeAliasingOperand)
+    return false;
+
+  Operation *opToBufferize = result.getDefiningOp();
+  Value opResult = result;
+  Value opOperand = (*maybeAliasingOperand)->get();
+
+  LDBG("----Start wouldCreateReadAfterWriteInterference\n");
+  LDBG("--------consider all aliases to root read: "
+       << printValueInfo(opOperand) << "\n");
+  LDBG("--------consider all aliases to root write: "
+       << printValueInfo(opResult) << "\n");
+
+  /// Helper function to iterate on aliases of `root` and capture the reads.
+  auto getAliasingReads = [&](DenseSet<OpOperand *> &res, Value root) {
+    for (Value alias : getAliases(root)) {
+      for (auto &use : alias.getUses()) {
+        // Read to a value that aliases root.
+        if (bufferizesToMemoryRead(use)) {
+          LDBG("------------bufferizesToMemoryRead: "
+               << use.getOwner()->getName().getStringRef() << "\n");
+          res.insert(&use);
+        }
+      }
+    }
+  };
+
+  /// Helper function to iterate on aliases of `root` and capture the writes.
+  auto getAliasingInplaceWrites = [&](DenseSet<OpOperand *> &res, Value root) {
+    for (Value alias : getAliases(root)) {
+      for (auto &use : alias.getUses()) {
+        // Inplace write to a value that aliases root.
+        if (bufferizesToMemoryWrite(use, InPlaceSpec::True)) {
+          LDBG("------------bufferizesToMemoryWrite: "
+               << use.getOwner()->getName().getStringRef() << "\n");
+          res.insert(&use);
+        }
+      }
+    }
+  };
+
+  // Check if we can find any interference between reads to aliases[`opOperand`]
+  // and writes to aliases[`opResult`]. This handles the case:
+  //
+  // ```
+  //  %0 = op_to_bufferize_maybe_inplace(%1)
+  //  %2 = some_alias(%0)
+  //  inplace_write(%2)
+  //  %3 = some_alias(%1)
+  //  read(%3)
+  // ```
+  DenseSet<OpOperand *> usesRead, usesWrite;
+  LDBG("--------\n");
+  LDBG("--------Test reads(opOperand) vs writes(opResult)\n");
+  getAliasingReads(usesRead, opOperand);
+  getAliasingInplaceWrites(usesWrite, opResult);
+  // Additionally, `result` is not yet bufferized and we need to check for
+  // interferences as if it were bufferized inplace: add `maybeAliasingOperand`
+  // if it is a write. This handles the case:
+  //
+  // ```
+  //  %0 = op_to_bufferize_maybe_inplace(%1)
+  //  %2 = some_alias(%1)
+  //  read(%2)
+  // ```
+  if (bufferizesToMemoryWrite(**maybeAliasingOperand))
+    usesWrite.insert(*maybeAliasingOperand);
+  if (wouldCreateReadAfterWriteInterference(opToBufferize, usesRead, usesWrite,
+                                            domInfo))
+    return true;
+
+  // Check if we can find any interference between writes to
+  // aliases[`opOperand`] and reads to aliases[`opResult`]. This handles the
+  // case:
+  //
+  // ```
+  //  %0 = op_to_bufferize_maybe_inplace(%1)
+  //  %2 = some_alias(%1)
+  //  inplace_write(%2)
+  //  %3 = some_alias(%0)
+  //  read(%3)
+  // ```
+  LDBG("--------\n");
+  LDBG("--------Test reads(opResult) vs writes(opOperand)\n");
+  usesRead.clear();
+  usesWrite.clear();
+  getAliasingReads(usesRead, opResult);
+  getAliasingInplaceWrites(usesWrite, opOperand);
+  return wouldCreateReadAfterWriteInterference(opToBufferize, usesRead,
+                                               usesWrite, domInfo);
+}
+
 /// Return true if the source of a `insertSliceOp` bufferizes to an
 /// equivalent ExtractSliceOp that bufferizes inplace.
 bool BufferizationAliasInfo::isSourceEquivalentToAMatchingInplaceExtractSliceOp(

diff  --git a/mlir/test/Dialect/Linalg/comprehensive-bufferize-analysis-2fill-extract-matmul-all-perms.mlir b/mlir/test/Dialect/Linalg/comprehensive-bufferize-analysis-2fill-extract-matmul-all-perms.mlir
new file mode 100644
index 0000000000000..c454dae886768
--- /dev/null
+++ b/mlir/test/Dialect/Linalg/comprehensive-bufferize-analysis-2fill-extract-matmul-all-perms.mlir
@@ -0,0 +1,630 @@
+// RUN: mlir-opt %s -linalg-comprehensive-module-bufferize=test-analysis-only -split-input-file | FileCheck %s
+
+/// All combinations of matmul(fill(extract(init_tensor)), fill(extract(%init_tensor)), %arg2)
+/// These should all be inplaceable except the first op.
+
+// -----
+
+// CHECK-LABEL: func @fill_extract_matmul_
+func @fill_extract_matmul_1234(
+    %arg0: tensor<518x518xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false},
+    %arg1: tensor<518x518xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false},
+    %arg2: tensor<256x256xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = true})
+  -> tensor<256x256xf32>
+{
+  %c0 = constant 0 : index
+  %cst = constant 0.000000e+00 : f32
+  %cst_0 = constant 1.000000e+00 : f32
+  %0 = linalg.init_tensor [256, 256] : tensor<256x256xf32>
+
+  // CHECK: {__inplace_results_attr__ = ["false"]}
+  // CHECK-COUNT-4: {__inplace_results_attr__ = ["true"]}
+  %1 = linalg.fill(%cst, %0) : f32, tensor<256x256xf32> -> tensor<256x256xf32>
+  %2 = linalg.fill(%cst_0, %0) : f32, tensor<256x256xf32> -> tensor<256x256xf32>
+
+  %3 = tensor.extract_slice %1[0, 0] [256, 16] [1, 1] : tensor<256x256xf32> to tensor<256x16xf32>
+  %4 = tensor.extract_slice %2[0, 0] [16, 256] [1, 1] : tensor<256x256xf32> to tensor<16x256xf32>
+
+  %5 = linalg.matmul ins(%3, %4 : tensor<256x16xf32>, tensor<16x256xf32>) outs(%arg2 : tensor<256x256xf32>) -> tensor<256x256xf32>
+  return %5 : tensor<256x256xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @fill_extract_matmul_
+func @fill_extract_matmul_1243(
+    %arg0: tensor<518x518xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false},
+    %arg1: tensor<518x518xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false},
+    %arg2: tensor<256x256xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = true})
+  -> tensor<256x256xf32>
+{
+  %c0 = constant 0 : index
+  %cst = constant 0.000000e+00 : f32
+  %cst_0 = constant 1.000000e+00 : f32
+  %0 = linalg.init_tensor [256, 256] : tensor<256x256xf32>
+
+  // CHECK: {__inplace_results_attr__ = ["false"]}
+  // CHECK-COUNT-4: {__inplace_results_attr__ = ["true"]}
+  %1 = linalg.fill(%cst, %0) : f32, tensor<256x256xf32> -> tensor<256x256xf32>
+  %2 = linalg.fill(%cst_0, %0) : f32, tensor<256x256xf32> -> tensor<256x256xf32>
+
+  %4 = tensor.extract_slice %2[0, 0] [16, 256] [1, 1] : tensor<256x256xf32> to tensor<16x256xf32>
+  %3 = tensor.extract_slice %1[0, 0] [256, 16] [1, 1] : tensor<256x256xf32> to tensor<256x16xf32>
+
+  %5 = linalg.matmul ins(%3, %4 : tensor<256x16xf32>, tensor<16x256xf32>) outs(%arg2 : tensor<256x256xf32>) -> tensor<256x256xf32>
+  return %5 : tensor<256x256xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @fill_extract_matmul_
+func @fill_extract_matmul_1324(%arg0: tensor<518x518xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false},
+                        %arg1: tensor<518x518xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false},
+                        %arg2: tensor<256x256xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = true})
+  -> tensor<256x256xf32>
+{
+  %c0 = constant 0 : index
+  %cst = constant 0.000000e+00 : f32
+  %cst_0 = constant 1.000000e+00 : f32
+  %0 = linalg.init_tensor [256, 256] : tensor<256x256xf32>
+
+  // CHECK: {__inplace_results_attr__ = ["false"]}
+  // CHECK-COUNT-4: {__inplace_results_attr__ = ["true"]}
+  %1 = linalg.fill(%cst, %0) : f32, tensor<256x256xf32> -> tensor<256x256xf32>
+  %3 = tensor.extract_slice %1[0, 0] [256, 16] [1, 1] : tensor<256x256xf32> to tensor<256x16xf32>
+
+  %2 = linalg.fill(%cst_0, %0) : f32, tensor<256x256xf32> -> tensor<256x256xf32>
+  %4 = tensor.extract_slice %2[0, 0] [16, 256] [1, 1] : tensor<256x256xf32> to tensor<16x256xf32>
+
+  %5 = linalg.matmul ins(%3, %4 : tensor<256x16xf32>, tensor<16x256xf32>) outs(%arg2 : tensor<256x256xf32>) -> tensor<256x256xf32>
+  return %5 : tensor<256x256xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @fill_extract_matmul_
+func @fill_extract_matmul_1342(%arg0: tensor<518x518xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false},
+                        %arg1: tensor<518x518xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false},
+                        %arg2: tensor<256x256xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = true})
+  -> tensor<256x256xf32>
+{
+  %c0 = constant 0 : index
+  %cst = constant 0.000000e+00 : f32
+  %cst_0 = constant 1.000000e+00 : f32
+  %0 = linalg.init_tensor [256, 256] : tensor<256x256xf32>
+
+  // CHECK: {__inplace_results_attr__ = ["false"]}
+  // CHECK-COUNT-4: {__inplace_results_attr__ = ["true"]}
+  %1 = linalg.fill(%cst, %0) : f32, tensor<256x256xf32> -> tensor<256x256xf32>
+  %3 = tensor.extract_slice %1[0, 0] [256, 16] [1, 1] : tensor<256x256xf32> to tensor<256x16xf32>
+
+  %4 = tensor.extract_slice %0[0, 0] [16, 256] [1, 1] : tensor<256x256xf32> to tensor<16x256xf32>
+  %2 = linalg.fill(%cst_0, %4) : f32, tensor<16x256xf32> -> tensor<16x256xf32>
+
+  %5 = linalg.matmul ins(%3, %2 : tensor<256x16xf32>, tensor<16x256xf32>) outs(%arg2 : tensor<256x256xf32>) -> tensor<256x256xf32>
+  return %5 : tensor<256x256xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @fill_extract_matmul_
+func @fill_extract_matmul_1423(%arg0: tensor<518x518xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false},
+                        %arg1: tensor<518x518xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false},
+                        %arg2: tensor<256x256xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = true})
+  -> tensor<256x256xf32>
+{
+  %c0 = constant 0 : index
+  %cst = constant 0.000000e+00 : f32
+  %cst_0 = constant 1.000000e+00 : f32
+  %0 = linalg.init_tensor [256, 256] : tensor<256x256xf32>
+
+  // CHECK: {__inplace_results_attr__ = ["false"]}
+  // CHECK-COUNT-4: {__inplace_results_attr__ = ["true"]}
+  %1 = linalg.fill(%cst, %0) : f32, tensor<256x256xf32> -> tensor<256x256xf32>
+
+  %4 = tensor.extract_slice %0[0, 0] [16, 256] [1, 1] : tensor<256x256xf32> to tensor<16x256xf32>
+  %2 = linalg.fill(%cst_0, %4) : f32, tensor<16x256xf32> -> tensor<16x256xf32>
+
+  %3 = tensor.extract_slice %1[0, 0] [256, 16] [1, 1] : tensor<256x256xf32> to tensor<256x16xf32>
+
+  %5 = linalg.matmul ins(%3, %2 : tensor<256x16xf32>, tensor<16x256xf32>) outs(%arg2 : tensor<256x256xf32>) -> tensor<256x256xf32>
+  return %5 : tensor<256x256xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @fill_extract_matmul_
+func @fill_extract_matmul_1432(%arg0: tensor<518x518xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false},
+                        %arg1: tensor<518x518xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false},
+                        %arg2: tensor<256x256xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = true})
+  -> tensor<256x256xf32>
+{
+  %c0 = constant 0 : index
+  %cst = constant 0.000000e+00 : f32
+  %cst_0 = constant 1.000000e+00 : f32
+  %0 = linalg.init_tensor [256, 256] : tensor<256x256xf32>
+
+  // CHECK: {__inplace_results_attr__ = ["false"]}
+  // CHECK-COUNT-4: {__inplace_results_attr__ = ["true"]}
+  %1 = linalg.fill(%cst, %0) : f32, tensor<256x256xf32> -> tensor<256x256xf32>
+
+  %4 = tensor.extract_slice %0[0, 0] [16, 256] [1, 1] : tensor<256x256xf32> to tensor<16x256xf32>
+  %3 = tensor.extract_slice %1[0, 0] [256, 16] [1, 1] : tensor<256x256xf32> to tensor<256x16xf32>
+
+  %2 = linalg.fill(%cst_0, %4) : f32, tensor<16x256xf32> -> tensor<16x256xf32>
+
+  %5 = linalg.matmul ins(%3, %2 : tensor<256x16xf32>, tensor<16x256xf32>) outs(%arg2 : tensor<256x256xf32>) -> tensor<256x256xf32>
+  return %5 : tensor<256x256xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @fill_extract_matmul_
+func @fill_extract_matmul_2134(
+    %arg0: tensor<518x518xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false},
+    %arg1: tensor<518x518xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false},
+    %arg2: tensor<256x256xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = true})
+  -> tensor<256x256xf32>
+{
+  %c0 = constant 0 : index
+  %cst = constant 0.000000e+00 : f32
+  %cst_0 = constant 1.000000e+00 : f32
+  %0 = linalg.init_tensor [256, 256] : tensor<256x256xf32>
+
+  // CHECK: {__inplace_results_attr__ = ["false"]}
+  // CHECK-COUNT-4: {__inplace_results_attr__ = ["true"]}
+  %2 = linalg.fill(%cst_0, %0) : f32, tensor<256x256xf32> -> tensor<256x256xf32>
+  %1 = linalg.fill(%cst, %0) : f32, tensor<256x256xf32> -> tensor<256x256xf32>
+
+  %3 = tensor.extract_slice %1[0, 0] [256, 16] [1, 1] : tensor<256x256xf32> to tensor<256x16xf32>
+  %4 = tensor.extract_slice %2[0, 0] [16, 256] [1, 1] : tensor<256x256xf32> to tensor<16x256xf32>
+
+  %5 = linalg.matmul ins(%3, %4 : tensor<256x16xf32>, tensor<16x256xf32>) outs(%arg2 : tensor<256x256xf32>) -> tensor<256x256xf32>
+  return %5 : tensor<256x256xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @fill_extract_matmul_
+func @fill_extract_matmul_2143(
+    %arg0: tensor<518x518xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false},
+    %arg1: tensor<518x518xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false},
+    %arg2: tensor<256x256xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = true})
+  -> tensor<256x256xf32>
+{
+  %c0 = constant 0 : index
+  %cst = constant 0.000000e+00 : f32
+  %cst_0 = constant 1.000000e+00 : f32
+  %0 = linalg.init_tensor [256, 256] : tensor<256x256xf32>
+
+  // CHECK: {__inplace_results_attr__ = ["false"]}
+  // CHECK-COUNT-4: {__inplace_results_attr__ = ["true"]}
+  %2 = linalg.fill(%cst_0, %0) : f32, tensor<256x256xf32> -> tensor<256x256xf32>
+  %1 = linalg.fill(%cst, %0) : f32, tensor<256x256xf32> -> tensor<256x256xf32>
+
+  %4 = tensor.extract_slice %2[0, 0] [16, 256] [1, 1] : tensor<256x256xf32> to tensor<16x256xf32>
+  %3 = tensor.extract_slice %1[0, 0] [256, 16] [1, 1] : tensor<256x256xf32> to tensor<256x16xf32>
+
+  %5 = linalg.matmul ins(%3, %4 : tensor<256x16xf32>, tensor<16x256xf32>) outs(%arg2 : tensor<256x256xf32>) -> tensor<256x256xf32>
+  return %5 : tensor<256x256xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @fill_extract_matmul_
+func @fill_extract_matmul_2314(
+    %arg0: tensor<518x518xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false},
+    %arg1: tensor<518x518xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false},
+    %arg2: tensor<256x256xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = true})
+  -> tensor<256x256xf32>
+{
+  %c0 = constant 0 : index
+  %cst = constant 0.000000e+00 : f32
+  %cst_0 = constant 1.000000e+00 : f32
+  %0 = linalg.init_tensor [256, 256] : tensor<256x256xf32>
+
+  // CHECK: {__inplace_results_attr__ = ["false"]}
+  // CHECK-COUNT-4: {__inplace_results_attr__ = ["true"]}
+  %2 = linalg.fill(%cst_0, %0) : f32, tensor<256x256xf32> -> tensor<256x256xf32>
+
+  %3 = tensor.extract_slice %0[0, 0] [256, 16] [1, 1] : tensor<256x256xf32> to tensor<256x16xf32>
+  %1 = linalg.fill(%cst, %3) : f32, tensor<256x16xf32> -> tensor<256x16xf32>
+
+  %4 = tensor.extract_slice %2[0, 0] [16, 256] [1, 1] : tensor<256x256xf32> to tensor<16x256xf32>
+
+  %5 = linalg.matmul ins(%1, %4 : tensor<256x16xf32>, tensor<16x256xf32>) outs(%arg2 : tensor<256x256xf32>) -> tensor<256x256xf32>
+  return %5 : tensor<256x256xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @fill_extract_matmul_
+func @fill_extract_matmul_2341(
+    %arg0: tensor<518x518xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false},
+    %arg1: tensor<518x518xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false},
+    %arg2: tensor<256x256xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = true})
+  -> tensor<256x256xf32>
+{
+  %c0 = constant 0 : index
+  %cst = constant 0.000000e+00 : f32
+  %cst_0 = constant 1.000000e+00 : f32
+  %0 = linalg.init_tensor [256, 256] : tensor<256x256xf32>
+
+  // CHECK: {__inplace_results_attr__ = ["false"]}
+  // CHECK-COUNT-4: {__inplace_results_attr__ = ["true"]}
+  %2 = linalg.fill(%cst_0, %0) : f32, tensor<256x256xf32> -> tensor<256x256xf32>
+
+  %3 = tensor.extract_slice %0[0, 0] [256, 16] [1, 1] : tensor<256x256xf32> to tensor<256x16xf32>
+  %4 = tensor.extract_slice %2[0, 0] [16, 256] [1, 1] : tensor<256x256xf32> to tensor<16x256xf32>
+  %1 = linalg.fill(%cst, %3) : f32, tensor<256x16xf32> -> tensor<256x16xf32>
+
+  %5 = linalg.matmul ins(%1, %4 : tensor<256x16xf32>, tensor<16x256xf32>) outs(%arg2 : tensor<256x256xf32>) -> tensor<256x256xf32>
+  return %5 : tensor<256x256xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @fill_extract_matmul_
+func @fill_extract_matmul_2413(
+    %arg0: tensor<518x518xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false},
+    %arg1: tensor<518x518xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false},
+    %arg2: tensor<256x256xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = true})
+  -> tensor<256x256xf32>
+{
+  %c0 = constant 0 : index
+  %cst = constant 0.000000e+00 : f32
+  %cst_0 = constant 1.000000e+00 : f32
+  %0 = linalg.init_tensor [256, 256] : tensor<256x256xf32>
+
+  // CHECK: {__inplace_results_attr__ = ["false"]}
+  // CHECK-COUNT-4: {__inplace_results_attr__ = ["true"]}
+  %2 = linalg.fill(%cst_0, %0) : f32, tensor<256x256xf32> -> tensor<256x256xf32>
+  %4 = tensor.extract_slice %2[0, 0] [16, 256] [1, 1] : tensor<256x256xf32> to tensor<16x256xf32>
+
+  %1 = linalg.fill(%cst, %0) : f32, tensor<256x256xf32> -> tensor<256x256xf32>
+  %3 = tensor.extract_slice %1[0, 0] [256, 16] [1, 1] : tensor<256x256xf32> to tensor<256x16xf32>
+
+  %5 = linalg.matmul ins(%3, %4 : tensor<256x16xf32>, tensor<16x256xf32>) outs(%arg2 : tensor<256x256xf32>) -> tensor<256x256xf32>
+  return %5 : tensor<256x256xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @fill_extract_matmul_
+func @fill_extract_matmul_2431(
+    %arg0: tensor<518x518xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false},
+    %arg1: tensor<518x518xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false},
+    %arg2: tensor<256x256xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = true})
+  -> tensor<256x256xf32>
+{
+  %c0 = constant 0 : index
+  %cst = constant 0.000000e+00 : f32
+  %cst_0 = constant 1.000000e+00 : f32
+  %0 = linalg.init_tensor [256, 256] : tensor<256x256xf32>
+
+  // CHECK: {__inplace_results_attr__ = ["false"]}
+  // CHECK-COUNT-4: {__inplace_results_attr__ = ["true"]}
+  %2 = linalg.fill(%cst_0, %0) : f32, tensor<256x256xf32> -> tensor<256x256xf32>
+  %4 = tensor.extract_slice %2[0, 0] [16, 256] [1, 1] : tensor<256x256xf32> to tensor<16x256xf32>
+
+  %3 = tensor.extract_slice %0[0, 0] [256, 16] [1, 1] : tensor<256x256xf32> to tensor<256x16xf32>
+  %1 = linalg.fill(%cst, %3) : f32, tensor<256x16xf32> -> tensor<256x16xf32>
+
+  %5 = linalg.matmul ins(%1, %4 : tensor<256x16xf32>, tensor<16x256xf32>) outs(%arg2 : tensor<256x256xf32>) -> tensor<256x256xf32>
+  return %5 : tensor<256x256xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @fill_extract_matmul_
+func @fill_extract_matmul_3124(
+    %arg0: tensor<518x518xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false},
+    %arg1: tensor<518x518xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false},
+    %arg2: tensor<256x256xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = true})
+  -> tensor<256x256xf32>
+{
+  %c0 = constant 0 : index
+  %cst = constant 0.000000e+00 : f32
+  %cst_0 = constant 1.000000e+00 : f32
+  %0 = linalg.init_tensor [256, 256] : tensor<256x256xf32>
+
+  // CHECK: {__inplace_results_attr__ = ["false"]}
+  // CHECK-COUNT-4: {__inplace_results_attr__ = ["true"]}
+  %3 = tensor.extract_slice %0[0, 0] [256, 16] [1, 1] : tensor<256x256xf32> to tensor<256x16xf32>
+  %1 = linalg.fill(%cst, %3) : f32, tensor<256x16xf32> -> tensor<256x16xf32>
+
+  %2 = linalg.fill(%cst_0, %0) : f32, tensor<256x256xf32> -> tensor<256x256xf32>
+  %4 = tensor.extract_slice %2[0, 0] [16, 256] [1, 1] : tensor<256x256xf32> to tensor<16x256xf32>
+
+  %5 = linalg.matmul ins(%1, %4 : tensor<256x16xf32>, tensor<16x256xf32>) outs(%arg2 : tensor<256x256xf32>) -> tensor<256x256xf32>
+  return %5 : tensor<256x256xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @fill_extract_matmul_
+func @fill_extract_matmul_3142(
+    %arg0: tensor<518x518xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false},
+    %arg1: tensor<518x518xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false},
+    %arg2: tensor<256x256xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = true})
+  -> tensor<256x256xf32>
+{
+  %c0 = constant 0 : index
+  %cst = constant 0.000000e+00 : f32
+  %cst_0 = constant 1.000000e+00 : f32
+  %0 = linalg.init_tensor [256, 256] : tensor<256x256xf32>
+
+  // CHECK: {__inplace_results_attr__ = ["false"]}
+  // CHECK-COUNT-4: {__inplace_results_attr__ = ["true"]}
+  %3 = tensor.extract_slice %0[0, 0] [256, 16] [1, 1] : tensor<256x256xf32> to tensor<256x16xf32>
+  %1 = linalg.fill(%cst, %3) : f32, tensor<256x16xf32> -> tensor<256x16xf32>
+
+  %4 = tensor.extract_slice %0[0, 0] [16, 256] [1, 1] : tensor<256x256xf32> to tensor<16x256xf32>
+  %2 = linalg.fill(%cst_0, %4) : f32, tensor<16x256xf32> -> tensor<16x256xf32>
+
+  %5 = linalg.matmul ins(%1, %2 : tensor<256x16xf32>, tensor<16x256xf32>) outs(%arg2 : tensor<256x256xf32>) -> tensor<256x256xf32>
+  return %5 : tensor<256x256xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @fill_extract_matmul_
+func @fill_extract_matmul_3214(
+    %arg0: tensor<518x518xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false},
+    %arg1: tensor<518x518xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false},
+    %arg2: tensor<256x256xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = true})
+  -> tensor<256x256xf32>
+{
+  %c0 = constant 0 : index
+  %cst = constant 0.000000e+00 : f32
+  %cst_0 = constant 1.000000e+00 : f32
+  %0 = linalg.init_tensor [256, 256] : tensor<256x256xf32>
+
+  // CHECK: {__inplace_results_attr__ = ["false"]}
+  // CHECK-COUNT-4: {__inplace_results_attr__ = ["true"]}
+  %3 = tensor.extract_slice %0[0, 0] [256, 16] [1, 1] : tensor<256x256xf32> to tensor<256x16xf32>
+  %2 = linalg.fill(%cst_0, %0) : f32, tensor<256x256xf32> -> tensor<256x256xf32>
+
+  %1 = linalg.fill(%cst, %3) : f32, tensor<256x16xf32> -> tensor<256x16xf32>
+  %4 = tensor.extract_slice %2[0, 0] [16, 256] [1, 1] : tensor<256x256xf32> to tensor<16x256xf32>
+
+  %5 = linalg.matmul ins(%1, %4 : tensor<256x16xf32>, tensor<16x256xf32>) outs(%arg2 : tensor<256x256xf32>) -> tensor<256x256xf32>
+  return %5 : tensor<256x256xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @fill_extract_matmul_
+func @fill_extract_matmul_3241(
+    %arg0: tensor<518x518xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false},
+    %arg1: tensor<518x518xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false},
+    %arg2: tensor<256x256xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = true})
+  -> tensor<256x256xf32>
+{
+  %c0 = constant 0 : index
+  %cst = constant 0.000000e+00 : f32
+  %cst_0 = constant 1.000000e+00 : f32
+  %0 = linalg.init_tensor [256, 256] : tensor<256x256xf32>
+
+  // CHECK: {__inplace_results_attr__ = ["false"]}
+  // CHECK-COUNT-4: {__inplace_results_attr__ = ["true"]}
+  %3 = tensor.extract_slice %0[0, 0] [256, 16] [1, 1] : tensor<256x256xf32> to tensor<256x16xf32>
+  %2 = linalg.fill(%cst_0, %0) : f32, tensor<256x256xf32> -> tensor<256x256xf32>
+
+  %4 = tensor.extract_slice %2[0, 0] [16, 256] [1, 1] : tensor<256x256xf32> to tensor<16x256xf32>
+  %1 = linalg.fill(%cst, %3) : f32, tensor<256x16xf32> -> tensor<256x16xf32>
+
+  %5 = linalg.matmul ins(%1, %4 : tensor<256x16xf32>, tensor<16x256xf32>) outs(%arg2 : tensor<256x256xf32>) -> tensor<256x256xf32>
+  return %5 : tensor<256x256xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @fill_extract_matmul_
+func @fill_extract_matmul_3412(
+    %arg0: tensor<518x518xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false},
+    %arg1: tensor<518x518xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false},
+    %arg2: tensor<256x256xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = true})
+  -> tensor<256x256xf32>
+{
+  %c0 = constant 0 : index
+  %cst = constant 0.000000e+00 : f32
+  %cst_0 = constant 1.000000e+00 : f32
+  %0 = linalg.init_tensor [256, 256] : tensor<256x256xf32>
+
+  // CHECK: {__inplace_results_attr__ = ["false"]}
+  // CHECK-COUNT-4: {__inplace_results_attr__ = ["true"]}
+  %3 = tensor.extract_slice %0[0, 0] [256, 16] [1, 1] : tensor<256x256xf32> to tensor<256x16xf32>
+  %4 = tensor.extract_slice %0[0, 0] [16, 256] [1, 1] : tensor<256x256xf32> to tensor<16x256xf32>
+
+  %1 = linalg.fill(%cst, %3) : f32, tensor<256x16xf32> -> tensor<256x16xf32>
+  %2 = linalg.fill(%cst_0, %4) : f32, tensor<16x256xf32> -> tensor<16x256xf32>
+
+  %5 = linalg.matmul ins(%1, %2 : tensor<256x16xf32>, tensor<16x256xf32>) outs(%arg2 : tensor<256x256xf32>) -> tensor<256x256xf32>
+  return %5 : tensor<256x256xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @fill_extract_matmul_
+func @fill_extract_matmul_3421(
+    %arg0: tensor<518x518xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false},
+    %arg1: tensor<518x518xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false},
+    %arg2: tensor<256x256xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = true})
+  -> tensor<256x256xf32>
+{
+  %c0 = constant 0 : index
+  %cst = constant 0.000000e+00 : f32
+  %cst_0 = constant 1.000000e+00 : f32
+  %0 = linalg.init_tensor [256, 256] : tensor<256x256xf32>
+
+  // CHECK: {__inplace_results_attr__ = ["false"]}
+  // CHECK-COUNT-4: {__inplace_results_attr__ = ["true"]}
+  %3 = tensor.extract_slice %0[0, 0] [256, 16] [1, 1] : tensor<256x256xf32> to tensor<256x16xf32>
+  %4 = tensor.extract_slice %0[0, 0] [16, 256] [1, 1] : tensor<256x256xf32> to tensor<16x256xf32>
+
+  %2 = linalg.fill(%cst_0, %4) : f32, tensor<16x256xf32> -> tensor<16x256xf32>
+  %1 = linalg.fill(%cst, %3) : f32, tensor<256x16xf32> -> tensor<256x16xf32>
+
+  %5 = linalg.matmul ins(%1, %2 : tensor<256x16xf32>, tensor<16x256xf32>) outs(%arg2 : tensor<256x256xf32>) -> tensor<256x256xf32>
+  return %5 : tensor<256x256xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @fill_extract_matmul_
+func @fill_extract_matmul_4123(
+    %arg0: tensor<518x518xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false},
+    %arg1: tensor<518x518xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false},
+    %arg2: tensor<256x256xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = true})
+  -> tensor<256x256xf32>
+{
+  %c0 = constant 0 : index
+  %cst = constant 0.000000e+00 : f32
+  %cst_0 = constant 1.000000e+00 : f32
+  %0 = linalg.init_tensor [256, 256] : tensor<256x256xf32>
+
+  // CHECK: {__inplace_results_attr__ = ["false"]}
+  // CHECK-COUNT-4: {__inplace_results_attr__ = ["true"]}
+  %4 = tensor.extract_slice %0[0, 0] [16, 256] [1, 1] : tensor<256x256xf32> to tensor<16x256xf32>
+
+  %1 = linalg.fill(%cst, %0) : f32, tensor<256x256xf32> -> tensor<256x256xf32>
+  %2 = linalg.fill(%cst_0, %4) : f32, tensor<16x256xf32> -> tensor<16x256xf32>
+
+  %3 = tensor.extract_slice %1[0, 0] [256, 16] [1, 1] : tensor<256x256xf32> to tensor<256x16xf32>
+
+  %5 = linalg.matmul ins(%3, %2 : tensor<256x16xf32>, tensor<16x256xf32>) outs(%arg2 : tensor<256x256xf32>) -> tensor<256x256xf32>
+  return %5 : tensor<256x256xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @fill_extract_matmul_
+func @fill_extract_matmul_4132(
+    %arg0: tensor<518x518xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false},
+    %arg1: tensor<518x518xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false},
+    %arg2: tensor<256x256xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = true})
+  -> tensor<256x256xf32>
+{
+  %c0 = constant 0 : index
+  %cst = constant 0.000000e+00 : f32
+  %cst_0 = constant 1.000000e+00 : f32
+  %0 = linalg.init_tensor [256, 256] : tensor<256x256xf32>
+
+  // CHECK: {__inplace_results_attr__ = ["false"]}
+  // CHECK-COUNT-4: {__inplace_results_attr__ = ["true"]}
+  %4 = tensor.extract_slice %0[0, 0] [16, 256] [1, 1] : tensor<256x256xf32> to tensor<16x256xf32>
+  %1 = linalg.fill(%cst, %0) : f32, tensor<256x256xf32> -> tensor<256x256xf32>
+
+  %3 = tensor.extract_slice %1[0, 0] [256, 16] [1, 1] : tensor<256x256xf32> to tensor<256x16xf32>
+  %2 = linalg.fill(%cst_0, %4) : f32, tensor<16x256xf32> -> tensor<16x256xf32>
+
+  %5 = linalg.matmul ins(%3, %2 : tensor<256x16xf32>, tensor<16x256xf32>) outs(%arg2 : tensor<256x256xf32>) -> tensor<256x256xf32>
+  return %5 : tensor<256x256xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @fill_extract_matmul_
+func @fill_extract_matmul_4213(
+    %arg0: tensor<518x518xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false},
+    %arg1: tensor<518x518xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false},
+    %arg2: tensor<256x256xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = true})
+  -> tensor<256x256xf32>
+{
+  %c0 = constant 0 : index
+  %cst = constant 0.000000e+00 : f32
+  %cst_0 = constant 1.000000e+00 : f32
+  %0 = linalg.init_tensor [256, 256] : tensor<256x256xf32>
+
+  // CHECK: {__inplace_results_attr__ = ["false"]}
+  // CHECK-COUNT-4: {__inplace_results_attr__ = ["true"]}
+  %4 = tensor.extract_slice %0[0, 0] [16, 256] [1, 1] : tensor<256x256xf32> to tensor<16x256xf32>
+
+  %2 = linalg.fill(%cst_0, %4) : f32, tensor<16x256xf32> -> tensor<16x256xf32>
+  %1 = linalg.fill(%cst, %0) : f32, tensor<256x256xf32> -> tensor<256x256xf32>
+
+  %3 = tensor.extract_slice %1[0, 0] [256, 16] [1, 1] : tensor<256x256xf32> to tensor<256x16xf32>
+
+  %5 = linalg.matmul ins(%3, %2 : tensor<256x16xf32>, tensor<16x256xf32>) outs(%arg2 : tensor<256x256xf32>) -> tensor<256x256xf32>
+  return %5 : tensor<256x256xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @fill_extract_matmul_
+func @fill_extract_matmul_4231(
+    %arg0: tensor<518x518xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false},
+    %arg1: tensor<518x518xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false},
+    %arg2: tensor<256x256xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = true})
+  -> tensor<256x256xf32>
+{
+  %c0 = constant 0 : index
+  %cst = constant 0.000000e+00 : f32
+  %cst_0 = constant 1.000000e+00 : f32
+  %0 = linalg.init_tensor [256, 256] : tensor<256x256xf32>
+
+  // CHECK: {__inplace_results_attr__ = ["false"]}
+  // CHECK-COUNT-4: {__inplace_results_attr__ = ["true"]}
+  %4 = tensor.extract_slice %0[0, 0] [16, 256] [1, 1] : tensor<256x256xf32> to tensor<16x256xf32>
+
+  %2 = linalg.fill(%cst_0, %4) : f32, tensor<16x256xf32> -> tensor<16x256xf32>
+
+  %3 = tensor.extract_slice %0[0, 0] [256, 16] [1, 1] : tensor<256x256xf32> to tensor<256x16xf32>
+  %1 = linalg.fill(%cst, %3) : f32, tensor<256x16xf32> -> tensor<256x16xf32>
+
+  %5 = linalg.matmul ins(%1, %2 : tensor<256x16xf32>, tensor<16x256xf32>) outs(%arg2 : tensor<256x256xf32>) -> tensor<256x256xf32>
+  return %5 : tensor<256x256xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @fill_extract_matmul_
+func @fill_extract_matmul_4312(
+    %arg0: tensor<518x518xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false},
+    %arg1: tensor<518x518xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false},
+    %arg2: tensor<256x256xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = true})
+  -> tensor<256x256xf32>
+{
+  %c0 = constant 0 : index
+  %cst = constant 0.000000e+00 : f32
+  %cst_0 = constant 1.000000e+00 : f32
+  %0 = linalg.init_tensor [256, 256] : tensor<256x256xf32>
+
+  // CHECK: {__inplace_results_attr__ = ["false"]}
+  // CHECK-COUNT-4: {__inplace_results_attr__ = ["true"]}
+  %4 = tensor.extract_slice %0[0, 0] [16, 256] [1, 1] : tensor<256x256xf32> to tensor<16x256xf32>
+  %3 = tensor.extract_slice %0[0, 0] [256, 16] [1, 1] : tensor<256x256xf32> to tensor<256x16xf32>
+
+  %1 = linalg.fill(%cst, %3) : f32, tensor<256x16xf32> -> tensor<256x16xf32>
+  %2 = linalg.fill(%cst_0, %4) : f32, tensor<16x256xf32> -> tensor<16x256xf32>
+
+  %5 = linalg.matmul ins(%1, %2 : tensor<256x16xf32>, tensor<16x256xf32>) outs(%arg2 : tensor<256x256xf32>) -> tensor<256x256xf32>
+  return %5 : tensor<256x256xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @fill_extract_matmul_
+func @fill_extract_matmul_4321(
+    %arg0: tensor<518x518xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false},
+    %arg1: tensor<518x518xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false},
+    %arg2: tensor<256x256xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = true})
+  -> tensor<256x256xf32>
+{
+  %c0 = constant 0 : index
+  %cst = constant 0.000000e+00 : f32
+  %cst_0 = constant 1.000000e+00 : f32
+  %0 = linalg.init_tensor [256, 256] : tensor<256x256xf32>
+
+  // CHECK: {__inplace_results_attr__ = ["false"]}
+  // CHECK-COUNT-4: {__inplace_results_attr__ = ["true"]}
+  %4 = tensor.extract_slice %0[0, 0] [16, 256] [1, 1] : tensor<256x256xf32> to tensor<16x256xf32>
+  %3 = tensor.extract_slice %0[0, 0] [256, 16] [1, 1] : tensor<256x256xf32> to tensor<256x16xf32>
+
+  %2 = linalg.fill(%cst_0, %4) : f32, tensor<16x256xf32> -> tensor<16x256xf32>
+  %1 = linalg.fill(%cst, %3) : f32, tensor<256x16xf32> -> tensor<256x16xf32>
+
+  %5 = linalg.matmul ins(%1, %2 : tensor<256x16xf32>, tensor<16x256xf32>) outs(%arg2 : tensor<256x256xf32>) -> tensor<256x256xf32>
+  return %5 : tensor<256x256xf32>
+}

diff --git a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-analysis.mlir b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-analysis.mlir
index b56a9741cadd4..a435a2539220c 100644
--- a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-analysis.mlir
+++ b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-analysis.mlir
@@ -639,7 +639,7 @@ builtin.func @matmul_on_tensors(
   //      CHECK: linalg.fill
   // CHECK-SAME: {__inplace_results_attr__ = ["false"]}
   //      CHECK: linalg.fill
-  // CHECK-SAME: {__inplace_results_attr__ = ["false"]}
+  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
   %8 = linalg.fill(%cst_0, %7) : f32, tensor<256x256xf32> -> tensor<256x256xf32>
   %11 = linalg.fill(%cst_1, %7) : f32, tensor<256x256xf32> -> tensor<256x256xf32>
 
@@ -673,9 +673,9 @@ builtin.func @matmul_on_tensors(
   %7 = linalg.init_tensor [256, 256] : tensor<256x256xf32>
 
   //     CHECK: linalg.fill
-  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+  // CHECK-SAME: {__inplace_results_attr__ = ["false"]}
   //      CHECK: vector.transfer_write
-  // CHECK-SAME: {__inplace_results_attr__ = ["false"]
+  // CHECK-SAME: {__inplace_results_attr__ = ["true"]
   %8 = linalg.fill(%cst_0, %7) : f32, tensor<256x256xf32> -> tensor<256x256xf32>
   %9 = vector.transfer_read %arg0[%c0, %c0], %cst_0 {in_bounds = [false, true]} : tensor<518x518xf32>, vector<256x256xf32>
   %10 = vector.transfer_write %9, %8[%c0, %c0] {in_bounds = [true, true]} : vector<256x256xf32>, tensor<256x256xf32>
@@ -683,7 +683,7 @@ builtin.func @matmul_on_tensors(
   //      CHECK: linalg.fill
   // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
   //      CHECK: vector.transfer_write
-  // CHECK-SAME: {__inplace_results_attr__ = ["false"]
+  // CHECK-SAME: {__inplace_results_attr__ = ["true"]
   %11 = linalg.fill(%cst_1, %7) : f32, tensor<256x256xf32> -> tensor<256x256xf32>
   %12 = vector.transfer_read %arg1[%c0, %c0], %cst_0 {in_bounds = [false, true]} : tensor<518x518xf32>, vector<256x256xf32>
   %13 = vector.transfer_write %12, %11[%c0, %c0] {in_bounds = [true, true]} : vector<256x256xf32>, tensor<256x256xf32>


        


More information about the Mlir-commits mailing list