[Mlir-commits] [mlir] 55c274d - [mlir][Linalg] Drop comprehensive-func-bufferize (12/n)

Nicolas Vasilache llvmlistbot at llvm.org
Thu Jul 1 04:40:38 PDT 2021


Author: Nicolas Vasilache
Date: 2021-07-01T11:36:24Z
New Revision: 55c274d7d30eb4de129a70bf48a063e740b71c9c

URL: https://github.com/llvm/llvm-project/commit/55c274d7d30eb4de129a70bf48a063e740b71c9c
DIFF: https://github.com/llvm/llvm-project/commit/55c274d7d30eb4de129a70bf48a063e740b71c9c.diff

LOG: [mlir][Linalg] Drop comprehensive-func-bufferize (12/n)

This revision drops the comprehensive bufferization Function pass, which has issues when trying to bufferize constants.
Instead, only the comprehensive-module-bufferize pass is supported by default.

Differential Revision: https://reviews.llvm.org/D105228
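
As an illustration of the retained entry point, a minimal sketch (condensed
from the fill_inplace test removed below; CHECK lines elided):

  // RUN: mlir-opt %s -linalg-comprehensive-module-bufferize

  func @fill_inplace(%A : tensor<?xf32> {linalg.inplaceable = true}) -> tensor<?xf32> {
    %f0 = constant 0.0 : f32
    // %A is annotated inplaceable, so the fill reuses its buffer; no alloc is
    // created.
    %r = linalg.fill(%f0, %A) : f32, tensor<?xf32> -> tensor<?xf32>
    return %r : tensor<?xf32>
  }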

Added: 
    

Modified: 
    mlir/include/mlir/Dialect/Linalg/Passes.h
    mlir/include/mlir/Dialect/Linalg/Passes.td
    mlir/lib/Dialect/Linalg/Transforms/ComprehensiveBufferize.cpp
    mlir/test/Dialect/Linalg/comprehensive-module-bufferize-analysis.mlir
    mlir/test/Dialect/Linalg/comprehensive-module-bufferize-invalid.mlir
    mlir/test/Dialect/Linalg/comprehensive-module-bufferize.mlir

Removed: 
    mlir/test/Dialect/Linalg/comprehensive-func-bufferize-analysis-invalid.mlir
    mlir/test/Dialect/Linalg/comprehensive-func-bufferize-analysis.mlir
    mlir/test/Dialect/Linalg/comprehensive-func-bufferize.mlir


################################################################################
diff  --git a/mlir/include/mlir/Dialect/Linalg/Passes.h b/mlir/include/mlir/Dialect/Linalg/Passes.h
index d80eb9a0652de..27bb50d5a2f2c 100644
--- a/mlir/include/mlir/Dialect/Linalg/Passes.h
+++ b/mlir/include/mlir/Dialect/Linalg/Passes.h
@@ -56,18 +56,13 @@ std::unique_ptr<OperationPass<FuncOp>> createConvertLinalgToParallelLoopsPass();
 /// Placeholder for now; this is NYI.
 std::unique_ptr<OperationPass<FuncOp>> createConvertLinalgToAffineLoopsPass();
 
-/// Create a pass that bufferizes the body of a FuncOp and tries to reuse the
-/// buffers for those arguments that:
-///   a) have been annotated 'inplaceable' and
-///   b) whose buffer uses would be free of memory hazards.
-std::unique_ptr<Pass> createLinalgComprehensiveFuncBufferizePass();
-
 /// This pass implements a cross-dialect bufferization approach and performs an
 /// analysis to determine which op operands and results may be bufferized in the
 /// same buffers. The analysis is performed on topologically sorted CallOp and
 /// FuncOp within a module. It provides analyses and bufferization across
-/// function boundaries. Within a single function body, the bufferization used
-/// is that provided by `LinalgComprehensiveFuncBufferizePass`.
+/// function boundaries. Within a function boundary, the analysis is performed
+/// on SSA use-def chains starting from function operands that are annotated
+/// with the 'inplaceable' attribute.
 std::unique_ptr<Pass> createLinalgComprehensiveModuleBufferizePass();
 
 /// Create a pass to convert Linalg operations which work on tensors to use

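What the new comment describes, concretely: starting from operands annotated
'inplaceable', the analysis walks SSA use-def chains and decides per write
whether the buffer may be reused. A sketch (adapted from the insert_slice_fun
test relocated below):

  func @insert_slice_fun(
      %A : tensor<?xf32>,
      %B : tensor<?xf32> {linalg.inplaceable = true},
      %C : tensor<4xf32>) -> (tensor<?xf32>, tensor<?xf32>)
  {
    // Writing into non-inplaceable %A must bufferize out of place.
    %r0 = tensor.insert_slice %C into %A[0][4][1] : tensor<4xf32> into tensor<?xf32>
    // Writing into inplaceable %B reuses its buffer.
    %r1 = tensor.insert_slice %C into %B[0][4][1] : tensor<4xf32> into tensor<?xf32>
    return %r0, %r1 : tensor<?xf32>, tensor<?xf32>
  }
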
diff  --git a/mlir/include/mlir/Dialect/Linalg/Passes.td b/mlir/include/mlir/Dialect/Linalg/Passes.td
index 3d9833061a090..c638294b12109 100644
--- a/mlir/include/mlir/Dialect/Linalg/Passes.td
+++ b/mlir/include/mlir/Dialect/Linalg/Passes.td
@@ -22,26 +22,6 @@ def ConvertElementwiseToLinalg : FunctionPass<"convert-elementwise-to-linalg"> {
   let dependentDialects = ["linalg::LinalgDialect", "memref::MemRefDialect"];
 }
 
-def LinalgComprehensiveFuncBufferize :
-    FunctionPass<"linalg-comprehensive-func-bufferize"> {
-  let summary = "Bufferize (tensor into memref) the body of a FuncOp and try "
-    "to reuse the buffers for those arguments that "
-    "a) have been annotated 'inplaceable' and "
-    "b) whose buffer uses would be free of memory hazards";
-  let description = [{
-    This pass implements a cross-dialect bufferization approach and performs an
-    analysis to determine which op operands and results may be bufferized in the
-    same buffers. The analysis is performed on SSA use-def chains starting from
-    function operands that are annotated with the 'inplaceable' attribute.
-  }];
-  let options = [
-    Option<"testAnalysisOnly", "test-analysis-only", "bool",
-            /*default=*/"false",
-           "Only runs inplaceability analysis (for testing purposes only)">
-  ];
-  let constructor = "mlir::createLinalgComprehensiveFuncBufferizePass()";
-}
-
 def LinalgComprehensiveModuleBufferize :
     Pass<"linalg-comprehensive-module-bufferize", "ModuleOp"> {
   let summary = "Bufferize (tensor into memref) for a Module.";
@@ -50,8 +30,9 @@ def LinalgComprehensiveModuleBufferize :
     analysis to determine which op operands and results may be bufferized in the
     same buffers. The analysis is performed on topologically sorted CallOp and
     FuncOp within a module. It provides analyses and bufferization across
-    function boundaries. Within a single function body, the bufferization used
-    is that provided by `-linalg-comprehensive-func-bufferize`.
+    function boundaries. Within a function boundary, the analysis is performed
+    on SSA use-def chains starting from function operands that are annotated
+    with the 'inplaceable' attribute.
   }];
   let options = [
     Option<"testAnalysisOnly", "test-analysis-only", "bool",

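The surviving test-analysis-only option runs just the inplaceability analysis
and leaves __inplace_results_attr__ markers on results, which is what the
relocated analysis tests check:

  // RUN: mlir-opt %s -linalg-comprehensive-module-bufferize=test-analysis-only -split-input-file | FileCheck %s

  //     CHECK: tensor.insert_slice
  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
  %r1 = tensor.insert_slice %C into %B[0][4][1] : tensor<4xf32> into tensor<?xf32>
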
diff  --git a/mlir/lib/Dialect/Linalg/Transforms/ComprehensiveBufferize.cpp b/mlir/lib/Dialect/Linalg/Transforms/ComprehensiveBufferize.cpp
index 03191a85e506c..dec08dfd4da2c 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/ComprehensiveBufferize.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/ComprehensiveBufferize.cpp
@@ -979,10 +979,10 @@ bool BufferizationAliasInfo::isSourceEquivalentToAMatchingExtractSliceOp(
 /// Apply `fun` to all the members of the equivalence class of `v`.
 void BufferizationAliasInfo::applyOnEquivalenceClass(
     Value v, function_ref<void(Value)> fun) const {
-  for (auto it = equivalentInfo.findLeader(v),
-            eit = equivalentInfo.member_end();
-       it != eit; ++it) {
-    fun(v);
+  auto leaderIt = equivalentInfo.findLeader(v);
+  for (auto mit = leaderIt, meit = equivalentInfo.member_end(); mit != meit;
+       ++mit) {
+    fun(mit->v);
   }
 }
 
@@ -1485,9 +1485,8 @@ bufferize(OpBuilder &b, CallOpInterface callOp, BlockAndValueMapping &bvm,
               getEquivalentEnclosingFuncBBArg(returnVal, aliasInfo)) {
         Value oldRes = callOp->getResult(returnOperand.getOperandNumber());
         int64_t idx = bbArg.getArgNumber();
-        Value buffer = bvm.lookupOrNull(callOp->getOperand(idx));
-        if (!buffer)
-          return callOp->emitError() << "operand #" << idx << " not bufferized";
+        Value buffer = lookup(bvm, callOp->getOperand(idx));
+        assert(buffer && "expected bufferized value");
         // Add CallOp operand/result equivalence: this is interprocedural info.
         aliasInfo.insertNewBufferEquivalence(oldRes, buffer);
         map(bvm, oldRes, buffer);
@@ -1504,11 +1503,11 @@ bufferize(OpBuilder &b, CallOpInterface callOp, BlockAndValueMapping &bvm,
         continue;
       }
 
-      // TODO: Need to hoist above function boundary and add to
-      // `hoistedArgumentTypes`.
-      if (Operation *allocOp = getEquivalentAlloc(returnVal, aliasInfo))
-        return allocOp->emitError()
-               << " needs hoist across function boundary\n";
+      // TODO: Need to hoist above function boundary.
+      if (Operation *allocOp = getEquivalentAlloc(returnVal, aliasInfo)) {
+        hoistedArguments.push_back(allocOp->getResult(0));
+        continue;
+      }
 
       // Other cases legitimately need to return a tensor; this is currently not
       // supported. For instance, if hoisting across function boundary has
@@ -1518,13 +1517,14 @@ bufferize(OpBuilder &b, CallOpInterface callOp, BlockAndValueMapping &bvm,
 
       int64_t returnIdx = returnOperand.getOperandNumber();
       return returnOp->emitError()
-             << " bufferize result #" << returnIdx << "\n";
+             << "buffer result #" << returnIdx << " not produced by an alloc\n";
     }
   }
 
   // 2. Compute bufferized FunctionType.
   SmallVector<Type> argumentTypes{callOp->getOperandTypes()};
-  llvm::append_range(argumentTypes, ValueRange{hoistedArguments}.getTypes());
+  ValueRange hoistedArgs{hoistedArguments};
+  llvm::append_range(argumentTypes, hoistedArgs.getTypes());
   // Get the bufferized FunctionType for funcOp or construct it if not yet
   // available.
   FunctionType bufferizedFuncType = getOrCreateBufferizedFunctionType(
@@ -1543,8 +1543,8 @@ bufferize(OpBuilder &b, CallOpInterface callOp, BlockAndValueMapping &bvm,
 
     // Tensor operands are guaranteed to have been bufferized.
     int64_t idx = opOperand.getOperandNumber();
-    Value buffer = bvm.lookupOrNull(tensorOperand);
-    assert(buffer && " missing buffer for operand");
+    Value buffer = lookup(bvm, tensorOperand);
+    assert(buffer && "expected bufferized value");
 
     // Caller / callee type mismatch is handled with a CastOp.
     auto memRefType = bufferizedFuncType.getInput(idx);
@@ -1592,7 +1592,7 @@ static LogicalResult bufferize(OpBuilder &b, tensor::CastOp castOp,
           ? rankedMemRefType.getAffineMaps()
           : ArrayRef<AffineMap>{};
   Type memRefType = getContiguousOrUnrankedMemRefType(
-      castOp.getResult().getType(), {}, memorySpace);
+      castOp.getResult().getType(), affineMaps, memorySpace);
   Value res = b.create<memref::CastOp>(castOp.getLoc(), memRefType,
                                        lookup(bvm, castOp.source()));
   aliasInfo.insertNewBufferEquivalence(res, castOp.getResult());
@@ -2176,64 +2176,21 @@ static LogicalResult bufferizeFuncOpInternals(
   return failure(result.wasInterrupted());
 }
 
-namespace {
-struct LinalgComprehensiveFuncBufferize
-    : public LinalgComprehensiveFuncBufferizeBase<
-          LinalgComprehensiveFuncBufferize> {
-  void runOnFunction() override;
-
-  void getDependentDialects(DialectRegistry &registry) const override {
-    registry.insert<linalg::LinalgDialect, memref::MemRefDialect>();
-  }
-};
-} // end namespace
-
-void LinalgComprehensiveFuncBufferize::runOnFunction() {
-  auto funcOp = getFunction();
-
-  // Analysis phase.
-  DominanceInfo domInfo(funcOp);
-  BufferizationAliasInfo aliasInfo(funcOp);
-  // If the analysis fails, just return. This is expected to reset the IR and no
-  // single OpResult should be marked inPlace.
-  if (failed(inPlaceAnalysisFuncOpBody(funcOp, aliasInfo, domInfo))) {
-    signalPassFailure();
-    return;
-  }
-
-  if (testAnalysisOnly)
-    return;
-
-  // Bufferization phase.
-  BlockAndValueMapping bvm;
-  DenseMap<FuncOp, FunctionType> bufferizedFunctionTypes;
-  if (failed(bufferizeFuncOpInternals(funcOp, bvm, aliasInfo,
-                                      bufferizedFunctionTypes)))
-    signalPassFailure();
-
-  // Post-pass cleanup of inplaceable attributes.
-  funcOp.walk([&](Operation *op) { op->removeAttr(kInPlaceResultsAttrName); });
-}
-
-std::unique_ptr<Pass> mlir::createLinalgComprehensiveFuncBufferizePass() {
-  return std::make_unique<LinalgComprehensiveFuncBufferize>();
-}
-
 //===----------------------------------------------------------------------===//
 // Bufferization entry-point for modules.
 //===----------------------------------------------------------------------===//
 
-/// Return the op with Allocate MemoryEffect if `v` is equivalent to an such
+/// Return the op with Allocate MemoryEffect if `v` is equivalent to such an
 /// op. Return null otherwise.
 static Operation *getEquivalentAlloc(Value value,
                                      const BufferizationAliasInfo &aliasInfo) {
-  Operation *res;
+  Operation *res = nullptr;
   aliasInfo.applyOnEquivalenceClass(value, [&](Value v) {
     if (!res)
       if (auto interface =
               dyn_cast_or_null<MemoryEffectOpInterface>(v.getDefiningOp()))
         if (auto effect =
-                interface.getEffectOnValue<MemoryEffects::Allocate>(value))
+                interface.getEffectOnValue<MemoryEffects::Allocate>(v))
           res = v.getDefiningOp();
   });
   return res;
@@ -2249,9 +2206,12 @@ getEquivalentEnclosingFuncBBArg(Value v,
   if (!funcOp)
     funcOp = op->getParentOfType<FuncOp>();
   assert(funcOp && "expected non-null FuncOp");
-  for (BlockArgument bbArg : funcOp.getArguments())
+  for (BlockArgument bbArg : funcOp.getArguments()) {
+    if (!bbArg.getType().isa<RankedTensorType>())
+      continue;
     if (aliasInfo.areEquivalentBufferizedValues(v, bbArg))
       return bbArg;
+  }
   return nullptr;
 }
 
@@ -2292,9 +2252,6 @@ static LogicalResult bufferizeFuncOpBoundary(
   //    externally).
   // -> Figure out a better layering.
   TypeRange resultTypes;
-  FunctionType bufferizedFuncType =
-      getOrCreateBufferizedFunctionType(funcOp, funcOp.getType().getInputs(),
-                                        resultTypes, bufferizedFunctionTypes);
 
   // Corner case: Bodiless FuncOp
   // ============================
@@ -2305,6 +2262,9 @@ static LogicalResult bufferizeFuncOpBoundary(
     if (llvm::any_of(funcOp.getType().getResults(), isaTensor))
       return funcOp->emitError() << "cannot bufferize bodiless function that "
                                  << "returns a tensor";
+    FunctionType bufferizedFuncType =
+        getOrCreateBufferizedFunctionType(funcOp, funcOp.getType().getInputs(),
+                                          TypeRange{}, bufferizedFunctionTypes);
     funcOp.setType(bufferizedFuncType);
     LLVM_DEBUG(DBGS() << "End bufferizeFuncOpBoundary no fun body: " << funcOp);
     return success();
@@ -2323,16 +2283,29 @@ static LogicalResult bufferizeFuncOpBoundary(
     Value returnVal = returnOperand.get();
     if (getEquivalentEnclosingFuncBBArg(returnVal, aliasInfo))
       continue;
-    // TODO: Need to hoist above function boundary. If this is not possible due
-    // to data-dependent sizes, we need a better type than memref.
-    if (Operation *allocOp = getEquivalentAlloc(returnVal, aliasInfo))
-      return allocOp->emitError() << " needs hoist across function boundary\n";
+
+    // TODO: Need to hoist above function boundary.
+    if (Operation *allocOp = getEquivalentAlloc(returnVal, aliasInfo)) {
+      returnValues.push_back(allocOp->getResult(0));
+      continue;
+    }
+
+    // Other cases legitimately need to return a tensor; this is currently not
+    // supported. For instance, if hoisting across function boundary has
+    // failed, it may be due to e.g. data-dependent sizes. In such a case, we
+    // would need a better type than memref.
     int64_t returnIdx = returnOperand.getOperandNumber();
-    return returnOp->emitError() << " bufferize result #" << returnIdx << "\n";
+    return returnOp->emitError()
+           << "buffer result #" << returnIdx << " not produced by an alloc\n";
   }
 
   // 2. Rewrite the terminator without the inPlace bufferizable values.
-  OpBuilder(returnOp).create<ReturnOp>(returnOp.getLoc(), returnValues);
+  ValueRange retValues{returnValues};
+  FunctionType bufferizedFuncType = getOrCreateBufferizedFunctionType(
+      funcOp, funcOp.getType().getInputs(), retValues.getTypes(),
+      bufferizedFunctionTypes);
+  OpBuilder b(returnOp);
+  b.create<ReturnOp>(returnOp.getLoc(), returnValues);
   returnOp->erase();
 
   // 3. Rewrite the bbArgs.

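Net effect at function boundaries after this change: a result that is
equivalent to an inplaceable bbArg is dropped from the terminator, a result
equivalent to an alloc is returned as a memref (instead of the old "needs
hoist across function boundary" error), and anything else now fails with
"buffer result #N not produced by an alloc". A sketch of the signature
rewrite for the bbArg-equivalent case (#map stands in for the strided layout
the pass picks):

  // Before: the returned tensor is equivalent to the inplaceable bbArg %A.
  func @f(%A : tensor<?xf32> {linalg.inplaceable = true}) -> tensor<?xf32>

  // After bufferizeFuncOpBoundary: the equivalent result is dropped and the
  // argument is rewritten to a memref.
  func @f(%A : memref<?xf32, #map>)
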
diff  --git a/mlir/test/Dialect/Linalg/comprehensive-func-bufferize-analysis-invalid.mlir b/mlir/test/Dialect/Linalg/comprehensive-func-bufferize-analysis-invalid.mlir
deleted file mode 100644
index 41e698f97c873..0000000000000
--- a/mlir/test/Dialect/Linalg/comprehensive-func-bufferize-analysis-invalid.mlir
+++ /dev/null
@@ -1,26 +0,0 @@
-// RUN: mlir-opt %s -linalg-comprehensive-func-bufferize=test-analysis-only -split-input-file -verify-diagnostics
-
-// -----
-
-func @scf_for(%A : tensor<?xf32>,
-              %B : tensor<?xf32> {linalg.inplaceable = true},
-              %C : tensor<4xf32>,
-              %lb : index, %ub : index, %step : index)
-  -> (tensor<?xf32>, tensor<?xf32>)
-{
-  %r0:2 = scf.for %i = %lb to %ub step %step iter_args(%tA = %A, %tB = %B)
-      -> (tensor<?xf32>, tensor<?xf32>)
-  {
-    %ttA = tensor.insert_slice %C into %tA[0][4][1] : tensor<4xf32> into tensor<?xf32>
-    %ttB = tensor.insert_slice %C into %tB[0][4][1] : tensor<4xf32> into tensor<?xf32>
-
-    // Throw a wrench in the system by swapping yielded values: this results in a
-    // ping-pong of values at each iteration on which we currently want to fail.
-
-    // expected-error @+1 {{Yield operand #1 does not bufferize to an equivalent buffer}}
-    scf.yield %ttB, %ttA : tensor<?xf32>, tensor<?xf32>
-  }
-
-  return %r0#0, %r0#1: tensor<?xf32>, tensor<?xf32>
-}
-

diff  --git a/mlir/test/Dialect/Linalg/comprehensive-func-bufferize-analysis.mlir b/mlir/test/Dialect/Linalg/comprehensive-func-bufferize-analysis.mlir
deleted file mode 100644
index 5234d85b0b5b1..0000000000000
--- a/mlir/test/Dialect/Linalg/comprehensive-func-bufferize-analysis.mlir
+++ /dev/null
@@ -1,474 +0,0 @@
-// RUN: mlir-opt %s -linalg-comprehensive-func-bufferize=test-analysis-only -split-input-file | FileCheck %s
-
-//===----------------------------------------------------------------------===//
-// Simple cases
-//===----------------------------------------------------------------------===//
-
-// -----
-
-// CHECK-LABEL: func @extract_slice_fun
-func @extract_slice_fun(%A : tensor<?xf32>, %B : tensor<?xf32> {linalg.inplaceable = true})
-  -> (tensor<4xf32>, tensor<8xf32>)
-{
-  // tensor.extract_slice is not used in a write, it is not compelled to
-  // bufferize out of place. Let callers decide whether they want to create
-  // aliasing subviews at all call sites or whether they allocate.
-  // This is true irrespective of whether the function argument is inplaceable.
-  //     CHECK: tensor.extract_slice
-  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
-  %r0 = tensor.extract_slice %A[0][4][1] : tensor<?xf32> to tensor<4xf32>
-
-  //     CHECK: tensor.extract_slice
-  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
-  %r1 = tensor.extract_slice %B[0][8][1] : tensor<?xf32> to tensor<8xf32>
-
-  return %r0, %r1: tensor<4xf32>, tensor<8xf32>
-}
-
-// -----
-
-// CHECK-LABEL: func @insert_slice_fun
-func @insert_slice_fun(
-    %A : tensor<?xf32>,
-    %B : tensor<?xf32> {linalg.inplaceable = true},
-    %C : tensor<4xf32>)
-  -> (tensor<?xf32>, tensor<?xf32>)
-{
-  // must bufferize out of place.
-  //     CHECK: tensor.insert_slice
-  // CHECK-SAME: {__inplace_results_attr__ = ["false"]}
-  %r0 = tensor.insert_slice %C into %A[0][4][1] : tensor<4xf32> into tensor<?xf32>
-
-  // bufferizes inplace.
-  //     CHECK: tensor.insert_slice
-  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
-  %r1 = tensor.insert_slice %C into %B[0][4][1] : tensor<4xf32> into tensor<?xf32>
-
-  return %r0, %r1: tensor<?xf32>, tensor<?xf32>
-}
-
-// -----
-
-// CHECK-LABEL: func @conflict_on_B
-func @conflict_on_B(
-    %A : tensor<4x4xf32> {linalg.inplaceable = true},
-    %B : tensor<4x4xf32> {linalg.inplaceable = true})
-  -> (tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>)
-{
-  // matmul output operand interferes with input operand.
-  //     CHECK: linalg.matmul
-  // CHECK-SAME: {__inplace_results_attr__ = ["false"]}
-  %C = linalg.matmul  ins(%A, %B: tensor<4x4xf32>, tensor<4x4xf32>)
-                     outs(%B: tensor<4x4xf32>)
-    -> tensor<4x4xf32>
-
-  // matmul output operand interferes with input operand.
-  //     CHECK: linalg.matmul
-  // CHECK-SAME: {__inplace_results_attr__ = ["false"]}
-  %D = linalg.matmul  ins(%B, %A: tensor<4x4xf32>, tensor<4x4xf32>)
-                     outs(%B: tensor<4x4xf32>)
-    -> tensor<4x4xf32>
-
-  // matmul output operand does not interfere with input operand.
-  //     CHECK: linalg.matmul
-  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
-  %E = linalg.matmul  ins(%A, %A: tensor<4x4xf32>, tensor<4x4xf32>)
-                     outs(%B: tensor<4x4xf32>)
-    -> tensor<4x4xf32>
-
-  return %C, %D, %E: tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>
-}
-
-//===----------------------------------------------------------------------===//
-// Length-1 producer-consumer cases.
-//===----------------------------------------------------------------------===//
-
-// -----
-
-// CHECK-LABEL: func @extract_slice_extract_slice
-func @extract_slice_extract_slice(
-    %A : tensor<?xf32> {linalg.inplaceable = true}, %B : tensor<?xf32>)
-  -> (tensor<2xf32>, tensor<2xf32>)
-{
-  // tensor.extract_slice is not used in a write, so it is not compelled to
-  // bufferize out of place. Let callers decide whether they want to create
-  // aliasing subviews at all call sites or whether they allocate.
-  // This is true irrespective of whether the function argument is inplaceable.
-  // CHECK: {__inplace_results_attr__ = ["true"]}
-  %r0 = tensor.extract_slice %A[0][4][1] : tensor<?xf32> to tensor<4xf32>
-
-  // CHECK: {__inplace_results_attr__ = ["true"]}
-  %r1 = tensor.extract_slice %r0[0][2][1] : tensor<4xf32> to tensor<2xf32>
-
-  // CHECK: {__inplace_results_attr__ = ["true"]}
-  %r2 = tensor.extract_slice %B[0][4][1] : tensor<?xf32> to tensor<4xf32>
-
-  // CHECK: {__inplace_results_attr__ = ["true"]}
-  %r3 = tensor.extract_slice %r2[0][2][1] : tensor<4xf32> to tensor<2xf32>
-
-  return %r1, %r3: tensor<2xf32>, tensor<2xf32>
-}
-
-// -----
-
-// CHECK-LABEL: func @insert_slice_insert_slice
-func @insert_slice_insert_slice(
-    %A : tensor<?xf32> {linalg.inplaceable = true},
-    %A2 : tensor<4xf32> {linalg.inplaceable = true},
-    %A3 : tensor<2xf32> {linalg.inplaceable = true},
-    %B : tensor<?xf32>, %B2 : tensor<4xf32>, %B3 : tensor<2xf32>)
-  -> (tensor<?xf32>, tensor<?xf32>)
-{
-  // CHECK: {__inplace_results_attr__ = ["true"]}
-  %r0 = tensor.insert_slice %A3 into %A2[0][2][1] : tensor<2xf32> into tensor<4xf32>
-
-  // CHECK: {__inplace_results_attr__ = ["true"]}
-  %r1 = tensor.insert_slice %r0 into %A[0][4][1] : tensor<4xf32> into tensor<?xf32>
-
-  // CHECK: {__inplace_results_attr__ = ["false"]}
-  %r2 = tensor.insert_slice %B3 into %B2[0][2][1] : tensor<2xf32> into tensor<4xf32>
-
-  // CHECK: {__inplace_results_attr__ = ["false"]}
-  %r3 = tensor.insert_slice %r2 into %B[0][4][1] : tensor<4xf32> into tensor<?xf32>
-
-  return %r1, %r3: tensor<?xf32>, tensor<?xf32>
-}
-
-// -----
-
-// CHECK-LABEL: func @extract_slice_nonmatching_insert_slice
-func @extract_slice_nonmatching_insert_slice(
-    %A : tensor<?xf32> {linalg.inplaceable = true},
-    %B : tensor<?xf32>, %idx: index)
-  -> (tensor<?xf32>, tensor<?xf32>)
-{
-  // %r1 bufferizes inplace because %A is inplaceable.
-  // %r0 is an overlapping tensor.extract_slice that does not match, so it must be
-  // out of place.
-  //      CHECK: tensor.extract_slice
-  // CHECK-SAME: {__inplace_results_attr__ = ["false"]}
-  %r0 = tensor.extract_slice %A[0][4][1] : tensor<?xf32> to tensor<4xf32>
-
-  // %r1 can bufferize inplace fine.
-  //      CHECK: tensor.insert_slice
-  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
-  %r1 = tensor.insert_slice %r0 into %A[%idx][4][1] : tensor<4xf32> into tensor<?xf32>
-
-  // %r3 does not bufferize inplace because %B is not inplaceable.
-  // %r0 is an overlapping tensor.extract_slice that does not match, but does
-  // not alias with the buffer coming from %r3 so it can actually bufferize
-  // inplace.
-  //      CHECK: tensor.extract_slice
-  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
-  %r2 = tensor.extract_slice %B[0][4][1] : tensor<?xf32> to tensor<4xf32>
-
-  // %r3 cannot bufferize inplace since %B is not inplaceable.
-  //      CHECK: tensor.insert_slice
-  // CHECK-SAME: {__inplace_results_attr__ = ["false"]}
-  %r3 = tensor.insert_slice %r2 into %B[%idx][4][1] : tensor<4xf32> into tensor<?xf32>
-
-  return %r1, %r3: tensor<?xf32>, tensor<?xf32>
-}
-
-// -----
-
-// CHECK-LABEL: func @extract_slice_matching_insert_slice
-func @extract_slice_matching_insert_slice(
-    %A : tensor<?xf32> {linalg.inplaceable = true},
-    %B : tensor<?xf32>)
-  -> (tensor<?xf32>, tensor<?xf32>)
-{
-  // %r1 bufferizes inplace because %A is inplaceable.
-  // %r0 is a tensor.extract_slice that matches, it can also be bufferized
-  // inplace.
-  //      CHECK: tensor.extract_slice
-  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
-  %r0 = tensor.extract_slice %A[0][4][1] : tensor<?xf32> to tensor<4xf32>
-
-  //      CHECK: tensor.insert_slice
-  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
-  %r1 = tensor.insert_slice %r0 into %A[0][4][1] : tensor<4xf32> into tensor<?xf32>
-
-  // %r2 is a tensor.extract_slice that matches %r3, it can be bufferized
-  // inplace.
-  //      CHECK: tensor.extract_slice
-  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
-  %r2 = tensor.extract_slice %B[0][4][1] : tensor<?xf32> to tensor<4xf32>
-
-  // tensor.insert_slice cannot bufferize inplace.
-  // This should have been captured by a canonicalization pattern and it would
-  // be unproductive to have special logic in bufferization to encode matching
-  // insert_slice(extract_slice(A), A).
-  //      CHECK: tensor.insert_slice
-  // CHECK-SAME: {__inplace_results_attr__ = ["false"]}
-  %r3 = tensor.insert_slice %r2 into %B[0][4][1] : tensor<4xf32> into tensor<?xf32>
-
-  return %r1, %r3: tensor<?xf32>, tensor<?xf32>
-}
-
-// -----
-
-// CHECK-LABEL: func @extract_slice_linalg_readonly_use
-func @extract_slice_linalg_readonly_use(
-    %A : tensor<?x?xf32>,
-    %B : tensor<4x4xf32>,
-    %C : tensor<4x4xf32> {linalg.inplaceable = true})
-  ->  (tensor<4x4xf32>, tensor<4x4xf32>)
-{
-  // tensor.extract_slice is only used as a read, no interference irrespective
-  // of user's inplace status.
-  //     CHECK: tensor.extract_slice
-  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
-  %sA = tensor.extract_slice %A[0, 0][4, 4][1, 1] : tensor<?x?xf32> to tensor<4x4xf32>
-
-  // matmul output operand is not inplaceable at the function boundary.
-  //     CHECK: linalg.matmul
-  // CHECK-SAME: {__inplace_results_attr__ = ["false"]}
-  %D = linalg.matmul  ins(%sA, %B: tensor<4x4xf32>, tensor<4x4xf32>)
-                     outs(%B: tensor<4x4xf32>)
-    -> tensor<4x4xf32>
-
-  // matmul output operand is inplaceable at the function boundary.
-  //     CHECK: linalg.matmul
-  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
-  %E = linalg.matmul  ins(%sA, %B: tensor<4x4xf32>, tensor<4x4xf32>)
-                     outs(%C: tensor<4x4xf32>)
-    -> tensor<4x4xf32>
-
-  return %D, %E: tensor<4x4xf32>, tensor<4x4xf32>
-}
-
-// -----
-
-// CHECK-LABEL: func @extract_slice_to_linalg_write_use
-func @extract_slice_to_linalg_write_use(
-    %A : tensor<4x4xf32>,
-    %B : tensor<?x?xf32>,
-    %C : tensor<?x?xf32> {linalg.inplaceable = true})
-  ->  (tensor<4x4xf32>, tensor<4x4xf32>)
-{
-  // Step 3. %sB forward propagates to a write in %D but it is not inplace.
-  // So this is only ever read and can bufferize inplace.
-  //     CHECK: tensor.extract_slice
-  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
-  %sB = tensor.extract_slice %B[0, 0][4, 4][1, 1] : tensor<?x?xf32> to tensor<4x4xf32>
-
-  // Step 2. %sB has a read interference in %E, so it does not bufferize inplace.
-  //     CHECK: linalg.matmul
-  // CHECK-SAME: {__inplace_results_attr__ = ["false"]}
-  %D = linalg.matmul  ins(%B, %C: tensor<?x?xf32>, tensor<?x?xf32>)
-                     outs(%sB: tensor<4x4xf32>)
-    -> tensor<4x4xf32>
-
-  // Step 4. %sC forward propagates to an inplace write in %E.
-  // %sC backward propagates to %C which is inplaceable.
-  // As a consequence this is bufferized inplace.
-  //     CHECK: tensor.extract_slice
-  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
-  %sC = tensor.extract_slice %C[0, 0][4, 4][1, 1] : tensor<?x?xf32> to tensor<4x4xf32>
-
-  // Step 1. %sC backprops to the tensor.extract_slice producer which is not
-  // considered an interference. This bufferizes inplace.
-  //     CHECK: linalg.matmul
-  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
-  %E = linalg.matmul  ins(%A, %sB: tensor<4x4xf32>, tensor<4x4xf32>)
-                     outs(%sC: tensor<4x4xf32>)
-    -> tensor<4x4xf32>
-
-  return %D, %E: tensor<4x4xf32>, tensor<4x4xf32>
-}
-
-//===----------------------------------------------------------------------===//
-// Transitive cases
-//===----------------------------------------------------------------------===//
-
-// -----
-
-// CHECK-LABEL: func @extract_slice_to_linalg_write_use
-func @extract_slice_to_linalg_write_use(
-    %A : tensor<4x4xf32>,
-    %B : tensor<?x?xf32>,
-    %C : tensor<?x?xf32> {linalg.inplaceable = true})
-  ->  (tensor<4x4xf32>, tensor<4x4xf32>)
-{
-  // Step 4. %sB forward propagates to an inplace write in %D.
-  // %sB backward propagates to %B which is not inplaceable.
-  // As a consequence this is bufferized out of place.
-  //     CHECK: tensor.extract_slice
-  // CHECK-SAME: {__inplace_results_attr__ = ["false"]}
-  %sB = tensor.extract_slice %B[0, 0][4, 4][1, 1] : tensor<?x?xf32> to tensor<4x4xf32>
-
-  // Step 1. %sB backprops to the tensor.extract_slice producer which is not
-  // considered an interference. This bufferizes inplace.
-  //     CHECK: linalg.matmul
-  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
-  %D = linalg.matmul  ins(%B, %C: tensor<?x?xf32>, tensor<?x?xf32>)
-                     outs(%sB: tensor<4x4xf32>)
-    -> tensor<4x4xf32>
-
-  // Step 3. %sC forward propagates to an inplace write in %E.
-  // %sC backward propagates to %C which is inplaceable.
-  // As a consequence this is bufferized inplace.
-  //     CHECK: tensor.extract_slice
-  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
-  %sC = tensor.extract_slice %C[0, 0][4, 4][1, 1] : tensor<?x?xf32> to tensor<4x4xf32>
-
-  // Step 1. %sC backprops to the tensor.extract_slice producer which is not
-  // considered an interference. This bufferizes inplace.
-  //     CHECK: linalg.matmul
-  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
-  %E = linalg.matmul  ins(%A, %A: tensor<4x4xf32>, tensor<4x4xf32>)
-                     outs(%sC: tensor<4x4xf32>)
-    -> tensor<4x4xf32>
-
-  return %D, %E: tensor<4x4xf32>, tensor<4x4xf32>
-}
-
-// -----
-
-// CHECK-LABEL: func @nested_extract_slice_and_insert
-func @nested_extract_slice_and_insert(
-    %A : tensor<?x?xf32>,
-    %B : tensor<?x?xf32> {linalg.inplaceable = true},
-    %C : tensor<?x?xf32> {linalg.inplaceable = true},
-    %idx : index)
-  ->  (tensor<?x?xf32>, tensor<?x?xf32>, tensor<?x?xf32>)
-{
-  %f0 = constant 0.0 : f32
-
-  // 2-level matching tensor.extract_slice / tensor.insert_slice into non
-  // inplaceable %A.
-  //   - %rA is not inplaceable because %A is not inplaceable at function boundary.
-  //   - once %rA is deemed not inplaceable, nothing prevents %rsA from being inplaceable
-  //   - this propagates to %FA and %ssA being inplaceable.
-  //   - %sA would then bufferize to an inplace write (i.e. %FA) but %A is not
-  //     inplaceable and so %sA is not inplaceable.
-  //     CHECK: tensor.extract_slice
-  // CHECK-SAME: {__inplace_results_attr__ = ["false"]}
-  // CHECK-NEXT: tensor.extract_slice
-  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
-  // CHECK-NEXT: fill
-  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
-  // CHECK-NEXT: tensor.insert_slice
-  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
-  // CHECK-NEXT: tensor.insert_slice
-  // CHECK-SAME: {__inplace_results_attr__ = ["false"]}
-  %sA = tensor.extract_slice %A[0, 0][%idx, %idx][1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
-  %ssA = tensor.extract_slice %sA[0, 0][4, 4][1, 1] : tensor<?x?xf32> to tensor<4x4xf32>
-  %FA = linalg.fill(%f0, %ssA) : f32, tensor<4x4xf32> -> tensor<4x4xf32>
-  %rsA = tensor.insert_slice %FA into %sA[0, 0][4, 4][1, 1] : tensor<4x4xf32> into tensor<?x?xf32>
-  %rA = tensor.insert_slice %rsA into %A[0, 0][%idx, %idx][1, 1] : tensor<?x?xf32> into tensor<?x?xf32>
-
-  // 3-level matching tensor.extract_slice / tensor.insert_slice into
-  // inplaceable %B.
-  // CHECK-NEXT: tensor.extract_slice
-  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
-  // CHECK-NEXT: tensor.extract_slice
-  // At the moment, this 2nd tensor.extract_slice fails to bufferize inplace because
-  // the clobbering analysis conservatively tests for equivalent buffers.
-  // TODO: This is currently too restrictive and misses clobberings.
-  // When available, use container-containee analysis.
-  // CHECK-SAME: {__inplace_results_attr__ = ["false"]}
-  // CHECK-NEXT: tensor.extract_slice
-  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
-  // CHECK-NEXT: fill
-  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
-  // CHECK-NEXT: tensor.insert_slice
-  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
-  // CHECK-NEXT: tensor.insert_slice
-  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
-  // CHECK-NEXT: tensor.insert_slice
-  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
-  %sB = tensor.extract_slice %B[0, 0][%idx, %idx][1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
-  %ssB = tensor.extract_slice %sB[0, 0][4, %idx][1, 1] : tensor<?x?xf32> to tensor<4x?xf32>
-  %sssB = tensor.extract_slice %ssB[0, 0][4, 4][1, 1] : tensor<4x?xf32> to tensor<4x4xf32>
-  %FB = linalg.fill(%f0, %sssB) : f32, tensor<4x4xf32> -> tensor<4x4xf32>
-  %rssB = tensor.insert_slice %FB into %ssB[0, 0][4, 4][1, 1] : tensor<4x4xf32> into tensor<4x?xf32>
-  %rsB = tensor.insert_slice %rssB into %sB[0, 0][4, %idx][1, 1] : tensor<4x?xf32> into tensor<?x?xf32>
-  %rB = tensor.insert_slice %rsB into %B[0, 0][%idx, %idx][1, 1] : tensor<?x?xf32> into tensor<?x?xf32>
-
-  // 2-level matching tensor.extract_slice / tensor.insert_slice into
-  // inplaceable %C with a twist.
-  // Throw a wrench in the system: %rsC production sizes do not match %ssC.
-  // CHECK-NEXT: tensor.extract_slice
-  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
-  // The tensor.insert_slice that would be a candidate for matching does not actually
-  // match. That tensor.insert_slice can still be bufferized inplace nonetheless
-  // but this tensor.extract_slice, which bufferizes to an inplace write, cannot.
-  // CHECK-NEXT: tensor.extract_slice
-  // CHECK-SAME: {__inplace_results_attr__ = ["false"]}
-  // CHECK-NEXT: fill
-  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
-  // CHECK-NEXT: tensor.insert_slice
-  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
-  // CHECK-NEXT: tensor.insert_slice
-  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
-  %sC = tensor.extract_slice %C[0, 0][%idx, %idx][1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
-  %ssC = tensor.extract_slice %sC[0, 0][4, 4][1, 1] : tensor<?x?xf32> to tensor<4x4xf32>
-  %FC = linalg.fill(%f0, %ssC) : f32, tensor<4x4xf32> -> tensor<4x4xf32>
-  %rsC = tensor.insert_slice %FC into %sC[0, 0][12345, 67890][1, 1] : tensor<4x4xf32> into tensor<?x?xf32>
-  %rC = tensor.insert_slice %rsC into %C[0, 0][%idx, %idx][1, 1] : tensor<?x?xf32> into tensor<?x?xf32>
-
-  return %rA, %rB, %rC: tensor<?x?xf32>, tensor<?x?xf32>, tensor<?x?xf32>
-}
-
-//===----------------------------------------------------------------------===//
-// Simple loop cases
-//===----------------------------------------------------------------------===//
-
-// -----
-
-// CHECK-LABEL: func @scf_for_yield_only
-func @scf_for_yield_only(%A : tensor<?xf32>,
-                         %B : tensor<?xf32> {linalg.inplaceable = true},
-                         %lb : index, %ub : index, %step : index)
-  -> (tensor<?xf32>, tensor<?xf32>)
-{
-  //      CHECK: scf.for
-  // CHECK-NEXT: scf.yield
-  // CHECK-NEXT: {__inplace_results_attr__ = ["false"]}
-  %r0 = scf.for %i = %lb to %ub step %step iter_args(%t = %A) -> (tensor<?xf32>) {
-    scf.yield %t : tensor<?xf32>
-  }
-
-  //      CHECK: scf.for
-  // CHECK-NEXT: scf.yield
-  // CHECK-NEXT: {__inplace_results_attr__ = ["true"]}
-  %r1 = scf.for %i = %lb to %ub step %step iter_args(%t = %B) -> (tensor<?xf32>) {
-    scf.yield %t : tensor<?xf32>
-  }
-
-  return %r0, %r1: tensor<?xf32>, tensor<?xf32>
-}
-
-// -----
-
-// CHECK-LABEL: func @scf_for_with_tensor.insert_slice
-func @scf_for_with_tensor.insert_slice(%A : tensor<?xf32>,
-              %B : tensor<?xf32> {linalg.inplaceable = true},
-              %C : tensor<4xf32>,
-              %lb : index, %ub : index, %step : index)
-  -> (tensor<?xf32>, tensor<?xf32>)
-{
-  //      CHECK: scf.for
-  // scf.for bbArgs are always inplaceable as seen from ops inside the body:
-  //   1. Either the matching tensor is not inplaceable and an alloc occurs
-  //      which makes bbArg inplaceable.
-  //   2. Or it is already inplaceable and so is bbArg.
-  // CHECK-NEXT:   tensor.insert_slice
-  // CHECK-SAME:     {__inplace_results_attr__ = ["true"]}
-  // CHECK-NEXT:   tensor.insert_slice
-  // CHECK-SAME:     {__inplace_results_attr__ = ["true"]}
-  // CHECK-NEXT:   scf.yield
-  // CHECK-NEXT: {__inplace_results_attr__ = ["false", "true"]}
-  %r0:2 = scf.for %i = %lb to %ub step %step iter_args(%tA = %A, %tB = %B)
-      -> (tensor<?xf32>, tensor<?xf32>)
-  {
-    %ttA = tensor.insert_slice %C into %tA[0][4][1] : tensor<4xf32> into tensor<?xf32>
-    %ttB = tensor.insert_slice %C into %tB[0][4][1] : tensor<4xf32> into tensor<?xf32>
-    scf.yield %ttA, %ttB : tensor<?xf32>, tensor<?xf32>
-  }
-
-  return %r0#0, %r0#1: tensor<?xf32>, tensor<?xf32>
-}
-

diff  --git a/mlir/test/Dialect/Linalg/comprehensive-func-bufferize.mlir b/mlir/test/Dialect/Linalg/comprehensive-func-bufferize.mlir
deleted file mode 100644
index e217a7062a94f..0000000000000
--- a/mlir/test/Dialect/Linalg/comprehensive-func-bufferize.mlir
+++ /dev/null
@@ -1,353 +0,0 @@
-// RUN: mlir-opt %s -linalg-comprehensive-func-bufferize -split-input-file | FileCheck %s
-
-// CHECK-DAG: #[[$map_2d_dyn:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>
-
-// CHECK-LABEL: func @fill_inplace(
-//  CHECK-SAME:   %[[A:[a-zA-Z0-9]*]]: tensor<?xf32> {linalg.inplaceable = true})
-func @fill_inplace(%A : tensor<?xf32> {linalg.inplaceable = true}) -> tensor<?xf32> {
-  //     CHECK: %[[I:.*]] = memref.buffer_cast %[[A]] : memref<?xf32, #[[$map_2d_dyn]]>
-
-  //     CHECK: %[[F0:.*]] = constant 0.000000e+00 : f32
-  %f0 = constant 0.0 : f32
-
-  /// Inplaceable, no alloc
-  // CHECK-NOT: alloc
-  //     CHECK: linalg.fill(%[[F0]], %[[I]]) : f32, memref<?xf32, #[[$map_2d_dyn]]>
-  %r = linalg.fill(%f0, %A) : f32, tensor<?xf32> -> tensor<?xf32>
-
-  //     CHECK:  %[[R:.*]] = memref.tensor_load %[[I]] : memref<?xf32, #[[$map_2d_dyn]]>
-  //     CHECK:  return %[[R]] : tensor<?xf32>
-  return %r: tensor<?xf32>
-}
-
-// -----
-
-// CHECK-DAG: #[[$map_2d_dyn:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>
-
-/// No linalg.inplaceable flag, must allocate.
-// CHECK-LABEL: func @not_inplace(
-//  CHECK-SAME:   %[[A:[a-zA-Z0-9]*]]: tensor<?xf32>)
-func @not_inplace(%A : tensor<?xf32>) -> tensor<?xf32> {
-  //     CHECK: %[[I:.*]] = memref.buffer_cast %[[A]] : memref<?xf32, #[[$map_2d_dyn]]>
-
-  //     CHECK: %[[D0:.*]] = memref.dim %[[I]], {{.*}} : memref<?xf32, #[[$map_2d_dyn]]>
-  //     CHECK: %[[ALLOC:.*]] = memref.alloc(%[[D0]]) : memref<?xf32>
-  //     CHECK: %[[I2:.*]] = memref.cast %[[ALLOC]] : memref<?xf32> to memref<?xf32, #map>
-
-  //     CHECK: %[[F0:.*]] = constant 0.000000e+00 : f32
-  %f0 = constant 0.0 : f32
-
-  //     CHECK: linalg.fill(%[[F0]], %[[I2]]) : f32, memref<?xf32, #[[$map_2d_dyn]]>
-  %r = linalg.fill(%f0, %A) : f32, tensor<?xf32> -> tensor<?xf32>
-
-  //     CHECK:  dealloc %[[ALLOC]] : memref<?xf32>
-  //     CHECK:  %[[R:.*]] = memref.tensor_load %[[I2]] : memref<?xf32, #[[$map_2d_dyn]]>
-  //     CHECK:  return %[[R]] : tensor<?xf32>
-  return %r: tensor<?xf32>
-}
-
-// -----
-
-// CHECK-LABEL: func @not_inplace
-//  CHECK-SAME:   %[[A:[a-zA-Z0-9]*]]: tensor<?x?xf32>
-func @not_inplace(%A : tensor<?x?xf32> {linalg.inplaceable = true}) -> tensor<?x?xf32> {
-  %f0 = constant 0.0 : f32
-
-  //       CHECK: %[[BUFFER_CAST:.*]] = memref.buffer_cast %[[A]] : memref<?x?xf32
-
-  /// Cross-op multiple uses of %A; the first op which has interfering reads must alloc.
-  //       CHECK: %[[ALLOC:.*]] = memref.alloc
-  //       CHECK: %[[CAST:.*]] = memref.cast %[[ALLOC]]
-  //       CHECK: linalg.fill({{.*}}, %[[CAST]]
-  %f = linalg.fill(%f0, %A) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
-
-  /// The second op has no interfering reads and can reuse.
-  //   CHECK-NOT: alloc
-  //       CHECK: linalg.matmul{{.*}}outs(%[[BUFFER_CAST]]
-  %r = linalg.matmul  ins(%f, %f: tensor<?x?xf32>, tensor<?x?xf32>)
-                     outs(%A: tensor<?x?xf32>)
-    -> tensor<?x?xf32>
-  return %r: tensor<?x?xf32>
-}
-
-// -----
-
-// CHECK-LABEL: func @not_inplace
-func @not_inplace(%A : tensor<?x?xf32> {linalg.inplaceable = true}) -> tensor<?x?xf32> {
-  /// Within op multiple uses of %A, must alloc.
-  // CHECK: alloc
-  %r = linalg.matmul  ins(%A, %A: tensor<?x?xf32>, tensor<?x?xf32>)
-                     outs(%A: tensor<?x?xf32>)
-    -> tensor<?x?xf32>
-  return %r: tensor<?x?xf32>
-}
-// -----
-
-// CHECK-LABEL: func @vec_inplace
-func @vec_inplace(%A : tensor<?xf32> {linalg.inplaceable = true}, %vec : vector<4xf32>)
-    -> tensor<?xf32>
-{
-  %c0 = constant 0 : index
-  // CHECK-NOT: alloc
-  %r = vector.transfer_write %vec, %A[%c0] : vector<4xf32>, tensor<?xf32>
-  return %r: tensor<?xf32>
-}
-
-// -----
-
-// CHECK-LABEL: func @vec_not_inplace
-func @vec_not_inplace(%A : tensor<?xf32> {linalg.inplaceable = true}, %vec : vector<4xf32>)
-    -> (tensor<?xf32>, tensor<?xf32>)
-{
-  %c0 = constant 0 : index
-  %c1 = constant 1 : index
-
-  //       CHECK: %[[BUFFER_CAST:.*]] = memref.buffer_cast {{.*}} : memref<?xf32, #[[$map_2d_dyn]]>
-
-  /// Cross-op multiple uses of %A; the first vector.transfer which has interfering reads must alloc.
-  //      CHECK: %[[ALLOC:.*]] = memref.alloc
-  // CHECK-NEXT: vector.transfer_write {{.*}}, %[[ALLOC]]
-  %r0 = vector.transfer_write %vec, %A[%c0] : vector<4xf32>, tensor<?xf32>
-
-  /// The second vector.transfer has no interfering reads and can reuse the buffer.
-  //  CHECK-NOT: alloc
-  // CHECK-NEXT: vector.transfer_write {{.*}}, %[[BUFFER_CAST]]
-  %r1 = vector.transfer_write %vec, %A[%c1] : vector<4xf32>, tensor<?xf32>
-  return %r0, %r1: tensor<?xf32>, tensor<?xf32>
-}
-
-// -----
-
-// CHECK-LABEL: func @insert_slice_fun
-func @insert_slice_fun(%A0 : tensor<?xf32>, %A1 : tensor<?xf32> {linalg.inplaceable = true},
-                           %t0 : tensor<4xf32>, %t1 : tensor<4xf32> {linalg.inplaceable = true})
-  ->  (tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>)
-{
-  //      CHECK: %[[BUFFER_CAST_A0:.*]] = memref.buffer_cast {{.*}} : memref<?xf32
-  //      CHECK: %[[BUFFER_CAST_A1:.*]] = memref.buffer_cast {{.*}} : memref<?xf32
-  //      CHECK: %[[BUFFER_CAST_t0:.*]] = memref.buffer_cast {{.*}} : memref<4xf32
-  //      CHECK: %[[BUFFER_CAST_t1:.*]] = memref.buffer_cast {{.*}} : memref<4xf32
-
-  // Alloc and copy the whole result tensor. Copy the tensor.extract_slice.
-  //      CHECK: %[[REALLOC_A0:.*]] = memref.alloc
-  //      CHECK: linalg.copy(%[[BUFFER_CAST_A0]]
-  //      CHECK: %[[SV_A0:.*]] = memref.subview %[[REALLOC_A0]]
-  //      CHECK: linalg.copy(%[[BUFFER_CAST_t0]], %[[SV_A0]])
-  %r0 = tensor.insert_slice %t0 into %A0[0][4][1] : tensor<4xf32> into tensor<?xf32>
-
-  // Alloc and copy the whole result tensor. Copy the tensor.extract_slice.
-  //      CHECK: %[[REALLOC_A0_2:.*]] = memref.alloc
-  //      CHECK: linalg.copy(%[[BUFFER_CAST_A0]]
-  //      CHECK: %[[SV_A0_2:.*]] = memref.subview %[[REALLOC_A0_2]]
-  //      CHECK: linalg.copy(%[[BUFFER_CAST_t1]], %[[SV_A0_2]])
-  %r1 = tensor.insert_slice %t1 into %A0[0][4][1] : tensor<4xf32> into tensor<?xf32>
-
-  //  Still alloc the large tensor because %A1 is read after. Copy the tensor.extract_slice.
-  //      CHECK: %[[REALLOC_A1:.*]] = memref.alloc
-  //      CHECK: linalg.copy(%[[BUFFER_CAST_A1]]
-  //      CHECK: %[[SV_A1:.*]] = memref.subview %[[REALLOC_A1]]
-  //      CHECK: linalg.copy(%[[BUFFER_CAST_t0]], %[[SV_A1]])
-  %r2 = tensor.insert_slice %t0 into %A1[0][4][1] : tensor<4xf32> into tensor<?xf32>
-
-  //  Do not realloc the large tensor. Copy the tensor.extract_slice.
-  //  CHECK-NOT: alloc
-  //      CHECK: %[[SV_A1_2:.*]] = memref.subview %[[BUFFER_CAST_A1]]
-  //      CHECK: linalg.copy(%[[BUFFER_CAST_t1]], %[[SV_A1_2]])
-  %r3 = tensor.insert_slice %t1 into %A1[0][4][1] : tensor<4xf32> into tensor<?xf32>
-
-  return %r0, %r1, %r2, %r3: tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>
-}
-
-// -----
-
-// CHECK-LABEL: func @insert_slice_fun
-func @insert_slice_fun(%A : tensor<?xf32> {linalg.inplaceable = true}, %t : tensor<4xf32>)
-  -> tensor<?xf32>
-{
-  %f0 = constant 0.0 : f32
-
-  //      CHECK: %[[BUFFER_CAST_A:.*]] = memref.buffer_cast {{.*}} : memref<?xf32
-  //      CHECK: %[[BUFFER_CAST_B:.*]] = memref.buffer_cast {{.*}} : memref<4xf32
-
-  //  CHECK-NOT: alloc
-  //      CHECK: %[[SV:.*]] = memref.subview %[[BUFFER_CAST_A]]
-  //      CHECK: linalg.copy(%[[BUFFER_CAST_B]], %[[SV]])
-  %r0 = tensor.insert_slice %t into %A[0][4][1] : tensor<4xf32> into tensor<?xf32>
-
-  /// Overwrite BUFFER_CAST_A inplace.
-  //      CHECK: linalg.fill({{.*}}, %[[BUFFER_CAST_A]]
-  %r1 = linalg.fill(%f0, %r0) : f32, tensor<?xf32> -> tensor<?xf32>
-  return %r1: tensor<?xf32>
-}
-
-// -----
-
-// CHECK-LABEL: func @insert_slice_fun
-func @insert_slice_fun(%A : tensor<?xf32> {linalg.inplaceable = true}, %t : tensor<4xf32>)
-  -> tensor<?xf32>
-{
-  %f0 = constant 0.0 : f32
-
-  //      CHECK: %[[BUFFER_CAST_A:.*]] = memref.buffer_cast {{.*}} : memref<?xf32
-  //      CHECK: %[[BUFFER_CAST_B:.*]] = memref.buffer_cast {{.*}} : memref<4xf32
-
-  //      CHECK: linalg.fill({{.*}}, %[[BUFFER_CAST_A]]
-  %r0 = linalg.fill(%f0, %A) : f32, tensor<?xf32> -> tensor<?xf32>
-
-  //  CHECK-NOT: alloc
-  //      CHECK: %[[SV:.*]] = memref.subview %[[BUFFER_CAST_A]]
-  /// Overwrite BUFFER_CAST_A inplace by copying into the subview.
-  //      CHECK: linalg.copy(%[[BUFFER_CAST_B]], %[[SV]])
-  %r1 = tensor.insert_slice %t into %r0[0][4][1] : tensor<4xf32> into tensor<?xf32>
-
-  return %r1: tensor<?xf32>
-}
-
-// -----
-
-// CHECK-LABEL: func @insert_slice_fun_not_inplace
-func @insert_slice_fun_not_inplace(%A : tensor<?xf32>, %t : tensor<4xf32>)
-  -> tensor<?xf32>
-{
-  //      CHECK: %[[BUFFER_CAST_A:.*]] = memref.buffer_cast {{.*}} : memref<?xf32
-  //      CHECK: %[[BUFFER_CAST_B:.*]] = memref.buffer_cast {{.*}} : memref<4xf32
-
-  //      CHECK: %[[ALLOC:.*]] = memref.alloc(%{{.*}}) : memref<?xf32>
-  //      CHECK: linalg.copy(%[[BUFFER_CAST_A]], %[[ALLOC]]) : memref<?xf32{{.*}}, memref<?xf32>
-  //      CHECK: %[[SV:.*]] = memref.subview %[[ALLOC]][0] [4] [1] : memref<?xf32> to memref<4xf32>
-  //      CHECK: linalg.copy(%[[BUFFER_CAST_B]], %[[SV]]) : memref<4xf32, #map>, memref<4xf32>
-  //      CHECK: memref.dealloc %[[ALLOC]] : memref<?xf32>
-  %r0 = tensor.insert_slice %t into %A[0][4][1] : tensor<4xf32> into tensor<?xf32>
-  return %r0: tensor<?xf32>
-}
-
-// -----
-
-// CHECK-LABEL: func @insert_slice_fun_not_inplace
-func @insert_slice_fun_not_inplace(%A : tensor<?xf32> {linalg.inplaceable = true}, %t : tensor<4xf32>)
-  -> (tensor<?xf32>, tensor<?xf32>)
-{
-  %f0 = constant 0.0 : f32
-
-  //  CHECK-DAG: %[[BUFFER_CAST_A:.*]] = memref.buffer_cast {{.*}} : memref<?xf32{{.*}}
-  //  CHECK-DAG: %[[BUFFER_CAST_B:.*]] = memref.buffer_cast {{.*}} : memref<4xf32{{.*}}
-
-  // tensor.insert_slice is bufferized first; %A is inplaceable, so we can make this inplace
-  //  CHECK-DAG: %[[SV:.*]] = memref.subview %[[BUFFER_CAST_A]][0] [4] [1] : memref<?xf32, {{.*}}> to memref<4xf32, {{.*}}>
-  //  CHECK-DAG: linalg.copy(%[[BUFFER_CAST_B]], %[[SV]]) : memref<4xf32, {{.*}}>, memref<4xf32, {{.*}}>
-  %r0 = tensor.insert_slice %t into %A[0][4][1] : tensor<4xf32> into tensor<?xf32>
-
-  // fill would interfere with %r0 that is also being returned.
-  // So we need to bufferize it out of place and make a new alloc.
-  //  CHECK-DAG: %[[ALLOC:.*]] = memref.alloc({{.*}}) : memref<?xf32>
-  //  CHECK-DAG: %[[ALLOC_CAST_DYNAMIC:.*]] = memref.cast %[[ALLOC]] : memref<?xf32> to memref<?xf32, {{.*}}
-  //      CHECK: linalg.fill(%{{.*}}, %[[ALLOC_CAST_DYNAMIC]]
-  //      CHECK: memref.dealloc %[[ALLOC]] : memref<?xf32>
-  %r1 = linalg.fill(%f0, %A) : f32, tensor<?xf32> -> tensor<?xf32>
-
-  //  CHECK-DAG: %[[RET_A:.*]] = memref.tensor_load %[[BUFFER_CAST_A]] : memref<?xf32, {{.*}}
-  //  CHECK-DAG: %[[RET_B:.*]] = memref.tensor_load %[[ALLOC_CAST_DYNAMIC]] : memref<?xf32, {{.*}}
-  //      CHECK: return %[[RET_B]], %[[RET_A]]
-  return %r1, %r0: tensor<?xf32>, tensor<?xf32>
-}
-
-// -----
-
-// CHECK-LABEL: func @extract_slice_fun
-func @extract_slice_fun(%A : tensor<?xf32> {linalg.inplaceable = true})
-  ->  tensor<4xf32>
-{
-  // This bufferizes to a pattern that the cross-function boundary pass needs to
-  // convert into a new memref argument at all call sites; this may be either:
-  //   - an externally created aliasing subview (if we want to allow aliasing
-  //     function arguments).
-  //   - a new alloc + copy (more expensive but does not create new function
-  //     argument aliasing).
-  // CHECK-NOT: alloc
-  // CHECK-NOT: copy
-  //     CHECK: %[[BUFFER_CAST_A:.*]] = memref.buffer_cast {{.*}} : memref<?xf32
-  //     CHECK: %[[SV:.*]] = memref.subview %[[BUFFER_CAST_A]][0] [4] [1]
-  //     CHECK: %[[RES:.*]] = memref.tensor_load %[[SV]]
-  %r0 = tensor.extract_slice %A[0][4][1] : tensor<?xf32> to tensor<4xf32>
-
-  //     CHECK: return %[[RES]]
-  return %r0: tensor<4xf32>
-}
-
-//===----------------------------------------------------------------------===//
-// Simple loop cases
-//===----------------------------------------------------------------------===//
-
-// -----
-
-// CHECK-LABEL: func @scf_for_yield_only
-func @scf_for_yield_only(%A : tensor<?xf32>,
-                         %B : tensor<?xf32> {linalg.inplaceable = true},
-                         %lb : index, %ub : index, %step : index)
-  -> (tensor<?xf32>, tensor<?xf32>)
-{
-  //     CHECK:   %[[ALLOC_FOR_A:.*]] = memref.alloc
-  //     CHECK:   %[[BUFFER_CAST_A:.*]] = memref.buffer_cast
-  //     CHECK:   %[[BUFFER_CAST_B:.*]] = memref.buffer_cast
-  //     CHECK:   linalg.copy(%[[BUFFER_CAST_A]], %[[ALLOC_FOR_A]])
-
-  // The first scf.for remains but just turns into dead code.
-  %r0 = scf.for %i = %lb to %ub step %step iter_args(%t = %A) -> (tensor<?xf32>) {
-    scf.yield %t : tensor<?xf32>
-  }
-
-  // The second scf.for remains but just turns into dead code.
-  %r1 = scf.for %i = %lb to %ub step %step iter_args(%t = %B) -> (tensor<?xf32>) {
-    scf.yield %t : tensor<?xf32>
-  }
-
-  // Cross function call alloc/dealloc pattern must be hoisted out.
-  //     CHECK:   memref.dealloc %[[ALLOC_FOR_A]] : memref<?xf32>
-  //     CHECK:   %[[rA:.*]] = memref.tensor_load %[[ALLOC_FOR_A]]
-  // Returning tensor_load of the buffer cast makes the %r1 loop dead.
-  //     CHECK:   %[[rB:.*]] = memref.tensor_load %[[BUFFER_CAST_B:.*]]
-  //     CHECK:   return %[[rA]], %[[rB]] : tensor<?xf32>, tensor<?xf32>
-  return %r0, %r1: tensor<?xf32>, tensor<?xf32>
-}
-
-// -----
-
-// CHECK-LABEL: func @scf_for_with_tensor.insert_slice
-func @scf_for_with_tensor.insert_slice(
-   %A : tensor<?xf32>,
-              %B : tensor<?xf32> {linalg.inplaceable = true},
-              %C : tensor<4xf32>,
-              %lb : index, %ub : index, %step : index)
-  -> (tensor<?xf32>, tensor<?xf32>)
-{
-  //     CHECK:   %[[ALLOC_FOR_A:.*]] = memref.alloc
-  //     CHECK:   %[[BUFFER_CAST_A:.*]] = memref.buffer_cast
-  //     CHECK:   %[[BUFFER_CAST_B:.*]] = memref.buffer_cast
-  //     CHECK:   %[[BUFFER_CAST_C:.*]] = memref.buffer_cast
-  //     CHECK:   linalg.copy(%[[BUFFER_CAST_A]], %[[ALLOC_FOR_A]])
-
-  //     CHECK:   scf.for {{.*}} iter_args(%[[bbA:.*]] = %{{.*}}, %[[bbB:.*]] = %{{.*}})
-  %r0:2 = scf.for %i = %lb to %ub step %step iter_args(%tA = %A, %tB = %B)
-      -> (tensor<?xf32>, tensor<?xf32>)
-  {
-    //     CHECK: %[[svA:.*]] = memref.subview %[[ALLOC_FOR_A]][0] [4] [1]
-    // %ttA bufferizes to direct copy of %BUFFER_CAST_C into %svA
-    //     CHECK: linalg.copy(%[[BUFFER_CAST_C]], %[[svA]])
-    %ttA = tensor.insert_slice %C into %tA[0][4][1] : tensor<4xf32> into tensor<?xf32>
-
-    // %ttB bufferizes to direct copy of %BUFFER_CAST_C into %BUFFER_CAST_B
-    //     CHECK:   %[[svB:.*]] = memref.subview %[[BUFFER_CAST_B]][0] [4] [1]
-    //     CHECK:   linalg.copy(%[[BUFFER_CAST_C]], %[[svB]])
-    %ttB = tensor.insert_slice %C into %tB[0][4][1] : tensor<4xf32> into tensor<?xf32>
-
-    // Yielding bbA and bbB will canonicalize away into oblivion.
-    //     CHECK:   scf.yield %[[bbA]], %[[bbB]] : tensor<?xf32>, tensor<?xf32>
-    scf.yield %ttA, %ttB : tensor<?xf32>, tensor<?xf32>
-  }
-
-  //     CHECK:  memref.dealloc %[[ALLOC_FOR_A]] : memref<?xf32>
-  //     CHECK:  %[[rA:.*]] = memref.tensor_load %[[ALLOC_FOR_A]] : memref<?xf32>
-  //     CHECK:  %[[rB:.*]] = memref.tensor_load %[[BUFFER_CAST_B]] : memref<?xf32, #map>
-  //     CHECK:  return %[[rA]], %[[rB]] : tensor<?xf32>, tensor<?xf32>
-  return %r0#0, %r0#1: tensor<?xf32>, tensor<?xf32>
-}

diff  --git a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-analysis.mlir b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-analysis.mlir
index 108119467ea63..a580cbb36060f 100644
--- a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-analysis.mlir
+++ b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-analysis.mlir
@@ -1,5 +1,483 @@
 // RUN: mlir-opt %s -linalg-comprehensive-module-bufferize=test-analysis-only -split-input-file | FileCheck %s
 
+//===----------------------------------------------------------------------===//
+// Simple cases
+//===----------------------------------------------------------------------===//
+
+// -----
+
+// CHECK-LABEL: func @extract_slice_fun
+func @extract_slice_fun(%A : tensor<?xf32>, %B : tensor<?xf32> {linalg.inplaceable = true})
+  -> (tensor<4xf32>, tensor<8xf32>)
+{
+  // tensor.extract_slice is not used in a write, so it is not compelled to
+  // bufferize out of place. Let callers decide whether they want to create
+  // aliasing subviews at all call sites or whether they allocate.
+  // This is true irrespective of whether the function argument is inplaceable.
+  //     CHECK: tensor.extract_slice
+  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+  %r0 = tensor.extract_slice %A[0][4][1] : tensor<?xf32> to tensor<4xf32>
+
+  //     CHECK: tensor.extract_slice
+  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+  %r1 = tensor.extract_slice %B[0][8][1] : tensor<?xf32> to tensor<8xf32>
+
+  return %r0, %r1: tensor<4xf32>, tensor<8xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @insert_slice_fun
+func @insert_slice_fun(
+    %A : tensor<?xf32>,
+    %B : tensor<?xf32> {linalg.inplaceable = true},
+    %C : tensor<4xf32>)
+  -> (tensor<?xf32>, tensor<?xf32>)
+{
+  // must bufferize out of place.
+  //     CHECK: tensor.insert_slice
+  // CHECK-SAME: {__inplace_results_attr__ = ["false"]}
+  %r0 = tensor.insert_slice %C into %A[0][4][1] : tensor<4xf32> into tensor<?xf32>
+
+  // bufferizes inplace.
+  //     CHECK: tensor.insert_slice
+  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+  %r1 = tensor.insert_slice %C into %B[0][4][1] : tensor<4xf32> into tensor<?xf32>
+
+  return %r0, %r1: tensor<?xf32>, tensor<?xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @conflict_on_B
+func @conflict_on_B(
+    %A : tensor<4x4xf32> {linalg.inplaceable = true},
+    %B : tensor<4x4xf32> {linalg.inplaceable = true})
+  -> (tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>)
+{
+  // matmul output operand interferes with input operand.
+  //     CHECK: linalg.matmul
+  // CHECK-SAME: {__inplace_results_attr__ = ["false"]}
+  %C = linalg.matmul  ins(%A, %B: tensor<4x4xf32>, tensor<4x4xf32>)
+                     outs(%B: tensor<4x4xf32>)
+    -> tensor<4x4xf32>
+
+  // matmul output operand interferes with input operand.
+  //     CHECK: linalg.matmul
+  // CHECK-SAME: {__inplace_results_attr__ = ["false"]}
+  %D = linalg.matmul  ins(%B, %A: tensor<4x4xf32>, tensor<4x4xf32>)
+                     outs(%B: tensor<4x4xf32>)
+    -> tensor<4x4xf32>
+
+  // matmul output operand does not interfere with input operand.
+  //     CHECK: linalg.matmul
+  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+  %E = linalg.matmul  ins(%A, %A: tensor<4x4xf32>, tensor<4x4xf32>)
+                     outs(%B: tensor<4x4xf32>)
+    -> tensor<4x4xf32>
+
+  return %C, %D, %E: tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>
+}
+
+//===----------------------------------------------------------------------===//
+// Length-1 producer-consumer cases.
+//===----------------------------------------------------------------------===//
+
+// -----
+
+// CHECK-LABEL: func @extract_slice_extract_slice
+func @extract_slice_extract_slice(
+    %A : tensor<?xf32> {linalg.inplaceable = true}, %B : tensor<?xf32>)
+  -> (tensor<2xf32>, tensor<2xf32>)
+{
+  // tensor.extract_slice is not used in a write, so it is not compelled to
+  // bufferize out of place. Let callers decide whether they want to create
+  // aliasing subviews at all call sites or whether they allocate.
+  // This is true irrespective of whether the function argument is inplaceable.
+  // CHECK: {__inplace_results_attr__ = ["true"]}
+  %r0 = tensor.extract_slice %A[0][4][1] : tensor<?xf32> to tensor<4xf32>
+
+  // CHECK: {__inplace_results_attr__ = ["true"]}
+  %r1 = tensor.extract_slice %r0[0][2][1] : tensor<4xf32> to tensor<2xf32>
+
+  // CHECK: {__inplace_results_attr__ = ["true"]}
+  %r2 = tensor.extract_slice %B[0][4][1] : tensor<?xf32> to tensor<4xf32>
+
+  // CHECK: {__inplace_results_attr__ = ["true"]}
+  %r3 = tensor.extract_slice %r2[0][2][1] : tensor<4xf32> to tensor<2xf32>
+
+  return %r1, %r3: tensor<2xf32>, tensor<2xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @insert_slice_insert_slice
+func @insert_slice_insert_slice(
+    %A : tensor<?xf32> {linalg.inplaceable = true},
+    %A2 : tensor<4xf32> {linalg.inplaceable = true},
+    %A3 : tensor<2xf32> {linalg.inplaceable = true},
+    %B : tensor<?xf32>, %B2 : tensor<4xf32>, %B3 : tensor<2xf32>)
+  -> (tensor<?xf32>, tensor<?xf32>)
+{
+  // CHECK: {__inplace_results_attr__ = ["true"]}
+  %r0 = tensor.insert_slice %A3 into %A2[0][2][1] : tensor<2xf32> into tensor<4xf32>
+
+  // CHECK: {__inplace_results_attr__ = ["true"]}
+  %r1 = tensor.insert_slice %r0 into %A[0][4][1] : tensor<4xf32> into tensor<?xf32>
+
+  // CHECK: {__inplace_results_attr__ = ["false"]}
+  %r2 = tensor.insert_slice %B3 into %B2[0][2][1] : tensor<2xf32> into tensor<4xf32>
+
+  // CHECK: {__inplace_results_attr__ = ["false"]}
+  %r3 = tensor.insert_slice %r2 into %B[0][4][1] : tensor<4xf32> into tensor<?xf32>
+
+  return %r1, %r3: tensor<?xf32>, tensor<?xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @extract_slice_nonmatching_insert_slice
+func @extract_slice_nonmatching_insert_slice(
+    %A : tensor<?xf32> {linalg.inplaceable = true},
+    %B : tensor<?xf32>, %idx: index)
+  -> (tensor<?xf32>, tensor<?xf32>)
+{
+  // %r1 bufferizes inplace because %A is inplaceable.
+  // %r0 is an overlapping tensor.extract_slice that does not match, so it
+  // must bufferize out of place.
+  //      CHECK: tensor.extract_slice
+  // CHECK-SAME: {__inplace_results_attr__ = ["false"]}
+  %r0 = tensor.extract_slice %A[0][4][1] : tensor<?xf32> to tensor<4xf32>
+
+  // %r1 can bufferize inplace fine.
+  //      CHECK: tensor.insert_slice
+  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+  %r1 = tensor.insert_slice %r0 into %A[%idx][4][1] : tensor<4xf32> into tensor<?xf32>
+
+  // %r3 does not bufferize inplace because %B is not inplaceable.
+  // %r2 is an overlapping tensor.extract_slice that does not match, but it
+  // does not alias with the buffer coming from %r3, so it can actually
+  // bufferize inplace.
+  //      CHECK: tensor.extract_slice
+  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+  %r2 = tensor.extract_slice %B[0][4][1] : tensor<?xf32> to tensor<4xf32>
+
+  // %r3 cannot bufferize inplace since %B is not inplaceable.
+  //      CHECK: tensor.insert_slice
+  // CHECK-SAME: {__inplace_results_attr__ = ["false"]}
+  %r3 = tensor.insert_slice %r2 into %B[%idx][4][1] : tensor<4xf32> into tensor<?xf32>
+
+  return %r1, %r3: tensor<?xf32>, tensor<?xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @extract_slice_matching_insert_slice
+func @extract_slice_matching_insert_slice(
+    %A : tensor<?xf32> {linalg.inplaceable = true},
+    %B : tensor<?xf32>)
+  -> (tensor<?xf32>, tensor<?xf32>)
+{
+  // %r1 bufferizes inplace because %A is inplaceable.
+  // %r0 is a tensor.extract_slice that matches, so it can also be bufferized
+  // inplace.
+  //      CHECK: tensor.extract_slice
+  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+  %r0 = tensor.extract_slice %A[0][4][1] : tensor<?xf32> to tensor<4xf32>
+
+  //      CHECK: tensor.insert_slice
+  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+  %r1 = tensor.insert_slice %r0 into %A[0][4][1] : tensor<4xf32> into tensor<?xf32>
+
+  // %r2 is a tensor.extract_slice that matches %r3, so it can be bufferized
+  // inplace.
+  //      CHECK: tensor.extract_slice
+  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+  %r2 = tensor.extract_slice %B[0][4][1] : tensor<?xf32> to tensor<4xf32>
+
+  // The tensor.insert_slice cannot bufferize inplace because %B is not
+  // inplaceable. A canonicalization pattern should have folded the matching
+  // insert_slice(extract_slice(A), A) pair; it would be unproductive to
+  // encode that special case in bufferization.
+  //      CHECK: tensor.insert_slice
+  // CHECK-SAME: {__inplace_results_attr__ = ["false"]}
+  %r3 = tensor.insert_slice %r2 into %B[0][4][1] : tensor<4xf32> into tensor<?xf32>
+
+  return %r1, %r3: tensor<?xf32>, tensor<?xf32>
+}
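+
+// For illustration only, the matching pair mentioned above is expected to
+// fold under canonicalization rather than in bufferization; a hedged sketch:
+//   %s = tensor.extract_slice %B[0][4][1] : tensor<?xf32> to tensor<4xf32>
+//   %r = tensor.insert_slice %s into %B[0][4][1] : tensor<4xf32> into tensor<?xf32>
+// canonicalizes %r away to plain %B.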
+
+// -----
+
+// CHECK-LABEL: func @extract_slice_linalg_readonly_use
+func @extract_slice_linalg_readonly_use(
+    %A : tensor<?x?xf32>,
+    %B : tensor<4x4xf32>,
+    %C : tensor<4x4xf32> {linalg.inplaceable = true})
+  ->  (tensor<4x4xf32>, tensor<4x4xf32>)
+{
+  // tensor.extract_slice is only used as a read, so there is no interference
+  // irrespective of the user's inplace status.
+  //     CHECK: tensor.extract_slice
+  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+  %sA = tensor.extract_slice %A[0, 0][4, 4][1, 1] : tensor<?x?xf32> to tensor<4x4xf32>
+
+  // matmul output operand is not inplaceable at the function boundary.
+  //     CHECK: linalg.matmul
+  // CHECK-SAME: {__inplace_results_attr__ = ["false"]}
+  %D = linalg.matmul  ins(%sA, %B: tensor<4x4xf32>, tensor<4x4xf32>)
+                     outs(%B: tensor<4x4xf32>)
+    -> tensor<4x4xf32>
+
+  // matmul output operand is inplaceable at the function boundary.
+  //     CHECK: linalg.matmul
+  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+  %E = linalg.matmul  ins(%sA, %B: tensor<4x4xf32>, tensor<4x4xf32>)
+                     outs(%C: tensor<4x4xf32>)
+    -> tensor<4x4xf32>
+
+  return %D, %E: tensor<4x4xf32>, tensor<4x4xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @extract_slice_to_linalg_write_use
+func @extract_slice_to_linalg_write_use(
+    %A : tensor<4x4xf32>,
+    %B : tensor<?x?xf32>,
+    %C : tensor<?x?xf32> {linalg.inplaceable = true})
+  ->  (tensor<4x4xf32>, tensor<4x4xf32>)
+{
+  // Step 3. %sB forward propagates to a write in %D, but that write is not
+  // inplace. So %sB is only ever read and can bufferize inplace.
+  //     CHECK: tensor.extract_slice
+  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+  %sB = tensor.extract_slice %B[0, 0][4, 4][1, 1] : tensor<?x?xf32> to tensor<4x4xf32>
+
+  // Step 2. %sB has a read interference in %E, so %D does not bufferize
+  // inplace.
+  //     CHECK: linalg.matmul
+  // CHECK-SAME: {__inplace_results_attr__ = ["false"]}
+  %D = linalg.matmul  ins(%B, %C: tensor<?x?xf32>, tensor<?x?xf32>)
+                     outs(%sB: tensor<4x4xf32>)
+    -> tensor<4x4xf32>
+
+  // Step 4. %sC forward propagates to an inplace write in %E.
+  // %sC backward propagates to %C which is inplaceable.
+  // As a consequence this is bufferized inplace.
+  //     CHECK: tensor.extract_slice
+  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+  %sC = tensor.extract_slice %C[0, 0][4, 4][1, 1] : tensor<?x?xf32> to tensor<4x4xf32>
+
+  // Step 1. %sC backprops to the tensor.extract_slice producer, which is not
+  // considered an interference. This bufferizes inplace.
+  //     CHECK: linalg.matmul
+  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+  %E = linalg.matmul  ins(%A, %sB: tensor<4x4xf32>, tensor<4x4xf32>)
+                     outs(%sC: tensor<4x4xf32>)
+    -> tensor<4x4xf32>
+
+  return %D, %E: tensor<4x4xf32>, tensor<4x4xf32>
+}
+
+//===----------------------------------------------------------------------===//
+// Transitive cases
+//===----------------------------------------------------------------------===//
+
+// -----
+
+// CHECK-LABEL: func @extract_slice_to_linalg_write_use
+func @extract_slice_to_linalg_write_use(
+    %A : tensor<4x4xf32>,
+    %B : tensor<?x?xf32>,
+    %C : tensor<?x?xf32> {linalg.inplaceable = true})
+  ->  (tensor<4x4xf32>, tensor<4x4xf32>)
+{
+  // Step 4. %sB forward propagates to an inplace write in %D.
+  // %sB backward propagates to %B which is not inplaceable.
+  // As a consequence this is bufferized out of place.
+  //     CHECK: tensor.extract_slice
+  // CHECK-SAME: {__inplace_results_attr__ = ["false"]}
+  %sB = tensor.extract_slice %B[0, 0][4, 4][1, 1] : tensor<?x?xf32> to tensor<4x4xf32>
+
+  // Step 2. %sB backprops to the tensor.extract_slice producer, which is not
+  // considered an interference. This bufferizes inplace.
+  //     CHECK: linalg.matmul
+  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+  %D = linalg.matmul  ins(%B, %C: tensor<?x?xf32>, tensor<?x?xf32>)
+                     outs(%sB: tensor<4x4xf32>)
+    -> tensor<4x4xf32>
+
+  // Step 3. %sC forward propagates to an inplace write in %E.
+  // %sC backward propagates to %C which is inplaceable.
+  // As a consequence this is bufferized inplace.
+  //     CHECK: tensor.extract_slice
+  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+  %sC = tensor.extract_slice %C[0, 0][4, 4][1, 1] : tensor<?x?xf32> to tensor<4x4xf32>
+
+  // Step 1. %sC backprops to the tensor.extract_slice producer, which is not
+  // considered an interference. This bufferizes inplace.
+  //     CHECK: linalg.matmul
+  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+  %E = linalg.matmul  ins(%A, %A: tensor<4x4xf32>, tensor<4x4xf32>)
+                     outs(%sC: tensor<4x4xf32>)
+    -> tensor<4x4xf32>
+
+  return %D, %E: tensor<4x4xf32>, tensor<4x4xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @nested_extract_slice_and_insert
+func @nested_extract_slice_and_insert(
+    %A : tensor<?x?xf32>,
+    %B : tensor<?x?xf32> {linalg.inplaceable = true},
+    %C : tensor<?x?xf32> {linalg.inplaceable = true},
+    %idx : index)
+  ->  (tensor<?x?xf32>, tensor<?x?xf32>, tensor<?x?xf32>)
+{
+  %f0 = constant 0.0 : f32
+
+  // 2-level matching tensor.extract_slice / tensor.insert_slice into
+  // non-inplaceable %A.
+  //   - %rA is not inplaceable because %A is not inplaceable at the function
+  //     boundary.
+  //   - once %rA is deemed not inplaceable, nothing prevents %rsA from being
+  //     inplaceable.
+  //   - this propagates to %FA and %ssA being inplaceable.
+  //   - %sA would then bufferize to an inplace write (i.e. the write of %FA),
+  //     but %A is not inplaceable, so %sA is not inplaceable either.
+  //     CHECK: tensor.extract_slice
+  // CHECK-SAME: {__inplace_results_attr__ = ["false"]}
+  // CHECK-NEXT: tensor.extract_slice
+  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+  // CHECK-NEXT: fill
+  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+  // CHECK-NEXT: tensor.insert_slice
+  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+  // CHECK-NEXT: tensor.insert_slice
+  // CHECK-SAME: {__inplace_results_attr__ = ["false"]}
+  %sA = tensor.extract_slice %A[0, 0][%idx, %idx][1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
+  %ssA = tensor.extract_slice %sA[0, 0][4, 4][1, 1] : tensor<?x?xf32> to tensor<4x4xf32>
+  %FA = linalg.fill(%f0, %ssA) : f32, tensor<4x4xf32> -> tensor<4x4xf32>
+  %rsA = tensor.insert_slice %FA into %sA[0, 0][4, 4][1, 1] : tensor<4x4xf32> into tensor<?x?xf32>
+  %rA = tensor.insert_slice %rsA into %A[0, 0][%idx, %idx][1, 1] : tensor<?x?xf32> into tensor<?x?xf32>
+
+  // 3-level matching tensor.extract_slice / tensor.insert_slice into
+  // inplaceable %B.
+  // CHECK-NEXT: tensor.extract_slice
+  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+  // CHECK-NEXT: tensor.extract_slice
+  // At the moment, this 2nd tensor.extract_slice fails to bufferize inplace
+  // because the clobbering analysis conservatively tests for equivalent
+  // buffers.
+  // TODO: This is currently too restrictive and misses clobberings.
+  // When available, use container-containee analysis.
+  // CHECK-SAME: {__inplace_results_attr__ = ["false"]}
+  // CHECK-NEXT: tensor.extract_slice
+  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+  // CHECK-NEXT: fill
+  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+  // CHECK-NEXT: tensor.insert_slice
+  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+  // CHECK-NEXT: tensor.insert_slice
+  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+  // CHECK-NEXT: tensor.insert_slice
+  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+  %sB = tensor.extract_slice %B[0, 0][%idx, %idx][1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
+  %ssB = tensor.extract_slice %sB[0, 0][4, %idx][1, 1] : tensor<?x?xf32> to tensor<4x?xf32>
+  %sssB = tensor.extract_slice %ssB[0, 0][4, 4][1, 1] : tensor<4x?xf32> to tensor<4x4xf32>
+  %FB = linalg.fill(%f0, %sssB) : f32, tensor<4x4xf32> -> tensor<4x4xf32>
+  %rssB = tensor.insert_slice %FB into %ssB[0, 0][4, 4][1, 1] : tensor<4x4xf32> into tensor<4x?xf32>
+  %rsB = tensor.insert_slice %rssB into %sB[0, 0][4, %idx][1, 1] : tensor<4x?xf32> into tensor<?x?xf32>
+  %rB = tensor.insert_slice %rsB into %B[0, 0][%idx, %idx][1, 1] : tensor<?x?xf32> into tensor<?x?xf32>
+
+  // 2-level matching tensor.extract_slice / tensor.insert_slice into
+  // inplaceable %C with a twist.
+  // Throw a wrench in the system: the sizes %rsC is produced with do not
+  // match those of %ssC.
+  // CHECK-NEXT: tensor.extract_slice
+  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+  // The tensor.insert_slice that would be a candidate for matching does not
+  // actually match. That tensor.insert_slice can nonetheless be bufferized
+  // inplace, but this tensor.extract_slice, which bufferizes to an inplace
+  // write, cannot.
+  // CHECK-NEXT: tensor.extract_slice
+  // CHECK-SAME: {__inplace_results_attr__ = ["false"]}
+  // CHECK-NEXT: fill
+  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+  // CHECK-NEXT: tensor.insert_slice
+  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+  // CHECK-NEXT: tensor.insert_slice
+  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+  %sC = tensor.extract_slice %C[0, 0][%idx, %idx][1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
+  %ssC = tensor.extract_slice %sC[0, 0][4, 4][1, 1] : tensor<?x?xf32> to tensor<4x4xf32>
+  %FC = linalg.fill(%f0, %ssC) : f32, tensor<4x4xf32> -> tensor<4x4xf32>
+  %rsC = tensor.insert_slice %FC into %sC[0, 0][12345, 67890][1, 1] : tensor<4x4xf32> into tensor<?x?xf32>
+  %rC = tensor.insert_slice %rsC into %C[0, 0][%idx, %idx][1, 1] : tensor<?x?xf32> into tensor<?x?xf32>
+
+  return %rA, %rB, %rC: tensor<?x?xf32>, tensor<?x?xf32>, tensor<?x?xf32>
+}
+
+//===----------------------------------------------------------------------===//
+// Simple loop cases
+//===----------------------------------------------------------------------===//
+
+// -----
+
+// CHECK-LABEL: func @scf_for_yield_only
+func @scf_for_yield_only(%A : tensor<?xf32>,
+                         %B : tensor<?xf32> {linalg.inplaceable = true},
+                         %lb : index, %ub : index, %step : index)
+  -> (tensor<?xf32>, tensor<?xf32>)
+{
+  //      CHECK: scf.for
+  // CHECK-NEXT: scf.yield
+  // CHECK-NEXT: {__inplace_results_attr__ = ["false"]}
+  %r0 = scf.for %i = %lb to %ub step %step iter_args(%t = %A) -> (tensor<?xf32>) {
+    scf.yield %t : tensor<?xf32>
+  }
+
+  //      CHECK: scf.for
+  // CHECK-NEXT: scf.yield
+  // CHECK-NEXT: {__inplace_results_attr__ = ["true"]}
+  %r1 = scf.for %i = %lb to %ub step %step iter_args(%t = %B) -> (tensor<?xf32>) {
+    scf.yield %t : tensor<?xf32>
+  }
+
+  return %r0, %r1: tensor<?xf32>, tensor<?xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @scf_for_with_tensor.insert_slice
+func @scf_for_with_tensor.insert_slice(%A : tensor<?xf32>,
+              %B : tensor<?xf32> {linalg.inplaceable = true},
+              %C : tensor<4xf32>,
+              %lb : index, %ub : index, %step : index)
+  -> (tensor<?xf32>, tensor<?xf32>)
+{
+  //      CHECK: scf.for
+  // scf.for bbArgs are always inplaceable as seen from ops inside the body:
+  //   1. Either the matching tensor is not inplaceable and an alloc occurs,
+  //      which makes the bbArg inplaceable.
+  //   2. Or it is already inplaceable, and so is the bbArg.
+  // CHECK-NEXT:   tensor.insert_slice
+  // CHECK-SAME:     {__inplace_results_attr__ = ["true"]}
+  // CHECK-NEXT:   tensor.insert_slice
+  // CHECK-SAME:     {__inplace_results_attr__ = ["true"]}
+  // CHECK-NEXT:   scf.yield
+  // CHECK-NEXT: {__inplace_results_attr__ = ["false", "true"]}
+  %r0:2 = scf.for %i = %lb to %ub step %step iter_args(%tA = %A, %tB = %B)
+      -> (tensor<?xf32>, tensor<?xf32>)
+  {
+    %ttA = tensor.insert_slice %C into %tA[0][4][1] : tensor<4xf32> into tensor<?xf32>
+    %ttB = tensor.insert_slice %C into %tB[0][4][1] : tensor<4xf32> into tensor<?xf32>
+    scf.yield %ttA, %ttB : tensor<?xf32>, tensor<?xf32>
+  }
+
+  return %r0#0, %r0#1: tensor<?xf32>, tensor<?xf32>
+}
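+
+// Note: each yielded value must bufferize to a buffer equivalent to its
+// matching iter_arg; a swapped yield is rejected, as exercised in
+// comprehensive-module-bufferize-invalid.mlir below.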
+
+// -----
+
+//===----------------------------------------------------------------------===//
+// Cross function boundary cases.
+//===----------------------------------------------------------------------===//
+
 func private @foo(tensor<64xf32>)
 
 // CHECK-LABEL: dependence_through_call

diff  --git a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-invalid.mlir b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-invalid.mlir
index d6a6d7c67f6cf..78f84cc8540c4 100644
--- a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-invalid.mlir
+++ b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-invalid.mlir
@@ -44,3 +44,44 @@ func @bar() {
   call @foo() : () -> ()
   return
 }
+
+// -----
+
+func @scf_for(%A : tensor<?xf32>,
+              %B : tensor<?xf32> {linalg.inplaceable = true},
+              %C : tensor<4xf32>,
+              %lb : index, %ub : index, %step : index)
+  -> (tensor<?xf32>, tensor<?xf32>)
+{
+  %r0:2 = scf.for %i = %lb to %ub step %step iter_args(%tA = %A, %tB = %B)
+      -> (tensor<?xf32>, tensor<?xf32>)
+  {
+    %ttA = tensor.insert_slice %C into %tA[0][4][1] : tensor<4xf32> into tensor<?xf32>
+    %ttB = tensor.insert_slice %C into %tB[0][4][1] : tensor<4xf32> into tensor<?xf32>
+
+    // Throw a wrench in the system by swapping yielded values: this results
+    // in a ping-pong of values at each iteration, on which we currently want
+    // to fail.
+
+    // expected-error @+1 {{Yield operand #1 does not bufferize to an equivalent buffer}}
+    scf.yield %ttB, %ttA : tensor<?xf32>, tensor<?xf32>
+  }
+
+  return %r0#0, %r0#1: tensor<?xf32>, tensor<?xf32>
+}
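+
+// For comparison, a yield that keeps the iter_args order bufferizes fine
+// (see the corresponding test in comprehensive-module-bufferize.mlir):
+//   scf.yield %ttA, %ttB : tensor<?xf32>, tensor<?xf32>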
+
+// -----
+
+func @extract_slice_fun(%A : tensor<?xf32> {linalg.inplaceable = true})
+  ->  tensor<4xf32>
+{
+  // This bufferizes to a pattern that the cross-function boundary pass needs
+  // to convert into a new memref argument at all call sites; this may be
+  // either:
+  //   - an externally created aliasing subview (if we want to allow aliasing
+  //     function arguments).
+  //   - a new alloc + copy (more expensive but does not create new function
+  //     argument aliasing).
+  %r0 = tensor.extract_slice %A[0][4][1] : tensor<?xf32> to tensor<4xf32>
+
+  // expected-error @+1 {{buffer result #0 not produced by an alloc}}
+  return %r0: tensor<4xf32>
+}
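+
+// A hedged sketch of the alloc + copy alternative mentioned above (buffer
+// names are hypothetical):
+//   %view = memref.subview %A_buf[0] [4] [1] : memref<?xf32, #map> to memref<4xf32, #map>
+//   %alloc = memref.alloc() : memref<4xf32>
+//   linalg.copy(%view, %alloc) : memref<4xf32, #map>, memref<4xf32>
+//   return %alloc : memref<4xf32>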

diff  --git a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize.mlir b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize.mlir
index b71f6f92d51ed..bc6488bca8e58 100644
--- a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize.mlir
+++ b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize.mlir
@@ -1,5 +1,355 @@
 // RUN: mlir-opt %s -linalg-comprehensive-module-bufferize -split-input-file | FileCheck %s
 
+// CHECK-DAG: #[[$map_1d_dyn:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>
+
+// CHECK-LABEL: func @fill_inplace(
+//  CHECK-SAME:   %[[A:[a-zA-Z0-9]*]]: memref<?xf32, #[[$map_1d_dyn]]>
+func @fill_inplace(%A : tensor<?xf32> {linalg.inplaceable = true}) -> tensor<?xf32> {
+  //     CHECK: %[[F0:.*]] = constant 0.000000e+00 : f32
+  %f0 = constant 0.0 : f32
+
+  /// Inplaceable, no alloc
+  // CHECK-NOT: alloc
+  //     CHECK: linalg.fill(%[[F0]], %[[A]]) : f32, memref<?xf32, #[[$map_1d_dyn]]>
+  %r = linalg.fill(%f0, %A) : f32, tensor<?xf32> -> tensor<?xf32>
+
+  //     CHECK: return
+  // CHECK-NOT: tensor
+  return %r: tensor<?xf32>
+}
+
+// -----
+
+// CHECK-DAG: #[[$map_1d_dyn:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>
+
+/// No linalg.inplaceable flag, must allocate.
+// CHECK-LABEL: func @not_inplace(
+//  CHECK-SAME:   %[[A:[a-zA-Z0-9]*]]: memref<?xf32, #[[$map_1d_dyn]]>) -> memref<?xf32> {
+func @not_inplace(%A : tensor<?xf32>) -> tensor<?xf32> {
+  //     CHECK: %[[F0:.*]] = constant 0.000000e+00 : f32
+  %f0 = constant 0.0 : f32
+
+  //     CHECK: %[[D0:.*]] = memref.dim %[[A]], {{.*}} : memref<?xf32, #[[$map_1d_dyn]]>
+  //     CHECK: %[[ALLOC:.*]] = memref.alloc(%[[D0]]) : memref<?xf32>
+  //     CHECK: linalg.fill(%[[F0]], %[[ALLOC]]) : f32, memref<?xf32>
+  %r = linalg.fill(%f0, %A) : f32, tensor<?xf32> -> tensor<?xf32>
+
+  //     CHECK:  dealloc %[[ALLOC]] : memref<?xf32>
+  //     CHECK:  return %[[ALLOC]] : memref<?xf32>
+  return %r: tensor<?xf32>
+}
+
+// -----
+
+// CHECK-DAG: #[[$map_2d_dyn:.*]] = affine_map<(d0, d1)[s0, s1, s2] -> (d0 * s1 + s0 + d1 * s2)>
+
+// CHECK-LABEL: func @not_inplace
+//  CHECK-SAME:   %[[A:[a-zA-Z0-9]*]]: memref<?x?xf32, #[[$map_2d_dyn]]>) {
+func @not_inplace(%A : tensor<?x?xf32> {linalg.inplaceable = true}) -> tensor<?x?xf32> {
+  %f0 = constant 0.0 : f32
+
+  /// Cross-op multiple uses of %A, the first op which has interfering reads must alloc.
+  //       CHECK: %[[ALLOC:.*]] = memref.alloc
+  //       CHECK: linalg.fill({{.*}}, %[[ALLOC]]
+  %f = linalg.fill(%f0, %A) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
+
+  /// The second op has no interfering reads and can reuse.
+  //   CHECK-NOT: alloc
+  //       CHECK: linalg.matmul ins(%[[ALLOC]], %[[ALLOC]]{{.*}}) outs(%[[A]]
+  %r = linalg.matmul  ins(%f, %f: tensor<?x?xf32>, tensor<?x?xf32>)
+                     outs(%A: tensor<?x?xf32>)
+    -> tensor<?x?xf32>
+
+  //     CHECK: return
+  // CHECK-NOT: tensor
+  return %r: tensor<?x?xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @not_inplace
+func @not_inplace(%A : tensor<?x?xf32> {linalg.inplaceable = true}) -> tensor<?x?xf32> {
+  /// Within op multiple uses of %A, must alloc.
+  // CHECK: alloc
+  %r = linalg.matmul  ins(%A, %A: tensor<?x?xf32>, tensor<?x?xf32>)
+                     outs(%A: tensor<?x?xf32>)
+    -> tensor<?x?xf32>
+  return %r: tensor<?x?xf32>
+}
+// -----
+
+// CHECK-LABEL: func @vec_inplace
+func @vec_inplace(%A : tensor<?xf32> {linalg.inplaceable = true}, %vec : vector<4xf32>)
+    -> tensor<?xf32>
+{
+  %c0 = constant 0 : index
+
+  // CHECK-NOT: alloc
+  %r = vector.transfer_write %vec, %A[%c0] : vector<4xf32>, tensor<?xf32>
+
+  //     CHECK: return
+  // CHECK-NOT: tensor
+  return %r: tensor<?xf32>
+}
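+
+// For illustration only, the inplace transfer_write above lowers to a write
+// directly on the argument buffer (the buffer name %A_buf is hypothetical):
+//   vector.transfer_write %vec, %A_buf[%c0] : vector<4xf32>, memref<?xf32, #map>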
+
+// -----
+
+// CHECK-DAG: #[[$map_1d_dyn:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>
+
+// CHECK-LABEL: func @vec_not_inplace
+//  CHECK-SAME:   %[[A:[a-zA-Z0-9]*]]: memref<?xf32, #[[$map_1d_dyn]]>
+func @vec_not_inplace(%A : tensor<?xf32> {linalg.inplaceable = true}, %vec : vector<4xf32>)
+    -> (tensor<?xf32>, tensor<?xf32>)
+{
+  %c0 = constant 0 : index
+  %c1 = constant 1 : index
+
+  /// Cross-op multiple uses of %A, the first vector.transfer which has interfering reads must alloc.
+  //      CHECK: %[[ALLOC:.*]] = memref.alloc
+  // CHECK-NEXT: vector.transfer_write {{.*}}, %[[ALLOC]]
+  %r0 = vector.transfer_write %vec, %A[%c0] : vector<4xf32>, tensor<?xf32>
+
+  /// The second vector.transfer has no interfering reads and can reuse the buffer.
+  //  CHECK-NOT: alloc
+  // CHECK-NEXT: vector.transfer_write {{.*}}, %[[A]]
+  %r1 = vector.transfer_write %vec, %A[%c1] : vector<4xf32>, tensor<?xf32>
+
+  //     CHECK: return
+  // CHECK-NOT: tensor
+  return %r0, %r1: tensor<?xf32>, tensor<?xf32>
+}
+
+// -----
+
+// CHECK-DAG: #[[$map_1d_dyn:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>
+
+// CHECK-LABEL: func @insert_slice_fun
+//  CHECK-SAME:   %[[A0:[a-zA-Z0-9]*]]: memref<?xf32, #[[$map_1d_dyn]]>,
+//  CHECK-SAME:   %[[A1:[a-zA-Z0-9]*]]: memref<?xf32, #[[$map_1d_dyn]]>,
+//  CHECK-SAME:   %[[t0:[a-zA-Z0-9]*]]: memref<4xf32, #[[$map_1d_dyn]]>,
+//  CHECK-SAME:   %[[t1:[a-zA-Z0-9]*]]: memref<4xf32, #[[$map_1d_dyn]]>
+func @insert_slice_fun(%A0 : tensor<?xf32>,
+                       %A1 : tensor<?xf32> {linalg.inplaceable = true},
+                       %t0 : tensor<4xf32>,
+                       %t1 : tensor<4xf32> {linalg.inplaceable = true})
+  ->  (tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>)
+{
+  // Alloc and copy the whole result tensor, then copy the inserted slice into
+  // a subview of it.
+  //      CHECK: %[[REALLOC_A0:.*]] = memref.alloc
+  //      CHECK: linalg.copy(%[[A0]], %[[REALLOC_A0]]
+  //      CHECK: %[[SV_A0:.*]] = memref.subview %[[REALLOC_A0]]
+  //      CHECK: linalg.copy(%[[t0]], %[[SV_A0]])
+  %r0 = tensor.insert_slice %t0 into %A0[0][4][1] : tensor<4xf32> into tensor<?xf32>
+
+  // Alloc and copy the whole result tensor, then copy the inserted slice into
+  // a subview of it.
+  //      CHECK: %[[REALLOC_A0_2:.*]] = memref.alloc
+  //      CHECK: linalg.copy(%[[A0]]
+  //      CHECK: %[[SV_A0_2:.*]] = memref.subview %[[REALLOC_A0_2]]
+  //      CHECK: linalg.copy(%[[t1]], %[[SV_A0_2]])
+  %r1 = tensor.insert_slice %t1 into %A0[0][4][1] : tensor<4xf32> into tensor<?xf32>
+
+  //  Still alloc the large tensor because %A1 is read afterwards, then copy
+  //  the inserted slice into a subview of it.
+  //      CHECK: %[[REALLOC_A1:.*]] = memref.alloc
+  //      CHECK: linalg.copy(%[[A1]]
+  //      CHECK: %[[SV_A1:.*]] = memref.subview %[[REALLOC_A1]]
+  //      CHECK: linalg.copy(%[[t0]], %[[SV_A1]])
+  %r2 = tensor.insert_slice %t0 into %A1[0][4][1] : tensor<4xf32> into tensor<?xf32>
+
+  //  Do not realloc the large tensor; only copy the inserted slice into a
+  //  subview of %A1.
+  //  CHECK-NOT: alloc
+  //      CHECK: %[[SV_A1_2:.*]] = memref.subview %[[A1]]
+  //      CHECK: linalg.copy(%[[t1]], %[[SV_A1_2]])
+  %r3 = tensor.insert_slice %t1 into %A1[0][4][1] : tensor<4xf32> into tensor<?xf32>
+
+  //      CHECK: return %[[REALLOC_A0]], %[[REALLOC_A0_2]], %[[REALLOC_A1]] :
+  // CHECK-SAME:   memref<?xf32>, memref<?xf32>, memref<?xf32>
+  return %r0, %r1, %r2, %r3: tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>
+}
+
+// -----
+
+// CHECK-DAG: #[[$map_1d_dyn:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>
+
+// CHECK-LABEL: func @insert_slice_fun
+//  CHECK-SAME:   %[[A:[a-zA-Z0-9]*]]: memref<?xf32, #[[$map_1d_dyn]]>
+//  CHECK-SAME:   %[[t:[a-zA-Z0-9]*]]: memref<4xf32, #[[$map_1d_dyn]]>
+func @insert_slice_fun(%A : tensor<?xf32> {linalg.inplaceable = true}, %t : tensor<4xf32>)
+  -> tensor<?xf32>
+{
+  %f0 = constant 0.0 : f32
+
+  //  CHECK-NOT: alloc
+  //      CHECK: %[[SV_A:.*]] = memref.subview %[[A]]
+  //      CHECK: linalg.copy(%[[t]], %[[SV_A]])
+  %r0 = tensor.insert_slice %t into %A[0][4][1] : tensor<4xf32> into tensor<?xf32>
+
+  /// Overwrite A inplace.
+  //      CHECK: linalg.fill({{.*}}, %[[A]]
+  %r1 = linalg.fill(%f0, %r0) : f32, tensor<?xf32> -> tensor<?xf32>
+
+  //     CHECK: return
+  // CHECK-NOT: tensor
+  return %r1: tensor<?xf32>
+}
+
+// -----
+
+// CHECK-DAG: #[[$map_1d_dyn:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>
+
+// CHECK-LABEL: func @insert_slice_fun
+//  CHECK-SAME:   %[[A:[a-zA-Z0-9]*]]: memref<?xf32, #[[$map_1d_dyn]]>
+//  CHECK-SAME:   %[[t:[a-zA-Z0-9]*]]: memref<4xf32, #[[$map_1d_dyn]]>
+func @insert_slice_fun(%A : tensor<?xf32> {linalg.inplaceable = true}, %t : tensor<4xf32>)
+  -> tensor<?xf32>
+{
+  %f0 = constant 0.0 : f32
+
+  //      CHECK: linalg.fill({{.*}}, %[[A]]
+  %r0 = linalg.fill(%f0, %A) : f32, tensor<?xf32> -> tensor<?xf32>
+
+  //  CHECK-NOT: alloc
+  //      CHECK: %[[SV_A:.*]] = memref.subview %[[A]]
+  /// Overwrite A inplace by copying into the subview.
+  //      CHECK: linalg.copy(%[[t]], %[[SV_A]])
+  %r1 = tensor.insert_slice %t into %r0[0][4][1] : tensor<4xf32> into tensor<?xf32>
+
+  //     CHECK: return
+  // CHECK-NOT: tensor
+  return %r1: tensor<?xf32>
+}
+
+// -----
+
+// CHECK-DAG: #[[$map_1d_dyn:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>
+
+// CHECK-LABEL: func @insert_slice_fun_not_inplace
+//  CHECK-SAME:   %[[A:[a-zA-Z0-9]*]]: memref<?xf32, #[[$map_1d_dyn]]>
+//  CHECK-SAME:   %[[t:[a-zA-Z0-9]*]]: memref<4xf32, #[[$map_1d_dyn]]>
+func @insert_slice_fun_not_inplace(%A : tensor<?xf32>, %t : tensor<4xf32>)
+  -> tensor<?xf32>
+{
+  //      CHECK: %[[ALLOC:.*]] = memref.alloc(%{{.*}}) : memref<?xf32>
+  //      CHECK: linalg.copy(%[[A]], %[[ALLOC]]) : memref<?xf32{{.*}}, memref<?xf32>
+  //      CHECK: %[[SV:.*]] = memref.subview %[[ALLOC]][0] [4] [1] : memref<?xf32> to memref<4xf32>
+  //      CHECK: linalg.copy(%[[t]], %[[SV]]) : memref<4xf32, #map>, memref<4xf32>
+  //      CHECK: memref.dealloc %[[ALLOC]] : memref<?xf32>
+  %r0 = tensor.insert_slice %t into %A[0][4][1] : tensor<4xf32> into tensor<?xf32>
+
+  //     CHECK: return %{{.*}} : memref<?xf32>
+  return %r0: tensor<?xf32>
+}
+
+// -----
+
+// CHECK-DAG: #[[$map_1d_dyn:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>
+
+// CHECK-LABEL: func @insert_slice_fun_not_inplace
+//  CHECK-SAME:   %[[A:[a-zA-Z0-9]*]]: memref<?xf32, #[[$map_1d_dyn]]>
+//  CHECK-SAME:   %[[t:[a-zA-Z0-9]*]]: memref<4xf32, #[[$map_1d_dyn]]>
+func @insert_slice_fun_not_inplace(%A : tensor<?xf32> {linalg.inplaceable = true}, %t : tensor<4xf32>)
+  -> (tensor<?xf32>, tensor<?xf32>)
+{
+  %f0 = constant 0.0 : f32
+
+  // tensor.insert_slice is bufferized first; %A is inplaceable, so it can bufferize inplace.
+  //  CHECK-DAG: %[[SV_A:.*]] = memref.subview %[[A]][0] [4] [1] : memref<?xf32, {{.*}}> to memref<4xf32, {{.*}}>
+  //  CHECK-DAG: linalg.copy(%[[t]], %[[SV_A]]) : memref<4xf32, {{.*}}>, memref<4xf32, {{.*}}>
+  %r0 = tensor.insert_slice %t into %A[0][4][1] : tensor<4xf32> into tensor<?xf32>
+
+  // The fill would interfere with %r0, which is also being returned.
+  // So it must bufferize out of place into a new alloc.
+  //  CHECK-DAG: %[[ALLOC:.*]] = memref.alloc({{.*}}) : memref<?xf32>
+  //      CHECK: linalg.fill(%{{.*}}, %[[ALLOC]]
+  %r1 = linalg.fill(%f0, %A) : f32, tensor<?xf32> -> tensor<?xf32>
+
+  //      CHECK: memref.dealloc %[[ALLOC]] : memref<?xf32>
+  //      CHECK: return %[[ALLOC]] : memref<?xf32>
+  return %r1, %r0: tensor<?xf32>, tensor<?xf32>
+}
+
+//===----------------------------------------------------------------------===//
+// Simple loop cases
+//===----------------------------------------------------------------------===//
+
+// -----
+
+// CHECK-DAG: #[[$map_1d_dyn:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>
+
+// CHECK-LABEL: func @scf_for_yield_only
+//  CHECK-SAME:   %[[A:[a-zA-Z0-9]*]]: memref<?xf32, #[[$map_1d_dyn]]>
+//  CHECK-SAME:   %[[t:[a-zA-Z0-9]*]]: memref<?xf32, #[[$map_1d_dyn]]>
+func @scf_for_yield_only(%A : tensor<?xf32>,
+                         %B : tensor<?xf32> {linalg.inplaceable = true},
+                         %lb : index, %ub : index, %step : index)
+  -> (tensor<?xf32>, tensor<?xf32>)
+{
+  //     CHECK:   %[[ALLOC_FOR_A:.*]] = memref.alloc
+  //     CHECK:   linalg.copy(%[[A]], %[[ALLOC_FOR_A]])
+
+  // The first scf.for remains but just turns into dead code.
+  %r0 = scf.for %i = %lb to %ub step %step iter_args(%t = %A) -> (tensor<?xf32>) {
+    scf.yield %t : tensor<?xf32>
+  }
+
+  // The second scf.for remains but just turns into dead code.
+  %r1 = scf.for %i = %lb to %ub step %step iter_args(%t = %B) -> (tensor<?xf32>) {
+    scf.yield %t : tensor<?xf32>
+  }
+
+  //     CHECK:   memref.dealloc %[[ALLOC_FOR_A]] : memref<?xf32>
+  //     CHECK:   return %[[ALLOC_FOR_A]] : memref<?xf32>
+  return %r0, %r1: tensor<?xf32>, tensor<?xf32>
+}
+
+// -----
+
+// CHECK-DAG: #[[$map_1d_dyn:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>
+
+// CHECK-LABEL: func @scf_for_with_tensor.insert_slice
+//  CHECK-SAME:   %[[A:[a-zA-Z0-9]*]]: memref<?xf32, #[[$map_1d_dyn]]>
+//  CHECK-SAME:   %[[B:[a-zA-Z0-9]*]]: memref<?xf32, #[[$map_1d_dyn]]>
+//  CHECK-SAME:   %[[C:[a-zA-Z0-9]*]]: memref<4xf32, #[[$map_1d_dyn]]>
+func @scf_for_with_tensor.insert_slice(
+   %A : tensor<?xf32>,
+   %B : tensor<?xf32> {linalg.inplaceable = true},
+   %C : tensor<4xf32>,
+   %lb : index, %ub : index, %step : index)
+  -> (tensor<?xf32>, tensor<?xf32>)
+{
+  //     CHECK:   %[[ALLOC_FOR_A:.*]] = memref.alloc
+  //     CHECK:   linalg.copy(%[[A]], %[[ALLOC_FOR_A]])
+
+  //     CHECK: %[[svA:.*]] = memref.subview %[[ALLOC_FOR_A]][0] [4] [1]
+  //     CHECK: %[[svB:.*]] = memref.subview %[[B]][0] [4] [1]
+
+  //     CHECK:   scf.for {{.*}}
+  // CHECK-NOT: iter_args
+  %r0:2 = scf.for %i = %lb to %ub step %step iter_args(%tA = %A, %tB = %B)
+      -> (tensor<?xf32>, tensor<?xf32>)
+  {
+    // %ttA bufferizes to a direct copy of %C into %svA.
+    //     CHECK: linalg.copy(%[[C]], %[[svA]])
+    %ttA = tensor.insert_slice %C into %tA[0][4][1] : tensor<4xf32> into tensor<?xf32>
+
+    // %ttB bufferizes to a direct copy of %C into %svB.
+    //     CHECK:   linalg.copy(%[[C]], %[[svB]])
+    %ttB = tensor.insert_slice %C into %tB[0][4][1] : tensor<4xf32> into tensor<?xf32>
+
+    // CHECK-NOT:   scf.yield
+    scf.yield %ttA, %ttB : tensor<?xf32>, tensor<?xf32>
+  }
+
+  //     CHECK:  memref.dealloc %[[ALLOC_FOR_A]] : memref<?xf32>
+  //     CHECK:  return %[[ALLOC_FOR_A]] : memref<?xf32>
+  return %r0#0, %r0#1: tensor<?xf32>, tensor<?xf32>
+}
+
+// -----
+
+//===----------------------------------------------------------------------===//
+// Cross function boundary cases.
+//===----------------------------------------------------------------------===//
+
 //      CHECK: #[[$DYN_1D_MAP:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>
 
 //      CHECK:  func private @some_external_func(memref<?xf32, #[[$DYN_1D_MAP]]>)


        

