[Mlir-commits] [mlir] f4f1cf6 - [mlir][bufferize] Better analysis for return values of CallOps

Matthias Springer llvmlistbot at llvm.org
Wed Apr 6 07:56:06 PDT 2022


Author: Matthias Springer
Date: 2022-04-06T23:54:32+09:00
New Revision: f4f1cf6c31beaf387ae73b0407b30b41438dafde

URL: https://github.com/llvm/llvm-project/commit/f4f1cf6c31beaf387ae73b0407b30b41438dafde
DIFF: https://github.com/llvm/llvm-project/commit/f4f1cf6c31beaf387ae73b0407b30b41438dafde.diff

LOG: [mlir][bufferize] Better analysis for return values of CallOps

Support returning arbitrary tensors from functions, even tensors that are
not equivalent to a function argument. To that end, additional information
is gathered during the analysis phase: in particular, which function
arguments alias with which return values.
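
For example (taken from the new test cases added below), the tensor
returned here aliases the function bbArg but is not equivalent to it, so
the analysis records an aliasing bbArg/return value pair instead of an
equivalence:

  func @return_slice(%t: tensor<?xf32>, %sz: index) -> (tensor<?xf32>) {
    %0 = tensor.extract_slice %t[4][%sz][1] : tensor<?xf32> to tensor<?xf32>
    return %0 : tensor<?xf32>
  }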

Also fix bugs in the current implementation when returning equivalent
tensors. Various unit tests are added to improve test coverage.
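
One such case (from the updated tests below): the value returned by
@inner_func is equivalent to its bbArg, so when the function is called
with a writable tensor (e.g. linalg.inplaceable = true), no alloc or copy
is inserted at the call site:

  func @inner_func(%t: tensor<?xf32>) -> tensor<?xf32> {
    %f = arith.constant 1.0 : f32
    %c0 = arith.constant 0 : index
    %0 = tensor.insert %f into %t[%c0] : tensor<?xf32>
    return %0 : tensor<?xf32>
  }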

Note: Returning non-equivalent tensors is only allowed when
allowReturnAllocs is enabled. This functionality is useful for unit
testing and for compatibility with other bufferizations such as the sparse
compiler. It is also a step towards using ModuleBufferization as a
replacement for --func-bufferize.
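
For example (from the new allow-return-allocs test below), this function
returns a tensor that is not equivalent to any bbArg; with
allow-return-allocs it bufferizes to a function returning a freshly
allocated memref (which currently leaks, see the TODO in the test):

  func @create_tensor() -> tensor<10xf32> {
    %0 = linalg.init_tensor [10] : tensor<10xf32>
    return %0 : tensor<10xf32>
  }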

Differential Revision: https://reviews.llvm.org/D119120

Added: 
    mlir/test/Dialect/Linalg/one-shot-module-bufferize-allow-return-allocs.mlir
    mlir/test/Dialect/Linalg/one-shot-module-bufferize.mlir

Modified: 
    mlir/include/mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h
    mlir/lib/Dialect/Linalg/ComprehensiveBufferize/ModuleBufferization.cpp
    mlir/test/Dialect/Linalg/comprehensive-module-bufferize.mlir

Removed: 
    


################################################################################
diff --git a/mlir/include/mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h b/mlir/include/mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h
index 9d3b3a1105549..859b138110449 100644
--- a/mlir/include/mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h
+++ b/mlir/include/mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h
@@ -77,6 +77,11 @@ class BufferizationAliasInfo {
   /// Set the inPlace bufferization spec to false.
   void bufferizeOutOfPlace(OpOperand &operand);
 
+  /// Return true if `v1` and `v2` may bufferize to aliasing buffers.
+  bool areAliasingBufferizedValues(Value v1, Value v2) const {
+    return aliasInfo.isEquivalent(v1, v2);
+  }
+
   /// Return true if `v1` and `v2` bufferize to equivalent buffers.
   bool areEquivalentBufferizedValues(Value v1, Value v2) const {
     return equivalentInfo.isEquivalent(v1, v2);

diff --git a/mlir/lib/Dialect/Linalg/ComprehensiveBufferize/ModuleBufferization.cpp b/mlir/lib/Dialect/Linalg/ComprehensiveBufferize/ModuleBufferization.cpp
index 3ef695f414631..778199aa25202 100644
--- a/mlir/lib/Dialect/Linalg/ComprehensiveBufferize/ModuleBufferization.cpp
+++ b/mlir/lib/Dialect/Linalg/ComprehensiveBufferize/ModuleBufferization.cpp
@@ -19,8 +19,9 @@
 // gathered through PostAnalysisStepFns and stored in
 // `ModuleAnalysisState`.
 //
-// * `equivalentFuncOpBBArgsAnalysis` determines the equivalent bbArg for each
-//   tensor return value (if any).
+// * `aliasingFuncOpBBArgsAnalysis` determines the equivalent/aliasing bbArgs
+// for
+//   each tensor return value (if any).
 // * `funcOpBbArgReadWriteAnalysis` determines whether or not a tensor bbArg is
 //   read/written.
 //
@@ -93,16 +94,31 @@ enum class FuncOpAnalysisState { NotAnalyzed, InProgress, Analyzed };
 /// Extra analysis state that is required for bufferization of function
 /// boundaries.
 struct ModuleAnalysisState : public DialectAnalysisState {
+  // Note: Function arguments and/or function return values may disappear during
+  // bufferization. Functions and their CallOps are analyzed and bufferized
+  // separately. To ensure that a CallOp analysis/bufferization can access an
+  // already bufferized function's analysis results, we store bbArg/return value
+  // indices instead of BlockArguments/OpOperand pointers.
+
   /// A set of block argument indices.
   using BbArgIndexSet = DenseSet<int64_t>;
 
   /// A mapping of indices to indices.
   using IndexMapping = DenseMap<int64_t, int64_t>;
 
+  /// A mapping of indices to a list of indices.
+  using IndexToIndexListMapping = DenseMap<int64_t, SmallVector<int64_t>>;
+
   /// A mapping of ReturnOp OpOperand indices to equivalent FuncOp BBArg
   /// indices.
   DenseMap<FuncOp, IndexMapping> equivalentFuncArgs;
 
+  /// A mapping of ReturnOp OpOperand indices to aliasing FuncOp BBArg indices.
+  DenseMap<FuncOp, IndexToIndexListMapping> aliasingFuncArgs;
+
+  /// A mapping of FuncOp BBArg indices to aliasing ReturnOp OpOperand indices.
+  DenseMap<FuncOp, IndexToIndexListMapping> aliasingReturnVals;
+
   /// A set of all read BlockArguments of FuncOps.
   DenseMap<FuncOp, BbArgIndexSet> readBbArgs;
 
@@ -124,13 +140,21 @@ struct ModuleAnalysisState : public DialectAnalysisState {
   void startFunctionAnalysis(FuncOp funcOp) {
     analyzedFuncOps[funcOp] = FuncOpAnalysisState::InProgress;
     auto createdEquiv = equivalentFuncArgs.try_emplace(funcOp, IndexMapping());
+    auto createdAliasingOperands =
+        aliasingFuncArgs.try_emplace(funcOp, IndexToIndexListMapping());
+    auto createdAliasingResults =
+        aliasingReturnVals.try_emplace(funcOp, IndexToIndexListMapping());
     auto createdRead = readBbArgs.try_emplace(funcOp, BbArgIndexSet());
     auto createdWritten = writtenBbArgs.try_emplace(funcOp, BbArgIndexSet());
     (void)createdEquiv;
+    (void)createdAliasingOperands;
+    (void)createdAliasingResults;
     (void)createdRead;
     (void)createdWritten;
 #ifndef NDEBUG
     assert(createdEquiv.second && "equivalence info exists already");
+    assert(createdAliasingOperands.second && "aliasing info exists already");
+    assert(createdAliasingResults.second && "aliasing info exists already");
     assert(createdRead.second && "bbarg access info exists already");
     assert(createdWritten.second && "bbarg access info exists already");
 #endif // NDEBUG
@@ -201,12 +225,12 @@ static void annotateEquivalentReturnBbArg(OpOperand &returnVal,
   op->setAttr(kEquivalentArgsAttr, b.getI64ArrayAttr(equivBbArgs));
 }
 
-/// Store function BlockArguments that are equivalent to a returned value in
-/// ModuleAnalysisState.
+/// Store function BlockArguments that are equivalent to/aliasing a returned
+/// value in ModuleAnalysisState.
 static LogicalResult
-equivalentFuncOpBBArgsAnalysis(Operation *op, AnalysisState &state,
-                               BufferizationAliasInfo &aliasInfo,
-                               SmallVector<Operation *> &newOps) {
+aliasingFuncOpBBArgsAnalysis(Operation *op, AnalysisState &state,
+                             BufferizationAliasInfo &aliasInfo,
+                             SmallVector<Operation *> &newOps) {
   ModuleAnalysisState &moduleState = getModuleAnalysisState(state);
 
   // Support only single return-terminated block in the function.
@@ -217,14 +241,20 @@ equivalentFuncOpBBArgsAnalysis(Operation *op, AnalysisState &state,
   for (OpOperand &returnVal : returnOp->getOpOperands())
     if (returnVal.get().getType().isa<RankedTensorType>())
       for (BlockArgument bbArg : funcOp.getArguments())
-        if (bbArg.getType().isa<RankedTensorType>())
+        if (bbArg.getType().isa<RankedTensorType>()) {
+          int64_t returnIdx = returnVal.getOperandNumber();
+          int64_t bbArgIdx = bbArg.getArgNumber();
           if (aliasInfo.areEquivalentBufferizedValues(returnVal.get(), bbArg)) {
-            moduleState
-                .equivalentFuncArgs[funcOp][returnVal.getOperandNumber()] =
-                bbArg.getArgNumber();
+            moduleState.equivalentFuncArgs[funcOp][returnIdx] = bbArgIdx;
             if (state.getOptions().testAnalysisOnly)
               annotateEquivalentReturnBbArg(returnVal, bbArg);
           }
+          if (aliasInfo.areAliasingBufferizedValues(returnVal.get(), bbArg)) {
+            moduleState.aliasingFuncArgs[funcOp][returnIdx].push_back(bbArgIdx);
+            moduleState.aliasingReturnVals[funcOp][bbArgIdx].push_back(
+                returnIdx);
+          }
+        }
 
   return success();
 }
@@ -364,7 +394,8 @@ getBufferizedFunctionType(MLIRContext *ctx, TypeRange argumentTypes,
 }
 
 /// Gather equivalence info of CallOps.
-/// Note: This only adds new equivalence info if `funcOp` was already analyzed.
+/// Note: This only adds new equivalence info if the called function was already
+/// analyzed.
 // TODO: This does not handle cyclic function call graphs etc.
 static void equivalenceAnalysis(FuncOp funcOp,
                                 BufferizationAliasInfo &aliasInfo,
@@ -750,15 +781,23 @@ struct CallOpInterface
     FuncOp funcOp = getCalledFunction(callOp);
     assert(funcOp && "expected CallOp to a FuncOp");
     const ModuleAnalysisState &moduleState = getModuleAnalysisState(state);
+    if (getFuncOpAnalysisState(state, funcOp) !=
+        FuncOpAnalysisState::Analyzed) {
+      // FuncOp not analyzed yet. Any OpResult may be aliasing.
+      SmallVector<OpResult> result;
+      for (OpResult opResult : op->getOpResults())
+        if (opResult.getType().isa<TensorType>())
+          result.push_back(opResult);
+      return result;
+    }
 
+    // Get aliasing results from state.
+    auto aliasingReturnVals =
+        moduleState.aliasingReturnVals.lookup(funcOp).lookup(
+            opOperand.getOperandNumber());
     SmallVector<OpResult> result;
-    for (int64_t resultIdx = 0; resultIdx < callOp->getNumResults();
-         ++resultIdx)
-      if (Optional<int64_t> maybeArgNumber =
-              getEquivalentFuncArgIdx(funcOp, moduleState, resultIdx))
-        if (*maybeArgNumber == opOperand.getOperandNumber())
-          result.push_back(callOp->getOpResult(resultIdx));
-
+    for (int64_t resultIdx : aliasingReturnVals)
+      result.push_back(callOp->getOpResult(resultIdx));
     return result;
   }
 
@@ -769,17 +808,23 @@ struct CallOpInterface
     FuncOp funcOp = getCalledFunction(callOp);
     assert(funcOp && "expected CallOp to a FuncOp");
     const ModuleAnalysisState &moduleState = getModuleAnalysisState(state);
+    if (getFuncOpAnalysisState(state, funcOp) !=
+        FuncOpAnalysisState::Analyzed) {
+      // FuncOp not analyzed yet. Any OpOperand may be aliasing.
+      SmallVector<OpOperand *> result;
+      for (OpOperand &opOperand : op->getOpOperands())
+        if (opOperand.get().getType().isa<TensorType>())
+          result.push_back(&opOperand);
+      return result;
+    }
 
-    // TODO: We should be looking for aliasing block arguments here. The current
-    // condition is actually stronger than neccesary. Once we check for aliasing
-    // block arguments, we may be multiple.
-    if (Optional<int64_t> maybeArgNumber = getEquivalentFuncArgIdx(
-            funcOp, moduleState, opResult.getResultNumber()))
-      return {&op->getOpOperand(*maybeArgNumber)};
-
-    // Note: Returning a non-equivalent tensor from a FuncOp is currently not
-    // supported an will fail bufferization.
-    return {};
+    // Get aliasing bbArgs from state.
+    auto aliasingFuncArgs = moduleState.aliasingFuncArgs.lookup(funcOp).lookup(
+        opResult.getResultNumber());
+    SmallVector<OpOperand *> result;
+    for (int64_t bbArgIdx : aliasingFuncArgs)
+      result.push_back(&callOp->getOpOperand(bbArgIdx));
+    return result;
   }
 
   BufferRelation bufferRelation(Operation *op, OpResult opResult,
@@ -799,6 +844,8 @@ struct CallOpInterface
     assert(funcOp && "expected CallOp to a FuncOp");
     const ModuleAnalysisState &moduleState =
         getModuleAnalysisState(state.getAnalysisState());
+    const OneShotBufferizationOptions &options =
+        static_cast<const OneShotBufferizationOptions &>(state.getOptions());
 
     // Result types of the bufferized CallOp.
     SmallVector<Type> resultTypes;
@@ -850,8 +897,16 @@ struct CallOpInterface
         continue;
       }
 
-      return callOp->emitError(
-          "call to FuncOp that returns non-equivalent tensors not supported");
+      if (!options.allowReturnAllocs)
+        return callOp->emitError(
+            "call to FuncOp that returns non-equivalent tensors not supported");
+
+      // Returning a memref. This memref is not equivalent to any bbArg. It is
+      // likely a newly allocated buffer. We may want to hoist such allocations
+      // to the call site in the future.
+      retValMapping[returnValIdx] = resultTypes.size();
+      resultTypes.push_back(
+          funcOp.getFunctionType().getResult(resultTypes.size()));
     }
 
     // 2. Compute bufferized FunctionType.
@@ -859,7 +914,7 @@ struct CallOpInterface
     // Get the bufferized FunctionType for funcOp or construct it if not yet
     // available.
     FunctionType bufferizedFuncType = getBufferizedFunctionType(
-        funcOp.getContext(), argumentTypes, resultTypes, state.getOptions());
+        funcOp.getContext(), argumentTypes, resultTypes, options);
 
     // 3. Rewrite tensor operands as memrefs based on `bufferizedFuncType`.
     for (OpOperand &opOperand : callOp->getOpOperands()) {
@@ -1021,7 +1076,7 @@ LogicalResult mlir::linalg::comprehensive_bufferize::runModuleBufferize(
     return failure();
 
   // Collect bbArg/return value information after the analysis.
-  options.addPostAnalysisStep(equivalentFuncOpBBArgsAnalysis);
+  options.addPostAnalysisStep(aliasingFuncOpBBArgsAnalysis);
   options.addPostAnalysisStep(funcOpBbArgReadWriteAnalysis);
 
   // Analyze ops.

diff --git a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize.mlir b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize.mlir
index 563e22f5d6a45..b8f9dcf0149b8 100644
--- a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize.mlir
+++ b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize.mlir
@@ -406,84 +406,6 @@ func @scf_for_with_tensor.insert_slice(
 // Cross function boundary cases.
 //===----------------------------------------------------------------------===//
 
-//      CHECK: #[[$DYN_1D_MAP:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>
-
-//      CHECK: memref.global "private" constant @__constant_4xi32 : memref<4xi32> = dense<[1, 2, 3, 4]>
-//      CHECK: func private @some_external_func(memref<4xi32, #[[$DYN_1D_MAP]]>)
-func private @some_external_func(tensor<4xi32>)
-
-//      CHECK: func @main()
-func @main() {
-//  CHECK-DAG:   %[[A:.*]] = memref.get_global @__constant_4xi32 : memref<4xi32>
-  %A = arith.constant dense<[1, 2, 3, 4]> : tensor<4xi32>
-
-//  CHECK-DAG:   %[[alloc:.*]] = memref.alloc
-//  CHECK-DAG:   %[[B:.*]] = memref.cast %[[alloc]] : memref<4xi32> to memref<4xi32, #[[$DYN_1D_MAP]]>
-//  CHECK-DAG:   memref.copy %[[A]], %[[alloc]]
-//      CHECK:   call @some_external_func(%[[B]]) : (memref<4xi32, #[[$DYN_1D_MAP]]>) -> ()
-  call @some_external_func(%A) : (tensor<4xi32>) -> ()
-
-//      CHECK: memref.dealloc %[[alloc]]
-  return
-}
-
-// -----
-
-//      CHECK: #[[$DYN_1D_MAP:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>
-
-//      CHECK: memref.global "private" constant @__constant_4xi32 : memref<4xi32> = dense<[1, 2, 3, 4]>
-//      CHECK: func private @some_external_func_within_scf_execute(memref<4xi32, #[[$DYN_1D_MAP]]>)
-func private @some_external_func_within_scf_execute(tensor<4xi32>)
-
-//      CHECK: func @main()
-func @main() {
-//  CHECK-DAG:   %[[A:.*]] = memref.get_global @__constant_4xi32 : memref<4xi32>
-  %A = arith.constant dense<[1, 2, 3, 4]> : tensor<4xi32>
-
-//  CHECK-DAG:   %[[alloc:.*]] = memref.alloc
-//  CHECK-DAG:   %[[B:.*]] = memref.cast %[[alloc]] : memref<4xi32> to memref<4xi32, #[[$DYN_1D_MAP]]>
-//  CHECK-DAG:   memref.copy %[[A]], %[[alloc]]
-//      CHECK:   call @some_external_func_within_scf_execute(%[[B]]) : (memref<4xi32, #[[$DYN_1D_MAP]]>) -> ()
-  scf.execute_region {
-    call @some_external_func_within_scf_execute(%A) : (tensor<4xi32>) -> ()
-    scf.yield
-  }
-
-//      CHECK:   memref.dealloc %[[alloc]]
-  return
-}
-
-// -----
-
-// CHECK: func private @external_func_with_return_val(memref<4xi32, #{{.*}}>) -> f32
-func private @external_func_with_return_val(tensor<4xi32>) -> f32
-
-// -----
-
-// CHECK-LABEL: func @execute_region_test(
-//  CHECK-SAME:     %[[m1:.*]]: memref<?xf32
-func @execute_region_test(%t1 : tensor<?xf32> {linalg.inplaceable = "true"})
-    -> (f32, tensor<?xf32>, f32)
-{
-  %f1 = arith.constant 0.0 : f32
-  %f2 = arith.constant 1.0 : f32
-  %idx = arith.constant 7 : index
-
-  // scf.execute_region is canonicalized away after bufferization. So just the
-  // memref.store is left over.
-
-  // CHECK: memref.store %{{.*}}, %[[m1]][%{{.*}}]
-  %0, %1, %2 = scf.execute_region -> (f32, tensor<?xf32>, f32) {
-    %t2 = tensor.insert %f2 into %t1[%idx] : tensor<?xf32>
-    scf.yield %f1, %t2, %f2 : f32, tensor<?xf32>, f32
-  }
-
-  // CHECK: return %{{.*}}, %{{.*}} : f32, f32
-  return %0, %1, %2 : f32, tensor<?xf32>, f32
-}
-
-// -----
-
 // CHECK-LABEL: func @execute_region_with_conflict(
 //  CHECK-SAME:     %[[m1:.*]]: memref<?xf32
 func @execute_region_with_conflict(%t1 : tensor<?xf32> {linalg.inplaceable = "true"})
@@ -513,193 +435,6 @@ func @execute_region_with_conflict(%t1 : tensor<?xf32> {linalg.inplaceable = "tr
 
 // -----
 
-//      CHECK: #[[$DYN_1D_MAP:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>
-
-//      CHECK:  func private @some_external_func(memref<?xf32, #[[$DYN_1D_MAP]]>)
-func private @some_external_func(tensor<?xf32>)
-
-//      CHECK:  func @scf_for_with_tensor_insert_slice(
-// CHECK-SAME:    %[[A:[a-zA-Z0-9]*]]: memref<?xf32, #[[$DYN_1D_MAP]]>
-// CHECK-SAME:    %[[B:[a-zA-Z0-9]*]]: memref<?xf32, #[[$DYN_1D_MAP]]>
-// CHECK-SAME:    %[[C:[a-zA-Z0-9]*]]: memref<4xf32, #[[$DYN_1D_MAP]]>
-func @scf_for_with_tensor_insert_slice(
-    %A : tensor<?xf32>, %B : tensor<?xf32>, %C : tensor<4xf32>,
-    %lb : index, %ub : index, %step : index)
-  -> (tensor<?xf32>, tensor<?xf32>)
-{
-  // CHECK-NEXT: scf.for
-  %r0:2 = scf.for %i = %lb to %ub step %step iter_args(%tA = %A, %tB = %B)
-      -> (tensor<?xf32>, tensor<?xf32>)
-  {
-    // CHECK-NEXT:   %[[SVA:.*]] = memref.subview %[[A]]
-    // CHECK-NEXT:   memref.copy %[[C]], %[[SVA]] : memref<4xf32, #[[$DYN_1D_MAP]]> to memref<4xf32, #[[$DYN_1D_MAP]]>
-    %ttA = tensor.insert_slice %C into %tA[%i][4][1] : tensor<4xf32> into tensor<?xf32>
-
-    // CHECK-NEXT:   %[[SVB:.*]] = memref.subview %[[B]]
-    // CHECK-NEXT:   memref.copy %[[C]], %[[SVB]] : memref<4xf32, #[[$DYN_1D_MAP]]> to memref<4xf32, #[[$DYN_1D_MAP]]>
-    %ttB = tensor.insert_slice %C into %tB[%i][4][1] : tensor<4xf32> into tensor<?xf32>
-
-    // scf.yield is empty and is elided
-    //  CHECK-NOT:   scf.yield
-    scf.yield %ttA, %ttB : tensor<?xf32>, tensor<?xf32>
-  }
-
-  // Swaparoo requires bufferizing the whole function to figure out who's who.
-  return %r0#1, %r0#0: tensor<?xf32>, tensor<?xf32>
-}
-
-//      CHECK:  func @bar(
-// CHECK-SAME:    %[[A:[a-zA-Z0-9]*]]: memref<?xf32, #[[$DYN_1D_MAP]]>
-// CHECK-SAME:    %[[B:[a-zA-Z0-9]*]]: memref<?xf32, #[[$DYN_1D_MAP]]>
-// CHECK-SAME:    %[[C:[a-zA-Z0-9]*]]: memref<4xf32, #[[$DYN_1D_MAP]]>
-func @bar(
-    %A : tensor<?xf32> {linalg.inplaceable = true},
-    %B : tensor<?xf32> {linalg.inplaceable = true},
-    %C : tensor<4xf32> {linalg.inplaceable = true},
-    %lb : index, %ub : index, %step : index)
-  -> (tensor<?xf32>, tensor<?xf32>)
-{
-//  CHECK-DAG:   call @scf_for_with_tensor_insert_slice(%[[A]], %[[B]], %[[C]]
-  %r0:2 = call @scf_for_with_tensor_insert_slice(%A, %B, %C, %lb, %ub, %step) :
-      (tensor<?xf32>, tensor<?xf32>, tensor<4xf32>, index, index, index)
-        -> (tensor<?xf32>, tensor<?xf32>)
-
-  // %r0#0 requires a copy because we have no idea what the function is doing.
-//  CHECK-DAG:   %[[alloc:.*]] = memref.alloc
-//  CHECK-DAG:   %[[casted:.*]] = memref.cast %[[alloc]]
-//      CHECK:   memref.copy %[[B]], %[[alloc]]
-// CHECK-NEXT:   call @some_external_func(%[[casted]]) : (memref<?xf32, #[[$DYN_1D_MAP]]>) -> ()
-  call @some_external_func(%r0#0) : (tensor<?xf32>) -> ()
-
-//      CHECK:   return
-  return %r0#0, %r0#1: tensor<?xf32>, tensor<?xf32>
-}
-
-// -----
-
-//  CHECK-DAG: #[[$DYN_0D_MAP:.*]] = affine_map<()[s0] -> (s0)>
-//  CHECK-DAG: #[[$DYN_1D_MAP:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>
-
-//      CHECK:  func @init_and_dot(
-// CHECK-SAME:    %[[A:[a-zA-Z0-9]*]]: memref<64xf32, #[[$DYN_1D_MAP]]>
-// CHECK-SAME:    %[[B:[a-zA-Z0-9]*]]: memref<64xf32, #[[$DYN_1D_MAP]]>
-// CHECK-SAME:    %[[C:[a-zA-Z0-9]*]]: memref<f32, #[[$DYN_0D_MAP]]>
-func @init_and_dot(%a: tensor<64xf32>, %b: tensor<64xf32>, %c: tensor<f32>) -> tensor<f32> {
-  // CHECK-NEXT:   %[[C0:.*]] = arith.constant 0{{.*}} : f32
-  %v0 = arith.constant 0.0 : f32
-
-  // CHECK-NEXT:   linalg.fill ins(%[[C0]] : f32) outs(%[[C]] : memref<f32, #[[$DYN_0D_MAP]]>)
-  %d = linalg.fill ins(%v0 : f32) outs(%c : tensor<f32>) -> tensor<f32>
-
-  // CHECK-NEXT:   linalg.dot ins(%[[A]], %[[B]] : memref<64xf32, #[[$DYN_1D_MAP]]>, memref<64xf32, #[[$DYN_1D_MAP]]>) outs(%[[C]] : memref<f32, #[[$DYN_0D_MAP]]>)
-  %e = linalg.dot ins(%a, %b : tensor<64xf32>,tensor<64xf32>)
-    outs(%d: tensor<f32>) -> tensor<f32>
-
-  // CHECK-NEXT:   return
-  return %e : tensor<f32>
-}
-
-//      CHECK:  func @main()
-func @main() {
-  //  CHECK-DAG:   %[[C0:.*]] = arith.constant 0{{.*}} : f32
-  //  CHECK-DAG:   %[[C1:.*]] = arith.constant 1{{.*}} : f32
-  //  CHECK-DAG:   %[[C2:.*]] = arith.constant 2{{.*}} : f32
-  %v0 = arith.constant 0.0 : f32
-  %v1 = arith.constant 1.0 : f32
-  %v2 = arith.constant 2.0 : f32
-
-  // CHECK-NEXT:   %[[A:.*]] = memref.alloc() {alignment = 128 : i64} : memref<64xf32>
-  // CHECK-NEXT:   %[[B:.*]] = memref.alloc() {alignment = 128 : i64} : memref<64xf32>
-  // CHECK-NEXT:   %[[C:.*]] = memref.alloc() {alignment = 128 : i64} : memref<f32>
-  //  CHECK-DAG:   %[[cA:.*]] = memref.cast %[[A]] : memref<64xf32> to memref<64xf32, #[[$DYN_1D_MAP]]>
-  //  CHECK-DAG:   %[[cB:.*]] = memref.cast %[[B]] : memref<64xf32> to memref<64xf32, #[[$DYN_1D_MAP]]>
-  //  CHECK-DAG:   %[[cC:.*]] = memref.cast %[[C]] : memref<f32> to memref<f32, #[[$DYN_0D_MAP]]>
-  %A = linalg.init_tensor [64] : tensor<64xf32>
-  %B = linalg.init_tensor [64] : tensor<64xf32>
-  %C = linalg.init_tensor [] : tensor<f32>
-
-  //  CHECK-DAG:   linalg.fill ins(%[[C1]] : f32) outs(%[[A]] : memref<64xf32>)
-  //  CHECK-DAG:   linalg.fill ins(%[[C2]] : f32) outs(%[[B]] : memref<64xf32>)
-  //  CHECK-DAG:   linalg.fill ins(%[[C0]] : f32) outs(%[[C]] : memref<f32>)
-  %AA = linalg.fill ins(%v1 : f32) outs(%A : tensor<64xf32>) -> tensor<64xf32>
-  %BB = linalg.fill ins(%v2 : f32) outs(%B : tensor<64xf32>) -> tensor<64xf32>
-  %CC = linalg.fill ins(%v0 : f32) outs(%C : tensor<f32>) -> tensor<f32>
-
-  // CHECK-NEXT:   call @init_and_dot(%[[cA]], %[[cB]], %[[cC]])
-  %res = call @init_and_dot(%AA, %BB, %CC) :
-    (tensor<64xf32>, tensor<64xf32>, tensor<f32>) -> tensor<f32>
-
-  // CHECK-NEXT:   %[[dC:.*]] = memref.cast %[[C]] : memref<f32> to memref<*xf32>
-  %res2 = tensor.cast %res: tensor<f32> to tensor<*xf32>
-
-  // CHECK-NEXT:   call @print_memref_f32(%[[dC]]) : (memref<*xf32>) -> ()
-  call @print_memref_f32(%res2) : (tensor<*xf32>) -> ()
-
-  // CHECK-DAG:   memref.dealloc %[[A]] : memref<64xf32>
-  // CHECK-DAG:   memref.dealloc %[[B]] : memref<64xf32>
-  // CHECK-DAG:   memref.dealloc %[[C]] : memref<f32>
-  // CHECK-NEXT:   return
-  return
-}
-
-//     CHECK:   func private @print_memref_f32(memref<*xf32>)
-func private @print_memref_f32(tensor<*xf32>)
-
-// -----
-
-// CHECK: #[[$DYNAMIC:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>
-
-// CHECK: func private @external_func(memref<?xf32, #[[$DYNAMIC]]>)
-func private @external_func(tensor<?xf32>)
-
-//      CHECK: func @callee(
-// CHECK-SAME:   %[[A:[0-9a-zA-Z]*]]: memref<?xf32>
-// CHECK-SAME:   %[[B:[0-9a-zA-Z]*]]: memref<?xf32, #[[$DYNAMIC]]>
-// CHECK-SAME:   %[[C:[0-9a-zA-Z]*]]: memref<?xf32, #[[$DYNAMIC]]>
-func @callee(%A : tensor<?xf32> {linalg.buffer_layout = affine_map<(i)[s0, s1] -> (i)>},
-             %B : tensor<?xf32>,
-             %C : tensor<?xf32>) {
-// CHECK-NEXT: %[[CASTED:.*]] = memref.cast %[[A]] : memref<?xf32> to memref<?xf32, #[[$DYNAMIC]]>
-// CHECK-NEXT: call @external_func(%[[CASTED]]) : (memref<?xf32, #[[$DYNAMIC]]>) -> ()
-  call @external_func(%A) : (tensor<?xf32>) -> ()
-
-// CHECK-NEXT: call @external_func(%[[B]]) : (memref<?xf32, #[[$DYNAMIC]]>) -> ()
-  call @external_func(%B) : (tensor<?xf32>) -> ()
-
-// CHECK-NEXT: call @external_func(%[[C]]) : (memref<?xf32, #[[$DYNAMIC]]>) -> ()
-  call @external_func(%C) : (tensor<?xf32>) -> ()
-
-  return
-}
-
-//      CHECK: func @entry(
-// CHECK-SAME:   %[[A:[0-9a-zA-Z]*]]: memref<?xf32>
-// CHECK-SAME:   %[[B:[0-9a-zA-Z]*]]: memref<?xf32>
-// CHECK-SAME:   %[[C:[0-9a-zA-Z]*]]: memref<?xf32, #[[$DYNAMIC]]>
-func @entry(%A : tensor<?xf32> {linalg.buffer_layout = affine_map<(i)[s0, s1] -> (i)>, linalg.inplaceable = false},
-            %B : tensor<?xf32> {linalg.buffer_layout = affine_map<(i)[s0, s1] -> (i)>, linalg.inplaceable = false},
-            %C : tensor<?xf32> {linalg.inplaceable = false}) {
-// Note: `callee` does not write to its bbArg directly, but `external_func`
-// does. Inside `callee`, the writes via `external_func` do not cause a
-// conflict. However, inside `entry`, the writes do cause a conflict because
-// %A, %B and %C are not inplaceable. This test case shows that this kind of
-// conflict detection has a "transitive" nature.
-//      CHECK: %[[ALLOC_C:.*]] = memref.alloc
-//      CHECK: %[[CASTED_C:.*]] = memref.cast %[[ALLOC_C]]
-//      CHECK: %[[ALLOC_B:.*]] = memref.alloc
-//      CHECK: %[[CASTED_B:.*]] = memref.cast %[[ALLOC_B]]
-//      CHECK: %[[ALLOC_A:.*]] = memref.alloc
-//      CHECK: memref.copy %[[A]], %[[ALLOC_A]]
-//      CHECK: memref.copy %[[B]], %[[ALLOC_B]]
-//      CHECK: memref.copy %[[C]], %[[ALLOC_C]]
-//      CHECK: %[[CASTED_A:.*]] = memref.cast %[[ALLOC_A]]
-// CHECK-NEXT: call @callee(%[[CASTED_A]], %[[CASTED_B]], %[[CASTED_C]])
-  call @callee(%A, %B, %C) : (tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) -> ()
-  return
-}
-
-// -----
-
 //      CHECK: func @matmul(
 // CHECK-SAME:   %[[A:[0-9a-zA-Z]*]]: memref<128x256xf32>
 // CHECK-SAME:   %[[B:[0-9a-zA-Z]*]]: memref<256x192xf32>
@@ -900,115 +635,6 @@ func @insert_op(%t1 : tensor<?xf32> {linalg.inplaceable = true},
 
 // -----
 
-// CHECK-LABEL: func @inner_func(
-//  CHECK-SAME:     %[[arg0:.*]]: memref<?xf32
-func @inner_func(%t: tensor<?xf32>) -> tensor<?xf32> {
-  %f = arith.constant 1.0 : f32
-  %c0 = arith.constant 0 : index
-  // CHECK: memref.store %{{.*}}, %[[arg0]]
-  %0 = tensor.insert %f into %t[%c0] : tensor<?xf32>
-  return %0 : tensor<?xf32>
-}
-
-// CHECK-LABEL: func @equivalent_func_arg(
-//  CHECK-SAME:     %[[arg0:.*]]: memref<?xf32
-func @equivalent_func_arg(%t0: tensor<?xf32> {linalg.inplaceable = true},
-                          %c0: index, %c10: index, %c1: index) -> tensor<?xf32> {
-  // CHECK-NOT: copy
-  %1 = scf.for %iv = %c0 to %c10 step %c1 iter_args(%t1 = %t0) -> (tensor<?xf32>) {
-    // CHECK: call @inner_func(%[[arg0]])
-    %3 = call @inner_func(%t1) : (tensor<?xf32>) -> tensor<?xf32>
-    scf.yield %3 : tensor<?xf32>
-  }
-  return %1: tensor<?xf32>
-}
-
-// -----
-
-// CHECK-LABEL: func @inner_func_2(
-//  CHECK-SAME:     %[[arg0:.*]]: memref<?xf32
-func @inner_func_2(%t: tensor<?xf32>) -> tensor<?xf32> {
-  %f = arith.constant 1.0 : f32
-  %c0 = arith.constant 0 : index
-  // CHECK: memref.store %{{.*}}, %[[arg0]]
-  %0 = tensor.insert %f into %t[%c0] : tensor<?xf32>
-  return %0 : tensor<?xf32>
-}
-
-// CHECK-LABEL: func @equivalent_func_arg_2(
-//  CHECK-SAME:     %[[arg0:.*]]: memref<?xf32
-func @equivalent_func_arg_2(%t0: tensor<?xf32> {linalg.inplaceable = true},
-                            %c0: index, %c10: index, %c1: index) -> tensor<?xf32> {
-  %1 = scf.for %iv = %c0 to %c10 step %c1 iter_args(%t1 = %t0) -> (tensor<?xf32>) {
-    // CHECK: %[[alloc:.*]] = memref.alloc
-    // CHECK: %[[casted:.*]] = memref.cast %[[alloc]]
-    // CHECK: memref.copy %[[arg0]], %[[alloc]]
-    // CHECK: call @inner_func_2(%[[casted]])
-    %3 = call @inner_func_2(%t1) : (tensor<?xf32>) -> tensor<?xf32>
-    scf.yield %t1 : tensor<?xf32>
-  }
-  return %1: tensor<?xf32>
-}
-
-// -----
-
-// CHECK-LABEL: func @inner_func(
-//  CHECK-SAME:     %[[arg0:.*]]: memref<?xf32
-func @inner_func(%t: tensor<?xf32>) -> (tensor<?xf32>, f32) {
-  // CHECK-NOT: copy
-  %f = arith.constant 1.0 : f32
-  %c0 = arith.constant 0 : index
-  %c1 = arith.constant 1 : index
-  // CHECK: memref.store %{{.*}}, %[[arg0]]
-  %0 = tensor.insert %f into %t[%c0] : tensor<?xf32>
-  // CHECK: %[[load:.*]] = memref.load %[[arg0]]
-  %1 = tensor.extract %0[%c1] : tensor<?xf32>
-  // CHECK: return %[[load]] : f32
-  return %0, %1 : tensor<?xf32>, f32
-}
-
-// CHECK-LABEL: func @call_func_with_non_tensor_return(
-//  CHECK-SAME:     %[[arg0:.*]]: memref<?xf32
-func @call_func_with_non_tensor_return(
-    %t0: tensor<?xf32> {linalg.inplaceable = true}) -> (f32, tensor<?xf32>) {
-  // CHECK-NOT: copy
-  // CHECK: %[[call:.*]] = call @inner_func(%[[arg0]])
-  %0, %1 = call @inner_func(%t0) : (tensor<?xf32>) -> (tensor<?xf32>, f32)
-  // CHECK: return %[[call]] : f32
-  return %1, %0 : f32, tensor<?xf32>
-}
-
-// -----
-
-// CHECK-LABEL: func @func_without_tensor_args
-func @func_without_tensor_args(%v : vector<10xf32>) -> () {
-  // CHECK: %[[alloc:.*]] = memref.alloc()
-  %0 = linalg.init_tensor[10] : tensor<10xf32>
-
-  %c0 = arith.constant 0 : index
-  // CHECK: vector.transfer_write %{{.*}}, %[[alloc]]
-  %1 = vector.transfer_write %v, %0[%c0] : vector<10xf32>, tensor<10xf32>
-
-  %cst = arith.constant 0.0 : f32
-  // CHECK: vector.transfer_read %[[alloc]]
-  %r = vector.transfer_read %1[%c0], %cst : tensor<10xf32>, vector<11xf32>
-
-  vector.print %r : vector<11xf32>
-  return
-}
-
-// -----
-
-// CHECK-LABEL: func private @private_func
-func private @private_func(tensor<?xf32>) -> ()
-
-// CHECK-LABEL: func @empty_func()
-func @empty_func() -> () {
-  return
-}
-
-// -----
-
 func @gather_like(
     %arg0 : tensor<?x?xf32> {linalg.inplaceable = false},
     %arg1 : tensor<?xi32> {linalg.inplaceable = false},
@@ -1328,3 +954,4 @@ func @scf_for_swapping_yields(
 //       CHECK:     return %[[r0]], %[[r1]]
   return %f0, %f1: f32, f32
 }
+

diff --git a/mlir/test/Dialect/Linalg/one-shot-module-bufferize-allow-return-allocs.mlir b/mlir/test/Dialect/Linalg/one-shot-module-bufferize-allow-return-allocs.mlir
new file mode 100644
index 0000000000000..183a07c8c8d63
--- /dev/null
+++ b/mlir/test/Dialect/Linalg/one-shot-module-bufferize-allow-return-allocs.mlir
@@ -0,0 +1,64 @@
+// RUN: mlir-opt %s -linalg-comprehensive-module-bufferize=allow-return-allocs -split-input-file | FileCheck %s
+
+// Run fuzzer with different seeds.
+// RUN: mlir-opt %s -linalg-comprehensive-module-bufferize="allow-return-allocs test-analysis-only analysis-fuzzer-seed=23" -split-input-file -o /dev/null
+// RUN: mlir-opt %s -linalg-comprehensive-module-bufferize="allow-return-allocs test-analysis-only analysis-fuzzer-seed=59" -split-input-file -o /dev/null
+// RUN: mlir-opt %s -linalg-comprehensive-module-bufferize="allow-return-allocs test-analysis-only analysis-fuzzer-seed=91" -split-input-file -o /dev/null
+
+// Test bufferization using memref types that have no layout map.
+// RUN: mlir-opt %s -linalg-comprehensive-module-bufferize="allow-return-allocs fully-dynamic-layout-maps=0" -split-input-file -o /dev/null
+
+// Make sure that the returned buffer is not deallocated.
+// TODO: Such buffers currently leak. We need buffer hoisting / ref counting for
+// this in the future.
+
+// CHECK-LABEL: func @create_tensor() -> memref<10xf32> {
+//       CHECK:   %[[alloc:.*]] = memref.alloc
+//       CHECK:   return %[[alloc]]
+func @create_tensor() -> tensor<10xf32> {
+  %0 = linalg.init_tensor [10] : tensor<10xf32>
+  return %0 : tensor<10xf32>
+}
+
+// CHECK: func @caller(
+// CHECK: %[[call:.*]] = call @create_tensor() : () -> memref<10xf32>
+// CHECK: %[[extracted:.*]] = memref.load %[[call]]
+// CHECK: return %[[extracted]]
+func @caller(%idx: index) -> f32 {
+  %0 = call @create_tensor() : () -> (tensor<10xf32>)
+  %1 = tensor.extract %0[%idx] : tensor<10xf32>
+  return %1 : f32
+}
+
+// -----
+
+// return_slice returns an aliasing tensor. In main, %t is overwritten (but not
+// read). This is a conflict because %0 is aliasing with %t. An alloc + copy is
+// needed.
+
+// CHECK-LABEL: func @return_slice(
+//   CHECK-NOT:   alloc
+//   CHECK-NOT:   copy
+//       CHECK:   memref.subview
+func @return_slice(%t: tensor<?xf32>, %sz: index) -> (tensor<?xf32>) {
+  %0 = tensor.extract_slice %t[4][%sz][1] : tensor<?xf32> to tensor<?xf32>
+  return %0 : tensor<?xf32>
+}
+
+// CHECK-LABEL: func @main(
+//  CHECK-SAME:     %[[t:.*]]: memref<?xf32
+//       CHECK:   %[[alloc:.*]] = memref.alloc
+//   CHECK-DAG:   memref.copy %[[t]], %[[alloc]]
+//   CHECK-DAG:   %[[casted:.*]] = memref.cast %[[alloc]]
+//       CHECK:   %[[call:.*]] = call @return_slice(%[[casted]]
+//       CHECK:   linalg.fill ins({{.*}}) outs(%[[t]]
+//       CHECK:   memref.load %[[call]]
+//       CHECK:   memref.load %[[t]]
+func @main(%t: tensor<?xf32>, %sz: index, %idx: index) -> (f32, f32) {
+  %cst = arith.constant 1.0 : f32
+  %0 = call @return_slice(%t, %sz) : (tensor<?xf32>, index) -> (tensor<?xf32>)
+  %filled = linalg.fill ins(%cst : f32) outs(%t : tensor<?xf32>) -> tensor<?xf32>
+  %r1 = tensor.extract %0[%idx] : tensor<?xf32>
+  %r2 = tensor.extract %filled[%idx] : tensor<?xf32>
+  return %r1, %r2 : f32, f32
+}

diff --git a/mlir/test/Dialect/Linalg/one-shot-module-bufferize.mlir b/mlir/test/Dialect/Linalg/one-shot-module-bufferize.mlir
new file mode 100644
index 0000000000000..72661713a5b44
--- /dev/null
+++ b/mlir/test/Dialect/Linalg/one-shot-module-bufferize.mlir
@@ -0,0 +1,551 @@
+// RUN: mlir-opt %s -linalg-comprehensive-module-bufferize -split-input-file | FileCheck %s
+
+// Run fuzzer with different seeds.
+// RUN: mlir-opt %s -linalg-comprehensive-module-bufferize="allow-return-allocs test-analysis-only analysis-fuzzer-seed=23" -split-input-file -o /dev/null
+// RUN: mlir-opt %s -linalg-comprehensive-module-bufferize="allow-return-allocs test-analysis-only analysis-fuzzer-seed=59" -split-input-file -o /dev/null
+// RUN: mlir-opt %s -linalg-comprehensive-module-bufferize="allow-return-allocs test-analysis-only analysis-fuzzer-seed=91" -split-input-file -o /dev/null
+
+// Test bufferization using memref types that have no layout map.
+// RUN: mlir-opt %s -linalg-comprehensive-module-bufferize="allow-return-allocs fully-dynamic-layout-maps=0" -split-input-file -o /dev/null
+
+// Bufferization of bodiless function with no tensor return value.
+
+// CHECK-LABEL: func private @private_func
+func private @private_func(tensor<?xf32>) -> ()
+
+// CHECK-LABEL: func @empty_func()
+func @empty_func() -> () {
+  return
+}
+
+// -----
+
+// A bodiless function that returns something that is not a tensor.
+
+// CHECK: func private @external_func_with_return_val(memref<4xi32, #{{.*}}>) -> f32
+func private @external_func_with_return_val(tensor<4xi32>) -> f32
+
+// -----
+
+// CHECK-LABEL: func private @private_func
+func private @private_func(tensor<?xf32>) -> (f32)
+
+// private_func may modify the buffer arg, but that's OK because %t is writable.
+// No alloc/copy should be inserted.
+
+// CHECK-LABEL: func @main(
+//  CHECK-SAME:     %[[t:.*]]: memref<?xf32
+//   CHECK-NOT: alloc
+//   CHECK-NOT: copy
+//       CHECK: call @private_func(%[[t]])
+func @main(%t: tensor<?xf32> {linalg.inplaceable = true}) -> (f32) {
+  %0 = call @private_func(%t) : (tensor<?xf32>) -> (f32)
+  return %0 : f32
+}
+
+// -----
+
+// CHECK-LABEL: func private @private_func
+func private @private_func(tensor<?xf32>) -> (f32)
+
+// private_func may modify the buffer arg, %t is not writable. A copy is needed.
+
+// CHECK-LABEL: func @main(
+//  CHECK-SAME:     %[[t:.*]]: memref<?xf32
+//       CHECK: %[[alloc:.*]] = memref.alloc
+//   CHECK-DAG: memref.copy %[[t]], %[[alloc]]
+//   CHECK-DAG: %[[casted:.*]] = memref.cast %[[alloc]]
+//       CHECK: call @private_func(%[[casted]])
+//       CHECK: memref.dealloc %[[alloc]]
+func @main(%t: tensor<?xf32> {linalg.inplaceable = false}) -> (f32) {
+  %0 = call @private_func(%t) : (tensor<?xf32>) -> (f32)
+  return %0 : f32
+}
+
+// -----
+
+// Test bufferization of a function without tensor args.
+
+// CHECK-LABEL: func @func_without_tensor_args
+func @func_without_tensor_args(%v : vector<10xf32>) -> () {
+  // CHECK: %[[alloc:.*]] = memref.alloc()
+  %0 = linalg.init_tensor[10] : tensor<10xf32>
+
+  %c0 = arith.constant 0 : index
+  // CHECK: vector.transfer_write %{{.*}}, %[[alloc]]
+  %1 = vector.transfer_write %v, %0[%c0] : vector<10xf32>, tensor<10xf32>
+
+  %cst = arith.constant 0.0 : f32
+  // CHECK: vector.transfer_read %[[alloc]]
+  %r = vector.transfer_read %1[%c0], %cst : tensor<10xf32>, vector<11xf32>
+
+  vector.print %r : vector<11xf32>
+  return
+}
+
+// -----
+
+// Bufferization of a function that is reading and writing. %t0 is writable, so
+// no copy should be inserted.
+
+// CHECK-LABEL: func @inner_func(
+//  CHECK-SAME:     %[[arg0:.*]]: memref<?xf32
+func @inner_func(%t: tensor<?xf32>) -> (tensor<?xf32>, f32) {
+  // CHECK-NOT: copy
+  %f = arith.constant 1.0 : f32
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  // CHECK: memref.store %{{.*}}, %[[arg0]]
+  %0 = tensor.insert %f into %t[%c0] : tensor<?xf32>
+  // CHECK: %[[load:.*]] = memref.load %[[arg0]]
+  %1 = tensor.extract %0[%c1] : tensor<?xf32>
+  // CHECK: return %[[load]] : f32
+  return %0, %1 : tensor<?xf32>, f32
+}
+
+// CHECK-LABEL: func @call_func_with_non_tensor_return(
+//  CHECK-SAME:     %[[arg0:.*]]: memref<?xf32
+func @call_func_with_non_tensor_return(
+    %t0: tensor<?xf32> {linalg.inplaceable = true}) -> (f32, tensor<?xf32>) {
+  // CHECK-NOT: alloc
+  // CHECK-NOT: copy
+  // CHECK: %[[call:.*]] = call @inner_func(%[[arg0]])
+  %0, %1 = call @inner_func(%t0) : (tensor<?xf32>) -> (tensor<?xf32>, f32)
+  // CHECK: return %[[call]] : f32
+  return %1, %0 : f32, tensor<?xf32>
+}
+
+// -----
+
+// Bufferization of a function that is reading and writing. %t0 is not writable,
+// so a copy is needed.
+
+// CHECK-LABEL: func @inner_func(
+//  CHECK-SAME:     %[[arg0:.*]]: memref<?xf32
+func @inner_func(%t: tensor<?xf32>) -> (tensor<?xf32>, f32) {
+  // CHECK-NOT: copy
+  %f = arith.constant 1.0 : f32
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  // CHECK: memref.store %{{.*}}, %[[arg0]]
+  %0 = tensor.insert %f into %t[%c0] : tensor<?xf32>
+  // CHECK: %[[load:.*]] = memref.load %[[arg0]]
+  %1 = tensor.extract %0[%c1] : tensor<?xf32>
+  // CHECK: return %[[load]] : f32
+  return %0, %1 : tensor<?xf32>, f32
+}
+
+// CHECK-LABEL: func @call_func_with_non_tensor_return(
+//  CHECK-SAME:     %[[arg0:.*]]: memref<?xf32
+func @call_func_with_non_tensor_return(
+    %t0: tensor<?xf32> {linalg.inplaceable = false}) -> (f32, tensor<?xf32>) {
+  // CHECK: %[[alloc:.*]] = memref.alloc
+  // CHECK-DAG: memref.copy %[[arg0]], %[[alloc]]
+  // CHECK-DAG: %[[casted:.*]] = memref.cast %[[alloc]]
+  // CHECK: %[[call:.*]] = call @inner_func(%[[casted]])
+  %0, %1 = call @inner_func(%t0) : (tensor<?xf32>) -> (tensor<?xf32>, f32)
+
+  // Note: The tensor return value has folded away.
+  // CHECK: return %[[call]] : f32
+  return %1, %0 : f32, tensor<?xf32>
+}
+
+// -----
+
+// A chain of function calls. The last function f0 is potentially writing to the
+// buffer. This becomes a problem when bufferizing main and a copy must be
+// inserted then. (No copies in the other functions.)
+
+// CHECK-LABEL: func private @f0(
+func private @f0(tensor<?xf32>) -> (f32)
+
+// CHECK-LABEL: func @f1(
+//  CHECK-SAME:     %[[t1:.*]]: memref<?xf32
+//       CHECK:   %[[r1:.*]] = call @f0(%[[t1]])
+//       CHECK:   return %[[r1]]
+func @f1(%t: tensor<?xf32>) -> (f32) {
+  %0 = call @f0(%t) : (tensor<?xf32>) -> (f32)
+  return %0 : f32
+}
+
+// CHECK-LABEL: func @f2(
+//  CHECK-SAME:     %[[t2:.*]]: memref<?xf32
+//       CHECK:   %[[r2:.*]] = call @f1(%[[t2]])
+//       CHECK:   return %[[r2]]
+func @f2(%t: tensor<?xf32>) -> (f32) {
+  %0 = call @f1(%t) : (tensor<?xf32>) -> (f32)
+  return %0 : f32
+}
+
+// CHECK-LABEL: func @main(
+//  CHECK-SAME:     %[[t3:.*]]: memref<?xf32
+//       CHECK: %[[alloc:.*]] = memref.alloc
+//   CHECK-DAG: memref.copy %[[t3]], %[[alloc]]
+//   CHECK-DAG: %[[casted:.*]] = memref.cast %[[alloc]]
+//       CHECK: call @f2(%[[casted]])
+//       CHECK: memref.dealloc %[[alloc]]
+func @main(%t: tensor<?xf32> {linalg.inplaceable = false}) -> (f32) {
+  %0 = call @f2(%t) : (tensor<?xf32>) -> (f32)
+  return %0 : f32
+}
+
+// -----
+
+// This function does not read, just write. We need an alloc, but no copy.
+
+// CHECK-LABEL: func @does_not_read(
+//   CHECK-NOT:   alloc
+//   CHECK-NOT:   copy
+func @does_not_read(%t: tensor<?xf32>) -> tensor<?xf32> {
+  %f0 = arith.constant 0.0 : f32
+  %r = linalg.fill ins(%f0 : f32) outs(%t : tensor<?xf32>) -> tensor<?xf32>
+  return %r : tensor<?xf32>
+}
+
+// CHECK-LABEL: func @main(
+//  CHECK-SAME:     %[[t:.*]]: memref<?xf32
+//       CHECK:   %[[alloc:.*]] = memref.alloc
+//   CHECK-NOT:   copy
+//       CHECK: %[[casted:.*]] = memref.cast %[[alloc]]
+//   CHECK-NOT:   copy
+//       CHECK:   call @does_not_read(%[[casted]])
+//       CHECK:   %[[r:.*]] = memref.load %[[alloc]]
+//       CHECK:   memref.dealloc %[[alloc]]
+func @main(%t: tensor<?xf32> {linalg.inplaceable = false}) -> f32 {
+  %0 = call @does_not_read(%t) : (tensor<?xf32>) -> (tensor<?xf32>)
+  %idx = arith.constant 4 : index
+  %r = tensor.extract %0[%idx] : tensor<?xf32>
+  return %r : f32
+}
+
+// -----
+
+// Alloc and copy must be inserted because the arith.constant is read-only.
+
+//      CHECK: #[[$DYN_1D_MAP:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>
+
+//      CHECK: memref.global "private" constant @__constant_4xi32 : memref<4xi32> = dense<[1, 2, 3, 4]>
+//      CHECK: func private @some_external_func(memref<4xi32, #[[$DYN_1D_MAP]]>)
+func private @some_external_func(tensor<4xi32>)
+
+//      CHECK: func @main()
+func @main() {
+//  CHECK-DAG:   %[[A:.*]] = memref.get_global @__constant_4xi32 : memref<4xi32>
+  %A = arith.constant dense<[1, 2, 3, 4]> : tensor<4xi32>
+
+//  CHECK-DAG:   %[[alloc:.*]] = memref.alloc
+//  CHECK-DAG:   %[[B:.*]] = memref.cast %[[alloc]] : memref<4xi32> to memref<4xi32, #[[$DYN_1D_MAP]]>
+//  CHECK-DAG:   memref.copy %[[A]], %[[alloc]]
+//      CHECK:   call @some_external_func(%[[B]]) : (memref<4xi32, #[[$DYN_1D_MAP]]>) -> ()
+  call @some_external_func(%A) : (tensor<4xi32>) -> ()
+
+//      CHECK: memref.dealloc %[[alloc]]
+  return
+}
+
+// -----
+
+// Alloc and copy must be inserted because the arith.constant is read-only. The
+// function call is inside of an scf.execute_region.
+
+//      CHECK: #[[$DYN_1D_MAP:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>
+
+//      CHECK: memref.global "private" constant @__constant_4xi32 : memref<4xi32> = dense<[1, 2, 3, 4]>
+//      CHECK: func private @some_external_func_within_scf_execute(memref<4xi32, #[[$DYN_1D_MAP]]>)
+func private @some_external_func_within_scf_execute(tensor<4xi32>)
+
+//      CHECK: func @main()
+func @main() {
+//  CHECK-DAG:   %[[A:.*]] = memref.get_global @__constant_4xi32 : memref<4xi32>
+  %A = arith.constant dense<[1, 2, 3, 4]> : tensor<4xi32>
+
+// Note: The scf.execute_region canonicalizes away.
+
+//  CHECK-DAG:   %[[alloc:.*]] = memref.alloc
+//  CHECK-DAG:   %[[B:.*]] = memref.cast %[[alloc]] : memref<4xi32> to memref<4xi32, #[[$DYN_1D_MAP]]>
+//  CHECK-DAG:   memref.copy %[[A]], %[[alloc]]
+//      CHECK:   call @some_external_func_within_scf_execute(%[[B]]) : (memref<4xi32, #[[$DYN_1D_MAP]]>) -> ()
+  scf.execute_region {
+    call @some_external_func_within_scf_execute(%A) : (tensor<4xi32>) -> ()
+    scf.yield
+  }
+
+//      CHECK:   memref.dealloc %[[alloc]]
+  return
+}
+
+// -----
+
+// A write inside an scf.execute_region. An equivalent tensor is yielded.
+
+// CHECK-LABEL: func @execute_region_test(
+//  CHECK-SAME:     %[[m1:.*]]: memref<?xf32
+func @execute_region_test(%t1 : tensor<?xf32>)
+    -> (f32, tensor<?xf32>, f32)
+{
+  %f1 = arith.constant 0.0 : f32
+  %f2 = arith.constant 1.0 : f32
+  %idx = arith.constant 7 : index
+
+  // scf.execute_region is canonicalized away after bufferization. So just the
+  // memref.store is left over.
+
+  // CHECK-NOT: alloc
+  // CHECK-NOT: copy
+  // CHECK: memref.store %{{.*}}, %[[m1]][%{{.*}}]
+  %0, %1, %2 = scf.execute_region -> (f32, tensor<?xf32>, f32) {
+    %t2 = tensor.insert %f2 into %t1[%idx] : tensor<?xf32>
+    scf.yield %f1, %t2, %f2 : f32, tensor<?xf32>, f32
+  }
+
+  // CHECK: return %{{.*}}, %{{.*}} : f32, f32
+  return %0, %1, %2 : f32, tensor<?xf32>, f32
+}
+
+// -----
+
+//      CHECK: #[[$DYN_1D_MAP:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>
+
+//      CHECK:  func private @some_external_func(memref<?xf32, #[[$DYN_1D_MAP]]>)
+func private @some_external_func(tensor<?xf32>)
+
+//      CHECK:  func @scf_for_with_tensor_insert_slice(
+// CHECK-SAME:    %[[A:[a-zA-Z0-9]*]]: memref<?xf32, #[[$DYN_1D_MAP]]>
+// CHECK-SAME:    %[[B:[a-zA-Z0-9]*]]: memref<?xf32, #[[$DYN_1D_MAP]]>
+// CHECK-SAME:    %[[C:[a-zA-Z0-9]*]]: memref<4xf32, #[[$DYN_1D_MAP]]>
+func @scf_for_with_tensor_insert_slice(
+    %A : tensor<?xf32>, %B : tensor<?xf32>, %C : tensor<4xf32>,
+    %lb : index, %ub : index, %step : index)
+  -> (tensor<?xf32>, tensor<?xf32>)
+{
+  // CHECK-NEXT: scf.for
+  %r0:2 = scf.for %i = %lb to %ub step %step iter_args(%tA = %A, %tB = %B)
+      -> (tensor<?xf32>, tensor<?xf32>)
+  {
+    // CHECK-NEXT:   %[[SVA:.*]] = memref.subview %[[A]]
+    // CHECK-NEXT:   memref.copy %[[C]], %[[SVA]] : memref<4xf32, #[[$DYN_1D_MAP]]> to memref<4xf32, #[[$DYN_1D_MAP]]>
+    %ttA = tensor.insert_slice %C into %tA[%i][4][1] : tensor<4xf32> into tensor<?xf32>
+
+    // CHECK-NEXT:   %[[SVB:.*]] = memref.subview %[[B]]
+    // CHECK-NEXT:   memref.copy %[[C]], %[[SVB]] : memref<4xf32, #[[$DYN_1D_MAP]]> to memref<4xf32, #[[$DYN_1D_MAP]]>
+    %ttB = tensor.insert_slice %C into %tB[%i][4][1] : tensor<4xf32> into tensor<?xf32>
+
+    // scf.yield is empty and is elided
+    //  CHECK-NOT:   scf.yield
+    scf.yield %ttA, %ttB : tensor<?xf32>, tensor<?xf32>
+  }
+
+  // Swaparoo requires bufferizing the whole function to figure out who's who.
+  return %r0#1, %r0#0: tensor<?xf32>, tensor<?xf32>
+}
+
+//      CHECK:  func @bar(
+// CHECK-SAME:    %[[A:[a-zA-Z0-9]*]]: memref<?xf32, #[[$DYN_1D_MAP]]>
+// CHECK-SAME:    %[[B:[a-zA-Z0-9]*]]: memref<?xf32, #[[$DYN_1D_MAP]]>
+// CHECK-SAME:    %[[C:[a-zA-Z0-9]*]]: memref<4xf32, #[[$DYN_1D_MAP]]>
+func @bar(
+    %A : tensor<?xf32> {linalg.inplaceable = true},
+    %B : tensor<?xf32> {linalg.inplaceable = true},
+    %C : tensor<4xf32> {linalg.inplaceable = true},
+    %lb : index, %ub : index, %step : index)
+  -> (tensor<?xf32>, tensor<?xf32>)
+{
+//  CHECK-DAG:   call @scf_for_with_tensor_insert_slice(%[[A]], %[[B]], %[[C]]
+  %r0:2 = call @scf_for_with_tensor_insert_slice(%A, %B, %C, %lb, %ub, %step) :
+      (tensor<?xf32>, tensor<?xf32>, tensor<4xf32>, index, index, index)
+        -> (tensor<?xf32>, tensor<?xf32>)
+
+  // %r0#0 requires a copy because we have no idea what the function is doing.
+//  CHECK-DAG:   %[[alloc:.*]] = memref.alloc
+//  CHECK-DAG:   %[[casted:.*]] = memref.cast %[[alloc]]
+//      CHECK:   memref.copy %[[B]], %[[alloc]]
+// CHECK-NEXT:   call @some_external_func(%[[casted]]) : (memref<?xf32, #[[$DYN_1D_MAP]]>) -> ()
+  call @some_external_func(%r0#0) : (tensor<?xf32>) -> ()
+
+//      CHECK:   return
+  return %r0#0, %r0#1: tensor<?xf32>, tensor<?xf32>
+}
+
+// -----
+
+//  CHECK-DAG: #[[$DYN_0D_MAP:.*]] = affine_map<()[s0] -> (s0)>
+//  CHECK-DAG: #[[$DYN_1D_MAP:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>
+
+//      CHECK:  func @init_and_dot(
+// CHECK-SAME:    %[[A:[a-zA-Z0-9]*]]: memref<64xf32, #[[$DYN_1D_MAP]]>
+// CHECK-SAME:    %[[B:[a-zA-Z0-9]*]]: memref<64xf32, #[[$DYN_1D_MAP]]>
+// CHECK-SAME:    %[[C:[a-zA-Z0-9]*]]: memref<f32, #[[$DYN_0D_MAP]]>
+func @init_and_dot(%a: tensor<64xf32>, %b: tensor<64xf32>, %c: tensor<f32>) -> tensor<f32> {
+  // CHECK-NEXT:   %[[C0:.*]] = arith.constant 0{{.*}} : f32
+  %v0 = arith.constant 0.0 : f32
+
+  // CHECK-NEXT:   linalg.fill ins(%[[C0]] : f32) outs(%[[C]] : memref<f32, #[[$DYN_0D_MAP]]>)
+  %d = linalg.fill ins(%v0 : f32) outs(%c : tensor<f32>) -> tensor<f32>
+
+  // CHECK-NEXT:   linalg.dot ins(%[[A]], %[[B]] : memref<64xf32, #[[$DYN_1D_MAP]]>, memref<64xf32, #[[$DYN_1D_MAP]]>) outs(%[[C]] : memref<f32, #[[$DYN_0D_MAP]]>)
+  %e = linalg.dot ins(%a, %b : tensor<64xf32>,tensor<64xf32>)
+    outs(%d: tensor<f32>) -> tensor<f32>
+
+  // CHECK-NEXT:   return
+  return %e : tensor<f32>
+}
+
+//      CHECK:  func @main()
+func @main() {
+  //  CHECK-DAG:   %[[C0:.*]] = arith.constant 0{{.*}} : f32
+  //  CHECK-DAG:   %[[C1:.*]] = arith.constant 1{{.*}} : f32
+  //  CHECK-DAG:   %[[C2:.*]] = arith.constant 2{{.*}} : f32
+  %v0 = arith.constant 0.0 : f32
+  %v1 = arith.constant 1.0 : f32
+  %v2 = arith.constant 2.0 : f32
+
+  // CHECK-NEXT:   %[[A:.*]] = memref.alloc() {alignment = 128 : i64} : memref<64xf32>
+  // CHECK-NEXT:   %[[B:.*]] = memref.alloc() {alignment = 128 : i64} : memref<64xf32>
+  // CHECK-NEXT:   %[[C:.*]] = memref.alloc() {alignment = 128 : i64} : memref<f32>
+  //  CHECK-DAG:   %[[cA:.*]] = memref.cast %[[A]] : memref<64xf32> to memref<64xf32, #[[$DYN_1D_MAP]]>
+  //  CHECK-DAG:   %[[cB:.*]] = memref.cast %[[B]] : memref<64xf32> to memref<64xf32, #[[$DYN_1D_MAP]]>
+  //  CHECK-DAG:   %[[cC:.*]] = memref.cast %[[C]] : memref<f32> to memref<f32, #[[$DYN_0D_MAP]]>
+  %A = linalg.init_tensor [64] : tensor<64xf32>
+  %B = linalg.init_tensor [64] : tensor<64xf32>
+  %C = linalg.init_tensor [] : tensor<f32>
+
+  //  CHECK-DAG:   linalg.fill ins(%[[C1]] : f32) outs(%[[A]] : memref<64xf32>)
+  //  CHECK-DAG:   linalg.fill ins(%[[C2]] : f32) outs(%[[B]] : memref<64xf32>)
+  //  CHECK-DAG:   linalg.fill ins(%[[C0]] : f32) outs(%[[C]] : memref<f32>)
+  %AA = linalg.fill ins(%v1 : f32) outs(%A : tensor<64xf32>) -> tensor<64xf32>
+  %BB = linalg.fill ins(%v2 : f32) outs(%B : tensor<64xf32>) -> tensor<64xf32>
+  %CC = linalg.fill ins(%v0 : f32) outs(%C : tensor<f32>) -> tensor<f32>
+
+  // CHECK-NEXT:   call @init_and_dot(%[[cA]], %[[cB]], %[[cC]])
+  %res = call @init_and_dot(%AA, %BB, %CC) :
+    (tensor<64xf32>, tensor<64xf32>, tensor<f32>) -> tensor<f32>
+
+  // CHECK-NEXT:   %[[dC:.*]] = memref.cast %[[C]] : memref<f32> to memref<*xf32>
+  %res2 = tensor.cast %res: tensor<f32> to tensor<*xf32>
+
+  // CHECK-NEXT:   call @print_memref_f32(%[[dC]]) : (memref<*xf32>) -> ()
+  call @print_memref_f32(%res2) : (tensor<*xf32>) -> ()
+
+  // CHECK-DAG:   memref.dealloc %[[A]] : memref<64xf32>
+  // CHECK-DAG:   memref.dealloc %[[B]] : memref<64xf32>
+  // CHECK-DAG:   memref.dealloc %[[C]] : memref<f32>
+  // CHECK-NEXT:   return
+  return
+}
+
+//     CHECK:   func private @print_memref_f32(memref<*xf32>)
+func private @print_memref_f32(tensor<*xf32>)
+
+// -----
+
+// CHECK: #[[$DYNAMIC:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>
+
+// CHECK: func private @external_func(memref<?xf32, #[[$DYNAMIC]]>)
+func private @external_func(tensor<?xf32>)
+
+//      CHECK: func @callee(
+// CHECK-SAME:   %[[A:[0-9a-zA-Z]*]]: memref<?xf32>
+// CHECK-SAME:   %[[B:[0-9a-zA-Z]*]]: memref<?xf32, #[[$DYNAMIC]]>
+// CHECK-SAME:   %[[C:[0-9a-zA-Z]*]]: memref<?xf32, #[[$DYNAMIC]]>
+func @callee(%A : tensor<?xf32> {linalg.buffer_layout = affine_map<(i)[s0, s1] -> (i)>},
+             %B : tensor<?xf32>,
+             %C : tensor<?xf32>) {
+// CHECK-NEXT: %[[CASTED:.*]] = memref.cast %[[A]] : memref<?xf32> to memref<?xf32, #[[$DYNAMIC]]>
+// CHECK-NEXT: call @external_func(%[[CASTED]]) : (memref<?xf32, #[[$DYNAMIC]]>) -> ()
+  call @external_func(%A) : (tensor<?xf32>) -> ()
+
+// CHECK-NEXT: call @external_func(%[[B]]) : (memref<?xf32, #[[$DYNAMIC]]>) -> ()
+  call @external_func(%B) : (tensor<?xf32>) -> ()
+
+// CHECK-NEXT: call @external_func(%[[C]]) : (memref<?xf32, #[[$DYNAMIC]]>) -> ()
+  call @external_func(%C) : (tensor<?xf32>) -> ()
+
+  return
+}
+
+//      CHECK: func @entry(
+// CHECK-SAME:   %[[A:[0-9a-zA-Z]*]]: memref<?xf32>
+// CHECK-SAME:   %[[B:[0-9a-zA-Z]*]]: memref<?xf32>
+// CHECK-SAME:   %[[C:[0-9a-zA-Z]*]]: memref<?xf32, #[[$DYNAMIC]]>
+func @entry(%A : tensor<?xf32> {linalg.buffer_layout = affine_map<(i)[s0, s1] -> (i)>, linalg.inplaceable = false},
+            %B : tensor<?xf32> {linalg.buffer_layout = affine_map<(i)[s0, s1] -> (i)>, linalg.inplaceable = false},
+            %C : tensor<?xf32> {linalg.inplaceable = false}) {
+// Note: `callee` does not write to its bbArg directly, but `external_func`
+// does. Inside `callee`, the writes via `external_func` do not cause a
+// conflict. However, inside `entry`, the writes do cause a conflict because
+// %A, %B and %C are not inplaceable. This test case shows that this kind of
+// conflict detection has a "transitive" nature.
+//      CHECK: %[[ALLOC_C:.*]] = memref.alloc
+//      CHECK: %[[CASTED_C:.*]] = memref.cast %[[ALLOC_C]]
+//      CHECK: %[[ALLOC_B:.*]] = memref.alloc
+//      CHECK: %[[CASTED_B:.*]] = memref.cast %[[ALLOC_B]]
+//      CHECK: %[[ALLOC_A:.*]] = memref.alloc
+//      CHECK: memref.copy %[[A]], %[[ALLOC_A]]
+//      CHECK: memref.copy %[[B]], %[[ALLOC_B]]
+//      CHECK: memref.copy %[[C]], %[[ALLOC_C]]
+//      CHECK: %[[CASTED_A:.*]] = memref.cast %[[ALLOC_A]]
+// CHECK-NEXT: call @callee(%[[CASTED_A]], %[[CASTED_B]], %[[CASTED_C]])
+  call @callee(%A, %B, %C) : (tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) -> ()
+  return
+}
+
+// -----
+
+// No alloc or copy inside of the loop.
+
+// CHECK-LABEL: func @inner_func(
+//  CHECK-SAME:     %[[arg0:.*]]: memref<?xf32
+func @inner_func(%t: tensor<?xf32>) -> tensor<?xf32> {
+  %f = arith.constant 1.0 : f32
+  %c0 = arith.constant 0 : index
+  // CHECK: memref.store %{{.*}}, %[[arg0]]
+  %0 = tensor.insert %f into %t[%c0] : tensor<?xf32>
+  return %0 : tensor<?xf32>
+}
+
+// CHECK-LABEL: func @equivalent_func_arg(
+//  CHECK-SAME:     %[[arg0:.*]]: memref<?xf32
+func @equivalent_func_arg(%t0: tensor<?xf32> {linalg.inplaceable = true},
+                          %c0: index, %c10: index, %c1: index) -> tensor<?xf32> {
+  // CHECK-NOT: alloc
+  // CHECK-NOT: copy
+  %1 = scf.for %iv = %c0 to %c10 step %c1 iter_args(%t1 = %t0) -> (tensor<?xf32>) {
+    // CHECK: call @inner_func(%[[arg0]])
+    %3 = call @inner_func(%t1) : (tensor<?xf32>) -> tensor<?xf32>
+    scf.yield %3 : tensor<?xf32>
+  }
+  return %1: tensor<?xf32>
+}
+
+// -----
+
+// inner_func_2 modifies the bbArg, but the loop yields the original value. A
+// buffer copy must be inserted inside the loop.
+
+// CHECK-LABEL: func @inner_func_2(
+//  CHECK-SAME:     %[[arg0:.*]]: memref<?xf32
+func @inner_func_2(%t: tensor<?xf32>) -> tensor<?xf32> {
+  %f = arith.constant 1.0 : f32
+  %c0 = arith.constant 0 : index
+  // CHECK: memref.store %{{.*}}, %[[arg0]]
+  %0 = tensor.insert %f into %t[%c0] : tensor<?xf32>
+  return %0 : tensor<?xf32>
+}
+
+// CHECK-LABEL: func @equivalent_func_arg_2(
+//  CHECK-SAME:     %[[arg0:.*]]: memref<?xf32
+func @equivalent_func_arg_2(%t0: tensor<?xf32> {linalg.inplaceable = true},
+                            %c0: index, %c10: index, %c1: index) -> tensor<?xf32> {
+  // CHECK: scf.for {{.*}} {
+  %1 = scf.for %iv = %c0 to %c10 step %c1 iter_args(%t1 = %t0) -> (tensor<?xf32>) {
+    // CHECK: %[[alloc:.*]] = memref.alloc
+    // CHECK: %[[casted:.*]] = memref.cast %[[alloc]]
+    // CHECK: memref.copy %[[arg0]], %[[alloc]]
+    // CHECK: call @inner_func_2(%[[casted]])
+    // CHECK: memref.dealloc %[[alloc]]
+    // CHECK-NOT: scf.yield
+    %3 = call @inner_func_2(%t1) : (tensor<?xf32>) -> tensor<?xf32>
+    scf.yield %t1 : tensor<?xf32>
+  }
+  return %1: tensor<?xf32>
+}


More information about the Mlir-commits mailing list