[Mlir-commits] [mlir] b3ebe3b - [mlir][bufferize] Bufferize after TensorCopyInsertion

Matthias Springer llvmlistbot at llvm.org
Fri Jun 17 04:30:57 PDT 2022


Author: Matthias Springer
Date: 2022-06-17T13:29:52+02:00
New Revision: b3ebe3beeda64ab0072a2ef97a64520ed5d8d73f

URL: https://github.com/llvm/llvm-project/commit/b3ebe3beeda64ab0072a2ef97a64520ed5d8d73f
DIFF: https://github.com/llvm/llvm-project/commit/b3ebe3beeda64ab0072a2ef97a64520ed5d8d73f.diff

LOG: [mlir][bufferize] Bufferize after TensorCopyInsertion

This change reworks bufferization to use the new TensorCopyInsertion pass. One-Shot Bufferize no longer calls the One-Shot Analysis. Instead, it relies on the TensorCopyInsertion pass to make the entire IR fully inplaceable. The `bufferize` implementations of all ops are simplified; they no longer have to account for out-of-place bufferization decisions, since those were already materialized in the IR in the form of `bufferization.alloc_tensor` ops during the TensorCopyInsertion pass.
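
The new flow is visible in `runOneShotBufferize` (updated below): tensor copies are inserted first, then bufferization runs without inserting any further copies. A minimal sketch of driving the two stages from C++, assuming an already configured `OneShotBufferizationOptions` (the wrapper function name is illustrative):

    #include "mlir/Dialect/Bufferization/Transforms/Bufferize.h"
    #include "mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h"
    #include "mlir/Dialect/Bufferization/Transforms/TensorCopyInsertion.h"

    using namespace mlir;
    using namespace mlir::bufferization;

    // Hypothetical two-stage driver mirroring runOneShotBufferize.
    static LogicalResult
    bufferizeInTwoStages(Operation *op,
                         const OneShotBufferizationOptions &options) {
      // Stage 1: analyze the IR and materialize every out-of-place
      // bufferization decision as a bufferization.alloc_tensor op.
      if (failed(insertTensorCopies(op, options)))
        return failure();
      // Stage 2: rewrite tensor ops to memref ops. No more copies are
      // inserted here; the IR is already fully inplaceable.
      return bufferizeOp(op, options, /*copyBeforeWrite=*/false);
    }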

Differential Revision: https://reviews.llvm.org/D127652

Added: 
    

Modified: 
    mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h
    mlir/include/mlir/Dialect/Bufferization/IR/Bufferization.h
    mlir/include/mlir/Dialect/Bufferization/Transforms/Bufferize.h
    mlir/lib/Dialect/Arithmetic/Transforms/BufferizableOpInterfaceImpl.cpp
    mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp
    mlir/lib/Dialect/Bufferization/IR/BufferizationOps.cpp
    mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp
    mlir/lib/Dialect/Bufferization/Transforms/FuncBufferizableOpInterfaceImpl.cpp
    mlir/lib/Dialect/Bufferization/Transforms/OneShotAnalysis.cpp
    mlir/lib/Dialect/Bufferization/Transforms/OneShotModuleBufferize.cpp
    mlir/lib/Dialect/Linalg/Transforms/BufferizableOpInterfaceImpl.cpp
    mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp
    mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp
    mlir/lib/Dialect/Vector/Transforms/BufferizableOpInterfaceImpl.cpp
    mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-alloc-tensor-elimination.mlir
    mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-partial.mlir
    mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize-allow-return-allocs.mlir
    mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize.mlir
    mlir/test/Dialect/Linalg/bufferize.mlir
    mlir/test/Dialect/SCF/one-shot-bufferize.mlir
    mlir/test/Dialect/Tensor/one-shot-bufferize.mlir

Removed: 
    


################################################################################
diff  --git a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h
index 7a1e8da42e00..3cd9d70138d1 100644
--- a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h
+++ b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h
@@ -467,38 +467,12 @@ class AnalysisState {
 /// BufferizationState provides helper functions for performing bufferization
 /// rewrites and handling memref buffers.
 struct BufferizationState {
-  enum ForceInPlacability { FORCE_INPLACE, FORCE_OUT_OF_PLACE };
+  BufferizationState(const BufferizationOptions &options) : options(options) {}
 
-  BufferizationState(const AnalysisState &analysisState)
-      : analysisState(analysisState) {}
-
-  /// Creates a memref allocation for the given shaped value. `dealloc`
-  /// indicates whether the buffer should be deallocated or not. When `dealloc`
-  /// is `false`, this would create a memory leak, unless the buffer is
-  /// deallocated through some other mechanism.
-  ///
-  /// `dealloc` is optional. By default, this function will figure out by itself
-  /// if it is safe to deallocate the buffer. In essence, when returning the
-  /// buffer from a block, it is not safe to deallocate the buffer. This
-  /// information is queried via `AnalysisState::isTensorYielded`.
-  ///
-  /// Note: `shapedValue` is typically a tensor value. However, if it is a
-  /// memref value, `dealloc` is no longer optional and must be specified.
-  FailureOr<Value> createAlloc(OpBuilder &b, Location loc, Value shapedValue,
-                               Optional<bool> dealloc = None);
-
-  /// Return the buffer (memref) for a given OpOperand (tensor). Allocate
-  /// a new buffer and copy over data from the existing buffer if out-of-place
-  /// bufferization was decided.
-  ///
-  /// Whether a buffer is in-place or out-of-place is queried from the analysis
-  /// state. Some analyses may always conservatively opt for out-of-place
-  /// bufferization. Inplacability decisions can be overridden with the optional
-  /// `overrideInPlace` parameter.
-  FailureOr<Value>
-  getBuffer(RewriterBase &rewriter, OpOperand &opOperand,
-            Optional<ForceInPlacability> overrideInPlace = None,
-            Optional<Operation *> customCopyInsertionPoint = None);
+  /// Lookup the buffer for the given value. If the value was not bufferized
+  /// yet, wrap it in a ToMemrefOp. Otherwise, it is the result of a ToTensorOp,
+  /// from which the memref operand is returned.
+  Value getBuffer(RewriterBase &rewriter, Value value);
 
   /// Return the buffer type for a given Value (tensor) after bufferization.
   ///
@@ -507,36 +481,28 @@ struct BufferizationState {
   BaseMemRefType getBufferType(Value value) const;
 
   /// Return a reference to the BufferizationOptions.
-  const BufferizationOptions &getOptions() const {
-    return analysisState.getOptions();
-  }
-
-  const AnalysisState &getAnalysisState() const { return analysisState; }
+  const BufferizationOptions &getOptions() const { return options; }
 
 protected:
   // BufferizationState should be passed as a reference.
   BufferizationState(const BufferizationState &) = delete;
 
 private:
-  const AnalysisState &analysisState;
+  const BufferizationOptions &options;
 };
 
+/// Create an AllocTensorOp for the given shaped value (memref or tensor).
+/// If `copy` is set, the shaped value is copied. Otherwise, a tensor with
+/// undefined contents is allocated.
+Value allocateTensorForShapedValue(OpBuilder &b, Location loc,
+                                   Value shapedValue, bool escape,
+                                   bool copy = true);
+
 /// Replace an op with replacement values. The op is deleted. Tensor OpResults
 /// must be replaced with memref values.
 void replaceOpWithBufferizedValues(RewriterBase &rewriter, Operation *op,
                                    ValueRange values);
 
-/// Lookup the buffer for the given value. If the value was not bufferized yet,
-/// wrap it in a ToMemrefOp. Otherwise, it is the result of a ToTensorOp, from
-/// which the memref operand is returned.
-///
-/// Note: Use `BufferizationState::getBuffer` during bufferization.
-/// `lookupBuffer` is just for compatibility and gradual migration of
-/// bufferization patterns to BufferizableOpInterface-based bufferization. It
-/// does not insert any buffer copies.
-Value lookupBuffer(RewriterBase &rewriter, Value tensor,
-                   const BufferizationOptions &options);
-
 /// Replace an op with a new op. The new op must have the same number of
 /// results as the replaced op. The new op may not return any tensor values.
 template <typename OpTy, typename... Args>

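With copy decisions materialized up front, a typical `bufferize` implementation against the new interface reduces to a buffer lookup plus a rewrite. A condensed sketch for a hypothetical unary op (`MyOp` and `MyMemrefOp` are illustrative; the pattern matches the arith and tensor changes below):

    LogicalResult MyOp::bufferize(RewriterBase &rewriter,
                                  BufferizationState &state) {
      // getBuffer no longer returns FailureOr<Value>: any required copy has
      // already been materialized as a bufferization.alloc_tensor op.
      Value sourceBuffer = state.getBuffer(rewriter, getSource());
      replaceOpWithNewBufferizedOp<MyMemrefOp>(rewriter, getOperation(),
                                               sourceBuffer);
      return success();
    }
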
diff  --git a/mlir/include/mlir/Dialect/Bufferization/IR/Bufferization.h b/mlir/include/mlir/Dialect/Bufferization/IR/Bufferization.h
index e68e4502e6c5..5c72b5ea5c66 100644
--- a/mlir/include/mlir/Dialect/Bufferization/IR/Bufferization.h
+++ b/mlir/include/mlir/Dialect/Bufferization/IR/Bufferization.h
@@ -33,6 +33,11 @@
 
 namespace mlir {
 namespace bufferization {
+/// Populate `dynamicDims` with tensor::DimOp / memref::DimOp results for all
+/// dynamic dimensions of the given shaped value.
+void populateDynamicDimSizes(OpBuilder &b, Location loc, Value shapedValue,
+                             SmallVector<Value> &dynamicDims);
+
 /// Try to cast the given ranked MemRef-typed value to the given ranked MemRef
 /// type. Insert a reallocation + copy if it cannot be statically guaranteed
 /// that a direct cast would be valid.

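A short usage sketch of the new helper, assuming `b`, `loc`, and a shaped `value` (ranked tensor or memref) are in scope; this mirrors its use in `AllocTensorOp::bufferize` below:

    // Collect one SSA value per dynamic dimension of `value`. The helper
    // emits tensor::DimOp or memref::DimOp depending on the type of `value`.
    SmallVector<Value> dynamicDims;
    populateDynamicDimSizes(b, loc, value, dynamicDims);
    // dynamicDims can now be forwarded, e.g., to a buffer allocation.
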
diff  --git a/mlir/include/mlir/Dialect/Bufferization/Transforms/Bufferize.h b/mlir/include/mlir/Dialect/Bufferization/Transforms/Bufferize.h
index faa94b437458..92cb346b265f 100644
--- a/mlir/include/mlir/Dialect/Bufferization/Transforms/Bufferize.h
+++ b/mlir/include/mlir/Dialect/Bufferization/Transforms/Bufferize.h
@@ -54,40 +54,22 @@ void populateEliminateBufferizeMaterializationsPatterns(
     BufferizeTypeConverter &typeConverter, RewritePatternSet &patterns);
 
 /// Bufferize `op` and its nested ops that implement `BufferizableOpInterface`.
-/// Whether buffer copies are needed or not is queried from `state`.
+/// If `copyBeforeWrite`, buffers are duplicated and copied before any tensor
+/// use that bufferizes to a memory write.
 ///
-/// Note: If `allowUnknownOps` is set to false, bufferization fails when an
-/// unknown op (that does not implement `BufferizableOpInterface`) is found. No
-/// to_tensor/to_memref ops are inserted in that case.
-///
-/// Note: The layout map chosen to bufferize is the most dynamic canonical
-/// strided layout of the proper rank. This ensures compatibility with expected
-/// layouts after transformations. Combinations of memref.cast +
-/// canonicalization are responsible for clean ups.
-// TODO: Extract `options` from `state` and pass as separate argument.
-LogicalResult bufferizeOp(Operation *op, const AnalysisState &analysisState);
-
-/// Bufferize `op` and its nested ops that implement `BufferizableOpInterface`.
-/// Buffers are duplicated and copied before any tensor use that bufferizes to
-/// a memory write.
+/// Note: In the general case, it is unsafe to run with `copyBeforeWrite = false`
+/// because read-after-write conflicts may materialize during bufferization.
+/// `copyBeforeWrite = false` is safe only if the input IR is guaranteed to
+/// *not* require any out-of-place bufferization.
 ///
 /// Note: This function bufferizes ops without utilizing analysis results. It
 /// can be used to implement partial bufferization passes.
-LogicalResult bufferizeOp(Operation *op, const BufferizationOptions &options);
+LogicalResult bufferizeOp(Operation *op, const BufferizationOptions &options,
+                          bool copyBeforeWrite = true,
+                          const OpFilter *opFilter = nullptr);
 
 BufferizationOptions getPartialBufferizationOptions();
 
-//===----------------------------------------------------------------------===//
-// Helper functions for extending Bufferization
-//===----------------------------------------------------------------------===//
-
-/// Bufferize `op` and its nested ops that implement `BufferizableOpInterface`.
-/// Reuse an existing `BufferizationState`.
-///
-/// Note: This function overload is useful for extending the bufferization.
-LogicalResult bufferizeOp(Operation *op, BufferizationState &bufferizationState,
-                          const OpFilter *opFilter = nullptr);
-
 } // namespace bufferization
 } // namespace mlir
 

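For callers, the consolidated `bufferizeOp` entry point takes the options directly. A sketch of how a partial bufferization pass might invoke it, assuming it runs inside a pass (so `getOperation` and `signalPassFailure` are available):

    // copyBeforeWrite defaults to true: a conservative, analysis-free copy
    // insertion runs first, duplicating buffers before every tensor use
    // that bufferizes to a memory write.
    BufferizationOptions options = getPartialBufferizationOptions();
    if (failed(bufferizeOp(getOperation(), options)))
      signalPassFailure();
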
diff  --git a/mlir/lib/Dialect/Arithmetic/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Dialect/Arithmetic/Transforms/BufferizableOpInterfaceImpl.cpp
index 3bdd3d92cfdc..7bd89762e8a4 100644
--- a/mlir/lib/Dialect/Arithmetic/Transforms/BufferizableOpInterfaceImpl.cpp
+++ b/mlir/lib/Dialect/Arithmetic/Transforms/BufferizableOpInterfaceImpl.cpp
@@ -84,7 +84,7 @@ struct IndexCastOpInterface
     auto castOp = cast<arith::IndexCastOp>(op);
     auto resultTensorType = castOp.getType().cast<TensorType>();
 
-    Value source = *state.getBuffer(rewriter, op->getOpOperand(0) /*in*/);
+    Value source = state.getBuffer(rewriter, castOp.getIn());
     auto sourceType = source.getType().cast<BaseMemRefType>();
 
     // Result type should have same layout and address space as the source type.
@@ -136,15 +136,12 @@ struct SelectOpInterface
     auto selectOp = cast<arith::SelectOp>(op);
     Location loc = selectOp.getLoc();
 
-    // `getBuffer` introduces copies if an OpOperand bufferizes out-of-place.
     // TODO: It would be more efficient to copy the result of the `select` op
     // instead of its OpOperands. In the worst case, 2 copies are inserted at
     // the moment (one for each tensor). When copying the op result, only one
     // copy would be needed.
-    Value trueBuffer =
-        *state.getBuffer(rewriter, selectOp->getOpOperand(1) /*true_value*/);
-    Value falseBuffer =
-        *state.getBuffer(rewriter, selectOp->getOpOperand(2) /*false_value*/);
+    Value trueBuffer = state.getBuffer(rewriter, selectOp.getTrueValue());
+    Value falseBuffer = state.getBuffer(rewriter, selectOp.getFalseValue());
 
     // The "true" and the "false" operands must have the same type. If the
     // buffers have 
diff erent types, they 
diff er only in their layout map. Cast

diff  --git a/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp b/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp
index cd3d4065faa7..8279d7efce65 100644
--- a/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp
+++ b/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp
@@ -43,30 +43,49 @@ using namespace bufferization;
 constexpr const ::llvm::StringLiteral
     bufferization::BufferizableOpInterface::kInplaceableAttrName;
 
-/// Create an AllocTensorOp for the given shaped value. Only ranked tensors are
-/// supported at the moment. If `copy` is set, the shaped value is copied.
-/// Otherwise, a tensor with undefined contents is allocated.
-static Value allocateTensorForShapedValue(OpBuilder &b, Location loc,
-                                          Value shapedValue, bool escape,
-                                          bool copy = true) {
-  auto tensorType = shapedValue.getType().dyn_cast<RankedTensorType>();
-  assert(tensorType && "only RankedTensorType supported at the moment");
-  Value alloc;
-  if (!copy) {
-    // No copy needed: Just allocate.
-    SmallVector<Value> dynamicSizes;
-    for (int64_t i = 0; i < tensorType.getRank(); ++i)
-      if (tensorType.isDynamicDim(i))
-        dynamicSizes.push_back(b.create<tensor::DimOp>(loc, shapedValue, i));
-    alloc = b.create<AllocTensorOp>(loc, tensorType, dynamicSizes,
-                                    /*copy=*/Value(), escape);
+/// Create an AllocTensorOp for the given shaped value. If `copy` is set, the
+/// shaped value is copied. Otherwise, a tensor with undefined contents is
+/// allocated.
+Value bufferization::allocateTensorForShapedValue(OpBuilder &b, Location loc,
+                                                  Value shapedValue,
+                                                  bool escape, bool copy) {
+  Value tensor;
+  if (shapedValue.getType().isa<RankedTensorType>()) {
+    tensor = shapedValue;
+  } else if (shapedValue.getType().isa<MemRefType>()) {
+    tensor = b.create<ToTensorOp>(loc, shapedValue);
   } else {
-    // Allocate and copy.
-    alloc = b.create<AllocTensorOp>(loc, tensorType,
-                                    /*dynamicSizes=*/ValueRange(), shapedValue,
-                                    escape);
+    llvm_unreachable("expected RankedTensorType or MemRefType");
   }
-  return alloc;
+  RankedTensorType tensorType = tensor.getType().cast<RankedTensorType>();
+  SmallVector<Value> dynamicSizes;
+  if (!copy) {
+    // Compute the dynamic part of the shape.
+    // First try to query the shape via ReifyRankedShapedTypeOpInterface.
+    bool reifiedShapes = false;
+    if (shapedValue.getType().isa<RankedTensorType>() &&
+        shapedValue.isa<OpResult>()) {
+      if (auto rankedOp = dyn_cast_or_null<ReifyRankedShapedTypeOpInterface>(
+              shapedValue.getDefiningOp())) {
+        ReifiedRankedShapedTypeDims resultDims;
+        if (succeeded(rankedOp.reifyResultShapes(b, resultDims))) {
+          reifiedShapes = true;
+          auto &shape =
+              resultDims[shapedValue.cast<OpResult>().getResultNumber()];
+          for (const auto &dim : enumerate(tensorType.getShape()))
+            if (ShapedType::isDynamic(dim.value()))
+              dynamicSizes.push_back(shape[dim.index()]);
+        }
+      }
+    }
+
+    // If the shape could not be reified, create DimOps.
+    if (!reifiedShapes)
+      populateDynamicDimSizes(b, loc, tensor, dynamicSizes);
+  }
+
+  return b.create<AllocTensorOp>(loc, tensorType, dynamicSizes,
+                                 copy ? tensor : Value(), escape);
 }
 
 LogicalResult BufferizableOpInterface::resolveTensorOpOperandConflicts(
@@ -379,6 +398,10 @@ bool AnalysisState::canOmitTensorCopy(OpOperand &opOperand) const {
 }
 
 bool AnalysisState::isInPlace(OpOperand &opOperand) const {
+  // ToMemrefOps are always in-place.
+  if (isa<ToMemrefOp>(opOperand.getOwner()))
+    return true;
+
   // In the absence of analysis information, OpOperands that bufferize to a
   // memory write are out-of-place, i.e., an alloc and copy is inserted.
   return !bufferizesToMemoryWrite(opOperand);
@@ -454,85 +477,21 @@ static void ensureToMemrefOpIsValid(Value tensor, Type memrefType) {
 #endif
 }
 
-Value mlir::bufferization::lookupBuffer(RewriterBase &rewriter, Value tensor,
-                                        const BufferizationOptions &options) {
-  auto tensorType = tensor.getType().dyn_cast<TensorType>();
+Value BufferizationState::getBuffer(RewriterBase &rewriter, Value value) {
+  auto tensorType = value.getType().dyn_cast<TensorType>();
   assert(tensorType && "unexpected non-tensor type");
 
   // Replace "%t = to_tensor %m" with %m.
-  if (auto toTensorOp = tensor.getDefiningOp<bufferization::ToTensorOp>())
+  if (auto toTensorOp = value.getDefiningOp<bufferization::ToTensorOp>())
     return toTensorOp.memref();
 
   // Insert to_memref op.
   OpBuilder::InsertionGuard g(rewriter);
-  setInsertionPointAfter(rewriter, tensor);
-  Type memrefType = getMemRefType(tensorType, options);
-  ensureToMemrefOpIsValid(tensor, memrefType);
-  return rewriter.create<bufferization::ToMemrefOp>(tensor.getLoc(), memrefType,
-                                                    tensor);
-}
-
-/// Return the buffer (memref) for a given OpOperand (tensor). Allocate
-/// a new buffer and copy over data from the existing buffer if out-of-place
-/// bufferization was decided.
-FailureOr<Value>
-BufferizationState::getBuffer(RewriterBase &rewriter, OpOperand &opOperand,
-                              Optional<ForceInPlacability> overrideInPlace,
-                              Optional<Operation *> customCopyInsertionPoint) {
-  const BufferizationOptions &options = analysisState.getOptions();
-  OpBuilder::InsertionGuard guard(rewriter);
-  Operation *op = opOperand.getOwner();
-  Location loc = op->getLoc();
-  SmallVector<OpResult> aliasingOpResults =
-      analysisState.getAliasingOpResult(opOperand);
-  Value operand = opOperand.get();
-  Value operandBuffer = lookupBuffer(rewriter, operand, options);
-
-  // Can `operandBuffer` be used directly or do we need a copy?
-  bool inplace =
-      overrideInPlace != FORCE_OUT_OF_PLACE &&
-      (overrideInPlace == FORCE_INPLACE || analysisState.isInPlace(opOperand));
-  if (inplace)
-    return operandBuffer;
-
-  // Bufferizing out-of-place: Allocate a new buffer.
-  // Move insertion point right after `operandBuffer`. That is where the
-  // allocation should be inserted (in the absence of allocation hoisting).
-  setInsertionPointAfter(rewriter, operandBuffer);
-  // Allocate the result buffer. The buffer should be deallocated if the tensor
-  // is not yielded and deallocs are enabled in general.
-  bool dealloc = llvm::none_of(aliasingOpResults, [&](Value v) {
-    return getAnalysisState().isTensorYielded(v);
-  });
-  FailureOr<Value> resultBuffer = createAlloc(
-      rewriter, loc, operandBuffer, dealloc && getOptions().createDeallocs);
-  if (failed(resultBuffer))
-    return failure();
-  // Do not copy the buffer if its contents are undefined.
-  if (analysisState.hasUndefinedContents(&opOperand))
-    return resultBuffer;
-  // Do not copy if the copied data is never read.
-  if (!aliasingOpResults.empty() &&
-      !analysisState.bufferizesToMemoryRead(opOperand) &&
-      llvm::none_of(aliasingOpResults, [&](OpResult opResult) {
-        return analysisState.isValueRead(opResult);
-      }))
-    return resultBuffer;
-  // Do not copy if this op does not read the data, but writes it.
-  if (analysisState.bufferizesToMemoryWrite(opOperand) &&
-      !analysisState.bufferizesToMemoryRead(opOperand))
-    return resultBuffer;
-
-  if (customCopyInsertionPoint) {
-    rewriter.setInsertionPoint(*customCopyInsertionPoint);
-  } else {
-    // The copy happens right before the op that is bufferized.
-    rewriter.setInsertionPoint(op);
-  }
-  if (failed(options.createMemCpy(rewriter, loc, operandBuffer, *resultBuffer)))
-    return failure();
-
-  return resultBuffer;
+  setInsertionPointAfter(rewriter, value);
+  Type memrefType = getMemRefType(tensorType, getOptions());
+  ensureToMemrefOpIsValid(value, memrefType);
+  return rewriter.create<bufferization::ToMemrefOp>(value.getLoc(), memrefType,
+                                                    value);
 }
 
 /// Return the buffer type for a given Value (tensor) after bufferization.
@@ -588,9 +547,12 @@ FailureOr<Value> BufferizationOptions::createAlloc(OpBuilder &b, Location loc,
     return (*allocationFn)(b, loc, type, dynShape, bufferAlignment);
 
   // Default bufferallocation via AllocOp.
-  Value allocated = b.create<memref::AllocOp>(
-      loc, type, dynShape, b.getI64IntegerAttr(bufferAlignment));
-  return allocated;
+  if (bufferAlignment != 0)
+    return b
+        .create<memref::AllocOp>(loc, type, dynShape,
+                                 b.getI64IntegerAttr(bufferAlignment))
+        .getResult();
+  return b.create<memref::AllocOp>(loc, type, dynShape).getResult();
 }
 
 /// Creates a memref deallocation. The given memref buffer must have been
@@ -605,93 +567,6 @@ LogicalResult BufferizationOptions::createDealloc(OpBuilder &b, Location loc,
   return success();
 }
 
-static MemRefType getContiguousMemRefType(ShapedType shapedType,
-                                          Attribute memorySpace = {}) {
-  MemRefLayoutAttrInterface layout = {};
-  return MemRefType::get(shapedType.getShape(), shapedType.getElementType(),
-                         layout, memorySpace);
-}
-
-/// Compute the type of the `memref` to use for allocating the buffer for
-/// `shapedValue`. Also returns (by reference in `dynShape`), the value for the
-/// dynamic dimensions in the returned `memref` type.
-static MemRefType getAllocationTypeAndShape(OpBuilder &b, Location loc,
-                                            Value shapedValue,
-                                            SmallVectorImpl<Value> &dynShape) {
-  MemRefType allocMemRefType =
-      getContiguousMemRefType(shapedValue.getType().cast<ShapedType>());
-
-  // Compute the dynamic part of the shape.
-  bool reifiedShapes = false;
-  if (auto rankedOp = dyn_cast_or_null<ReifyRankedShapedTypeOpInterface>(
-          shapedValue.getDefiningOp())) {
-    ReifiedRankedShapedTypeDims resultDims;
-    if (succeeded(rankedOp.reifyResultShapes(b, resultDims))) {
-      reifiedShapes = true;
-      OpResult resultValue = shapedValue.dyn_cast<OpResult>();
-      auto &shape = resultDims[resultValue.getResultNumber()];
-      for (const auto &dim : enumerate(allocMemRefType.getShape()))
-        if (ShapedType::isDynamic(dim.value()))
-          dynShape.push_back(shape[dim.index()]);
-    }
-  }
-
-  if (!reifiedShapes) {
-    for (const auto &dim : enumerate(allocMemRefType.getShape()))
-      if (ShapedType::isDynamic(dim.value())) {
-        assert((shapedValue.getType().isa<UnrankedMemRefType>() ||
-                shapedValue.getType().isa<MemRefType>()) &&
-               "expected MemRef type");
-        dynShape.push_back(
-            b.create<memref::DimOp>(loc, shapedValue, dim.index()));
-      }
-  }
-
-  return allocMemRefType;
-}
-
-/// Create an allocation after `shapedValue.getDefiningOp` (or at the top of the
-/// block in case of a bbArg).
-FailureOr<Value> BufferizationState::createAlloc(OpBuilder &b, Location loc,
-                                                 Value shapedValue,
-                                                 Optional<bool> dealloc) {
-  // Take a guard before anything else.
-  OpBuilder::InsertionGuard g(b);
-
-  // Compute allocation memref type.
-  assert(shapedValue.getType().isa<ShapedType>());
-  SmallVector<Value> dynShape;
-  MemRefType allocMemRefType =
-      getAllocationTypeAndShape(b, loc, shapedValue, dynShape);
-
-  // Create the buffer allocation.
-  FailureOr<Value> buffer =
-      getOptions().createAlloc(b, loc, allocMemRefType, dynShape);
-  if (failed(buffer))
-    return failure();
-
-  // Should be the buffer be deallocated again or should we let it leak?
-  if (dealloc) {
-    if (!dealloc.getValue())
-      return *buffer;
-  } else {
-    assert(shapedValue.getType().isa<TensorType>() &&
-           "must specify `dealloc` if non-tensor value is passed");
-    // Buffer should be not be deallocated if deallocs are generally deactivated
-    // or if the tensor is yielded from a block.
-    if (!getOptions().createDeallocs ||
-        getAnalysisState().isTensorYielded(shapedValue))
-      return *buffer;
-  }
-
-  // Create buffer deallocation.
-  b.setInsertionPoint(b.getInsertionBlock()->getTerminator());
-  if (failed(getOptions().createDealloc(b, loc, *buffer)))
-    return failure();
-
-  return *buffer;
-}
-
 /// Create a memory copy between two memref buffers.
 LogicalResult BufferizationOptions::createMemCpy(OpBuilder &b, Location loc,
                                                  Value from, Value to) const {

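The now-exposed `allocateTensorForShapedValue` lets individual bufferize implementations materialize a copy on the tensor level when they only discover during bufferization that one is needed; the tensor.collapse_shape change below uses it roughly like this (condensed, assuming `rewriter`, `loc`, `source`, `result`, and `state` are in scope):

    // Allocate a tensor copy of `source` (`copy` defaults to true). The
    // `escape` flag records whether the buffer may outlive the enclosing
    // block, which later decides whether a dealloc is emitted.
    AnalysisState analysisState(state.getOptions());
    Value tensorAlloc = allocateTensorForShapedValue(
        rewriter, loc, source,
        /*escape=*/analysisState.isTensorYielded(result));
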
diff  --git a/mlir/lib/Dialect/Bufferization/IR/BufferizationOps.cpp b/mlir/lib/Dialect/Bufferization/IR/BufferizationOps.cpp
index 0025215882db..ad73f9da70ff 100644
--- a/mlir/lib/Dialect/Bufferization/IR/BufferizationOps.cpp
+++ b/mlir/lib/Dialect/Bufferization/IR/BufferizationOps.cpp
@@ -129,33 +129,85 @@ LogicalResult mlir::bufferization::foldToMemrefToTensorPair(
   return success();
 }
 
+void mlir::bufferization::populateDynamicDimSizes(
+    OpBuilder &b, Location loc, Value shapedValue,
+    SmallVector<Value> &dynamicDims) {
+  auto shapedType = shapedValue.getType().cast<ShapedType>();
+  for (int64_t i = 0; i < shapedType.getRank(); ++i) {
+    if (shapedType.isDynamicDim(i)) {
+      if (shapedType.isa<MemRefType>()) {
+        dynamicDims.push_back(b.create<memref::DimOp>(loc, shapedValue, i));
+      } else {
+        assert(shapedType.isa<RankedTensorType>() && "expected tensor");
+        dynamicDims.push_back(b.create<tensor::DimOp>(loc, shapedValue, i));
+      }
+    }
+  }
+}
+
 //===----------------------------------------------------------------------===//
 // AllocTensorOp
 //===----------------------------------------------------------------------===//
 
 LogicalResult AllocTensorOp::bufferize(RewriterBase &rewriter,
                                        BufferizationState &state) {
+  OpBuilder::InsertionGuard g(rewriter);
+  Location loc = getLoc();
+
   // Nothing to do for dead AllocTensorOps.
-  if (getOperation()->getUses().empty())
+  if (getOperation()->getUses().empty()) {
+    rewriter.eraseOp(getOperation());
     return success();
+  }
 
-  Optional<bool> dealloc = llvm::None;
-  if (escape().hasValue())
-    dealloc = !*escape();
+  // Create buffer allocation.
+  Value copyBuffer;
+  if (copy())
+    copyBuffer = state.getBuffer(rewriter, copy());
+  auto allocType =
+      MemRefType::get(getType().getShape(), getType().getElementType());
+  SmallVector<Value> dynamicDims = dynamicSizes();
+  if (copy()) {
+    assert(dynamicDims.empty() && "expected either `copy` or `dynamicDims`");
+    populateDynamicDimSizes(rewriter, loc, copyBuffer, dynamicDims);
+  }
   FailureOr<Value> alloc =
-      state.createAlloc(rewriter, getLoc(), getResult(), dealloc);
+      state.getOptions().createAlloc(rewriter, loc, allocType, dynamicDims);
   if (failed(alloc))
     return failure();
+
+  // Create memory copy (if any).
   if (copy()) {
-    FailureOr<Value> copyValueBuffer = state.getBuffer(
-        rewriter, getOperation()->getOpOperand(getNumOperands() - 1));
-    if (failed(copyValueBuffer))
-      return failure();
-    if (failed(state.getOptions().createMemCpy(rewriter, getLoc(),
-                                               *copyValueBuffer, *alloc)))
+    if (failed(
+            state.getOptions().createMemCpy(rewriter, loc, copyBuffer, *alloc)))
       return failure();
   }
+
+  // Should the buffer be deallocated?
+  AnalysisState analysisState(state.getOptions());
+  bool dealloc;
+  if (escape().hasValue()) {
+    dealloc = !*escape();
+  } else {
+    // No "escape" annotation found.
+    if (state.getOptions().createDeallocs) {
+      // Perform an ad-hoc analysis.
+      dealloc = !analysisState.isTensorYielded(getResult());
+    } else {
+      dealloc = false;
+    }
+  }
+
+  // Replace op.
   replaceOpWithBufferizedValues(rewriter, getOperation(), *alloc);
+
+  // Create buffer deallocation (if requested).
+  if (!dealloc)
+    return success();
+
+  rewriter.setInsertionPoint(rewriter.getInsertionBlock()->getTerminator());
+  if (failed(state.getOptions().createDealloc(rewriter, loc, *alloc)))
+    return failure();
   return success();
 }
 

diff  --git a/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp b/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp
index f2edf88bdf4f..8f4d2066e092 100644
--- a/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp
+++ b/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp
@@ -14,6 +14,7 @@
 #include "mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h"
 #include "mlir/Dialect/Bufferization/Transforms/OneShotModuleBufferize.h"
 #include "mlir/Dialect/Bufferization/Transforms/Passes.h"
+#include "mlir/Dialect/Bufferization/Transforms/TensorCopyInsertion.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/IR/Operation.h"
@@ -288,30 +289,17 @@ static bool hasTensorSemantics(Operation *op) {
   return hasTensorResult || hasTensorOperand;
 }
 
-LogicalResult bufferization::bufferizeOp(Operation *op,
-                                         const AnalysisState &analysisState) {
-  // Catch incorrect API usage.
-  assert((analysisState.hasDialectState(
-              func::FuncDialect::getDialectNamespace()) ||
-          !analysisState.getOptions().bufferizeFunctionBoundaries) &&
-         "must use ModuleBufferize to bufferize function boundaries");
-
-  BufferizationState bufferizationState(analysisState);
-  if (failed(bufferizeOp(op, bufferizationState)))
-    return failure();
-  return success();
-}
-
 namespace {
 /// A rewriter that keeps track of extra information during bufferization.
 class BufferizationRewriter : public IRRewriter {
 public:
   BufferizationRewriter(MLIRContext *ctx, DenseSet<Operation *> &erasedOps,
                         DenseSet<Operation *> &toMemrefOps,
+                        SmallVector<Operation *> &worklist,
                         const BufferizationOptions &options,
                         const OpFilter *opFilter)
       : IRRewriter(ctx), erasedOps(erasedOps), toMemrefOps(toMemrefOps),
-        options(options), opFilter(opFilter) {}
+        worklist(worklist), analysisState(options), opFilter(opFilter) {}
 
 protected:
   void notifyOperationRemoved(Operation *op) override {
@@ -323,6 +311,7 @@ class BufferizationRewriter : public IRRewriter {
 
   void notifyOperationInserted(Operation *op) override {
     IRRewriter::notifyOperationInserted(op);
+    erasedOps.erase(op);
 
     // Keep track of to_memref ops.
     if (isa<ToMemrefOp>(op)) {
@@ -338,14 +327,24 @@ class BufferizationRewriter : public IRRewriter {
     if (!hasTensorSemantics(op))
       return;
 
-    // Skip ops that are not allowed.
+    // Skip ops that are not allowed to be bufferized.
+    auto const &options = analysisState.getOptions();
     if (!options.isOpAllowed(op) || (opFilter && !opFilter->isOpAllowed(op)))
       return;
 
-    // Adding new bufferizable ops is not allowed during bufferization. Such ops
-    // would not be analyzed and can lead to surprising behavior.
-    llvm_unreachable(
-        "creating new tensor ops is not allowed during bufferization");
+#ifndef NDEBUG
+    // Read-only tensor ops may be created during bufferization. Ops that are
+    // writing should not be created because such ops were never analyzed.
+    // Bufferizing such ops could introduce a RaW conflict.
+    for (OpOperand &operand : op->getOpOperands())
+      if (operand.get().getType().isa<TensorType>())
+        assert(!analysisState.bufferizesToMemoryWrite(operand) &&
+               "creating tensor ops that bufferize to a memory write is not "
+               "allowed during bufferization");
+#endif // NDEBUG
+
+    // Add op to worklist.
+    worklist.push_back(op);
   }
 
 private:
@@ -355,23 +354,32 @@ class BufferizationRewriter : public IRRewriter {
   /// A set of all to_memref ops.
   DenseSet<Operation *> &toMemrefOps;
 
-  /// The bufferization options.
-  /// Used for debug modes.
-  LLVM_ATTRIBUTE_UNUSED
-  const BufferizationOptions &options;
+  /// The worklist of ops to be bufferized.
+  SmallVector<Operation *> &worklist;
+
+  /// The analysis state. Used for debug assertions and access to the
+  /// bufferization options.
+  const AnalysisState analysisState;
 
+  /// An extra op filter for bufferization.
   const OpFilter *opFilter;
 };
 } // namespace
 
 LogicalResult bufferization::bufferizeOp(Operation *op,
-                                         BufferizationState &bufferizationState,
+                                         const BufferizationOptions &options,
+                                         bool copyBeforeWrite,
                                          const OpFilter *opFilter) {
-  const auto &options = bufferizationState.getOptions();
   assert(options.unknownTypeConversion !=
              BufferizationOptions::LayoutMapOption::InferLayoutMap &&
          "invalid layout map option");
 
+  if (copyBeforeWrite) {
+    AnalysisState state(options);
+    if (failed(insertTensorCopies(op, state)))
+      return failure();
+  }
+
   // Keep track of to_memref ops.
   DenseSet<Operation *> toMemrefOps;
   op->walk([&](ToMemrefOp toMemrefOp) { toMemrefOps.insert(toMemrefOp); });
@@ -393,8 +401,9 @@ LogicalResult bufferization::bufferizeOp(Operation *op,
   DenseSet<Operation *> erasedOps;
 
   // Bufferize all ops.
+  BufferizationState bufferizationState(options);
   BufferizationRewriter rewriter(op->getContext(), erasedOps, toMemrefOps,
-                                 bufferizationState.getOptions(), opFilter);
+                                 worklist, options, opFilter);
   for (unsigned i = 0; i < worklist.size(); ++i) {
     Operation *op = worklist[i];
     // Skip ops that were erased.
@@ -443,23 +452,22 @@ LogicalResult bufferization::bufferizeOp(Operation *op,
     // Ops without any uses and no side effects will fold away.
     if (op->getUses().empty() && MemoryEffectOpInterface::hasNoEffect(op))
       continue;
+    // ToTensorOps/ToMemrefOps are allowed in the output.
+    if (isa<ToTensorOp, ToMemrefOp>(op))
+      continue;
     return op->emitError("op was not bufferized");
   }
 
   return success();
 }
 
-LogicalResult bufferization::bufferizeOp(Operation *op,
-                                         const BufferizationOptions &options) {
-  AnalysisState state(options);
-  return bufferizeOp(op, state);
-}
-
 BufferizationOptions bufferization::getPartialBufferizationOptions() {
   BufferizationOptions options;
   options.allowUnknownOps = true;
   options.createDeallocs = false;
+  options.enforceAliasingInvariants = false;
   options.unknownTypeConversion =
       BufferizationOptions::LayoutMapOption::IdentityLayoutMap;
+  options.opFilter.allowDialect<BufferizationDialect>();
   return options;
 }

diff  --git a/mlir/lib/Dialect/Bufferization/Transforms/FuncBufferizableOpInterfaceImpl.cpp b/mlir/lib/Dialect/Bufferization/Transforms/FuncBufferizableOpInterfaceImpl.cpp
index 6cdd8b494215..e75338594d5b 100644
--- a/mlir/lib/Dialect/Bufferization/Transforms/FuncBufferizableOpInterfaceImpl.cpp
+++ b/mlir/lib/Dialect/Bufferization/Transforms/FuncBufferizableOpInterfaceImpl.cpp
@@ -306,12 +306,8 @@ struct CallOpInterface
 
       // Retrieve buffers for tensor operands.
       Value buffer = newOperands[idx];
-      if (!buffer) {
-        FailureOr<Value> bufferOrFailure = state.getBuffer(rewriter, opOperand);
-        if (failed(bufferOrFailure))
-          return failure();
-        buffer = *bufferOrFailure;
-      }
+      if (!buffer)
+        buffer = state.getBuffer(rewriter, opOperand.get());
 
       // Caller / callee type mismatch is handled with a CastOp.
       auto memRefType = funcType.getInput(idx);

diff  --git a/mlir/lib/Dialect/Bufferization/Transforms/OneShotAnalysis.cpp b/mlir/lib/Dialect/Bufferization/Transforms/OneShotAnalysis.cpp
index 5447f6b0bdc2..72e9600e44f4 100644
--- a/mlir/lib/Dialect/Bufferization/Transforms/OneShotAnalysis.cpp
+++ b/mlir/lib/Dialect/Bufferization/Transforms/OneShotAnalysis.cpp
@@ -46,6 +46,7 @@
 #include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h"
 #include "mlir/Dialect/Bufferization/IR/Bufferization.h"
 #include "mlir/Dialect/Bufferization/Transforms/Bufferize.h"
+#include "mlir/Dialect/Bufferization/Transforms/TensorCopyInsertion.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/IR/AsmState.h"
@@ -989,9 +990,9 @@ LogicalResult
 bufferization::runOneShotBufferize(Operation *op,
                                    const OneShotBufferizationOptions &options) {
   OneShotAnalysisState state(op, options);
-  if (failed(analyzeOp(op, state)))
+  if (failed(insertTensorCopies(op, options)))
     return failure();
   if (options.testAnalysisOnly)
     return success();
-  return bufferizeOp(op, state);
+  return bufferizeOp(op, options, /*copyBeforeWrite=*/false);
 }

diff  --git a/mlir/lib/Dialect/Bufferization/Transforms/OneShotModuleBufferize.cpp b/mlir/lib/Dialect/Bufferization/Transforms/OneShotModuleBufferize.cpp
index 233dedb8d20c..773fa4769244 100644
--- a/mlir/lib/Dialect/Bufferization/Transforms/OneShotModuleBufferize.cpp
+++ b/mlir/lib/Dialect/Bufferization/Transforms/OneShotModuleBufferize.cpp
@@ -64,6 +64,7 @@
 #include "mlir/Dialect/Bufferization/Transforms/Bufferize.h"
 #include "mlir/Dialect/Bufferization/Transforms/FuncBufferizableOpInterfaceImpl.h"
 #include "mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h"
+#include "mlir/Dialect/Bufferization/Transforms/TensorCopyInsertion.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/IR/Operation.h"
@@ -428,7 +429,7 @@ LogicalResult mlir::bufferization::bufferizeModuleOp(
   assert(options.bufferizeFunctionBoundaries &&
          "expected that function boundary bufferization is activated");
   IRRewriter rewriter(moduleOp.getContext());
-  BufferizationState bufferizationState(analysisState);
+  BufferizationState bufferizationState(options);
 
   // A list of functions in the order in which they are analyzed + bufferized.
   SmallVector<func::FuncOp> orderedFuncOps;
@@ -443,7 +444,7 @@ LogicalResult mlir::bufferization::bufferizeModuleOp(
   for (func::FuncOp funcOp : orderedFuncOps) {
     // Note: It would be good to apply cleanups here but we cannot as aliasInfo
     // would be invalidated.
-    if (failed(bufferizeOp(funcOp, bufferizationState)))
+    if (failed(bufferizeOp(funcOp, options, /*copyBeforeWrite=*/false)))
       return failure();
     // Change buffer return types to more precise layout maps.
     if (options.functionBoundaryTypeConversion ==
@@ -465,7 +466,7 @@ LogicalResult mlir::bufferization::runOneShotModuleBufferize(
   assert(options.bufferizeFunctionBoundaries &&
          "expected that function boundary bufferization is activated");
   OneShotAnalysisState analysisState(moduleOp, options);
-  if (failed(analyzeModuleOp(moduleOp, analysisState)))
+  if (failed(insertTensorCopies(moduleOp, options)))
     return failure();
   if (options.testAnalysisOnly)
     return success();

diff  --git a/mlir/lib/Dialect/Linalg/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Dialect/Linalg/Transforms/BufferizableOpInterfaceImpl.cpp
index bccae3b8ba69..3ecab39cce61 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/BufferizableOpInterfaceImpl.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/BufferizableOpInterfaceImpl.cpp
@@ -46,23 +46,15 @@ static LogicalResult bufferizeLinalgOp(RewriterBase &rewriter, LinalgOp op,
       newInputBuffers.push_back(opOperand->get());
       continue;
     }
-    // Input operands are never written to.
-    newInputBuffers.push_back(*state.getBuffer(
-        rewriter, *opOperand,
-        BufferizationState::ForceInPlacability::FORCE_INPLACE));
+    newInputBuffers.push_back(state.getBuffer(rewriter, opOperand->get()));
   }
 
   // New output operands for the cloned op.
   SmallVector<Value> newOutputBuffers;
   for (OpResult opResult : op->getOpResults()) {
-    SmallVector<OpOperand *> aliasingOpOperands =
-        state.getAnalysisState().getAliasingOpOperand(opResult);
-    assert(aliasingOpOperands.size() == 1 && "expected 1 OpOperand");
-    FailureOr<Value> resultBuffer =
-        state.getBuffer(rewriter, *aliasingOpOperands.front());
-    if (failed(resultBuffer))
-      return failure();
-    newOutputBuffers.push_back(*resultBuffer);
+    OpOperand *opOperand = op.getOutputOperand(opResult.getResultNumber());
+    Value resultBuffer = state.getBuffer(rewriter, opOperand->get());
+    newOutputBuffers.push_back(resultBuffer);
   }
 
   // Merge input/output operands.

diff  --git a/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp
index 0dd5c909b8be..55b651802884 100644
--- a/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp
+++ b/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp
@@ -313,10 +313,8 @@ static SmallVector<Value> getBuffers(RewriterBase &rewriter,
   SmallVector<Value> result;
   for (OpOperand &opOperand : operands) {
     if (opOperand.get().getType().isa<TensorType>()) {
-      FailureOr<Value> resultBuffer = state.getBuffer(rewriter, opOperand);
-      if (failed(resultBuffer))
-        return {};
-      result.push_back(*resultBuffer);
+      Value resultBuffer = state.getBuffer(rewriter, opOperand.get());
+      result.push_back(resultBuffer);
     } else {
       result.push_back(opOperand.get());
     }
@@ -325,55 +323,13 @@ static SmallVector<Value> getBuffers(RewriterBase &rewriter,
 }
 
 /// Helper function for loop bufferization. Compute the buffer that should be
-/// yielded from a loop block (loop body or loop condition). If the given tensor
-/// is equivalent to the corresponding block argument (as indicated by
-/// `isEquivalent`), the buffer can be yielded directly. Otherwise, a new buffer
-/// copy must be yielded.
-///
-/// According to the `BufferizableOpInterface` implementation of scf loops, a
-/// a bufferized OpResult may alias only with the corresponding bufferized
-/// init_arg and with no other buffers. I.e., the i-th OpResult may alias with
-/// the i-th init_arg; but not with any other OpOperand. If a corresponding
-/// OpResult/init_arg pair bufferized to equivalent buffers (as indicated by
-/// `isEquivalent`), this aliasing requirement is satisfied. Otherwise, we
-/// cannot be sure and must yield a new buffer copy. (New buffer copies do not
-/// alias with any buffer.)
+/// yielded from a loop block (loop body or loop condition).
 static Value getYieldedBuffer(RewriterBase &rewriter, Value tensor,
-                              BaseMemRefType type, bool isEquivalent,
-                              BufferizationState &state) {
+                              BaseMemRefType type, BufferizationState &state) {
   assert(tensor.getType().isa<TensorType>() && "expected tensor");
   ensureToMemrefOpIsValid(tensor, type);
-  Value yieldedVal =
-      bufferization::lookupBuffer(rewriter, tensor, state.getOptions());
-
-  if (isEquivalent)
-    // Yielded value is equivalent to the corresponding iter_arg bbArg.
-    // Yield the value directly. Most IR should be like that. Everything
-    // else must be resolved with copies and is potentially inefficient.
-    // By default, such problematic IR would already have been rejected
-    // during `verifyAnalysis`, unless `allow-return-allocs`.
-    return castBuffer(rewriter, yieldedVal, type);
-
-  // It is not certain that the yielded value and the iter_arg bbArg
-  // have the same buffer. Allocate a new buffer and copy. The yielded
-  // buffer will get deallocated by `deallocateBuffers`.
-
-  // TODO: There are cases in which it is not neccessary to return a new
-  // buffer allocation. E.g., when equivalent values are yielded in a
-  // different order. This could be resolved with copies.
-  Optional<Value> yieldedAlloc = state.createAlloc(
-      rewriter, tensor.getLoc(), yieldedVal, /*deallocMemref=*/false);
-  // TODO: We should rollback, but for now just assume that this always
-  // succeeds.
-  assert(yieldedAlloc.hasValue() && "could not create alloc");
-  LogicalResult copyStatus = state.getOptions().createMemCpy(
-      rewriter, tensor.getLoc(), yieldedVal, *yieldedAlloc);
-  (void)copyStatus;
-  assert(succeeded(copyStatus) && "could not create memcpy");
-
-  // The iter_arg memref type may have a layout map. Cast the new buffer
-  // to the same type if needed.
-  return castBuffer(rewriter, *yieldedAlloc, type);
+  Value yieldedVal = state.getBuffer(rewriter, tensor);
+  return castBuffer(rewriter, yieldedVal, type);
 }
 
 /// Helper function for loop bufferization. Given a range of values, apply
@@ -396,13 +352,12 @@ convertTensorValues(ValueRange values, const DenseSet<int64_t> &tensorIndices,
 SmallVector<Value> getYieldedValues(RewriterBase &rewriter, ValueRange values,
                                     TypeRange bufferizedTypes,
                                     const DenseSet<int64_t> &tensorIndices,
-                                    const DenseSet<int64_t> &equivalentTensors,
                                     BufferizationState &state) {
   return convertTensorValues(
       values, tensorIndices, [&](Value val, int64_t index) {
         return getYieldedBuffer(rewriter, val,
                                 bufferizedTypes[index].cast<BaseMemRefType>(),
-                                equivalentTensors.contains(index), state);
+                                state);
       });
 }
 
@@ -519,18 +474,11 @@ struct ForOpInterface
   LogicalResult bufferize(Operation *op, RewriterBase &rewriter,
                           BufferizationState &state) const {
     auto forOp = cast<scf::ForOp>(op);
-    auto oldYieldOp =
-        cast<scf::YieldOp>(forOp.getLoopBody().front().getTerminator());
     Block *oldLoopBody = &forOp.getLoopBody().front();
 
     // Indices of all iter_args that have tensor type. These are the ones that
     // are bufferized.
     DenseSet<int64_t> indices = getTensorIndices(forOp.getInitArgs());
-    // For every yielded value, is the value equivalent to its corresponding
-    // bbArg?
-    DenseSet<int64_t> equivalentYields =
-        getEquivalentBuffers(forOp.getRegionIterArgs(), oldYieldOp.getResults(),
-                             state.getAnalysisState());
 
     // The new memref init_args of the loop.
     SmallVector<Value> initArgs =
@@ -562,9 +510,8 @@ struct ForOpInterface
     // Update scf.yield of new loop.
     auto yieldOp = cast<scf::YieldOp>(loopBody->getTerminator());
     rewriter.setInsertionPoint(yieldOp);
-    SmallVector<Value> yieldValues =
-        getYieldedValues(rewriter, yieldOp.getResults(), initArgsTypes, indices,
-                         equivalentYields, state);
+    SmallVector<Value> yieldValues = getYieldedValues(
+        rewriter, yieldOp.getResults(), initArgsTypes, indices, state);
     yieldOp.getResultsMutable().assign(yieldValues);
 
     // Replace loop results.
@@ -773,15 +720,6 @@ struct WhileOpInterface
     DenseSet<int64_t> indicesAfter =
         getTensorIndices(whileOp.getAfterArguments());
 
-    // For every yielded value, is the value equivalent to its corresponding
-    // bbArg?
-    DenseSet<int64_t> equivalentYieldsBefore = getEquivalentBuffers(
-        whileOp.getBeforeArguments(), whileOp.getConditionOp().getArgs(),
-        state.getAnalysisState());
-    DenseSet<int64_t> equivalentYieldsAfter = getEquivalentBuffers(
-        whileOp.getAfterArguments(), whileOp.getYieldOp().getResults(),
-        state.getAnalysisState());
-
     // The new memref init_args of the loop.
     SmallVector<Value> initArgs =
         getBuffers(rewriter, whileOp->getOpOperands(), state);
@@ -823,7 +761,7 @@ struct WhileOpInterface
     // TODO: This could be relaxed for better bufferization results.
     SmallVector<Value> newConditionArgs =
         getYieldedValues(rewriter, newConditionOp.getArgs(), argsTypesAfter,
-                         indicesAfter, equivalentYieldsBefore, state);
+                         indicesAfter, state);
     newConditionOp.getArgsMutable().assign(newConditionArgs);
 
     // Set up new iter_args and move the loop body block to the new op.
@@ -842,7 +780,7 @@ struct WhileOpInterface
     // TODO: This could be relaxed for better bufferization results.
     SmallVector<Value> newYieldValues =
         getYieldedValues(rewriter, newYieldOp.getResults(), argsTypesBefore,
-                         indicesBefore, equivalentYieldsAfter, state);
+                         indicesBefore, state);
     newYieldOp.getResultsMutable().assign(newYieldValues);
 
     // Replace loop results.
@@ -1023,16 +961,12 @@ struct ForeachThreadOpInterface
     // Gather new results of the ForeachThreadOp.
     SmallVector<Value> newResults;
     for (OpResult opResult : foreachThreadOp->getOpResults()) {
-      SmallVector<OpOperand *> insertDestOperands =
-          state.getAnalysisState().getAliasingOpOperand(opResult);
-      assert(insertDestOperands.size() == 1 &&
-             "expected exactly one aliasing OpOperand");
+      OpOperand *insertDest =
+          getInsertionDest(foreachThreadOp)[opResult.getResultNumber()];
       // Insert copies right before the PerformConcurrentlyOp terminator. They
       // should not be inside terminator (which would be the default insertion
       // point).
-      Value buffer = *state.getBuffer(b, *insertDestOperands.front(),
-                                      /*forceInPlace=*/llvm::None,
-                                      /*customCopyInsertionPoint=*/op);
+      Value buffer = state.getBuffer(b, insertDest->get());
       newResults.push_back(buffer);
     }
 
@@ -1089,7 +1023,7 @@ struct PerformConcurrentlyOpInterface
           PerformConcurrentlyOpInterface, PerformConcurrentlyOp> {
   LogicalResult bufferize(Operation *op, RewriterBase &b,
                           BufferizationState &state) const {
-    assert(false && "op does not have any tensor OpOperands / OpResults");
+    llvm_unreachable("op does not have any tensor OpOperands / OpResults");
     return failure();
   }
 };

diff  --git a/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp
index 0a48baab17e7..430b2f6df8aa 100644
--- a/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp
+++ b/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp
@@ -9,6 +9,7 @@
 #include "mlir/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.h"
 #include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
 #include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h"
+#include "mlir/Dialect/Bufferization/IR/Bufferization.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/SCF/SCF.h"
 #include "mlir/Dialect/Tensor/IR/Tensor.h"
@@ -51,11 +52,8 @@ struct CastOpInterface
     auto castOp = cast<tensor::CastOp>(op);
 
     // The result buffer still has the old (pre-cast) type.
-    FailureOr<Value> resultBuffer =
-        state.getBuffer(rewriter, castOp->getOpOperand(0) /*source*/);
-    if (failed(resultBuffer))
-      return failure();
-    auto sourceMemRefType = resultBuffer->getType().cast<BaseMemRefType>();
+    Value resultBuffer = state.getBuffer(rewriter, castOp.source());
+    auto sourceMemRefType = resultBuffer.getType().cast<BaseMemRefType>();
     Attribute memorySpace = sourceMemRefType.getMemorySpace();
     TensorType resultTensorType =
         castOp.getResult().getType().cast<TensorType>();
@@ -70,11 +68,11 @@ struct CastOpInterface
                                           layout, memorySpace);
 
     // Replace the op with a memref.cast.
-    assert(memref::CastOp::areCastCompatible(resultBuffer->getType(),
+    assert(memref::CastOp::areCastCompatible(resultBuffer.getType(),
                                              resultMemRefType) &&
            "CallOp::bufferize: cast incompatible");
     replaceOpWithNewBufferizedOp<memref::CastOp>(rewriter, op, resultMemRefType,
-                                                 *resultBuffer);
+                                                 resultBuffer);
 
     return success();
   }
@@ -110,14 +108,11 @@ struct CollapseShapeOpInterface
                           BufferizationState &state) const {
     auto collapseShapeOp = cast<tensor::CollapseShapeOp>(op);
     RankedTensorType tensorResultType = collapseShapeOp.getResultType();
-    OpOperand &srcOperand = collapseShapeOp->getOpOperand(0) /*src*/;
-    auto bufferType = state.getBufferType(srcOperand.get()).cast<MemRefType>();
+    Value buffer = state.getBuffer(rewriter, collapseShapeOp.src());
+    auto bufferType = buffer.getType().cast<MemRefType>();
 
     if (tensorResultType.getRank() == 0) {
       // 0-d collapses must go through a different op builder.
-      auto buffer = state.getBuffer(rewriter, srcOperand);
-      if (failed(buffer))
-        return failure();
       MemRefType resultType;
 
       if (bufferType.getLayout().isIdentity()) {
@@ -140,7 +135,7 @@ struct CollapseShapeOpInterface
       }
 
       replaceOpWithNewBufferizedOp<memref::CollapseShapeOp>(
-          rewriter, op, resultType, *buffer, collapseShapeOp.reassociation());
+          rewriter, op, resultType, buffer, collapseShapeOp.reassociation());
       return success();
     }
 
@@ -149,18 +144,23 @@ struct CollapseShapeOpInterface
     // newly allocated buffer will have no layout map and thus be collapsible.
     bool canBeCollapsed = memref::CollapseShapeOp::isGuaranteedCollapsible(
         bufferType, collapseShapeOp.getReassociationIndices());
-    Optional<BufferizationState::ForceInPlacability> overrideInPlace =
-        canBeCollapsed
-            ? None
-            : Optional<BufferizationState::ForceInPlacability>(
-                  BufferizationState::ForceInPlacability::FORCE_OUT_OF_PLACE);
-    auto buffer = state.getBuffer(rewriter, srcOperand, overrideInPlace);
-    if (failed(buffer))
-      return failure();
+    if (!canBeCollapsed) {
+      // TODO: Create alloc_tensor ops during TensorCopyInsertion.
+      AnalysisState analysisState(state.getOptions());
+      Value tensorAlloc = allocateTensorForShapedValue(
+          rewriter, op->getLoc(), collapseShapeOp.src(),
+          analysisState.isTensorYielded(collapseShapeOp.result()));
+      auto memrefType =
+          MemRefType::get(collapseShapeOp.getSrcType().getShape(),
+                          collapseShapeOp.getSrcType().getElementType(),
+                          AffineMap(), bufferType.getMemorySpaceAsInt());
+      buffer = rewriter.create<bufferization::ToMemrefOp>(
+          op->getLoc(), memrefType, tensorAlloc);
+    }
 
     // Result type is inferred by the builder.
     replaceOpWithNewBufferizedOp<memref::CollapseShapeOp>(
-        rewriter, op, *buffer, collapseShapeOp.getReassociationIndices());
+        rewriter, op, buffer, collapseShapeOp.getReassociationIndices());
     return success();
   }
 };
@@ -187,11 +187,8 @@ struct DimOpInterface
   LogicalResult bufferize(Operation *op, RewriterBase &rewriter,
                           BufferizationState &state) const {
     auto dimOp = cast<tensor::DimOp>(op);
-    auto v = state.getBuffer(rewriter, dimOp->getOpOperand(0) /*source*/);
-    if (failed(v))
-      return failure();
-    replaceOpWithNewBufferizedOp<memref::DimOp>(rewriter, op, *v,
-                                                dimOp.index());
+    auto v = state.getBuffer(rewriter, dimOp.source());
+    replaceOpWithNewBufferizedOp<memref::DimOp>(rewriter, op, v, dimOp.index());
     return success();
   }
 };
@@ -226,15 +223,12 @@ struct ExpandShapeOpInterface
                           BufferizationState &state) const {
     auto expandShapeOp = cast<tensor::ExpandShapeOp>(op);
     auto tensorResultType = expandShapeOp.getResultType();
-    auto buffer =
-        state.getBuffer(rewriter, expandShapeOp->getOpOperand(0) /*src*/);
-    if (failed(buffer))
-      return failure();
+    auto buffer = state.getBuffer(rewriter, expandShapeOp.src());
 
     // Memref result type is inferred by the builder based on reassociation
     // indices and result shape.
     replaceOpWithNewBufferizedOp<memref::ExpandShapeOp>(
-        rewriter, op, tensorResultType.getShape(), *buffer,
+        rewriter, op, tensorResultType.getShape(), buffer,
         expandShapeOp.getReassociationIndices());
     return success();
   }
@@ -273,34 +267,18 @@ struct ExtractSliceOpInterface
 
     // Even if this op was decided to bufferize out-of-place, do not insert the
     // buffer copy yet. This is done later in this function.
-    auto srcMemref =
-        state.getBuffer(rewriter, extractSliceOp->getOpOperand(0) /*source*/,
-                        BufferizationState::ForceInPlacability::FORCE_INPLACE);
-    if (failed(srcMemref))
-      return failure();
-    auto srcMemrefType = srcMemref->getType().cast<MemRefType>();
+    auto srcMemref = state.getBuffer(rewriter, extractSliceOp.source());
+    auto srcMemrefType = srcMemref.getType().cast<MemRefType>();
     auto dstTensorType =
         extractSliceOp.result().getType().cast<RankedTensorType>();
 
-    // If not inplaceable, alloc.
-    bool inplace =
-        state.getAnalysisState().isInPlace(extractSliceOp->getOpOperand(0));
-    Value alloc;
-    if (!inplace) {
-      FailureOr<Value> allocOrFailure =
-          state.createAlloc(rewriter, loc, extractSliceOp.result());
-      if (failed(allocOrFailure))
-        return failure();
-      alloc = *allocOrFailure;
-    }
-
     // Expand offsets, sizes and strides to the full rank to handle the
     // rank-reducing case.
     SmallVector<OpFoldResult> mixedOffsets = extractSliceOp.getMixedOffsets();
     SmallVector<OpFoldResult> mixedSizes = extractSliceOp.getMixedSizes();
     SmallVector<OpFoldResult> mixedStrides = extractSliceOp.getMixedStrides();
     OffsetSizeAndStrideOpInterface::expandToRank(
-        *srcMemref, mixedOffsets, mixedSizes, mixedStrides,
+        srcMemref, mixedOffsets, mixedSizes, mixedStrides,
         [&](Value target, int64_t dim) -> OpFoldResult {
           auto shapedType = target.getType().cast<ShapedType>();
           if (shapedType.isDynamicDim(dim))
@@ -313,19 +291,9 @@ struct ExtractSliceOpInterface
                                  mixedOffsets, mixedSizes, mixedStrides)
                                  .cast<MemRefType>();
     Value subView = rewriter.create<memref::SubViewOp>(
-        loc, subviewMemRefType, *srcMemref, mixedOffsets, mixedSizes,
+        loc, subviewMemRefType, srcMemref, mixedOffsets, mixedSizes,
         mixedStrides);
 
-    // If not inplaceable, copy.
-    if (!inplace) {
-      // Do not copy if the copied data is never read.
-      if (state.getAnalysisState().isValueRead(extractSliceOp.result()))
-        if (failed(state.getOptions().createMemCpy(
-                rewriter, extractSliceOp.getLoc(), subView, alloc)))
-          return failure();
-      subView = alloc;
-    }
-
     replaceOpWithBufferizedValues(rewriter, op, subView);
     return success();
   }
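
With the in-place/out-of-place branch gone, the whole extract_slice
lowering reduces to a buffer lookup, the rank expansion above, and a
single `memref.subview`; any copy of the source that the analysis deems
necessary now arrives pre-materialized as a
`bufferization.alloc_tensor`. A condensed sketch (assuming the expanded
`offsets`, `sizes`, and `strides` produced by the callback above; this
is a paraphrase of the hunk, not the literal patch):

    LogicalResult bufferizeExtractSlice(tensor::ExtractSliceOp op,
                                        RewriterBase &rewriter,
                                        BufferizationState &state,
                                        ArrayRef<OpFoldResult> offsets,
                                        ArrayRef<OpFoldResult> sizes,
                                        ArrayRef<OpFoldResult> strides) {
      Value src = state.getBuffer(rewriter, op.source());
      // Infer the rank-reduced subview type from the source buffer.
      auto subviewType =
          memref::SubViewOp::inferRankReducedResultType(
              op.result().getType().cast<RankedTensorType>().getRank(),
              src.getType().cast<MemRefType>(), offsets, sizes, strides)
              .cast<MemRefType>();
      Value subview = rewriter.create<memref::SubViewOp>(
          op.getLoc(), subviewType, src, offsets, sizes, strides);
      replaceOpWithBufferizedValues(rewriter, op, subview);
      return success();
    }
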
@@ -353,11 +321,8 @@ struct ExtractOpInterface
   LogicalResult bufferize(Operation *op, RewriterBase &rewriter,
                           BufferizationState &state) const {
     auto extractOp = cast<tensor::ExtractOp>(op);
-    auto srcMemref =
-        state.getBuffer(rewriter, extractOp->getOpOperand(0) /*tensor*/);
-    if (failed(srcMemref))
-      return failure();
-    replaceOpWithNewBufferizedOp<memref::LoadOp>(rewriter, op, *srcMemref,
+    Value srcMemref = state.getBuffer(rewriter, extractOp.tensor());
+    replaceOpWithNewBufferizedOp<memref::LoadOp>(rewriter, op, srcMemref,
                                                  extractOp.indices());
     return success();
   }
@@ -397,11 +362,16 @@ struct FromElementsOpInterface
     Location loc = op->getLoc();
     auto tensorType = fromElementsOp.getType().cast<RankedTensorType>();
     auto shape = tensorType.getShape();
-    FailureOr<Value> maybeBuffer =
-        state.createAlloc(rewriter, loc, fromElementsOp.result());
-    if (failed(maybeBuffer))
-      return failure();
-    Value buffer = *maybeBuffer;
+    // TODO: Create alloc_tensor ops during TensorCopyInsertion.
+    AnalysisState analysisState(state.getOptions());
+    Value tensorAlloc = allocateTensorForShapedValue(
+        rewriter, loc, fromElementsOp.result(),
+        analysisState.isTensorYielded(fromElementsOp.result()),
+        /*copy=*/false);
+    auto memrefType =
+        MemRefType::get(tensorType.getShape(), tensorType.getElementType());
+    Value buffer = rewriter.create<bufferization::ToMemrefOp>(
+        op->getLoc(), memrefType, tensorAlloc);
 
     // Case: tensor<0xelem_type>.
     if (fromElementsOp.elements().empty()) {
@@ -442,15 +412,19 @@ struct GenerateOpInterface
   LogicalResult bufferize(Operation *op, RewriterBase &rewriter,
                           BufferizationState &state) const {
     auto generateOp = cast<tensor::GenerateOp>(op);
-
+    auto tensorType = generateOp.getType().cast<RankedTensorType>();
     // Allocate memory.
     Location loc = op->getLoc();
-    FailureOr<Value> maybeResult =
-        state.createAlloc(rewriter, loc, generateOp.result());
-    if (failed(maybeResult))
-      return failure();
-    Value result = *maybeResult;
-    MemRefType memrefType = result.getType().cast<MemRefType>();
+    // TODO: Create alloc_tensor ops during TensorCopyInsertion.
+    AnalysisState analysisState(state.getOptions());
+    Value tensorAlloc = allocateTensorForShapedValue(
+        rewriter, loc, generateOp.result(),
+        analysisState.isTensorYielded(generateOp.result()),
+        /*copy=*/false);
+    auto memrefType =
+        MemRefType::get(tensorType.getShape(), tensorType.getElementType());
+    Value buffer = rewriter.create<bufferization::ToMemrefOp>(
+        op->getLoc(), memrefType, tensorAlloc);
 
     // Collect loop bounds.
     int64_t rank = memrefType.getRank();
@@ -483,10 +457,10 @@ struct GenerateOpInterface
     Operation *elementYield = parallelBody->getTerminator()->getPrevNode();
     rewriter.setInsertionPointAfter(elementYield);
     rewriter.replaceOpWithNewOp<memref::StoreOp>(
-        elementYield, elementYield->getOperands()[0], result,
+        elementYield, elementYield->getOperands()[0], buffer,
         parallelBody->getArguments());
 
-    replaceOpWithBufferizedValues(rewriter, op, result);
+    replaceOpWithBufferizedValues(rewriter, op, buffer);
     return success();
   }
 };
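
`from_elements` and `generate` differ from the ops above in that they
produce a brand-new tensor rather than transforming an existing one, so
both pass `/*copy=*/false`: storage must be allocated, but there are no
initial contents to preserve. The `isTensorYielded` query feeds the
allocation's escape behavior (presumably controlling whether a dealloc
may later be inserted). The shared allocation step, as a sketch (`op`
stands for either op, `loc` for its location):

    // Allocate an uninitialized tensor for the op's result, then
    // convert it to a memref to be filled with stores.
    AnalysisState analysisState(state.getOptions());
    Value tensorAlloc = allocateTensorForShapedValue(
        rewriter, loc, op.result(),
        analysisState.isTensorYielded(op.result()),
        /*copy=*/false);
    auto tensorType = op.getType().cast<RankedTensorType>();
    Value buffer = rewriter.create<bufferization::ToMemrefOp>(
        loc, MemRefType::get(tensorType.getShape(),
                             tensorType.getElementType()),
        tensorAlloc);
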
@@ -521,13 +495,10 @@ struct InsertOpInterface
   LogicalResult bufferize(Operation *op, RewriterBase &rewriter,
                           BufferizationState &state) const {
     auto insertOp = cast<tensor::InsertOp>(op);
-    FailureOr<Value> destMemref =
-        state.getBuffer(rewriter, insertOp->getOpOperand(1) /*dest*/);
-    if (failed(destMemref))
-      return failure();
+    Value destMemref = state.getBuffer(rewriter, insertOp.dest());
     rewriter.create<memref::StoreOp>(insertOp.getLoc(), insertOp.scalar(),
-                                     *destMemref, insertOp.indices());
-    replaceOpWithBufferizedValues(rewriter, op, *destMemref);
+                                     destMemref, insertOp.indices());
+    replaceOpWithBufferizedValues(rewriter, op, destMemref);
     return success();
   }
 
@@ -682,12 +653,7 @@ struct InsertSliceOpInterface
     // TODO: be very loud about it or even consider failing the pass.
     auto insertSliceOp = cast<tensor::InsertSliceOp>(op);
     Location loc = insertSliceOp.getLoc();
-
-    // When bufferizing out-of-place, `getResultBuffer` allocates.
-    FailureOr<Value> dstMemref =
-        state.getBuffer(rewriter, insertSliceOp->getOpOperand(1) /*dest*/);
-    if (failed(dstMemref))
-      return failure();
+    Value dstMemref = state.getBuffer(rewriter, insertSliceOp.dest());
 
     // Expand offsets, sizes and strides to the full rank to handle the
     // rank-reducing case.
@@ -695,7 +661,7 @@ struct InsertSliceOpInterface
     SmallVector<OpFoldResult> mixedSizes = insertSliceOp.getMixedSizes();
     SmallVector<OpFoldResult> mixedStrides = insertSliceOp.getMixedStrides();
     OffsetSizeAndStrideOpInterface::expandToRank(
-        *dstMemref, mixedOffsets, mixedSizes, mixedStrides,
+        dstMemref, mixedOffsets, mixedSizes, mixedStrides,
         [&](Value target, int64_t dim) -> OpFoldResult {
           auto shapedType = target.getType().cast<ShapedType>();
           if (shapedType.isDynamicDim(dim))
@@ -703,25 +669,24 @@ struct InsertSliceOpInterface
           return rewriter.getIndexAttr(shapedType.getDimSize(dim));
         });
     // Take a subview of the dst.
-    auto dstMemrefType = dstMemref->getType().cast<MemRefType>();
+    auto dstMemrefType = dstMemref.getType().cast<MemRefType>();
     auto subviewMemRefType =
         memref::SubViewOp::inferRankReducedResultType(
             insertSliceOp.getSourceType().getRank(), dstMemrefType,
             mixedOffsets, mixedSizes, mixedStrides)
             .cast<MemRefType>();
     Value subView = rewriter.create<memref::SubViewOp>(
-        loc, subviewMemRefType, *dstMemref, mixedOffsets, mixedSizes,
+        loc, subviewMemRefType, dstMemref, mixedOffsets, mixedSizes,
         mixedStrides);
 
     // Copy tensor. If this tensor.insert_slice has a matching
     // tensor.extract_slice, the copy operation will eventually fold away.
-    auto srcMemref =
-        state.getBuffer(rewriter, insertSliceOp->getOpOperand(0) /*source*/);
-    if (failed(srcMemref) || failed(state.getOptions().createMemCpy(
-                                 rewriter, loc, *srcMemref, subView)))
+    auto srcMemref = state.getBuffer(rewriter, insertSliceOp.source());
+    if (failed(
+            state.getOptions().createMemCpy(rewriter, loc, srcMemref, subView)))
       return failure();
 
-    replaceOpWithBufferizedValues(rewriter, op, *dstMemref);
+    replaceOpWithBufferizedValues(rewriter, op, dstMemref);
     return success();
   }
 };
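
Note that insert_slice keeps one `failed(...)` check: `createMemCpy` is
the copy strategy configured in the bufferization options and may still
fail independently of the analysis. The copy step in isolation (a
minimal sketch; `src` and `dst` are assumed to be memrefs of compatible
shape):

    // Copy `src` into the destination subview using the configured
    // memcpy lowering; propagate failure if no copy can be built.
    if (failed(state.getOptions().createMemCpy(rewriter, loc, src, dst)))
      return failure();
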
@@ -748,11 +713,9 @@ struct RankOpInterface
   LogicalResult bufferize(Operation *op, RewriterBase &rewriter,
                           BufferizationState &state) const {
     auto rankOp = cast<tensor::RankOp>(op);
-    auto v = state.getBuffer(rewriter, rankOp->getOpOperand(0) /*source*/);
-    if (failed(v))
-      return failure();
+    auto v = state.getBuffer(rewriter, rankOp.tensor());
     replaceOpWithNewBufferizedOp<memref::RankOp>(rewriter, op, rankOp.getType(),
-                                                 *v);
+                                                 v);
     return success();
   }
 };
@@ -786,21 +749,12 @@ struct ReshapeOpInterface
   LogicalResult bufferize(Operation *op, RewriterBase &rewriter,
                           BufferizationState &state) const {
     auto reshapeOp = cast<tensor::ReshapeOp>(op);
-    auto &srcOperand = reshapeOp->getOpOperand(0);
-    auto srcBuffer = state.getBuffer(rewriter, srcOperand);
-    if (failed(srcBuffer))
-      return failure();
-
-    auto &shapeOperand = reshapeOp->getOpOperand(1);
-    auto shapeBuffer = state.getBuffer(rewriter, shapeOperand);
-    if (failed(shapeBuffer))
-      return failure();
-
+    Value srcBuffer = state.getBuffer(rewriter, reshapeOp.source());
+    Value shapeBuffer = state.getBuffer(rewriter, reshapeOp.shape());
     auto resultTensorType = reshapeOp.getResult().getType().cast<TensorType>();
     auto resultMemRefType = getMemRefType(resultTensorType, state.getOptions());
-
     replaceOpWithNewBufferizedOp<memref::ReshapeOp>(
-        rewriter, op, resultMemRefType, *srcBuffer, *shapeBuffer);
+        rewriter, op, resultMemRefType, srcBuffer, shapeBuffer);
     return success();
   }
 };

diff --git a/mlir/lib/Dialect/Vector/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Dialect/Vector/Transforms/BufferizableOpInterfaceImpl.cpp
index dd834306c3e7..b7344ee79481 100644
--- a/mlir/lib/Dialect/Vector/Transforms/BufferizableOpInterfaceImpl.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/BufferizableOpInterfaceImpl.cpp
@@ -50,10 +50,7 @@ struct TransferReadOpInterface
     auto readOp = cast<vector::TransferReadOp>(op);
     assert(readOp.getShapedType().isa<TensorType>() &&
            "only tensor types expected");
-
-    // TransferReadOp always reads from the bufferized op.source().
-    Value buffer =
-        *state.getBuffer(rewriter, readOp->getOpOperand(0) /*source*/);
+    Value buffer = state.getBuffer(rewriter, readOp.getSource());
     replaceOpWithNewBufferizedOp<vector::TransferReadOp>(
         rewriter, readOp, readOp.getVectorType(), buffer, readOp.getIndices(),
         readOp.getPermutationMap(), readOp.getPadding(), readOp.getMask(),
@@ -100,17 +97,12 @@ struct TransferWriteOpInterface
            "only tensor types expected");
 
     // Create a new transfer_write on buffer that doesn't have a return value.
-    // Leave the previous transfer_write to dead code as it still has uses at
-    // this point.
-    FailureOr<Value> resultBuffer =
-        state.getBuffer(rewriter, op->getOpOperand(1) /*source*/);
-    if (failed(resultBuffer))
-      return failure();
+    Value resultBuffer = state.getBuffer(rewriter, writeOp.getSource());
     rewriter.create<vector::TransferWriteOp>(
-        writeOp.getLoc(), writeOp.getVector(), *resultBuffer,
+        writeOp.getLoc(), writeOp.getVector(), resultBuffer,
         writeOp.getIndices(), writeOp.getPermutationMapAttr(),
         writeOp.getInBoundsAttr());
-    replaceOpWithBufferizedValues(rewriter, op, *resultBuffer);
+    replaceOpWithBufferizedValues(rewriter, op, resultBuffer);
 
     return success();
   }
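
`vector.transfer_write` is the in-place archetype under the new scheme:
a fresh memref-based transfer_write is created on the destination
buffer, and the old op's tensor result is replaced by that same buffer.
Condensed sketch of the hunk above (a paraphrase, not the literal
patch):

    // Write into the destination buffer directly; the tensor result of
    // the original op becomes the buffer itself.
    Value buffer = state.getBuffer(rewriter, writeOp.getSource());
    rewriter.create<vector::TransferWriteOp>(
        writeOp.getLoc(), writeOp.getVector(), buffer, writeOp.getIndices(),
        writeOp.getPermutationMapAttr(), writeOp.getInBoundsAttr());
    replaceOpWithBufferizedValues(rewriter, op, buffer);
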

diff --git a/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-alloc-tensor-elimination.mlir b/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-alloc-tensor-elimination.mlir
index dd7dd54b3a70..2bea701e24b0 100644
--- a/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-alloc-tensor-elimination.mlir
+++ b/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-alloc-tensor-elimination.mlir
@@ -9,18 +9,17 @@ func.func @buffer_forwarding_conflict(
     -> (tensor<?xf32>, tensor<?xf32>)
 {
   %f0 = arith.constant 0.0: f32
+
+  //     CHECK: %[[EXTRACT_SLICE_ALLOC:.*]] = memref.alloc(%[[sz]])
+  //     CHECK: linalg.fill ins({{.*}} : f32) outs(%[[EXTRACT_SLICE_ALLOC]] : memref<?xf32>)
   // Alloc is needed for the **first** insert_slice (due to backward traversal during analysis).
   //     CHECK: %[[DIM:.*]] = memref.dim %[[FUNC_ARG]]
   // This allocs the whole dim to allow for a full clone of t.
   //     CHECK: %[[ALLOC:.*]] = memref.alloc(%[[DIM]])
-
   // alloc_tensor itself does not alloc but forwards to the **second**
   // insert_slice. AllocTensorOp replaces the alloc_tensor with an out-of-place
   // extract_slice.
-  //     CHECK: %[[EXTRACT_SLICE_ALLOC:.*]] = memref.alloc(%[[sz]])
   %a = bufferization.alloc_tensor(%sz) : tensor<?xf32>
-
-  //     CHECK: linalg.fill ins({{.*}} : f32) outs(%[[EXTRACT_SLICE_ALLOC]] : memref<?xf32>)
   %f = linalg.fill ins(%f0 : f32) outs(%a : tensor<?xf32>) -> tensor<?xf32>
 
   //     CHECK: memref.copy %[[FUNC_ARG]], %[[ALLOC]] : memref<?xf32> to memref<?xf32>

diff --git a/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-partial.mlir b/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-partial.mlir
index 1eb1b4cac9f6..0874912323c5 100644
--- a/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-partial.mlir
+++ b/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-partial.mlir
@@ -8,8 +8,8 @@
 // RUN: mlir-opt %s -allow-unregistered-dialect -one-shot-bufferize="allow-return-allocs test-analysis-only analysis-fuzzer-seed=59" -split-input-file -o /dev/null
 // RUN: mlir-opt %s -allow-unregistered-dialect -one-shot-bufferize="allow-return-allocs test-analysis-only analysis-fuzzer-seed=91" -split-input-file -o /dev/null
 
-// RUN: mlir-opt %s -allow-unregistered-dialect -one-shot-bufferize="dialect-filter=tensor allow-unknown-ops allow-return-allocs" -canonicalize -split-input-file | FileCheck %s --check-prefix=CHECK-TENSOR
-// RUN: mlir-opt %s -allow-unregistered-dialect -one-shot-bufferize="dialect-filter=scf allow-unknown-ops allow-return-allocs" -canonicalize -split-input-file | FileCheck %s --check-prefix=CHECK-SCF
+// RUN: mlir-opt %s -allow-unregistered-dialect -one-shot-bufferize="dialect-filter=tensor,bufferization allow-unknown-ops allow-return-allocs" -canonicalize -split-input-file | FileCheck %s --check-prefix=CHECK-TENSOR
+// RUN: mlir-opt %s -allow-unregistered-dialect -one-shot-bufferize="dialect-filter=scf,bufferization allow-unknown-ops allow-return-allocs" -canonicalize -split-input-file | FileCheck %s --check-prefix=CHECK-SCF
 
 // CHECK: #[[$MAP:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>
 
@@ -141,14 +141,13 @@ func.func @unknown_op_may_read(%v: vector<5xf32>)
   // One alloc for the alloc_tensor, another one because the transfer_write
   // bufferizes out-of-place.
   // CHECK: %[[m1:.*]] = memref.alloc() {{.*}} : memref<10xf32>
-  // CHECK: %[[alloc:.*]] = memref.alloc() {{.*}} : memref<10xf32>
-  %t1 = bufferization.alloc_tensor() : tensor<10xf32>
-
   // CHECK: linalg.fill ins(%{{.*}}{{.*}}outs(%[[m1]]
   // CHECK: %[[filled_tensor:.*]] = bufferization.to_tensor %[[m1]]
+  %t1 = bufferization.alloc_tensor() : tensor<10xf32>
   %filled = linalg.fill ins(%cst : f32) outs(%t1 : tensor<10xf32>) -> tensor<10xf32>
 
   // The transfer_write is out-of-place because "dummy_op" may read.
+  // CHECK: %[[alloc:.*]] = memref.alloc() {{.*}} : memref<10xf32>
   // CHECK: memref.copy %[[m1]], %[[alloc]]
   // CHECK: vector.transfer_write %{{.*}}, %[[alloc]]
   // CHECK: %[[alloc_tensor:.*]] = bufferization.to_tensor %[[alloc]]
@@ -193,10 +192,10 @@ func.func @simple_tensor_test(%t1 : tensor<?xf32>, %f : f32) -> tensor<?xf32> {
   // CHECK-TENSOR: %[[t1_memref:.*]] = bufferization.to_memref %[[t1]]
   %c0 = arith.constant 0 : index
   // CHECK-TENSOR: %[[alloc:.*]] = memref.alloc
-  // CHECK-TENSOR: %[[casted_alloc:.*]] = bufferization.to_tensor %[[alloc]]
   // CHECK-TENSOR: memref.copy %[[t1_memref]], %[[alloc]]
   // CHECK-TENSOR: memref.store %{{.*}}, %[[alloc]]
   %0 = tensor.insert %f into %t1[%c0] : tensor<?xf32>
+  // CHECK-TENSOR: %[[casted_alloc:.*]] = bufferization.to_tensor %[[alloc]]
   // CHECK-TENSOR: return %[[casted_alloc]]
   return %0 : tensor<?xf32>
 }

diff --git a/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize-allow-return-allocs.mlir b/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize-allow-return-allocs.mlir
index 3d682c98d3f1..175803fc7c59 100644
--- a/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize-allow-return-allocs.mlir
+++ b/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize-allow-return-allocs.mlir
@@ -48,12 +48,11 @@ func.func @return_slice(%t: tensor<?xf32>, %sz: index) -> (tensor<?xf32>) {
 
 // CHECK-LABEL: func @main(
 //  CHECK-SAME:     %[[t:.*]]: memref<?xf32
+//       CHECK:   %[[call:.*]] = call @return_slice(%[[t]]
 //       CHECK:   %[[alloc:.*]] = memref.alloc
-//   CHECK-DAG:   memref.copy %[[t]], %[[alloc]]
-//   CHECK-DAG:   %[[casted:.*]] = memref.cast %[[alloc]]
-//       CHECK:   %[[call:.*]] = call @return_slice(%[[casted]]
+//       CHECK:   memref.copy %[[call]], %[[alloc]]
 //       CHECK:   linalg.fill ins({{.*}}) outs(%[[t]]
-//       CHECK:   memref.load %[[call]]
+//       CHECK:   memref.load %[[alloc]]
 //       CHECK:   memref.load %[[t]]
 func.func @main(%t: tensor<?xf32>, %sz: index, %idx: index) -> (f32, f32) {
   %cst = arith.constant 1.0 : f32

diff --git a/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize.mlir b/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize.mlir
index 470caa175f6e..79535bbe9e7d 100644
--- a/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize.mlir
+++ b/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize.mlir
@@ -528,12 +528,12 @@ func.func @entry(%A : tensor<?xf32> {bufferization.buffer_layout = affine_map<(i
 // conflict. However, inside `entry`, the writes do cause a conflict because
 // %A, %B and %C are not inplaceable. This test case shows that this kind of
 // conflict detection has a "transitive" nature.
-//  CHECK-DAG: %[[ALLOC_C:.*]] = memref.alloc
-//  CHECK-DAG: %[[CASTED_C:.*]] = memref.cast %[[ALLOC_C]]
-//  CHECK-DAG: %[[ALLOC_B:.*]] = memref.alloc
-//  CHECK-DAG: %[[CASTED_B:.*]] = memref.cast %[[ALLOC_B]]
 //  CHECK-DAG: %[[ALLOC_A:.*]] = memref.alloc
 //  CHECK-DAG: %[[CASTED_A:.*]] = memref.cast %[[ALLOC_A]]
+//  CHECK-DAG: %[[ALLOC_B:.*]] = memref.alloc
+//  CHECK-DAG: %[[CASTED_B:.*]] = memref.cast %[[ALLOC_B]]
+//  CHECK-DAG: %[[ALLOC_C:.*]] = memref.alloc
+//  CHECK-DAG: %[[CASTED_C:.*]] = memref.cast %[[ALLOC_C]]
 //  CHECK-DAG: memref.copy %[[A]], %[[ALLOC_A]]
 //  CHECK-DAG: memref.copy %[[B]], %[[ALLOC_B]]
 //  CHECK-DAG: memref.copy %[[C]], %[[ALLOC_C]]

diff --git a/mlir/test/Dialect/Linalg/bufferize.mlir b/mlir/test/Dialect/Linalg/bufferize.mlir
index c15d3cd86bf9..eb89907d71b0 100644
--- a/mlir/test/Dialect/Linalg/bufferize.mlir
+++ b/mlir/test/Dialect/Linalg/bufferize.mlir
@@ -71,8 +71,8 @@ func.func @init_tensor(%in : tensor<?xf32>, %size: index) -> tensor<?xf32> {
 #map0 = affine_map<(d0) -> (d0)>
 
 // CHECK-LABEL:   func @multiple_results
-// CHECK:           %[[RESULT1:.*]] = memref.alloc() {{.*}} : memref<4xf32>
 // CHECK:           %[[RESULT0:.*]] = memref.alloc() {{.*}} : memref<4xf32>
+// CHECK:           %[[RESULT1:.*]] = memref.alloc() {{.*}} : memref<4xf32>
 // CHECK:           linalg.generic
 // CHECK-SAME:      ins(%{{.*}} : memref<4xf32>)
 // CHECK-SAME:      outs(%[[RESULT0]], %[[RESULT1]] : memref<4xf32>, memref<4xf32>)
@@ -101,11 +101,11 @@ func.func @multiple_results(%arg0: tensor<4xf32>) -> (tensor<4xf32>, tensor<4xf3
 // CHECK-SAME:                          %[[ARG:.*]]: tensor<?x?xf32>
 // CHECK-DAG:       %[[C0:.*]] = arith.constant 0 : index
 // CHECK-DAG:       %[[C1:.*]] = arith.constant 1 : index
-// CHECK:           %[[DIM0:.*]] = tensor.dim %[[ARG]], %[[C0]] : tensor<?x?xf32>
-// CHECK:           %[[DIM1:.*]] = tensor.dim %[[ARG]], %[[C1]] : tensor<?x?xf32>
-// CHECK:           %[[RESULT1:.*]] = memref.alloc(%[[DIM0]], %[[DIM1]]) {{.*}} : memref<?x?xf32>
-// CHECK:           %[[RESULT0:.*]] = memref.alloc(%[[DIM0]], %[[DIM1]]) {{.*}} : memref<?x?xf32>
-// CHECK:           %[[MEMREF_ARG:.*]] = bufferization.to_memref %[[ARG]] : memref<?x?xf32>
+// CHECK-DAG:       %[[DIM0:.*]] = tensor.dim %[[ARG]], %[[C0]] : tensor<?x?xf32>
+// CHECK-DAG:       %[[DIM1:.*]] = tensor.dim %[[ARG]], %[[C1]] : tensor<?x?xf32>
+// CHECK-DAG:       %[[RESULT0:.*]] = memref.alloc(%[[DIM0]], %[[DIM1]]) {{.*}} : memref<?x?xf32>
+// CHECK-DAG:       %[[RESULT1:.*]] = memref.alloc(%[[DIM0]], %[[DIM1]]) {{.*}} : memref<?x?xf32>
+// CHECK-DAG:       %[[MEMREF_ARG:.*]] = bufferization.to_memref %[[ARG]] : memref<?x?xf32>
 // CHECK:           linalg.generic
 // CHECK-SAME:      ins(%[[MEMREF_ARG]] : memref<?x?xf32>)
 // CHECK-SAME:      outs(%[[RESULT0]], %[[RESULT1]] : memref<?x?xf32>, memref<?x?xf32>)

diff --git a/mlir/test/Dialect/SCF/one-shot-bufferize.mlir b/mlir/test/Dialect/SCF/one-shot-bufferize.mlir
index 2eacdd375610..d06770411275 100644
--- a/mlir/test/Dialect/SCF/one-shot-bufferize.mlir
+++ b/mlir/test/Dialect/SCF/one-shot-bufferize.mlir
@@ -481,23 +481,20 @@ func.func @scf_while_non_equiv_condition_and_body(%arg0: tensor<5xi1>,
 
 // CHECK-LABEL: func @scf_while_iter_arg_result_mismatch(
 //  CHECK-SAME:     %[[arg0:.*]]: memref<5xi1, #{{.*}}>, %[[arg1:.*]]: memref<5xi1, #{{.*}}>
-//       CHECK:   %[[alloc2:.*]] = memref.alloc() {{.*}} : memref<5xi1>
 //       CHECK:   %[[clone:.*]] = bufferization.clone %[[arg1]]
 //       CHECK:   scf.while (%[[arg3:.*]] = %[[clone]]) : (memref<5xi1, #{{.*}}) -> () {
 //   CHECK-DAG:     memref.dealloc %[[arg3]]
 //   CHECK-DAG:     %[[load:.*]] = memref.load %[[arg0]]
 //       CHECK:     scf.condition(%[[load]])
 //       CHECK:   } do {
+//       CHECK:     %[[alloc2:.*]] = memref.alloc() {{.*}} : memref<5xi1>
 //       CHECK:     memref.copy %[[arg0]], %[[alloc2]]
 //       CHECK:     memref.store %{{.*}}, %[[alloc2]]
-//       CHECK:     %[[alloc1:.*]] = memref.alloc() {{.*}} : memref<5xi1>
-//       CHECK:     memref.copy %[[alloc2]], %[[alloc1]]
-//       CHECK:     %[[casted:.*]] = memref.cast %[[alloc1]] : memref<5xi1> to memref<5xi1, #{{.*}}>
+//       CHECK:     %[[casted:.*]] = memref.cast %[[alloc2]] : memref<5xi1> to memref<5xi1, #{{.*}}>
 //       CHECK:     %[[cloned:.*]] = bufferization.clone %[[casted]]
-//       CHECK:     memref.dealloc %[[alloc1]]
+//       CHECK:     memref.dealloc %[[alloc2]]
 //       CHECK:     scf.yield %[[cloned]]
 //       CHECK:   }
-//   CHECK-DAG:   memref.dealloc %[[alloc2]]
 func.func @scf_while_iter_arg_result_mismatch(%arg0: tensor<5xi1>,
                                               %arg1: tensor<5xi1>,
                                               %arg2: index) {

diff --git a/mlir/test/Dialect/Tensor/one-shot-bufferize.mlir b/mlir/test/Dialect/Tensor/one-shot-bufferize.mlir
index 0d05f0ff9264..7249d546de44 100644
--- a/mlir/test/Dialect/Tensor/one-shot-bufferize.mlir
+++ b/mlir/test/Dialect/Tensor/one-shot-bufferize.mlir
@@ -22,24 +22,22 @@ func.func @insert_slice_fun(
     %t1 : tensor<4xf32> {bufferization.writable = true})
   ->  (tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>)
 {
-  // Hoisted allocs.
-  //      CHECK: %[[REALLOC1:.*]] = memref.alloc
-  //      CHECK: %[[REALLOC2:.*]] = memref.alloc
-  //      CHECK: %[[REALLOC3:.*]] = memref.alloc
-
   // Alloc and copy the whole result tensor. Copy the tensor.extract_slice.
+  //      CHECK: %[[REALLOC3:.*]] = memref.alloc
   //      CHECK: memref.copy %[[A0]], %[[REALLOC3]]
   //      CHECK: %[[SV_A0:.*]] = memref.subview %[[REALLOC3]]
   //      CHECK: memref.copy %[[t0]], %[[SV_A0]]
   %r0 = tensor.insert_slice %t0 into %A0[0][4][1] : tensor<4xf32> into tensor<?xf32>
 
   // Alloc and copy the whole result tensor. Copy the tensor.extract_slice.
+  //      CHECK: %[[REALLOC2:.*]] = memref.alloc
   //      CHECK: memref.copy %[[A0]]
   //      CHECK: %[[SV_A0_2:.*]] = memref.subview %[[REALLOC2]]
   //      CHECK: memref.copy %[[t1]], %[[SV_A0_2]]
   %r1 = tensor.insert_slice %t1 into %A0[0][4][1] : tensor<4xf32> into tensor<?xf32>
 
   //  Still alloc the large tensor because %A1 is read after. Copy the tensor.extract_slice.
+  //      CHECK: %[[REALLOC1:.*]] = memref.alloc
   //      CHECK: memref.copy %[[A1]]
   //      CHECK: %[[SV_A1:.*]] = memref.subview %[[REALLOC1]]
   //      CHECK: memref.copy %[[t0]], %[[SV_A1]]
