[Mlir-commits] [mlir] 25bf6a2 - Revert "[mlir] Purge `linalg.copy` and use `memref.copy` instead."

Mon Jan 31 09:52:47 PST 2022

Author: Alexander Belyaev
Date: 2022-01-31T18:51:39+01:00
New Revision: 25bf6a2a9bc6ecb3792199490c70c4ce50a94aea

URL: https://github.com/llvm/llvm-project/commit/25bf6a2a9bc6ecb3792199490c70c4ce50a94aea
DIFF: https://github.com/llvm/llvm-project/commit/25bf6a2a9bc6ecb3792199490c70c4ce50a94aea.diff

LOG: Revert "[mlir] Purge `linalg.copy` and use `memref.copy` instead."

This reverts commit 016956b68081705ffee511c334e31e414fa1ddbf.
Reverting it to fix NVidia build without being in a hurry.

Added: 
    mlir/test/Dialect/Linalg/transform-patterns-matmul-to-vector.mlir

Modified: 
    mlir/docs/Dialects/Linalg/_index.md
    mlir/include/mlir/Conversion/LinalgToStandard/LinalgToStandard.h
    mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td
    mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td
    mlir/include/mlir/Dialect/Linalg/Passes.td
    mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
    mlir/lib/Conversion/LinalgToStandard/LinalgToStandard.cpp
    mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
    mlir/lib/Dialect/Linalg/Transforms/Bufferize.cpp
    mlir/lib/Dialect/Linalg/Transforms/ComprehensiveBufferizePass.cpp
    mlir/lib/Dialect/Linalg/Transforms/Fusion.cpp
    mlir/lib/Dialect/Linalg/Transforms/Promotion.cpp
    mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp
    mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
    mlir/lib/Dialect/Vector/Transforms/VectorTransferSplitRewritePatterns.cpp
    mlir/test/Dialect/Linalg/bufferize.mlir
    mlir/test/Dialect/Linalg/canonicalize.mlir
    mlir/test/Dialect/Linalg/forward-vector-transfers.mlir
    mlir/test/Dialect/Linalg/fusion-pattern.mlir
    mlir/test/Dialect/Linalg/loops.mlir
    mlir/test/Dialect/Linalg/promote.mlir
    mlir/test/Dialect/Linalg/promotion_options.mlir
    mlir/test/Dialect/Linalg/roundtrip.mlir
    mlir/test/Dialect/Linalg/standard.mlir
    mlir/test/Dialect/Linalg/transform-patterns.mlir
    mlir/test/Dialect/Linalg/vectorization.mlir
    mlir/test/Dialect/Vector/vector-transfer-full-partial-split.mlir
    mlir/test/Integration/Dialect/Linalg/CPU/benchmark_matmul.mlir
    mlir/test/Transforms/canonicalize.mlir
    mlir/test/lib/Dialect/Linalg/TestLinalgCodegenStrategy.cpp
    mlir/test/lib/Dialect/Linalg/TestLinalgTransforms.cpp
    mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp

Removed: 
    


################################################################################
diff  --git a/mlir/docs/Dialects/Linalg/_index.md b/mlir/docs/Dialects/Linalg/_index.md
index da902a395a68..790f858dad26 100644

--- a/mlir/docs/Dialects/Linalg/_index.md
+++ b/mlir/docs/Dialects/Linalg/_index.md
@@ -545,6 +545,7 @@ seem generally appealing.
 Additionally, `linalg` provides a small subset of commonly named operations:
 
 ```
+* `linalg.copy`,
 * `linalg.fill`,
 * `linalg.dot`,
 * `linalg.matmul`,

diff  --git a/mlir/include/mlir/Conversion/LinalgToStandard/LinalgToStandard.h b/mlir/include/mlir/Conversion/LinalgToStandard/LinalgToStandard.h
index ff538ac8ec1c..88323682d3ee 100644
--- a/mlir/include/mlir/Conversion/LinalgToStandard/LinalgToStandard.h
+++ b/mlir/include/mlir/Conversion/LinalgToStandard/LinalgToStandard.h
@@ -39,6 +39,25 @@ class LinalgOpToLibraryCallRewrite
                                 PatternRewriter &rewriter) const override;
 };
 
+/// Rewrite pattern specialization for CopyOp, kicks in when both input and
+/// output permutations are left unspecified or are the identity.
+class CopyOpToLibraryCallRewrite : public OpRewritePattern<CopyOp> {
+public:
+  using OpRewritePattern<CopyOp>::OpRewritePattern;
+  LogicalResult matchAndRewrite(CopyOp op,
+                                PatternRewriter &rewriter) const override;
+};
+
+/// Rewrite CopyOp with permutations into a sequence of TransposeOp and
+/// permutation-free CopyOp. This interplays with TransposeOpConversion and
+/// LinalgConversion<CopyOp> to create a path to the LLVM dialect.
+class CopyTransposeRewrite : public OpRewritePattern<CopyOp> {
+public:
+  using OpRewritePattern<CopyOp>::OpRewritePattern;
+  LogicalResult matchAndRewrite(CopyOp op,
+                                PatternRewriter &rewriter) const override;
+};
+
 /// Populate the given list with patterns that convert from Linalg to Standard.
 void populateLinalgToStandardConversionPatterns(RewritePatternSet &patterns);
 

diff  --git a/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td b/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td
index ad2062a45e77..9890383931cb 100644
--- a/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td
@@ -42,7 +42,7 @@ def BufferDeallocation : Pass<"buffer-deallocation", "FuncOp"> {
         }: memref<2xf32>, memref<2xf32>
         br ^bb3(%0 : memref<2xf32>)
       ^bb3(%1: memref<2xf32>):
-        "memref.copy"(%1, %arg2) : (memref<2xf32>, memref<2xf32>) -> ()
+        "linalg.copy"(%1, %arg2) : (memref<2xf32>, memref<2xf32>) -> ()
         return
       }
     }
@@ -58,7 +58,7 @@ def BufferDeallocation : Pass<"buffer-deallocation", "FuncOp"> {
         cond_br %arg0, ^bb1, ^bb2
       ^bb1:  // pred: ^bb0
         %0 = memref.alloc() : memref<2xf32>
-        memref.copy(%arg1, %0) : memref<2xf32>, memref<2xf32>
+        linalg.copy(%arg1, %0) : memref<2xf32>, memref<2xf32>
         br ^bb3(%0 : memref<2xf32>)
       ^bb2:  // pred: ^bb0
         %1 = memref.alloc() : memref<2xf32>
@@ -72,11 +72,11 @@ def BufferDeallocation : Pass<"buffer-deallocation", "FuncOp"> {
           linalg.yield %4 : f32
         }: memref<2xf32>, memref<2xf32>
         %2 = memref.alloc() : memref<2xf32>
-        memref.copy(%1, %2) : memref<2xf32>, memref<2xf32>
+        linalg.copy(%1, %2) : memref<2xf32>, memref<2xf32>
         dealloc %1 : memref<2xf32>
         br ^bb3(%2 : memref<2xf32>)
       ^bb3(%3: memref<2xf32>):  // 2 preds: ^bb1, ^bb2
-        memref.copy(%3, %arg2) : memref<2xf32>, memref<2xf32>
+        linalg.copy(%3, %arg2) : memref<2xf32>, memref<2xf32>
         dealloc %3 : memref<2xf32>
         return
       }

diff  --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td
index 528adbd3e99a..9ea1a0bd3805 100644
--- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td
+++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td
@@ -16,6 +16,7 @@
 
 include "mlir/Dialect/Linalg/IR/LinalgBase.td"
 include "mlir/Dialect/Linalg/IR/LinalgInterfaces.td"
+include "mlir/Interfaces/CopyOpInterface.td"
 include "mlir/Interfaces/InferTypeOpInterface.td"
 include "mlir/Interfaces/SideEffectInterfaces.td"
 
@@ -56,6 +57,119 @@ class LinalgStructured_Op<string mnemonic, list<OpTrait> props>
 //===----------------------------------------------------------------------===//
 // Named Linalg ops, implemented as special configurations of generic ops.
 //===----------------------------------------------------------------------===//
+// At the moment these are not declarative and require a bunch of C++ code.
+// In the future, these should be migrated to a declarative specification.
+def CopyOp : LinalgStructured_Op<"copy", [CopyOpInterface]> {
+  let description = [{
+    Copies the data in the input view into the output view.
+
+    Usage:
+
+    ```mlir
+    linalg.copy(%arg0, %arg1) : memref<?xf32, stride_specification>,
+                                memref<?xf32, stride_specification>
+    ```
+
+    One possible lowering to loop form is:
+
+    ```mlir
+    %0 = linalg.dim %arg0, 0 : index
+    scf.for %i0 = %c0 to %0 step %c1 {
+      %1 = load %arg0[%i0] : memref<?xf32, stride_specification>
+      store %1, %arg1[%i0] : memref<?xf32, stride_specification>
+    }
+    ```
+
+    Optionally, can take `input_permutation` and `output_permutation` attributes
+    to reorder the dimensions of the input and output views.
+
+    Usage:
+
+    ```mlir
+    linalg.copy(%arg0, %arg1) {inputPermutation : (i, j, k) -> (i, k, j),
+                               outputPermutation : (i, j, k) -> (k, j, i)} :
+      memref<?x?x?xf32, stride_specification>,
+      memref<?x?x?xf32, stride_specification>
+    ```
+
+    One possible lowering to loop form is:
+
+    ```mlir
+    %0 = linalg.dim %arg0, 0
+    %1 = linalg.dim %arg0, 1
+    %2 = linalg.dim %arg0, 2
+    scf.for %i0 = %c0 to %{{.*}} step %c1 {
+      scf.for %i1 = %c0 to %{{.*}} step %c1 {
+        scf.for %i2 = %c0 to %{{.*}} step %c1 {
+          %3 = load %arg0[%i0, %i2, %i1] :
+                  memref<?x?x?xf32, stride_specification>
+          store %3, %arg1[%i2, %i1, %i0] :
+                  memref<?x?x?xf32, stride_specification>
+    ```
+
+    The views are expected to be compatible for correctness but this is not
+    enforced at the moment.
+  }];
+
+  let arguments = (ins
+    AnyStridedMemRef:$input,
+    AnyStridedMemRef:$output,
+    OptionalAttr<AffineMapAttr>:$inputPermutation,
+    OptionalAttr<AffineMapAttr>:$outputPermutation);
+  let regions = (region AnyRegion:$region);
+
+  let builders = [
+    OpBuilder<(ins "Value":$input, "Value":$output,
+      CArg<"AffineMap", "AffineMap()">:$inputPermutation,
+      CArg<"AffineMap", "AffineMap()">:$outputPermutation,
+      CArg<"ArrayRef<NamedAttribute>", "{}">:$attrs)>];
+
+  let extraClassDeclaration = structuredOpsDecls # [{
+    ValueRange inputs() { return getOperands().take_front(); }
+    ValueRange outputs() { return getOperands().take_back(); }
+
+    // Rank-polymorphic.
+    //   filling_value -> O(ivs) with parallel iterators.
+    ArrayAttr iterator_types() {
+      int64_t nPar = getRank(getInputOperand(0));
+      return Builder(getContext()).getStrArrayAttr(
+        SmallVector<StringRef, 8>(nPar, getParallelIteratorTypeName()));
+    }
+
+    // I(input_perm(ivs)) -> O(output_perm(ivs))
+    ArrayAttr indexing_maps() {
+      MLIRContext *context = getContext();
+      auto maybeInputMap = inputPermutation();
+      auto maybeOutputMap = outputPermutation();
+      int64_t inputRank = getRank(getInputOperand(0));
+      int64_t outputRank = getRank(getOutputOperand(0));
+      return Builder(getContext()).getAffineMapArrayAttr({
+          extractOrIdentityMap(maybeInputMap, inputRank, context),
+          extractOrIdentityMap(maybeOutputMap, outputRank, context)});
+    }
+
+    Value getSource() { return input();}
+    Value getTarget() { return output(); }
+
+    static void regionBuilder(ImplicitLocOpBuilder &b, Block &block);
+    static std::function<void(ImplicitLocOpBuilder &b, Block &block)>
+    getRegionBuilder() {
+      return &regionBuilder;
+    }
+    static unsigned getNumRegionArgs() { return 2; }
+  }];
+  let verifier = [{ return ::verify(*this); }];
+
+  let assemblyFormat = [{
+    `(` $input `,` $output `)` attr-dict `:`
+        type($input) `,` type($output)
+      custom<CopyOpRegion>($region, ref(type($input)), ref(type($input)))
+  }];
+
+  let hasCanonicalizer = 1;
+  let hasFolder = 1;
+  let skipDefaultBuilders = 1;
+}
 
 def FillOp : LinalgStructured_Op<"fill", []> {
   let arguments = (ins

diff  --git a/mlir/include/mlir/Dialect/Linalg/Passes.td b/mlir/include/mlir/Dialect/Linalg/Passes.td
index a90376cbd9a7..fac60e2fd2b8 100644
--- a/mlir/include/mlir/Dialect/Linalg/Passes.td
+++ b/mlir/include/mlir/Dialect/Linalg/Passes.td
@@ -52,7 +52,7 @@ def LinalgComprehensiveModuleBufferize :
     Option<"useAlloca", "use-alloca", "bool",
            /*default=*/"false",
            "Use stack allocations for memrefs (for testing purposes only)">,
-    Option<"useLinalgCopy", "use-memref.copy", "bool",
+    Option<"useLinalgCopy", "use-linalg-copy", "bool",
            /*default=*/"false",
            "Use a copy operation implemented as a Linalg op.">,
     Option<"fullyDynamicLayoutMaps", "fully-dynamic-layout-maps", "bool",

diff  --git a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
index 9e2a5650c4df..587df790f1d5 100644
--- a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
@@ -349,7 +349,7 @@ struct LinalgPromotionOptions {
     return *this;
   }
   /// Callback function to do the copy of data to and from the promoted
-  /// subview. If None then a memref.copy is used.
+  /// subview. If None then a linalg.copy is used.
   Optional<CopyCallbackFn> copyInFn = None;
   Optional<CopyCallbackFn> copyOutFn = None;
   LinalgPromotionOptions &setCopyInOutFns(CopyCallbackFn const &copyIn,
@@ -390,9 +390,6 @@ FailureOr<LinalgOp> promoteSubViews(OpBuilder &b, LinalgOp op,
 /// Emit a suitable vector form for a Linalg op with fully static shape.
 LogicalResult vectorize(RewriterBase &builder, LinalgOp linalgOp);
 
-/// Emit a suitable vector form for a Copy op with fully static shape.
-LogicalResult vectorizeCopy(RewriterBase &builder, memref::CopyOp copyOp);
-
 /// Emit a loop nest of `scf.for` with the proper body for `linalgOp`.
 FailureOr<LinalgLoops> linalgOpToLoops(PatternRewriter &rewriter,
                                        LinalgOp linalgOp);
@@ -937,15 +934,6 @@ struct LinalgVectorizationPattern : public OpInterfaceRewritePattern<LinalgOp> {
   LinalgTransformationFilter filter;
 };
 
-/// `filter` controls LinalgTransformMarker matching and update when specified.
-/// See `vectorizeLinalgOp` for more details.
-struct CopyVectorizationPattern : public OpRewritePattern<memref::CopyOp> {
-  using OpRewritePattern<memref::CopyOp>::OpRewritePattern;
-
-  LogicalResult matchAndRewrite(memref::CopyOp copyOp,
-                                PatternRewriter &rewriter) const override;
-};
-
 /// Return vector::CombiningKind for the given op.
 llvm::Optional<vector::CombiningKind> getCombinerOpKind(Operation *combinerOp);
 
@@ -1218,7 +1206,7 @@ void populatePadOpVectorizationPatterns(RewritePatternSet &patterns,
 ///    %subView = subview %allocOrView ...
 ///    [optional] linalg.fill(%allocOrView, %cst) ...
 ///    ...
-///    memref.copy(%in, %subView) ...
+///    linalg.copy(%in, %subView) ...
 ///    vector.transfer_read %allocOrView[...], %cst ...
 /// ```
 /// into
@@ -1229,8 +1217,8 @@ void populatePadOpVectorizationPatterns(RewritePatternSet &patterns,
 ///    ...
 ///    vector.transfer_read %in[...], %cst ...
 /// ```
-/// Where there is no interleaved use between memref.copy and transfer_read as
-/// well as no interleaved use between linalg.fill and memref.copy (if
+/// Where there is no interleaved use between linalg.copy and transfer_read as
+/// well as no interleaved use between linalg.fill and linalg.copy (if
 /// linalg.fill is specified).
 /// This is a custom rewrite to forward partial reads (with optional fills) to
 /// vector.transfer_read.
@@ -1249,7 +1237,7 @@ struct LinalgCopyVTRForwardingPattern
 ///    %subView = subview %allocOrView...
 ///    ...
 ///    vector.transfer_write %..., %allocOrView[...]
-///    memref.copy(%subView, %out)
+///    linalg.copy(%subView, %out)
 /// ```
 /// into
 /// ```
@@ -1259,7 +1247,7 @@ struct LinalgCopyVTRForwardingPattern
 ///    ...
 ///    vector.transfer_write %..., %out[...]
 /// ```
-/// Where there is no interleaved use between transfer_write and memref.copy.
+/// Where there is no interleaved use between transfer_write and linalg.copy.
 /// This is a custom rewrite to forward partial writes to vector.transfer_write.
 struct LinalgCopyVTWForwardingPattern
     : public OpRewritePattern<vector::TransferWriteOp> {

diff  --git a/mlir/lib/Conversion/LinalgToStandard/LinalgToStandard.cpp b/mlir/lib/Conversion/LinalgToStandard/LinalgToStandard.cpp
index 78512f8326fe..ddc7623a6244 100644
--- a/mlir/lib/Conversion/LinalgToStandard/LinalgToStandard.cpp
+++ b/mlir/lib/Conversion/LinalgToStandard/LinalgToStandard.cpp
@@ -96,6 +96,10 @@ createTypeCanonicalizedMemRefOperands(OpBuilder &b, Location loc,
 
 LogicalResult mlir::linalg::LinalgOpToLibraryCallRewrite::matchAndRewrite(
     LinalgOp op, PatternRewriter &rewriter) const {
+  // Only LinalgOp for which there is no specialized pattern go through this.
+  if (isa<CopyOp>(op))
+    return failure();
+
   auto libraryCallName = getLibraryCallSymbolRef(op, rewriter);
   if (!libraryCallName)
     return failure();
@@ -109,12 +113,65 @@ LogicalResult mlir::linalg::LinalgOpToLibraryCallRewrite::matchAndRewrite(
   return success();
 }
 
+LogicalResult mlir::linalg::CopyOpToLibraryCallRewrite::matchAndRewrite(
+    CopyOp op, PatternRewriter &rewriter) const {
+  auto inputPerm = op.inputPermutation();
+  if (inputPerm.hasValue() && !inputPerm->isIdentity())
+    return failure();
+  auto outputPerm = op.outputPermutation();
+  if (outputPerm.hasValue() && !outputPerm->isIdentity())
+    return failure();
+
+  auto libraryCallName = getLibraryCallSymbolRef(op, rewriter);
+  if (!libraryCallName)
+    return failure();
+
+  rewriter.replaceOpWithNewOp<mlir::CallOp>(
+      op, libraryCallName.getValue(), TypeRange(),
+      createTypeCanonicalizedMemRefOperands(rewriter, op.getLoc(),
+                                            op.getOperands()));
+  return success();
+}
+
+LogicalResult mlir::linalg::CopyTransposeRewrite::matchAndRewrite(
+    CopyOp op, PatternRewriter &rewriter) const {
+  Value in = op.input(), out = op.output();
+
+  // If either inputPerm or outputPerm are non-identities, insert transposes.
+  auto inputPerm = op.inputPermutation();
+  if (inputPerm.hasValue() && !inputPerm->isIdentity())
+    in = rewriter.create<memref::TransposeOp>(op.getLoc(), in,
+                                              AffineMapAttr::get(*inputPerm));
+  auto outputPerm = op.outputPermutation();
+  if (outputPerm.hasValue() && !outputPerm->isIdentity())
+    out = rewriter.create<memref::TransposeOp>(op.getLoc(), out,
+                                               AffineMapAttr::get(*outputPerm));
+
+  // If nothing was transposed, fail and let the conversion kick in.
+  if (in == op.input() && out == op.output())
+    return failure();
+
+  auto libraryCallName = getLibraryCallSymbolRef(op, rewriter);
+  if (!libraryCallName)
+    return failure();
+
+  rewriter.replaceOpWithNewOp<mlir::CallOp>(
+      op, libraryCallName.getValue(), TypeRange(),
+      createTypeCanonicalizedMemRefOperands(rewriter, op.getLoc(), {in, out}));
+  return success();
+}
+
 /// Populate the given list with patterns that convert from Linalg to Standard.
 void mlir::linalg::populateLinalgToStandardConversionPatterns(
     RewritePatternSet &patterns) {
   // TODO: ConvOp conversion needs to export a descriptor with relevant
   // attribute values such as kernel striding and dilation.
-  patterns.add<LinalgOpToLibraryCallRewrite>(patterns.getContext());
+  // clang-format off
+  patterns.add<
+      CopyOpToLibraryCallRewrite,
+      CopyTransposeRewrite,
+      LinalgOpToLibraryCallRewrite>(patterns.getContext());
+  // clang-format on
 }
 
 namespace {

diff  --git a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
index 6d31f8515506..facf303fed15 100644
--- a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
+++ b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
@@ -372,6 +372,110 @@ class RegionBuilderHelper {
 
 } // namespace
 
+//===----------------------------------------------------------------------===//
+// CopyOp
+//===----------------------------------------------------------------------===//
+void CopyOp::regionBuilder(ImplicitLocOpBuilder &b, Block &block) {
+  assert(block.getNumArguments() == 2 && "CopyOp regionBuilder expects 2 args");
+  b.create<linalg::YieldOp>(block.getArgument(0));
+}
+
+void CopyOp::build(OpBuilder &builder, OperationState &result, Value input,
+                   Value output, AffineMap inputPermutation,
+                   AffineMap outputPermutation,
+                   ArrayRef<NamedAttribute> namedAttrs) {
+  result.addOperands({input, output});
+  result.addAttributes(namedAttrs);
+  if (inputPermutation)
+    result.addAttribute("inputPermutation",
+                        AffineMapAttr::get(inputPermutation));
+  if (outputPermutation)
+    result.addAttribute("outputPermutation",
+                        AffineMapAttr::get(outputPermutation));
+  result.addRegion();
+  fillStructuredOpRegion<CopyOp>(builder, *result.regions.front(),
+                                 TypeRange{input.getType()},
+                                 TypeRange{output.getType()});
+}
+
+ParseResult parseCopyOpRegion(OpAsmParser &parser, Region &r, Type inputType,
+                              Type outputType) {
+  OpBuilder opBuilder(parser.getContext());
+  fillStructuredOpRegion<CopyOp>(opBuilder, r, TypeRange{inputType},
+                                 TypeRange{outputType});
+  return success();
+}
+
+/// CopyOp region is elided when printing.
+void printCopyOpRegion(OpAsmPrinter &, Operation *, Region &, Type, Type) {}
+
+static LogicalResult verify(CopyOp op) {
+  OpOperand *output = op.getOutputOperand(0);
+  OpOperand *input = op.getInputOperand(0);
+  if (getElementTypeOrSelf(input->get()) != getElementTypeOrSelf(output->get()))
+    return op.emitOpError("expects views of the same type");
+  if (op.getRank(input) != op.getRank(output))
+    return op.emitOpError("expects views of the same rank");
+  auto rank = op.getNumParallelLoops();
+  auto inputPermutationMap = op.inputPermutation();
+  if (inputPermutationMap) {
+    if (inputPermutationMap->getNumInputs() != rank)
+      return op.emitOpError("expects optional input_permutation map of rank ")
+             << rank;
+    if (!inputPermutationMap->isPermutation())
+      return op.emitOpError(
+          "expects optional input_permutation map to be a permutation");
+  }
+  auto outputPermutationMap = op.outputPermutation();
+  if (outputPermutationMap) {
+    if (outputPermutationMap->getNumInputs() != rank)
+      return op.emitOpError("expects optional output_permutation map of rank ")
+             << rank;
+    if (!outputPermutationMap->isPermutation())
+      return op.emitOpError(
+          "expects optional output_permutation map to be a permutation");
+  }
+  if (rank == 0 && inputPermutationMap)
+    return op.emitOpError("expected no input permutation when rank == 0");
+  if (rank == 0 && outputPermutationMap)
+    return op.emitOpError("expected no output permutation when rank == 0");
+  return success();
+}
+
+void CopyOp::getEffects(
+    SmallVectorImpl<SideEffects::EffectInstance<MemoryEffects::Effect>>
+        &effects) {
+  effects.emplace_back(MemoryEffects::Read::get(), input(),
+                       SideEffects::DefaultResource::get());
+  effects.emplace_back(MemoryEffects::Write::get(), output(),
+                       SideEffects::DefaultResource::get());
+}
+
+namespace {
+/// Remove copy operations that copy data inplace. Requirements are:
+/// 1) The input and output values are identical.
+/// 2) The input and output permutation maps are identical.
+struct EraseIdentityCopyOp : public OpRewritePattern<CopyOp> {
+  using OpRewritePattern<CopyOp>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(CopyOp copyOp,
+                                PatternRewriter &rewriter) const override {
+    assert(copyOp.hasBufferSemantics());
+    if (copyOp.input() == copyOp.output() &&
+        copyOp.inputPermutation() == copyOp.outputPermutation()) {
+      rewriter.eraseOp(copyOp);
+      return success();
+    }
+    return failure();
+  }
+};
+} // namespace
+
+void CopyOp::getCanonicalizationPatterns(RewritePatternSet &results,
+                                         MLIRContext *context) {
+  results.add<EraseIdentityCopyOp>(context);
+}
+
 //===----------------------------------------------------------------------===//
 // FillOp
 //===----------------------------------------------------------------------===//
@@ -2061,6 +2165,7 @@ struct FoldTensorCastOp : public OpInterfaceRewritePattern<LinalgOp> {
     return foldMemRefCast(*this);                                              \
   }
 
+LINALGOP_FOLDERS(CopyOp)
 LINALGOP_FOLDERS(FillOp)
 LINALGOP_FOLDERS(GenericOp)
 

diff  --git a/mlir/lib/Dialect/Linalg/Transforms/Bufferize.cpp b/mlir/lib/Dialect/Linalg/Transforms/Bufferize.cpp
index f02cee53586a..0dab5a181980 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Bufferize.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Bufferize.cpp
@@ -31,7 +31,7 @@ static Value cloneMemref(Location loc, Value memref, OpBuilder &b) {
   auto memrefType = memref.getType().cast<MemRefType>();
   auto alloc = b.create<memref::AllocOp>(loc, memrefType,
                                          getDynOperands(loc, memref, b));
-  b.create<memref::CopyOp>(loc, memref, alloc);
+  b.create<linalg::CopyOp>(loc, memref, alloc);
   return alloc;
 }
 
@@ -197,10 +197,10 @@ class BufferizeAnyLinalgOp : public OpInterfaceConversionPattern<LinalgOp> {
 /// ```
 ///   %a = alloc(sizes)
 ///   %sv = subview %source [offsets][sizes][strides]
-///   memref.copy(%sv, %a)
+///   linalg_copy(%sv, %a)
 /// ```
 ///
-/// This pattern is arguable a std pattern once memref::CopyOp becomes
+/// This pattern is arguable a std pattern once linalg::CopyOp becomes
 /// std::CopyOp.
 class ExtractSliceOpConverter
     : public OpConversionPattern<tensor::ExtractSliceOp> {
@@ -223,7 +223,7 @@ class ExtractSliceOpConverter
     Value subView = rewriter.create<memref::SubViewOp>(
         op.getLoc(), sourceMemref, op.getMixedOffsets(), op.getMixedSizes(),
         op.getMixedStrides());
-    rewriter.create<memref::CopyOp>(op.getLoc(), subView, alloc);
+    rewriter.create<linalg::CopyOp>(op.getLoc(), subView, alloc);
     rewriter.replaceOp(op, alloc);
     return success();
   }
@@ -235,11 +235,11 @@ class ExtractSliceOpConverter
 /// conversion infra:
 /// ```
 ///   %sv = subview %dest [offsets][sizes][strides]
-///   memref.copy(%source, %sv)
+///   linalg_copy(%source, %sv)
 ///   // replace with %dest
 /// ```
 ///
-/// This pattern is arguable a std pattern once memref::CopyOp becomes
+/// This pattern is arguable a std pattern once linalg::CopyOp becomes
 /// std::CopyOp.
 class InsertSliceOpConverter
     : public OpConversionPattern<tensor::InsertSliceOp> {
@@ -263,7 +263,7 @@ class InsertSliceOpConverter
         op.getLoc(), destMemRef, op.getMixedOffsets(), op.getMixedSizes(),
         op.getMixedStrides());
     // Copy the small memref.
-    rewriter.create<memref::CopyOp>(op.getLoc(), sourceMemRef, subview);
+    rewriter.create<linalg::CopyOp>(op.getLoc(), sourceMemRef, subview);
     rewriter.replaceOp(op, destMemRef);
     return success();
   }

diff  --git a/mlir/lib/Dialect/Linalg/Transforms/ComprehensiveBufferizePass.cpp b/mlir/lib/Dialect/Linalg/Transforms/ComprehensiveBufferizePass.cpp
index 159dfe49d6b7..5bd12aa1c39f 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/ComprehensiveBufferizePass.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/ComprehensiveBufferizePass.cpp
@@ -81,7 +81,7 @@ static FailureOr<Value> allocationFnUsingAlloca(OpBuilder &b, Location loc,
 /// Create a linalg::GenericOp version of an n-D copy that can further tile,
 /// lower to loops or vectorize, unlike the current implementation of
 /// memref::CopyOp.
-/// Do not depend on memref::CopyOp that is getting deprecated.
+/// Do not depend on linalg::CopyOp that is getting deprecated.
 static LogicalResult createLinalgCopyOp(OpBuilder &b, Location loc, Value from,
                                         Value to) {
   auto memrefTypeFrom = from.getType().cast<MemRefType>();

diff  --git a/mlir/lib/Dialect/Linalg/Transforms/Fusion.cpp b/mlir/lib/Dialect/Linalg/Transforms/Fusion.cpp
index 9eda19ae3f4b..bc53a719a474 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Fusion.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Fusion.cpp
@@ -637,7 +637,7 @@ static bool doesTransposeAccess(AffineMap map,
 ///     Fused dimensions : i, j
 ///
 /// Example 3:
-///   memref.copy(%s, %b)
+///   linalg.copy(%s, %b)
 ///   linalg.matmul ins(%a, %b) outs(%c)
 ///
 ///   Number of parallel loops = 2

diff  --git a/mlir/lib/Dialect/Linalg/Transforms/Promotion.cpp b/mlir/lib/Dialect/Linalg/Transforms/Promotion.cpp
index 12e517f62019..9963381715fc 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Promotion.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Promotion.cpp
@@ -186,7 +186,7 @@ LinalgOpInstancePromotionOptions::LinalgOpInstancePromotionOptions(
   Location loc = linalgOp.getLoc();
   auto defaultCopyCallBack = [loc](OpBuilder &b, Value src,
                                    Value dst) -> LogicalResult {
-    b.create<memref::CopyOp>(loc, src, dst);
+    b.create<linalg::CopyOp>(loc, src, dst);
     return success();
   };
   copyInFn = (options.copyInFn ? *(options.copyInFn) : defaultCopyCallBack);

diff  --git a/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp b/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp
index 7dcc9770de6c..17daa44516e4 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp
@@ -722,11 +722,6 @@ LogicalResult mlir::linalg::LinalgVectorizationPattern::matchAndRewrite(
   return vectorize(rewriter, linalgOp);
 }
 
-LogicalResult mlir::linalg::CopyVectorizationPattern::matchAndRewrite(
-    memref::CopyOp copyOp, PatternRewriter &rewriter) const {
-  return vectorizeCopy(rewriter, copyOp);
-}
-
 LogicalResult mlir::linalg::applyStagedPatterns(
     Operation *op, ArrayRef<FrozenRewritePatternSet> stage1Patterns,
     const FrozenRewritePatternSet &stage2Patterns,

diff  --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
index 0d8632df6b0e..226a84300d8f 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
@@ -656,38 +656,6 @@ LogicalResult mlir::linalg::vectorize(RewriterBase &rewriter,
   return success();
 }
 
-LogicalResult mlir::linalg::vectorizeCopy(RewriterBase &rewriter,
-                                          memref::CopyOp copyOp) {
-
-  auto srcType = copyOp.source().getType().cast<MemRefType>();
-  auto dstType = copyOp.target().getType().cast<MemRefType>();
-  if (!srcType.hasStaticShape() || !dstType.hasStaticShape())
-    return failure();
-
-  auto readType =
-      VectorType::get(srcType.getShape(), getElementTypeOrSelf(srcType));
-  auto writeType =
-      VectorType::get(dstType.getShape(), getElementTypeOrSelf(dstType));
-
-  Location loc = copyOp->getLoc();
-  Value zero = rewriter.create<arith::ConstantIndexOp>(loc, 0);
-  SmallVector<Value> indices(srcType.getRank(), zero);
-
-  Value readValue = rewriter.create<vector::TransferReadOp>(
-      loc, readType, copyOp.source(), indices,
-      rewriter.getMultiDimIdentityMap(srcType.getRank()));
-  if (readValue.getType().cast<VectorType>().getRank() == 0) {
-    readValue = rewriter.create<vector::ExtractElementOp>(loc, readValue);
-    readValue = rewriter.create<vector::BroadcastOp>(loc, writeType, readValue);
-  }
-  Operation *writeValue = rewriter.create<vector::TransferWriteOp>(
-      loc, readValue, copyOp.target(), indices,
-      rewriter.getMultiDimIdentityMap(srcType.getRank()));
-  copyOp->getParentOfType<FuncOp>().dump();
-  rewriter.replaceOp(copyOp, writeValue->getResults());
-  return success();
-}
-
 //----------------------------------------------------------------------------//
 // Misc. vectorization patterns.
 //----------------------------------------------------------------------------//
@@ -1200,11 +1168,11 @@ LogicalResult LinalgCopyVTRForwardingPattern::matchAndRewrite(
   LDBG("with subView " << subView);
 
   // Find the copy into `subView` without interleaved uses.
-  memref::CopyOp copyOp;
+  CopyOp copyOp;
   for (auto &u : subView.getUses()) {
-    if (auto newCopyOp = dyn_cast<memref::CopyOp>(u.getOwner())) {
-      assert(newCopyOp.target().getType().isa<MemRefType>());
-      if (newCopyOp.target() != subView)
+    if (auto newCopyOp = dyn_cast<CopyOp>(u.getOwner())) {
+      assert(newCopyOp.output().getType().isa<MemRefType>());
+      if (newCopyOp.output() != subView)
         continue;
       LDBG("copy candidate " << *newCopyOp);
       if (mayExistInterleavedUses(newCopyOp, xferOp, {viewOrAlloc, subView}))
@@ -1238,10 +1206,10 @@ LogicalResult LinalgCopyVTRForwardingPattern::matchAndRewrite(
   if (maybeFillOp)
     LDBG("with maybeFillOp " << *maybeFillOp);
 
-  // `in` is the subview that memref.copy reads. Replace it.
-  Value in = copyOp.source();
+  // `in` is the subview that linalg.copy reads. Replace it.
+  Value in = copyOp.input();
 
-  // memref.copy + linalg.fill can be used to create a padded local buffer.
+  // linalg.copy + linalg.fill can be used to create a padded local buffer.
   // The `masked` attribute is only valid on this padded buffer.
   // When forwarding to vector.transfer_read, the attribute must be reset
   // conservatively.
@@ -1280,10 +1248,10 @@ LogicalResult LinalgCopyVTWForwardingPattern::matchAndRewrite(
   Value subView = subViewOp.getResult();
 
   // Find the copy from `subView` without interleaved uses.
-  memref::CopyOp copyOp;
+  CopyOp copyOp;
   for (auto &u : subViewOp.getResult().getUses()) {
-    if (auto newCopyOp = dyn_cast<memref::CopyOp>(u.getOwner())) {
-      if (newCopyOp.source() != subView)
+    if (auto newCopyOp = dyn_cast<CopyOp>(u.getOwner())) {
+      if (newCopyOp.getInputOperand(0)->get() != subView)
         continue;
       if (mayExistInterleavedUses(xferOp, newCopyOp, {viewOrAlloc, subView}))
         continue;
@@ -1295,11 +1263,11 @@ LogicalResult LinalgCopyVTWForwardingPattern::matchAndRewrite(
     return failure();
 
   // `out` is the subview copied into that we replace.
-  assert(copyOp.target().getType().isa<MemRefType>());
-  Value out = copyOp.target();
+  assert(copyOp.output().getType().isa<MemRefType>());
+  Value out = copyOp.output();
 
   // Forward vector.transfer into copy.
-  // memref.copy + linalg.fill can be used to create a padded local buffer.
+  // linalg.copy + linalg.fill can be used to create a padded local buffer.
   // The `masked` attribute is only valid on this padded buffer.
   // When forwarding to vector.transfer_write, the attribute must be reset
   // conservatively.

diff  --git a/mlir/lib/Dialect/Vector/Transforms/VectorTransferSplitRewritePatterns.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorTransferSplitRewritePatterns.cpp
index ff3a6012f2d5..f0c06469ed67 100644
--- a/mlir/lib/Dialect/Vector/Transforms/VectorTransferSplitRewritePatterns.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/VectorTransferSplitRewritePatterns.cpp
@@ -270,7 +270,7 @@ createFullPartialLinalgCopy(RewriterBase &b, vector::TransferReadOp xferOp,
         std::pair<Value, Value> copyArgs = createSubViewIntersection(
             rewriter, cast<VectorTransferOpInterface>(xferOp.getOperation()),
             alloc);
-        b.create<memref::CopyOp>(loc, copyArgs.first, copyArgs.second);
+        b.create<linalg::CopyOp>(loc, copyArgs.first, copyArgs.second);
         Value casted =
             b.create<memref::CastOp>(loc, alloc, compatibleMemRefType);
         scf::ValueVector viewAndIndices{casted};
@@ -403,7 +403,7 @@ static void createFullPartialLinalgCopy(RewriterBase &b,
     std::pair<Value, Value> copyArgs = createSubViewIntersection(
         rewriter, cast<VectorTransferOpInterface>(xferOp.getOperation()),
         alloc);
-    b.create<memref::CopyOp>(loc, copyArgs.first, copyArgs.second);
+    b.create<linalg::CopyOp>(loc, copyArgs.first, copyArgs.second);
     b.create<scf::YieldOp>(loc, ValueRange{});
   });
 }

diff  --git a/mlir/test/Dialect/Linalg/bufferize.mlir b/mlir/test/Dialect/Linalg/bufferize.mlir
index 3fb7d49c04f4..80bd5f8363e3 100644
--- a/mlir/test/Dialect/Linalg/bufferize.mlir
+++ b/mlir/test/Dialect/Linalg/bufferize.mlir
@@ -143,7 +143,7 @@ func @dynamic_results(%arg0: tensor<?x?xf32>)
 // CHECK-DAG:           %[[ARG0_MEMREF:.*]] = bufferization.to_memref %[[ARG0_TENSOR]] : memref<2x3x4xvector<3x4xi4>>
 // CHECK-DAG:           %[[ARG1_MEMREF:.*]] = bufferization.to_memref %[[ARG1_TENSOR]] : memref<3x2xf32>
 // CHECK:           %[[INIT_BUFFER:.*]] = memref.alloc() : memref<3x2xf32>
-// CHECK:           memref.copy %[[ARG1_MEMREF]], %[[INIT_BUFFER]] : memref<3x2xf32> to memref<3x2xf32>
+// CHECK:           linalg.copy(%[[ARG1_MEMREF]], %[[INIT_BUFFER]]) : memref<3x2xf32>, memref<3x2xf32>
 // CHECK:           linalg.generic
 // CHECK-SAME:      ins(%[[ARG0_MEMREF]] : memref<2x3x4xvector<3x4xi4>>)
 // CHECK-SAME:      outs(%[[INIT_BUFFER]] : memref<3x2xf32>) {
@@ -178,14 +178,14 @@ func @bufferize_slice(%t : tensor<?x?xf32>) -> (tensor<2x3xf32>, tensor<2x?xf32>
   // CHECK-NEXT: %[[A0:.*]] = memref.alloc() : memref<2x3xf32>
   // CHECK-NEXT: %[[SM0:.*]] = memref.subview %[[M]][0, 0] [2, 3] [1, 1]
   // CHECK-SAME:   memref<?x?xf32> to memref<2x3xf32, #[[$MAP0]]>
-  // CHECK-NEXT: memref.copy %[[SM0]], %[[A0]] : memref<2x3xf32, #[[$MAP0]]> to memref<2x3xf32>
+  // CHECK-NEXT: linalg.copy(%[[SM0]], %[[A0]]) : memref<2x3xf32, #[[$MAP0]]>, memref<2x3xf32>
   // CHECK-NEXT: %[[RT0:.*]] = bufferization.to_tensor %[[A0]] : memref<2x3xf32>
   %st0 = tensor.extract_slice %t[0, 0][2, 3][1, 1] : tensor<?x?xf32> to tensor<2x3xf32>
 
   // CHECK-NEXT: %[[A1:.*]] = memref.alloc(%[[IDX]]) : memref<2x?xf32>
   // CHECK-NEXT: %[[SM1:.*]] = memref.subview %[[M]][0, %[[IDX]]] [2, %[[IDX]]] [1, 2]
   // CHECK-SAME:   memref<?x?xf32> to memref<2x?xf32, #[[$MAP1]]>
-  // CHECK-NEXT: memref.copy %[[SM1]], %[[A1]] : memref<2x?xf32, #[[$MAP1]]> to memref<2x?xf32>
+  // CHECK-NEXT: linalg.copy(%[[SM1]], %[[A1]]) : memref<2x?xf32, #[[$MAP1]]>, memref<2x?xf32>
   // CHECK-NEXT: %[[RT1:.*]] = bufferization.to_tensor %[[A1]] : memref<2x?xf32>
   %st1 = tensor.extract_slice %t[0, %i0][2, %i0][1, 2] : tensor<?x?xf32> to tensor<2x?xf32>
 
@@ -221,18 +221,18 @@ func @bufferize_insert_slice(%t : tensor<?x?xf32>, %st0 : tensor<2x3xf32>, %st1
   // CHECK-NEXT: %[[DIM0:.*]] = tensor.dim %[[T]], %[[C0]] : tensor<?x?xf32>
   // CHECK-NEXT: %[[DIM1:.*]] = tensor.dim %[[T]], %[[C1]] : tensor<?x?xf32>
   // CHECK-NEXT: %[[M_COPY0:.*]] = memref.alloc(%[[DIM0]], %[[DIM1]]) : memref<?x?xf32>
-  // CHECK-NEXT: memref.copy %[[M]], %[[M_COPY0]] : memref<?x?xf32> to memref<?x?xf32>
+  // CHECK-NEXT: linalg.copy(%[[M]], %[[M_COPY0]]) : memref<?x?xf32>, memref<?x?xf32>
   // CHECK-NEXT: %[[SUBVIEW0:.*]] = memref.subview %[[M_COPY0]][0, 0] [2, 3] [1, 1]
   // CHECK-SAME:   memref<?x?xf32> to memref<2x3xf32, #[[$MAP0]]>
-  // CHECK-NEXT: memref.copy %[[SM0]], %[[SUBVIEW0]] : memref<2x3xf32> to memref<2x3xf32, #[[$MAP0]]>
+  // CHECK-NEXT: linalg.copy(%[[SM0]], %[[SUBVIEW0]]) : memref<2x3xf32>, memref<2x3xf32, #[[$MAP0]]>
   // CHECK-NEXT: %[[RT0:.*]] = bufferization.to_tensor %[[M_COPY0]] : memref<?x?xf32>
   %t0 = tensor.insert_slice %st0 into %t[0, 0][2, 3][1, 1] : tensor<2x3xf32> into tensor<?x?xf32>
 
   // CHECK-NEXT: %[[M_COPY1:.*]] = memref.alloc(%[[DIM0]], %[[DIM1]]) : memref<?x?xf32>
-  // CHECK-NEXT: memref.copy %[[M]], %[[M_COPY1]] : memref<?x?xf32> to memref<?x?xf32>
+  // CHECK-NEXT: linalg.copy(%[[M]], %[[M_COPY1]]) : memref<?x?xf32>, memref<?x?xf32>
   // CHECK-NEXT: %[[SUBVIEW1:.*]] = memref.subview %[[M_COPY1]][0, %[[IDX]]] [2, %[[IDX]]] [1, 2]
   // CHECK-SAME:   memref<?x?xf32> to memref<2x?xf32, #[[$MAP1]]>
-  // CHECK-NEXT: memref.copy %[[SM1]], %[[SUBVIEW1]] : memref<2x?xf32> to memref<2x?xf32, #[[$MAP1]]>
+  // CHECK-NEXT: linalg.copy(%[[SM1]], %[[SUBVIEW1]]) : memref<2x?xf32>, memref<2x?xf32, #[[$MAP1]]>
   // CHECK-NEXT: %[[RT1:.*]] = bufferization.to_tensor %[[M_COPY1]] : memref<?x?xf32>
   %t1 = tensor.insert_slice %st1 into %t[0, %i0][2, %i0][1, 2] : tensor<2x?xf32> into tensor<?x?xf32>
 
@@ -296,9 +296,9 @@ func @pad_tensor_dynamic_shape(%arg0: tensor<4x?x2x?xf32>, %arg1: index) -> tens
 // CHECK:           %[[FILLED:.*]] = memref.alloc(%[[DIM1]], %[[OUT_DIM2]], %[[OUT_DIM3]]) : memref<4x?x?x?xf32>
 // CHECK:           linalg.fill(%[[CST]], %[[FILLED]]) : f32, memref<4x?x?x?xf32>
 // CHECK:           %[[OUT:.*]] = memref.alloc(%[[DIM1]], %[[OUT_DIM2]], %[[OUT_DIM3]]) : memref<4x?x?x?xf32>
-// CHECK:           memref.copy %[[FILLED]], %[[OUT]] : memref<4x?x?x?xf32> to memref<4x?x?x?xf32>
+// CHECK:           linalg.copy(%[[FILLED]], %[[OUT]]) : memref<4x?x?x?xf32>, memref<4x?x?x?xf32>
 // CHECK:           %[[INTERIOR:.*]] = memref.subview %[[OUT]][0, 0, %[[OFFSET]], 0] [4, %[[DIM1]], 2, %[[DIM3]]] [1, 1, 1, 1] : memref<4x?x?x?xf32> to memref<4x?x2x?xf32, #map>
-// CHECK:           memref.copy %[[IN_MEMREF]], %[[INTERIOR]] : memref<4x?x2x?xf32> to memref<4x?x2x?xf32, #map>
+// CHECK:           linalg.copy(%[[IN_MEMREF]], %[[INTERIOR]]) : memref<4x?x2x?xf32>, memref<4x?x2x?xf32, #map>
 // CHECK:           %[[OUT_TENSOR:.*]] = bufferization.to_tensor %[[OUT]] : memref<4x?x?x?xf32>
 // CHECK:           return %[[OUT_TENSOR]] : tensor<4x?x?x?xf32>
 // CHECK:         }

diff  --git a/mlir/test/Dialect/Linalg/canonicalize.mlir b/mlir/test/Dialect/Linalg/canonicalize.mlir
index b79fb56a0498..96d3aa26deaf 100644
--- a/mlir/test/Dialect/Linalg/canonicalize.mlir
+++ b/mlir/test/Dialect/Linalg/canonicalize.mlir
@@ -54,7 +54,7 @@ func @memref_cast_into_tiled_loop(%arg0: memref<192xf32>)  {
 
 func @dce_zero_memref(%arg0 : memref<0xf32>, %arg1: tensor<0xf32>) -> tensor<0xf32> {
   // memref<0x32> is expected to be dce'ed
-  memref.copy %arg0, %arg0 : memref<0xf32> to memref<0xf32>
+  linalg.copy(%arg0, %arg0): memref<0xf32>, memref<0xf32>
 
   // tensor<0xf32> cannot be dce'ed
   %1 = linalg.generic #trait outs(%arg1 : tensor<0xf32>) {
@@ -67,7 +67,7 @@ func @dce_zero_memref(%arg0 : memref<0xf32>, %arg1: tensor<0xf32>) -> tensor<0xf
 // CHECK-LABEL: @dce_zero_memref
 //  CHECK-SAME:   %[[ARG0:[a-zA-Z0-9_]+]]: memref<0xf32>
 //  CHECK-SAME:   %[[ARG1:[a-zA-Z0-9_]+]]: tensor<0xf32>
-//   CHECK-NOT:   memref.copy
+//   CHECK-NOT:   linalg.copy
 //  CHECK-NEXT:   return %[[ARG1]]
 
 // -----
@@ -330,8 +330,22 @@ func @propogate_casts(%arg0 : tensor<?x?xf32>, %arg1 : f32, %arg2 : index,
 // CHECK-LABEL: @self_copy
 func @self_copy(%arg0 : memref<2x3x?x4xf32>) {
 
-//   CHECK-NOT: memref.copy
-  memref.copy %arg0, %arg0 : memref<2x3x?x4xf32> to memref<2x3x?x4xf32>
+//   CHECK-NOT: linalg.copy
+  linalg.copy(%arg0, %arg0): memref<2x3x?x4xf32>, memref<2x3x?x4xf32>
+
+//   CHECK: return
+  return
+}
+
+// -----
+
+// CHECK-LABEL: @self_copy_with_permutation
+func @self_copy_with_permutation(%arg0 : memref<2x3x?x4xf32>) {
+
+//   CHECK: linalg.copy
+  linalg.copy(%arg0, %arg0)
+    {inputPermutation = affine_map<(i, j, k, l) -> (j, k, i, l)>,
+     outputPermuation = affine_map<(i, j, k, l) -> (i, j, k, l)>} : memref<2x3x?x4xf32>, memref<2x3x?x4xf32>
 
 //   CHECK: return
   return

diff  --git a/mlir/test/Dialect/Linalg/forward-vector-transfers.mlir b/mlir/test/Dialect/Linalg/forward-vector-transfers.mlir
index b8e39196d5cc..a8ef32e632d8 100644
--- a/mlir/test/Dialect/Linalg/forward-vector-transfers.mlir
+++ b/mlir/test/Dialect/Linalg/forward-vector-transfers.mlir
@@ -3,7 +3,7 @@
 // CHECK-LABEL: testAllocRead
 //  CHECK-SAME: %[[ARG0:[0-9a-zA-Z]*]]: memref
 //   CHECK-NOT: linalg.fill
-//   CHECK-NOT: memref.copy
+//   CHECK-NOT: linalg.copy
 //       CHECK: %[[ALLOC:.*]] = memref.alloc
 //       CHECK: vector.transfer_read %[[ARG0]]
 //   CHECK-NOT: in_bounds
@@ -12,7 +12,7 @@ func @testAllocRead(%in: memref<? x f32>) -> vector<32 x f32> {
   %f0 = arith.constant 0.0: f32
   %alloc = memref.alloc() : memref<32 x f32>
   %subview = memref.subview %alloc[0][16][1] : memref<32 x f32> to memref<16 x f32>
-  memref.copy %in, %subview : memref<? x f32> to memref<16 x f32>
+  linalg.copy(%in, %subview): memref<? x f32>, memref<16 x f32>
   %0 = vector.transfer_read %alloc[%c0], %f0 {in_bounds = [true]} : memref<32 x f32>, vector<32 x f32>
   memref.dealloc %alloc : memref<32 x f32>
   return %0: vector<32 x f32>
@@ -21,7 +21,7 @@ func @testAllocRead(%in: memref<? x f32>) -> vector<32 x f32> {
 // CHECK-LABEL: testAllocFillRead
 //  CHECK-SAME: %[[ARG0:[0-9a-zA-Z]*]]: memref
 //   CHECK-NOT: linalg.fill
-//   CHECK-NOT: memref.copy
+//   CHECK-NOT: linalg.copy
 //       CHECK: %[[ALLOC:.*]] = memref.alloc
 //       CHECK: vector.transfer_read %[[ARG0]]
 //   CHECK-NOT: in_bounds
@@ -31,7 +31,7 @@ func @testAllocFillRead(%in: memref<? x f32>) -> vector<32 x f32> {
   %alloc = memref.alloc() : memref<32 x f32>
   linalg.fill(%f0, %alloc) : f32, memref<32 x f32>
   %subview = memref.subview %alloc[0][16][1] : memref<32 x f32> to memref<16 x f32>
-  memref.copy %in, %subview : memref<? x f32> to memref<16 x f32>
+  linalg.copy(%in, %subview): memref<? x f32>, memref<16 x f32>
   %0 = vector.transfer_read %alloc[%c0], %f0 {in_bounds = [true]} : memref<32 x f32>, vector<32 x f32>
   memref.dealloc %alloc : memref<32 x f32>
   return %0: vector<32 x f32>
@@ -40,7 +40,7 @@ func @testAllocFillRead(%in: memref<? x f32>) -> vector<32 x f32> {
 // CHECK-LABEL: testViewRead
 //  CHECK-SAME: %[[ARG0:[0-9a-zA-Z]*]]: memref
 //   CHECK-NOT: linalg.fill
-//   CHECK-NOT: memref.copy
+//   CHECK-NOT: linalg.copy
 //       CHECK: %[[ALLOC:.*]] = memref.alloc
 //       CHECK: vector.transfer_read %[[ARG0]]
 //   CHECK-NOT: in_bounds
@@ -50,7 +50,7 @@ func @testViewRead(%in: memref<? x f32>) -> vector<32 x f32> {
   %alloc = memref.alloc() : memref<128 x i8>
   %view = memref.view %alloc[%c0][] : memref<128 x i8> to memref<32 x f32>
   %subview = memref.subview %view[0][16][1] : memref<32 x f32> to memref<16 x f32>
-  memref.copy %in, %subview : memref<? x f32> to memref<16 x f32>
+  linalg.copy(%in, %subview): memref<? x f32>, memref<16 x f32>
   %0 = vector.transfer_read %view[%c0], %f0 {in_bounds = [true]} : memref<32 x f32>, vector<32 x f32>
   memref.dealloc %alloc : memref<128 x i8>
   return %0: vector<32 x f32>
@@ -59,7 +59,7 @@ func @testViewRead(%in: memref<? x f32>) -> vector<32 x f32> {
 // CHECK-LABEL: testViewFillRead
 //  CHECK-SAME: %[[ARG0:[0-9a-zA-Z]*]]: memref
 //   CHECK-NOT: linalg.fill
-//   CHECK-NOT: memref.copy
+//   CHECK-NOT: linalg.copy
 //       CHECK: %[[ALLOC:.*]] = memref.alloc
 //       CHECK: vector.transfer_read %[[ARG0]]
 //   CHECK-NOT: in_bounds
@@ -70,7 +70,7 @@ func @testViewFillRead(%in: memref<? x f32>) -> vector<32 x f32> {
   %view = memref.view %alloc[%c0][] : memref<128 x i8> to memref<32 x f32>
   %subview = memref.subview %view[0][16][1] : memref<32 x f32> to memref<16 x f32>
   linalg.fill(%f0, %view) : f32, memref<32 x f32>
-  memref.copy %in, %subview : memref<? x f32> to memref<16 x f32>
+  linalg.copy(%in, %subview): memref<? x f32>, memref<16 x f32>
   %0 = vector.transfer_read %view[%c0], %f0 {in_bounds = [true]} : memref<32 x f32>, vector<32 x f32>
   memref.dealloc %alloc : memref<128 x i8>
   return %0: vector<32 x f32>
@@ -79,7 +79,7 @@ func @testViewFillRead(%in: memref<? x f32>) -> vector<32 x f32> {
 // CHECK-LABEL: testAllocWrite
 //  CHECK-SAME: %[[ARG0:[0-9a-zA-Z]*]]: vector
 //  CHECK-SAME: %[[ARG1:[0-9a-zA-Z]*]]: memref
-//   CHECK-NOT: memref.copy
+//   CHECK-NOT: linalg.copy
 //       CHECK: %[[ALLOC:.*]] = memref.alloc
 //       CHECK: vector.transfer_write %[[ARG0]], %[[ARG1]]
 //   CHECK-NOT: in_bounds
@@ -89,7 +89,7 @@ func @testAllocWrite(%vec: vector<32 x f32>, %out: memref<? x f32>) {
   %alloc = memref.alloc() : memref<32 x f32>
   %subview = memref.subview %alloc[0][16][1] : memref<32 x f32> to memref<16 x f32>
   vector.transfer_write %vec, %alloc[%c0] {in_bounds = [true]} : vector<32 x f32>, memref<32 x f32>
-  memref.copy %subview, %out : memref<16 x f32> to memref<? x f32>
+  linalg.copy(%subview, %out): memref<16 x f32>, memref<? x f32>
   memref.dealloc %alloc : memref<32 x f32>
   return
 }
@@ -97,7 +97,7 @@ func @testAllocWrite(%vec: vector<32 x f32>, %out: memref<? x f32>) {
 // CHECK-LABEL: testViewWrite
 //  CHECK-SAME: %[[ARG0:[0-9a-zA-Z]*]]: vector
 //  CHECK-SAME: %[[ARG1:[0-9a-zA-Z]*]]: memref
-//   CHECK-NOT: memref.copy
+//   CHECK-NOT: linalg.copy
 //       CHECK: %[[ALLOC:.*]] = memref.alloc
 //       CHECK: vector.transfer_write %[[ARG0]], %[[ARG1]]
 //   CHECK-NOT: in_bounds
@@ -108,7 +108,7 @@ func @testViewWrite(%vec: vector<32 x f32>, %out: memref<? x f32>) {
   %view = memref.view %alloc[%c0][] : memref<128 x i8> to memref<32 x f32>
   %subview = memref.subview %view[0][16][1] : memref<32 x f32> to memref<16 x f32>
   vector.transfer_write %vec, %view[%c0] {in_bounds = [true]} : vector<32 x f32>, memref<32 x f32>
-  memref.copy %subview, %out : memref<16 x f32> to memref<? x f32>
+  linalg.copy(%subview, %out): memref<16 x f32>, memref<? x f32>
   memref.dealloc %alloc : memref<128 x i8>
   return
 }
@@ -122,7 +122,7 @@ func @testViewWrite(%vec: vector<32 x f32>, %out: memref<? x f32>) {
 //  CHECK-SAME: %[[ARG0:[0-9a-zA-Z]*]]: memref
 //   CHECK-NOT: vector.transfer_read %[[ARG0]]
 //       CHECK: %[[ALLOC:.*]] = memref.alloc
-//       CHECK: memref.copy
+//       CHECK: linalg.copy
 //       CHECK: vector.transfer_read %[[ALLOC]]
 func @failAllocFillRead(%in: memref<? x f32>) -> vector<32 x f32> {
   %c0 = arith.constant 0: index
@@ -131,7 +131,7 @@ func @failAllocFillRead(%in: memref<? x f32>) -> vector<32 x f32> {
   %alloc = memref.alloc() : memref<32 x f32>
   linalg.fill(%f0, %alloc) : f32, memref<32 x f32>
   %subview = memref.subview %alloc[0][16][1] : memref<32 x f32> to memref<16 x f32>
-  memref.copy %in, %subview : memref<? x f32> to memref<16 x f32>
+  linalg.copy(%in, %subview): memref<? x f32>, memref<16 x f32>
   "some_interleaved_use"(%subview) : (memref<16 x f32>) -> ()
   %0 = vector.transfer_read %alloc[%c0], %f1: memref<32 x f32>, vector<32 x f32>
   memref.dealloc %alloc : memref<32 x f32>
@@ -145,7 +145,7 @@ func @failAllocFillRead(%in: memref<? x f32>) -> vector<32 x f32> {
 //   CHECK-NOT: vector.transfer_write %[[ARG0]], %[[ARG1]]
 //       CHECK: %[[ALLOC:.*]] = memref.alloc
 //       CHECK: vector.transfer_write %[[ARG0]], %[[ALLOC]]
-//       CHECK: memref.copy
+//       CHECK: linalg.copy
 func @failAllocWrite(%vec: vector<32 x f32>, %out: memref<? x f32>) {
   %c0 = arith.constant 0: index
   %f0 = arith.constant 0.0: f32
@@ -153,7 +153,7 @@ func @failAllocWrite(%vec: vector<32 x f32>, %out: memref<? x f32>) {
   %subview = memref.subview %alloc[0][16][1] : memref<32 x f32> to memref<16 x f32>
   vector.transfer_write %vec, %alloc[%c0] : vector<32 x f32>, memref<32 x f32>
   "some_interleaved_use"(%subview) : (memref<16 x f32>) -> ()
-  memref.copy %subview, %out : memref<16 x f32> to memref<? x f32>
+  linalg.copy(%subview, %out): memref<16 x f32>, memref<? x f32>
   memref.dealloc %alloc : memref<32 x f32>
   return
 }

diff  --git a/mlir/test/Dialect/Linalg/fusion-pattern.mlir b/mlir/test/Dialect/Linalg/fusion-pattern.mlir
index 38378ff615b8..73ef011aae40 100644
--- a/mlir/test/Dialect/Linalg/fusion-pattern.mlir
+++ b/mlir/test/Dialect/Linalg/fusion-pattern.mlir
@@ -71,6 +71,165 @@ module {
 
 // -----
 
+module {
+  func @rhs_fusion(%arg0: memref<?x?xf32>, %arg1: memref<?x?xf32>,
+                              %arg2: memref<?x?xf32>, %arg3: memref<?x?xf32>) {
+    %cst = arith.constant 0.000000e+00 : f32
+    linalg.copy(%arg1, %arg2) : memref<?x?xf32>, memref<?x?xf32>
+    linalg.fill(%cst, %arg3) : f32, memref<?x?xf32>
+    linalg.matmul {__internal_linalg_transform__ = "rhs_fusion"}
+      ins(%arg0, %arg2 : memref<?x?xf32>, memref<?x?xf32>)
+      outs(%arg3 : memref<?x?xf32>)
+    return
+  }
+}
+//  CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0)[s0] -> (64, -d0 + s0)>
+//  CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>
+//  CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0)[s0] -> (32, -d0 + s0)>
+//  CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0)[s0] -> (16, -d0 + s0)>
+//  CHECK-DAG: #[[MAP4:.+]] = affine_map<(d0)[s0, s1] -> (-d0 + s0, 64, -d0 + s1)>
+//      CHECK: func @rhs_fusion
+// CHECK-SAME:   %[[ARG0:[a-zA-Z0-9_]+]]: memref<?x?xf32>
+// CHECK-SAME:   %[[ARG1:[a-zA-Z0-9_]+]]: memref<?x?xf32>
+// CHECK-SAME:   %[[ARG2:[a-zA-Z0-9_]+]]: memref<?x?xf32>
+// CHECK-SAME:   %[[ARG3:[a-zA-Z0-9_]+]]: memref<?x?xf32>
+//  CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
+//  CHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
+//  CHECK-DAG:   %[[C32:.+]] = arith.constant 32 : index
+//  CHECK-DAG:   %[[C64:.+]] = arith.constant 64 : index
+//  CHECK-DAG:   %[[C16:.+]] = arith.constant 16 : index
+//  CHECK-DAG:   %[[CST:.+]] = arith.constant 0.0{{.*}} : f32
+//  CHECK-DAG:   linalg.copy(%[[ARG1]], %[[ARG2]])
+// CHECK-SAME:   __internal_linalg_transform__ = "after_rhs_fusion_original"
+//  CHECK-DAG:   %[[N:.+]] = memref.dim %[[ARG2]], %[[C1]]
+//      CHECK:   scf.parallel (%[[IV0:.+]]) =
+// CHECK-SAME:     (%[[C0]]) to (%[[N]]) step (%[[C64]]) {
+//      CHECK:     %[[K:.+]] = memref.dim %[[ARG2]], %[[C0]]
+//      CHECK:     %[[TILE_N:.+]] = affine.min #[[MAP0]](%[[IV0]])[%[[N]]]
+//      CHECK:     %[[SV1:.+]] = memref.subview %[[ARG2]][0, %[[IV0]]]
+// CHECK-SAME:       [%[[K]], %[[TILE_N]]]
+//      CHECK:     %[[M:.+]] = memref.dim %[[ARG3]], %[[C0]]
+//      CHECK:     %[[SV2:.+]] = memref.subview %[[ARG3]][0, %[[IV0]]]
+// CHECK-SAME:       [%[[M]], %[[TILE_N]]
+//      CHECK:     %[[N_3:.+]] = memref.dim %[[ARG1]], %[[C1]]
+//      CHECK:     %[[K_2:.+]] = memref.dim %[[ARG1]], %[[C0]]
+//      CHECK:     %[[TILE_N_3:.+]] = affine.min #[[MAP4]](%[[IV0]])[%[[N_3]], %[[N]]]
+//      CHECK:     %[[SV3:.+]] = memref.subview %[[ARG1]][0, %[[IV0]]]
+// CHECK-SAME:       [%[[K_2]], %[[TILE_N_3]]]
+//      CHECK:     %[[SV3_2:.+]] = memref.subview %[[ARG2]][0, %[[IV0]]]
+// CHECK-SAME:       [%[[K]], %[[TILE_N_3]]]
+//      CHECK:     linalg.copy(%[[SV3]], %[[SV3_2]])
+// CHECK-SAME:       __internal_linalg_transform__ = "after_rhs_fusion_producer"
+//  CHECK-NOT:     linalg.fill
+//  CHECK-DAG:     %[[M_2:.+]] = memref.dim %[[ARG0]], %[[C0]]
+//  CHECK-DAG:     %[[K_2:.+]] = memref.dim %[[ARG0]], %[[C1]]
+//      CHECK:     scf.parallel (%[[IV1:.+]]) =
+// CHECK-SAME:       (%[[C0]]) to (%[[M_2]]) step (%[[C32]]) {
+// CHECK-NEXT:       scf.for %[[IV2:.+]] = %[[C0]] to %[[K_2]] step %[[C16]] {
+//      CHECK:         %[[TILE_M:.+]] = affine.min #[[MAP2]](%[[IV1]])[%[[M_2]]]
+//      CHECK:         %[[TILE_K:.+]] = affine.min #[[MAP3]](%[[IV2]])[%[[K_2]]]
+//      CHECK:         %[[SV4:.+]] = memref.subview %[[ARG0]][%[[IV1]], %[[IV2]]]
+// CHECK-SAME:           [%[[TILE_M]], %[[TILE_K]]]
+//      CHECK:         %[[SV5:.+]] = memref.subview %[[SV1]][%[[IV2]], 0]
+// CHECK-SAME:           [%[[TILE_K]], %[[TILE_N]]]
+//      CHECK:         %[[SV6:.+]] = memref.subview %[[SV2]][%[[IV1]], 0]
+// CHECK-SAME:           [%[[TILE_M]], %[[TILE_N]]]
+//      CHECK:         linalg.matmul
+// CHECK-SAME:           __internal_linalg_transform__ = "after_rhs_fusion"
+// CHECK-SAME:           ins(%[[SV4]], %[[SV5]]
+// CHECK-SAME:             : memref<?x?xf32, #[[MAP1]]>, memref<?x?xf32, #[[MAP1]]>)
+// CHECK-SAME:           outs(%[[SV6]] : memref<?x?xf32, #[[MAP1]]>)
+//      CHECK:       }
+//      CHECK:     }
+//      CHECK:   }
+//      CHECK:   linalg.matmul
+// CHECK-SAME:     __internal_linalg_transform__ = "after_rhs_fusion_original"
+
+
+// -----
+
+module {
+  func @two_operand_fusion(%arg0: memref<?x?xf32>, %arg1: memref<?x?xf32>,
+                              %arg2: memref<?x?xf32>, %arg3: memref<?x?xf32>) {
+    %cst = arith.constant 0.000000e+00 : f32
+    linalg.copy(%arg0, %arg1) : memref<?x?xf32>, memref<?x?xf32>
+    linalg.fill(%cst, %arg3) : f32, memref<?x?xf32>
+    linalg.matmul {__internal_linalg_transform__ = "two_operand_fusion"}
+      ins(%arg1, %arg2 : memref<?x?xf32>, memref<?x?xf32>)
+      outs(%arg3 : memref<?x?xf32>)
+    return
+  }
+}
+//  CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0)[s0] -> (32, -d0 + s0)>
+//  CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>
+//  CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0)[s0] -> (16, -d0 + s0)>
+//  CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0)[s0] -> (64, -d0 + s0)>
+//  CHECK-DAG: #[[MAP4:.+]] = affine_map<(d0)[s0, s1] -> (-d0 + s0, 32, -d0 + s1)>
+//      CHECK: func @two_operand_fusion
+// CHECK-SAME:   %[[ARG0:[a-zA-Z0-9_]+]]: memref<?x?xf32>
+// CHECK-SAME:   %[[ARG1:[a-zA-Z0-9_]+]]: memref<?x?xf32>
+// CHECK-SAME:   %[[ARG2:[a-zA-Z0-9_]+]]: memref<?x?xf32>
+// CHECK-SAME:   %[[ARG3:[a-zA-Z0-9_]+]]: memref<?x?xf32>
+//  CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
+//  CHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
+//  CHECK-DAG:   %[[C32:.+]] = arith.constant 32 : index
+//  CHECK-DAG:   %[[C64:.+]] = arith.constant 64 : index
+//  CHECK-DAG:   %[[C16:.+]] = arith.constant 16 : index
+//  CHECK-DAG:   %[[CST:.+]] = arith.constant 0.0{{.*}} : f32
+//      CHECK:   linalg.copy(%[[ARG0]], %[[ARG1]])
+// CHECK-SAME:     __internal_linalg_transform__ = "after_two_operand_fusion_original"
+//      CHECK:   linalg.fill(%[[CST]], %[[ARG3]])
+// CHECK-SAME:     __internal_linalg_transform__ = "after_two_operand_fusion_original"
+//  CHECK-DAG:   %[[M:.+]] = memref.dim %[[ARG1]], %[[C0]]
+//      CHECK:   scf.parallel (%[[IV0:.+]]) =
+// CHECK-SAME:     (%[[C0]]) to (%[[M]]) step (%[[C32]]) {
+//      CHECK:     %[[TILE_M:.+]] = affine.min #[[MAP0]](%[[IV0]])[%[[M]]]
+//      CHECK:     %[[K:.+]] = memref.dim %[[ARG1]], %[[C1]]
+//      CHECK:     %[[SV1:.+]] = memref.subview %[[ARG1]][%[[IV0]], 0]
+// CHECK-SAME:       [%[[TILE_M]], %[[K]]]
+//      CHECK:     %[[N:.+]] = memref.dim %[[ARG3]], %[[C1]]
+//      CHECK:     %[[SV2:.+]] = memref.subview %[[ARG3]][%[[IV0]], 0]
+// CHECK-SAME:       [%[[TILE_M]], %[[N]]]
+//      CHECK:     %[[M_2:.+]] = memref.dim %[[ARG3]], %[[C0]]
+//      CHECK:     %[[TILE_M_3:.+]] = affine.min #[[MAP4]](%[[IV0]])[%[[M_2]], %[[M]]]
+//      CHECK:     %[[SV2_2:.+]] = memref.subview %[[ARG3]][%[[IV0]], 0]
+// CHECK-SAME:       [%[[TILE_M_3]], %[[N]]]
+//      CHECK:     %[[M_3:.+]] = memref.dim %[[ARG0]], %[[C0]]
+//      CHECK:     %[[TILE_M_4:.+]] = affine.min #[[MAP4]](%[[IV0]])[%[[M_3]], %[[M]]]
+//      CHECK:     %[[K_3:.+]] = memref.dim %[[ARG0]], %[[C1]]
+//      CHECK:     %[[SV3:.+]] = memref.subview %[[ARG0]][%[[IV0]], 0]
+// CHECK-SAME:       [%[[TILE_M_4]], %[[K_3]]]
+//      CHECK:     %[[SV3_2:.+]] = memref.subview %[[ARG1]][%[[IV0]], 0]
+// CHECK-SAME:       [%[[TILE_M_4]], %[[K]]]
+//      CHECK:     linalg.copy(%[[SV3]], %[[SV3_2]])
+// CHECK-SAME:       __internal_linalg_transform__ = "after_two_operand_fusion_producer"
+//      CHECK:     linalg.fill(%[[CST]], %[[SV2_2]])
+// CHECK-SAME:       __internal_linalg_transform__ = "after_two_operand_fusion_producer"
+//  CHECK-DAG:     %[[N_2:.+]] = memref.dim %[[ARG2]], %[[C1]]
+//      CHECK:     scf.parallel (%[[IV1:.+]]) =
+// CHECK-SAME:       (%[[C0]]) to (%[[N_2]]) step (%[[C64]]) {
+// CHECK-NEXT:       scf.for %[[IV2:.+]] = %[[C0]] to %[[K]] step %[[C16]] {
+//      CHECK:         %[[TILE_K:.+]] = affine.min #[[MAP2]](%[[IV2]])[%[[K]]]
+//      CHECK:         %[[SV4:.+]] = memref.subview %[[SV1]][0, %[[IV2]]]
+// CHECK-SAME:           [%[[TILE_M]], %[[TILE_K]]]
+//      CHECK:         %[[TILE_N:.+]] = affine.min #[[MAP3]](%[[IV1]])[%[[N_2]]]
+//      CHECK:         %[[SV5:.+]] = memref.subview %[[ARG2]][%[[IV2]], %[[IV1]]]
+// CHECK-SAME:           [%[[TILE_K]], %[[TILE_N]]]
+//      CHECK:         %[[SV6:.+]] = memref.subview %[[SV2]][0, %[[IV1]]]
+// CHECK-SAME:           [%[[TILE_M]], %[[TILE_N]]]
+//      CHECK:         linalg.matmul
+// CHECK-SAME:           __internal_linalg_transform__ = "after_two_operand_fusion"
+// CHECK-SAME:           ins(%[[SV4]], %[[SV5]]
+// CHECK-SAME:             : memref<?x?xf32, #[[MAP1]]>, memref<?x?xf32, #[[MAP1]]>)
+// CHECK-SAME:           outs(%[[SV6]] : memref<?x?xf32, #[[MAP1]]>)
+//      CHECK:       }
+//      CHECK:     }
+//      CHECK:   }
+//      CHECK:   linalg.matmul
+// CHECK-SAME:     __internal_linalg_transform__ = "after_two_operand_fusion_original"
+
+// -----
+
 module {
   func @matmul_fusion(%arg0: memref<?x?xf32>, %arg1: memref<?x?xf32>,
                       %arg2: memref<?x?xf32>, %arg3: memref<?x?xf32>,

diff  --git a/mlir/test/Dialect/Linalg/loops.mlir b/mlir/test/Dialect/Linalg/loops.mlir
index 4a3767b58b0f..cd820e2a6187 100644
--- a/mlir/test/Dialect/Linalg/loops.mlir
+++ b/mlir/test/Dialect/Linalg/loops.mlir
@@ -206,14 +206,7 @@ func @fill_view3(%arg0: memref<?x?x?xf32, offset: ?, strides: [?, ?, 1]>, %arg1:
 //       CHECKPARALLEL:     store %{{.*}}, {{.*}} : memref<?x?x?xf32, #[[$strided3D]]>
 
 func @copy_view(%arg0: memref<?xf32, offset: ?, strides: [1]>, %arg1: memref<?xf32, offset: ?, strides: [1]>) {
-  linalg.generic {
-    iterator_types = ["parallel"],
-    indexing_maps = [ affine_map<(i) -> (i)>, affine_map<(i) -> (i)>] }
-    ins(%arg0: memref<?xf32, offset: ?, strides: [1]>)
-   outs(%arg1: memref<?xf32, offset: ?, strides : [1]>) {
-    ^bb0(%a: f32, %b: f32):
-      linalg.yield %a : f32
-  }
+  linalg.copy(%arg0, %arg1) : memref<?xf32, offset: ?, strides: [1]>, memref<?xf32, offset: ?, strides: [1]>
   return
 }
 // CHECK-LABEL: func @copy_view(
@@ -228,6 +221,38 @@ func @copy_view(%arg0: memref<?xf32, offset: ?, strides: [1]>, %arg1: memref<?xf
 //       CHECKPARALLEL:     %[[L:.*]] = memref.load %{{.*}}[%{{.*}}] : memref<?xf32, #[[$strided1D]]>
 //       CHECKPARALLEL:     store %[[L]], %{{.*}}[%{{.*}}] : memref<?xf32, #[[$strided1D]]>
 
+func @copy_view0(%arg0: memref<f32>, %arg1: memref<f32>) {
+  linalg.copy(%arg0, %arg1) : memref<f32>, memref<f32>
+  return
+}
+// CHECK-LABEL: func @copy_view0(%{{.*}}: memref<f32>, %{{.*}}: memref<f32>) {
+//       CHECK:   memref.load %{{.*}}[] : memref<f32>
+//       CHECK:   store %{{.*}}, %{{.*}}[] : memref<f32>
+
+// CHECKPARALLEL-LABEL: func @copy_view0(%{{.*}}: memref<f32>, %{{.*}}: memref<f32>) {
+//       CHECKPARALLEL:   memref.load %{{.*}}[] : memref<f32>
+//       CHECKPARALLEL:   store %{{.*}}, %{{.*}}[] : memref<f32>
+
+func @copy_view3(%arg0: memref<?x?x?xf32, offset: ?, strides: [?, ?, 1]>, %arg1: memref<?x?x?xf32, offset: ?, strides: [?, ?, 1]>) {
+  linalg.copy(%arg0, %arg1) {inputPermutation = affine_map<(i, j, k) -> (i, k, j)>,
+                             outputPermutation = affine_map<(i, j, k) -> (k, j, i)>} :
+    memref<?x?x?xf32, offset: ?, strides: [?, ?, 1]>, memref<?x?x?xf32, offset: ?, strides: [?, ?, 1]>
+  return
+}
+// CHECK-LABEL: func @copy_view3
+//       CHECK: (%{{.*}}: memref<?x?x?xf32, #[[$strided3D]]>, %{{.*}}: memref<?x?x?xf32, #[[$strided3D]]>) {
+//       CHECK:   scf.for {{.*}} to %{{.*}}
+//       CHECK:     scf.for {{.*}} to %{{.*}}
+//       CHECK:       scf.for {{.*}} to %{{.*}}
+//       CHECK:         %[[L:.*]] = memref.load {{.*}} : memref<?x?x?xf32, #[[$strided3D]]>
+//       CHECK:         store %[[L]], {{.*}} : memref<?x?x?xf32, #[[$strided3D]]>
+
+// CHECKPARALLEL-LABEL: func @copy_view3
+//       CHECKPARALLEL: (%{{.*}}: memref<?x?x?xf32, #[[$strided3D]]>, %{{.*}}: memref<?x?x?xf32, #[[$strided3D]]>) {
+//       CHECKPARALLEL:   scf.parallel (%{{.*}}, %{{.*}}, %{{.*}}) = (%{{.*}}, %{{.*}}, %{{.*}}) to (%{{.*}}, %{{.*}}, %{{.*}}) step (%{{.*}}, %{{.*}}, %{{.*}}) {
+//       CHECKPARALLEL:     %[[L:.*]] = memref.load {{.*}} : memref<?x?x?xf32, #[[$strided3D]]>
+//       CHECKPARALLEL:     store %[[L]], {{.*}} : memref<?x?x?xf32, #[[$strided3D]]>
+
 #accesses = [
   affine_map<(i, j, k) -> (i, j)>,
   affine_map<(i, j, k) -> (i, j, k)>,
@@ -832,14 +857,8 @@ func @lower_to_loops_with_rank_reducing_subviews(
       : memref<?xi32> to memref<?xi32, offset: ?, strides: [1]>
   %1 = memref.subview %arg1[0, %arg4] [1, %arg3] [1, 1]
       : memref<?x?xi32> to memref<?xi32, offset: ?, strides : [1]>
-  linalg.generic {
-    iterator_types = ["parallel"],
-    indexing_maps = [affine_map<(i) -> (i)>, affine_map<(i) -> (i)>]}
-    ins(%0: memref<?xi32, offset: ?, strides: [1]>)
-   outs(%1: memref<?xi32, offset: ?, strides : [1]>) {
-    ^bb0(%a: i32, %b: i32):
-      linalg.yield %a : i32
-  }
+  linalg.copy(%0, %1)
+      : memref<?xi32, offset: ?, strides: [1]>, memref<?xi32, offset: ?, strides: [1]>
   return
 }
 // CHECK-LABEL: func @lower_to_loops_with_rank_reducing_subviews

diff  --git a/mlir/test/Dialect/Linalg/promote.mlir b/mlir/test/Dialect/Linalg/promote.mlir
index 98e6b069e475..1a0074d7c64a 100644
--- a/mlir/test/Dialect/Linalg/promote.mlir
+++ b/mlir/test/Dialect/Linalg/promote.mlir
@@ -62,14 +62,14 @@ func @matmul_f32(%A: memref<?xi8>, %M: index, %N: index, %K: index) {
 //     DYNAMIC:         memref.view %{{.*}}[{{.*}}][{{.*}}] : memref<?xi8> to memref<?x?xf32>
 //       CHECK:         %[[partialC:.*]] = memref.subview %[[fullC]]{{.*}} : memref<?x?xf32> to memref<?x?xf32, #[[$strided2D]]>
 
-//       CHECK:         emref.copy %[[vA]], %[[partialA]] : memref<?x?xf32, #[[$strided2D]]> to memref<?x?xf32, #[[$strided2D]]>
-//       CHECK:         memref.copy %[[vB]], %[[partialB]] : memref<?x?xf32, #[[$strided2D]]> to memref<?x?xf32, #[[$strided2D]]>
-//       CHECK:         memref.copy %[[vC]], %[[partialC]] : memref<?x?xf32, #[[$strided2D]]> to memref<?x?xf32, #[[$strided2D]]>
+//       CHECK:         linalg.copy(%[[vA]], %[[partialA]]) : memref<?x?xf32, #[[$strided2D]]>, memref<?x?xf32, #[[$strided2D]]>
+//       CHECK:         linalg.copy(%[[vB]], %[[partialB]]) : memref<?x?xf32, #[[$strided2D]]>, memref<?x?xf32, #[[$strided2D]]>
+//       CHECK:         linalg.copy(%[[vC]], %[[partialC]]) : memref<?x?xf32, #[[$strided2D]]>, memref<?x?xf32, #[[$strided2D]]>
 //
 //       CHECK:         linalg.matmul ins(%[[partialA]], %[[partialB]]{{.*}} outs(%[[partialC]]
 //
-//       CHECK:         memref.copy %[[partialC]], %[[vC]] :
-//       CHECK:           memref<?x?xf32, #[[$strided2D]]> to
+//       CHECK:         linalg.copy(%[[partialC]], %[[vC]]) :
+//       CHECK:           memref<?x?xf32, #[[$strided2D]]>,
 //       CHECK:           memref<?x?xf32, #[[$strided2D]]>
 //
 //       CHECK:         memref.dealloc %[[tmpA]] : memref<32xi8>
@@ -132,14 +132,14 @@ func @matmul_f64(%A: memref<?xi8>, %M: index, %N: index, %K: index) {
 //     DYNAMIC:         memref.view %{{.*}}[{{.*}}][{{.*}}] : memref<?xi8> to memref<?x?xf64>
 //       CHECK:         %[[partialC_f64:.*]] = memref.subview %[[fullC_f64]][0, 0] [%{{.*}}, %{{.*}}] [1, 1] : memref<?x?xf64> to memref<?x?xf64, #[[$strided2D]]>
 
-//       CHECK:         memref.copy %[[vA_f64]], %[[partialA_f64]] : memref<?x?xf64, #[[$strided2D]]> to memref<?x?xf64, #[[$strided2D]]>
-//       CHECK:         memref.copy %[[vB_f64]], %[[partialB_f64]] : memref<?x?xf64, #[[$strided2D]]> to memref<?x?xf64, #[[$strided2D]]>
-//       CHECK:         memref.copy %[[vC_f64]], %[[partialC_f64]] : memref<?x?xf64, #[[$strided2D]]> to memref<?x?xf64, #[[$strided2D]]>
+//       CHECK:         linalg.copy(%[[vA_f64]], %[[partialA_f64]]) : memref<?x?xf64, #[[$strided2D]]>, memref<?x?xf64, #[[$strided2D]]>
+//       CHECK:         linalg.copy(%[[vB_f64]], %[[partialB_f64]]) : memref<?x?xf64, #[[$strided2D]]>, memref<?x?xf64, #[[$strided2D]]>
+//       CHECK:         linalg.copy(%[[vC_f64]], %[[partialC_f64]]) : memref<?x?xf64, #[[$strided2D]]>, memref<?x?xf64, #[[$strided2D]]>
 //
 //       CHECK:         linalg.matmul ins(%[[partialA_f64]], %[[partialB_f64]]{{.*}} outs(%[[partialC_f64]]
 //
-//       CHECK:         memref.copy %[[partialC_f64]], %[[vC_f64]] :
-//       CHECK:           memref<?x?xf64, #[[$strided2D]]> to
+//       CHECK:         linalg.copy(%[[partialC_f64]], %[[vC_f64]]) :
+//       CHECK:           memref<?x?xf64, #[[$strided2D]]>,
 //       CHECK:           memref<?x?xf64, #[[$strided2D]]>
 //
 //       CHECK:         memref.dealloc %[[tmpA_f64]] : memref<64xi8>

diff  --git a/mlir/test/Dialect/Linalg/promotion_options.mlir b/mlir/test/Dialect/Linalg/promotion_options.mlir
index 9f0679f5d433..17317df59329 100644
--- a/mlir/test/Dialect/Linalg/promotion_options.mlir
+++ b/mlir/test/Dialect/Linalg/promotion_options.mlir
@@ -24,11 +24,11 @@ func @gemm(%a : memref<?x?xf32>, %b : memref<?x?xf32>, %c : memref<?x?xf32>)
 //      CHECK:       %[[T20:.+]] = memref.alloc(%{{.*}}, %{{.*}}) : memref<?x?xf32, 3>
 //      CHECK:       %[[T21:.+]] = memref.subview %[[T20]]
 //      CHECK:       linalg.fill(%[[C42]], %[[T19]])
-//      CHECK:       memref.copy %[[T7]], %[[T19]]
+//      CHECK:       linalg.copy(%[[T7]], %[[T19]])
 //      CHECK:       linalg.fill(%[[C42]], %[[T21]])
-//      CHECK:       memref.copy %[[T17]], %[[T21]]
+//      CHECK:       linalg.copy(%[[T17]], %[[T21]])
 //      CHECK:       linalg.matmul ins(%[[T19]], %[[T12]]{{.*}} outs(%[[T21]]
 //  CHECK-NOT:       linalg.fill
-//      CHECK:       memref.copy %[[T21]], %[[T17]]
+//      CHECK:       linalg.copy(%[[T21]], %[[T17]])
 //      CHECK:       memref.dealloc %[[T18]]
 //      CHECK:       memref.dealloc %[[T20]]

diff  --git a/mlir/test/Dialect/Linalg/roundtrip.mlir b/mlir/test/Dialect/Linalg/roundtrip.mlir
index 3c9e338b91af..337b7c0ad2b7 100644
--- a/mlir/test/Dialect/Linalg/roundtrip.mlir
+++ b/mlir/test/Dialect/Linalg/roundtrip.mlir
@@ -8,6 +8,8 @@
 
 // CHECK-DAG: #[[$id_2d:.*]] = affine_map<(d0, d1, d2) -> (d0, d2)>
 // CHECK-DAG: #[[$id_1d:.*]] = affine_map<(d0, d1, d2) -> (d1)>
+// CHECK-DAG: #[[$permute_0:.*]] = affine_map<(d0, d1, d2) -> (d0, d2, d1)>
+// CHECK-DAG: #[[$permute_1:.*]] = affine_map<(d0, d1, d2) -> (d2, d1, d0)>
 // CHECK-DAG: #[[$strided1D:.*]] = affine_map<(d0)[s0] -> (d0 + s0)>
 // CHECK-DAG: #[[$strided2D:.*]] = affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>
 // CHECK-DAG: #[[$strided3D:.*]] = affine_map<(d0, d1, d2)[s0, s1, s2] -> (d0 * s1 + s0 + d1 * s2 + d2)>
@@ -95,6 +97,37 @@ func @fill_view3(%arg0: memref<?x?x?xf32, offset: ?, strides: [?, ?, 1]>, %arg1:
 
 // -----
 
+
+func @copy_view(%arg0: memref<?xf32, offset: ?, strides: [1]>,
+                %arg1: memref<?xf32, offset: ?, strides: [1]>) {
+  linalg.copy(%arg0, %arg1) : memref<?xf32, offset: ?, strides: [1]>,
+                              memref<?xf32, offset: ?, strides: [1]>
+  return
+}
+// CHECK-LABEL: func @copy_view(
+//       CHECK:   linalg.copy(%{{.*}}, %{{.*}}) :
+//  CHECK-SAME:     memref<?xf32, #[[$strided1D]]>, memref<?xf32, #[[$strided1D]]>
+
+// -----
+
+
+func @copy_view3(%arg0: memref<?x?x?xf32, offset: ?, strides: [?, ?, 1]>,
+                 %arg1: memref<?x?x?xf32, offset: ?, strides: [?, ?, 1]>) {
+  linalg.copy(%arg0, %arg1) {inputPermutation = affine_map<(i, j, k) -> (i, k, j)>,
+                             outputPermutation = affine_map<(i, j, k) -> (k, j, i)>} :
+    memref<?x?x?xf32, offset: ?, strides: [?, ?, 1]>, memref<?x?x?xf32, offset: ?, strides: [?, ?, 1]>
+  return
+}
+// CHECK-LABEL: func @copy_view3(
+//       CHECK:  %{{.*}}: memref<?x?x?xf32, #[[$strided3D]]>, %{{.*}}: memref<?x?x?xf32, #[[$strided3D]]>) {
+//       CHECK:   linalg.copy(%{{.*}}, %{{.*}}) {
+//  CHECK-SAME:     inputPermutation = #[[$permute_0]],
+//  CHECK-SAME:     outputPermutation = #[[$permute_1]]} :
+//  CHECK-SAME:     memref<?x?x?xf32, #[[$strided3D]]>,
+//  CHECK-SAME:     memref<?x?x?xf32, #[[$strided3D]]>
+
+// -----
+
 #accesses_0 = [
   affine_map<(i, j, k) -> (j, i)>,
   affine_map<(i, j, k) -> ()>,

diff  --git a/mlir/test/Dialect/Linalg/standard.mlir b/mlir/test/Dialect/Linalg/standard.mlir
index 83544f69caf3..246f7c39d2ec 100644
--- a/mlir/test/Dialect/Linalg/standard.mlir
+++ b/mlir/test/Dialect/Linalg/standard.mlir
@@ -1,8 +1,12 @@
 // RUN: mlir-opt %s -convert-linalg-to-std | FileCheck %s
 
 // CHECK-DAG: #[[$map0:.*]] = affine_map<(d0)[s0] -> (d0 + s0)>
+// CHECK-DAG: #[[$map1:.*]] = affine_map<(d0, d1, d2)[s0, s1, s2] -> (d0 * s1 + s0 + d1 * s2 + d2)>
+// CHECK-DAG: #[[$map2:.*]] = affine_map<(d0, d1, d2)[s0, s1, s2] -> (d0 * s1 + s0 + d2 * s2 + d1)>
+// CHECK-DAG: #[[$map4:.*]] = affine_map<(d0, d1, d2)[s0, s1, s2] -> (d2 * s1 + s0 + d1 * s2 + d0)>
 // CHECK-DAG: #[[$map6:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>
 // CHECK-DAG: #[[$map7:.*]] = affine_map<()[s0] -> (s0)>
+// CHECK-DAG: #[[$map8:.*]] = affine_map<(d0, d1, d2)[s0, s1, s2, s3] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3)>
 
 func @dot(%arg0: memref<?xf32, offset: ?, strides: [1]>,
           %arg1: memref<?xf32, offset: ?, strides: [1]>,
@@ -26,6 +30,40 @@ func @dot(%arg0: memref<?xf32, offset: ?, strides: [1]>,
 //  CHECK-SAME:     %[[o0]], %[[o1]], %[[o2]]) :
 //  CHECK-SAME:   memref<?xf32, #[[$map6]]>, memref<?xf32, #[[$map6]]>, memref<f32, #[[$map7]]>
 
+func @copy(%arg0: memref<?x?x?xf32, offset: ?, strides: [?, ?, 1]>, %arg1: memref<?x?x?xf32, offset: ?, strides: [?, ?, 1]>) {
+  linalg.copy(%arg0, %arg1) : memref<?x?x?xf32, offset: ?, strides: [?, ?, 1]>, memref<?x?x?xf32, offset: ?, strides: [?, ?, 1]>
+  return
+}
+// CHECK-LABEL: func @copy(
+//  CHECK-SAME: %[[arg0:[a-zA-z0-9]*]]: memref<?x?x?xf32, #[[$map1]]>,
+//  CHECK-SAME: %[[arg1:[a-zA-z0-9]*]]: memref<?x?x?xf32, #[[$map1]]>) {
+//       CHECK:   %[[o0:.*]] = memref.cast %[[arg0]] :
+//  CHECK-SAME:     memref<?x?x?xf32, #[[$map1]]> to memref<?x?x?xf32, #[[$map8]]>
+//       CHECK:   %[[o1:.*]] = memref.cast %[[arg1]] :
+//  CHECK-SAME:     memref<?x?x?xf32, #[[$map1]]> to memref<?x?x?xf32, #[[$map8]]>
+//       CHECK:   call @linalg_copy_viewsxsxsxf32_viewsxsxsxf32(%[[o0]], %[[o1]]) :
+//  CHECK-SAME:   memref<?x?x?xf32, #[[$map8]]>, memref<?x?x?xf32, #[[$map8]]>
+
+func @copy_transpose(%arg0: memref<?x?x?xf32, offset: ?, strides: [?, ?, 1]>, %arg1: memref<?x?x?xf32, offset: ?, strides: [?, ?, 1]>) {
+  linalg.copy(%arg0, %arg1) {inputPermutation = affine_map<(i, j, k) -> (i, k, j)>,
+                             outputPermutation = affine_map<(i, j, k) -> (k, j, i)>}
+    : memref<?x?x?xf32, offset: ?, strides: [?, ?, 1]>, memref<?x?x?xf32, offset: ?, strides: [?, ?, 1]>
+  return
+}
+// CHECK-LABEL: func @copy_transpose(
+//  CHECK-SAME: %[[arg0:[a-zA-z0-9]*]]: memref<?x?x?xf32, #[[$map1]]>,
+//  CHECK-SAME: %[[arg1:[a-zA-z0-9]*]]: memref<?x?x?xf32, #[[$map1]]>) {
+//       CHECK:   %[[t0:.*]] = memref.transpose %[[arg0]]
+//  CHECK-SAME:     (d0, d1, d2) -> (d0, d2, d1) : memref<?x?x?xf32, #[[$map1]]>
+//       CHECK:   %[[t1:.*]] = memref.transpose %[[arg1]]
+//  CHECK-SAME:     (d0, d1, d2) -> (d2, d1, d0) : memref<?x?x?xf32, #[[$map1]]>
+//       CHECK:   %[[o0:.*]] = memref.cast %[[t0]] :
+//  CHECK-SAME:     memref<?x?x?xf32, #[[$map2]]> to memref<?x?x?xf32, #[[$map8]]>
+//       CHECK:   %[[o1:.*]] = memref.cast %[[t1]] :
+//  CHECK-SAME:     memref<?x?x?xf32, #[[$map4]]> to memref<?x?x?xf32, #[[$map8]]>
+//       CHECK:   call @linalg_copy_viewsxsxsxf32_viewsxsxsxf32(%[[o0]], %[[o1]]) :
+//  CHECK-SAME:   memref<?x?x?xf32, #[[$map8]]>, memref<?x?x?xf32, #[[$map8]]>
+
 #matmul_accesses = [
   affine_map<(m, n, k) -> (m, k)>,
   affine_map<(m, n, k) -> (k, n)>,

diff  --git a/mlir/test/Dialect/Linalg/transform-patterns-matmul-to-vector.mlir b/mlir/test/Dialect/Linalg/transform-patterns-matmul-to-vector.mlir
new file mode 100644
index 000000000000..3b3e64d5f59c
--- /dev/null
+++ b/mlir/test/Dialect/Linalg/transform-patterns-matmul-to-vector.mlir
@@ -0,0 +1,46 @@
+// RUN: mlir-opt %s -test-linalg-transform-patterns=test-matmul-to-vector-patterns-tile-1d | FileCheck %s -check-prefix=CHECK-1D
+// RUN: mlir-opt %s -test-linalg-transform-patterns=test-matmul-to-vector-patterns-tile-2d | FileCheck %s -check-prefix=CHECK-2D
+
+func @matmul(%A: memref<1584x1584xf32, offset: 0, strides: [1584, 1]>,
+                  %B: memref<1584x1584xf32, offset: 0, strides: [1584, 1]>,
+                  %C: memref<1584x1584xf32, offset: 0, strides: [1584, 1]>) {
+  linalg.matmul {__internal_linalg_transform__ = "START"}
+    ins(%A, %B: memref<1584x1584xf32, offset: 0, strides: [1584, 1]>,
+                memref<1584x1584xf32, offset: 0, strides: [1584, 1]>)
+   outs(%C: memref<1584x1584xf32, offset: 0, strides: [1584, 1]>)
+  return
+}
+
+// CHECK-1D-LABEL:func @matmul
+//      CHECK-1D: vector.transfer_write {{.*}} : vector<8x16xf32>, memref<8x16xf32>
+//      CHECK-1D: vector.transfer_write {{.*}} : vector<16x12xf32>, memref<16x12xf32>
+//      CHECK-1D: vector.transfer_write {{.*}} : vector<8x12xf32>, memref<8x12xf32>
+//
+//      CHECK-1D: vector.transfer_read {{.*}} : memref<8x16xf32, #{{.*}}>, vector<8x16xf32>
+//      CHECK-1D: vector.transfer_write {{.*}} : vector<8x16xf32>, memref<8x16xf32>
+//      CHECK-1D: vector.transfer_read {{.*}} : memref<16x12xf32, #{{.*}}>, vector<16x12xf32>
+//      CHECK-1D: vector.transfer_write {{.*}} : vector<16x12xf32>, memref<16x12xf32>
+//      CHECK-1D: vector.transfer_read {{.*}} : memref<8x12xf32, #{{.*}}>, vector<8x12xf32>
+//      CHECK-1D: vector.transfer_write {{.*}} : vector<8x12xf32>, memref<8x12xf32>
+//
+//      CHECK-1D: vector.contract
+// CHECK-1D-SAME:   iterator_types = ["parallel", "parallel", "reduction"]
+// CHECK-1D-SAME:   : vector<8x16xf32>, vector<16x12xf32> into vector<8x12xf32>
+//
+//      CHECK-1D: vector.transfer_read {{.*}} : memref<8x12xf32>, vector<8x12xf32>
+//      CHECK-1D: vector.transfer_write {{.*}} : vector<8x12xf32>, memref<8x12xf32, #{{.*}}>
+
+// CHECK-2D-LABEL:func @matmul
+//      CHECK-2D: vector.transfer_write {{.*}} : vector<8x16xf32>, memref<8x16xf32>
+//      CHECK-2D: vector.transfer_write {{.*}} : vector<16x12xf32>, memref<16x12xf32>
+//      CHECK-2D: vector.transfer_write {{.*}} : vector<8x12xf32>, memref<8x12xf32>
+//
+//      CHECK-2D: linalg.copy
+//      CHECK-2D: linalg.copy
+//      CHECK-2D: linalg.copy
+//
+//      CHECK-2D: vector.contract
+// CHECK-2D-SAME:   iterator_types = ["parallel", "parallel", "reduction"]
+// CHECK-2D-SAME:   : vector<8x16xf32>, vector<16x12xf32> into vector<8x12xf32>
+//
+//      CHECK-2D: linalg.copy

diff  --git a/mlir/test/Dialect/Linalg/transform-patterns.mlir b/mlir/test/Dialect/Linalg/transform-patterns.mlir
index a16b4bd85f1c..2171bf3f64bb 100644
--- a/mlir/test/Dialect/Linalg/transform-patterns.mlir
+++ b/mlir/test/Dialect/Linalg/transform-patterns.mlir
@@ -229,9 +229,9 @@ func @promote_subview_matmul(%arg0: memref<?x?xf32, offset: ?, strides: [?, 1]>,
 // CHECK:               %[[v2:.*]] = memref.view %[[a2]]{{.*}} : memref<24000000xi8> to memref<?x?xf32>
 // CHECK:               %[[l2:.*]] = memref.subview %[[v2]][0, 0] [%{{.*}}, %{{.*}}] [1, 1]
 // CHECK-SAME:            memref<?x?xf32> to memref<?x?xf32, #[[$STRIDED_2D_u_1]]>
-// CHECK:               memref.copy %[[s0]], %[[l0]] : memref<?x?xf32, #map{{.*}}> to memref<?x?xf32, #map{{.*}}>
-// CHECK:               memref.copy %[[s1]], %[[l1]] : memref<?x?xf32, #map{{.*}}> to memref<?x?xf32, #map{{.*}}>
-// CHECK:               memref.copy %[[s2]], %[[l2]] : memref<?x?xf32, #map{{.*}}> to memref<?x?xf32, #map{{.*}}>
+// CHECK:               linalg.copy(%[[s0]], %[[l0]]) : memref<?x?xf32, #map{{.*}}>, memref<?x?xf32, #map{{.*}}>
+// CHECK:               linalg.copy(%[[s1]], %[[l1]]) : memref<?x?xf32, #map{{.*}}>, memref<?x?xf32, #map{{.*}}>
+// CHECK:               linalg.copy(%[[s2]], %[[l2]]) : memref<?x?xf32, #map{{.*}}>, memref<?x?xf32, #map{{.*}}>
 // CHECK:               linalg.matmul
 // CHECK-SAME:                 ins(%[[v0]], %[[v1]] : memref<?x?xf32>, memref<?x?xf32>)
 // CHECK-SAME:                outs(%[[v2]] : memref<?x?xf32>)
@@ -282,8 +282,8 @@ func @promote_first_subview_matmul(%arg0: memref<?x?xf32, offset: ?, strides: [?
 // CHECK-NOT:     memref.alloc
 // CHECK-NOT:     memref.view
 // CHECK-NOT:     memref.subview
-// CHECK:         memref.copy %[[s0]], %[[l0]] : memref<?x?xf32, #map{{.*}}> to memref<?x?xf32, #map{{.*}}>
-// CHECK-NOT:     memref.copy
+// CHECK:         linalg.copy(%[[s0]], %[[l0]]) : memref<?x?xf32, #map{{.*}}>, memref<?x?xf32, #map{{.*}}>
+// CHECK-NOT:     linalg.copy
 // CHECK:         linalg.matmul
 // CHECK-SAME:           ins(%[[v0]], %[[s1]] : memref<?x?xf32>, memref<?x?xf32, #[[$STRIDED_2D]]>)
 // CHECK-SAME:          outs(%[[s2]] : memref<?x?xf32, #[[$STRIDED_2D]]>)
@@ -307,7 +307,7 @@ func @aligned_promote_fill(%arg0: memref<?x?xf32, offset: ?, strides: [?, 1]>) {
 // CHECK:         %[[v0:.*]] = memref.view %[[a0]]{{.*}} : memref<32000000xi8> to memref<?x?xf32>
 // CHECK:         %[[l0:.*]] = memref.subview %[[v0]][0, 0] [%{{.*}}, %{{.*}}] [1, 1] : memref<?x?xf32> to memref<?x?xf32, #[[$STRIDED_2D_u_1]]>
 // CHECK:         linalg.fill({{.*}}, %[[v0]]) : f32, memref<?x?xf32>
-// CHECK:         memref.copy %[[s0]], %[[l0]] : memref<?x?xf32, #map{{.*}}> to memref<?x?xf32, #map{{.*}}>
+// CHECK:         linalg.copy(%[[s0]], %[[l0]]) : memref<?x?xf32, #map{{.*}}>, memref<?x?xf32, #map{{.*}}>
 // CHECK:         linalg.fill(%[[cf]], %[[v0]]) : f32, memref<?x?xf32>
 
 func @aligned_promote_fill_complex(%arg0: memref<?x?xcomplex<f32>, offset: ?, strides: [?, 1]>) {
@@ -330,7 +330,7 @@ func @aligned_promote_fill_complex(%arg0: memref<?x?xcomplex<f32>, offset: ?, st
 // CHECK:         %[[v0:.*]] = memref.view %[[a0]]{{.*}} : memref<64000000xi8> to memref<?x?xcomplex<f32>>
 // CHECK:         %[[l0:.*]] = memref.subview %[[v0]][0, 0] [%{{.*}}, %{{.*}}] [1, 1] : memref<?x?xcomplex<f32>> to memref<?x?xcomplex<f32>, #[[$STRIDED_2D_u_1]]>
 // CHECK:         linalg.fill({{.*}}, %[[v0]]) : complex<f32>, memref<?x?xcomplex<f32>>
-// CHECK:         memref.copy %[[s0]], %[[l0]] : memref<?x?xcomplex<f32>, #map{{.*}}> to memref<?x?xcomplex<f32>, #map{{.*}}>
+// CHECK:         linalg.copy(%[[s0]], %[[l0]]) : memref<?x?xcomplex<f32>, #map{{.*}}>, memref<?x?xcomplex<f32>, #map{{.*}}>
 // CHECK:         linalg.fill(%[[cc]], %[[v0]]) : complex<f32>, memref<?x?xcomplex<f32>>
 
 func @tile_permute_parallel_loop(%arg0: memref<?x?xf32>,

diff  --git a/mlir/test/Dialect/Linalg/vectorization.mlir b/mlir/test/Dialect/Linalg/vectorization.mlir
index 45a6da2806c7..c9f50af28ef2 100644
--- a/mlir/test/Dialect/Linalg/vectorization.mlir
+++ b/mlir/test/Dialect/Linalg/vectorization.mlir
@@ -212,7 +212,7 @@ func @test_vectorize_fill_scalar(%A : memref<f32>, %arg0 : f32) {
 func @test_vectorize_copy(%A : memref<8x16xf32>, %B : memref<8x16xf32>) {
   //       CHECK: %[[V:.*]] = vector.transfer_read {{.*}} : memref<8x16xf32>, vector<8x16xf32>
   //       CHECK: vector.transfer_write %[[V]], {{.*}} : vector<8x16xf32>, memref<8x16xf32>
-  memref.copy %A, %B :  memref<8x16xf32> to memref<8x16xf32>
+  linalg.copy(%A, %B) :  memref<8x16xf32>, memref<8x16xf32>
   return
 }
 
@@ -225,7 +225,7 @@ func @test_vectorize_copy_scalar(%A : memref<f32>, %B : memref<f32>) {
   //       CHECK:   %[[val:.*]] = vector.extractelement %[[V]][] : vector<f32>
   //       CHECK:   %[[VV:.*]] = vector.broadcast %[[val]] : f32 to vector<f32>
   //       CHECK:   vector.transfer_write %[[VV]], %[[B]][] : vector<f32>, memref<f32>
-  memref.copy %A, %B :  memref<f32> to memref<f32>
+  linalg.copy(%A, %B) :  memref<f32>, memref<f32>
   return
 }
 
@@ -462,7 +462,7 @@ func @generic_vectorize_broadcast_transpose(
   iterator_types = ["parallel", "parallel", "parallel", "parallel"]}
   ins(%B, %A, %A, %B: memref<4x4xf32>, memref<4xf32>, memref<4xf32>, memref<4x4xf32>)
   outs(%C : memref<4x4x4x4xf32>) {
-  ^bb0(%arg0: f32, %arg1: f32, %arg2: f32, %arg3: f32, %arg4: f32):
+  ^bb0(%arg0: f32, %arg1: f32, %arg2: f32, %arg3: f32, %arg4: f32):  
     %s = arith.subf %arg0, %arg1 : f32
     %a = arith.addf %arg2, %s : f32
     %b = arith.addf %arg3, %a : f32
@@ -775,7 +775,7 @@ func @sum_exp(%input: tensor<4x16x8xf32>, %output: tensor<4x16xf32>)
       ],
       iterator_types = ["parallel", "parallel", "reduction"]
     } ins(%input : tensor<4x16x8xf32>) outs(%output : tensor<4x16xf32>) {
-    ^bb0(%arg0: f32, %arg1: f32):
+    ^bb0(%arg0: f32, %arg1: f32):  
       %1 = math.exp %arg0 : f32
       %2 = arith.addf %1, %arg1 : f32
       linalg.yield %2 : f32
@@ -811,7 +811,7 @@ func @sum_exp_2(%input: tensor<3x2xf32>, %input_2: tensor<5x4xf32>, %output: ten
       ],
       iterator_types = ["parallel", "reduction", "reduction", "parallel"]
     } ins(%input, %input_2 : tensor<3x2xf32>, tensor<5x4xf32>) outs(%output : tensor<5x2xf32>) {
-    ^bb0(%arg0: f32, %arg1: f32, %arg2: f32):
+    ^bb0(%arg0: f32, %arg1: f32, %arg2: f32):  
       %1 = math.exp %arg0 : f32
       %2 = math.exp %arg1 : f32
       %3 = arith.addf %1, %2 : f32
@@ -838,7 +838,7 @@ func @red_max_2d(%arg0: tensor<4x4xf32>) -> tensor<4xf32> {
                                           affine_map<(d0, d1) -> (d0)>],
                          iterator_types = ["parallel", "reduction"]}
                          ins(%arg0 : tensor<4x4xf32>) outs(%fill : tensor<4xf32>) {
-  ^bb0(%in0: f32, %out0: f32):
+  ^bb0(%in0: f32, %out0: f32):  
     %max = arith.maxf %in0, %out0 : f32
     linalg.yield %max : f32
   } -> tensor<4xf32>
@@ -863,7 +863,7 @@ func @red_min_2d(%arg0: tensor<4x4xf32>) -> tensor<4xf32> {
                                           affine_map<(d0, d1) -> (d0)>],
                          iterator_types = ["parallel", "reduction"]}
                          ins(%arg0 : tensor<4x4xf32>) outs(%fill : tensor<4xf32>) {
-  ^bb0(%in0: f32, %out0: f32):
+  ^bb0(%in0: f32, %out0: f32):  
     %min = arith.minf %out0, %in0 : f32
     linalg.yield %min : f32
   } -> tensor<4xf32>
@@ -887,7 +887,7 @@ func @red_mul_2d(%arg0: tensor<4x4xf32>) -> tensor<4xf32> {
                                           affine_map<(d0, d1) -> (d0)>],
                          iterator_types = ["parallel", "reduction"]}
                          ins(%arg0 : tensor<4x4xf32>) outs(%fill : tensor<4xf32>) {
-  ^bb0(%in0: f32, %out0: f32):
+  ^bb0(%in0: f32, %out0: f32):  
     %mul = arith.mulf %in0, %out0 : f32
     linalg.yield %mul : f32
   } -> tensor<4xf32>
@@ -910,7 +910,7 @@ func @red_or_2d(%arg0: tensor<4x4xi1>) -> tensor<4xi1> {
                                           affine_map<(d0, d1) -> (d0)>],
                          iterator_types = ["parallel", "reduction"]}
                          ins(%arg0 : tensor<4x4xi1>) outs(%fill : tensor<4xi1>) {
-  ^bb0(%in0: i1, %out0: i1):
+  ^bb0(%in0: i1, %out0: i1):  
     %or = arith.ori %in0, %out0 : i1
     linalg.yield %or : i1
   } -> tensor<4xi1>
@@ -933,7 +933,7 @@ func @red_and_2d(%arg0: tensor<4x4xi1>) -> tensor<4xi1> {
                                           affine_map<(d0, d1) -> (d0)>],
                          iterator_types = ["parallel", "reduction"]}
                          ins(%arg0 : tensor<4x4xi1>) outs(%fill : tensor<4xi1>) {
-  ^bb0(%in0: i1, %out0: i1):
+  ^bb0(%in0: i1, %out0: i1):  
     %and = arith.andi %in0, %out0 : i1
     linalg.yield %and : i1
   } -> tensor<4xi1>
@@ -956,7 +956,7 @@ func @red_xor_2d(%arg0: tensor<4x4xi1>) -> tensor<4xi1> {
                                           affine_map<(d0, d1) -> (d0)>],
                          iterator_types = ["parallel", "reduction"]}
                          ins(%arg0 : tensor<4x4xi1>) outs(%fill : tensor<4xi1>) {
-  ^bb0(%in0: i1, %out0: i1):
+  ^bb0(%in0: i1, %out0: i1):  
     %xor = arith.xori %in0, %out0 : i1
     linalg.yield %xor : i1
   } -> tensor<4xi1>
@@ -1051,7 +1051,7 @@ func @reduce_1d(%arg0: tensor<32xf32>) -> tensor<f32> {
          iterator_types = ["reduction"]}
          ins(%arg0 : tensor<32xf32>)
          outs(%1 : tensor<f32>) {
-    ^bb0(%a: f32, %b: f32):
+    ^bb0(%a: f32, %b: f32):  
       %3 = arith.addf %a, %b : f32
       linalg.yield %3 : f32
     } -> tensor<f32>

diff  --git a/mlir/test/Dialect/Vector/vector-transfer-full-partial-split.mlir b/mlir/test/Dialect/Vector/vector-transfer-full-partial-split.mlir
index 9a10482e027a..2497c2623bdd 100644
--- a/mlir/test/Dialect/Vector/vector-transfer-full-partial-split.mlir
+++ b/mlir/test/Dialect/Vector/vector-transfer-full-partial-split.mlir
@@ -1,5 +1,5 @@
 // RUN: mlir-opt %s -test-vector-transfer-full-partial-split -split-input-file | FileCheck %s
-// RUN: mlir-opt %s -test-vector-transfer-full-partial-split=use-memref-copy -split-input-file | FileCheck %s --check-prefix=LINALG
+// RUN: mlir-opt %s -test-vector-transfer-full-partial-split=use-linalg-copy -split-input-file | FileCheck %s --check-prefix=LINALG
 
 // CHECK-DAG: #[[$map_p4:.*]] = affine_map<()[s0] -> (s0 + 4)>
 // CHECK-DAG: #[[$map_p8:.*]] = affine_map<()[s0] -> (s0 + 8)>
@@ -82,7 +82,7 @@ func @split_vector_transfer_read_2d(%A: memref<?x8xf32>, %i: index, %j: index) -
   //      LINALG:   %[[sv:.*]] = memref.subview %[[A]][%[[i]], %[[j]]] [%[[sv0]], %[[sv1]]] [1, 1]
   // LINALG-SAME:     memref<?x8xf32> to memref<?x?xf32, #[[$map_2d_stride_8x1]]>
   //      LINALG:   %[[alloc_view:.*]] = memref.subview %[[alloc]][0, 0] [%[[sv0]], %[[sv1]]] [1, 1]
-  //      LINALG:   memref.copy %[[sv]], %[[alloc_view]] : memref<?x?xf32, #[[$map_2d_stride_8x1]]> to memref<?x?xf32, #{{.*}}>
+  //      LINALG:   linalg.copy(%[[sv]], %[[alloc_view]]) : memref<?x?xf32, #[[$map_2d_stride_8x1]]>, memref<?x?xf32, #{{.*}}>
   //      LINALG:   %[[yielded:.*]] = memref.cast %[[alloc]] :
   // LINALG-SAME:     memref<4x8xf32> to memref<?x8xf32>
   //      LINALG:   scf.yield %[[yielded]], %[[c0]], %[[c0]] :
@@ -174,7 +174,7 @@ func @split_vector_transfer_read_strided_2d(
   //      LINALG:   %[[sv:.*]] = memref.subview %[[A]][%[[i]], %[[j]]] [%[[sv0]], %[[sv1]]] [1, 1]
   // LINALG-SAME:     memref<7x8xf32, #[[$map_2d_stride_1]]> to memref<?x?xf32, #[[$map_2d_stride_1]]>
   //      LINALG:   %[[alloc_view:.*]] = memref.subview %[[alloc]][0, 0] [%[[sv0]], %[[sv1]]] [1, 1]
-  //      LINALG:   memref.copy %[[sv]], %[[alloc_view]] : memref<?x?xf32, #[[$map_2d_stride_1]]> to memref<?x?xf32, #{{.*}}>
+  //      LINALG:   linalg.copy(%[[sv]], %[[alloc_view]]) : memref<?x?xf32, #[[$map_2d_stride_1]]>, memref<?x?xf32, #{{.*}}>
   //      LINALG:   %[[yielded:.*]] = memref.cast %[[alloc]] :
   // LINALG-SAME:     memref<4x8xf32> to memref<?x8xf32, #[[$map_2d_stride_1]]>
   //      LINALG:   scf.yield %[[yielded]], %[[c0]], %[[c0]] :
@@ -279,8 +279,8 @@ func @split_vector_transfer_write_2d(%V: vector<4x8xf32>, %A: memref<?x8xf32>, %
 // LINALG-SAME:            [%[[I]], %[[J]]] [%[[VAL_20]], %[[VAL_21]]]
 // LINALG-SAME:            [1, 1] : memref<4x8xf32> to memref<?x?xf32, #[[MAP4]]>
 // LINALG:             %[[DEST_VIEW:.*]] = memref.subview %[[DEST]][0, 0] [%[[VAL_20]], %[[VAL_21]]] [1, 1]
-// LINALG:             memref.copy %[[VAL_22]], %[[DEST_VIEW]]
-// LINALG-SAME:            : memref<?x?xf32, #[[MAP4]]> to memref<?x?xf32, #{{.*}}>
+// LINALG:             linalg.copy(%[[VAL_22]], %[[DEST_VIEW]])
+// LINALG-SAME:            : memref<?x?xf32, #[[MAP4]]>, memref<?x?xf32, #{{.*}}>
 // LINALG:           }
 // LINALG:           return
 // LINALG:         }
@@ -388,8 +388,8 @@ func @split_vector_transfer_write_strided_2d(
 // LINALG-SAME:            [%[[I]], %[[J]]] [%[[VAL_20]], %[[VAL_21]]]
 // LINALG-SAME:            [1, 1] : memref<4x8xf32> to memref<?x?xf32, #[[MAP5]]>
 // LINALG:             %[[DEST_VIEW:.*]] = memref.subview %[[DEST]][0, 0] [%[[VAL_20]], %[[VAL_21]]] [1, 1]
-// LINALG:             memref.copy %[[VAL_22]], %[[DEST_VIEW]]
-// LINALG-SAME:            : memref<?x?xf32, #[[MAP5]]> to memref<?x?xf32, #[[MAP0]]>
+// LINALG:             linalg.copy(%[[VAL_22]], %[[DEST_VIEW]])
+// LINALG-SAME:            : memref<?x?xf32, #[[MAP5]]>, memref<?x?xf32, #[[MAP0]]>
 // LINALG:           }
 // LINALG:           return
 // LINALG:         }

diff  --git a/mlir/test/Integration/Dialect/Linalg/CPU/benchmark_matmul.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/benchmark_matmul.mlir
index d7c10cb940fc..0b56b7b2dabf 100644
--- a/mlir/test/Integration/Dialect/Linalg/CPU/benchmark_matmul.mlir
+++ b/mlir/test/Integration/Dialect/Linalg/CPU/benchmark_matmul.mlir
@@ -2,7 +2,7 @@
 // RUN: cat %s | sed 's@${M}@'"$M"'@g'| sed 's@${K}@'"$K"'@g' | sed 's@${N}@'"$N"'@g'| sed 's@${ITERS}@'"$ITERS"'@g'| \
 // RUN: mlir-opt -test-linalg-codegen-strategy="anchor-func=matmul anchor-op=linalg.matmul register-tile-sizes=12,32,16 vectorize" | \
 // RUN: mlir-opt -test-linalg-codegen-strategy="anchor-func=matmul anchor-op=linalg.fill register-tile-sizes=4,32 vectorize" | \
-// RUN: mlir-opt -test-linalg-codegen-strategy="anchor-func=matmul anchor-op=memref.copy register-tile-sizes=4,32 vectorize" | \
+// RUN: mlir-opt -test-linalg-codegen-strategy="anchor-func=matmul anchor-op=linalg.copy register-tile-sizes=4,32 vectorize" | \
 
 // RUN: mlir-opt -canonicalize -convert-vector-to-scf -lower-affine -convert-linalg-to-loops | \
 // RUN: mlir-opt -canonicalize -convert-scf-to-std -convert-vector-to-llvm -convert-memref-to-llvm -convert-std-to-llvm -reconcile-unrealized-casts | \

diff  --git a/mlir/test/Transforms/canonicalize.mlir b/mlir/test/Transforms/canonicalize.mlir
index e1111bd2ab3c..bb75e5b6d389 100644
--- a/mlir/test/Transforms/canonicalize.mlir
+++ b/mlir/test/Transforms/canonicalize.mlir
@@ -1194,7 +1194,7 @@ func @clone_loop_alloc(%arg0: index, %arg1: index, %arg2: index, %arg3: memref<2
     memref.dealloc %5 : memref<2xf32>
     scf.yield %6 : memref<2xf32>
   }
-  memref.copy %2, %arg4 : memref<2xf32> to memref<2xf32>
+  linalg.copy(%2, %arg4) : memref<2xf32>, memref<2xf32>
   memref.dealloc %2 : memref<2xf32>
   return
 }
@@ -1204,7 +1204,7 @@ func @clone_loop_alloc(%arg0: index, %arg1: index, %arg2: index, %arg3: memref<2
 // CHECK-NEXT: memref.dealloc
 // CHECK-NEXT: %[[ALLOC2:.*]] = memref.alloc
 // CHECK-NEXT: scf.yield %[[ALLOC2]]
-// CHECK: memref.copy %[[ALLOC1]]
+// CHECK: linalg.copy(%[[ALLOC1]]
 // CHECK-NEXT: memref.dealloc %[[ALLOC1]]
 
 // -----

diff  --git a/mlir/test/lib/Dialect/Linalg/TestLinalgCodegenStrategy.cpp b/mlir/test/lib/Dialect/Linalg/TestLinalgCodegenStrategy.cpp
index 3c0249e4825e..29ab74a06aa7 100644
--- a/mlir/test/lib/Dialect/Linalg/TestLinalgCodegenStrategy.cpp
+++ b/mlir/test/lib/Dialect/Linalg/TestLinalgCodegenStrategy.cpp
@@ -144,7 +144,7 @@ struct TestLinalgCodegenStrategy
           "Split vector transfers between slow (masked) and fast "
           "(unmasked) variants. Possible options are:\n"
           "\tnone: keep unsplit vector.transfer and pay the full price\n"
-          "\tmemref.copy: use linalg.fill + memref.copy for the slow path\n"
+          "\tlinalg-copy: use linalg.fill + linalg.copy for the slow path\n"
           "\tvector-transfers: use extra small unmasked vector.transfer for"
           " the slow path\n"),
       llvm::cl::init("none")};
@@ -167,7 +167,7 @@ struct TestLinalgCodegenStrategy
           "latch on:\n"
           "\tlinalg.matmul: anchor on linalg.matmul\n"
           "\tlinalg.matmul_column_major: anchor on linalg.matmul_column_major\n"
-          "\tmemref.copy: anchor on memref.copy\n"
+          "\tlinalg.copy: anchor on linalg.copy\n"
           "\tlinalg.fill: anchor on linalg.fill\n"),
       llvm::cl::init("")};
   Option<std::string> anchorFuncOpName{
@@ -305,7 +305,7 @@ void TestLinalgCodegenStrategy::runOnOperation() {
       llvm::StringSwitch<vector::VectorTransferSplit>(
           splitVectorTransfersTo.getValue())
           .Case("none", vector::VectorTransferSplit::None)
-          .Case("memref-copy", vector::VectorTransferSplit::LinalgCopy)
+          .Case("linalg-copy", vector::VectorTransferSplit::LinalgCopy)
           .Case("vector-transfers", vector::VectorTransferSplit::VectorTransfer)
           .Default(vector::VectorTransferSplit::None);
 

diff  --git a/mlir/test/lib/Dialect/Linalg/TestLinalgTransforms.cpp b/mlir/test/lib/Dialect/Linalg/TestLinalgTransforms.cpp
index 32f4538a4cbf..a5f83afcefa7 100644
--- a/mlir/test/lib/Dialect/Linalg/TestLinalgTransforms.cpp
+++ b/mlir/test/lib/Dialect/Linalg/TestLinalgTransforms.cpp
@@ -81,7 +81,7 @@ struct TestLinalgTransforms
   Option<bool> testVectorTransferForwardingPatterns{
       *this, "test-vector-transfer-forwarding-patterns",
       llvm::cl::desc(
-          "Test a fused pass that forwards memref.copy to vector.transfer"),
+          "Test a fused pass that forwards linalg.copy to vector.transfer"),
       llvm::cl::init(false)};
   Option<bool> testGenericToVectorPattern{
       *this, "test-linalg-to-vector-patterns",
@@ -232,8 +232,7 @@ static void applyPatterns(FuncOp funcOp) {
   //===--------------------------------------------------------------------===//
   patterns.add<LinalgVectorizationPattern>(
       ctx, LinalgTransformationFilter(StringAttr::get(ctx, "VECTORIZE"))
-               .addOpFilter<MatmulOp, FillOp, GenericOp>());
-  patterns.add<CopyVectorizationPattern>(ctx);
+               .addOpFilter<MatmulOp, FillOp, CopyOp, GenericOp>());
 
   //===--------------------------------------------------------------------===//
   // Linalg generic interchange pattern.
@@ -302,8 +301,7 @@ static void fillL1TilingAndMatmulToVectorPatterns(
                MatmulOp::getOperationName(), ctx, LinalgVectorizationOptions(),
                LinalgTransformationFilter(StringAttr::get(ctx, "VEC"))));
   patternsVector.back().add<LinalgVectorizationPattern>(
-      ctx, LinalgTransformationFilter().addOpFilter<FillOp>());
-  patternsVector.back().add<CopyVectorizationPattern>(ctx);
+      ctx, LinalgTransformationFilter().addOpFilter<FillOp, CopyOp>());
 }
 
 //===----------------------------------------------------------------------===//
@@ -341,7 +339,7 @@ static LogicalResult copyCallBackFn(OpBuilder &b, Value src, Value dst,
                                             FloatAttr::get(floatType, 42.0));
     b.create<FillOp>(src.getLoc(), cst, dst);
   }
-  b.create<memref::CopyOp>(src.getLoc(), src, dst);
+  b.create<CopyOp>(src.getLoc(), src, dst);
   return success();
 }
 
@@ -548,11 +546,10 @@ static void applyVectorTransferForwardingPatterns(FuncOp funcOp) {
 
 static void applyLinalgToVectorPatterns(FuncOp funcOp) {
   RewritePatternSet patterns(funcOp.getContext());
-  auto *ctx = funcOp.getContext();
   patterns.add<LinalgVectorizationPattern>(
-      ctx, LinalgTransformationFilter()
-               .addOpFilter<ContractionOpInterface, FillOp, GenericOp>());
-  patterns.add<CopyVectorizationPattern>(ctx);
+      funcOp.getContext(),
+      LinalgTransformationFilter()
+          .addOpFilter<ContractionOpInterface, FillOp, CopyOp, GenericOp>());
   populatePadOpVectorizationPatterns(patterns);
   populateConvolutionVectorizationPatterns(patterns);
   (void)applyPatternsAndFoldGreedily(funcOp, std::move(patterns));

diff  --git a/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp b/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp
index b644064b032b..db89fd8cf1c4 100644
--- a/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp
+++ b/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp
@@ -470,9 +470,9 @@ struct TestVectorTransferFullPartialSplitPatterns
   }
 
   Option<bool> useLinalgOps{
-      *this, "use-memref-copy",
+      *this, "use-linalg-copy",
       llvm::cl::desc("Split using a unmasked vector.transfer + linalg.fill + "
-                     "memref.copy operations."),
+                     "linalg.copy operations."),
       llvm::cl::init(false)};
   void runOnOperation() override {
     MLIRContext *ctx = &getContext();