[Mlir-commits] [mlir] 8e83d5e - [mlir] Remove old VectorOps directory
River Riddle
llvmlistbot at llvm.org
Wed Mar 25 12:22:09 PDT 2020
Author: River Riddle
Date: 2020-03-25T12:21:57-07:00
New Revision: 8e83d5ea3e03e90d63f89e9f2fb7000e9b024b53
URL: https://github.com/llvm/llvm-project/commit/8e83d5ea3e03e90d63f89e9f2fb7000e9b024b53
DIFF: https://github.com/llvm/llvm-project/commit/8e83d5ea3e03e90d63f89e9f2fb7000e9b024b53.diff
LOG: [mlir] Remove old VectorOps directory
This was accidentally re-added during a rebase.
Added:
Modified:
Removed:
mlir/include/mlir/Dialect/VectorOps/VectorOps.td
mlir/lib/Dialect/VectorOps/VectorTransforms.cpp
mlir/lib/Dialect/VectorOps/VectorUtils.cpp
################################################################################
diff --git a/mlir/include/mlir/Dialect/VectorOps/VectorOps.td b/mlir/include/mlir/Dialect/VectorOps/VectorOps.td
deleted file mode 100644
index ef1fde00e5ec..000000000000
--- a/mlir/include/mlir/Dialect/VectorOps/VectorOps.td
+++ /dev/null
@@ -1,1402 +0,0 @@
-//===- VectorOps.td - Vector op definitions ---------------*- tablegen -*-====//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Defines MLIR vector operations.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef VECTOR_OPS
-#define VECTOR_OPS
-
-include "mlir/Dialect/Affine/IR/AffineOpsBase.td"
-include "mlir/Interfaces/SideEffects.td"
-
-
-def VectorOps_Dialect : Dialect {
- let name = "vector";
- let cppNamespace = "vector";
- let hasConstantMaterializer = 1;
-}
-
-// Base class for Vector dialect ops.
-class Vector_Op<string mnemonic, list<OpTrait> traits = []> :
- Op<VectorOps_Dialect, mnemonic, traits> {
- // For every vector op, the following functions must be defined:
- // * void print(OpAsmPrinter &p, ${C++ class of Op} op)
- // * LogicalResult verify(${C++ class of Op} op)
- // * ParseResult parse${C++ class of Op}(OpAsmParser &parser,
- // OperationState &result)
- let printer = [{ return ::print(p, *this); }];
- let verifier = [{ return ::verify(*this); }];
- let parser = [{ return ::parse$cppClass(parser, result); }];
-}
-
-// TODO(andydavis, ntv) Add an attribute to specify a different algebra
-// with operators other than the current set: {*, +}.
-def Vector_ContractionOp :
- Vector_Op<"contract", [NoSideEffect,
- PredOpTrait<"first operand lhs and result have same element type",
- TCresVTEtIsSameAsOpBase<0, 0>>,
- PredOpTrait<"second operand rhs and result have same element type",
- TCresVTEtIsSameAsOpBase<0, 1>>,
- PredOpTrait<"third operand acc and result have same element type",
- TCresVTEtIsSameAsOpBase<0, 2>>]>,
- Arguments<(ins AnyVector:$lhs, AnyVector:$rhs, AnyType:$acc,
- Variadic<VectorOf<[I1]>>:$masks,
- AffineMapArrayAttr:$indexing_maps, ArrayAttr:$iterator_types)>,
- Results<(outs AnyType)> {
- let summary = "vector contraction operation";
- let description = [{
- Computes the sum of products of vector elements along contracting
- dimension pairs from 2 vectors of rank M and N respectively, adds this
- intermediate result to the accumulator argument of rank K, and returns a
- vector result of rank K (where K = num_lhs_free_dims + num_rhs_free_dims +
- num_batch_dims (see dimension type descriptions below)). For K = 0 (no
- free or batch dimensions), the accumulator and output are scalars.
-
- Optional vector mask arguments (produced by CreateMaskOp or ConstantMaskOp)
- specify the dynamic dimension sizes of valid data within the lhs/rhs vector
- arguments.
-
- An iterator type attribute list must be specified, where each element of
- the list represents an iterator with one of the following types:
-
- *) "reduction": reduction dimensions are present in the lhs and rhs
- arguments but not in the output (and accumulator
- argument). These are the dimensions along which the vector
- contraction op computes the sum of products, and
- contracting dimension pair dimension sizes must match
- between lhs/rhs.
- *) "parallel": Batch dimensions are iterator type "parallel", and
- are non-contracting dimensions present in the lhs, rhs and
- output. The lhs/rhs co-iterate along the batch dimensions,
- which should be expressed in their indexing maps.
-
- Free dimensions are iterator type "parallel", and are
- non-contraction, non-batch dimensions accessed by either the
- lhs or rhs (but not both). The lhs and rhs free dimensions
- are unrelated to each other and do not co-iterate, which
- should be expressed in their indexing maps.
-
- An indexing map attribute list must be specified with an entry for lhs, rhs
- and acc arguments. An indexing map attribute specifies a mapping from each
- iterator in the iterator type list to each dimension of an N-D vector.
-
- Examples:
-
- // Simple dot product (K = 0).
- #contraction_accesses = [
- affine_map<(i) -> (i)>,
- affine_map<(i) -> (i)>,
- affine_map<(i) -> ()>
- ]
- #contraction_trait = {
- indexing_maps = #contraction_accesses,
- iterator_types = ["reduction"]
- }
- %3 = vector.contract #contraction_trait %0, %1, %2
- : vector<10xf32>, vector<10xf32> into f32
-
- // 2D vector contraction with one contracting dimension (matmul, K = 2).
- #contraction_accesses = [
- affine_map<(i, j, k) -> (i, k)>,
- affine_map<(i, j, k) -> (k, j)>,
- affine_map<(i, j, k) -> (i, j)>
- ]
- #contraction_trait = {
- indexing_maps = #contraction_accesses,
- iterator_types = ["parallel", "parallel", "reduction"]
- }
-
- %3 = vector.contract #contraction_trait %0, %1, %2
- : vector<4x3xf32>, vector<3x7xf32> into vector<4x7xf32>
-
- // 4D to 3D vector contraction with two contracting dimensions and
- // one batch dimension (K = 3).
- #contraction_accesses = [
- affine_map<(b0, f0, f1, c0, c1) -> (c0, b0, c1, f0)>,
- affine_map<(b0, f0, f1, c0, c1) -> (b0, c1, c0, f1)>,
- affine_map<(b0, f0, f1, c0, c1) -> (b0, f0, f1)>
- ]
- #contraction_trait = {
- indexing_maps = #contraction_accesses,
- iterator_types = ["parallel", "parallel", "parallel",
- "reduction", "reduction"]
- }
-
- %4 = vector.contract #contraction_trait %0, %1, %2
- : vector<7x8x16x15xf32>, vector<8x16x7x5xf32> into vector<8x15x5xf32>
-
- // 4D vector contraction with two contracting dimensions and optional
- // vector mask arguments.
- %lhs_mask = vector.constant_mask [7, 8, 16, 15] : vector<7x8x16x15xi1>
- %rhs_mask = vector.constant_mask [8, 16, 7, 5] : vector<8x16x7x5xi1>
-
- %5 = vector.contract #contraction_trait %0, %1, %2, %lhs_mask, %rhs_mask
- : vector<7x8x16x15xf32>, vector<8x16x7x5xf32> into vector<8x15x5xf32>
- }];
- let builders = [OpBuilder<
- "Builder *builder, OperationState &result, Value lhs, Value rhs, "
- "Value acc, ArrayAttr indexingMaps, ArrayAttr iteratorTypes">,
- OpBuilder<
- "Builder *builder, OperationState &result, Value lhs, Value rhs, "
- "Value acc, ArrayRef<ArrayRef<AffineExpr>> indexingExprs, "
- "ArrayRef<StringRef> iteratorTypes">];
- let extraClassDeclaration = [{
- VectorType getLhsType() {
- return lhs().getType().cast<VectorType>();
- }
- VectorType getRhsType() {
- return rhs().getType().cast<VectorType>();
- }
- Type getAccType() { return acc().getType(); }
- VectorType getLHSVectorMaskType() {
- if (llvm::size(masks()) != 2) return VectorType();
- return getOperand(3).getType().cast<VectorType>();
- }
- VectorType getRHSVectorMaskType() {
- if (llvm::size(masks()) != 2) return VectorType();
- return getOperand(4).getType().cast<VectorType>();
- }
- Type getResultType() { return getResult().getType(); }
- ArrayRef<StringRef> getTraitAttrNames();
- SmallVector<AffineMap, 4> getIndexingMaps();
- static unsigned getAccOperandIndex() { return 2; }
-
- // Returns the bounds of each dimension in the iteration space spanned
- // by the iterator types of this operation.
- void getIterationBounds(SmallVectorImpl<int64_t> &iterationBounds);
-
- // Returns a list of index maps, where there is a list entry for each
- // op indexing map attribute (i.e. one for each input and output, with
- // the output listed last). Each index map maps from this operation's
- // iteration space to the vector dimensions of the map's input/output.
- void getIterationIndexMap(
- std::vector<DenseMap<int64_t, int64_t>> &iterationIndexMap);
-
- std::vector<std::pair<int64_t, int64_t>> getContractingDimMap();
- std::vector<std::pair<int64_t, int64_t>> getBatchDimMap();
- }];
-}
-
-def Vector_ReductionOp :
- Vector_Op<"reduction", [NoSideEffect,
- PredOpTrait<"source operand and result have same element type",
- TCresVTEtIsSameAsOpBase<0, 0>>]>,
- Arguments<(ins StrAttr:$kind, AnyVector:$vector, Variadic<AnyType>:$acc)>,
- Results<(outs AnyType:$dest)> {
- let summary = "reduction operation";
- let description = [{
- Reduces a 1-D vector "horizontally" into a scalar using the given
- operation (add/mul/min/max for int/fp and and/or/xor for int only).
- Some reductions (add/mul for fp) also allow an optional fused
- accumulator.
-
- Note that these operations are restricted to 1-D vectors to remain
- close to the corresponding LLVM intrinsics:
-
- http://llvm.org/docs/LangRef.html#experimental-vector-reduction-intrinsics
-
- Examples:
- ```
- %1 = vector.reduction "add", %0 : vector<16xf32> into f32
-
- %3 = vector.reduction "xor", %2 : vector<4xi32> into i32
-
- %4 = vector.reduction "mul", %0, %1 : vector<16xf32> into f32
- ```
- }];
- let extraClassDeclaration = [{
- VectorType getVectorType() {
- return vector().getType().cast<VectorType>();
- }
- }];
-}
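The "min" and "max" kinds mentioned above take the same form; a minimal sketch, reusing `%0` and `%2` from the examples:

```mlir
%5 = vector.reduction "min", %0 : vector<16xf32> into f32
%6 = vector.reduction "max", %2 : vector<4xi32> into i32
```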
-
-def Vector_BroadcastOp :
- Vector_Op<"broadcast", [NoSideEffect,
- PredOpTrait<"source operand and result have same element type",
- TCresVTEtIsSameAsOpBase<0, 0>>]>,
- Arguments<(ins AnyType:$source)>,
- Results<(outs AnyVector:$vector)> {
- let summary = "broadcast operation";
- let description = [{
- Broadcasts the scalar or k-D vector value in the source operand
- to an n-D result vector such that the broadcast makes sense, i.e.,
- the source operand is duplicated to match the given rank and sizes
- in the result vector. The legality rules are:
- * the source operand must have the same element type as the result type
- * a k-D vector <s_1 x .. x s_k x type> can be broadcast to
- an n-D vector <t_1 x .. x t_n x type> if
- * k <= n, and
- * the sizes in the trailing dimensions n-k < i <= n with j=i+k-n
- match exactly as s_j = t_i or s_j = 1:
- ```
- t_1 x .. t_n-k x t_n-k+1 x .. x t_i x .. x t_n
- s_1 x .. x s_j x .. x s_k
- <duplication> <potential stretch>
- ```
- The source operand is duplicated over all the missing leading dimensions
- and stretched over the trailing dimensions where the source has a non-equal
- dimension of 1. These rules imply that any scalar broadcast (k=0) to any
- shaped vector with the same element type is always legal.
-
- Examples:
- ```
- %0 = constant 0.0 : f32
- %1 = vector.broadcast %0 : f32 to vector<16xf32>
- %2 = vector.broadcast %1 : vector<16xf32> to vector<4x16xf32>
- ```
- }];
- let extraClassDeclaration = [{
- Type getSourceType() { return source().getType(); }
- VectorType getVectorType() {
- return vector().getType().cast<VectorType>();
- }
- }];
- let assemblyFormat = "$source attr-dict `:` type($source) `to` type($vector)";
-}
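The stretch rule (s_j = 1) is not exercised by the examples above; a minimal sketch, assuming a value `%3` of the stated source type:

```mlir
// The size-1 trailing dimension is stretched to 16; the leading
// dimension (4) must match exactly.
%4 = vector.broadcast %3 : vector<4x1xf32> to vector<4x16xf32>
```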
-
-def Vector_ShuffleOp :
- Vector_Op<"shuffle", [NoSideEffect,
- PredOpTrait<"first operand v1 and result have same element type",
- TCresVTEtIsSameAsOpBase<0, 0>>,
- PredOpTrait<"second operand v2 and result have same element type",
- TCresVTEtIsSameAsOpBase<0, 1>>]>,
- Arguments<(ins AnyVector:$v1, AnyVector:$v2, I64ArrayAttr:$mask)>,
- Results<(outs AnyVector:$vector)> {
- let summary = "shuffle operation";
- let description = [{
- The shuffle operation constructs a permutation (or duplication) of elements
- from two input vectors, returning a vector with the same element type as
- the input and a length that is the same as the shuffle mask. The two input
- vectors must have the same element type, rank, and trailing dimension
- sizes; their values are shuffled in the leading dimension (which may
- differ in size) according to the given mask. The legality rules are:
- * the two operands must have the same element type as the result
- * the two operands and the result must have the same rank and trailing
- dimension sizes, viz. given two k-D operands
- v1 : <s_1 x s_2 x .. x s_k x type> and
- v2 : <t_1 x t_2 x .. x t_k x type>
- we have s_i = t_i for all 1 < i <= k
- * the mask length equals the leading dimension size of the result
- * numbering the input vector indices left to right across the operands, all
- mask values must be within range, viz. given two k-D operands v1 and v2
- above, all mask values are in the range [0,s_1+t_1)
-
- Examples:
- ```
- %0 = vector.shuffle %a, %b[0, 3]
- : vector<2xf32>, vector<2xf32> ; yields vector<2xf32>
- %1 = vector.shuffle %c, %b[0, 1, 2]
- : vector<2x16xf32>, vector<1x16xf32> ; yields vector<3x16xf32>
- %2 = vector.shuffle %a, %b[3, 2, 1, 0]
- : vector<2xf32>, vector<2xf32> ; yields vector<4xf32>
-
- ```
- }];
- let builders = [OpBuilder<"Builder *builder, OperationState &result,"
- "Value v1, Value v2, ArrayRef<int64_t>">];
- let extraClassDeclaration = [{
- static StringRef getMaskAttrName() { return "mask"; }
- VectorType getV1VectorType() {
- return v1().getType().cast<VectorType>();
- }
- VectorType getV2VectorType() {
- return v2().getType().cast<VectorType>();
- }
- VectorType getVectorType() {
- return vector().getType().cast<VectorType>();
- }
- }];
-}
-
-def Vector_ExtractElementOp :
- Vector_Op<"extractelement", [NoSideEffect,
- TypesMatchWith<"result type matches element type of vector operand",
- "vector", "result",
- "$_self.cast<ShapedType>().getElementType()">]>,
- Arguments<(ins AnyVector:$vector, AnySignlessInteger:$position)>,
- Results<(outs AnyType:$result)> {
- let summary = "extractelement operation";
- let description = [{
- Takes a 1-D vector and a dynamic index position and extracts the
- scalar at that position. Note that this instruction resembles
- vector.extract, but is restricted to 1-D vectors and relaxed
- to dynamic indices. It is meant to be closer to LLVM's version:
- https://llvm.org/docs/LangRef.html#extractelement-instruction
-
- Example:
- ```
- %c = constant 15 : i32
- %1 = vector.extractelement %0[%c : i32]: vector<16xf32>
- ```
- }];
- let extraClassDeclaration = [{
- VectorType getVectorType() {
- return vector().getType().cast<VectorType>();
- }
- }];
-
- let assemblyFormat = [{
- $vector `[` $position `:` type($position) `]` attr-dict `:` type($vector)
- }];
-}
-
-def Vector_ExtractOp :
- Vector_Op<"extract", [NoSideEffect,
- PredOpTrait<"operand and result have same element type",
- TCresVTEtIsSameAsOpBase<0, 0>>]>,
- Arguments<(ins AnyVector:$vector, I64ArrayAttr:$position)>,
- Results<(outs AnyType)> {
- let summary = "extract operation";
- let description = [{
- Takes an n-D vector and a k-D position and extracts the (n-k)-D vector at
- the proper position. Degenerates to an element type in the 0-D case.
-
- Examples:
- ```
- %1 = vector.extract %0[3]: vector<4x8x16xf32>
- %2 = vector.extract %0[3, 3, 3]: vector<4x8x16xf32>
- ```
- }];
- let builders = [OpBuilder<
- "Builder *builder, OperationState &result, Value source,"
- "ArrayRef<int64_t>">];
- let extraClassDeclaration = [{
- static StringRef getPositionAttrName() { return "position"; }
- VectorType getVectorType() {
- return vector().getType().cast<VectorType>();
- }
- }];
-}
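Applying the (n-k)-D rule, the examples above yield the following result types:

```mlir
%1 = vector.extract %0[3]: vector<4x8x16xf32>        // yields vector<8x16xf32>
%2 = vector.extract %0[3, 3, 3]: vector<4x8x16xf32>  // yields f32
```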
-
-def Vector_ExtractSlicesOp :
- Vector_Op<"extract_slices", [NoSideEffect]>,
- Arguments<(ins AnyVector:$vector, I64ArrayAttr:$sizes,
- I64ArrayAttr:$strides)>,
- Results<(outs TupleOf<[AnyVector]>)> {
- let summary = "vector extract slices operation";
- let description = [{
- Takes an N-D vector and returns a tuple of vector slices of 'vector',
- based on the 'sizes' and 'strides' parameters.
-
- The arguments 'sizes' and 'strides' specify how to unroll the shape of
- 'vector': all slices have shape 'sizes', except for slices at dimension
- boundaries when the dimension sizes of 'vector' are not a multiple of
- 'sizes'.
-
- Each slice is returned at the tuple element index corresponding to the
- linear index of the slice w.r.t the unrolling scheme represented by 'sizes'.
- Currently, only unit strides are supported.
-
- Examples:
- ```
- %0 = vector.transfer_read ...: vector<4x2xf32>
-
- %1 = vector.extract_slices %0, [2, 2], [1, 1]
- : vector<4x2xf32> into tuple<vector<2x2xf32>, vector<2x2xf32>>
-
- // Example with partial slices at dimension boundaries.
- %2 = vector.transfer_read ...: vector<4x3xf32>
-
- %3 = vector.extract_slices %2, [2, 2], [1, 1]
- : vector<4x3xf32> into tuple<vector<2x2xf32>, vector<2x1xf32>,
- vector<2x2xf32>, vector<2x1xf32>>
- ```
- }];
- let builders = [OpBuilder<
- "Builder *builder, OperationState &result, TupleType tupleType, " #
- "Value vector, ArrayRef<int64_t> sizes, " #
- "ArrayRef<int64_t> strides">];
- let extraClassDeclaration = [{
- VectorType getSourceVectorType() {
- return vector().getType().cast<VectorType>();
- }
- TupleType getResultTupleType() {
- return getResult().getType().cast<TupleType>();
- }
- void getSizes(SmallVectorImpl<int64_t> &results);
- void getStrides(SmallVectorImpl<int64_t> &results);
- static StringRef getSizesAttrName() { return "sizes"; }
- static StringRef getStridesAttrName() { return "strides"; }
- }];
- let assemblyFormat = [{
- $vector `,` $sizes `,` $strides attr-dict `:` type($vector) `into`
- type(results)
- }];
-}
-
-def Vector_FMAOp :
- Op<VectorOps_Dialect, "fma", [NoSideEffect,
- AllTypesMatch<["lhs", "rhs", "acc", "result"]>]>,
- Arguments<(ins AnyVector:$lhs, AnyVector:$rhs, AnyVector:$acc)>,
- Results<(outs AnyVector:$result)> {
- let summary = "vector fused multiply-add";
- let description = [{
- Multiply-add expressions operate on n-D vectors and compute a fused
- pointwise multiply-and-accumulate: `$result = $lhs * $rhs + $acc`.
- All operands and result have the same vector type. The semantics
- of the operation correspond to those of the `llvm.fma`
- [intrinsic](https://llvm.org/docs/LangRef.html#int-fma). In the
- particular case of lowering to LLVM, this is guaranteed to lower
- to the `llvm.fma.*` intrinsic.
-
- Example:
-
- ```
- %3 = vector.fma %0, %1, %2: vector<8x16xf32>
- ```
- }];
- // Fully specified by traits.
- let verifier = ?;
- let assemblyFormat = "$lhs `,` $rhs `,` $acc attr-dict `:` type($lhs)";
- let builders = [OpBuilder<
- "Builder *b, OperationState &result, Value lhs, Value rhs, Value acc",
- "build(b, result, lhs.getType(), lhs, rhs, acc);">];
- let extraClassDeclaration = [{
- VectorType getVectorType() { return lhs().getType().cast<VectorType>(); }
- }];
-}
-
-def Vector_InsertElementOp :
- Vector_Op<"insertelement", [NoSideEffect,
- TypesMatchWith<"source operand type matches element type of result",
- "result", "source",
- "$_self.cast<ShapedType>().getElementType()">,
- AllTypesMatch<["dest", "result"]>]>,
- Arguments<(ins AnyType:$source, AnyVector:$dest,
- AnySignlessInteger:$position)>,
- Results<(outs AnyVector:$result)> {
- let summary = "insertelement operation";
- let description = [{
- Takes a scalar source, a 1-D destination vector and a dynamic index
- position and inserts the source into the destination at the proper
- position. Note that this instruction resembles vector.insert, but
- is restricted to 1-D vectors and relaxed to dynamic indices. It is
- meant to be closer to LLVM's version:
- https://llvm.org/docs/LangRef.html#insertelement-instruction
-
- Example:
- ```
- %c = constant 15 : i32
- %f = constant 0.0 : f32
- %1 = vector.insertelement %f, %0[%c : i32]: vector<16xf32>
- ```
- }];
- let extraClassDeclaration = [{
- Type getSourceType() { return source().getType(); }
- VectorType getDestVectorType() {
- return dest().getType().cast<VectorType>();
- }
- }];
-
- let assemblyFormat = [{
- $source `,` $dest `[` $position `:` type($position) `]` attr-dict `:`
- type($result)
- }];
-}
-
-def Vector_InsertOp :
- Vector_Op<"insert", [NoSideEffect,
- PredOpTrait<"source operand and result have same element type",
- TCresVTEtIsSameAsOpBase<0, 0>>,
- AllTypesMatch<["dest", "res"]>]>,
- Arguments<(ins AnyType:$source, AnyVector:$dest, I64ArrayAttr:$position)>,
- Results<(outs AnyVector:$res)> {
- let summary = "insert operation";
- let description = [{
- Takes an n-D source vector, an (n+k)-D destination vector and a k-D position
- and inserts the n-D source into the (n+k)-D destination at the proper
- position. Degenerates to a scalar source type when n = 0.
-
- Examples:
- ```
- %2 = vector.insert %0, %1[3]:
- vector<8x16xf32> into vector<4x8x16xf32>
- %5 = vector.insert %3, %4[3, 3, 3]:
- f32 into vector<4x8x16xf32>
- ```
- }];
- let assemblyFormat = [{
- $source `,` $dest $position attr-dict `:` type($source) `into` type($dest)
- }];
-
- let builders = [OpBuilder<
- "Builder *builder, OperationState &result, Value source, " #
- "Value dest, ArrayRef<int64_t>">];
- let extraClassDeclaration = [{
- static StringRef getPositionAttrName() { return "position"; }
- Type getSourceType() { return source().getType(); }
- VectorType getDestVectorType() {
- return dest().getType().cast<VectorType>();
- }
- }];
-}
-
-def Vector_InsertSlicesOp :
- Vector_Op<"insert_slices", [NoSideEffect]>,
- Arguments<(ins TupleOf<[AnyVector]>:$vectors, I64ArrayAttr:$sizes,
- I64ArrayAttr:$strides)>,
- Results<(outs AnyVector)> {
- let summary = "vector insert slices operation";
- let description = [{
- Takes a tuple of vector slices and inserts them into the vector result
- according to the 'sizes' and 'strides' parameters.
-
- The arguments 'sizes' and 'strides' specify how to unroll the shape of
- the result vector: all slices have shape 'sizes', except for slices at
- dimension boundaries when the dimension sizes of the result are not a
- multiple of 'sizes'.
-
- Each slice in 'vectors' is at the tuple element index corresponding to the
- linear index of the slice w.r.t the unrolling scheme represented by 'sizes'.
- Currently, only unit strides are supported.
-
- Examples:
- ```
- %1 = vector.extract_slices %0, [2, 2], [1, 1]
- : vector<4x2xf32> into tuple<vector<2x2xf32>, vector<2x2xf32>>
-
- %2 = vector.insert_slices %1, [2, 2], [1, 1]
- : tuple<vector<2x2xf32>, vector<2x2xf32>> into vector<4x2xf32>
-
- // Example with partial slices at dimension boundaries.
- %3 = vector.extract_slices %2, [2, 2], [1, 1]
- : vector<4x3xf32> into tuple<vector<2x2xf32>, vector<2x1xf32>,
- vector<2x2xf32>, vector<2x1xf32>>
-
- %4 = vector.insert_slices %3, [2, 2], [1, 1]
- : tuple<vector<2x2xf32>, vector<2x1xf32>,
- vector<2x2xf32>, vector<2x1xf32>> into vector<4x3xf32>
- ```
- }];
-
- let extraClassDeclaration = [{
- TupleType getSourceTupleType() {
- return vectors().getType().cast<TupleType>();
- }
- VectorType getResultVectorType() {
- return getResult().getType().cast<VectorType>();
- }
- void getSizes(SmallVectorImpl<int64_t> &results);
- void getStrides(SmallVectorImpl<int64_t> &results);
- static StringRef getSizesAttrName() { return "sizes"; }
- static StringRef getStridesAttrName() { return "strides"; }
- }];
- let assemblyFormat = [{
- $vectors `,` $sizes `,` $strides attr-dict `:` type($vectors) `into`
- type(results)
- }];
-}
-
-def Vector_InsertStridedSliceOp :
- Vector_Op<"insert_strided_slice", [NoSideEffect,
- PredOpTrait<"operand #0 and result have same element type",
- TCresVTEtIsSameAsOpBase<0, 0>>,
- AllTypesMatch<["dest", "res"]>]>,
- Arguments<(ins AnyVector:$source, AnyVector:$dest, I64ArrayAttr:$offsets,
- I64ArrayAttr:$strides)>,
- Results<(outs AnyVector:$res)> {
- let summary = "strided_slice operation";
- let description = [{
- Takes a k-D source vector, an n-D destination vector (n >= k), an n-sized
- `offsets` integer array attribute, and a k-sized `strides` integer array
- attribute, and inserts the k-D source vector as a strided subvector at the
- proper offset into the n-D destination vector.
-
- At the moment strides must contain only 1s.
-
- Returns an n-D vector that is a copy of the n-D destination vector in which
- the last k-D dimensions contain the k-D source vector elements strided at
- the proper location as specified by the offsets.
-
- Examples:
- ```
- %2 = vector.insert_strided_slice %0, %1
- {offsets = [0, 0, 2], strides = [1, 1]}:
- vector<2x4xf32> into vector<16x4x8xf32>
- ```
- }];
-
- let assemblyFormat = [{
- $source `,` $dest attr-dict `:` type($source) `into` type($dest)
- }];
-
- let builders = [OpBuilder<
- "Builder *builder, OperationState &result, Value source, Value dest, " #
- "ArrayRef<int64_t> offsets, ArrayRef<int64_t> strides">];
- let extraClassDeclaration = [{
- static StringRef getOffsetsAttrName() { return "offsets"; }
- static StringRef getStridesAttrName() { return "strides"; }
- VectorType getSourceVectorType() {
- return source().getType().cast<VectorType>();
- }
- VectorType getDestVectorType() {
- return dest().getType().cast<VectorType>();
- }
- }];
-}
-
-def Vector_OuterProductOp :
- Vector_Op<"outerproduct", [NoSideEffect, SameOperandsAndResultElementType]>,
- Arguments<(ins AnyVector:$lhs, AnyVector:$rhs, Variadic<AnyVector>:$acc)>,
- Results<(outs AnyVector)> {
- let summary = "vector outerproduct with optional fused add";
- let description = [{
- Takes two 1-D vectors and returns the 2-D vector containing the outer product.
-
- An optional extra 2-D vector argument may be specified in which case the
- operation returns the sum of the outer-product and the extra vector. In this
- multiply-accumulate scenario, the rounding mode is the one obtained by
- guaranteeing that a fused multiply-add operation is emitted. When lowered to
- the LLVMIR dialect, this form emits `llvm.intr.fma`, which is guaranteed to
- lower to actual `fma` instructions on x86.
-
- Examples:
-
- ```
- %2 = vector.outerproduct %0, %1: vector<4xf32>, vector<8xf32>
- return %2: vector<4x8xf32>
-
- %3 = vector.outerproduct %0, %1, %2:
- vector<4xf32>, vector<8xf32>, vector<4x8xf32>
- return %3: vector<4x8xf32>
- ```
- }];
- let extraClassDeclaration = [{
- VectorType getOperandVectorTypeLHS() {
- return lhs().getType().cast<VectorType>();
- }
- VectorType getOperandVectorTypeRHS() {
- return rhs().getType().cast<VectorType>();
- }
- VectorType getOperandVectorTypeACC() {
- return (llvm::size(acc()) == 0) ? VectorType() :
- (*acc().begin()).getType().cast<VectorType>();
- }
- VectorType getVectorType() {
- return getResult().getType().cast<VectorType>();
- }
- }];
-}
-
-// TODO(andydavis) Add transformation which decomposes ReshapeOp into an
-// optimized sequence of vector rotate/shuffle/select operations.
-def Vector_ReshapeOp :
- Vector_Op<"reshape", [AttrSizedOperandSegments, NoSideEffect]>,
- Arguments<(ins AnyVector:$vector, Variadic<Index>:$input_shape,
- Variadic<Index>:$output_shape,
- I64ArrayAttr:$fixed_vector_sizes)>,
- Results<(outs AnyVector:$result)> {
- let summary = "vector reshape operation";
- let description = [{
- Reshapes its vector operand from 'input_shape' to 'output_shape',
- maintaining the fixed vector dimensions 'fixed_vector_sizes' on the
- innermost vector dimensions.
-
- The parameters 'input_shape' and 'output_shape' represent valid data shapes
- across fixed vector shapes. For example, if a vector has a valid data
- shape [6] with fixed vector size [8], then the valid data elements are
- assumed to be stored at the beginning of the vector with the remaining
- vector elements undefined.
-
- In the examples below, valid data elements are represented by an alphabetic
- character, and undefined data elements are represented by '-'.
-
- Example
-
- vector<1x8xf32> with valid data shape [6], fixed vector sizes [8]
-
- input: [a, b, c, d, e, f]
-
- layout map: (d0) -> (d0 floordiv 8, d0 mod 8)
-
- vector layout: [a, b, c, d, e, f, -, -]
-
- Example
-
- vector<2x8xf32> with valid data shape [10], fixed vector sizes [8]
-
- input: [a, b, c, d, e, f, g, h, i, j]
-
- layout map: (d0) -> (d0 floordiv 8, d0 mod 8)
-
- vector layout: [[a, b, c, d, e, f, g, h],
- [i, j, -, -, -, -, -, -]]
-
- Example
-
- vector<2x2x2x3xf32> with valid data shape [3, 5], fixed vector sizes
- [2, 3]
-
- input: [[a, b, c, d, e],
- [f, g, h, i, j],
- [k, l, m, n, o]]
-
- layout map: (d0, d1) -> (d0 floordiv 3, d1 floordiv 5,
- d0 mod 3, d1 mod 5)
-
- vector layout: [[[[a, b, c],
- [f, g, h]]
- [[d, e, -],
- [i, j, -]]],
- [[[k, l, m],
- [-, -, -]]
- [[n, o, -],
- [-, -, -]]]]
-
- Example
-
- %1 = vector.reshape %0, [%c3, %c6], [%c2, %c9], [4]
- : vector<3x2x4xf32> to vector<2x3x4xf32>
-
- input: [[a, b, c, d, e, f],
- [g, h, i, j, k, l],
- [m, n, o, p, q, r]]
-
- layout map: (d0, d1) -> (d0, d1 floordiv 4, d1 mod 4)
-
-
- Input vector: [[[a, b, c, d],
- [e, f, -, -]],
- [[g, h, i, j],
- [k, l, -, -]],
- [[m, n, o, p],
- [q, r, -, -]]]
-
- Output vector: [[[a, b, c, d],
- [e, f, g, h],
- [i, -, -, -]],
- [[j, k, l, m],
- [n, o, p, q],
- [r, -, -, -]]]
- }];
-
- let extraClassDeclaration = [{
- VectorType getInputVectorType() {
- return vector().getType().cast<VectorType>();
- }
- VectorType getOutputVectorType() {
- return getResult().getType().cast<VectorType>();
- }
-
- /// Returns as integer value the number of input shape operands.
- int64_t getNumInputShapeSizes() { return input_shape().size(); }
-
- /// Returns as integer value the number of output shape operands.
- int64_t getNumOutputShapeSizes() { return output_shape().size(); }
-
- void getFixedVectorSizes(SmallVectorImpl<int64_t> &results);
-
- static StringRef getFixedVectorSizesAttrName() {
- return "fixed_vector_sizes";
- }
- static StringRef getInputShapeAttrName() { return "input_shape"; }
- static StringRef getOutputShapeAttrName() { return "output_shape"; }
- }];
-
- let assemblyFormat = [{
- $vector `,` `[` $input_shape `]` `,` `[` $output_shape `]` `,`
- $fixed_vector_sizes attr-dict `:` type($vector) `to` type($result)
- }];
-}
-
-def Vector_StridedSliceOp :
- Vector_Op<"strided_slice", [NoSideEffect,
- PredOpTrait<"operand and result have same element type",
- TCresVTEtIsSameAsOpBase<0, 0>>]>,
- Arguments<(ins AnyVector:$vector, I64ArrayAttr:$offsets,
- I64ArrayAttr:$sizes, I64ArrayAttr:$strides)>,
- Results<(outs AnyVector)> {
- let summary = "strided_slice operation";
- let description = [{
- Takes an n-D vector, a k-sized `offsets` integer array attribute, a
- k-sized `sizes` integer array attribute, and a k-sized `strides` integer
- array attribute, and extracts the n-D subvector at the proper offset.
-
- At the moment strides must contain only 1s.
- // TODO(ntv) support non-1 strides.
-
- Returns an n-D vector where the first k dimensions match the `sizes`
- attribute. The returned subvector contains the elements starting at offset
- `offsets` and ending at `offsets + sizes`.
-
- Examples:
- ```
- %1 = vector.strided_slice %0
- {offsets = [0, 2], sizes = [2, 4], strides = [1, 1]}:
- vector<4x8x16xf32> to vector<2x4x16xf32>
- ```
-
- // TODO(ntv) Evolve to a range form syntax similar to:
- %1 = vector.strided_slice %0[0:2:1][2:4:1]
- vector<4x8x16xf32> to vector<2x4x16xf32>
- }];
- let builders = [OpBuilder<
- "Builder *builder, OperationState &result, Value source, " #
- "ArrayRef<int64_t> offsets, ArrayRef<int64_t> sizes, " #
- "ArrayRef<int64_t> strides">];
- let extraClassDeclaration = [{
- static StringRef getOffsetsAttrName() { return "offsets"; }
- static StringRef getSizesAttrName() { return "sizes"; }
- static StringRef getStridesAttrName() { return "strides"; }
- VectorType getVectorType(){ return vector().getType().cast<VectorType>(); }
- void getOffsets(SmallVectorImpl<int64_t> &results);
- }];
- let hasCanonicalizer = 1;
- let assemblyFormat = "$vector attr-dict `:` type($vector) `to` type(results)";
-}
-
-def Vector_TransferReadOp :
- Vector_Op<"transfer_read">,
- Arguments<(ins AnyMemRef:$memref, Variadic<Index>:$indices,
- AffineMapAttr:$permutation_map, AnyType:$padding)>,
- Results<(outs AnyVector:$vector)> {
-
- let summary = "Reads a supervector from memory into an SSA vector value.";
-
- let description = [{
- The `vector.transfer_read` op performs a blocking read from a slice within
- a [MemRef](../LangRef.md#memref-type) supplied as its first operand
- into a [vector](../LangRef.md#vector-type) of the same base elemental type.
-
- A memref operand with vector element type must have its vector element
- type match a suffix (shape and element type) of the vector (e.g.
- memref<3x2x6x4x3xf32>, vector<1x1x4x3xf32>).
-
- The slice is further defined by a full-rank index within the MemRef,
- supplied as the operands `2 .. 1 + rank(memref)`. The permutation_map
- [attribute](../LangRef.md#attributes) is an
- [affine-map](Affine.md#affine-maps) which specifies the transposition on the
- slice to match the vector shape. The size of the slice is specified by the
- size of the vector, given as the return type. An `ssa-value` of the same
- elemental type as the MemRef is provided as the last operand to specify
- padding in the case of out-of-bounds accesses. This operation is called
- 'read', as opposed to 'load', because the super-vector granularity is
- generally not representable with a single hardware register.
- A `vector.transfer_read` is thus a mid-level
- abstraction that supports super-vectorization with non-effecting padding for
- full-tile-only code.
-
- More precisely, let's dive deeper into the permutation_map for the following
- MLIR:
-
- ```mlir
- vector.transfer_read %A[%expr1, %expr2, %expr3, %expr4]
- { permutation_map : (d0,d1,d2,d3) -> (d2,0,d0) } :
- memref<?x?x?x?xf32>, vector<3x4x5xf32>
- ```
-
- This operation always reads a slice starting at `%A[%expr1, %expr2, %expr3,
- %expr4]`. The size of the slice is 3 along d2 and 5 along d0, so the slice
- is: `%A[%expr1 : %expr1 + 5, %expr2, %expr3:%expr3 + 3, %expr4]`
-
- That slice needs to be read into a `vector<3x4x5xf32>`. Since the
- permutation map is not full rank, there must be a broadcast along vector
- dimension `1`.
-
- A notional lowering of vector.transfer_read could generate code resembling:
-
- ```mlir
- // %expr1, %expr2, %expr3, %expr4 defined before this point
- %tmp = alloc() : vector<3x4x5xf32>
- %view_in_tmp = "element_type_cast"(%tmp) : memref<1xvector<3x4x5xf32>>
- for %i = 0 to 3 {
- affine.for %j = 0 to 4 {
- affine.for %k = 0 to 5 {
- %a = load %A[%expr1 + %k, %expr2, %expr3 + %i, %expr4] :
- memref<?x?x?x?xf32>
- store %a, %tmp[%i, %j, %k] : vector<3x4x5xf32>
- }}}
- %c0 = constant 0 : index
- %vec = load %view_in_tmp[%c0] : vector<3x4x5xf32>
- ```
-
- On a GPU one could then map `i`, `j`, `k` to blocks and threads. Notice that
- the temporary storage footprint is `3 * 5` values but `3 * 4 * 5` values are
- actually transferred between `%A` and `%tmp`.
-
- Alternatively, if a notional vector broadcast operation were available, the
- lowered code would resemble:
-
- ```mlir
- // %expr1, %expr2, %expr3, %expr4 defined before this point
- %tmp = alloc() : vector<3x4x5xf32>
- %view_in_tmp = "element_type_cast"(%tmp) : memref<1xvector<3x4x5xf32>>
- for %i = 0 to 3 {
- affine.for %k = 0 to 5 {
- %a = load %A[%expr1 + %k, %expr2, %expr3 + %i, %expr4] :
- memref<?x?x?x?xf32>
- store %a, %tmp[%i, 0, %k] : vector<3x4x5xf32>
- }}
- %c0 = constant 0 : index
- %tmpvec = load %view_in_tmp[%c0] : vector<3x4x5xf32>
- %vec = broadcast %tmpvec, 1 : vector<3x4x5xf32>
- ```
-
- where `broadcast` broadcasts from element 0 to all others along the
- specified dimension. This time, the temporary storage footprint is `3 * 5`
- values which is the same amount of data as the `3 * 5` values transferred.
- An additional `1` broadcast is required. On a GPU this broadcast could be
- implemented using a warp-shuffle if loop `j` were mapped to `threadIdx.x`.
-
- Syntax
- ```
- operation ::= ssa-id `=` `vector.transfer_read` ssa-use-list
- `{` attribute-entry `} :` memref-type `,` vector-type
- ```
-
- Examples:
-
- ```mlir
- // Read the slice `%A[%i0, %i1:%i1+256, %i2:%i2+32]` into vector<32x256xf32>
- // and pad with %f0 to handle the boundary case:
- %f0 = constant 0.0 : f32
- for %i0 = 0 to %0 {
- affine.for %i1 = 0 to %1 step 256 {
- affine.for %i2 = 0 to %2 step 32 {
- %v = vector.transfer_read %A[%i0, %i1, %i2], (%f0)
- {permutation_map: (d0, d1, d2) -> (d2, d1)} :
- memref<?x?x?xf32>, vector<32x256xf32>
- }}}
-
- // Read the slice `%A[%i0, %i1]` (i.e. the element `%A[%i0, %i1]`) into
- // vector<128xf32>. The underlying implementation will require a 1-D vector
- // broadcast:
- for %i0 = 0 to %0 {
- affine.for %i1 = 0 to %1 {
- %3 = vector.transfer_read %A[%i0, %i1]
- {permutation_map: (d0, d1) -> (0)} :
- memref<?x?xf32>, vector<128xf32>
- }
- }
-
- // Read from a memref with vector element type.
- %4 = vector.transfer_read %arg1[%c3, %c3], %vf0
- {permutation_map = (d0, d1)->(d0, d1)}
- : memref<?x?xvector<4x3xf32>>, vector<1x1x4x3xf32>
- ```
- }];
-
- let extraClassDeclaration = [{
- MemRefType getMemRefType() {
- return memref().getType().cast<MemRefType>();
- }
- VectorType getVectorType() {
- return vector().getType().cast<VectorType>();
- }
- }];
-}
-
-def Vector_TransferWriteOp :
- Vector_Op<"transfer_write">,
- Arguments<(ins AnyVector:$vector, AnyMemRef:$memref,
- Variadic<Index>:$indices,
- AffineMapAttr:$permutation_map)> {
-
- let summary = "The vector.transfer_write op writes a supervector to memory.";
-
- let description = [{
- The `vector.transfer_write` performs a blocking write from a
- [vector](../LangRef.md#vector-type), supplied as its first operand, into a
- slice within a [MemRef](../LangRef.md#memref-type) of the same base
- elemental type, supplied as its second operand.
-
- A vector memref operand must have its vector element type match a suffix
- (shape and element type) of the vector (e.g. memref<3x2x6x4x3xf32>,
- vector<1x1x4x3xf32>).
-
- The slice is further defined by a full-rank index within the MemRef,
- supplied as the operands `3 .. 2 + rank(memref)`.
- The permutation_map [attribute](../LangRef.md#attributes) is an
- [affine-map](Affine.md#affine-maps) which specifies the transposition on the
- slice to match the vector shape. The size of the slice is specified by the
- size of the vector. This operation is called 'write', as opposed to
- 'store', because the super-vector granularity is generally not representable
- with a single hardware register. A `vector.transfer_write` is thus a
- mid-level abstraction that supports super-vectorization with non-effecting
- padding for full-tile-only code. It is the responsibility of
- `vector.transfer_write`'s implementation to ensure the memory writes are
- valid. Different lowerings may be pertinent depending on the hardware
- support.
-
- Syntax:
-
- ```
- operation ::= `vector.transfer_write` ssa-use-list
- `{` attribute-entry `} :` vector-type `,` memref-type
- ```
-
- Examples:
-
- ```mlir
- // write vector<16x32x64xf32> into the slice
- // `%A[%i0, %i1:%i1+32, %i2:%i2+64, %i3:%i3+16]`:
- for %i0 = 0 to %0 {
- affine.for %i1 = 0 to %1 step 32 {
- affine.for %i2 = 0 to %2 step 64 {
- affine.for %i3 = 0 to %3 step 16 {
- %val = `ssa-value` : vector<16x32x64xf32>
- vector.transfer_write %val, %A[%i0, %i1, %i2, %i3]
- {permutation_map: (d0, d1, d2, d3) -> (d3, d1, d2)} :
- vector<16x32x64xf32>, memref<?x?x?x?xf32>
- }}}}
-
- // write to a memref with vector element type.
- vector.transfer_write %4, %arg1[%c3, %c3]
- {permutation_map = (d0, d1)->(d0, d1)}
- : vector<1x1x4x3xf32>, memref<?x?xvector<4x3xf32>>
- ```
- }];
-
- let extraClassDeclaration = [{
- VectorType getVectorType() {
- return vector().getType().cast<VectorType>();
- }
- MemRefType getMemRefType() {
- return memref().getType().cast<MemRefType>();
- }
- }];
- let assemblyFormat = [{
- $vector `,` $memref `[` $indices `]` attr-dict `:` type($vector) `,`
- type($memref)
- }];
-}
-
-def Vector_ShapeCastOp :
- Vector_Op<"shape_cast", [NoSideEffect]>,
- Arguments<(ins AnyTypeOf<[AnyVector, TupleOf<[AnyVector]>]>:$source)>,
- Results<(outs AnyTypeOf<[AnyVector, TupleOf<[AnyVector]>]>:$result)> {
- let summary = "shape_cast casts between vector shapes";
- let description = [{
- The shape_cast operation casts between an n-D source vector shape and
- a k-D result vector shape (the element type remains the same).
-
- If reducing rank (n > k), result dimension sizes must be a product
- of contiguous source dimension sizes.
- If expanding rank (n < k), source dimensions must factor into a
- contiguous sequence of destination dimension sizes.
- Each source dim is expanded (or contiguous sequence of source dims combined)
- in source dimension list order (i.e. 0 <= i < n), to produce a contiguous
- sequence of result dims (or a single result dim), in result dimension list
- order (i.e. 0 <= j < k). The product of all source dimension sizes and all
- result dimension sizes must match.
-
- If the source/result types are a tuple of vectors, the casting operation
- described above is applied to each source/result tuple element pair.
-
- It is currently assumed that this operation does not require moving data,
- and that it will be folded away before lowering vector operations.
-
- There is an exception to the folding expectation when targeting
- llvm.intr.matrix operations: a type conversion back and forth from a
- 2-D MLIR vector to a 1-D flattened LLVM vector is needed, so lowering
- shape_cast to LLVM is supported in that particular case, for now.
-
- Examples:
-
- ```mlir
- // Example casting to a lower vector rank.
- %1 = vector.shape_cast %0 : vector<5x1x4x3xf32> to vector<20x3xf32>
-
- // Example casting to a higher vector rank.
- %3 = vector.shape_cast %2 : vector<10x12x8xf32> to vector<5x2x3x4x8xf32>
-
- // Example casting a tuple of vectors of same rank, where tuple elements
- // may have different shapes.
- %5 = vector.shape_cast %4 : tuple<vector<3x4x2xf32>, vector<3x3x2xf32>> to
- tuple<vector<12x2xf32>, vector<9x2xf32>>
- ```
- }];
- let extraClassDeclaration = [{
- VectorType getSourceVectorType() {
- return source().getType().cast<VectorType>();
- }
- VectorType getResultVectorType() {
- return getResult().getType().cast<VectorType>();
- }
- }];
- let assemblyFormat = "$source attr-dict `:` type($source) `to` type($result)";
-}
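The dimension grouping behind the casts above works out as follows:

```mlir
// Rank-reducing: 5x1x4x3 -> 20x3, since (5*1*4) = 20 and 3 carries over.
// Rank-expanding: 10x12x8 -> 5x2x3x4x8, since 10 = (5*2), 12 = (3*4),
// and 8 carries over.
```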
-
-def Vector_TypeCastOp :
- Vector_Op<"type_cast", [NoSideEffect]>,
- Arguments<(ins StaticShapeMemRefOf<[AnyType]>:$memref)>,
- Results<(outs AnyMemRef)> {
- let summary = "type_cast op converts a scalar memref to a vector memref";
- let description = [{
- Performs a conversion from a memref with a scalar element type to a memref
- with a *single* vector element, copying the shape of the memref to the
- vector. This is the minimal viable operation required to make
- super-vectorization operational. It can be seen as a special case of the
- `view` operation, but scoped to the super-vectorization context.
-
- Syntax:
-
- ```
- operation ::= `vector.type_cast` ssa-use : memref-type to memref-type
- ```
-
- Example:
-
- ```mlir
- %A = alloc() : memref<5x4x3xf32>
- %VA = vector.type_cast %A : memref<5x4x3xf32> to memref<vector<5x4x3xf32>>
- ```
- }];
-
- let builders = [OpBuilder<
- "Builder *builder, OperationState &result, Value source">];
-
- let parser = [{
- return impl::parseCastOp(parser, result);
- }];
-
- let extraClassDeclaration = [{
- MemRefType getMemRefType() {
- return memref().getType().cast<MemRefType>();
- }
- MemRefType getResultMemRefType() {
- return getResult().getType().cast<MemRefType>();
- }
- }];
-}
-
-def Vector_ConstantMaskOp :
- Vector_Op<"constant_mask", [NoSideEffect]>,
- Arguments<(ins I64ArrayAttr:$mask_dim_sizes)>,
- Results<(outs VectorOf<[I1]>)> {
- let summary = "creates a constant vector mask";
- let description = [{
- Creates and returns a vector mask where elements of the result vector
- are set to '0' or '1', based on whether the element indices are contained
- within a hyper-rectangular region specified by the 'mask_dim_sizes'
- array attribute argument. Each element of the 'mask_dim_sizes' array
- specifies an exclusive upper bound [0, mask-dim-size-element-value)
- for a unique dimension in the vector result. The conjunction of the ranges
- defines a hyper-rectangular region within which element values are set to 1
- (otherwise element values are set to 0).
-
- Example: create a constant vector mask of size 4x3xi1 where elements in
- range 0 <= row <= 2 and 0 <= col <= 1 are set to 1 (others to 0).
-
- %1 = vector.constant_mask [3, 2] : vector<4x3xi1>
-
- print %1
- columns
- 0 1 2
- |------------
- 0 | 1 1 0
- rows 1 | 1 1 0
- 2 | 1 1 0
- 3 | 0 0 0
- }];
-
- let extraClassDeclaration = [{
- static StringRef getMaskDimSizesAttrName() { return "mask_dim_sizes"; }
- }];
- let assemblyFormat = "$mask_dim_sizes attr-dict `:` type(results)";
-}
-
-def Vector_CreateMaskOp :
- Vector_Op<"create_mask", [NoSideEffect]>,
- Arguments<(ins Variadic<Index>:$operands)>, Results<(outs VectorOf<[I1]>)> {
- let summary = "creates a vector mask";
- let description = [{
- Creates and returns a vector mask where elements of the result vector
- are set to '0' or '1', based on whether the element indices are contained
- within a hyper-rectangular region specified by the operands. Specifically,
- each operand specifies a range [0, operand-value) for a unique dimension in
- the vector result. The conjunction of the operand ranges defines a
- hyper-rectangular region within which element values are set to 1
- (otherwise element values are set to 0).
-
- Example: create a vector mask of size 4x3xi1 where elements in range
- 0 <= row <= 2 and 0 <= col <= 1 are set to 1 (others to 0).
-
- %1 = vector.create_mask %c3, %c2 : vector<4x3xi1>
-
- print %1
- columns
- 0 1 2
- |------------
- 0 | 1 1 0
- rows 1 | 1 1 0
- 2 | 1 1 0
- 3 | 0 0 0
- }];
-
- let hasCanonicalizer = 1;
- let assemblyFormat = "$operands attr-dict `:` type(results)";
-}
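The mask bounds are ordinary index values; a minimal self-contained sketch of the example above, with the bounds materialized as constants:

```mlir
%c2 = constant 2 : index
%c3 = constant 3 : index
%1 = vector.create_mask %c3, %c2 : vector<4x3xi1>
```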
-
-def Vector_TupleOp :
- Vector_Op<"tuple", [NoSideEffect]>,
- Arguments<(ins Variadic<AnyVector>:$vectors)>,
- Results<(outs TupleOf<[AnyVector]>)> {
- let summary = "make tuple of vectors operation";
- let description = [{
- Returns a tuple of its operands 'vectors'.
-
- Note that this operation is used during the vector op unrolling
- transformation and should be removed before lowering to lower-level
- dialects.
-
-
- Examples:
- ```
- %0 = vector.transfer_read ... : vector<2x2xf32>
- %1 = vector.transfer_read ... : vector<2x1xf32>
- %2 = vector.transfer_read ... : vector<2x2xf32>
- %3 = vector.transfer_read ... : vector<2x1xf32>
-
- %4 = vector.tuple %0, %1, %2, %3
- : vector<2x2xf32>, vector<2x1xf32>, vector<2x2xf32>, vector<2x1xf32>
-
- ```
- }];
-
- let extraClassDeclaration = [{
- TupleType getResultTupleType() {
- return getResult().getType().cast<TupleType>();
- }
- }];
-}
-
-def Vector_TupleGetOp :
- Vector_Op<"tuple_get", [NoSideEffect]>,
- Arguments<(ins TupleOf<[AnyVector]>:$vectors, APIntAttr:$index)>,
- Results<(outs AnyVector)> {
- let summary = "vector tuple get operation";
- let description = [{
- Returns the tuple element of 'vectors' at 'index'.
-
- Note that this operation is used during the vector op unrolling
- transformation and should be removed before lowering to lower-level
- dialects.
-
- Examples:
- ```
- %4 = vector.tuple %0, %1, %2, %3
- : vector<2x2xf32>, vector<2x1xf32>, vector<2x2xf32>, vector<2x1xf32>
-
- %5 = vector.tuple_get %4, 1
- : tuple<vector<2x2xf32>, vector<2x1xf32>,
- vector<2x2xf32>, vector<2x1xf32>>
- ```
- }];
-
- let extraClassDeclaration = [{
- VectorType getResultVectorType() {
- return getResult().getType().cast<VectorType>();
- }
- int64_t getIndex() {
- return getAttrOfType<IntegerAttr>("index").getValue().getSExtValue();
- }
- static StringRef getIndexAttrName() { return "index"; }
- }];
- let hasFolder = 1;
-}
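In the example above, `%5` selects the tuple element at index 1 and therefore has type vector<2x1xf32>.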
-
-def Vector_PrintOp :
- Vector_Op<"print", []>, Arguments<(ins AnyType:$source)> {
- let summary = "print operation (for testing and debugging)";
- let description = [{
- Prints the source vector (or scalar) to stdout in human readable
- format (for testing and debugging). No return value.
-
- Examples:
- ```
- %0 = constant 0.0 : f32
- %1 = vector.broadcast %0 : f32 to vector<4xf32>
- vector.print %1 : vector<4xf32>
- ```
-
- When lowered to LLVM, the vector print is unrolled into elementary
- printing method calls that at runtime will yield
-
- ( 0.0, 0.0, 0.0, 0.0 )
-
- on stdout when linked with a small runtime support library, which only
- needs to provide a few printing methods (a single value for all data
- types, opening/closing bracket, comma, newline).
- }];
- let verifier = ?;
- let extraClassDeclaration = [{
- Type getPrintType() {
- return source().getType();
- }
- }];
- let assemblyFormat = "$source attr-dict `:` type($source)";
-}
-
-//===----------------------------------------------------------------------===//
-// Ops used for supporting progressive lowering and conversion type changes.
-//===----------------------------------------------------------------------===//
-
-/// Vector dialect matrix multiplication op that operates on flattened 1-D
-/// MLIR vectors. This is the counterpart of llvm.matrix.multiply in MLIR.
-/// This may seem redundant with vector.contract but it serves the purposes of
-/// more progressive lowering and localized type conversion on the path:
-/// `vector<...x...xf32> -> vector<...xf32> -> !llvm<... x float>`.
-def Vector_MatmulOp : Vector_Op<"matrix_multiply", [NoSideEffect,
- PredOpTrait<"lhs operand and result have same element type",
- TCresVTEtIsSameAsOpBase<0, 0>>,
- PredOpTrait<"rhs operand and result have same element type",
- TCresVTEtIsSameAsOpBase<0, 1>>]>,
- Arguments<(
- // TODO(ntv, fhahn): tighten vector element types that make sense.
- ins VectorOfRankAndType<[1],
- [AnySignlessInteger, AnySignedInteger, AnyFloat]>:$lhs,
- VectorOfRankAndType<[1],
- [AnySignlessInteger, AnySignedInteger, AnyFloat]>:$rhs,
- I32Attr:$lhs_rows, I32Attr:$lhs_columns, I32Attr:$rhs_columns)>,
- Results<(
- outs VectorOfRankAndType<[1],
- [AnySignlessInteger, AnySignedInteger, AnyFloat]>:$res)>
-{
- let summary = "Vector matrix multiplication op that operates on flattened 1-D"
- " MLIR vectors";
- let description = [{
- This is the counterpart of llvm.matrix.multiply in MLIR. It serves the
- purposes of more progressive lowering and localized type conversion.
-
- The `vector.matrix_multiply` op treats `lhs` as a matrix with <lhs_rows>
- rows and <lhs_columns> columns, `rhs` as a matrix with <lhs_columns> rows
- and <rhs_columns> columns, and multiplies them. The result matrix is
- returned embedded in the result vector.
-
- Example:
-
- ```
- %C = vector.matrix_multiply %A, %B
- { lhs_rows = 4: i32, lhs_columns = 16: i32 , rhs_columns = 3: i32 } :
- (vector<64xf64>, vector<48xf64>) -> vector<12xf64>
- ```
- }];
- let builders = [
- OpBuilder<"Builder *builder, OperationState &result, Value lhs, Value rhs, "
- "unsigned lhsRows, unsigned lhsColumns, unsigned rhsColumns",
- [{
- result.addOperands({lhs, rhs});
- result.addAttribute("lhs_rows", builder->getI32IntegerAttr(lhsRows));
- result.addAttribute("lhs_columns", builder->getI32IntegerAttr(lhsColumns));
- result.addAttribute("rhs_columns", builder->getI32IntegerAttr(rhsColumns));
- result.addTypes(VectorType::get(lhsRows * rhsColumns,
- lhs.getType().cast<VectorType>().getElementType()));
- }]>,
- ];
- let verifier = ?;
- let assemblyFormat = "$lhs `,` $rhs attr-dict "
- "`:` `(` type($lhs) `,` type($rhs) `)` `->` type($res)";
-}
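As a shape check for the example: `lhs` is a 4x16 matrix flattened into vector<64xf64>, `rhs` is a 16x3 matrix flattened into vector<48xf64>, and the 4x3 result comes back flattened as vector<12xf64>.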
-
-#endif // VECTOR_OPS
diff --git a/mlir/lib/Dialect/VectorOps/VectorTransforms.cpp b/mlir/lib/Dialect/VectorOps/VectorTransforms.cpp
deleted file mode 100644
index e853c76d0dba..000000000000
--- a/mlir/lib/Dialect/VectorOps/VectorTransforms.cpp
+++ /dev/null
@@ -1,1349 +0,0 @@
-//===- VectorTransforms.cpp - Conversion within the Vector dialect -------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements target-independent rewrites as 1->N patterns.
-//
-//===----------------------------------------------------------------------===//
-
-#include <type_traits>
-
-#include "mlir/Dialect/Affine/IR/AffineOps.h"
-#include "mlir/Dialect/StandardOps/IR/Ops.h"
-#include "mlir/Dialect/Utils/StructuredOpsUtils.h"
-#include "mlir/Dialect/VectorOps/VectorOps.h"
-#include "mlir/Dialect/VectorOps/VectorTransforms.h"
-#include "mlir/Dialect/VectorOps/VectorUtils.h"
-#include "mlir/IR/AffineExpr.h"
-#include "mlir/IR/AffineMap.h"
-#include "mlir/IR/Attributes.h"
-#include "mlir/IR/Builders.h"
-#include "mlir/IR/Function.h"
-#include "mlir/IR/Location.h"
-#include "mlir/IR/Matchers.h"
-#include "mlir/IR/Module.h"
-#include "mlir/IR/OperationSupport.h"
-#include "mlir/IR/PatternMatch.h"
-#include "mlir/IR/Types.h"
-#include "mlir/Support/Functional.h"
-#include "mlir/Support/STLExtras.h"
-
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-
-#define DEBUG_TYPE "vector-to-vector"
-
-using namespace mlir;
-using llvm::dbgs;
-using mlir::functional::zipMap;
-
-static llvm::cl::OptionCategory clOptionsCategory(DEBUG_TYPE " options");
-
-static llvm::cl::opt<bool> lowerToLLVMMatrixIntrinsics(
- "vector-lower-matrix-intrinsics",
- llvm::cl::desc("Lower vector.contract to llvm.intr.matrix.multiply"),
- llvm::cl::init(false), llvm::cl::cat(clOptionsCategory));
-
-/// Given a shape with sizes greater than 0 along all dimensions,
-/// returns the distance, in number of elements, between a slice in a dimension
-/// and the next slice in the same dimension.
-/// e.g. shape[3, 4, 5] -> linearization_basis[20, 5, 1]
-static SmallVector<int64_t, 8> computeStrides(ArrayRef<int64_t> shape) {
- if (shape.empty())
- return {};
- SmallVector<int64_t, 8> tmp;
- tmp.reserve(shape.size());
- int64_t running = 1;
- for (auto size : llvm::reverse(shape)) {
- assert(size > 0 && "size must be positive");
- tmp.push_back(running);
- running *= size;
- }
- return SmallVector<int64_t, 8>(tmp.rbegin(), tmp.rend());
-}
-
-static int64_t computeMaxLinearIndex(ArrayRef<int64_t> basis) {
- if (basis.empty())
- return 0;
- int64_t res = 1;
- for (auto b : basis)
- res *= b;
- return res;
-}
-
-/// Computes and returns the linearized index of 'offsets' w.r.t. 'basis'.
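/// For example, offsets [2, 1, 3] with basis [20, 5, 1] linearize to
/// 2*20 + 1*5 + 3*1 = 48.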
-static int64_t linearize(ArrayRef<int64_t> offsets, ArrayRef<int64_t> basis) {
- assert(offsets.size() == basis.size());
- int64_t linearIndex = 0;
- for (unsigned idx = 0, e = basis.size(); idx < e; ++idx)
- linearIndex += offsets[idx] * basis[idx];
- return linearIndex;
-}
-
-// Clones `op` into a new operation that takes `operands` and returns
-// `resultTypes`.
-static Operation *cloneOpWithOperandsAndTypes(PatternRewriter &builder,
- Location loc, Operation *op,
- ArrayRef<Value> operands,
- ArrayRef<Type> resultTypes) {
- OperationState res(loc, op->getName().getStringRef(), operands, resultTypes,
- op->getAttrs());
- return builder.createOperation(res);
-}
-
-// Populates 'resultElements[indexMap[i]]' with elements from 'inputElements[i]'
-// for each index 'i' in inputElements with a valid mapping in 'indexMap'.
-static void getMappedElements(const DenseMap<int64_t, int64_t> &indexMap,
- ArrayRef<int64_t> inputElements,
- SmallVectorImpl<int64_t> &resultElements) {
- assert(indexMap.size() == resultElements.size());
- assert(inputElements.size() >= resultElements.size());
- for (unsigned i = 0, e = inputElements.size(); i < e; ++i) {
- auto it = indexMap.find(i);
- if (it != indexMap.end())
- resultElements[it->second] = inputElements[i];
- }
-}
-
-// Returns a tuple type with vector element types for each resulting slice
-// of 'vectorType' unrolled by 'sizes' and 'strides'.
-// TODO(andydavis) Move this to a utility function and share it with
-// Extract/InsertSlicesOp verification.
-static TupleType generateExtractSlicesOpResultType(VectorType vectorType,
- ArrayRef<int64_t> sizes,
- ArrayRef<int64_t> strides,
- PatternRewriter &builder) {
- assert(llvm::all_of(strides, [](int64_t s) { return s == 1; }));
- assert(static_cast<int64_t>(sizes.size()) == vectorType.getRank());
- assert(static_cast<int64_t>(strides.size()) == vectorType.getRank());
-
- // Compute shape ratio of 'shape' and 'sizes'.
- auto shape = vectorType.getShape();
- auto maybeDimSliceCounts = shapeRatio(shape, sizes);
- assert(maybeDimSliceCounts.hasValue());
- auto sliceDimCounts = *maybeDimSliceCounts;
-
- // Compute strides w.r.t number of slices in each dimension.
- auto sliceStrides = computeStrides(sliceDimCounts);
- int64_t sliceCount = computeMaxLinearIndex(sliceDimCounts);
- SmallVector<Type, 4> vectorTypes(sliceCount);
- for (unsigned i = 0; i < sliceCount; ++i) {
- auto vectorOffsets = delinearize(sliceStrides, i);
- auto elementOffsets =
- computeElementOffsetsFromVectorSliceOffsets(sizes, vectorOffsets);
- auto sliceSizes = computeSliceSizes(shape, sizes, elementOffsets);
- // Create Vector type and add to 'vectorTypes[i]'.
- vectorTypes[i] = VectorType::get(sliceSizes, vectorType.getElementType());
- }
- return TupleType::get(vectorTypes, builder.getContext());
-}
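// A self-contained sketch of the slice enumeration above for a concrete,
// evenly divisible case (the shapeRatio assert requires divisibility here);
// all names are hypothetical.

#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  std::vector<int64_t> shape = {4, 6}; // Source vector<4x6xf32>.
  std::vector<int64_t> sizes = {2, 3}; // Per-slice target shape.
  // Slices per dimension: {4/2, 6/3} = {2, 2}, i.e. 4 slices in total.
  int64_t count0 = shape[0] / sizes[0], count1 = shape[1] / sizes[1];
  for (int64_t i = 0; i < count0; ++i)
    for (int64_t j = 0; j < count1; ++j)
      // Element offsets of slice (i, j) within the source vector.
      std::cout << "slice at [" << i * sizes[0] << ", " << j * sizes[1]
                << "], shape 2x3\n";
  // The resulting type is tuple<vector<2x3xf32>, ..., vector<2x3xf32>> (x4).
}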
-
-// UnrolledVectorState aggregates per-operand/result vector state required for
-// unrolling.
-struct UnrolledVectorState {
- SmallVector<int64_t, 4> unrolledShape;
- SmallVector<int64_t, 4> unrollFactors;
- SmallVector<int64_t, 8> basis;
- int64_t numInstances;
- Value slicesTuple;
-};
-
-// Populates 'state' with unrolled shape, unroll factors, basis and
-// num unrolled instances for 'vectorType'.
-static void initUnrolledVectorState(VectorType vectorType, Value initValue,
- const DenseMap<int64_t, int64_t> &indexMap,
- ArrayRef<int64_t> targetShape,
- UnrolledVectorState &state,
- PatternRewriter &builder) {
- // Compute unrolled shape of 'vectorType'.
- state.unrolledShape.resize(vectorType.getRank());
- getMappedElements(indexMap, targetShape, state.unrolledShape);
- // Compute unroll factors for unrolled shape.
- auto maybeUnrollFactors =
- shapeRatio(vectorType.getShape(), state.unrolledShape);
- assert(maybeUnrollFactors.hasValue());
- state.unrollFactors = *maybeUnrollFactors;
- // Compute 'basis' and 'numInstances' based on 'state.unrollFactors'.
- state.basis = computeStrides(state.unrollFactors);
- state.numInstances = computeMaxLinearIndex(state.unrollFactors);
- state.slicesTuple = nullptr;
- if (initValue != nullptr) {
- // Create ExtractSlicesOp.
- SmallVector<int64_t, 4> sizes(state.unrolledShape);
- SmallVector<int64_t, 4> strides(state.unrollFactors.size(), 1);
- auto tupleType =
- generateExtractSlicesOpResultType(vectorType, sizes, strides, builder);
- state.slicesTuple = builder.create<vector::ExtractSlicesOp>(
- initValue.getLoc(), tupleType, initValue, sizes, strides);
- }
-}
-
-// Computes and returns the linear index of the unrolled vector at
-// 'vectorOffsets' within the vector represented by 'state'.
-static int64_t
-getUnrolledVectorLinearIndex(UnrolledVectorState &state,
- ArrayRef<int64_t> vectorOffsets,
- DenseMap<int64_t, int64_t> &indexMap) {
- // Compute vector offsets.
- SmallVector<int64_t, 4> sliceOffsets(state.unrolledShape.size());
- getMappedElements(indexMap, vectorOffsets, sliceOffsets);
- // Compute and return linear index of 'sliceOffsets' w.r.t 'state.basis'.
- return linearize(sliceOffsets, state.basis);
-}
-
-// Returns an unrolled vector at 'vectorOffsets' within the vector
-// represented by 'state'. The vector is created from a slice of 'initValue'
-// if not present in 'cache'.
-static Value getOrCreateUnrolledVectorSlice(
- Location loc, UnrolledVectorState &state, ArrayRef<int64_t> vectorOffsets,
- ArrayRef<int64_t> offsets, DenseMap<int64_t, int64_t> &indexMap,
- Value initValue, SmallVectorImpl<Value> &cache, PatternRewriter &builder) {
- // Compute slice offsets.
- SmallVector<int64_t, 4> sliceOffsets(state.unrolledShape.size());
- getMappedElements(indexMap, offsets, sliceOffsets);
- // TODO(b/144845578) Support non-1 strides.
- SmallVector<int64_t, 4> sliceStrides(state.unrolledShape.size(), 1);
- // Compute linear index of 'sliceOffsets' w.r.t 'state.basis'.
- int64_t sliceLinearIndex =
- getUnrolledVectorLinearIndex(state, vectorOffsets, indexMap);
- assert(sliceLinearIndex < static_cast<int64_t>(cache.size()));
- auto valueSlice = cache[sliceLinearIndex];
- if (valueSlice == nullptr) {
- // Return tuple element at 'sliceLinearIndex'.
- auto tupleIndex = builder.getI64IntegerAttr(sliceLinearIndex);
- auto initValueType = initValue.getType().cast<VectorType>();
- auto vectorType =
- VectorType::get(state.unrolledShape, initValueType.getElementType());
- // Initialize 'cache' with slice from 'initValue'.
- valueSlice = builder.create<vector::TupleGetOp>(
- loc, vectorType, state.slicesTuple, tupleIndex);
- // Store value back to 'cache'.
- cache[sliceLinearIndex] = valueSlice;
- }
- return valueSlice;
-}
-
-// VectorState aggregates per-operand/result vector state required for
-// creating slices of vector operands, and clones of the operation being
-// unrolled.
-struct VectorState {
- // The type of this vector.
- VectorType type;
- // Map from iteration space index to vector dimension index.
- DenseMap<int64_t, int64_t> indexMap;
- // Index of this value in operation's operand list (-1 if not an operand).
- int64_t operandIndex = -1;
- // Accumulator iterator flag.
- bool isAcc = false;
-};
-
-//
-// unrollSingleResultStructuredOp
-//
-// Returns a value representing the result of structured operation 'op'
-// with iteration bounds 'iterationBounds' unrolled to 'targetShape'.
-// A list of VectorState objects must be specified in 'vectors', where
-// each VectorState in the list represents a vector operand or vector result
-// (if the operation does not have an accumulator operand).
-// The VectorState at index 'resultIndex' in the list must be the state
-// associated with the operation's single result (i.e. either its accumulator
-// operand or vector result value).
-//
-// Example:
-//
-// // Before unrolling
-//
-// operand0 operand1 operand2
-// \ | /
-// -------------------- opA --------------------
-//
-// // After unrolling by 2
-//
-// operand0 operand1 operand2
-// / \ / \ / \
-// slice00 slice01 slice10 slice11 slice20 slice21
-// \ | | | / |
-// -------------------- opA0 -------------------- |
-// | | | |
-// \ | | /
-// -------------------- opA1 -------------------
-// | |
-// \ /
-// insertslice
-// |
-
-// TODO(andydavis) Add the following canonicalization/simplification patterns:
-// *) Add pattern which matches InsertStridedSlice -> StridedSlice and forwards
-// InsertStridedSlice operand to StridedSlice.
-// *) Add pattern which matches SourceOp -> StridedSlice -> UserOp which checks
-// if there are duplicate identical StridedSlice ops from SourceOp, and
-// rewrites itself to use the first duplicate. This transformation should
-// cause users of identical StridedSlice ops to reuse the same StridedSlice
-// operation, and leave the duplicate StridedSlice ops with no users
-// (removable with DCE).
-
-// TODO(andydavis) Generalize this to support structured ops beyond
-// vector ContractionOp, and merge it with 'unrollSingleResultOpMatchingType'
-static Value unrollSingleResultStructuredOp(Operation *op,
- ArrayRef<int64_t> iterationBounds,
- std::vector<VectorState> &vectors,
- unsigned resultIndex,
- ArrayRef<int64_t> targetShape,
- PatternRewriter &builder) {
- auto shapedType = op->getResult(0).getType().dyn_cast_or_null<ShapedType>();
- assert(shapedType && shapedType.hasStaticShape() &&
- "Expected a statically shaped result type");
-
- // Compute unroll factors for 'iterationBounds' based on 'targetShape'
- auto maybeUnrollFactors = shapeRatio(iterationBounds, targetShape);
- assert(maybeUnrollFactors.hasValue() &&
- "Failed to compute unroll factors for target shape");
- auto unrollFactors = *maybeUnrollFactors;
-
- // Compute unrolled vector state for each vector in 'vectors'.
- unsigned numVectors = vectors.size();
- SmallVector<UnrolledVectorState, 3> unrolledVectorState(numVectors);
- for (unsigned i = 0; i < numVectors; ++i) {
- int64_t operandIndex = vectors[i].operandIndex;
- auto operand = operandIndex >= 0 ? op->getOperand(operandIndex) : nullptr;
- initUnrolledVectorState(vectors[i].type, operand, vectors[i].indexMap,
- targetShape, unrolledVectorState[i], builder);
- }
- // Compute number of total unrolled instances.
- auto numUnrolledInstances = computeMaxLinearIndex(unrollFactors);
- auto sliceStrides = computeStrides(unrollFactors);
-
- auto &resultValueState = unrolledVectorState[resultIndex];
- auto unrolledResultType = VectorType::get(resultValueState.unrolledShape,
- shapedType.getElementType());
-
- // Initialize caches for intermediate vector results.
- std::vector<SmallVector<Value, 4>> caches(numVectors);
- for (unsigned i = 0; i < numVectors; ++i)
- caches[i].resize(unrolledVectorState[i].numInstances);
-
- // Unroll 'numUnrolledInstances' of 'op', storing results in 'caches'.
- for (unsigned i = 0; i < numUnrolledInstances; ++i) {
- auto vectorOffsets = delinearize(sliceStrides, i);
- auto elementOffsets =
- computeElementOffsetsFromVectorSliceOffsets(targetShape, vectorOffsets);
- // Get cached slice (or create slice) for each operand at 'offsets'.
- SmallVector<Value, 3> operands;
- operands.resize(op->getNumOperands());
- for (unsigned j = 0; j < numVectors; ++j) {
- int64_t operandIndex = vectors[j].operandIndex;
- if (operandIndex < 0)
- continue; // Result, not an operand.
- auto operand = op->getOperand(operandIndex);
- operands[operandIndex] = getOrCreateUnrolledVectorSlice(
- op->getLoc(), unrolledVectorState[j], vectorOffsets, elementOffsets,
- vectors[j].indexMap, operand, caches[j], builder);
- }
- // Create op on sliced vector arguments.
- auto resultVector =
- cloneOpWithOperandsAndTypes(builder, op->getLoc(), op, operands,
- unrolledResultType)
- ->getResult(0);
-
- // Compute linear result index.
- int64_t linearIndex = getUnrolledVectorLinearIndex(
- resultValueState, vectorOffsets, vectors[resultIndex].indexMap);
- // Update result cache at 'linearIndex'.
- caches[resultIndex][linearIndex] = resultVector;
- }
-
- // Create TupleOp of unrolled result vectors.
- SmallVector<Type, 4> vectorTupleTypes(resultValueState.numInstances);
- SmallVector<Value, 4> vectorTupleValues(resultValueState.numInstances);
- for (unsigned i = 0; i < resultValueState.numInstances; ++i) {
- vectorTupleTypes[i] = caches[resultIndex][i].getType().cast<VectorType>();
- vectorTupleValues[i] = caches[resultIndex][i];
- }
- TupleType tupleType = builder.getTupleType(vectorTupleTypes);
- Value tupleOp = builder.create<vector::TupleOp>(op->getLoc(), tupleType,
- vectorTupleValues);
-
- // Create InsertSlicesOp(Tuple(result_vectors)).
- auto resultVectorType = op->getResult(0).getType().cast<VectorType>();
- SmallVector<int64_t, 4> sizes(resultValueState.unrolledShape);
- SmallVector<int64_t, 4> strides(resultValueState.unrollFactors.size(), 1);
-
- Value insertSlicesOp = builder.create<vector::InsertSlicesOp>(
- op->getLoc(), resultVectorType, tupleOp, builder.getI64ArrayAttr(sizes),
- builder.getI64ArrayAttr(strides));
- return insertSlicesOp;
-}
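// To make the iteration structure above concrete: a standalone sketch
// (hypothetical names) that enumerates the element offsets visited when
// unrolling iteration bounds [4, 4] to target shape [2, 2], mirroring the
// computeStrides/delinearize arithmetic.

#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  std::vector<int64_t> bounds = {4, 4}, target = {2, 2};
  std::vector<int64_t> factors = {bounds[0] / target[0],
                                  bounds[1] / target[1]}; // {2, 2}
  std::vector<int64_t> strides = {factors[1], 1};         // {2, 1}
  int64_t numInstances = factors[0] * factors[1]; // 4 clones of the op.
  for (int64_t i = 0; i < numInstances; ++i) {
    int64_t off0 = (i / strides[0]) * target[0];
    int64_t off1 = (i % strides[0]) * target[1];
    // Instance i computes on the [2, 2] slice rooted at (off0, off1).
    std::cout << "instance " << i << " -> offsets (" << off0 << ", " << off1
              << ")\n";
  }
}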
-
-static void getVectorContractionOpUnrollState(
- vector::ContractionOp contractionOp, ArrayRef<int64_t> targetShape,
- SmallVectorImpl<int64_t> &iterationBounds,
- std::vector<VectorState> &vectors, unsigned &resultIndex) {
- // Get contraction op iteration bounds.
- contractionOp.getIterationBounds(iterationBounds);
- assert(iterationBounds.size() == targetShape.size());
- // Get map from iteration space index to lhs/rhs/result shape index.
- std::vector<DenseMap<int64_t, int64_t>> iterationIndexMapList;
- contractionOp.getIterationIndexMap(iterationIndexMapList);
- unsigned numIterators = iterationIndexMapList.size();
- vectors.resize(numIterators);
- unsigned accOperandIndex = vector::ContractionOp::getAccOperandIndex();
- for (unsigned i = 0; i < numIterators; ++i) {
- vectors[i].type = contractionOp.getOperand(i).getType().cast<VectorType>();
- vectors[i].indexMap = iterationIndexMapList[i];
- vectors[i].operandIndex = i;
- vectors[i].isAcc = (i == accOperandIndex);
- }
-
- if (llvm::size(contractionOp.masks()) == 2) {
- // Add vectors for lhs/rhs vector mask arguments. Masks have the
- // same vector shape as the lhs/rhs args, so copy their index maps.
- vectors.push_back({contractionOp.getLHSVectorMaskType(),
- vectors[0].indexMap, accOperandIndex + 1, false});
- vectors.push_back({contractionOp.getRHSVectorMaskType(),
- vectors[1].indexMap, accOperandIndex + 2, false});
- }
- // Unroll 'op' 'iterationBounds' to 'targetShape'.
- // TODO(andydavis) Use linalg style 'args_in'/'args_out' to partition
- // 'vectors' instead of 'resultIndex'.
- resultIndex = accOperandIndex;
-}
-
-static void
-getVectorElementwiseOpUnrollState(Operation *op, ArrayRef<int64_t> targetShape,
- SmallVectorImpl<int64_t> &iterationBounds,
- std::vector<VectorState> &vectors,
- unsigned &resultIndex) {
- // Verify that operation and operands all have the same vector shape.
- auto resultType = op->getResult(0).getType().dyn_cast_or_null<VectorType>();
- assert(resultType && "Expected op with vector result type");
- auto resultShape = resultType.getShape();
- // Verify that all operands have the same vector type as result.
- assert(llvm::all_of(op->getOperandTypes(),
- [=](Type type) { return type == resultType; }));
- // Populate 'iterationBounds' with 'resultShape' for elementwise operations.
- iterationBounds.assign(resultShape.begin(), resultShape.end());
-
- // Create trivial elementwise identity index map based on 'resultShape'.
- DenseMap<int64_t, int64_t> indexMap;
- indexMap.reserve(resultShape.size());
- for (unsigned i = 0; i < resultShape.size(); ++i)
- indexMap[i] = i;
-
- // Create a VectorState for each operand and for the single result.
- unsigned numVectors = op->getNumOperands() + op->getNumResults();
- vectors.resize(numVectors);
- for (unsigned i = 0; i < op->getNumOperands(); ++i)
- vectors[i] = {resultType, indexMap, i, false};
- vectors[numVectors - 1] = {resultType, indexMap, -1, false};
- resultIndex = numVectors - 1;
-}
-
-// Entry point for unrolling declarative pattern rewrites.
-SmallVector<Value, 1> mlir::vector::unrollSingleResultOpMatchingType(
- PatternRewriter &builder, Operation *op, ArrayRef<int64_t> targetShape) {
- assert(op->getNumResults() == 1 && "Expected single result operation");
-
- // Populate 'iterationBounds', 'vectors' and 'resultIndex' to unroll 'op'.
- SmallVector<int64_t, 6> iterationBounds;
- std::vector<VectorState> vectors;
- unsigned resultIndex;
-
- if (auto contractionOp = dyn_cast<vector::ContractionOp>(op)) {
- // Populate state for vector ContractionOp.
- getVectorContractionOpUnrollState(contractionOp, targetShape,
- iterationBounds, vectors, resultIndex);
- } else {
- // Populate state for vector elementwise op.
- getVectorElementwiseOpUnrollState(op, targetShape, iterationBounds, vectors,
- resultIndex);
- }
-
- // Unroll 'op' with 'iterationBounds' to 'targetShape'.
- return SmallVector<Value, 1>{unrollSingleResultStructuredOp(
- op, iterationBounds, vectors, resultIndex, targetShape, builder)};
-}
-
-/// Generates slices of 'vectorType' according to 'sizes' and 'strides', and
-/// calls 'fn' with linear index and indices for each slice.
-static void
-generateTransferOpSlices(Type memrefElementType, VectorType vectorType,
- TupleType tupleType, ArrayRef<int64_t> sizes,
- ArrayRef<int64_t> strides, ArrayRef<Value> indices,
- PatternRewriter &rewriter,
- function_ref<void(unsigned, ArrayRef<Value>)> fn) {
- // Compute strides w.r.t. slice counts in each dimension.
- auto maybeDimSliceCounts = shapeRatio(vectorType.getShape(), sizes);
- assert(maybeDimSliceCounts.hasValue());
- auto sliceDimCounts = *maybeDimSliceCounts;
- auto sliceStrides = computeStrides(sliceDimCounts);
-
- int64_t numSlices = tupleType.size();
- unsigned numSliceIndices = indices.size();
- // Compute 'indexOffset' at which to update 'indices', which is equal
- // to the memref rank (indices.size) minus the effective 'vectorRank'.
- // The effective 'vectorRank' is equal to the rank of the vector type
- // minus the rank of the memref vector element type (if it has one).
- //
- // For example:
- //
- // Given memref type 'memref<6x2x1xvector<2x4xf32>>' and vector
- // transfer_read/write ops which read/write vectors of type
- // 'vector<2x1x2x4xf32>'. The memref rank is 3, and the effective
- // vector rank is 4 - 2 = 2, and so 'indexOffset' = 3 - 2 = 1.
- //
- unsigned vectorRank = vectorType.getRank();
- if (auto memrefVectorElementType = memrefElementType.dyn_cast<VectorType>()) {
- assert(vectorRank >= memrefVectorElementType.getRank());
- vectorRank -= memrefVectorElementType.getRank();
- }
- unsigned indexOffset = numSliceIndices - vectorRank;
-
- auto *ctx = rewriter.getContext();
- for (unsigned i = 0; i < numSlices; ++i) {
- auto vectorOffsets = delinearize(sliceStrides, i);
- auto elementOffsets =
- computeElementOffsetsFromVectorSliceOffsets(sizes, vectorOffsets);
- // Compute 'sliceIndices' by adding 'sliceOffsets[i]' to 'indices[i]'.
- SmallVector<Value, 4> sliceIndices(numSliceIndices);
- for (unsigned j = 0; j < numSliceIndices; ++j) {
- if (j < indexOffset) {
- sliceIndices[j] = indices[j];
- } else {
- auto expr = getAffineDimExpr(0, ctx) +
- getAffineConstantExpr(elementOffsets[j - indexOffset], ctx);
- auto map = AffineMap::get(/*dimCount=*/1, /*symbolCount=*/0, expr);
- sliceIndices[j] = rewriter.create<AffineApplyOp>(
- indices[j].getLoc(), map, ArrayRef<Value>(indices[j]));
- }
- }
- // Call 'fn' to generate slice 'i' at 'sliceIndices'.
- fn(i, sliceIndices);
- }
-}
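// The index-offset computation above, reduced to plain arithmetic for the
// worked example in the comment (names are hypothetical):

#include <cassert>

int main() {
  unsigned memrefRank = 3;              // memref<6x2x1xvector<2x4xf32>>
  unsigned transferVectorRank = 4;      // vector<2x1x2x4xf32>
  unsigned memrefVectorElementRank = 2; // element type vector<2x4xf32>
  unsigned effectiveVectorRank = transferVectorRank - memrefVectorElementRank;
  unsigned indexOffset = memrefRank - effectiveVectorRank;
  assert(effectiveVectorRank == 2 && indexOffset == 1);
  // Only memref indices 1 and 2 are advanced per slice; index 0 is copied.
}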
-
-/// Returns true if 'map' is a suffix of an identity affine map, false
-/// otherwise. Example: affine_map<(d0, d1, d2, d3) -> (d2, d3)>
-static bool isIdentitySuffix(AffineMap map) {
- if (map.getNumDims() < map.getNumResults())
- return false;
- ArrayRef<AffineExpr> results = map.getResults();
- Optional<int> lastPos;
- for (unsigned i = 0, e = map.getNumResults(); i < e; ++i) {
- auto expr = results[i].dyn_cast<AffineDimExpr>();
- if (!expr)
- return false;
- int currPos = static_cast<int>(expr.getPosition());
- if (lastPos.hasValue() && currPos != lastPos.getValue() + 1)
- return false;
- lastPos = currPos;
- }
- return true;
-}
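// A standalone sketch of the consecutive-position check at the core of the
// function above, on raw dimension positions instead of AffineExprs
// (hypothetical name; the dim-count and dim-expr checks are elided).

#include <cassert>
#include <vector>

static bool isConsecutive(const std::vector<int> &positions) {
  for (size_t i = 1; i < positions.size(); ++i)
    if (positions[i] != positions[i - 1] + 1)
      return false;
  return true;
}

int main() {
  // affine_map<(d0, d1, d2, d3) -> (d2, d3)> has result positions {2, 3}.
  assert(isConsecutive({2, 3}));
  assert(!isConsecutive({2, 1})); // Decreasing: not an identity suffix.
  assert(!isConsecutive({0, 2})); // Gap: not an identity suffix.
}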
-
-namespace {
-
-// Splits vector TransferReadOp into smaller TransferReadOps based on slicing
-// scheme of its unique ExtractSlicesOp user.
-struct SplitTransferReadOp : public OpRewritePattern<vector::TransferReadOp> {
- using OpRewritePattern<vector::TransferReadOp>::OpRewritePattern;
-
- PatternMatchResult matchAndRewrite(vector::TransferReadOp xferReadOp,
- PatternRewriter &rewriter) const override {
- // TODO(andydavis, ntv) Support splitting TransferReadOp with non-identity
- // permutation maps. Repurpose code from MaterializeVectors transformation.
- if (!isIdentitySuffix(xferReadOp.permutation_map()))
- return matchFailure();
- // Return unless the unique 'xferReadOp' user is an ExtractSlicesOp.
- Value xferReadResult = xferReadOp.getResult();
- auto extractSlicesOp =
- dyn_cast<vector::ExtractSlicesOp>(*xferReadResult.getUsers().begin());
- if (!xferReadResult.hasOneUse() || !extractSlicesOp)
- return matchFailure();
-
- // Get 'sizes' and 'strides' parameters from ExtractSlicesOp user.
- auto sourceVectorType = extractSlicesOp.getSourceVectorType();
- auto resultTupleType = extractSlicesOp.getResultTupleType();
- SmallVector<int64_t, 4> sizes;
- extractSlicesOp.getSizes(sizes);
- SmallVector<int64_t, 4> strides;
- extractSlicesOp.getStrides(strides);
- assert(llvm::all_of(strides, [](int64_t s) { return s == 1; }));
-
- Location loc = xferReadOp.getLoc();
- auto memrefElementType =
- xferReadOp.memref().getType().cast<MemRefType>().getElementType();
- int64_t numSlices = resultTupleType.size();
- SmallVector<Value, 4> vectorTupleValues(numSlices);
- SmallVector<Value, 4> indices(xferReadOp.indices().begin(),
- xferReadOp.indices().end());
- auto createSlice = [&](unsigned index, ArrayRef<Value> sliceIndices) {
- // Get VectorType for slice 'i'.
- auto sliceVectorType = resultTupleType.getType(index);
- // Create split TransferReadOp for 'sliceUser'.
- vectorTupleValues[index] = rewriter.create<vector::TransferReadOp>(
- loc, sliceVectorType, xferReadOp.memref(), sliceIndices,
- xferReadOp.permutation_map(), xferReadOp.padding());
- };
- generateTransferOpSlices(memrefElementType, sourceVectorType,
- resultTupleType, sizes, strides, indices, rewriter,
- createSlice);
-
- // Create tuple of split xfer read operations.
- Value tupleOp = rewriter.create<vector::TupleOp>(loc, resultTupleType,
- vectorTupleValues);
- // Replace 'xferReadOp' with a new InsertSlicesOp over the split reads.
- rewriter.replaceOpWithNewOp<vector::InsertSlicesOp>(
- xferReadOp, sourceVectorType, tupleOp, extractSlicesOp.sizes(),
- extractSlicesOp.strides());
- return matchSuccess();
- }
-};
-
-// Splits vector TransferWriteOp into smaller TransferWriteOps for each source.
-struct SplitTransferWriteOp : public OpRewritePattern<vector::TransferWriteOp> {
- using OpRewritePattern<vector::TransferWriteOp>::OpRewritePattern;
-
- PatternMatchResult matchAndRewrite(vector::TransferWriteOp xferWriteOp,
- PatternRewriter &rewriter) const override {
- // TODO(andydavis, ntv) Support splitting TransferWriteOp with non-identity
- // permutation maps. Repurpose code from MaterializeVectors transformation.
- if (!isIdentitySuffix(xferWriteOp.permutation_map()))
- return matchFailure();
- // Return unless the 'xferWriteOp' 'vector' operand is an 'InsertSlicesOp'.
- auto *vectorDefOp = xferWriteOp.vector().getDefiningOp();
- auto insertSlicesOp = dyn_cast_or_null<vector::InsertSlicesOp>(vectorDefOp);
- if (!insertSlicesOp)
- return matchFailure();
-
- // Get TupleOp operand of 'insertSlicesOp'.
- auto tupleOp = dyn_cast_or_null<vector::TupleOp>(
- insertSlicesOp.vectors().getDefiningOp());
- if (!tupleOp)
- return matchFailure();
-
- // Get 'sizes' and 'strides' parameters from the 'insertSlicesOp'.
- auto sourceTupleType = insertSlicesOp.getSourceTupleType();
- auto resultVectorType = insertSlicesOp.getResultVectorType();
- SmallVector<int64_t, 4> sizes;
- insertSlicesOp.getSizes(sizes);
- SmallVector<int64_t, 4> strides;
- insertSlicesOp.getStrides(strides);
-
- Location loc = xferWriteOp.getLoc();
- auto memrefElementType =
- xferWriteOp.memref().getType().cast<MemRefType>().getElementType();
- SmallVector<Value, 4> indices(xferWriteOp.indices().begin(),
- xferWriteOp.indices().end());
- auto createSlice = [&](unsigned index, ArrayRef<Value> sliceIndices) {
- // Create split TransferWriteOp for source vector 'tupleOp.operand[i]'.
- rewriter.create<vector::TransferWriteOp>(
- loc, tupleOp.getOperand(index), xferWriteOp.memref(), sliceIndices,
- xferWriteOp.permutation_map());
- };
- generateTransferOpSlices(memrefElementType, resultVectorType,
- sourceTupleType, sizes, strides, indices, rewriter,
- createSlice);
-
- // Erase old 'xferWriteOp'.
- rewriter.eraseOp(xferWriteOp);
- return matchSuccess();
- }
-};
-
-/// Decomposes ShapeCastOp on tuple-of-vectors to multiple ShapeCastOps, each
-/// on vector types.
-struct ShapeCastOpDecomposer : public OpRewritePattern<vector::ShapeCastOp> {
- using OpRewritePattern<vector::ShapeCastOp>::OpRewritePattern;
-
- PatternMatchResult matchAndRewrite(vector::ShapeCastOp shapeCastOp,
- PatternRewriter &rewriter) const override {
- // Check if 'shapeCastOp' has tuple source/result type.
- auto sourceTupleType =
- shapeCastOp.source().getType().dyn_cast_or_null<TupleType>();
- auto resultTupleType =
- shapeCastOp.result().getType().dyn_cast_or_null<TupleType>();
- if (!sourceTupleType || !resultTupleType)
- return matchFailure();
- assert(sourceTupleType.size() == resultTupleType.size());
-
- // Create single-vector ShapeCastOp for each source tuple element.
- Location loc = shapeCastOp.getLoc();
- SmallVector<Value, 8> resultElements;
- resultElements.reserve(resultTupleType.size());
- for (unsigned i = 0, e = sourceTupleType.size(); i < e; ++i) {
- auto sourceElement = rewriter.create<vector::TupleGetOp>(
- loc, sourceTupleType.getType(i), shapeCastOp.source(),
- rewriter.getI64IntegerAttr(i));
- resultElements.push_back(rewriter.create<vector::ShapeCastOp>(
- loc, resultTupleType.getType(i), sourceElement));
- }
-
- // Replace 'shapeCastOp' with tuple of 'resultElements'.
- rewriter.replaceOpWithNewOp<vector::TupleOp>(shapeCastOp, resultTupleType,
- resultElements);
- return matchSuccess();
- }
-};
-
-/// ShapeCastOpFolder folds cancelling ShapeCastOps away.
-//
-// Example:
-//
-// The following MLIR with cancelling ShapeCastOps:
-//
-// %0 = source : vector<5x4x2xf32>
-// %1 = shape_cast %0 : vector<5x4x2xf32> to vector<20x2xf32>
-// %2 = shape_cast %1 : vector<20x2xf32> to vector<5x4x2xf32>
-// %3 = user %2 : vector<5x4x2xf32>
-//
-// Should canonicalize to the following:
-//
-// %0 = source : vector<5x4x2xf32>
-// %1 = user %0 : vector<5x4x2xf32>
-//
-struct ShapeCastOpFolder : public OpRewritePattern<vector::ShapeCastOp> {
- using OpRewritePattern<vector::ShapeCastOp>::OpRewritePattern;
-
- PatternMatchResult matchAndRewrite(vector::ShapeCastOp shapeCastOp,
- PatternRewriter &rewriter) const override {
- // Check if 'shapeCastOp' has vector source/result type.
- auto sourceVectorType =
- shapeCastOp.source().getType().dyn_cast_or_null<VectorType>();
- auto resultVectorType =
- shapeCastOp.result().getType().dyn_cast_or_null<VectorType>();
- if (!sourceVectorType || !resultVectorType)
- return matchFailure();
-
- // Check if shape cast op source operand is also a shape cast op.
- auto sourceShapeCastOp = dyn_cast_or_null<vector::ShapeCastOp>(
- shapeCastOp.source().getDefiningOp());
- if (!sourceShapeCastOp)
- return matchFailure();
- auto operandSourceVectorType =
- sourceShapeCastOp.source().getType().cast<VectorType>();
- auto operandResultVectorType =
- sourceShapeCastOp.result().getType().cast<VectorType>();
-
- // Check if shape cast operations invert each other.
- if (operandSourceVectorType != resultVectorType ||
- operandResultVectorType != sourceVectorType)
- return matchFailure();
-
- rewriter.replaceOp(shapeCastOp, sourceShapeCastOp.source());
- return matchSuccess();
- }
-};
-
-// Pattern rewrite which forwards tuple elements to their users.
-// User(TupleGetOp(ExtractSlicesOp(InsertSlicesOp(TupleOp(Producer)))))
-// -> User(Producer)
-struct TupleGetFolderOp : public OpRewritePattern<vector::TupleGetOp> {
- using OpRewritePattern<vector::TupleGetOp>::OpRewritePattern;
-
- PatternMatchResult matchAndRewrite(vector::TupleGetOp tupleGetOp,
- PatternRewriter &rewriter) const override {
- // Return if 'tupleGetOp.vectors' arg was not defined by ExtractSlicesOp.
- auto extractSlicesOp = dyn_cast_or_null<vector::ExtractSlicesOp>(
- tupleGetOp.vectors().getDefiningOp());
- if (!extractSlicesOp)
- return matchFailure();
-
- // Return if 'extractSlicesOp.vector' arg was not defined by InsertSlicesOp.
- auto insertSlicesOp = dyn_cast_or_null<vector::InsertSlicesOp>(
- extractSlicesOp.vector().getDefiningOp());
- if (!insertSlicesOp)
- return matchFailure();
-
- // Return if 'insertSlicesOp.vectors' arg was not defined by TupleOp.
- auto tupleOp = dyn_cast_or_null<vector::TupleOp>(
- insertSlicesOp.vectors().getDefiningOp());
- if (!tupleOp)
- return matchFailure();
-
- // Forward Value from 'tupleOp' at 'tupleGetOp.index'.
- Value tupleValue = tupleOp.getOperand(tupleGetOp.getIndex());
- rewriter.replaceOp(tupleGetOp, tupleValue);
- return matchSuccess();
- }
-};
-
-/// Progressive lowering of ExtractSlicesOp to tuple of StridedSliceOp.
-/// One:
-/// %x = vector.extract_slices %0
-/// is replaced by:
-/// %a = vector.strided_slice %0
-/// %b = vector.strided_slice %0
-/// ..
-/// %x = vector.tuple %a, %b, ..
-class ExtractSlicesOpLowering
- : public OpRewritePattern<vector::ExtractSlicesOp> {
-public:
- using OpRewritePattern<vector::ExtractSlicesOp>::OpRewritePattern;
-
- PatternMatchResult matchAndRewrite(vector::ExtractSlicesOp op,
- PatternRewriter &rewriter) const override {
- auto loc = op.getLoc();
-
- VectorType vectorType = op.getSourceVectorType();
- auto shape = vectorType.getShape();
-
- SmallVector<int64_t, 4> sizes;
- op.getSizes(sizes);
- SmallVector<int64_t, 4> strides;
- op.getStrides(strides); // all-ones at the moment
-
- // For each element in the tuple, generate the proper strided slice.
- TupleType tupleType = op.getResultTupleType();
- int64_t tupleSize = tupleType.size();
- SmallVector<Value, 4> tupleValues(tupleSize);
- auto sliceStrides = computeStrides(shape, sizes);
- for (int64_t i = 0; i < tupleSize; ++i) {
- auto vectorOffsets = delinearize(sliceStrides, i);
- auto elementOffsets =
- computeElementOffsetsFromVectorSliceOffsets(sizes, vectorOffsets);
- auto sliceSizes = computeSliceSizes(shape, sizes, elementOffsets);
- // Insert in tuple.
- tupleValues[i] = rewriter.create<vector::StridedSliceOp>(
- loc, op.vector(), elementOffsets, sliceSizes, strides);
- }
-
- rewriter.replaceOpWithNewOp<vector::TupleOp>(op, tupleType, tupleValues);
- return matchSuccess();
- }
-};
-
-/// Progressive lowering of InsertSlicesOp to series of InsertStridedSliceOp.
-/// One:
-/// %x = vector.insert_slices %0
-/// is replaced by:
-/// %r0 = vector.splat 0
-/// %t1 = vector.tuple_get %0, 0
-/// %r1 = vector.insert_strided_slice %r0, %t1
-/// %t2 = vector.tuple_get %0, 1
-/// %r2 = vector.insert_strided_slice %r1, %t2
-/// ..
-/// %x = ..
-class InsertSlicesOpLowering : public OpRewritePattern<vector::InsertSlicesOp> {
-public:
- using OpRewritePattern<vector::InsertSlicesOp>::OpRewritePattern;
-
- PatternMatchResult matchAndRewrite(vector::InsertSlicesOp op,
- PatternRewriter &rewriter) const override {
- auto loc = op.getLoc();
-
- VectorType vectorType = op.getResultVectorType();
- auto shape = vectorType.getShape();
-
- SmallVector<int64_t, 4> sizes;
- op.getSizes(sizes);
- SmallVector<int64_t, 4> strides;
- op.getStrides(strides); // all-ones at the moment
-
- // Prepare result.
- auto elemType = vectorType.getElementType();
- Value zero = rewriter.create<ConstantOp>(loc, elemType,
- rewriter.getZeroAttr(elemType));
- Value result = rewriter.create<SplatOp>(loc, vectorType, zero);
-
- // For each element in the tuple, extract the proper strided slice.
- TupleType tupleType = op.getSourceTupleType();
- int64_t tupleSize = tupleType.size();
- auto sliceStrides = computeStrides(shape, sizes);
- for (int64_t i = 0; i < tupleSize; ++i) {
- auto vectorOffsets = delinearize(sliceStrides, i);
- auto elementOffsets =
- computeElementOffsetsFromVectorSliceOffsets(sizes, vectorOffsets);
- // Extract from tuple into the result.
- auto index = rewriter.getI64IntegerAttr(i);
- auto tupleGet = rewriter.create<vector::TupleGetOp>(
- loc, tupleType.getType(i), op.getOperand(), index);
- result = rewriter.create<vector::InsertStridedSliceOp>(
- loc, tupleGet, result, elementOffsets, strides);
- }
-
- rewriter.replaceOp(op, result);
- return matchSuccess();
- }
-};
-
-/// Progressive lowering of OuterProductOp.
-/// One:
-/// %x = vector.outerproduct %lhs, %rhs, %acc
-/// is replaced by:
-/// %z = zero vector of the result type
-/// %0 = vector.extract %lhs[0]
-/// %1 = vector.broadcast %0
-/// %2 = vector.extract %acc[0]
-/// %3 = vector.fma %1, %rhs, %2
-/// %4 = vector.insert %3, %z[0]
-/// ..
-/// %x = vector.insert %.., %..[N-1]
-///
-class OuterProductOpLowering : public OpRewritePattern<vector::OuterProductOp> {
-public:
- using OpRewritePattern<vector::OuterProductOp>::OpRewritePattern;
-
- PatternMatchResult matchAndRewrite(vector::OuterProductOp op,
- PatternRewriter &rewriter) const override {
- auto loc = op.getLoc();
-
- VectorType rhsType = op.getOperandVectorTypeRHS();
- VectorType resType = op.getVectorType();
- Type eltType = resType.getElementType();
- Value acc = (op.acc().empty()) ? nullptr : op.acc()[0];
-
- Value zero = rewriter.create<ConstantOp>(loc, eltType,
- rewriter.getZeroAttr(eltType));
- Value result = rewriter.create<SplatOp>(loc, resType, zero);
- for (int64_t d = 0, e = resType.getDimSize(0); d < e; ++d) {
- auto pos = rewriter.getI64ArrayAttr(d);
- Value x = rewriter.create<vector::ExtractOp>(loc, eltType, op.lhs(), pos);
- Value b = rewriter.create<vector::BroadcastOp>(loc, rhsType, x);
- Value m;
- if (acc) {
- Value z = rewriter.create<vector::ExtractOp>(loc, rhsType, acc, pos);
- m = rewriter.create<vector::FMAOp>(loc, b, op.rhs(), z);
- } else {
- m = rewriter.create<MulFOp>(loc, b, op.rhs());
- }
- result = rewriter.create<vector::InsertOp>(loc, resType, m, result, pos);
- }
- rewriter.replaceOp(op, result);
- return matchSuccess();
- }
-};
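// The scalar semantics that the lowering above implements, as a
// self-contained sketch: each result row is lhs[d] * rhs (+ the matching acc
// row, when present), which is what the per-row broadcast + fma expansion
// computes. Values here are hypothetical.

#include <cassert>
#include <cstddef>
#include <vector>

int main() {
  std::vector<float> lhs = {1, 2};       // vector<2xf32>
  std::vector<float> rhs = {10, 20, 30}; // vector<3xf32>
  std::vector<std::vector<float>> acc = {{1, 1, 1}, {1, 1, 1}};
  std::vector<std::vector<float>> result(2, std::vector<float>(3));
  for (size_t d = 0; d < lhs.size(); ++d)  // One broadcast + fma per row.
    for (size_t j = 0; j < rhs.size(); ++j)
      result[d][j] = lhs[d] * rhs[j] + acc[d][j];
  assert(result[0][2] == 31 && result[1][0] == 21);
}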
-
-/// Progressive lowering of ContractionOp.
-/// One:
-/// %x = vector.contract with at least one free/batch dimension
-/// is replaced by:
-/// %a = vector.contract with one less free/batch dimension
-/// %b = vector.contract with one less free/batch dimension
-/// ..
-/// %x = combine %a %b ..
-/// until a pure contraction is reached (no free/batch dimensions),
-/// which is replaced by a fma/reduction op.
-///
-/// TODO(ajcbik): break down into transpose/reshape/cast ops
-/// when they become available to avoid code dup
-/// TODO(ajcbik): investigate lowering order impact on performance
-class ContractionOpLowering : public OpRewritePattern<vector::ContractionOp> {
-public:
- using OpRewritePattern<vector::ContractionOp>::OpRewritePattern;
-
- PatternMatchResult matchAndRewrite(vector::ContractionOp op,
- PatternRewriter &rewriter) const override {
- // TODO(ajcbik): implement masks
- if (llvm::size(op.masks()) != 0)
- return matchFailure();
-
- // TODO(ntv, ajcbik): implement benefits, cost models, separate this out in
- // a new pattern.
- // TODO(ntv, fhahn): once row-major mode is available in LLVM's matrix
- // intrinsics, use that.
- if (lowerToLLVMMatrixIntrinsics &&
- isColumnMajorMatmul(op.indexing_maps())) {
- VectorType lhsType = op.getLhsType();
- VectorType rhsType = op.getRhsType();
- Type flattenedLHSType =
- VectorType::get(lhsType.getNumElements(), lhsType.getElementType());
- Type flattenedRHSType =
- VectorType::get(rhsType.getNumElements(), rhsType.getElementType());
- auto lhs = rewriter.create<vector::ShapeCastOp>(
- op.getLoc(), flattenedLHSType, op.lhs());
- auto rhs = rewriter.create<vector::ShapeCastOp>(
- op.getLoc(), flattenedRHSType, op.rhs());
-
- unsigned lhsRows = op.getLhsType().getShape()[0];
- unsigned lhsColumns = op.getLhsType().getShape()[1];
- unsigned rhsColumns = op.getRhsType().getShape()[1];
- Value mul = rewriter.create<vector::MatmulOp>(
- op.getLoc(), lhs, rhs, lhsRows, lhsColumns, rhsColumns);
- mul = rewriter.create<vector::ShapeCastOp>(op.getLoc(),
- op.acc().getType(), mul);
- Type elementType = op.getLhsType().getElementType();
- assert(elementType.isIntOrFloat());
- if (elementType.isa<IntegerType>())
- rewriter.replaceOpWithNewOp<AddIOp>(op, op.acc(), mul);
- else
- rewriter.replaceOpWithNewOp<AddFOp>(op, op.acc(), mul);
- return matchSuccess();
- }
-
- // Find first batch dimension in LHS/RHS, and lower when found.
- std::vector<std::pair<int64_t, int64_t>> batchDimMap = op.getBatchDimMap();
- if (!batchDimMap.empty()) {
- int64_t lhsIndex = batchDimMap[0].first;
- int64_t rhsIndex = batchDimMap[0].second;
- rewriter.replaceOp(op, lowerParallel(op, lhsIndex, rhsIndex, rewriter));
- return matchSuccess();
- }
-
- // Collect contracting dimensions.
- std::vector<std::pair<int64_t, int64_t>> contractingDimMap =
- op.getContractingDimMap();
- DenseSet<int64_t> lhsContractingDimSet;
- DenseSet<int64_t> rhsContractingDimSet;
- for (auto &dimPair : contractingDimMap) {
- lhsContractingDimSet.insert(dimPair.first);
- rhsContractingDimSet.insert(dimPair.second);
- }
-
- // Find first free dimension in LHS, and lower when found.
- VectorType lhsType = op.getLhsType();
- for (int64_t lhsIndex = 0, e = lhsType.getRank(); lhsIndex < e;
- ++lhsIndex) {
- if (lhsContractingDimSet.count(lhsIndex) == 0) {
- rewriter.replaceOp(
- op, lowerParallel(op, lhsIndex, /*rhsIndex=*/-1, rewriter));
- return matchSuccess();
- }
- }
-
- // Find first free dimension in RHS, and lower when found.
- VectorType rhsType = op.getRhsType();
- for (int64_t rhsIndex = 0, e = rhsType.getRank(); rhsIndex < e;
- ++rhsIndex) {
- if (rhsContractingDimSet.count(rhsIndex) == 0) {
- rewriter.replaceOp(
- op, lowerParallel(op, /*lhsIndex=*/-1, rhsIndex, rewriter));
- return matchSuccess();
- }
- }
-
- // Lower the first remaining reduction dimension.
- if (!contractingDimMap.empty()) {
- rewriter.replaceOp(op, lowerReduction(op, rewriter));
- return matchSuccess();
- }
-
- return matchFailure();
- }
-
-private:
- // Lower one parallel dimension.
- // TODO(ajcbik): consider reusing existing contract unrolling
- Value lowerParallel(vector::ContractionOp op, int64_t lhsIndex,
- int64_t rhsIndex, PatternRewriter &rewriter) const {
- VectorType lhsType = op.getLhsType();
- VectorType rhsType = op.getRhsType();
- VectorType resType = op.getResultType().cast<VectorType>();
- // Find the iterator type index and result index.
- SmallVector<AffineMap, 4> iMap = op.getIndexingMaps();
- int64_t iterIndex = -1;
- int64_t dimSize = -1;
- if (lhsIndex >= 0) {
- iterIndex =
- iMap[0].getResult(lhsIndex).cast<AffineDimExpr>().getPosition();
- assert((rhsIndex < 0 || iterIndex == iMap[1]
- .getResult(rhsIndex)
- .cast<AffineDimExpr>()
- .getPosition()) &&
- "parallel index should be free in LHS or batch in LHS/RHS");
- dimSize = lhsType.getDimSize(lhsIndex);
- } else {
- assert(rhsIndex >= 0 && "missing parallel index");
- iterIndex =
- iMap[1].getResult(rhsIndex).cast<AffineDimExpr>().getPosition();
- dimSize = rhsType.getDimSize(rhsIndex);
- }
- assert(iterIndex >= 0 && "parallel index not listed in operand mapping");
- Optional<int64_t> lookup = getResultIndex(iMap[2], iterIndex);
- assert(lookup.hasValue() && "parallel index not listed in reduction");
- int64_t resIndex = lookup.getValue();
- // Construct new iterator types and affine map array attribute.
- SmallVector<AffineMap, 4> lowIndexingMaps;
- lowIndexingMaps.push_back(adjustMap(iMap[0], iterIndex, rewriter));
- lowIndexingMaps.push_back(adjustMap(iMap[1], iterIndex, rewriter));
- lowIndexingMaps.push_back(adjustMap(iMap[2], iterIndex, rewriter));
- auto lowAffine = rewriter.getAffineMapArrayAttr(lowIndexingMaps);
- auto lowIter =
- rewriter.getArrayAttr(adjustIter(op.iterator_types(), iterIndex));
- // Unroll into a series of lower dimensional vector.contract ops.
- Location loc = op.getLoc();
- Value result = zeroVector(loc, resType, rewriter);
- for (int64_t d = 0; d < dimSize; ++d) {
- auto lhs = reshapeLoad(loc, op.lhs(), lhsType, lhsIndex, d, rewriter);
- auto rhs = reshapeLoad(loc, op.rhs(), rhsType, rhsIndex, d, rewriter);
- auto acc = reshapeLoad(loc, op.acc(), resType, resIndex, d, rewriter);
- Value lowContract = rewriter.create<vector::ContractionOp>(
- loc, lhs, rhs, acc, lowAffine, lowIter);
- result = reshapeStore(loc, lowContract, result, resType, resIndex, d,
- rewriter);
- }
- return result;
- }
-
- // Lower one reduction dimension.
- Value lowerReduction(vector::ContractionOp op,
- PatternRewriter &rewriter) const {
- auto loc = op.getLoc();
- VectorType lhsType = op.getLhsType();
- VectorType rhsType = op.getRhsType();
- Type resType = op.getResultType();
- assert(!resType.isa<VectorType>());
- // Use iterator index 0.
- int64_t iterIndex = 0;
- SmallVector<AffineMap, 4> iMap = op.getIndexingMaps();
- Optional<int64_t> lookupLhs = getResultIndex(iMap[0], iterIndex);
- Optional<int64_t> lookupRhs = getResultIndex(iMap[1], iterIndex);
- assert(lookupLhs.hasValue() && "missing LHS parallel index");
- assert(lookupRhs.hasValue() && "missing RHS parallel index");
- int64_t lhsIndex = lookupLhs.getValue();
- int64_t rhsIndex = lookupRhs.getValue();
- int64_t dimSize = lhsType.getDimSize(lhsIndex);
- assert(dimSize == rhsType.getDimSize(rhsIndex) && "corrupt shape");
- // Base case.
- if (lhsType.getRank() == 1) {
- assert(rhsType.getRank() == 1 && "corrupt contraction");
- Value zero = zeroVector(loc, lhsType, rewriter);
- Value fma = rewriter.create<vector::FMAOp>(loc, op.lhs(), op.rhs(), zero);
- StringAttr kind = rewriter.getStringAttr("add");
- return rewriter.create<vector::ReductionOp>(loc, resType, kind, fma,
- op.acc());
- }
- // Construct new iterator types and affine map array attribute.
- SmallVector<AffineMap, 4> lowIndexingMaps;
- lowIndexingMaps.push_back(adjustMap(iMap[0], iterIndex, rewriter));
- lowIndexingMaps.push_back(adjustMap(iMap[1], iterIndex, rewriter));
- lowIndexingMaps.push_back(adjustMap(iMap[2], iterIndex, rewriter));
- auto lowAffine = rewriter.getAffineMapArrayAttr(lowIndexingMaps);
- auto lowIter =
- rewriter.getArrayAttr(adjustIter(op.iterator_types(), iterIndex));
- // Unroll into a series of lower dimensional vector.contract ops.
- // By feeding the initial accumulator into the first contraction,
- // and the result of each contraction into the next, eventually
- // the sum of all reductions is computed.
- Value result = op.acc();
- for (int64_t d = 0; d < dimSize; ++d) {
- auto lhs = reshapeLoad(loc, op.lhs(), lhsType, lhsIndex, d, rewriter);
- auto rhs = reshapeLoad(loc, op.rhs(), rhsType, rhsIndex, d, rewriter);
- result = rewriter.create<vector::ContractionOp>(loc, lhs, rhs, result,
- lowAffine, lowIter);
- }
- return result;
- }
-
- // Helper method to construct a zero vector.
- static Value zeroVector(Location loc, VectorType vType,
- PatternRewriter &rewriter) {
- Type eltType = vType.getElementType();
- Value zero = rewriter.create<ConstantOp>(loc, eltType,
- rewriter.getZeroAttr(eltType));
- return rewriter.create<SplatOp>(loc, vType, zero);
- }
-
- // Helper to find an index in an affine map.
- static Optional<int64_t> getResultIndex(AffineMap map, int64_t index) {
- for (int64_t i = 0, e = map.getNumResults(); i < e; ++i) {
- int64_t idx = map.getResult(i).cast<AffineDimExpr>().getPosition();
- if (idx == index)
- return i;
- }
- return None;
- }
-
- // Helper to construct iterator types with one index removed.
- static SmallVector<Attribute, 4> adjustIter(ArrayAttr iteratorTypes,
- int64_t index) {
- SmallVector<Attribute, 4> results;
- for (auto it : llvm::enumerate(iteratorTypes)) {
- int64_t idx = it.index();
- if (idx == index)
- continue;
- results.push_back(it.value());
- }
- return results;
- }
-
- // Helper to construct an affine map with one index removed.
- static AffineMap adjustMap(AffineMap map, int64_t index,
- PatternRewriter &rewriter) {
- auto *ctx = rewriter.getContext();
- SmallVector<AffineExpr, 4> results;
- for (int64_t i = 0, e = map.getNumResults(); i < e; ++i) {
- int64_t idx = map.getResult(i).cast<AffineDimExpr>().getPosition();
- if (idx == index)
- continue;
- // Re-insert remaining indices, but renamed when occurring
- // after the removed index.
- auto targetExpr = getAffineDimExpr(idx < index ? idx : idx - 1, ctx);
- results.push_back(targetExpr);
- }
- // The (...) -> () affine map has its own factory method.
- return results.empty() ? AffineMap::get(map.getNumDims() - 1, 0, ctx)
- : AffineMap::get(map.getNumDims() - 1, 0, results);
- }
-
- // Helper to drop dimension from vector type.
- static Type adjustType(VectorType tp, int64_t index) {
- int64_t rank = tp.getRank();
- Type eltType = tp.getElementType();
- if (rank == 1) {
- assert(index == 0 && "index for scalar result out of bounds");
- return eltType;
- }
- SmallVector<int64_t, 4> adjustedShape;
- for (int64_t i = 0; i < rank; ++i) {
- // Omit dimension at the given index.
- if (i == index)
- continue;
- // Otherwise, add dimension back.
- adjustedShape.push_back(tp.getDimSize(i));
- }
- return VectorType::get(adjustedShape, eltType);
- }
-
- // Helper method to possibly drop a dimension in a load.
- // TODO(ajcbik): use a reshaping vector load (and share lowering code)
- static Value reshapeLoad(Location loc, Value val, VectorType type,
- int64_t index, int64_t pos,
- PatternRewriter &rewriter) {
- if (index == -1)
- return val;
- Type lowType = adjustType(type, 0);
- // At extraction dimension?
- if (index == 0) {
- auto posAttr = rewriter.getI64ArrayAttr(pos);
- return rewriter.create<vector::ExtractOp>(loc, lowType, val, posAttr);
- }
- // Unroll leading dimensions.
- VectorType vType = lowType.cast<VectorType>();
- VectorType resType = adjustType(type, index).cast<VectorType>();
- Value result = zeroVector(loc, resType, rewriter);
- for (int64_t d = 0, e = resType.getDimSize(0); d < e; d++) {
- auto posAttr = rewriter.getI64ArrayAttr(d);
- Value ext = rewriter.create<vector::ExtractOp>(loc, vType, val, posAttr);
- Value load = reshapeLoad(loc, ext, vType, index - 1, pos, rewriter);
- result = rewriter.create<vector::InsertOp>(loc, resType, load, result,
- posAttr);
- }
- return result;
- }
-
- // Helper method to possibly drop a dimension in a store.
- // TODO(ajcbik): use a reshaping vector store (and share lowering code)
- static Value reshapeStore(Location loc, Value val, Value result,
- VectorType type, int64_t index, int64_t pos,
- PatternRewriter &rewriter) {
- // Unmodified?
- if (index == -1)
- return val;
- // At insertion dimension?
- if (index == 0) {
- auto posAttr = rewriter.getI64ArrayAttr(pos);
- return rewriter.create<vector::InsertOp>(loc, type, val, result, posAttr);
- }
- // Unroll leading dimensions.
- Type lowType = adjustType(type, 0);
- VectorType vType = lowType.cast<VectorType>();
- Type insType = adjustType(vType, 0);
- for (int64_t d = 0, e = type.getDimSize(0); d < e; d++) {
- auto posAttr = rewriter.getI64ArrayAttr(d);
- Value ext =
- rewriter.create<vector::ExtractOp>(loc, vType, result, posAttr);
- Value ins =
- rewriter.create<vector::ExtractOp>(loc, insType, val, posAttr);
- Value sto = reshapeStore(loc, ins, ext, vType, index - 1, pos, rewriter);
- result =
- rewriter.create<vector::InsertOp>(loc, type, sto, result, posAttr);
- }
- return result;
- }
-};
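// A standalone sketch (hypothetical name) of the index renumbering performed
// by the adjustMap helper above: dropping one result position shifts every
// later dimension down by one.

#include <cassert>
#include <vector>

static std::vector<int> removeAndRenumber(const std::vector<int> &positions,
                                          int removed) {
  std::vector<int> out;
  for (int idx : positions) {
    if (idx == removed)
      continue; // Drop the removed index.
    out.push_back(idx < removed ? idx : idx - 1); // Shift later dims down.
  }
  return out;
}

int main() {
  // Removing d1 from (d0, d1, d2) keeps {0, 2} and renames 2 to 1.
  assert((removeAndRenumber({0, 1, 2}, 1) == std::vector<int>{0, 1}));
}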
-
-/// ShapeCastOp 2D -> 1D downcast serves the purpose of flattening 2-D to 1-D
-/// vectors progressively on the way to target llvm.matrix intrinsics.
-/// This iterates over the most major dimension of the 2-D vector and performs
-/// rewrites into:
-/// vector.extract from 2-D + vector.insert_strided_slice offset into 1-D
-class ShapeCastOp2DDownCastRewritePattern
- : public OpRewritePattern<vector::ShapeCastOp> {
-public:
- using OpRewritePattern<vector::ShapeCastOp>::OpRewritePattern;
-
- PatternMatchResult matchAndRewrite(vector::ShapeCastOp op,
- PatternRewriter &rewriter) const override {
- auto sourceVectorType = op.getSourceVectorType();
- auto resultVectorType = op.getResultVectorType();
- if (sourceVectorType.getRank() != 2 || resultVectorType.getRank() != 1)
- return matchFailure();
-
- auto loc = op.getLoc();
- auto elemType = sourceVectorType.getElementType();
- Value zero = rewriter.create<ConstantOp>(loc, elemType,
- rewriter.getZeroAttr(elemType));
- Value desc = rewriter.create<SplatOp>(loc, resultVectorType, zero);
- unsigned mostMinorVectorSize = sourceVectorType.getShape()[1];
- for (int64_t i = 0, e = sourceVectorType.getShape().front(); i != e; ++i) {
- Value vec = rewriter.create<vector::ExtractOp>(loc, op.source(), i);
- desc = rewriter.create<vector::InsertStridedSliceOp>(
- loc, vec, desc,
- /*offsets=*/i * mostMinorVectorSize, /*strides=*/1);
- }
- rewriter.replaceOp(op, desc);
- return matchSuccess();
- }
-};
-
-/// ShapeCastOp 1D -> 2D upcast serves the purpose of unflattening 1-D back
-/// into 2-D vectors progressively on the way back from targeting llvm.matrix
-/// intrinsics.
-/// This iterates over the most major dimension of the 2-D vector and performs
-/// rewrites into:
-/// vector.strided_slice from 1-D + vector.insert into 2-D
-class ShapeCastOp2DUpCastRewritePattern
- : public OpRewritePattern<vector::ShapeCastOp> {
-public:
- using OpRewritePattern<vector::ShapeCastOp>::OpRewritePattern;
-
- PatternMatchResult matchAndRewrite(vector::ShapeCastOp op,
- PatternRewriter &rewriter) const override {
- auto sourceVectorType = op.getSourceVectorType();
- auto resultVectorType = op.getResultVectorType();
- if (sourceVectorType.getRank() != 1 || resultVectorType.getRank() != 2)
- return matchFailure();
-
- auto loc = op.getLoc();
- auto elemType = sourceVectorType.getElementType();
- Value zero = rewriter.create<ConstantOp>(loc, elemType,
- rewriter.getZeroAttr(elemType));
- Value desc = rewriter.create<SplatOp>(loc, resultVectorType, zero);
- unsigned mostMinorVectorSize = resultVectorType.getShape()[1];
- for (int64_t i = 0, e = resultVectorType.getShape().front(); i != e; ++i) {
- Value vec = rewriter.create<vector::StridedSliceOp>(
- loc, op.source(), /*offsets=*/i * mostMinorVectorSize,
- /*sizes=*/mostMinorVectorSize,
- /*strides=*/1);
- desc = rewriter.create<vector::InsertOp>(loc, vec, desc, i);
- }
- rewriter.replaceOp(op, desc);
- return matchSuccess();
- }
-};
-
-} // namespace
-
-// TODO(andydavis) Add pattern to rewrite ExtractSlices(ConstantMaskOp).
-// TODO(andydavis) Add this as a DRR pattern.
-void mlir::vector::populateVectorToVectorTransformationPatterns(
- OwningRewritePatternList &patterns, MLIRContext *context) {
- patterns.insert<ShapeCastOpDecomposer, ShapeCastOpFolder, SplitTransferReadOp,
- SplitTransferWriteOp, TupleGetFolderOp>(context);
-}
-
-void mlir::vector::populateVectorSlicesLoweringPatterns(
- OwningRewritePatternList &patterns, MLIRContext *context) {
- patterns.insert<ExtractSlicesOpLowering, InsertSlicesOpLowering>(context);
-}
-
-void mlir::vector::populateVectorContractLoweringPatterns(
- OwningRewritePatternList &patterns, MLIRContext *context) {
- patterns.insert<ContractionOpLowering, ShapeCastOp2DDownCastRewritePattern,
- ShapeCastOp2DUpCastRewritePattern, OuterProductOpLowering>(
- context);
-}
diff --git a/mlir/lib/Dialect/VectorOps/VectorUtils.cpp b/mlir/lib/Dialect/VectorOps/VectorUtils.cpp
deleted file mode 100644
index 46a990080a4f..000000000000
--- a/mlir/lib/Dialect/VectorOps/VectorUtils.cpp
+++ /dev/null
@@ -1,278 +0,0 @@
-//===- VectorUtils.cpp - MLIR Utilities for VectorOps ------------------===//
-//
-// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements utility methods for working with the VectorOps dialect.
-//
-//===----------------------------------------------------------------------===//
-
-#include "mlir/Dialect/VectorOps/VectorUtils.h"
-#include "mlir/Analysis/LoopAnalysis.h"
-#include "mlir/Dialect/Affine/IR/AffineOps.h"
-#include "mlir/Dialect/StandardOps/IR/Ops.h"
-#include "mlir/Dialect/VectorOps/VectorOps.h"
-#include "mlir/IR/Builders.h"
-#include "mlir/IR/IntegerSet.h"
-#include "mlir/IR/Operation.h"
-#include "mlir/Support/Functional.h"
-#include "mlir/Support/LLVM.h"
-#include "mlir/Support/MathExtras.h"
-#include "mlir/Support/STLExtras.h"
-
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/SetVector.h"
-
-using llvm::SetVector;
-
-namespace mlir {
-
-SmallVector<int64_t, 4> computeStrides(ArrayRef<int64_t> shape,
- ArrayRef<int64_t> sizes) {
- int64_t rank = shape.size();
- // Compute the count for each dimension.
- SmallVector<int64_t, 4> sliceDimCounts(rank);
- for (int64_t r = 0; r < rank; ++r)
- sliceDimCounts[r] = ceilDiv(shape[r], sizes[r]);
- // Use that to compute the slice stride for each dimension.
- SmallVector<int64_t, 4> sliceStrides(rank);
- sliceStrides[rank - 1] = 1;
- for (int64_t r = rank - 2; r >= 0; --r)
- sliceStrides[r] = sliceStrides[r + 1] * sliceDimCounts[r + 1];
- return sliceStrides;
-}
-
-SmallVector<int64_t, 4> delinearize(ArrayRef<int64_t> sliceStrides,
- int64_t index) {
- int64_t rank = sliceStrides.size();
- SmallVector<int64_t, 4> vectorOffsets(rank);
- for (int64_t r = 0; r < rank; ++r) {
- assert(sliceStrides[r] > 0);
- vectorOffsets[r] = index / sliceStrides[r];
- index %= sliceStrides[r];
- }
- return vectorOffsets;
-}
-
-SmallVector<int64_t, 4>
-computeElementOffsetsFromVectorSliceOffsets(ArrayRef<int64_t> sizes,
- ArrayRef<int64_t> vectorOffsets) {
- return functional::zipMap([](int64_t v1, int64_t v2) { return v1 * v2; },
- vectorOffsets, sizes);
-}
-
-SmallVector<int64_t, 4> computeSliceSizes(ArrayRef<int64_t> shape,
- ArrayRef<int64_t> sizes,
- ArrayRef<int64_t> elementOffsets) {
- int64_t rank = shape.size();
- SmallVector<int64_t, 4> sliceSizes(rank);
- for (unsigned r = 0; r < rank; ++r)
- sliceSizes[r] = std::min(sizes[r], shape[r] - elementOffsets[r]);
- return sliceSizes;
-}
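// A worked, self-contained pass (hypothetical names) over the utilities above
// for a boundary case: slicing shape [5] with sizes [2] gives
// ceilDiv(5, 2) = 3 slices, and computeSliceSizes clamps the last one to 1.

#include <algorithm>
#include <cassert>
#include <cstdint>

int main() {
  int64_t shape = 5, size = 2;
  int64_t sliceCount = (shape + size - 1) / size; // ceilDiv -> 3 slices.
  for (int64_t i = 0; i < sliceCount; ++i) {
    int64_t elementOffset = i * size;                          // 0, 2, 4
    int64_t sliceSize = std::min(size, shape - elementOffset); // 2, 2, 1
    assert(sliceSize > 0 && elementOffset + sliceSize <= shape);
  }
}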
-
-Optional<SmallVector<int64_t, 4>> shapeRatio(ArrayRef<int64_t> superShape,
- ArrayRef<int64_t> subShape) {
- if (superShape.size() < subShape.size()) {
- return Optional<SmallVector<int64_t, 4>>();
- }
-
- // Starting from the end, compute the integer divisors.
- // Clear the boolean `divides` if integral division is not possible.
- std::vector<int64_t> result;
- result.reserve(superShape.size());
- bool divides = true;
- auto divide = [&divides, &result](int superSize, int subSize) {
- assert(superSize > 0 && "superSize must be > 0");
- assert(subSize > 0 && "subSize must be > 0");
- divides &= (superSize % subSize == 0);
- result.push_back(superSize / subSize);
- };
- functional::zipApply(
- divide, SmallVector<int64_t, 8>{superShape.rbegin(), superShape.rend()},
- SmallVector<int64_t, 8>{subShape.rbegin(), subShape.rend()});
-
- // If integral division does not occur, return and let the caller decide.
- if (!divides) {
- return None;
- }
-
- // At this point we computed the ratio (in reverse) for the common
- // size. Fill with the remaining entries from the super-vector shape (still in
- // reverse).
- int commonSize = subShape.size();
- std::copy(superShape.rbegin() + commonSize, superShape.rend(),
- std::back_inserter(result));
-
- assert(result.size() == superShape.size() &&
- "super to sub shape ratio is not of the same size as the super rank");
-
- // Reverse again to get it back in the proper order and return.
- return SmallVector<int64_t, 4>{result.rbegin(), result.rend()};
-}
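// A hypothetical standalone rendering of the trailing-aligned ratio
// computation above, written forward instead of in reverse:

#include <cassert>
#include <cstdint>
#include <optional>
#include <vector>

static std::optional<std::vector<int64_t>>
ratioSketch(const std::vector<int64_t> &super,
            const std::vector<int64_t> &sub) {
  if (super.size() < sub.size())
    return std::nullopt;
  size_t offset = super.size() - sub.size();
  // Leading super dimensions keep their full size as the ratio.
  std::vector<int64_t> result(super.begin(), super.end());
  for (size_t i = 0; i < sub.size(); ++i) {
    if (super[offset + i] % sub[i] != 0)
      return std::nullopt; // Non-integral ratio: let the caller decide.
    result[offset + i] = super[offset + i] / sub[i];
  }
  return result;
}

int main() {
  assert((*ratioSketch({4, 8, 16}, {8, 16}) == std::vector<int64_t>{4, 1, 1}));
  assert(!ratioSketch({4, 6}, {4}).has_value()); // 6 % 4 != 0.
}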
-
-Optional<SmallVector<int64_t, 4>> shapeRatio(VectorType superVectorType,
- VectorType subVectorType) {
- assert(superVectorType.getElementType() == subVectorType.getElementType() &&
- "vector types must be of the same elemental type");
- return shapeRatio(superVectorType.getShape(), subVectorType.getShape());
-}
-
-/// Constructs a permutation map from memref indices to vector dimension.
-///
-/// The implementation uses the knowledge of the mapping of enclosing loop to
-/// vector dimension. `enclosingLoopToVectorDim` carries this information as a
-/// map with:
-/// - keys representing "vectorized enclosing loops";
-/// - values representing the corresponding vector dimension.
-/// The algorithm traverses "vectorized enclosing loops" and extracts the
-/// at-most-one MemRef index that is invariant along said loop. By
-/// construction there is at most one such index: otherwise the MemRef is not
-/// vectorizable.
-/// If this invariant index is found, it is added to the permutation_map at the
-/// proper vector dimension.
-/// If no index is found to be invariant, 0 is added to the permutation_map and
-/// corresponds to a vector broadcast along that dimension.
-///
-/// Returns an empty AffineMap if `enclosingLoopToVectorDim` is empty,
-/// signalling that no permutation map can be constructed given
-/// `enclosingLoopToVectorDim`.
-///
-/// Examples can be found in the documentation of `makePermutationMap`, in the
-/// header file.
-static AffineMap makePermutationMap(
- ArrayRef<Value> indices,
- const DenseMap<Operation *, unsigned> &enclosingLoopToVectorDim) {
- if (enclosingLoopToVectorDim.empty())
- return AffineMap();
- MLIRContext *context =
- enclosingLoopToVectorDim.begin()->getFirst()->getContext();
- using functional::makePtrDynCaster;
- using functional::map;
- SmallVector<AffineExpr, 4> perm(enclosingLoopToVectorDim.size(),
- getAffineConstantExpr(0, context));
-
- for (auto kvp : enclosingLoopToVectorDim) {
- assert(kvp.second < perm.size());
- auto invariants = getInvariantAccesses(
- cast<AffineForOp>(kvp.first).getInductionVar(), indices);
- unsigned numIndices = indices.size();
- unsigned countInvariantIndices = 0;
- for (unsigned dim = 0; dim < numIndices; ++dim) {
- if (!invariants.count(indices[dim])) {
- assert(perm[kvp.second] == getAffineConstantExpr(0, context) &&
- "permutationMap already has an entry along dim");
- perm[kvp.second] = getAffineDimExpr(dim, context);
- } else {
- ++countInvariantIndices;
- }
- }
- assert((countInvariantIndices == numIndices ||
- countInvariantIndices == numIndices - 1) &&
- "Vectorization prerequisite violated: at most 1 index may be "
- "invariant wrt a vectorized loop");
- }
- return AffineMap::get(indices.size(), 0, perm);
-}
-
-/// Implementation detail that walks up the parents and records the ones with
-/// the specified type.
-/// TODO(ntv): could also be implemented as a collect parents followed by a
-/// filter and made available outside this file.
-template <typename T>
-static SetVector<Operation *> getParentsOfType(Operation *op) {
- SetVector<Operation *> res;
- auto *current = op;
- while (auto *parent = current->getParentOp()) {
- if (auto typedParent = dyn_cast<T>(parent)) {
- assert(res.count(parent) == 0 && "Already inserted");
- res.insert(parent);
- }
- current = parent;
- }
- return res;
-}
-
-/// Returns the enclosing AffineForOp, from closest to farthest.
-static SetVector<Operation *> getEnclosingForOps(Operation *op) {
- return getParentsOfType<AffineForOp>(op);
-}
-
-AffineMap
-makePermutationMap(Operation *op, ArrayRef<Value> indices,
- const DenseMap<Operation *, unsigned> &loopToVectorDim) {
- DenseMap<Operation *, unsigned> enclosingLoopToVectorDim;
- auto enclosingLoops = getEnclosingForOps(op);
- for (auto *forInst : enclosingLoops) {
- auto it = loopToVectorDim.find(forInst);
- if (it != loopToVectorDim.end()) {
- enclosingLoopToVectorDim.insert(*it);
- }
- }
- return makePermutationMap(indices, enclosingLoopToVectorDim);
-}
-
-bool matcher::operatesOnSuperVectorsOf(Operation &op,
- VectorType subVectorType) {
- // First, extract the vector type and distinguish between:
- // a. ops that *must* lower a super-vector (i.e. vector.transfer_read,
- // vector.transfer_write); and
- // b. ops that *may* lower a super-vector (all other ops).
- // The ops that *may* lower a super-vector only do so if the super-vector to
- // sub-vector ratio exists. The ops that *must* lower a super-vector are
- // explicitly checked for this property.
- /// TODO(ntv): there should be a single function for all ops to do this so we
- /// do not have to special case. Maybe a trait, or just a method, unclear atm.
- bool mustDivide = false;
- (void)mustDivide;
- VectorType superVectorType;
- if (auto read = dyn_cast<vector::TransferReadOp>(op)) {
- superVectorType = read.getVectorType();
- mustDivide = true;
- } else if (auto write = dyn_cast<vector::TransferWriteOp>(op)) {
- superVectorType = write.getVectorType();
- mustDivide = true;
- } else if (op.getNumResults() == 0) {
- if (!isa<ReturnOp>(op)) {
- op.emitError("NYI: assuming only return operations can have 0 "
- " results at this point");
- }
- return false;
- } else if (op.getNumResults() == 1) {
- if (auto v = op.getResult(0).getType().dyn_cast<VectorType>()) {
- superVectorType = v;
- } else {
- // Not a vector type.
- return false;
- }
- } else {
- // Not a vector.transfer and has more than 1 result; fail hard for now to
- // wake us up when something changes.
- op.emitError("NYI: operation has more than 1 result");
- return false;
- }
-
- // Get the ratio.
- auto ratio = shapeRatio(superVectorType, subVectorType);
-
- // Sanity check.
- assert((ratio.hasValue() || !mustDivide) &&
- "vector.transfer operation in which super-vector size is not an"
- " integer multiple of sub-vector size");
-
- // This catches the remaining cases: ops that are not required to divide
- // evenly but whose shape still isn't divisible by the sub-vector shape.
- // This could be useful information if we wanted to reshape at the level of
- // the vector type (but we would have to look at the compute and distinguish
- // between parallel, reduction and possibly other cases).
- if (!ratio.hasValue()) {
- return false;
- }
-
- return true;
-}
-
-} // namespace mlir