[Mlir-commits] [mlir] e708471 - [mlir][NFC] Cleanup AffineOps directory structure
River Riddle
llvmlistbot at llvm.org
Fri Mar 20 14:24:01 PDT 2020
Author: Rob Suderman
Date: 2020-03-20T14:23:43-07:00
New Revision: e708471395b685f3edec2e8c1ab320358640ae74
URL: https://github.com/llvm/llvm-project/commit/e708471395b685f3edec2e8c1ab320358640ae74
DIFF: https://github.com/llvm/llvm-project/commit/e708471395b685f3edec2e8c1ab320358640ae74.diff
LOG: [mlir][NFC] Cleanup AffineOps directory structure
Summary:
Change the AffineOps dialect structure to better group both IR and Transforms. This includes extracting transforms directly related to AffineOps. Also moves AffineOps to Affine.
Differential Revision: https://reviews.llvm.org/D76161
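For downstream users the change is mechanical: include paths move under Dialect/Affine, and the dialect class is renamed from AffineOpsDialect to AffineDialect. A minimal sketch of an updated conversion target, mirroring the Toy changes below (the free function is illustrative only, not part of this commit):

```c++
#include "mlir/Dialect/Affine/IR/AffineOps.h" // was: mlir/Dialect/AffineOps/AffineOps.h
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/Transforms/DialectConversion.h"

// Only the C++ class name changes (AffineOpsDialect -> AffineDialect); the
// dialect mnemonic "affine" and its op names are untouched, so existing IR
// round-trips unchanged.
static void markAffineLegal(mlir::ConversionTarget &target) {
  target.addLegalDialect<mlir::AffineDialect, mlir::StandardOpsDialect>();
}
```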
Added:
mlir/include/mlir/Dialect/Affine/CMakeLists.txt
mlir/include/mlir/Dialect/Affine/EDSC/Builders.h
mlir/include/mlir/Dialect/Affine/EDSC/Intrinsics.h
mlir/include/mlir/Dialect/Affine/IR/AffineOps.h
mlir/include/mlir/Dialect/Affine/IR/AffineOps.td
mlir/include/mlir/Dialect/Affine/IR/AffineOpsBase.td
mlir/include/mlir/Dialect/Affine/IR/AffineValueMap.h
mlir/include/mlir/Dialect/Affine/IR/CMakeLists.txt
mlir/include/mlir/Dialect/Affine/Passes.h
mlir/include/mlir/Dialect/VectorOps/VectorOps.td
mlir/lib/Dialect/Affine/CMakeLists.txt
mlir/lib/Dialect/Affine/EDSC/Builders.cpp
mlir/lib/Dialect/Affine/EDSC/CMakeLists.txt
mlir/lib/Dialect/Affine/IR/AffineOps.cpp
mlir/lib/Dialect/Affine/IR/AffineValueMap.cpp
mlir/lib/Dialect/Affine/IR/CMakeLists.txt
mlir/lib/Dialect/Affine/Transforms/AffineDataCopyGeneration.cpp
mlir/lib/Dialect/Affine/Transforms/AffineLoopInvariantCodeMotion.cpp
mlir/lib/Dialect/Affine/Transforms/CMakeLists.txt
mlir/lib/Dialect/Affine/Transforms/SimplifyAffineStructures.cpp
mlir/lib/Dialect/VectorOps/VectorTransforms.cpp
mlir/lib/Dialect/VectorOps/VectorUtils.cpp
mlir/test/Dialect/Affine/canonicalize.mlir
mlir/test/Dialect/Affine/dma.mlir
mlir/test/Dialect/Affine/inlining.mlir
mlir/test/Dialect/Affine/invalid.mlir
mlir/test/Dialect/Affine/load-store-invalid.mlir
mlir/test/Dialect/Affine/load-store.mlir
mlir/test/Dialect/Affine/memref-stride-calculation.mlir
mlir/test/Dialect/Affine/ops.mlir
mlir/test/lib/Dialect/Affine/CMakeLists.txt
mlir/test/lib/Dialect/Affine/TestAffineDataCopy.cpp
Modified:
mlir/docs/Interfaces.md
mlir/docs/Tutorials/Toy/Ch-5.md
mlir/examples/toy/Ch5/mlir/LowerToAffineLoops.cpp
mlir/examples/toy/Ch6/mlir/LowerToAffineLoops.cpp
mlir/examples/toy/Ch6/mlir/LowerToLLVM.cpp
mlir/examples/toy/Ch7/mlir/LowerToAffineLoops.cpp
mlir/examples/toy/Ch7/mlir/LowerToLLVM.cpp
mlir/include/mlir/Dialect/CMakeLists.txt
mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.td
mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td
mlir/include/mlir/Dialect/Linalg/Transforms/LinalgTransformPatterns.td
mlir/include/mlir/Dialect/Vector/VectorOps.td
mlir/include/mlir/InitAllDialects.h
mlir/include/mlir/InitAllPasses.h
mlir/include/mlir/Transforms/Passes.h
mlir/lib/Analysis/AffineAnalysis.cpp
mlir/lib/Analysis/AffineStructures.cpp
mlir/lib/Analysis/CMakeLists.txt
mlir/lib/Analysis/LoopAnalysis.cpp
mlir/lib/Analysis/NestedMatcher.cpp
mlir/lib/Analysis/SliceAnalysis.cpp
mlir/lib/Analysis/Utils.cpp
mlir/lib/Conversion/AffineToStandard/AffineToStandard.cpp
mlir/lib/Conversion/AffineToStandard/CMakeLists.txt
mlir/lib/Conversion/LoopsToGPU/CMakeLists.txt
mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp
mlir/lib/Conversion/LoopsToGPU/LoopsToGPUPass.cpp
mlir/lib/Conversion/VectorToLoops/ConvertVectorToLoops.cpp
mlir/lib/Dialect/CMakeLists.txt
mlir/lib/Dialect/Linalg/EDSC/Builders.cpp
mlir/lib/Dialect/Linalg/EDSC/CMakeLists.txt
mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt
mlir/lib/Dialect/Linalg/Transforms/LinalgToLoops.cpp
mlir/lib/Dialect/Linalg/Transforms/Promotion.cpp
mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp
mlir/lib/Dialect/Linalg/Utils/CMakeLists.txt
mlir/lib/Dialect/Linalg/Utils/Utils.cpp
mlir/lib/Dialect/LoopOps/Transforms/CMakeLists.txt
mlir/lib/Dialect/LoopOps/Transforms/ParallelLoopSpecialization.cpp
mlir/lib/Dialect/LoopOps/Transforms/ParallelLoopTiling.cpp
mlir/lib/Dialect/Vector/CMakeLists.txt
mlir/lib/Dialect/Vector/VectorTransforms.cpp
mlir/lib/Dialect/Vector/VectorUtils.cpp
mlir/lib/Transforms/CMakeLists.txt
mlir/lib/Transforms/LoopFusion.cpp
mlir/lib/Transforms/LoopTiling.cpp
mlir/lib/Transforms/LoopUnroll.cpp
mlir/lib/Transforms/LoopUnrollAndJam.cpp
mlir/lib/Transforms/MemRefDataFlowOpt.cpp
mlir/lib/Transforms/PipelineDataTransfer.cpp
mlir/lib/Transforms/Utils/CMakeLists.txt
mlir/lib/Transforms/Utils/LoopFusionUtils.cpp
mlir/lib/Transforms/Utils/LoopUtils.cpp
mlir/lib/Transforms/Utils/Utils.cpp
mlir/lib/Transforms/Vectorize.cpp
mlir/test/EDSC/CMakeLists.txt
mlir/test/EDSC/builder-api-test.cpp
mlir/test/lib/Dialect/CMakeLists.txt
mlir/test/lib/Transforms/CMakeLists.txt
mlir/test/lib/Transforms/TestConstantFold.cpp
mlir/test/lib/Transforms/TestLoopFusion.cpp
mlir/test/lib/Transforms/TestMemRefBoundCheck.cpp
mlir/test/lib/Transforms/TestMemRefDependenceCheck.cpp
mlir/test/lib/Transforms/TestParallelismDetection.cpp
mlir/test/lib/Transforms/TestVectorizationUtils.cpp
mlir/tools/mlir-opt/CMakeLists.txt
Removed:
mlir/include/mlir/Dialect/AffineOps/AffineOps.h
mlir/include/mlir/Dialect/AffineOps/AffineOps.td
mlir/include/mlir/Dialect/AffineOps/AffineOpsBase.td
mlir/include/mlir/Dialect/AffineOps/AffineValueMap.h
mlir/include/mlir/Dialect/AffineOps/CMakeLists.txt
mlir/include/mlir/Dialect/AffineOps/EDSC/Builders.h
mlir/include/mlir/Dialect/AffineOps/EDSC/Intrinsics.h
mlir/lib/Dialect/AffineOps/AffineOps.cpp
mlir/lib/Dialect/AffineOps/AffineValueMap.cpp
mlir/lib/Dialect/AffineOps/CMakeLists.txt
mlir/lib/Dialect/AffineOps/EDSC/Builders.cpp
mlir/lib/Transforms/AffineDataCopyGeneration.cpp
mlir/lib/Transforms/AffineLoopInvariantCodeMotion.cpp
mlir/lib/Transforms/SimplifyAffineStructures.cpp
mlir/test/Dialect/AffineOps/canonicalize.mlir
mlir/test/Dialect/AffineOps/dma.mlir
mlir/test/Dialect/AffineOps/inlining.mlir
mlir/test/Dialect/AffineOps/invalid.mlir
mlir/test/Dialect/AffineOps/load-store-invalid.mlir
mlir/test/Dialect/AffineOps/load-store.mlir
mlir/test/Dialect/AffineOps/memref-stride-calculation.mlir
mlir/test/Dialect/AffineOps/ops.mlir
mlir/test/lib/Transforms/TestAffineDataCopy.cpp
################################################################################
diff --git a/mlir/docs/Interfaces.md b/mlir/docs/Interfaces.md
index f413cac28bb0..16422a104582 100644
--- a/mlir/docs/Interfaces.md
+++ b/mlir/docs/Interfaces.md
@@ -63,7 +63,7 @@ struct AffineInlinerInterface : public DialectInlinerInterface {
};
/// Register the interface with the dialect.
-AffineOpsDialect::AffineOpsDialect(MLIRContext *context) ... {
+AffineDialect::AffineDialect(MLIRContext *context) ... {
addInterfaces<AffineInlinerInterface>();
}
```
diff --git a/mlir/docs/Tutorials/Toy/Ch-5.md b/mlir/docs/Tutorials/Toy/Ch-5.md
index dbc545c49206..8f32a7289a61 100644
--- a/mlir/docs/Tutorials/Toy/Ch-5.md
+++ b/mlir/docs/Tutorials/Toy/Ch-5.md
@@ -62,7 +62,7 @@ void ToyToAffineLoweringPass::runOnFunction() {
// We define the specific operations, or dialects, that are legal targets for
// this lowering. In our case, we are lowering to a combination of the
// `Affine` and `Standard` dialects.
- target.addLegalDialect<mlir::AffineOpsDialect, mlir::StandardOpsDialect>();
+ target.addLegalDialect<mlir::AffineDialect, mlir::StandardOpsDialect>();
// We also define the Toy dialect as Illegal so that the conversion will fail
// if any of these operations are *not* converted. Given that we actually want
diff --git a/mlir/examples/toy/Ch5/mlir/LowerToAffineLoops.cpp b/mlir/examples/toy/Ch5/mlir/LowerToAffineLoops.cpp
index 9559402708c8..249f17b0fc71 100644
--- a/mlir/examples/toy/Ch5/mlir/LowerToAffineLoops.cpp
+++ b/mlir/examples/toy/Ch5/mlir/LowerToAffineLoops.cpp
@@ -15,7 +15,7 @@
#include "toy/Dialect.h"
#include "toy/Passes.h"
-#include "mlir/Dialect/AffineOps/AffineOps.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/DialectConversion.h"
@@ -280,7 +280,7 @@ void ToyToAffineLoweringPass::runOnFunction() {
// We define the specific operations, or dialects, that are legal targets for
// this lowering. In our case, we are lowering to a combination of the
// `Affine` and `Standard` dialects.
- target.addLegalDialect<AffineOpsDialect, StandardOpsDialect>();
+ target.addLegalDialect<AffineDialect, StandardOpsDialect>();
// We also define the Toy dialect as Illegal so that the conversion will fail
// if any of these operations are *not* converted. Given that we actually want
diff --git a/mlir/examples/toy/Ch6/mlir/LowerToAffineLoops.cpp b/mlir/examples/toy/Ch6/mlir/LowerToAffineLoops.cpp
index 9559402708c8..249f17b0fc71 100644
--- a/mlir/examples/toy/Ch6/mlir/LowerToAffineLoops.cpp
+++ b/mlir/examples/toy/Ch6/mlir/LowerToAffineLoops.cpp
@@ -15,7 +15,7 @@
#include "toy/Dialect.h"
#include "toy/Passes.h"
-#include "mlir/Dialect/AffineOps/AffineOps.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/DialectConversion.h"
@@ -280,7 +280,7 @@ void ToyToAffineLoweringPass::runOnFunction() {
// We define the specific operations, or dialects, that are legal targets for
// this lowering. In our case, we are lowering to a combination of the
// `Affine` and `Standard` dialects.
- target.addLegalDialect<AffineOpsDialect, StandardOpsDialect>();
+ target.addLegalDialect<AffineDialect, StandardOpsDialect>();
// We also define the Toy dialect as Illegal so that the conversion will fail
// if any of these operations are *not* converted. Given that we actually want
diff --git a/mlir/examples/toy/Ch6/mlir/LowerToLLVM.cpp b/mlir/examples/toy/Ch6/mlir/LowerToLLVM.cpp
index 5455738dff2a..f6dcba229276 100644
--- a/mlir/examples/toy/Ch6/mlir/LowerToLLVM.cpp
+++ b/mlir/examples/toy/Ch6/mlir/LowerToLLVM.cpp
@@ -19,7 +19,7 @@
#include "mlir/Conversion/LoopToStandard/ConvertLoopToStandard.h"
#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h"
#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h"
-#include "mlir/Dialect/AffineOps/AffineOps.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/Dialect/LoopOps/LoopOps.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
diff --git a/mlir/examples/toy/Ch7/mlir/LowerToAffineLoops.cpp b/mlir/examples/toy/Ch7/mlir/LowerToAffineLoops.cpp
index 9559402708c8..249f17b0fc71 100644
--- a/mlir/examples/toy/Ch7/mlir/LowerToAffineLoops.cpp
+++ b/mlir/examples/toy/Ch7/mlir/LowerToAffineLoops.cpp
@@ -15,7 +15,7 @@
#include "toy/Dialect.h"
#include "toy/Passes.h"
-#include "mlir/Dialect/AffineOps/AffineOps.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/DialectConversion.h"
@@ -280,7 +280,7 @@ void ToyToAffineLoweringPass::runOnFunction() {
// We define the specific operations, or dialects, that are legal targets for
// this lowering. In our case, we are lowering to a combination of the
// `Affine` and `Standard` dialects.
- target.addLegalDialect<AffineOpsDialect, StandardOpsDialect>();
+ target.addLegalDialect<AffineDialect, StandardOpsDialect>();
// We also define the Toy dialect as Illegal so that the conversion will fail
// if any of these operations are *not* converted. Given that we actually want
diff --git a/mlir/examples/toy/Ch7/mlir/LowerToLLVM.cpp b/mlir/examples/toy/Ch7/mlir/LowerToLLVM.cpp
index 5455738dff2a..f6dcba229276 100644
--- a/mlir/examples/toy/Ch7/mlir/LowerToLLVM.cpp
+++ b/mlir/examples/toy/Ch7/mlir/LowerToLLVM.cpp
@@ -19,7 +19,7 @@
#include "mlir/Conversion/LoopToStandard/ConvertLoopToStandard.h"
#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h"
#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h"
-#include "mlir/Dialect/AffineOps/AffineOps.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/Dialect/LoopOps/LoopOps.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
diff --git a/mlir/include/mlir/Dialect/Affine/CMakeLists.txt b/mlir/include/mlir/Dialect/Affine/CMakeLists.txt
new file mode 100644
index 000000000000..f33061b2d87c
--- /dev/null
+++ b/mlir/include/mlir/Dialect/Affine/CMakeLists.txt
@@ -0,0 +1 @@
+add_subdirectory(IR)
diff --git a/mlir/include/mlir/Dialect/AffineOps/EDSC/Builders.h b/mlir/include/mlir/Dialect/Affine/EDSC/Builders.h
similarity index 97%
rename from mlir/include/mlir/Dialect/AffineOps/EDSC/Builders.h
rename to mlir/include/mlir/Dialect/Affine/EDSC/Builders.h
index f750a1d41f8a..9c320ece2209 100644
--- a/mlir/include/mlir/Dialect/AffineOps/EDSC/Builders.h
+++ b/mlir/include/mlir/Dialect/Affine/EDSC/Builders.h
@@ -11,10 +11,10 @@
//
//===----------------------------------------------------------------------===//
-#ifndef MLIR_DIALECT_AFFINEOPS_EDSC_BUILDERS_H_
-#define MLIR_DIALECT_AFFINEOPS_EDSC_BUILDERS_H_
+#ifndef MLIR_DIALECT_AFFINE_EDSC_BUILDERS_H_
+#define MLIR_DIALECT_AFFINE_EDSC_BUILDERS_H_
-#include "mlir/Dialect/AffineOps/AffineOps.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/EDSC/Builders.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/Types.h"
@@ -203,4 +203,4 @@ ValueHandle TemplatedIndexedValue<Load, Store>::operator>=(ValueHandle e) {
} // namespace edsc
} // namespace mlir
-#endif // MLIR_DIALECT_AFFINEOPS_EDSC_BUILDERS_H_
+#endif // MLIR_DIALECT_AFFINE_EDSC_BUILDERS_H_
diff --git a/mlir/include/mlir/Dialect/AffineOps/EDSC/Intrinsics.h b/mlir/include/mlir/Dialect/Affine/EDSC/Intrinsics.h
similarity index 87%
rename from mlir/include/mlir/Dialect/AffineOps/EDSC/Intrinsics.h
rename to mlir/include/mlir/Dialect/Affine/EDSC/Intrinsics.h
index 67d4ac16bb0b..392e2433b992 100644
--- a/mlir/include/mlir/Dialect/AffineOps/EDSC/Intrinsics.h
+++ b/mlir/include/mlir/Dialect/Affine/EDSC/Intrinsics.h
@@ -5,10 +5,10 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-#ifndef MLIR_DIALECT_AFFINEOPS_EDSC_INTRINSICS_H_
-#define MLIR_DIALECT_AFFINEOPS_EDSC_INTRINSICS_H_
+#ifndef MLIR_DIALECT_AFFINE_EDSC_INTRINSICS_H_
+#define MLIR_DIALECT_AFFINE_EDSC_INTRINSICS_H_
-#include "mlir/Dialect/AffineOps/EDSC/Builders.h"
+#include "mlir/Dialect/Affine/EDSC/Builders.h"
#include "mlir/EDSC/Intrinsics.h"
namespace mlir {
diff --git a/mlir/include/mlir/Dialect/AffineOps/AffineOps.h b/mlir/include/mlir/Dialect/Affine/IR/AffineOps.h
similarity index 99%
rename from mlir/include/mlir/Dialect/AffineOps/AffineOps.h
rename to mlir/include/mlir/Dialect/Affine/IR/AffineOps.h
index edae534f12ba..6ce38bcddddc 100644
--- a/mlir/include/mlir/Dialect/AffineOps/AffineOps.h
+++ b/mlir/include/mlir/Dialect/Affine/IR/AffineOps.h
@@ -11,8 +11,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef MLIR_DIALECT_AFFINEOPS_AFFINEOPS_H
-#define MLIR_DIALECT_AFFINEOPS_AFFINEOPS_H
+#ifndef MLIR_DIALECT_AFFINE_IR_AFFINEOPS_H
+#define MLIR_DIALECT_AFFINE_IR_AFFINEOPS_H
#include "mlir/IR/AffineMap.h"
#include "mlir/IR/Builders.h"
@@ -493,10 +493,10 @@ AffineApplyOp makeComposedAffineApply(OpBuilder &b, Location loc, AffineMap map,
void fullyComposeAffineMapAndOperands(AffineMap *map,
SmallVectorImpl<Value> *operands);
-#include "mlir/Dialect/AffineOps/AffineOpsDialect.h.inc"
+#include "mlir/Dialect/Affine/IR/AffineOpsDialect.h.inc"
#define GET_OP_CLASSES
-#include "mlir/Dialect/AffineOps/AffineOps.h.inc"
+#include "mlir/Dialect/Affine/IR/AffineOps.h.inc"
/// Returns if the provided value is the induction variable of a AffineForOp.
bool isForInductionVar(Value val);
diff --git a/mlir/include/mlir/Dialect/AffineOps/AffineOps.td b/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td
similarity index 99%
rename from mlir/include/mlir/Dialect/AffineOps/AffineOps.td
rename to mlir/include/mlir/Dialect/Affine/IR/AffineOps.td
index 307860f1622b..6994b5f17661 100644
--- a/mlir/include/mlir/Dialect/AffineOps/AffineOps.td
+++ b/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td
@@ -13,11 +13,11 @@
#ifndef AFFINE_OPS
#define AFFINE_OPS
-include "mlir/Dialect/AffineOps/AffineOpsBase.td"
+include "mlir/Dialect/Affine/IR/AffineOpsBase.td"
include "mlir/Interfaces/LoopLikeInterface.td"
include "mlir/Interfaces/SideEffects.td"
-def AffineOps_Dialect : Dialect {
+def Affine_Dialect : Dialect {
let name = "affine";
let cppNamespace = "";
let hasConstantMaterializer = 1;
@@ -25,7 +25,7 @@ def AffineOps_Dialect : Dialect {
// Base class for Affine dialect ops.
class Affine_Op<string mnemonic, list<OpTrait> traits = []> :
- Op<AffineOps_Dialect, mnemonic, traits> {
+ Op<Affine_Dialect, mnemonic, traits> {
// For every affine op, there needs to be a:
// * void print(OpAsmPrinter &p, ${C++ class of Op} op)
// * LogicalResult verify(${C++ class of Op} op)
@@ -291,7 +291,7 @@ def AffineIfOp : Affine_Op<"if",
}
class AffineMinMaxOpBase<string mnemonic, list<OpTrait> traits = []> :
- Op<AffineOps_Dialect, mnemonic, traits> {
+ Op<Affine_Dialect, mnemonic, traits> {
let arguments = (ins AffineMapAttr:$map, Variadic<Index>:$operands);
let results = (outs Index);
diff --git a/mlir/include/mlir/Dialect/AffineOps/AffineOpsBase.td b/mlir/include/mlir/Dialect/Affine/IR/AffineOpsBase.td
similarity index 100%
rename from mlir/include/mlir/Dialect/AffineOps/AffineOpsBase.td
rename to mlir/include/mlir/Dialect/Affine/IR/AffineOpsBase.td
diff --git a/mlir/include/mlir/Dialect/AffineOps/AffineValueMap.h b/mlir/include/mlir/Dialect/Affine/IR/AffineValueMap.h
similarity index 96%
rename from mlir/include/mlir/Dialect/AffineOps/AffineValueMap.h
rename to mlir/include/mlir/Dialect/Affine/IR/AffineValueMap.h
index 3ec2b8559728..ffc6d73f3825 100644
--- a/mlir/include/mlir/Dialect/AffineOps/AffineValueMap.h
+++ b/mlir/include/mlir/Dialect/Affine/IR/AffineValueMap.h
@@ -10,8 +10,8 @@
// analysis purposes.
//===----------------------------------------------------------------------===//
-#ifndef MLIR_DIALECT_AFFINEOPS_AFFINEVALUEMAP_H
-#define MLIR_DIALECT_AFFINEOPS_AFFINEVALUEMAP_H
+#ifndef MLIR_DIALECT_AFFINE_IR_AFFINEVALUEMAP_H
+#define MLIR_DIALECT_AFFINE_IR_AFFINEVALUEMAP_H
#include "mlir/IR/AffineMap.h"
#include "mlir/IR/OperationSupport.h"
@@ -87,4 +87,4 @@ class AffineValueMap {
} // namespace mlir
-#endif // MLIR_DIALECT_AFFINEOPS_AFFINEVALUEMAP_H
+#endif // MLIR_DIALECT_AFFINE_IR_AFFINEVALUEMAP_H
diff --git a/mlir/include/mlir/Dialect/AffineOps/CMakeLists.txt b/mlir/include/mlir/Dialect/Affine/IR/CMakeLists.txt
similarity index 100%
rename from mlir/include/mlir/Dialect/AffineOps/CMakeLists.txt
rename to mlir/include/mlir/Dialect/Affine/IR/CMakeLists.txt
diff --git a/mlir/include/mlir/Dialect/Affine/Passes.h b/mlir/include/mlir/Dialect/Affine/Passes.h
new file mode 100644
index 000000000000..343a0a60f118
--- /dev/null
+++ b/mlir/include/mlir/Dialect/Affine/Passes.h
@@ -0,0 +1,48 @@
+//===- Passes.h - Pass Entrypoints ------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This header file defines a set of transforms specific to the AffineOps
+// dialect.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_DIALECT_AFFINE_TRANSFORMS_PASSES_H
+#define MLIR_DIALECT_AFFINE_TRANSFORMS_PASSES_H
+
+#include "mlir/Support/LLVM.h"
+#include <functional>
+#include <limits>
+
+namespace mlir {
+
+class AffineForOp;
+class FuncOp;
+class ModuleOp;
+class Pass;
+template <typename T> class OpPassBase;
+
+/// Creates a simplification pass for affine structures (maps and sets). In
+/// addition, this pass also normalizes memrefs to have the trivial (identity)
+/// layout map.
+std::unique_ptr<OpPassBase<FuncOp>> createSimplifyAffineStructuresPass();
+
+/// Creates a loop invariant code motion pass that hoists loop invariant
+/// instructions out of affine loops.
+std::unique_ptr<OpPassBase<FuncOp>> createAffineLoopInvariantCodeMotionPass();
+
+/// Performs packing (or explicit copying) of accessed memref regions into
+/// buffers in the specified faster memory space through either pointwise copies
+/// or DMA operations.
+std::unique_ptr<OpPassBase<FuncOp>> createAffineDataCopyGenerationPass(
+ unsigned slowMemorySpace, unsigned fastMemorySpace,
+ unsigned tagMemorySpace = 0, int minDmaTransferSize = 1024,
+ uint64_t fastMemCapacityBytes = std::numeric_limits<uint64_t>::max());
+
+} // end namespace mlir
+
+#endif // MLIR_DIALECT_AFFINE_TRANSFORMS_PASSES_H
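The entry points above are extracted from mlir/Transforms/Passes.h; their signatures are unchanged, only the header location moves. A hedged sketch of scheduling them on functions with the PassManager API of this era (the helper name is an assumption, not part of this commit):

```c++
#include "mlir/Dialect/Affine/Passes.h"
#include "mlir/IR/Function.h"
#include "mlir/Pass/PassManager.h"

// Nest a function pass manager and add the relocated affine passes; callers
// only need to switch from mlir/Transforms/Passes.h to the new header.
static void addAffineCleanup(mlir::PassManager &pm) {
  mlir::OpPassManager &funcPM = pm.nest<mlir::FuncOp>();
  funcPM.addPass(mlir::createAffineLoopInvariantCodeMotionPass());
  funcPM.addPass(mlir::createSimplifyAffineStructuresPass());
}
```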
diff --git a/mlir/include/mlir/Dialect/CMakeLists.txt b/mlir/include/mlir/Dialect/CMakeLists.txt
index 32b24264ba69..aabb2fdc6e83 100644
--- a/mlir/include/mlir/Dialect/CMakeLists.txt
+++ b/mlir/include/mlir/Dialect/CMakeLists.txt
@@ -1,4 +1,4 @@
-add_subdirectory(AffineOps)
+add_subdirectory(Affine)
add_subdirectory(AVX512)
add_subdirectory(FxpMathOps)
add_subdirectory(GPU)
diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.td
index 90489299bafe..dc0c03f26cc8 100644
--- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.td
+++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.td
@@ -13,7 +13,7 @@
#ifndef LINALG_OPS
#define LINALG_OPS
-include "mlir/Dialect/AffineOps/AffineOpsBase.td"
+include "mlir/Dialect/Affine/IR/AffineOpsBase.td"
include "mlir/Dialect/Linalg/IR/LinalgBase.td"
include "mlir/Interfaces/SideEffects.td"
diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td
index 457a8db7788f..5c8590fc6063 100644
--- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td
+++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td
@@ -14,7 +14,7 @@
#ifndef LINALG_STRUCTURED_OPS
#define LINALG_STRUCTURED_OPS
-include "mlir/Dialect/AffineOps/AffineOpsBase.td"
+include "mlir/Dialect/Affine/IR/AffineOpsBase.td"
include "mlir/Dialect/Linalg/IR/LinalgBase.td"
include "mlir/Dialect/Linalg/IR/LinalgStructuredOpsInterface.td"
diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/LinalgTransformPatterns.td b/mlir/include/mlir/Dialect/Linalg/Transforms/LinalgTransformPatterns.td
index c080ff2066d0..7fa33e4f2982 100644
--- a/mlir/include/mlir/Dialect/Linalg/Transforms/LinalgTransformPatterns.td
+++ b/mlir/include/mlir/Dialect/Linalg/Transforms/LinalgTransformPatterns.td
@@ -15,7 +15,7 @@
include "mlir/Dialect/Linalg/IR/LinalgOps.td"
include "mlir/Dialect/Linalg/IR/LinalgStructuredOps.td"
-include "mlir/Dialect/AffineOps/AffineOps.td"
+include "mlir/Dialect/Affine/IR/AffineOps.td"
def HasNoLinalgTransformMarker : CPred<[{
!op.getAttrOfType<StringAttr>(LinalgTransforms::kLinalgTransformMarker)
diff --git a/mlir/include/mlir/Dialect/Vector/VectorOps.td b/mlir/include/mlir/Dialect/Vector/VectorOps.td
index 2a791365db4f..a3864614d05f 100644
--- a/mlir/include/mlir/Dialect/Vector/VectorOps.td
+++ b/mlir/include/mlir/Dialect/Vector/VectorOps.td
@@ -13,7 +13,7 @@
#ifndef VECTOR_OPS
#define VECTOR_OPS
-include "mlir/Dialect/AffineOps/AffineOpsBase.td"
+include "mlir/Dialect/Affine/IR/AffineOpsBase.td"
include "mlir/Interfaces/SideEffects.td"
def Vector_Dialect : Dialect {
diff --git a/mlir/include/mlir/Dialect/VectorOps/VectorOps.td b/mlir/include/mlir/Dialect/VectorOps/VectorOps.td
new file mode 100644
index 000000000000..ef1fde00e5ec
--- /dev/null
+++ b/mlir/include/mlir/Dialect/VectorOps/VectorOps.td
@@ -0,0 +1,1402 @@
+//===- VectorOps.td - Vector op definitions ---------------*- tablegen -*-====//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Defines MLIR vector operations.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef VECTOR_OPS
+#define VECTOR_OPS
+
+include "mlir/Dialect/Affine/IR/AffineOpsBase.td"
+include "mlir/Interfaces/SideEffects.td"
+
+
+def VectorOps_Dialect : Dialect {
+ let name = "vector";
+ let cppNamespace = "vector";
+ let hasConstantMaterializer = 1;
+}
+
+// Base class for Vector dialect ops.
+class Vector_Op<string mnemonic, list<OpTrait> traits = []> :
+ Op<VectorOps_Dialect, mnemonic, traits> {
+ // For every vector op, there needs to be a:
+ // * void print(OpAsmPrinter &p, ${C++ class of Op} op)
+ // * LogicalResult verify(${C++ class of Op} op)
+ // * ParseResult parse${C++ class of Op}(OpAsmParser &parser,
+ // OperationState &result)
+ // functions.
+ let printer = [{ return ::print(p, *this); }];
+ let verifier = [{ return ::verify(*this); }];
+ let parser = [{ return ::parse$cppClass(parser, result); }];
+}
+
+// TODO(andydavis, ntv) Add an attribute to specify a different algebra
+// with operators other than the current set: {*, +}.
+def Vector_ContractionOp :
+ Vector_Op<"contract", [NoSideEffect,
+ PredOpTrait<"first operand lhs and result have same element type",
+ TCresVTEtIsSameAsOpBase<0, 0>>,
+ PredOpTrait<"second operand rhs and result have same element type",
+ TCresVTEtIsSameAsOpBase<0, 1>>,
+ PredOpTrait<"third operand acc and result have same element type",
+ TCresVTEtIsSameAsOpBase<0, 1>>]>,
+ Arguments<(ins AnyVector:$lhs, AnyVector:$rhs, AnyType:$acc,
+ Variadic<VectorOf<[I1]>>:$masks,
+ AffineMapArrayAttr:$indexing_maps, ArrayAttr:$iterator_types)>,
+ Results<(outs AnyType)> {
+ let summary = "vector contraction operation";
+ let description = [{
+ Computes the sum of products of vector elements along contracting
+ dimension pairs from 2 vectors of rank M and N respectively, adds this
+ intermediate result to the accumulator argument of rank K, and returns a
+ vector result of rank K (where K = num_lhs_free_dims + num_rhs_free_dims +
+ num_batch_dims (see dimension type descriptions below)). For K = 0 (no
+ free or batch dimensions), the accumulator and output are scalars.
+
+ Optional vector mask arguments (produced by CreateMaskOp or ConstantMaskOp)
+ specify the dynamic dimension sizes of valid data within the lhs/rhs vector
+ arguments.
+
+ An iterator type attribute list must be specified, where each element of
+ the list represents an iterator with one of the following types:
+
+ *) "reduction": reduction dimensions are present in the lhs and rhs
+ arguments but not in the output (and accumulator
+ argument). These are the dimensions along which the vector
+ contraction op computes the sum of products, and
+ contracting dimension pair dimension sizes must match
+ between lhs/rhs.
+ *) "parallel": Batch dimensions are iterator type "parallel", and
+ are non-contracting dimensions present in the lhs, rhs and
+ output. The lhs/rhs co-iterate along the batch dimensions,
+ which should be expressed in their indexing maps.
+
+ Free dimensions are iterator type "parallel", and are
+ non-contraction, non-batch dimensions accessed by either the
+ lhs or rhs (but not both). The lhs and rhs free dimensions
+ are unrelated to each other and do not co-iterate, which
+ should be expressed in their indexing maps.
+
+ An indexing map attribute list must be specified with an entry for lhs, rhs
+ and acc arguments. An indexing map attribute specifies a mapping from each
+ iterator in the iterator type list, to each dimension of an N-D vector.
+
+ Examples:
+
+ // Simple dot product (K = 0).
+ #contraction_accesses = [
+ affine_map<(i) -> (i)>,
+ affine_map<(i) -> (i)>,
+ affine_map<(i) -> ()>
+ ]
+ #contraction_trait = {
+ indexing_maps = #contraction_accesses,
+ iterator_types = ["reduction"]
+ }
+ %3 = vector.contract #contraction_trait %0, %1, %2
+ : vector<10xf32>, vector<10xf32> into f32
+
+ // 2D vector contraction with one contracting dimension (matmul, K = 2).
+ #contraction_accesses = [
+ affine_map<(i, j, k) -> (i, k)>,
+ affine_map<(i, j, k) -> (k, j)>,
+ affine_map<(i, j, k) -> (i, j)>
+ ]
+ #contraction_trait = {
+ indexing_maps = #contraction_accesses,
+ iterator_types = ["parallel", "parallel", "reduction"]
+ }
+
+ %3 = vector.contract #contraction_trait %0, %1, %2
+ : vector<4x3xf32>, vector<3x7xf32> into vector<4x7xf32>
+
+ // 4D to 3D vector contraction with two contracting dimensions and
+ // one batch dimension (K = 3).
+ #contraction_accesses = [
+ affine_map<(b0, f0, f1, c0, c1) -> (c0, b0, c1, f0)>,
+ affine_map<(b0, f0, f1, c0, c1) -> (b0, c1, c0, f1)>,
+ affine_map<(b0, f0, f1, c0, c1) -> (b0, f0, f1)>
+ ]
+ #contraction_trait = {
+ indexing_maps = #contraction_accesses,
+ iterator_types = ["parallel", "parallel", "parallel",
+ "reduction", "reduction"]
+ }
+
+ %4 = vector.contract #contraction_trait %0, %1, %2
+ : vector<7x8x16x15xf32>, vector<8x16x7x5xf32> into vector<8x15x5xf32>
+
+ // 4D vector contraction with two contracting dimensions and optional
+ // vector mask arguments.
+ %lhs_mask = vector.constant_mask [7, 8, 16, 15] : vector<7x8x16x15xi1>
+ %rhs_mask = vector.constant_mask [8, 16, 7, 5] : vector<8x16x7x5xi1>
+
+ %5 = vector.contract #contraction_trait %0, %1, %2, %lhs_mask, %rhs_mask
+ : vector<7x8x16x15xf32>, vector<8x16x7x5xf32> into vector<8x15x8x5xf32>
+ }];
+ let builders = [OpBuilder<
+ "Builder *builder, OperationState &result, Value lhs, Value rhs, "
+ "Value acc, ArrayAttr indexingMaps, ArrayAttr iteratorTypes">,
+ OpBuilder<
+ "Builder *builder, OperationState &result, Value lhs, Value rhs, "
+ "Value acc, ArrayRef<ArrayRef<AffineExpr>> indexingExprs, "
+ "ArrayRef<StringRef> iteratorTypes">];
+ let extraClassDeclaration = [{
+ VectorType getLhsType() {
+ return lhs().getType().cast<VectorType>();
+ }
+ VectorType getRhsType() {
+ return rhs().getType().cast<VectorType>();
+ }
+ Type getAccType() { return acc().getType(); }
+ VectorType getLHSVectorMaskType() {
+ if (llvm::size(masks()) != 2) return VectorType();
+ return getOperand(3).getType().cast<VectorType>();
+ }
+ VectorType getRHSVectorMaskType() {
+ if (llvm::size(masks()) != 2) return VectorType();
+ return getOperand(4).getType().cast<VectorType>();
+ }
+ Type getResultType() { return getResult().getType(); }
+ ArrayRef<StringRef> getTraitAttrNames();
+ SmallVector<AffineMap, 4> getIndexingMaps();
+ static unsigned getAccOperandIndex() { return 2; }
+
+ // Returns the bounds of each dimension in the iteration space spanned
+ // by the iterator types of this operation.
+ void getIterationBounds(SmallVectorImpl<int64_t> &iterationBounds);
+
+ // Returns a list of index maps, where there is a list entry for each
+ // op indexing map attribute (i.e. one for each input and output, with
+ // the output listed last). Each index map maps from this operation's
+ // iteration space to the vector dimensions of the map's input/output.
+ void getIterationIndexMap(
+ std::vector<DenseMap<int64_t, int64_t>> &iterationIndexMap);
+
+ std::vector<std::pair<int64_t, int64_t>> getContractingDimMap();
+ std::vector<std::pair<int64_t, int64_t>> getBatchDimMap();
+ }];
+}
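As a worked complement to the matmul example in the description above, a hypothetical C++ construction through the first builder overload declared here might look as follows (the helper and all value names are assumptions for illustration):

```c++
#include "mlir/Dialect/Vector/VectorOps.h" // vector dialect ops as of this commit
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/AffineMap.h"
#include "mlir/IR/Builders.h"
using namespace mlir;

// Build the 2-D matmul contraction (i, j, k) from the description; lhs, rhs
// and acc are assumed to already hold vector<4x3xf32>, vector<3x7xf32> and
// vector<4x7xf32> values respectively.
static Value buildMatmulContract(OpBuilder &b, Location loc, Value lhs,
                                 Value rhs, Value acc) {
  MLIRContext *ctx = b.getContext();
  AffineExpr i = getAffineDimExpr(0, ctx), j = getAffineDimExpr(1, ctx),
             k = getAffineDimExpr(2, ctx);
  ArrayAttr maps = b.getAffineMapArrayAttr(
      {AffineMap::get(3, 0, {i, k}),    // lhs: (i, j, k) -> (i, k)
       AffineMap::get(3, 0, {k, j}),    // rhs: (i, j, k) -> (k, j)
       AffineMap::get(3, 0, {i, j})});  // acc: (i, j, k) -> (i, j)
  ArrayAttr iters = b.getStrArrayAttr({"parallel", "parallel", "reduction"});
  return b.create<vector::ContractionOp>(loc, lhs, rhs, acc, maps, iters);
}
```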
+
+def Vector_ReductionOp :
+ Vector_Op<"reduction", [NoSideEffect,
+ PredOpTrait<"source operand and result have same element type",
+ TCresVTEtIsSameAsOpBase<0, 0>>]>,
+ Arguments<(ins StrAttr:$kind, AnyVector:$vector, Variadic<AnyType>:$acc)>,
+ Results<(outs AnyType:$dest)> {
+ let summary = "reduction operation";
+ let description = [{
+ Reduces a 1-D vector "horizontally" into a scalar using the given
+ operation (add/mul/min/max for int/fp and and/or/xor for int only).
+ Some reductions (add/mul for fp) also allow an optional fused
+ accumulator.
+
+ Note that these operations are restricted to 1-D vectors to remain
+ close to the corresponding LLVM intrinsics:
+
+ http://llvm.org/docs/LangRef.html#experimental-vector-reduction-intrinsics
+
+ Examples:
+ ```
+ %1 = vector.reduction "add", %0 : vector<16xf32> into f32
+
+ %3 = vector.reduction "xor", %2 : vector<4xi32> into i32
+
+ %4 = vector.reduction "mul", %0, %1 : vector<16xf32> into f32
+ ```
+ }];
+ let extraClassDeclaration = [{
+ VectorType getVectorType() {
+ return vector().getType().cast<VectorType>();
+ }
+ }];
+}
+
+def Vector_BroadcastOp :
+ Vector_Op<"broadcast", [NoSideEffect,
+ PredOpTrait<"source operand and result have same element type",
+ TCresVTEtIsSameAsOpBase<0, 0>>]>,
+ Arguments<(ins AnyType:$source)>,
+ Results<(outs AnyVector:$vector)> {
+ let summary = "broadcast operation";
+ let description = [{
+ Broadcasts the scalar or k-D vector value in the source operand
+ to an n-D result vector such that the broadcast makes sense, i.e.,
+ the source operand is duplicated to match the given rank and sizes
+ in the result vector. The legality rules are:
+ * the source operand must have the same element type as the result type
+ * a k-D vector <s_1 x .. x s_k x type> can be broadcast to
+ a n-D vector <t_1 x .. x t_n x type> if
+ * k <= n, and
+ * the sizes in the trailing dimensions n-k < i <= n with j=i+k-n
+ match exactly as s_j = t_i or s_j = 1:
+ ```
+ t_1 x .. t_n-k x t_n-k+1 x .. x t_i x .. x t_n
+ s_1 x .. x s_j x .. x s_k
+ <duplication> <potential stretch>
+ ```
+ The source operand is duplicated over all the missing leading dimensions
+ and stretched over the trailing dimensions where the source has a non-equal
+ dimension of 1. These rules imply that any scalar broadcast (k=0) to any
+ shaped vector with the same element type is always legal.
+
+ Examples:
+ ```
+ %0 = constant 0.0 : f32
+ %1 = vector.broadcast %0 : f32 to vector<16xf32>
+ %2 = vector.broadcast %1 : vector<16xf32> to vector<4x16xf32>
+ ```
+ }];
+ let extraClassDeclaration = [{
+ Type getSourceType() { return source().getType(); }
+ VectorType getVectorType() {
+ return vector().getType().cast<VectorType>();
+ }
+ }];
+ let assemblyFormat = "$source attr-dict `:` type($source) `to` type($vector)";
+}
+
+def Vector_ShuffleOp :
+ Vector_Op<"shuffle", [NoSideEffect,
+ PredOpTrait<"first operand v1 and result have same element type",
+ TCresVTEtIsSameAsOpBase<0, 0>>,
+ PredOpTrait<"second operand v2 and result have same element type",
+ TCresVTEtIsSameAsOpBase<0, 1>>]>,
+ Arguments<(ins AnyVector:$v1, AnyVector:$v2, I64ArrayAttr:$mask)>,
+ Results<(outs AnyVector:$vector)> {
+ let summary = "shuffle operation";
+ let description = [{
+ The shuffle operation constructs a permutation (or duplication) of elements
+ from two input vectors, returning a vector with the same element type as
+ the input and a length that is the same as the shuffle mask. The two input
+ vectors must have the same element type, rank, and trailing dimension sizes
+ and shuffles their values in the leading dimension (which may differ in size)
+ according to the given mask. The legality rules are:
+ * the two operands must have the same element type as the result
+ * the two operands and the result must have the same rank and trailing
+ dimension sizes, viz. given two k-D operands
+ v1 : <s_1 x s_2 x .. x s_k x type> and
+ v2 : <t_1 x t_2 x .. x t_k x type>
+ we have s_i = t_i for all 1 < i <= k
+ * the mask length equals the leading dimension size of the result
+ * numbering the input vector indices left to right across the operands, all
+ mask values must be within range, viz. given two k-D operands v1 and v2
+ above, all mask values are in the range [0,s_1+t_1)
+
+ Examples:
+ ```
+ %0 = vector.shuffle %a, %b[0, 3]
+ : vector<2xf32>, vector<2xf32> ; yields vector<2xf32>
+ %1 = vector.shuffle %c, %b[0, 1, 2]
+ : vector<2x16xf32>, vector<1x16xf32> ; yields vector<3x16xf32>
+ %2 = vector.shuffle %a, %b[3, 2, 1, 0]
+ : vector<2xf32>, vector<2xf32> ; yields vector<4xf32>
+
+ ```
+ }];
+ let builders = [OpBuilder<"Builder *builder, OperationState &result,"
+ "Value v1, Value v2, ArrayRef<int64_t>">];
+ let extraClassDeclaration = [{
+ static StringRef getMaskAttrName() { return "mask"; }
+ VectorType getV1VectorType() {
+ return v1().getType().cast<VectorType>();
+ }
+ VectorType getV2VectorType() {
+ return v2().getType().cast<VectorType>();
+ }
+ VectorType getVectorType() {
+ return vector().getType().cast<VectorType>();
+ }
+ }];
+}
+
+def Vector_ExtractElementOp :
+ Vector_Op<"extractelement", [NoSideEffect,
+ TypesMatchWith<"result type matches element type of vector operand",
+ "vector", "result",
+ "$_self.cast<ShapedType>().getElementType()">]>,
+ Arguments<(ins AnyVector:$vector, AnySignlessInteger:$position)>,
+ Results<(outs AnyType:$result)> {
+ let summary = "extractelement operation";
+ let description = [{
+ Takes a 1-D vector and a dynamic index position and extracts the
+ scalar at that position. Note that this instruction resembles
+ vector.extract, but is restricted to 1-D vectors and relaxed
+ to dynamic indices. It is meant to be closer to LLVM's version:
+ https://llvm.org/docs/LangRef.html#extractelement-instruction
+
+ Example:
+ ```
+ %c = constant 15 : i32
+ %1 = vector.extractelement %0[%c : i32]: vector<16xf32>
+ ```
+ }];
+ let extraClassDeclaration = [{
+ VectorType getVectorType() {
+ return vector().getType().cast<VectorType>();
+ }
+ }];
+
+ let assemblyFormat = [{
+ $vector `[` $position `:` type($position) `]` attr-dict `:` type($vector)
+ }];
+}
+
+def Vector_ExtractOp :
+ Vector_Op<"extract", [NoSideEffect,
+ PredOpTrait<"operand and result have same element type",
+ TCresVTEtIsSameAsOpBase<0, 0>>]>,
+ Arguments<(ins AnyVector:$vector, I64ArrayAttr:$position)>,
+ Results<(outs AnyType)> {
+ let summary = "extract operation";
+ let description = [{
+ Takes an n-D vector and a k-D position and extracts the (n-k)-D vector at
+ the proper position. Degenerates to an element type in the 0-D case.
+
+ Examples:
+ ```
+ %1 = vector.extract %0[3]: vector<4x8x16xf32>
+ %2 = vector.extract %0[3, 3, 3]: vector<4x8x16xf32>
+ ```
+ }];
+ let builders = [OpBuilder<
+ "Builder *builder, OperationState &result, Value source,"
+ "ArrayRef<int64_t>">];
+ let extraClassDeclaration = [{
+ static StringRef getPositionAttrName() { return "position"; }
+ VectorType getVectorType() {
+ return vector().getType().cast<VectorType>();
+ }
+ }];
+}
+
+def Vector_ExtractSlicesOp :
+ Vector_Op<"extract_slices", [NoSideEffect]>,
+ Arguments<(ins AnyVector:$vector, I64ArrayAttr:$sizes,
+ I64ArrayAttr:$strides)>,
+ Results<(outs TupleOf<[AnyVector]>)> {
+ let summary = "vector extract slices operation";
+ let description = [{
+ Takes an N-d vector and returns a tuple of vector slices of 'vector',
+ based on 'sizes' and 'strides' parameters.
+
+ The arguments 'sizes' and 'strides' represent a specification for
+ generating the unrolling of 'vector' shape, which has all slices of shape
+ 'sizes' except for slices at dimension boundaries when 'vector' dimension
+ sizes are not a multiple of 'sizes'.
+
+ Each slice is returned at the tuple element index corresponding to the
+ linear index of the slice w.r.t the unrolling scheme represented by 'sizes'.
+ Currently, only unit strides are supported.
+
+ Examples:
+ ```
+ %0 = vector.transfer_read ...: vector<4x2xf32>
+
+ %1 = vector.extract_slices %0, [2, 2], [1, 1]
+ : vector<4x2xf32> into tuple<vector<2x2xf32>, vector<2x2xf32>>
+
+ // Example with partial slices at dimension boundaries.
+ %2 = vector.transfer_read ...: vector<4x3xf32>
+
+ %3 = vector.extract_slices %2, [2, 2], [1, 1]
+ : vector<4x3xf32> into tuple<vector<2x2xf32>, vector<2x1xf32>,
+ vector<2x2xf32>, vector<2x1xf32>>
+ ```
+ }];
+ let builders = [OpBuilder<
+ "Builder *builder, OperationState &result, TupleType tupleType, " #
+ "Value vector, ArrayRef<int64_t> sizes, " #
+ "ArrayRef<int64_t> strides">];
+ let extraClassDeclaration = [{
+ VectorType getSourceVectorType() {
+ return vector().getType().cast<VectorType>();
+ }
+ TupleType getResultTupleType() {
+ return getResult().getType().cast<TupleType>();
+ }
+ void getSizes(SmallVectorImpl<int64_t> &results);
+ void getStrides(SmallVectorImpl<int64_t> &results);
+ static StringRef getSizesAttrName() { return "sizes"; }
+ static StringRef getStridesAttrName() { return "strides"; }
+ }];
+ let assemblyFormat = [{
+ $vector `,` $sizes `,` $strides attr-dict `:` type($vector) `into`
+ type(results)
+ }];
+}
+
+def Vector_FMAOp :
+ Op<VectorOps_Dialect, "fma", [NoSideEffect,
+ AllTypesMatch<["lhs", "rhs", "acc", "result"]>]>,
+ Arguments<(ins AnyVector:$lhs, AnyVector:$rhs, AnyVector:$acc)>,
+ Results<(outs AnyVector:$result)> {
+ let summary = "vector fused multiply-add";
+ let description = [{
+ Multiply-add expressions operate on n-D vectors and compute a fused
+ pointwise multiply-and-accumulate: `$result = $lhs * $rhs + $acc`.
+ All operands and result have the same vector type. The semantics
+ of the operation correspond to those of the `llvm.fma`
+ [intrinsic](https://llvm.org/docs/LangRef.html#int-fma). In the
+ particular case of lowering to LLVM, this is guaranteed to lower
+ to the `llvm.fma.*` intrinsic.
+
+ Example:
+
+ ```
+ %3 = vector.fma %0, %1, %2: vector<8x16xf32>
+ ```
+ }];
+ // Fully specified by traits.
+ let verifier = ?;
+ let assemblyFormat = "$lhs `,` $rhs `,` $acc attr-dict `:` type($lhs)";
+ let builders = [OpBuilder<
+ "Builder *b, OperationState &result, Value lhs, Value rhs, Value acc",
+ "build(b, result, lhs.getType(), lhs, rhs, acc);">];
+ let extraClassDeclaration = [{
+ VectorType getVectorType() { return lhs().getType().cast<VectorType>(); }
+ }];
+}
+
+def Vector_InsertElementOp :
+ Vector_Op<"insertelement", [NoSideEffect,
+ TypesMatchWith<"source operand type matches element type of result",
+ "result", "source",
+ "$_self.cast<ShapedType>().getElementType()">,
+ AllTypesMatch<["dest", "result"]>]>,
+ Arguments<(ins AnyType:$source, AnyVector:$dest,
+ AnySignlessInteger:$position)>,
+ Results<(outs AnyVector:$result)> {
+ let summary = "insertelement operation";
+ let description = [{
+ Takes a scalar source, a 1-D destination vector and a dynamic index
+ position and inserts the source into the destination at the proper
+ position. Note that this instruction resembles vector.insert, but
+ is restricted to 1-D vectors and relaxed to dynamic indices. It is
+ meant to be closer to LLVM's version:
+ https://llvm.org/docs/LangRef.html#insertelement-instruction
+
+ Example:
+ ```
+ %c = constant 15 : i32
+ %f = constant 0.0f : f32
+ %1 = vector.insertelement %f, %0[%c : i32]: vector<16xf32>
+ ```
+ }];
+ let extraClassDeclaration = [{
+ Type getSourceType() { return source().getType(); }
+ VectorType getDestVectorType() {
+ return dest().getType().cast<VectorType>();
+ }
+ }];
+
+ let assemblyFormat = [{
+ $source `,` $dest `[` $position `:` type($position) `]` attr-dict `:`
+ type($result)
+ }];
+}
+
+def Vector_InsertOp :
+ Vector_Op<"insert", [NoSideEffect,
+ PredOpTrait<"source operand and result have same element type",
+ TCresVTEtIsSameAsOpBase<0, 0>>,
+ AllTypesMatch<["dest", "res"]>]>,
+ Arguments<(ins AnyType:$source, AnyVector:$dest, I64ArrayAttr:$position)>,
+ Results<(outs AnyVector:$res)> {
+ let summary = "insert operation";
+ let description = [{
+ Takes an n-D source vector, an (n+k)-D destination vector and a k-D position
+ and inserts the n-D source into the (n+k)-D destination at the proper
+ position. Degenerates to a scalar source type when n = 0.
+
+ Examples:
+ ```
+ %2 = vector.insert %0, %1[3]:
+ vector<8x16xf32> into vector<4x8x16xf32>
+ %5 = vector.insert %3, %4[3, 3, 3]:
+ f32 into vector<4x8x16xf32>
+ ```
+ }];
+ let assemblyFormat = [{
+ $source `,` $dest $position attr-dict `:` type($source) `into` type($dest)
+ }];
+
+ let builders = [OpBuilder<
+ "Builder *builder, OperationState &result, Value source, " #
+ "Value dest, ArrayRef<int64_t>">];
+ let extraClassDeclaration = [{
+ static StringRef getPositionAttrName() { return "position"; }
+ Type getSourceType() { return source().getType(); }
+ VectorType getDestVectorType() {
+ return dest().getType().cast<VectorType>();
+ }
+ }];
+}
+
+def Vector_InsertSlicesOp :
+ Vector_Op<"insert_slices", [NoSideEffect]>,
+ Arguments<(ins TupleOf<[AnyVector]>:$vectors, I64ArrayAttr:$sizes,
+ I64ArrayAttr:$strides)>,
+ Results<(outs AnyVector)> {
+ let summary = "vector insert slices operation";
+ let description = [{
+ Takes a tuple of vector slices and inserts them into the vector result
+ according to the 'sizes' and 'strides' parameters.
+
+ The arguments 'sizes' and 'strides' represent a specification for
+ generating the unrolling of 'vector' shape, which has all slices of shape
+ 'sizes' except for slices at dimension boundaries when 'vector' dimension
+ sizes are not a multiple of 'sizes'.
+
+ Each slice in 'vectors' is at the tuple element index corresponding to the
+ linear index of the slice w.r.t the unrolling scheme represented by 'sizes'.
+ Currently, only unit strides are supported.
+
+ Examples:
+ ```
+ %0 = vector.extract_slices %0, [2, 2], [1, 1]
+ : vector<4x2xf32> into tuple<vector<2x2xf32>, vector<2x2xf32>>
+
+ %1 = vector.insert_slices %0, [2, 2], [1, 1]
+ : tuple<vector<2x2xf32>, vector<2x2xf32>> into vector<4x2xf32>
+
+ // Example with partial slices at dimension boundaries.
+ %3 = vector.extract_slices %2, [2, 2], [1, 1]
+ : vector<4x3xf32> into tuple<vector<2x2xf32>, vector<2x1xf32>,
+ vector<2x2xf32>, vector<2x1xf32>>
+
+ %4 = vector.insert_slices %3, [2, 2], [1, 1]
+ : tuple<vector<2x2xf32>, vector<2x1xf32>,
+ vector<2x2xf32>, vector<2x1xf32>> into vector<4x3xf32>
+ ```
+ }];
+
+ let extraClassDeclaration = [{
+ TupleType getSourceTupleType() {
+ return vectors().getType().cast<TupleType>();
+ }
+ VectorType getResultVectorType() {
+ return getResult().getType().cast<VectorType>();
+ }
+ void getSizes(SmallVectorImpl<int64_t> &results);
+ void getStrides(SmallVectorImpl<int64_t> &results);
+ static StringRef getSizesAttrName() { return "sizes"; }
+ static StringRef getStridesAttrName() { return "strides"; }
+ }];
+ let assemblyFormat = [{
+ $vectors `,` $sizes `,` $strides attr-dict `:` type($vectors) `into`
+ type(results)
+ }];
+}
+
+def Vector_InsertStridedSliceOp :
+ Vector_Op<"insert_strided_slice", [NoSideEffect,
+ PredOpTrait<"operand #0 and result have same element type",
+ TCresVTEtIsSameAsOpBase<0, 0>>,
+ AllTypesMatch<["dest", "res"]>]>,
+ Arguments<(ins AnyVector:$source, AnyVector:$dest, I64ArrayAttr:$offsets,
+ I64ArrayAttr:$strides)>,
+ Results<(outs AnyVector:$res)> {
+ let summary = "strided_slice operation";
+ let description = [{
+ Takes a k-D source vector, an n-D destination vector (n >= k), an n-sized
+ `offsets` integer array attribute, a k-sized `strides` integer array attribute
+ and inserts the k-D source vector as a strided subvector at the proper offset
+ into the n-D destination vector.
+
+ At the moment strides must contain only 1s.
+
+ Returns an n-D vector that is a copy of the n-D destination vector in which
+ the last k-D dimensions contain the k-D source vector elements strided at
+ the proper location as specified by the offsets.
+
+ Examples:
+ ```
+ %2 = vector.insert_strided_slice %0, %1
+ {offsets = [0, 0, 2], strides = [1, 1]}:
+ vector<2x4xf32> into vector<16x4x8xf32>
+ ```
+ }];
+
+ let assemblyFormat = [{
+ $source `,` $dest attr-dict `:` type($source) `into` type($dest)
+ }];
+
+ let builders = [OpBuilder<
+ "Builder *builder, OperationState &result, Value source, Value dest, " #
+ "ArrayRef<int64_t> offsets, ArrayRef<int64_t> strides">];
+ let extraClassDeclaration = [{
+ static StringRef getOffsetsAttrName() { return "offsets"; }
+ static StringRef getStridesAttrName() { return "strides"; }
+ VectorType getSourceVectorType() {
+ return source().getType().cast<VectorType>();
+ }
+ VectorType getDestVectorType() {
+ return dest().getType().cast<VectorType>();
+ }
+ }];
+}
+
+def Vector_OuterProductOp :
+ Vector_Op<"outerproduct", [NoSideEffect, SameOperandsAndResultElementType]>,
+ Arguments<(ins AnyVector:$lhs, AnyVector:$rhs, Variadic<AnyVector>:$acc)>,
+ Results<(outs AnyVector)> {
+ let summary = "vector outerproduct with optional fused add";
+ let description = [{
+ Takes two 1-D vectors and returns the 2-D vector containing the outer-product.
+
+ An optional extra 2-D vector argument may be specified in which case the
+ operation returns the sum of the outer-product and the extra vector. In this
+ multiply-accumulate scenario, the rounding mode is that obtained by
+ guaranteeing that a fused-multiply add operation is emitted. When lowered to
+ the LLVMIR dialect, this form emits `llvm.intr.fma`, which is guaranteed to
+ lower to actual `fma` instructions on x86.
+
+ Examples:
+
+ ```
+ %2 = vector.outerproduct %0, %1: vector<4xf32>, vector<8xf32>
+ return %2: vector<4x8xf32>
+
+ %3 = vector.outerproduct %0, %1, %2:
+ vector<4xf32>, vector<8xf32>, vector<4x8xf32>
+ return %3: vector<4x8xf32>
+ ```
+ }];
+ let extraClassDeclaration = [{
+ VectorType getOperandVectorTypeLHS() {
+ return lhs().getType().cast<VectorType>();
+ }
+ VectorType getOperandVectorTypeRHS() {
+ return rhs().getType().cast<VectorType>();
+ }
+ VectorType getOperandVectorTypeACC() {
+ return (llvm::size(acc()) == 0) ? VectorType() :
+ (*acc().begin()).getType().cast<VectorType>();
+ }
+ VectorType getVectorType() {
+ return getResult().getType().cast<VectorType>();
+ }
+ }];
+}
+
+// TODO(andydavis) Add transformation which decomposes ReshapeOp into an
+// optimized sequence of vector rotate/shuffle/select operations.
+def Vector_ReshapeOp :
+ Vector_Op<"reshape", [AttrSizedOperandSegments, NoSideEffect]>,
+ Arguments<(ins AnyVector:$vector, Variadic<Index>:$input_shape,
+ Variadic<Index>:$output_shape,
+ I64ArrayAttr:$fixed_vector_sizes)>,
+ Results<(outs AnyVector:$result)> {
+ let summary = "vector reshape operation";
+ let description = [{
+ Reshapes its vector operand from 'input_shape' to 'output_shape' maintaining
+ fixed vector dimension 'fixed_vector_sizes' on the innermost vector
+ dimensions.
+
+ The parameters 'input_shape' and 'output_shape' represent valid data shapes
+ across fixed vector shapes. For example, if a vector has a valid data
+ shape [6] with fixed vector size [8], then the valid data elements are
+ assumed to be stored at the beginning of the vector with the remaining
+ vector elements undefined.
+
+ In the examples below, valid data elements are represented by an alphabetic
+ character, and undefined data elements are represented by '-'.
+
+ Example
+
+ vector<1x8xf32> with valid data shape [6], fixed vector sizes [8]
+
+ input: [a, b, c, d, e, f]
+
+ layout map: (d0) -> (d0 floordiv 8, d0 mod 8)
+
+ vector layout: [a, b, c, d, e, f, -, -]
+
+ Example
+
+ vector<2x8xf32> with valid data shape [10], fixed vector sizes [8]
+
+ input: [a, b, c, d, e, f, g, h, i, j]
+
+ layout map: (d0) -> (d0 floordiv 8, d0 mod 8)
+
+ vector layout: [[a, b, c, d, e, f, g, h],
+ [i, j, -, -, -, -, -, -]]
+
+ Example
+
+ vector<2x2x2x3xf32> with valid data shape [3, 5], fixed vector sizes
+ [2, 3]
+
+ input: [[a, b, c, d, e],
+ [f, g, h, i, j],
+ [k, l, m, n, o]]
+
+ layout map: (d0, d1) -> (d0 floordiv 3, d1 floordiv 5,
+ d0 mod 3, d1 mod 5)
+
+ vector layout: [[[[a, b, c],
+ [f, g, h]]
+ [[d, e, -],
+ [i, j, -]]],
+ [[[k, l, m],
+ [-, -, -]]
+ [[n, o, -],
+ [-, -, -]]]]
+
+ Example
+
+ %1 = vector.reshape %0, [%c3, %c6], [%c2, %c9], [4]
+ : vector<3x2x4xf32> to vector<2x3x4xf32>
+
+ input: [[a, b, c, d, e, f],
+ [g, h, i, j, k, l],
+ [m, n, o, p, q, r]]
+
+ layout map: (d0, d1) -> (d0, d1 floordiv 4, d1 mod 4)
+
+
+ Input vector: [[[a, b, c, d],
+ [e, f, -, -]],
+ [[g, h, i, j],
+ [k, l, -, -]],
+ [[m, n, o, p],
+ [q, r, -, -]]]
+
+ Output vector: [[[a, b, c, d],
+ [e, f, g, h],
+ [i, -, -, -]],
+ [[j, k, l, m],
+ [n, o, p, q],
+ [r, -, -, -]]]
+ }];
+
+ let extraClassDeclaration = [{
+ VectorType getInputVectorType() {
+ return vector().getType().cast<VectorType>();
+ }
+ VectorType getOutputVectorType() {
+ return getResult().getType().cast<VectorType>();
+ }
+
+ /// Returns as integer value the number of input shape operands.
+ int64_t getNumInputShapeSizes() { return input_shape().size(); }
+
+ /// Returns as integer value the number of output shape operands.
+ int64_t getNumOutputShapeSizes() { return output_shape().size(); }
+
+ void getFixedVectorSizes(SmallVectorImpl<int64_t> &results);
+
+ static StringRef getFixedVectorSizesAttrName() {
+ return "fixed_vector_sizes";
+ }
+ static StringRef getInputShapeAttrName() { return "input_shape"; }
+ static StringRef getOutputShapeAttrName() { return "output_shape"; }
+ }];
+
+ let assemblyFormat = [{
+ $vector `,` `[` $input_shape `]` `,` `[` $output_shape `]` `,`
+ $fixed_vector_sizes attr-dict `:` type($vector) `to` type($result)
+ }];
+}
+
+def Vector_StridedSliceOp :
+ Vector_Op<"strided_slice", [NoSideEffect,
+ PredOpTrait<"operand and result have same element type",
+ TCresVTEtIsSameAsOpBase<0, 0>>]>,
+ Arguments<(ins AnyVector:$vector, I64ArrayAttr:$offsets,
+ I64ArrayAttr:$sizes, I64ArrayAttr:$strides)>,
+ Results<(outs AnyVector)> {
+ let summary = "strided_slice operation";
+ let description = [{
+ Takes an n-D vector, k-D `offsets` integer array attribute, a k-sized
+ `sizes` integer array attribute, a k-sized `strides` integer array
+ attribute and extracts the n-D subvector at the proper offset.
+
+ At the moment strides must contain only 1s.
+ // TODO(ntv) support non-1 strides.
+
+ Returns an n-D vector where the first k-D dimensions match the `sizes`
+ attribute. The returned subvector contains the elements starting at offset
+ `offsets` and ending at `offsets + sizes`.
+
+ Examples:
+ ```
+ %1 = vector.strided_slice %0
+ {offsets = [0, 2], sizes = [2, 4], strides = [1, 1]}:
+ vector<4x8x16xf32> to vector<2x4x16xf32>
+ ```
+
+ // TODO(ntv) Evolve to a range form syntax similar to:
+ %1 = vector.strided_slice %0[0:2:1][2:4:1]
+ vector<4x8x16xf32> to vector<2x4x16xf32>
+ }];
+ let builders = [OpBuilder<
+ "Builder *builder, OperationState &result, Value source, " #
+ "ArrayRef<int64_t> offsets, ArrayRef<int64_t> sizes, " #
+ "ArrayRef<int64_t> strides">];
+ let extraClassDeclaration = [{
+ static StringRef getOffsetsAttrName() { return "offsets"; }
+ static StringRef getSizesAttrName() { return "sizes"; }
+ static StringRef getStridesAttrName() { return "strides"; }
+ VectorType getVectorType(){ return vector().getType().cast<VectorType>(); }
+ void getOffsets(SmallVectorImpl<int64_t> &results);
+ }];
+ let hasCanonicalizer = 1;
+ let assemblyFormat = "$vector attr-dict `:` type($vector) `to` type(results)";
+}
+
+def Vector_TransferReadOp :
+ Vector_Op<"transfer_read">,
+ Arguments<(ins AnyMemRef:$memref, Variadic<Index>:$indices,
+ AffineMapAttr:$permutation_map, AnyType:$padding)>,
+ Results<(outs AnyVector:$vector)> {
+
+ let summary = "Reads a supervector from memory into an SSA vector value.";
+
+ let description = [{
+ The `vector.transfer_read` op performs a blocking read from a slice within
+ a [MemRef](../LangRef.md#memref-type) supplied as its first operand
+ into a [vector](../LangRef.md#vector-type) of the same base elemental type.
+
+ A memref operand with vector element type must have its vector element
+ type match a suffix (shape and element type) of the vector (e.g.
+ memref<3x2x6x4x3xf32>, vector<1x1x4x3xf32>).
+
+ The slice is further defined by a full-rank index within the MemRef,
+ supplied as the operands `2 .. 1 + rank(memref)`. The permutation_map
+ [attribute](../LangRef.md#attributes) is an
+ [affine-map](Affine.md#affine-maps) which specifies the transposition on the
+ slice to match the vector shape. The size of the slice is specified by the
+ size of the vector, given as the return type. An `ssa-value` of the same
+ elemental type as the MemRef is provided as the last operand to specify
+ padding in the case of out-of-bounds accesses. This operation is called
+ 'read', as opposed to 'load', because the super-vector granularity is
+ generally not representable with a single hardware register.
+ A `vector.transfer_read` is thus a mid-level
+ abstraction that supports super-vectorization with non-effecting padding for
+ full-tile-only code.
+
+ More precisely, let's dive deeper into the permutation_map for the following
+ MLIR:
+
+ ```mlir
+ vector.transfer_read %A[%expr1, %expr2, %expr3, %expr4]
+ { permutation_map : (d0,d1,d2,d3) -> (d2,0,d0) } :
+ memref<?x?x?x?xf32>, vector<3x4x5xf32>
+ ```
+
+ This operation always reads a slice starting at `%A[%expr1, %expr2, %expr3,
+ %expr4]`. The size of the slice is 3 along d2 and 5 along d0, so the slice
+ is: `%A[%expr1 : %expr1 + 5, %expr2, %expr3:%expr3 + 3, %expr4]`
+
+ That slice needs to be read into a `vector<3x4x5xf32>`. Since the
+ permutation map is not full rank, there must be a broadcast along vector
+ dimension `1`.
+
+ A notional lowering of vector.transfer_read could generate code resembling:
+
+ ```mlir
+ // %expr1, %expr2, %expr3, %expr4 defined before this point
+ %tmp = alloc() : vector<3x4x5xf32>
+ %view_in_tmp = "element_type_cast"(%tmp) : memref<1xvector<3x4x5xf32>>
+ for %i = 0 to 3 {
+ affine.for %j = 0 to 4 {
+ affine.for %k = 0 to 5 {
+ %a = load %A[%expr1 + %k, %expr2, %expr3 + %i, %expr4] :
+ memref<?x?x?x?xf32>
+        store %a, %tmp[%i, %j, %k] : vector<3x4x5xf32>
+ }}}
+ %c0 = constant 0 : index
+ %vec = load %view_in_tmp[%c0] : vector<3x4x5xf32>
+ ```
+
+    On a GPU one could then map `i`, `j`, `k` to blocks and threads. Notice
+    that only `3 * 5` distinct values are read from `%A`, yet `3 * 4 * 5`
+    values are actually transferred between `%A` and `%tmp`.
+
+ Alternatively, if a notional vector broadcast operation were available, the
+ lowered code would resemble:
+
+ ```mlir
+ // %expr1, %expr2, %expr3, %expr4 defined before this point
+ %tmp = alloc() : vector<3x4x5xf32>
+ %view_in_tmp = "element_type_cast"(%tmp) : memref<1xvector<3x4x5xf32>>
+ for %i = 0 to 3 {
+ affine.for %k = 0 to 5 {
+ %a = load %A[%expr1 + %k, %expr2, %expr3 + %i, %expr4] :
+ memref<?x?x?x?xf32>
+        store %a, %tmp[%i, 0, %k] : vector<3x4x5xf32>
+ }}
+ %c0 = constant 0 : index
+ %tmpvec = load %view_in_tmp[%c0] : vector<3x4x5xf32>
+ %vec = broadcast %tmpvec, 1 : vector<3x4x5xf32>
+ ```
+
+    where `broadcast` broadcasts from element 0 to all others along the
+    specified dimension. This time, only `3 * 5` values are written to
+    temporary storage, matching the `3 * 5` values transferred. An additional
+    broadcast along dimension `1` is required. On a GPU this broadcast could
+    be implemented using a warp-shuffle if loop `j` were mapped to
+    `threadIdx.x`.
+
+    Syntax:
+    ```
+    operation ::= ssa-id `=` `vector.transfer_read` ssa-use-list
+                  `{` attribute-entry `}` `:` memref-type `,` vector-type
+    ```
+
+ Examples:
+
+ ```mlir
+ // Read the slice `%A[%i0, %i1:%i1+256, %i2:%i2+32]` into vector<32x256xf32>
+ // and pad with %f0 to handle the boundary case:
+      %f0 = constant 0.0 : f32
+ for %i0 = 0 to %0 {
+ affine.for %i1 = 0 to %1 step 256 {
+ affine.for %i2 = 0 to %2 step 32 {
+          %v = vector.transfer_read %A[%i0, %i1, %i2], %f0
+ {permutation_map: (d0, d1, d2) -> (d2, d1)} :
+ memref<?x?x?xf32>, vector<32x256xf32>
+ }}}
+
+ // Read the slice `%A[%i0, %i1]` (i.e. the element `%A[%i0, %i1]`) into
+ // vector<128xf32>. The underlying implementation will require a 1-D vector
+ // broadcast:
+ for %i0 = 0 to %0 {
+ affine.for %i1 = 0 to %1 {
+ %3 = vector.transfer_read %A[%i0, %i1]
+ {permutation_map: (d0, d1) -> (0)} :
+ memref<?x?xf32>, vector<128xf32>
+ }
+ }
+
+ // Read from a memref with vector element type.
+ %4 = vector.transfer_read %arg1[%c3, %c3], %vf0
+ {permutation_map = (d0, d1)->(d0, d1)}
+ : memref<?x?xvector<4x3xf32>>, vector<1x1x4x3xf32>
+ ```
+ }];
+
+ let extraClassDeclaration = [{
+ MemRefType getMemRefType() {
+ return memref().getType().cast<MemRefType>();
+ }
+ VectorType getVectorType() {
+ return vector().getType().cast<VectorType>();
+ }
+ }];
+}
+
+def Vector_TransferWriteOp :
+ Vector_Op<"transfer_write">,
+ Arguments<(ins AnyVector:$vector, AnyMemRef:$memref,
+ Variadic<Index>:$indices,
+ AffineMapAttr:$permutation_map)> {
+
+ let summary = "The vector.transfer_write op writes a supervector to memory.";
+
+ let description = [{
+    The `vector.transfer_write` op performs a blocking write from a
+ [vector](../LangRef.md#vector-type), supplied as its first operand, into a
+ slice within a [MemRef](../LangRef.md#memref-type) of the same base
+ elemental type, supplied as its second operand.
+
+    A memref operand with vector element type must have its vector element
+    type match a suffix (shape and element type) of the vector (e.g.
+    memref<3x2x6xvector<4x3xf32>>, vector<1x1x4x3xf32>).
+
+ The slice is further defined by a full-rank index within the MemRef,
+ supplied as the operands `3 .. 2 + rank(memref)`.
+ The permutation_map [attribute](../LangRef.md#attributes) is an
+ [affine-map](Affine.md#affine-maps) which specifies the transposition on the
+ slice to match the vector shape. The size of the slice is specified by the
+    size of the vector. This operation is called 'write' as opposed to
+    'store' because the super-vector granularity is generally not
+    representable with a single hardware register. A `vector.transfer_write`
+    is thus a
+ mid-level abstraction that supports super-vectorization with non-effecting
+ padding for full-tile-only code. It is the responsibility of
+ `vector.transfer_write`'s implementation to ensure the memory writes are
+ valid. Different lowerings may be pertinent depending on the hardware
+ support.
+
+ Syntax:
+
+ ```
+    operation ::= `vector.transfer_write` ssa-use-list `{` attribute-entry
+                  `}` `:` vector-type `,` memref-type
+ ```
+
+ Examples:
+
+ ```mlir
+    // Write vector<16x32x64xf32> into the slice
+ // `%A[%i0, %i1:%i1+32, %i2:%i2+64, %i3:%i3+16]`:
+ for %i0 = 0 to %0 {
+ affine.for %i1 = 0 to %1 step 32 {
+ affine.for %i2 = 0 to %2 step 64 {
+ affine.for %i3 = 0 to %3 step 16 {
+ %val = `ssa-value` : vector<16x32x64xf32>
+ vector.transfer_write %val, %A[%i0, %i1, %i2, %i3]
+ {permutation_map: (d0, d1, d2, d3) -> (d3, d1, d2)} :
+ vector<16x32x64xf32>, memref<?x?x?x?xf32>
+ }}}}
+
+    // Write to a memref with vector element type.
+ vector.transfer_write %4, %arg1[%c3, %c3]
+ {permutation_map = (d0, d1)->(d0, d1)}
+ : vector<1x1x4x3xf32>, memref<?x?xvector<4x3xf32>>
+ ```
+ }];
+
+ let extraClassDeclaration = [{
+ VectorType getVectorType() {
+ return vector().getType().cast<VectorType>();
+ }
+ MemRefType getMemRefType() {
+ return memref().getType().cast<MemRefType>();
+ }
+ }];
+ let assemblyFormat = [{
+ $vector `,` $memref `[` $indices `]` attr-dict `:` type($vector) `,`
+ type($memref)
+ }];
+}
+
+def Vector_ShapeCastOp :
+ Vector_Op<"shape_cast", [NoSideEffect]>,
+ Arguments<(ins AnyTypeOf<[AnyVector, TupleOf<[AnyVector]>]>:$source)>,
+ Results<(outs AnyTypeOf<[AnyVector, TupleOf<[AnyVector]>]>:$result)> {
+ let summary = "shape_cast casts between vector shapes";
+ let description = [{
+ The shape_cast operation casts between an n-D source vector shape and
+ a k-D result vector shape (the element type remains the same).
+
+ If reducing rank (n > k), result dimension sizes must be a product
+ of contiguous source dimension sizes.
+ If expanding rank (n < k), source dimensions must factor into a
+ contiguous sequence of destination dimension sizes.
+    Each source dimension is expanded (or each contiguous sequence of source
+    dimensions is combined) in source dimension list order (i.e. 0 <= i < n),
+    to produce a contiguous sequence of result dimensions (or a single result
+    dimension), in result dimension list order (i.e. 0 <= j < k). The product
+    of all source dimension sizes must match the product of all result
+    dimension sizes.
+
+ If the source/result types are a tuple of vectors, the casting operation
+ described above is applied to each source/result tuple element pair.
+
+ It is currently assumed that this operation does not require moving data,
+ and that it will be folded away before lowering vector operations.
+
+    There is an exception to the folding expectation when targeting
+    llvm.intr.matrix operations, which need a type conversion back and forth
+    between a 2-D MLIR vector and a 1-D flattened LLVM vector. In that
+    particular case, shape_cast lowering to LLVM is supported for now.
+
+ Examples:
+
+ ```mlir
+ // Example casting to a lower vector rank.
+ %1 = vector.shape_cast %0 : vector<5x1x4x3xf32> to vector<20x3xf32>
+
+ // Example casting to a higher vector rank.
+ %3 = vector.shape_cast %2 : vector<10x12x8xf32> to vector<5x2x3x4x8xf32>
+
+    // Example casting a tuple of vectors of same rank, where tuple elements
+    // may have different shapes.
+ %5 = vector.shape_cast %4 : tuple<vector<3x4x2xf32>, vector<3x3x2xf32>> to
+ tuple<vector<12x2xf32>, vector<9x2xf32>>
+ ```
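+
+    As a sketch of the contiguity rule above, the following rank-reducing
+    cast would be invalid even though both shapes contain 24 elements,
+    because 8 is not a product of contiguous source dimension sizes:
+
+    ```mlir
+    // Invalid: 2x3x4 cannot be regrouped into 8x3.
+    %bad = vector.shape_cast %v : vector<2x3x4xf32> to vector<8x3xf32>
+    ```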
+ }];
+ let extraClassDeclaration = [{
+ VectorType getSourceVectorType() {
+ return source().getType().cast<VectorType>();
+ }
+ VectorType getResultVectorType() {
+ return getResult().getType().cast<VectorType>();
+ }
+ }];
+ let assemblyFormat = "$source attr-dict `:` type($source) `to` type($result)";
+}
+
+def Vector_TypeCastOp :
+ Vector_Op<"type_cast", [NoSideEffect]>,
+ Arguments<(ins StaticShapeMemRefOf<[AnyType]>:$memref)>,
+ Results<(outs AnyMemRef)> {
+ let summary = "type_cast op converts a scalar memref to a vector memref";
+ let description = [{
+    Performs a conversion from a memref with scalar element to a memref with
+    a *single* vector element, copying the shape of the memref to the vector.
+    This is the minimal viable operation that is required to make
+    super-vectorization operational. It can be seen as a special case of the
+    `view` operation but scoped in the super-vectorization context.
+
+ Syntax:
+
+ ```
+ operation ::= `vector.type_cast` ssa-use : memref-type to memref-type
+ ```
+
+ Example:
+
+ ```mlir
+ %A = alloc() : memref<5x4x3xf32>
+ %VA = vector.type_cast %A : memref<5x4x3xf32> to memref<vector<5x4x3xf32>>
+ ```
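+
+    A typical follow-up (a sketch, assuming a standard-dialect `load` from a
+    0-D memref) reads the single vector element back:
+
+    ```mlir
+    %v = load %VA[] : memref<vector<5x4x3xf32>>
+    ```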
+ }];
+
+ let builders = [OpBuilder<
+ "Builder *builder, OperationState &result, Value source">];
+
+ let parser = [{
+ return impl::parseCastOp(parser, result);
+ }];
+
+ let extraClassDeclaration = [{
+ MemRefType getMemRefType() {
+ return memref().getType().cast<MemRefType>();
+ }
+ MemRefType getResultMemRefType() {
+ return getResult().getType().cast<MemRefType>();
+ }
+ }];
+}
+
+def Vector_ConstantMaskOp :
+ Vector_Op<"constant_mask", [NoSideEffect]>,
+ Arguments<(ins I64ArrayAttr:$mask_dim_sizes)>,
+ Results<(outs VectorOf<[I1]>)> {
+ let summary = "creates a constant vector mask";
+ let description = [{
+ Creates and returns a vector mask where elements of the result vector
+ are set to '0' or '1', based on whether the element indices are contained
+ within a hyper-rectangular region specified by the 'mask_dim_sizes'
+    within a hyper-rectangular region specified by the 'mask_dim_sizes'
+    array attribute argument. Each element of the 'mask_dim_sizes' array
+    specifies an exclusive upper bound [0, mask-dim-size-element-value)
+    for a unique dimension in the vector result. The conjunction of the
+    ranges defines a hyper-rectangular region within which element values
+    are set to 1 (otherwise element values are set to 0).
+
+    Example: create a constant vector mask of size 4x3xi1 where elements in
+    the ranges 0 <= row <= 2 and 0 <= col <= 1 are set to 1 (others to 0).
+
+    ```
+    %1 = vector.constant_mask [3, 2] : vector<4x3xi1>
+
+    print %1
+                  columns
+                0    1    2
+              |------------
+          0   | 1    1    0
+    rows  1   | 1    1    0
+          2   | 1    1    0
+          3   | 0    0    0
+    ```
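+
+    A minimal 1-D sketch:
+
+    ```
+    // Yields [1, 1, 0, 0].
+    %2 = vector.constant_mask [2] : vector<4xi1>
+    ```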
+ }];
+
+ let extraClassDeclaration = [{
+ static StringRef getMaskDimSizesAttrName() { return "mask_dim_sizes"; }
+ }];
+ let assemblyFormat = "$mask_dim_sizes attr-dict `:` type(results)";
+}
+
+def Vector_CreateMaskOp :
+ Vector_Op<"create_mask", [NoSideEffect]>,
+ Arguments<(ins Variadic<Index>:$operands)>, Results<(outs VectorOf<[I1]>)> {
+ let summary = "creates a vector mask";
+ let description = [{
+ Creates and returns a vector mask where elements of the result vector
+ are set to '0' or '1', based on whether the element indices are contained
+ within a hyper-rectangular region specified by the operands. Specifically,
+ each operand specifies a range [0, operand-value) for a unique dimension in
+    the vector result. The conjunction of the operand ranges defines a
+    hyper-rectangular region within which element values are set to 1
+    (otherwise element values are set to 0).
+
+    Example: create a vector mask of size 4x3xi1 where elements in the ranges
+    0 <= row <= 2 and 0 <= col <= 1 are set to 1 (others to 0).
+
+    ```
+    %1 = vector.create_mask %c3, %c2 : vector<4x3xi1>
+
+    print %1
+                  columns
+                0    1    2
+              |------------
+          0   | 1    1    0
+    rows  1   | 1    1    0
+          2   | 1    1    0
+          3   | 0    0    0
+    ```
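+
+    Since this op declares a canonicalizer, the all-constant-operand form
+    above (a sketch, assuming %c3 and %c2 hold the index constants 3 and 2)
+    can fold to the equivalent constant mask:
+
+    ```
+    %1 = vector.create_mask %c3, %c2 : vector<4x3xi1>
+    // Canonicalizes to: vector.constant_mask [3, 2] : vector<4x3xi1>
+    ```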
+ }];
+
+ let hasCanonicalizer = 1;
+ let assemblyFormat = "$operands attr-dict `:` type(results)";
+}
+
+def Vector_TupleOp :
+ Vector_Op<"tuple", [NoSideEffect]>,
+ Arguments<(ins Variadic<AnyVector>:$vectors)>,
+ Results<(outs TupleOf<[AnyVector]>)> {
+ let summary = "make tuple of vectors operation";
+ let description = [{
+ Returns a tuple of its operands 'vectors'.
+
+ Note that this operation is used during the vector op unrolling
+ transformation and should be removed before lowering to lower-level
+ dialects.
+
+ Examples:
+ ```
+ %0 = vector.transfer_read ... : vector<2x2xf32>
+ %1 = vector.transfer_read ... : vector<2x1xf32>
+ %2 = vector.transfer_read ... : vector<2x2xf32>
+ %3 = vector.transfer_read ... : vector<2x1xf32>
+
+ %4 = vector.tuple %0, %1, %2, %3
+ : vector<2x2xf32>, vector<2x1xf32>, vector<2x2xf32>, vector<2x1xf32>
+    ```
+ }];
+
+ let extraClassDeclaration = [{
+ TupleType getResultTupleType() {
+ return getResult().getType().cast<TupleType>();
+ }
+ }];
+}
+
+def Vector_TupleGetOp :
+ Vector_Op<"tuple_get", [NoSideEffect]>,
+ Arguments<(ins TupleOf<[AnyVector]>:$vectors, APIntAttr:$index)>,
+ Results<(outs AnyVector)> {
+ let summary = "vector tuple get operation";
+ let description = [{
+ Returns the tuple element of 'vectors' at 'index'.
+
+ Note that this operation is used during the vector op unrolling
+ transformation and should be removed before lowering to lower-level
+ dialects.
+
+ Examples:
+ ```
+ %4 = vector.tuple %0, %1, %2, %3
+      : vector<2x2xf32>, vector<2x1xf32>, vector<2x2xf32>, vector<2x1xf32>
+
+ %5 = vector.tuple_get %4, 1
+ : tuple<vector<2x2xf32>, vector<2x1xf32>,
+ vector<2x2xf32>, vector<2x1xf32>>
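+    // %5 has type vector<2x1xf32> (the tuple element at index 1).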
+ ```
+ }];
+
+ let extraClassDeclaration = [{
+ VectorType getResultVectorType() {
+ return getResult().getType().cast<VectorType>();
+ }
+ int64_t getIndex() {
+ return getAttrOfType<IntegerAttr>("index").getValue().getSExtValue();
+ }
+ static StringRef getIndexAttrName() { return "index"; }
+ }];
+ let hasFolder = 1;
+}
+
+def Vector_PrintOp :
+ Vector_Op<"print", []>, Arguments<(ins AnyType:$source)> {
+ let summary = "print operation (for testing and debugging)";
+ let description = [{
+ Prints the source vector (or scalar) to stdout in human readable
+ format (for testing and debugging). No return value.
+
+    Examples:
+    ```
+    %0 = constant 0.0 : f32
+    %1 = vector.broadcast %0 : f32 to vector<4xf32>
+    vector.print %1 : vector<4xf32>
+    ```
+
+    When lowered to LLVM, the vector print is unrolled into elementary
+    printing method calls that at runtime will yield
+
+    ```
+    ( 0.0, 0.0, 0.0, 0.0 )
+    ```
+
+    on stdout when linked with a small runtime support library, which only
+    needs to provide a few printing methods (a single value for all data
+    types, opening/closing bracket, comma, newline).
+ }];
+ let verifier = ?;
+ let extraClassDeclaration = [{
+ Type getPrintType() {
+ return source().getType();
+ }
+ }];
+ let assemblyFormat = "$source attr-dict `:` type($source)";
+}
+
+//===----------------------------------------------------------------------===//
+// Ops used for supporting progressive lowering and conversion type changes.
+//===----------------------------------------------------------------------===//
+
+/// Vector dialect matrix multiplication op that operates on flattened 1-D
+/// MLIR vectors. This is the counterpart of llvm.matrix.multiply in MLIR.
+/// This may seem redundant with vector.contract but it serves the purposes of
+/// more progressive lowering and localized type conversion on the path:
+/// `vector<...x...xf32> -> vector<...xf32> -> !llvm<... x float>`.
+def Vector_MatmulOp : Vector_Op<"matrix_multiply", [NoSideEffect,
+ PredOpTrait<"lhs operand and result have same element type",
+ TCresVTEtIsSameAsOpBase<0, 0>>,
+ PredOpTrait<"rhs operand and result have same element type",
+ TCresVTEtIsSameAsOpBase<0, 1>>]>,
+ Arguments<(
+ // TODO(ntv, fhahn): tighten vector element types that make sense.
+ ins VectorOfRankAndType<[1],
+ [AnySignlessInteger, AnySignedInteger, AnyFloat]>:$lhs,
+ VectorOfRankAndType<[1],
+ [AnySignlessInteger, AnySignedInteger, AnyFloat]>:$rhs,
+ I32Attr:$lhs_rows, I32Attr:$lhs_columns, I32Attr:$rhs_columns)>,
+ Results<(
+ outs VectorOfRankAndType<[1],
+ [AnySignlessInteger, AnySignedInteger, AnyFloat]>:$res)>
+{
+ let summary = "Vector matrix multiplication op that operates on flattened 1-D"
+ " MLIR vectors";
+ let description = [{
+ This is the counterpart of llvm.matrix.multiply in MLIR. It serves the
+ purposes of more progressive lowering and localized type conversion.
+
+    The `vector.matrix_multiply` op treats `lhs` as a matrix with <lhs_rows>
+    rows and <lhs_columns> columns, `rhs` as a matrix with <lhs_columns> rows
+    and <rhs_columns> columns, and multiplies them. The result matrix is
+    returned embedded in the result vector.
+
+ Example:
+
+ ```
+ %C = vector.matrix_multiply %A, %B
+        { lhs_rows = 4: i32, lhs_columns = 16: i32, rhs_columns = 3: i32 } :
+ (vector<64xf64>, vector<48xf64>) -> vector<12xf64>
+ ```
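+
+    Here the flattened operands are interpreted as a 4x16 and a 16x3 matrix,
+    and the flattened 4x3 result is returned as `vector<12xf64>`.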
+ }];
+ let builders = [
+ OpBuilder<"Builder *builder, OperationState &result, Value lhs, Value rhs, "
+ "unsigned lhsRows, unsigned lhsColumns, unsigned rhsColumns",
+ [{
+ result.addOperands({lhs, rhs});
+ result.addAttribute("lhs_rows", builder->getI32IntegerAttr(lhsRows));
+ result.addAttribute("lhs_columns", builder->getI32IntegerAttr(lhsColumns));
+ result.addAttribute("rhs_columns", builder->getI32IntegerAttr(rhsColumns));
+    result.addTypes(VectorType::get(lhsRows * rhsColumns,
+ lhs.getType().cast<VectorType>().getElementType()));
+ }]>,
+ ];
+ let verifier = ?;
+ let assemblyFormat = "$lhs `,` $rhs attr-dict "
+ "`:` `(` type($lhs) `,` type($rhs) `)` `->` type($res)";
+}
+
+#endif // VECTOR_OPS
diff --git a/mlir/include/mlir/InitAllDialects.h b/mlir/include/mlir/InitAllDialects.h
index 9a14a1586c7f..aba4556e4bc1 100644
--- a/mlir/include/mlir/InitAllDialects.h
+++ b/mlir/include/mlir/InitAllDialects.h
@@ -15,7 +15,7 @@
#define MLIR_INITALLDIALECTS_H_
#include "mlir/Dialect/AVX512/AVX512Dialect.h"
-#include "mlir/Dialect/AffineOps/AffineOps.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/FxpMathOps/FxpMathOps.h"
#include "mlir/Dialect/GPU/GPUDialect.h"
#include "mlir/Dialect/LLVMIR/LLVMAVX512Dialect.h"
@@ -39,7 +39,7 @@ namespace mlir {
// all the possible dialects to be made available to the context automatically.
inline void registerAllDialects() {
static bool init_once = []() {
- registerDialect<AffineOpsDialect>();
+ registerDialect<AffineDialect>();
registerDialect<avx512::AVX512Dialect>();
registerDialect<fxpmath::FxpMathOpsDialect>();
registerDialect<gpu::GPUDialect>();
diff --git a/mlir/include/mlir/InitAllPasses.h b/mlir/include/mlir/InitAllPasses.h
index c1cac45816df..8a23292fc964 100644
--- a/mlir/include/mlir/InitAllPasses.h
+++ b/mlir/include/mlir/InitAllPasses.h
@@ -25,6 +25,8 @@
#include "mlir/Conversion/LinalgToSPIRV/LinalgToSPIRVPass.h"
#include "mlir/Conversion/LoopsToGPU/LoopsToGPUPass.h"
#include "mlir/Conversion/StandardToSPIRV/ConvertStandardToSPIRVPass.h"
+#include "mlir/Dialect/Affine/Passes.h"
+#include "mlir/Dialect/FxpMathOps/Passes.h"
#include "mlir/Dialect/FxpMathOps/Passes.h"
#include "mlir/Dialect/GPU/Passes.h"
#include "mlir/Dialect/LLVMIR/Transforms/LegalizeForExport.h"
diff --git a/mlir/include/mlir/Transforms/Passes.h b/mlir/include/mlir/Transforms/Passes.h
index 8b7495ec0e58..8515bfd173ea 100644
--- a/mlir/include/mlir/Transforms/Passes.h
+++ b/mlir/include/mlir/Transforms/Passes.h
@@ -53,11 +53,6 @@ std::unique_ptr<OpPassBase<FuncOp>> createLoopUnrollPass(
std::unique_ptr<OpPassBase<FuncOp>>
createLoopUnrollAndJamPass(int unrollJamFactor = -1);
-/// Creates a simplification pass for affine structures (maps and sets). In
-/// addition, this pass also normalizes memrefs to have the trivial (identity)
-/// layout map.
-std::unique_ptr<OpPassBase<FuncOp>> createSimplifyAffineStructuresPass();
-
/// Creates a loop fusion pass which fuses loops. Buffers of size less than or
/// equal to `localBufSizeThreshold` are promoted to memory space
/// `fastMemorySpace'.
@@ -91,14 +86,6 @@ createLoopTilingPass(uint64_t cacheSizeBytes);
/// bounds into a single loop.
std::unique_ptr<OpPassBase<FuncOp>> createLoopCoalescingPass();
-/// Performs packing (or explicit copying) of accessed memref regions into
-/// buffers in the specified faster memory space through either pointwise copies
-/// or DMA operations.
-std::unique_ptr<OpPassBase<FuncOp>> createAffineDataCopyGenerationPass(
- unsigned slowMemorySpace, unsigned fastMemorySpace,
- unsigned tagMemorySpace = 0, int minDmaTransferSize = 1024,
- uint64_t fastMemCapacityBytes = std::numeric_limits<uint64_t>::max());
-
/// Creates a pass to perform optimizations relying on memref dataflow such as
/// store to load forwarding, elimination of dead stores, and dead allocs.
std::unique_ptr<OpPassBase<FuncOp>> createMemRefDataFlowOptPass();
diff --git a/mlir/lib/Analysis/AffineAnalysis.cpp b/mlir/lib/Analysis/AffineAnalysis.cpp
index 6f1a21d0e5d0..56a0d72bb387 100644
--- a/mlir/lib/Analysis/AffineAnalysis.cpp
+++ b/mlir/lib/Analysis/AffineAnalysis.cpp
@@ -14,8 +14,8 @@
#include "mlir/Analysis/AffineAnalysis.h"
#include "mlir/Analysis/AffineStructures.h"
#include "mlir/Analysis/Utils.h"
-#include "mlir/Dialect/AffineOps/AffineOps.h"
-#include "mlir/Dialect/AffineOps/AffineValueMap.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Affine/IR/AffineValueMap.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/IR/AffineExprVisitor.h"
#include "mlir/IR/Builders.h"
diff --git a/mlir/lib/Analysis/AffineStructures.cpp b/mlir/lib/Analysis/AffineStructures.cpp
index 0b75767d6210..3448443c3016 100644
--- a/mlir/lib/Analysis/AffineStructures.cpp
+++ b/mlir/lib/Analysis/AffineStructures.cpp
@@ -11,8 +11,8 @@
//===----------------------------------------------------------------------===//
#include "mlir/Analysis/AffineStructures.h"
-#include "mlir/Dialect/AffineOps/AffineOps.h"
-#include "mlir/Dialect/AffineOps/AffineValueMap.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Affine/IR/AffineValueMap.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/IR/AffineExprVisitor.h"
#include "mlir/IR/IntegerSet.h"
diff --git a/mlir/lib/Analysis/CMakeLists.txt b/mlir/lib/Analysis/CMakeLists.txt
index f9c0236e3d4e..262bc7e8a588 100644
--- a/mlir/lib/Analysis/CMakeLists.txt
+++ b/mlir/lib/Analysis/CMakeLists.txt
@@ -24,7 +24,7 @@ add_mlir_library(MLIRAnalysis
target_link_libraries(MLIRAnalysis
PUBLIC
- MLIRAffineOps
+ MLIRAffine
MLIRCallInterfaces
MLIRControlFlowInterfaces
MLIRInferTypeOpInterface
@@ -44,7 +44,7 @@ add_mlir_library(MLIRLoopAnalysis
target_link_libraries(MLIRLoopAnalysis
PUBLIC
- MLIRAffineOps
+ MLIRAffine
MLIRCallInterfaces
MLIRControlFlowInterfaces
MLIRInferTypeOpInterface
diff --git a/mlir/lib/Analysis/LoopAnalysis.cpp b/mlir/lib/Analysis/LoopAnalysis.cpp
index b5ff91b44062..9cf8fbc815d2 100644
--- a/mlir/lib/Analysis/LoopAnalysis.cpp
+++ b/mlir/lib/Analysis/LoopAnalysis.cpp
@@ -15,8 +15,8 @@
#include "mlir/Analysis/AffineAnalysis.h"
#include "mlir/Analysis/AffineStructures.h"
#include "mlir/Analysis/NestedMatcher.h"
-#include "mlir/Dialect/AffineOps/AffineOps.h"
-#include "mlir/Dialect/AffineOps/AffineValueMap.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Affine/IR/AffineValueMap.h"
#include "mlir/Support/MathExtras.h"
#include "llvm/ADT/DenseSet.h"
diff --git a/mlir/lib/Analysis/NestedMatcher.cpp b/mlir/lib/Analysis/NestedMatcher.cpp
index 2324bbcf7b56..807e5df46d1c 100644
--- a/mlir/lib/Analysis/NestedMatcher.cpp
+++ b/mlir/lib/Analysis/NestedMatcher.cpp
@@ -7,7 +7,7 @@
//===----------------------------------------------------------------------===//
#include "mlir/Analysis/NestedMatcher.h"
-#include "mlir/Dialect/AffineOps/AffineOps.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "llvm/ADT/ArrayRef.h"
diff --git a/mlir/lib/Analysis/SliceAnalysis.cpp b/mlir/lib/Analysis/SliceAnalysis.cpp
index fae41b4a027b..b1e45d1cfe7b 100644
--- a/mlir/lib/Analysis/SliceAnalysis.cpp
+++ b/mlir/lib/Analysis/SliceAnalysis.cpp
@@ -11,7 +11,7 @@
//===----------------------------------------------------------------------===//
#include "mlir/Analysis/SliceAnalysis.h"
-#include "mlir/Dialect/AffineOps/AffineOps.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/LoopOps/LoopOps.h"
#include "mlir/IR/Function.h"
#include "mlir/IR/Operation.h"
diff --git a/mlir/lib/Analysis/Utils.cpp b/mlir/lib/Analysis/Utils.cpp
index 7b3cd58aa980..940e07630e20 100644
--- a/mlir/lib/Analysis/Utils.cpp
+++ b/mlir/lib/Analysis/Utils.cpp
@@ -14,8 +14,8 @@
#include "mlir/Analysis/Utils.h"
#include "mlir/Analysis/AffineAnalysis.h"
-#include "mlir/Dialect/AffineOps/AffineOps.h"
-#include "mlir/Dialect/AffineOps/AffineValueMap.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Affine/IR/AffineValueMap.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Support/Debug.h"
diff --git a/mlir/lib/Conversion/AffineToStandard/AffineToStandard.cpp b/mlir/lib/Conversion/AffineToStandard/AffineToStandard.cpp
index 9c100a280a64..bb434abdc26c 100644
--- a/mlir/lib/Conversion/AffineToStandard/AffineToStandard.cpp
+++ b/mlir/lib/Conversion/AffineToStandard/AffineToStandard.cpp
@@ -13,7 +13,7 @@
#include "mlir/Conversion/AffineToStandard/AffineToStandard.h"
-#include "mlir/Dialect/AffineOps/AffineOps.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/LoopOps/LoopOps.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/IR/AffineExprVisitor.h"
diff --git a/mlir/lib/Conversion/AffineToStandard/CMakeLists.txt b/mlir/lib/Conversion/AffineToStandard/CMakeLists.txt
index aa916739697e..5613b28e3418 100644
--- a/mlir/lib/Conversion/AffineToStandard/CMakeLists.txt
+++ b/mlir/lib/Conversion/AffineToStandard/CMakeLists.txt
@@ -7,7 +7,7 @@ add_mlir_conversion_library(MLIRAffineToStandard
target_link_libraries(
MLIRAffineToStandard
PUBLIC
- MLIRAffineOps
+ MLIRAffine
MLIRLoopOps
MLIRPass
MLIRStandardOps
diff --git a/mlir/lib/Conversion/LoopsToGPU/CMakeLists.txt b/mlir/lib/Conversion/LoopsToGPU/CMakeLists.txt
index dd69af418bde..9a460bcf7165 100644
--- a/mlir/lib/Conversion/LoopsToGPU/CMakeLists.txt
+++ b/mlir/lib/Conversion/LoopsToGPU/CMakeLists.txt
@@ -7,7 +7,7 @@ add_mlir_conversion_library(MLIRLoopsToGPU
)
target_link_libraries(MLIRLoopsToGPU
PUBLIC
- MLIRAffineOps
+ MLIRAffine
MLIRAffineToStandard
MLIRGPU
MLIRIR
diff --git a/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp b/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp
index 8023226bc300..ec5c9cff2ee0 100644
--- a/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp
+++ b/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp
@@ -15,7 +15,7 @@
#include "mlir/Conversion/LoopsToGPU/LoopsToGPU.h"
#include "mlir/Conversion/AffineToStandard/AffineToStandard.h"
-#include "mlir/Dialect/AffineOps/AffineOps.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/GPU/GPUDialect.h"
#include "mlir/Dialect/GPU/ParallelLoopMapper.h"
#include "mlir/Dialect/LoopOps/LoopOps.h"
diff --git a/mlir/lib/Conversion/LoopsToGPU/LoopsToGPUPass.cpp b/mlir/lib/Conversion/LoopsToGPU/LoopsToGPUPass.cpp
index 9a703199cba1..264e704f914f 100644
--- a/mlir/lib/Conversion/LoopsToGPU/LoopsToGPUPass.cpp
+++ b/mlir/lib/Conversion/LoopsToGPU/LoopsToGPUPass.cpp
@@ -8,7 +8,7 @@
#include "mlir/Conversion/LoopsToGPU/LoopsToGPUPass.h"
#include "mlir/Conversion/LoopsToGPU/LoopsToGPU.h"
-#include "mlir/Dialect/AffineOps/AffineOps.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/GPU/GPUDialect.h"
#include "mlir/Dialect/LoopOps/LoopOps.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
@@ -123,7 +123,7 @@ struct ParallelLoopToGpuPass : public OperationPass<ParallelLoopToGpuPass> {
populateParallelLoopToGPUPatterns(patterns, &getContext());
ConversionTarget target(getContext());
target.addLegalDialect<StandardOpsDialect>();
- target.addLegalDialect<AffineOpsDialect>();
+ target.addLegalDialect<AffineDialect>();
target.addLegalDialect<gpu::GPUDialect>();
target.addLegalDialect<loop::LoopOpsDialect>();
target.addIllegalOp<loop::ParallelOp>();
diff --git a/mlir/lib/Conversion/VectorToLoops/ConvertVectorToLoops.cpp b/mlir/lib/Conversion/VectorToLoops/ConvertVectorToLoops.cpp
index b16f02ef6b9c..b73d94562edc 100644
--- a/mlir/lib/Conversion/VectorToLoops/ConvertVectorToLoops.cpp
+++ b/mlir/lib/Conversion/VectorToLoops/ConvertVectorToLoops.cpp
@@ -13,7 +13,7 @@
#include <type_traits>
#include "mlir/Conversion/VectorToLoops/ConvertVectorToLoops.h"
-#include "mlir/Dialect/AffineOps/EDSC/Intrinsics.h"
+#include "mlir/Dialect/Affine/EDSC/Intrinsics.h"
#include "mlir/Dialect/LoopOps/EDSC/Builders.h"
#include "mlir/Dialect/StandardOps/EDSC/Intrinsics.h"
#include "mlir/Dialect/Vector/VectorOps.h"
diff --git a/mlir/lib/Dialect/Affine/CMakeLists.txt b/mlir/lib/Dialect/Affine/CMakeLists.txt
new file mode 100644
index 000000000000..c018b50f967f
--- /dev/null
+++ b/mlir/lib/Dialect/Affine/CMakeLists.txt
@@ -0,0 +1,21 @@
+add_mlir_dialect_library(MLIRAffine
+ IR/AffineOps.cpp
+ IR/AffineValueMap.cpp
+ EDSC/Builders.cpp
+
+ ADDITIONAL_HEADER_DIRS
+ ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/Affine
+
+ DEPENDS
+ MLIRAffineOpsIncGen
+ )
+target_link_libraries(MLIRAffine
+ PUBLIC
+ MLIREDSC
+ MLIRIR
+ MLIRLoopLikeInterface
+ MLIRSideEffects
+ MLIRStandardOps
+ )
+
+add_subdirectory(Transforms)
diff --git a/mlir/lib/Dialect/AffineOps/EDSC/Builders.cpp b/mlir/lib/Dialect/Affine/EDSC/Builders.cpp
similarity index 99%
rename from mlir/lib/Dialect/AffineOps/EDSC/Builders.cpp
rename to mlir/lib/Dialect/Affine/EDSC/Builders.cpp
index e69f3d6c8c7a..06f88dcec1be 100644
--- a/mlir/lib/Dialect/AffineOps/EDSC/Builders.cpp
+++ b/mlir/lib/Dialect/Affine/EDSC/Builders.cpp
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
-#include "mlir/Dialect/AffineOps/EDSC/Builders.h"
+#include "mlir/Dialect/Affine/EDSC/Builders.h"
#include "mlir/Dialect/StandardOps/EDSC/Builders.h"
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/AffineMap.h"
diff --git a/mlir/lib/Dialect/Affine/EDSC/CMakeLists.txt b/mlir/lib/Dialect/Affine/EDSC/CMakeLists.txt
new file mode 100644
index 000000000000..751bfd351bc6
--- /dev/null
+++ b/mlir/lib/Dialect/Affine/EDSC/CMakeLists.txt
@@ -0,0 +1,17 @@
+add_mlir_dialect_library(MLIRAffine
+  Builders.cpp
+
+ ADDITIONAL_HEADER_DIRS
+ ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/Affine
+
+ DEPENDS
+ MLIRAffineOpsIncGen
+ )
+target_link_libraries(MLIRAffine
+ PUBLIC
+ MLIREDSC
+ MLIRIR
+ MLIRLoopLikeInterface
+ MLIRSideEffects
+ MLIRStandardOps
+ )
diff --git a/mlir/lib/Dialect/AffineOps/AffineOps.cpp b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp
similarity index 99%
rename from mlir/lib/Dialect/AffineOps/AffineOps.cpp
rename to mlir/lib/Dialect/Affine/IR/AffineOps.cpp
index 0b8795947e06..9e1f7e96bb0b 100644
--- a/mlir/lib/Dialect/AffineOps/AffineOps.cpp
+++ b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp
@@ -6,8 +6,8 @@
//
//===----------------------------------------------------------------------===//
-#include "mlir/Dialect/AffineOps/AffineOps.h"
-#include "mlir/Dialect/AffineOps/AffineValueMap.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Affine/IR/AffineValueMap.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/IR/Function.h"
#include "mlir/IR/IntegerSet.h"
@@ -25,7 +25,7 @@ using llvm::dbgs;
#define DEBUG_TYPE "affine-analysis"
//===----------------------------------------------------------------------===//
-// AffineOpsDialect Interfaces
+// AffineDialect Interfaces
//===----------------------------------------------------------------------===//
namespace {
@@ -64,21 +64,21 @@ struct AffineInlinerInterface : public DialectInlinerInterface {
} // end anonymous namespace
//===----------------------------------------------------------------------===//
-// AffineOpsDialect
+// AffineDialect
//===----------------------------------------------------------------------===//
-AffineOpsDialect::AffineOpsDialect(MLIRContext *context)
+AffineDialect::AffineDialect(MLIRContext *context)
: Dialect(getDialectNamespace(), context) {
addOperations<AffineDmaStartOp, AffineDmaWaitOp, AffineLoadOp, AffineStoreOp,
#define GET_OP_LIST
-#include "mlir/Dialect/AffineOps/AffineOps.cpp.inc"
+#include "mlir/Dialect/Affine/IR/AffineOps.cpp.inc"
>();
addInterfaces<AffineInlinerInterface>();
}
/// Materialize a single constant operation from a given attribute value with
/// the desired resultant type.
-Operation *AffineOpsDialect::materializeConstant(OpBuilder &builder,
+Operation *AffineDialect::materializeConstant(OpBuilder &builder,
Attribute value, Type type,
Location loc) {
return builder.create<ConstantOp>(loc, type, value);
@@ -2369,4 +2369,4 @@ static ParseResult parseAffineParallelOp(OpAsmParser &parser,
//===----------------------------------------------------------------------===//
#define GET_OP_CLASSES
-#include "mlir/Dialect/AffineOps/AffineOps.cpp.inc"
+#include "mlir/Dialect/Affine/IR/AffineOps.cpp.inc"
diff --git a/mlir/lib/Dialect/AffineOps/AffineValueMap.cpp b/mlir/lib/Dialect/Affine/IR/AffineValueMap.cpp
similarity index 97%
rename from mlir/lib/Dialect/AffineOps/AffineValueMap.cpp
rename to mlir/lib/Dialect/Affine/IR/AffineValueMap.cpp
index bac183505a71..c17f59323a7f 100644
--- a/mlir/lib/Dialect/AffineOps/AffineValueMap.cpp
+++ b/mlir/lib/Dialect/Affine/IR/AffineValueMap.cpp
@@ -6,8 +6,8 @@
//
//===----------------------------------------------------------------------===//
-#include "mlir/Dialect/AffineOps/AffineValueMap.h"
-#include "mlir/Dialect/AffineOps/AffineOps.h"
+#include "mlir/Dialect/Affine/IR/AffineValueMap.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
using namespace mlir;
diff --git a/mlir/lib/Dialect/AffineOps/CMakeLists.txt b/mlir/lib/Dialect/Affine/IR/CMakeLists.txt
similarity index 57%
rename from mlir/lib/Dialect/AffineOps/CMakeLists.txt
rename to mlir/lib/Dialect/Affine/IR/CMakeLists.txt
index bf490a5c9795..91dcceaf3912 100644
--- a/mlir/lib/Dialect/AffineOps/CMakeLists.txt
+++ b/mlir/lib/Dialect/Affine/IR/CMakeLists.txt
@@ -1,15 +1,14 @@
-add_mlir_dialect_library(MLIRAffineOps
+add_mlir_dialect_library(MLIRAffine
AffineOps.cpp
AffineValueMap.cpp
- EDSC/Builders.cpp
ADDITIONAL_HEADER_DIRS
- ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/AffineOps
+ ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/Affine
DEPENDS
MLIRAffineOpsIncGen
)
-target_link_libraries(MLIRAffineOps
+target_link_libraries(MLIRAffine
PUBLIC
MLIREDSC
MLIRIR
diff --git a/mlir/lib/Transforms/AffineDataCopyGeneration.cpp b/mlir/lib/Dialect/Affine/Transforms/AffineDataCopyGeneration.cpp
similarity index 99%
rename from mlir/lib/Transforms/AffineDataCopyGeneration.cpp
rename to mlir/lib/Dialect/Affine/Transforms/AffineDataCopyGeneration.cpp
index 5409c557da83..4f6d453fb56b 100644
--- a/mlir/lib/Transforms/AffineDataCopyGeneration.cpp
+++ b/mlir/lib/Dialect/Affine/Transforms/AffineDataCopyGeneration.cpp
@@ -20,11 +20,11 @@
//===----------------------------------------------------------------------===//
#include "mlir/Analysis/Utils.h"
-#include "mlir/Dialect/AffineOps/AffineOps.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Affine/Passes.h"
#include "mlir/IR/Builders.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/LoopUtils.h"
-#include "mlir/Transforms/Passes.h"
#include "mlir/Transforms/Utils.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/Support/CommandLine.h"
diff --git a/mlir/lib/Transforms/AffineLoopInvariantCodeMotion.cpp b/mlir/lib/Dialect/Affine/Transforms/AffineLoopInvariantCodeMotion.cpp
similarity index 99%
rename from mlir/lib/Transforms/AffineLoopInvariantCodeMotion.cpp
rename to mlir/lib/Dialect/Affine/Transforms/AffineLoopInvariantCodeMotion.cpp
index a8ea4b9c1e10..1616b87c4cdc 100644
--- a/mlir/lib/Transforms/AffineLoopInvariantCodeMotion.cpp
+++ b/mlir/lib/Dialect/Affine/Transforms/AffineLoopInvariantCodeMotion.cpp
@@ -15,7 +15,7 @@
#include "mlir/Analysis/LoopAnalysis.h"
#include "mlir/Analysis/SliceAnalysis.h"
#include "mlir/Analysis/Utils.h"
-#include "mlir/Dialect/AffineOps/AffineOps.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/AffineMap.h"
#include "mlir/IR/Builders.h"
diff --git a/mlir/lib/Dialect/Affine/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Affine/Transforms/CMakeLists.txt
new file mode 100644
index 000000000000..d7e42b5489fa
--- /dev/null
+++ b/mlir/lib/Dialect/Affine/Transforms/CMakeLists.txt
@@ -0,0 +1,21 @@
+add_mlir_dialect_library(MLIRAffineTransforms
+ AffineDataCopyGeneration.cpp
+ AffineLoopInvariantCodeMotion.cpp
+ SimplifyAffineStructures.cpp
+
+ ADDITIONAL_HEADER_DIRS
+ ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/Affine
+
+ DEPENDS
+ MLIRAffineOpsIncGen
+ MLIRLoopLikeInterfaceIncGen
+ )
+target_link_libraries(MLIRAffineTransforms
+ PUBLIC
+ MLIRAffine
+ MLIREDSC
+ MLIRIR
+ MLIRSideEffects
+ MLIRStandardOps
+ )
+
diff --git a/mlir/lib/Transforms/SimplifyAffineStructures.cpp b/mlir/lib/Dialect/Affine/Transforms/SimplifyAffineStructures.cpp
similarity index 95%
rename from mlir/lib/Transforms/SimplifyAffineStructures.cpp
rename to mlir/lib/Dialect/Affine/Transforms/SimplifyAffineStructures.cpp
index 671b3eab18df..60ad1545d350 100644
--- a/mlir/lib/Transforms/SimplifyAffineStructures.cpp
+++ b/mlir/lib/Dialect/Affine/Transforms/SimplifyAffineStructures.cpp
@@ -13,7 +13,10 @@
#include "mlir/Analysis/AffineStructures.h"
#include "mlir/IR/IntegerSet.h"
#include "mlir/Pass/Pass.h"
-#include "mlir/Transforms/Passes.h"
+#include "mlir/Dialect/Affine/IR/AffineValueMap.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Affine/Passes.h"
+#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/Transforms/Utils.h"
#define DEBUG_TYPE "simplify-affine-structure"
diff --git a/mlir/lib/Dialect/CMakeLists.txt b/mlir/lib/Dialect/CMakeLists.txt
index 0bcc794894cc..ddc147fe2657 100644
--- a/mlir/lib/Dialect/CMakeLists.txt
+++ b/mlir/lib/Dialect/CMakeLists.txt
@@ -1,5 +1,5 @@
+add_subdirectory(Affine)
add_subdirectory(AVX512)
-add_subdirectory(AffineOps)
add_subdirectory(FxpMathOps)
add_subdirectory(GPU)
add_subdirectory(Linalg)
diff --git a/mlir/lib/Dialect/Linalg/EDSC/Builders.cpp b/mlir/lib/Dialect/Linalg/EDSC/Builders.cpp
index e2c64f050158..198c7fc698dd 100644
--- a/mlir/lib/Dialect/Linalg/EDSC/Builders.cpp
+++ b/mlir/lib/Dialect/Linalg/EDSC/Builders.cpp
@@ -7,7 +7,7 @@
//===----------------------------------------------------------------------===//
#include "mlir/IR/Builders.h"
-#include "mlir/Dialect/AffineOps/EDSC/Intrinsics.h"
+#include "mlir/Dialect/Affine/EDSC/Intrinsics.h"
#include "mlir/Dialect/Linalg/EDSC/Intrinsics.h"
#include "mlir/Dialect/LoopOps/EDSC/Builders.h"
#include "mlir/Dialect/StandardOps/EDSC/Intrinsics.h"
diff --git a/mlir/lib/Dialect/Linalg/EDSC/CMakeLists.txt b/mlir/lib/Dialect/Linalg/EDSC/CMakeLists.txt
index 85ecf2802b25..bc9e244d4ad5 100644
--- a/mlir/lib/Dialect/Linalg/EDSC/CMakeLists.txt
+++ b/mlir/lib/Dialect/Linalg/EDSC/CMakeLists.txt
@@ -12,7 +12,7 @@ target_link_libraries(MLIRLinalgEDSC
PUBLIC
MLIREDSC
MLIRIR
- MLIRAffineOps
+ MLIRAffine
MLIRLinalgOps
MLIRLoopOps
MLIRStandardOps
diff --git a/mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt
index 82e36f8a00b1..349c2d6980ac 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt
@@ -14,7 +14,7 @@ add_mlir_dialect_library(MLIRLinalgTransforms
)
target_link_libraries(MLIRLinalgTransforms
PUBLIC
- MLIRAffineOps
+ MLIRAffine
MLIRAnalysis
MLIREDSC
MLIRIR
diff --git a/mlir/lib/Dialect/Linalg/Transforms/LinalgToLoops.cpp b/mlir/lib/Dialect/Linalg/Transforms/LinalgToLoops.cpp
index 316a5a75617a..5e1748cc47c0 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/LinalgToLoops.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/LinalgToLoops.cpp
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
-#include "mlir/Dialect/AffineOps/EDSC/Intrinsics.h"
+#include "mlir/Dialect/Affine/EDSC/Intrinsics.h"
#include "mlir/Dialect/Linalg/EDSC/Intrinsics.h"
#include "mlir/Dialect/Linalg/IR/LinalgOps.h"
#include "mlir/Dialect/Linalg/IR/LinalgTypes.h"
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Promotion.cpp b/mlir/lib/Dialect/Linalg/Transforms/Promotion.cpp
index 54a4290e6e36..bee3f0dff0d2 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Promotion.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Promotion.cpp
@@ -10,7 +10,7 @@
//
//===----------------------------------------------------------------------===//
-#include "mlir/Dialect/AffineOps/EDSC/Intrinsics.h"
+#include "mlir/Dialect/Affine/EDSC/Intrinsics.h"
#include "mlir/Dialect/Linalg/EDSC/Intrinsics.h"
#include "mlir/Dialect/Linalg/IR/LinalgOps.h"
#include "mlir/Dialect/Linalg/IR/LinalgTypes.h"
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp b/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp
index cabdd7497caf..7b4a5c651d24 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp
@@ -10,7 +10,7 @@
//
//===----------------------------------------------------------------------===//
-#include "mlir/Dialect/AffineOps/EDSC/Intrinsics.h"
+#include "mlir/Dialect/Affine/EDSC/Intrinsics.h"
#include "mlir/Dialect/Linalg/EDSC/Intrinsics.h"
#include "mlir/Dialect/Linalg/IR/LinalgTypes.h"
#include "mlir/Dialect/Linalg/Passes.h"
diff --git a/mlir/lib/Dialect/Linalg/Utils/CMakeLists.txt b/mlir/lib/Dialect/Linalg/Utils/CMakeLists.txt
index 681a47d31271..f9ad613f2a17 100644
--- a/mlir/lib/Dialect/Linalg/Utils/CMakeLists.txt
+++ b/mlir/lib/Dialect/Linalg/Utils/CMakeLists.txt
@@ -9,7 +9,7 @@ add_mlir_dialect_library(MLIRLinalgUtils
target_link_libraries(MLIRLinalgUtils
PUBLIC
- MLIRAffineOps
+ MLIRAffine
MLIREDSC
MLIRIR
MLIRLinalgOps
diff --git a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp
index 9cc6aa48c966..c572be4d132e 100644
--- a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp
+++ b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp
@@ -11,7 +11,7 @@
//===----------------------------------------------------------------------===//
#include "mlir/Dialect/Linalg/Utils/Utils.h"
-#include "mlir/Dialect/AffineOps/AffineOps.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Linalg/IR/LinalgOps.h"
#include "mlir/Dialect/Linalg/IR/LinalgTypes.h"
#include "mlir/Dialect/LoopOps/LoopOps.h"
diff --git a/mlir/lib/Dialect/LoopOps/Transforms/CMakeLists.txt b/mlir/lib/Dialect/LoopOps/Transforms/CMakeLists.txt
index 1b6e6d232711..2ec44b472298 100644
--- a/mlir/lib/Dialect/LoopOps/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/LoopOps/Transforms/CMakeLists.txt
@@ -8,7 +8,7 @@ add_mlir_dialect_library(MLIRLoopOpsTransforms
)
target_link_libraries(MLIRLoopOpsTransforms
PUBLIC
- MLIRAffineOps
+ MLIRAffine
MLIRIR
MLIRPass
MLIRLoopOps
diff --git a/mlir/lib/Dialect/LoopOps/Transforms/ParallelLoopSpecialization.cpp b/mlir/lib/Dialect/LoopOps/Transforms/ParallelLoopSpecialization.cpp
index 8cb49f3428d6..c692c0174f0c 100644
--- a/mlir/lib/Dialect/LoopOps/Transforms/ParallelLoopSpecialization.cpp
+++ b/mlir/lib/Dialect/LoopOps/Transforms/ParallelLoopSpecialization.cpp
@@ -10,7 +10,7 @@
//
//===----------------------------------------------------------------------===//
-#include "mlir/Dialect/AffineOps/AffineOps.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/LoopOps/LoopOps.h"
#include "mlir/Dialect/LoopOps/Passes.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
diff --git a/mlir/lib/Dialect/LoopOps/Transforms/ParallelLoopTiling.cpp b/mlir/lib/Dialect/LoopOps/Transforms/ParallelLoopTiling.cpp
index 6bced3761afb..85fd241cee7e 100644
--- a/mlir/lib/Dialect/LoopOps/Transforms/ParallelLoopTiling.cpp
+++ b/mlir/lib/Dialect/LoopOps/Transforms/ParallelLoopTiling.cpp
@@ -10,7 +10,7 @@
//
//===----------------------------------------------------------------------===//
-#include "mlir/Dialect/AffineOps/AffineOps.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/LoopOps/LoopOps.h"
#include "mlir/Dialect/LoopOps/Passes.h"
#include "mlir/Dialect/LoopOps/Transforms.h"
diff --git a/mlir/lib/Dialect/Vector/CMakeLists.txt b/mlir/lib/Dialect/Vector/CMakeLists.txt
index 3e1d8de0d3ba..e5e1251768cd 100644
--- a/mlir/lib/Dialect/Vector/CMakeLists.txt
+++ b/mlir/lib/Dialect/Vector/CMakeLists.txt
@@ -16,7 +16,7 @@ target_link_libraries(MLIRVector
MLIREDSC
MLIRIR
MLIRStandardOps
- MLIRAffineOps
+ MLIRAffine
MLIRLoopOps
MLIRLoopAnalysis
MLIRSideEffects
diff --git a/mlir/lib/Dialect/Vector/VectorTransforms.cpp b/mlir/lib/Dialect/Vector/VectorTransforms.cpp
index dd47e0c80dc1..6e54e5b05fb6 100644
--- a/mlir/lib/Dialect/Vector/VectorTransforms.cpp
+++ b/mlir/lib/Dialect/Vector/VectorTransforms.cpp
@@ -12,7 +12,7 @@
#include <type_traits>
-#include "mlir/Dialect/AffineOps/AffineOps.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/Dialect/Utils/StructuredOpsUtils.h"
#include "mlir/Dialect/Vector/VectorOps.h"
diff --git a/mlir/lib/Dialect/Vector/VectorUtils.cpp b/mlir/lib/Dialect/Vector/VectorUtils.cpp
index 1cace25b9835..f929dddd6d8d 100644
--- a/mlir/lib/Dialect/Vector/VectorUtils.cpp
+++ b/mlir/lib/Dialect/Vector/VectorUtils.cpp
@@ -12,7 +12,7 @@
#include "mlir/Dialect/Vector/VectorUtils.h"
#include "mlir/Analysis/LoopAnalysis.h"
-#include "mlir/Dialect/AffineOps/AffineOps.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/Dialect/Vector/VectorOps.h"
#include "mlir/IR/Builders.h"
diff --git a/mlir/lib/Dialect/VectorOps/VectorTransforms.cpp b/mlir/lib/Dialect/VectorOps/VectorTransforms.cpp
new file mode 100644
index 000000000000..e853c76d0dba
--- /dev/null
+++ b/mlir/lib/Dialect/VectorOps/VectorTransforms.cpp
@@ -0,0 +1,1349 @@
+//===- VectorTransforms.cpp - Conversion within the Vector dialect --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements target-independent rewrites as 1->N patterns.
+//
+//===----------------------------------------------------------------------===//
+
+#include <type_traits>
+
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/StandardOps/IR/Ops.h"
+#include "mlir/Dialect/Utils/StructuredOpsUtils.h"
+#include "mlir/Dialect/VectorOps/VectorOps.h"
+#include "mlir/Dialect/VectorOps/VectorTransforms.h"
+#include "mlir/Dialect/VectorOps/VectorUtils.h"
+#include "mlir/IR/AffineExpr.h"
+#include "mlir/IR/AffineMap.h"
+#include "mlir/IR/Attributes.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/Function.h"
+#include "mlir/IR/Location.h"
+#include "mlir/IR/Matchers.h"
+#include "mlir/IR/Module.h"
+#include "mlir/IR/OperationSupport.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/IR/Types.h"
+#include "mlir/Support/Functional.h"
+#include "mlir/Support/STLExtras.h"
+
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define DEBUG_TYPE "vector-to-vector"
+
+using namespace mlir;
+using llvm::dbgs;
+using mlir::functional::zipMap;
+
+static llvm::cl::OptionCategory clOptionsCategory(DEBUG_TYPE " options");
+
+static llvm::cl::opt<bool> lowerToLLVMMatrixIntrinsics(
+ "vector-lower-matrix-intrinsics",
+ llvm::cl::desc("Lower vector.contract to llvm.intr.matrix.multiply"),
+ llvm::cl::init(false), llvm::cl::cat(clOptionsCategory));
+
+/// Given a shape with sizes greater than 0 along all dimensions,
+/// returns the distance, in number of elements, between a slice in a dimension
+/// and the next slice in the same dimension.
+/// e.g. shape[3, 4, 5] -> linearization_basis[20, 5, 1]
+static SmallVector<int64_t, 8> computeStrides(ArrayRef<int64_t> shape) {
+ if (shape.empty())
+ return {};
+ SmallVector<int64_t, 8> tmp;
+ tmp.reserve(shape.size());
+ int64_t running = 1;
+ for (auto size : llvm::reverse(shape)) {
+    assert(size > 0 && "size must be positive");
+ tmp.push_back(running);
+ running *= size;
+ }
+ return SmallVector<int64_t, 8>(tmp.rbegin(), tmp.rend());
+}
+
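+// Returns the product of the entries of 'basis'; callers use it as the total
+// number of unrolled instances/slices described by 'basis'.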
+static int64_t computeMaxLinearIndex(ArrayRef<int64_t> basis) {
+ if (basis.empty())
+ return 0;
+ int64_t res = 1;
+ for (auto b : basis)
+ res *= b;
+ return res;
+}
+
+/// Computes and returns the linearized index of 'offsets' w.r.t. 'basis'.
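+/// E.g. (illustrative) offsets [1, 2] with basis [5, 1] linearize to 7.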
+static int64_t linearize(ArrayRef<int64_t> offsets, ArrayRef<int64_t> basis) {
+ assert(offsets.size() == basis.size());
+ int64_t linearIndex = 0;
+ for (unsigned idx = 0, e = basis.size(); idx < e; ++idx)
+ linearIndex += offsets[idx] * basis[idx];
+ return linearIndex;
+}
+
+// Clones `op` into a new operation that takes `operands` and returns
+// `resultTypes`.
+static Operation *cloneOpWithOperandsAndTypes(PatternRewriter &builder,
+ Location loc, Operation *op,
+ ArrayRef<Value> operands,
+ ArrayRef<Type> resultTypes) {
+ OperationState res(loc, op->getName().getStringRef(), operands, resultTypes,
+ op->getAttrs());
+ return builder.createOperation(res);
+}
+
+// Populates 'resultElements[indexMap[i]]' with elements from 'inputElements[i]'
+// for each index 'i' in inputElements with a valid mapping in 'indexMap'.
+static void getMappedElements(const DenseMap<int64_t, int64_t> &indexMap,
+ ArrayRef<int64_t> inputElements,
+ SmallVectorImpl<int64_t> &resultElements) {
+ assert(indexMap.size() == resultElements.size());
+ assert(inputElements.size() >= resultElements.size());
+ for (unsigned i = 0, e = inputElements.size(); i < e; ++i) {
+ auto it = indexMap.find(i);
+ if (it != indexMap.end())
+ resultElements[it->second] = inputElements[i];
+ }
+}
+
+// Returns a tuple type with vector element types for each resulting slice
+// of 'vectorType' unrolled by 'sizes' and 'strides'.
+// TODO(andydavis) Move this to a utility function and share it with
+// Extract/InsertSlicesOp verification.
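+// E.g. (illustrative) a vector<4x4xf32> with sizes = [2, 2] and unit strides
+// produces a tuple of four vector<2x2xf32> slices.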
+static TupleType generateExtractSlicesOpResultType(VectorType vectorType,
+ ArrayRef<int64_t> sizes,
+ ArrayRef<int64_t> strides,
+ PatternRewriter &builder) {
+ assert(llvm::all_of(strides, [](int64_t s) { return s == 1; }));
+ assert(static_cast<int64_t>(sizes.size()) == vectorType.getRank());
+ assert(static_cast<int64_t>(strides.size()) == vectorType.getRank());
+
+ // Compute shape ratio of 'shape' and 'sizes'.
+ auto shape = vectorType.getShape();
+ auto maybeDimSliceCounts = shapeRatio(shape, sizes);
+ assert(maybeDimSliceCounts.hasValue());
+ auto sliceDimCounts = *maybeDimSliceCounts;
+
+ // Compute strides w.r.t number of slices in each dimension.
+ auto sliceStrides = computeStrides(sliceDimCounts);
+ int64_t sliceCount = computeMaxLinearIndex(sliceDimCounts);
+ SmallVector<Type, 4> vectorTypes(sliceCount);
+ for (unsigned i = 0; i < sliceCount; ++i) {
+ auto vectorOffsets = delinearize(sliceStrides, i);
+ auto elementOffsets =
+ computeElementOffsetsFromVectorSliceOffsets(sizes, vectorOffsets);
+ auto sliceSizes = computeSliceSizes(shape, sizes, elementOffsets);
+ // Create Vector type and add to 'vectorTypes[i]'.
+ vectorTypes[i] = VectorType::get(sliceSizes, vectorType.getElementType());
+ }
+ return TupleType::get(vectorTypes, builder.getContext());
+}
+
+// UnrolledVectorState aggregates per-operand/result vector state required for
+// unrolling.
+struct UnrolledVectorState {
+ SmallVector<int64_t, 4> unrolledShape;
+ SmallVector<int64_t, 4> unrollFactors;
+ SmallVector<int64_t, 8> basis;
+ int64_t numInstances;
+ Value slicesTuple;
+};
+
+// Populates 'state' with unrolled shape, unroll factors, basis and
+// num unrolled instances for 'vectorType'.
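+// E.g. (illustrative, with an identity index map) a vector<4x4xf32> unrolled
+// to targetShape [2, 2] yields unrolledShape = [2, 2], unrollFactors = [2, 2],
+// basis = [2, 1], numInstances = 4.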
+static void initUnrolledVectorState(VectorType vectorType, Value initValue,
+ const DenseMap<int64_t, int64_t> &indexMap,
+ ArrayRef<int64_t> targetShape,
+ UnrolledVectorState &state,
+ PatternRewriter &builder) {
+ // Compute unrolled shape of 'vectorType'.
+ state.unrolledShape.resize(vectorType.getRank());
+ getMappedElements(indexMap, targetShape, state.unrolledShape);
+ // Compute unroll factors for unrolled shape.
+ auto maybeUnrollFactors =
+ shapeRatio(vectorType.getShape(), state.unrolledShape);
+ assert(maybeUnrollFactors.hasValue());
+ state.unrollFactors = *maybeUnrollFactors;
+ // Compute 'basis' and 'numInstances' based on 'state.unrollFactors'.
+ state.basis = computeStrides(state.unrollFactors);
+ state.numInstances = computeMaxLinearIndex(state.unrollFactors);
+ state.slicesTuple = nullptr;
+ if (initValue != nullptr) {
+ // Create ExtractSlicesOp.
+ SmallVector<int64_t, 4> sizes(state.unrolledShape);
+ SmallVector<int64_t, 4> strides(state.unrollFactors.size(), 1);
+ auto tupleType =
+ generateExtractSlicesOpResultType(vectorType, sizes, strides, builder);
+ state.slicesTuple = builder.create<vector::ExtractSlicesOp>(
+ initValue.getLoc(), tupleType, initValue, sizes, strides);
+ }
+}
+
+// Computes and returns the linear index of the unrolled vector at
+// 'vectorOffsets' within the vector represented by 'state'.
+static int64_t
+getUnrolledVectorLinearIndex(UnrolledVectorState &state,
+ ArrayRef<int64_t> vectorOffsets,
+ DenseMap<int64_t, int64_t> &indexMap) {
+ // Compute vector offsets.
+ SmallVector<int64_t, 4> sliceOffsets(state.unrolledShape.size());
+ getMappedElements(indexMap, vectorOffsets, sliceOffsets);
+ // Compute and return linear index of 'sliceOffsets' w.r.t 'state.basis'.
+ return linearize(sliceOffsets, state.basis);
+}
+
+// Returns an unrolled vector at 'vectorOffsets' within the vector
+// represented by 'state'. The vector is created from a slice of 'initValue'
+// if not present in 'cache'.
+static Value getOrCreateUnrolledVectorSlice(
+ Location loc, UnrolledVectorState &state, ArrayRef<int64_t> vectorOffsets,
+ ArrayRef<int64_t> offsets, DenseMap<int64_t, int64_t> &indexMap,
+ Value initValue, SmallVectorImpl<Value> &cache, PatternRewriter &builder) {
+ // Compute slice offsets.
+ SmallVector<int64_t, 4> sliceOffsets(state.unrolledShape.size());
+ getMappedElements(indexMap, offsets, sliceOffsets);
+ // TODO(b/144845578) Support non-1 strides.
+ SmallVector<int64_t, 4> sliceStrides(state.unrolledShape.size(), 1);
+ // Compute linear index of 'sliceOffsets' w.r.t 'state.basis'.
+ int64_t sliceLinearIndex =
+ getUnrolledVectorLinearIndex(state, vectorOffsets, indexMap);
+ assert(sliceLinearIndex < static_cast<int64_t>(cache.size()));
+ auto valueSlice = cache[sliceLinearIndex];
+ if (valueSlice == nullptr) {
+ // Return tuple element at 'sliceLinearIndex'.
+ auto tupleIndex = builder.getI64IntegerAttr(sliceLinearIndex);
+ auto initValueType = initValue.getType().cast<VectorType>();
+ auto vectorType =
+ VectorType::get(state.unrolledShape, initValueType.getElementType());
+ // Initialize 'cache' with slice from 'initValue'.
+ valueSlice = builder.create<vector::TupleGetOp>(
+ loc, vectorType, state.slicesTuple, tupleIndex);
+ // Store value back to 'cache'.
+ cache[sliceLinearIndex] = valueSlice;
+ }
+ return valueSlice;
+}
+
+// VectorState aggregates per-operand/result vector state required for
+// creating slices of vector operands, and clones of the operation being
+// unrolled.
+struct VectorState {
+ // The type of this vector.
+ VectorType type;
+ // Map from iteration space index to vector dimension index.
+ DenseMap<int64_t, int64_t> indexMap;
+ // Index of this value in operation's operand list (-1 if not an operand).
+ int64_t operandIndex = -1;
+ // Accumulator iterator flag.
+ bool isAcc = false;
+};
+
+//
+// unrollSingleResultStructuredOp
+//
+// Returns a value representing the result of structured operation 'op'
+// with iteration bounds 'iterationBounds' unrolled to 'targetShape'.
+// A list of VectorState objects must be specified in 'vectors', where
+// each VectorState in the list represents a vector operand or vector result
+// (if the operation does not have an accumulator operand).
+// The VectorState at index 'resultIndex' in the list must be the state
+// associated with the operations single result (i.e. either its accumulator
+// operand or vector result value).
+//
+// Example:
+//
+// // Before unrolling
+//
+// operand0 operand1 operand2
+// \ | /
+// -------------------- opA --------------------
+//
+// // After unrolling by 2
+//
+// operand0 operand1 operand2
+// / \ / \ / \
+// slice00 slice01 slice10 slice11 slice20 slice21
+// \ | | | / |
+// -------------------- opA0 -------------------- |
+// | | | |
+// \ | | /
+// -------------------- opA1 -------------------
+// | |
+// \ /
+// insertslice
+// |
+
+// TODO(andydavis) Add the following canonicalization/simplification patterns:
+// *) Add pattern which matches InsertStridedSlice -> StridedSlice and forwards
+// InsertStridedSlice operand to StridedSlice.
+// *) Add pattern which matches SourceOp -> StridedSlice -> UserOp which checks
+// if there are duplicate identical StridedSlice ops from SourceOp, and
+// rewrites itself to use the first duplicate. This transformation should
+// cause users of identical StridedSlice ops to reuse the same StridedSlice
+// operation, and leave the duplicate StridedSlice ops with no users
+// (removable with DCE).
+
+// TODO(andydavis) Generalize this to support structured ops beyond
+// vector ContractionOp, and merge it with 'unrollSingleResultOpMatchingType'
+static Value unrollSingleResultStructuredOp(Operation *op,
+ ArrayRef<int64_t> iterationBounds,
+ std::vector<VectorState> &vectors,
+ unsigned resultIndex,
+ ArrayRef<int64_t> targetShape,
+ PatternRewriter &builder) {
+ auto shapedType = op->getResult(0).getType().dyn_cast_or_null<ShapedType>();
+ if (!shapedType || !shapedType.hasStaticShape())
+ assert(false && "Expected a statically shaped result type");
+
+ // Compute unroll factors for 'iterationBounds' based on 'targetShape'
+ auto maybeUnrollFactors = shapeRatio(iterationBounds, targetShape);
+ if (!maybeUnrollFactors.hasValue())
+ assert(false && "Failed to compute unroll factors for target shape");
+ auto unrollFactors = *maybeUnrollFactors;
+
+ // Compute unrolled vector state for each vector in 'vectors'.
+ unsigned numVectors = vectors.size();
+ SmallVector<UnrolledVectorState, 3> unrolledVectorState(numVectors);
+ for (unsigned i = 0; i < numVectors; ++i) {
+ int64_t operandIndex = vectors[i].operandIndex;
+ auto operand = operandIndex >= 0 ? op->getOperand(operandIndex) : nullptr;
+ initUnrolledVectorState(vectors[i].type, operand, vectors[i].indexMap,
+ targetShape, unrolledVectorState[i], builder);
+ }
+ // Compute number of total unrolled instances.
+ auto numUnrolledInstances = computeMaxLinearIndex(unrollFactors);
+ auto sliceStrides = computeStrides(unrollFactors);
+
+ auto &resultValueState = unrolledVectorState[resultIndex];
+ auto unrolledResultType = VectorType::get(resultValueState.unrolledShape,
+ shapedType.getElementType());
+
+ // Initialize caches for intermediate vector results.
+ std::vector<SmallVector<Value, 4>> caches(numVectors);
+ for (unsigned i = 0; i < numVectors; ++i)
+ caches[i].resize(unrolledVectorState[i].numInstances);
+
+ // Unroll 'numUnrolledInstances' of 'op', storing results in 'caches'.
+ for (unsigned i = 0; i < numUnrolledInstances; ++i) {
+ auto vectorOffsets = delinearize(sliceStrides, i);
+ auto elementOffsets =
+ computeElementOffsetsFromVectorSliceOffsets(targetShape, vectorOffsets);
+ // Get cached slice (or create slice) for each operand at 'offsets'.
+ SmallVector<Value, 3> operands;
+ operands.resize(op->getNumOperands());
+ for (unsigned i = 0; i < numVectors; ++i) {
+ int64_t operandIndex = vectors[i].operandIndex;
+ if (operandIndex < 0)
+ continue; // Result state; there is no operand to slice.
+ auto operand = op->getOperand(operandIndex);
+ operands[operandIndex] = getOrCreateUnrolledVectorSlice(
+ op->getLoc(), unrolledVectorState[i], vectorOffsets, elementOffsets,
+ vectors[i].indexMap, operand, caches[i], builder);
+ }
+ // Create op on sliced vector arguments.
+ auto resultVector =
+ cloneOpWithOperandsAndTypes(builder, op->getLoc(), op, operands,
+ unrolledResultType)
+ ->getResult(0);
+
+ // Compute linear result index.
+ int64_t linearIndex = getUnrolledVectorLinearIndex(
+ resultValueState, vectorOffsets, vectors[resultIndex].indexMap);
+ // Update result cache at 'linearIndex'.
+ caches[resultIndex][linearIndex] = resultVector;
+ }
+
+ // Create TupleOp of unrolled result vectors.
+ SmallVector<Type, 4> vectorTupleTypes(resultValueState.numInstances);
+ SmallVector<Value, 4> vectorTupleValues(resultValueState.numInstances);
+ for (unsigned i = 0; i < resultValueState.numInstances; ++i) {
+ vectorTupleTypes[i] = caches[resultIndex][i].getType().cast<VectorType>();
+ vectorTupleValues[i] = caches[resultIndex][i];
+ }
+ TupleType tupleType = builder.getTupleType(vectorTupleTypes);
+ Value tupleOp = builder.create<vector::TupleOp>(op->getLoc(), tupleType,
+ vectorTupleValues);
+
+ // Create InsertSlicesOp(Tuple(result_vectors)).
+ auto resultVectorType = op->getResult(0).getType().cast<VectorType>();
+ SmallVector<int64_t, 4> sizes(resultValueState.unrolledShape);
+ SmallVector<int64_t, 4> strides(resultValueState.unrollFactors.size(), 1);
+
+ Value insertSlicesOp = builder.create<vector::InsertSlicesOp>(
+ op->getLoc(), resultVectorType, tupleOp, builder.getI64ArrayAttr(sizes),
+ builder.getI64ArrayAttr(strides));
+ return insertSlicesOp;
+}
+
+static void getVectorContractionOpUnrollState(
+ vector::ContractionOp contractionOp, ArrayRef<int64_t> targetShape,
+ SmallVectorImpl<int64_t> &iterationBounds,
+ std::vector<VectorState> &vectors, unsigned &resultIndex) {
+ // Get contraction op iteration bounds.
+ contractionOp.getIterationBounds(iterationBounds);
+ assert(iterationBounds.size() == targetShape.size());
+ // Get map from iteration space index to lhs/rhs/result shape index.
+ std::vector<DenseMap<int64_t, int64_t>> iterationIndexMapList;
+ contractionOp.getIterationIndexMap(iterationIndexMapList);
+ unsigned numIterators = iterationIndexMapList.size();
+ vectors.resize(numIterators);
+ unsigned accOperandIndex = vector::ContractionOp::getAccOperandIndex();
+ for (unsigned i = 0; i < numIterators; ++i) {
+ vectors[i].type = contractionOp.getOperand(i).getType().cast<VectorType>();
+ vectors[i].indexMap = iterationIndexMapList[i];
+ vectors[i].operandIndex = i;
+ vectors[i].isAcc = (i == accOperandIndex);
+ }
+
+ if (llvm::size(contractionOp.masks()) == 2) {
+ // Add vectors for lhs/rhs vector mask arguments. Masks have the
+ // same vector shape as the lhs/rhs args, so copy their index maps.
+ vectors.push_back({contractionOp.getLHSVectorMaskType(),
+ vectors[0].indexMap, accOperandIndex + 1, false});
+ vectors.push_back({contractionOp.getRHSVectorMaskType(),
+ vectors[1].indexMap, accOperandIndex + 2, false});
+ }
+ // Unroll 'op' 'iterationBounds' to 'targetShape'.
+ // TODO(andydavis) Use linalg style 'args_in'/'args_out' to partition
+ // 'vectors' instead of 'resultIndex'.
+ resultIndex = accOperandIndex;
+}
+
+static void
+getVectorElementwiseOpUnrollState(Operation *op, ArrayRef<int64_t> targetShape,
+ SmallVectorImpl<int64_t> &iterationBounds,
+ std::vector<VectorState> &vectors,
+ unsigned &resultIndex) {
+ // Verify that operation and operands all have the same vector shape.
+ auto resultType = op->getResult(0).getType().dyn_cast_or_null<VectorType>();
+ assert(resultType && "Expected op with vector result type");
+ auto resultShape = resultType.getShape();
+ // Verify that all operands have the same vector type as result.
+ assert(llvm::all_of(op->getOperandTypes(),
+ [=](Type type) { return type == resultType; }));
+ // Populate 'iterationBounds' with 'resultShape' for elementwise operations.
+ iterationBounds.assign(resultShape.begin(), resultShape.end());
+
+ // Create trivial elementwise identity index map based on 'resultShape'.
+ DenseMap<int64_t, int64_t> indexMap;
+ indexMap.reserve(resultShape.size());
+ for (unsigned i = 0; i < resultShape.size(); ++i)
+ indexMap[i] = i;
+
+ // Create a VectorState for each operand and for the single result.
+ unsigned numVectors = op->getNumOperands() + op->getNumResults();
+ vectors.resize(numVectors);
+ for (unsigned i = 0; i < op->getNumOperands(); ++i)
+ vectors[i] = {resultType, indexMap, i, false};
+ vectors[numVectors - 1] = {resultType, indexMap, -1, false};
+ resultIndex = numVectors - 1;
+}
+
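+// For example (illustrative): for an addf on two vector<4x4xf32> operands,
+// this populates iterationBounds = [4, 4], three VectorStates (two operands
+// plus the result) sharing the identity index map {0->0, 1->1}, and
+// resultIndex = 2.
+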
+// Entry point for unrolling declarative pattern rewrites.
+SmallVector<Value, 1> mlir::vector::unrollSingleResultOpMatchingType(
+ PatternRewriter &builder, Operation *op, ArrayRef<int64_t> targetShape) {
+ assert(op->getNumResults() == 1 && "Expected single result operation");
+
+ // Populate 'iterationBounds', 'vectors' and 'resultIndex' to unroll 'op'.
+ SmallVector<int64_t, 6> iterationBounds;
+ std::vector<VectorState> vectors;
+ unsigned resultIndex;
+
+ if (auto contractionOp = dyn_cast<vector::ContractionOp>(op)) {
+ // Populate state for vector ContractionOp.
+ getVectorContractionOpUnrollState(contractionOp, targetShape,
+ iterationBounds, vectors, resultIndex);
+ } else {
+ // Populate state for vector elementwise op.
+ getVectorElementwiseOpUnrollState(op, targetShape, iterationBounds, vectors,
+ resultIndex);
+ }
+
+ // Unroll 'op' with 'iterationBounds' to 'targetShape'.
+ return SmallVector<Value, 1>{unrollSingleResultStructuredOp(
+ op, iterationBounds, vectors, resultIndex, targetShape, builder)};
+}
+
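+// As a minimal sketch (illustrative only; 'UnrollContractionPattern' and the
+// fixed 2x2x2 target shape are assumptions, not part of this change), a
+// rewrite pattern could drive this entry point as:
+//
+//   struct UnrollContractionPattern
+//       : public OpRewritePattern<vector::ContractionOp> {
+//     using OpRewritePattern<vector::ContractionOp>::OpRewritePattern;
+//     PatternMatchResult matchAndRewrite(vector::ContractionOp op,
+//                                        PatternRewriter &rewriter) const override {
+//       auto newResults = vector::unrollSingleResultOpMatchingType(
+//           rewriter, op.getOperation(), /*targetShape=*/{2, 2, 2});
+//       rewriter.replaceOp(op, newResults);
+//       return matchSuccess();
+//     }
+//   };
+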
+/// Generates slices of 'vectorType' according to 'sizes' and 'strides', and
+/// calls 'fn' with the linear index and slice indices for each slice.
+static void
+generateTransferOpSlices(Type memrefElementType, VectorType vectorType,
+ TupleType tupleType, ArrayRef<int64_t> sizes,
+ ArrayRef<int64_t> strides, ArrayRef<Value> indices,
+ PatternRewriter &rewriter,
+ function_ref<void(unsigned, ArrayRef<Value>)> fn) {
+ // Compute strides w.r.t. slice counts in each dimension.
+ auto maybeDimSliceCounts = shapeRatio(vectorType.getShape(), sizes);
+ assert(maybeDimSliceCounts.hasValue());
+ auto sliceDimCounts = *maybeDimSliceCounts;
+ auto sliceStrides = computeStrides(sliceDimCounts);
+
+ int64_t numSlices = tupleType.size();
+ unsigned numSliceIndices = indices.size();
+ // Compute 'indexOffset' at which to update 'indices', which is equal
+ // to the memref rank (indices.size()) minus the effective 'vectorRank'.
+ // The effective 'vectorRank' is equal to the rank of the vector type
+ // minus the rank of the memref vector element type (if it has one).
+ //
+ // For example:
+ //
+ // Given memref type 'memref<6x2x1xvector<2x4xf32>>' and vector
+ // transfer_read/write ops which read/write vectors of type
+ // 'vector<2x1x2x4xf32>'. The memref rank is 3, and the effective
+ // vector rank is 4 - 2 = 2, and so 'indexOffset' = 3 - 2 = 1.
+ //
+ unsigned vectorRank = vectorType.getRank();
+ if (auto memrefVectorElementType = memrefElementType.dyn_cast<VectorType>()) {
+ assert(vectorRank >= memrefVectorElementType.getRank());
+ vectorRank -= memrefVectorElementType.getRank();
+ }
+ unsigned indexOffset = numSliceIndices - vectorRank;
+
+ auto *ctx = rewriter.getContext();
+ for (unsigned i = 0; i < numSlices; ++i) {
+ auto vectorOffsets = delinearize(sliceStrides, i);
+ auto elementOffsets =
+ computeElementOffsetsFromVectorSliceOffsets(sizes, vectorOffsets);
+ // Compute 'sliceIndices' by adding 'sliceOffsets[i]' to 'indices[i]'.
+ SmallVector<Value, 4> sliceIndices(numSliceIndices);
+ for (unsigned j = 0; j < numSliceIndices; ++j) {
+ if (j < indexOffset) {
+ sliceIndices[j] = indices[j];
+ } else {
+ auto expr = getAffineDimExpr(0, ctx) +
+ getAffineConstantExpr(elementOffsets[j - indexOffset], ctx);
+ auto map = AffineMap::get(/*dimCount=*/1, /*symbolCount=*/0, expr);
+ sliceIndices[j] = rewriter.create<AffineApplyOp>(
+ indices[j].getLoc(), map, ArrayRef<Value>(indices[j]));
+ }
+ }
+ // Call 'fn' to generate slice 'i' at 'sliceIndices'.
+ fn(i, sliceIndices);
+ }
+}
+
+/// Returns true if 'map' is a suffix of an identity affine map, false
+/// otherwise. Example: affine_map<(d0, d1, d2, d3) -> (d2, d3)>
+static bool isIdentitySuffix(AffineMap map) {
+ if (map.getNumDims() < map.getNumResults())
+ return false;
+ ArrayRef<AffineExpr> results = map.getResults();
+ Optional<int> lastPos;
+ for (unsigned i = 0, e = map.getNumResults(); i < e; ++i) {
+ auto expr = results[i].dyn_cast<AffineDimExpr>();
+ if (!expr)
+ return false;
+ int currPos = static_cast<int>(expr.getPosition());
+ if (lastPos.hasValue() && currPos != lastPos.getValue() + 1)
+ return false;
+ lastPos = currPos;
+ }
+ return true;
+}
+
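+// For example (illustrative): affine_map<(d0, d1, d2) -> (d1, d2)> is an
+// identity suffix, while affine_map<(d0, d1, d2) -> (d2, d1)> and
+// affine_map<(d0, d1) -> (d0 + d1)> are not.
+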
+namespace {
+
+// Splits vector TransferReadOp into smaller TransferReadOps based on the
+// slicing scheme of its unique ExtractSlicesOp user.
+struct SplitTransferReadOp : public OpRewritePattern<vector::TransferReadOp> {
+ using OpRewritePattern<vector::TransferReadOp>::OpRewritePattern;
+
+ PatternMatchResult matchAndRewrite(vector::TransferReadOp xferReadOp,
+ PatternRewriter &rewriter) const override {
+ // TODO(andydavis, ntv) Support splitting TransferReadOp with non-identity
+ // permutation maps. Repurpose code from MaterializeVectors transformation.
+ if (!isIdentitySuffix(xferReadOp.permutation_map()))
+ return matchFailure();
+ // Return unless the unique 'xferReadOp' user is an ExtractSlicesOp.
+ // Check for a single use first to avoid dereferencing an empty user range.
+ Value xferReadResult = xferReadOp.getResult();
+ if (!xferReadResult.hasOneUse())
+ return matchFailure();
+ auto extractSlicesOp =
+ dyn_cast<vector::ExtractSlicesOp>(*xferReadResult.getUsers().begin());
+ if (!extractSlicesOp)
+ return matchFailure();
+
+ // Get 'sizes' and 'strides' parameters from ExtractSlicesOp user.
+ auto sourceVectorType = extractSlicesOp.getSourceVectorType();
+ auto resultTupleType = extractSlicesOp.getResultTupleType();
+ SmallVector<int64_t, 4> sizes;
+ extractSlicesOp.getSizes(sizes);
+ SmallVector<int64_t, 4> strides;
+ extractSlicesOp.getStrides(strides);
+ assert(llvm::all_of(strides, [](int64_t s) { return s == 1; }));
+
+ Location loc = xferReadOp.getLoc();
+ auto memrefElementType =
+ xferReadOp.memref().getType().cast<MemRefType>().getElementType();
+ int64_t numSlices = resultTupleType.size();
+ SmallVector<Value, 4> vectorTupleValues(numSlices);
+ SmallVector<Value, 4> indices(xferReadOp.indices().begin(),
+ xferReadOp.indices().end());
+ auto createSlice = [&](unsigned index, ArrayRef<Value> sliceIndices) {
+ // Get VectorType for slice 'i'.
+ auto sliceVectorType = resultTupleType.getType(index);
+ // Create split TransferReadOp for 'sliceUser'.
+ vectorTupleValues[index] = rewriter.create<vector::TransferReadOp>(
+ loc, sliceVectorType, xferReadOp.memref(), sliceIndices,
+ xferReadOp.permutation_map(), xferReadOp.padding());
+ };
+ generateTransferOpSlices(memrefElementType, sourceVectorType,
+ resultTupleType, sizes, strides, indices, rewriter,
+ createSlice);
+
+ // Create a tuple of the split transfer read operations.
+ Value tupleOp = rewriter.create<vector::TupleOp>(loc, resultTupleType,
+ vectorTupleValues);
+ // Replace 'xferReadOp' with a new InsertSlicesOp over the tuple.
+ rewriter.replaceOpWithNewOp<vector::InsertSlicesOp>(
+ xferReadOp, sourceVectorType, tupleOp, extractSlicesOp.sizes(),
+ extractSlicesOp.strides());
+ return matchSuccess();
+ }
+};
+
+// Splits vector TransferWriteOp into smaller TransferWriteOps, one for each
+// source vector slice.
+struct SplitTransferWriteOp : public OpRewritePattern<vector::TransferWriteOp> {
+ using OpRewritePattern<vector::TransferWriteOp>::OpRewritePattern;
+
+ PatternMatchResult matchAndRewrite(vector::TransferWriteOp xferWriteOp,
+ PatternRewriter &rewriter) const override {
+ // TODO(andydavis, ntv) Support splitting TransferWriteOp with non-identity
+ // permutation maps. Repurpose code from MaterializeVectors transformation.
+ if (!isIdentitySuffix(xferWriteOp.permutation_map()))
+ return matchFailure();
+ // Return unless the 'xferWriteOp' 'vector' operand is an 'InsertSlicesOp'.
+ auto *vectorDefOp = xferWriteOp.vector().getDefiningOp();
+ auto insertSlicesOp = dyn_cast_or_null<vector::InsertSlicesOp>(vectorDefOp);
+ if (!insertSlicesOp)
+ return matchFailure();
+
+ // Get TupleOp operand of 'insertSlicesOp'.
+ auto tupleOp = dyn_cast_or_null<vector::TupleOp>(
+ insertSlicesOp.vectors().getDefiningOp());
+ if (!tupleOp)
+ return matchFailure();
+
+ // Get 'sizes' and 'strides' parameters from the defining InsertSlicesOp.
+ auto sourceTupleType = insertSlicesOp.getSourceTupleType();
+ auto resultVectorType = insertSlicesOp.getResultVectorType();
+ SmallVector<int64_t, 4> sizes;
+ insertSlicesOp.getSizes(sizes);
+ SmallVector<int64_t, 4> strides;
+ insertSlicesOp.getStrides(strides);
+
+ Location loc = xferWriteOp.getLoc();
+ auto memrefElementType =
+ xferWriteOp.memref().getType().cast<MemRefType>().getElementType();
+ SmallVector<Value, 4> indices(xferWriteOp.indices().begin(),
+ xferWriteOp.indices().end());
+ auto createSlice = [&](unsigned index, ArrayRef<Value> sliceIndices) {
+ // Create split TransferWriteOp for source vector 'tupleOp.operand[i]'.
+ rewriter.create<vector::TransferWriteOp>(
+ loc, tupleOp.getOperand(index), xferWriteOp.memref(), sliceIndices,
+ xferWriteOp.permutation_map());
+ };
+ generateTransferOpSlices(memrefElementType, resultVectorType,
+ sourceTupleType, sizes, strides, indices, rewriter,
+ createSlice);
+
+ // Erase old 'xferWriteOp'.
+ rewriter.eraseOp(xferWriteOp);
+ return matchSuccess();
+ }
+};
+
+/// Decomposes ShapeCastOp on tuple-of-vectors to multiple ShapeCastOps, each
+/// on vector types.
+struct ShapeCastOpDecomposer : public OpRewritePattern<vector::ShapeCastOp> {
+ using OpRewritePattern<vector::ShapeCastOp>::OpRewritePattern;
+
+ PatternMatchResult matchAndRewrite(vector::ShapeCastOp shapeCastOp,
+ PatternRewriter &rewriter) const override {
+ // Check if 'shapeCastOp' has tuple source/result type.
+ auto sourceTupleType =
+ shapeCastOp.source().getType().dyn_cast_or_null<TupleType>();
+ auto resultTupleType =
+ shapeCastOp.result().getType().dyn_cast_or_null<TupleType>();
+ if (!sourceTupleType || !resultTupleType)
+ return matchFailure();
+ assert(sourceTupleType.size() == resultTupleType.size());
+
+ // Create single-vector ShapeCastOp for each source tuple element.
+ Location loc = shapeCastOp.getLoc();
+ SmallVector<Value, 8> resultElements;
+ resultElements.reserve(resultTupleType.size());
+ for (unsigned i = 0, e = sourceTupleType.size(); i < e; ++i) {
+ auto sourceElement = rewriter.create<vector::TupleGetOp>(
+ loc, sourceTupleType.getType(i), shapeCastOp.source(),
+ rewriter.getI64IntegerAttr(i));
+ resultElements.push_back(rewriter.create<vector::ShapeCastOp>(
+ loc, resultTupleType.getType(i), sourceElement));
+ }
+
+ // Replace 'shapeCastOp' with tuple of 'resultElements'.
+ rewriter.replaceOpWithNewOp<vector::TupleOp>(shapeCastOp, resultTupleType,
+ resultElements);
+ return matchSuccess();
+ }
+};
+
+/// ShapeCastOpFolder folds cancelling ShapeCastOps away.
+//
+// Example:
+//
+// The following MLIR with cancelling ShapeCastOps:
+//
+// %0 = source : vector<5x4x2xf32>
+// %1 = shape_cast %0 : vector<5x4x2xf32> to vector<20x2xf32>
+// %2 = shape_cast %1 : vector<20x2xf32> to vector<5x4x2xf32>
+// %3 = user %2 : vector<5x4x2xf32>
+//
+// Should canonicalize to the following:
+//
+// %0 = source : vector<5x4x2xf32>
+// %1 = user %0 : vector<5x4x2xf32>
+//
+struct ShapeCastOpFolder : public OpRewritePattern<vector::ShapeCastOp> {
+ using OpRewritePattern<vector::ShapeCastOp>::OpRewritePattern;
+
+ PatternMatchResult matchAndRewrite(vector::ShapeCastOp shapeCastOp,
+ PatternRewriter &rewriter) const override {
+ // Check if 'shapeCastOp' has vector source/result type.
+ auto sourceVectorType =
+ shapeCastOp.source().getType().dyn_cast_or_null<VectorType>();
+ auto resultVectorType =
+ shapeCastOp.result().getType().dyn_cast_or_null<VectorType>();
+ if (!sourceVectorType || !resultVectorType)
+ return matchFailure();
+
+ // Check if shape cast op source operand is also a shape cast op.
+ auto sourceShapeCastOp = dyn_cast_or_null<vector::ShapeCastOp>(
+ shapeCastOp.source().getDefiningOp());
+ if (!sourceShapeCastOp)
+ return matchFailure();
+ auto operandSourceVectorType =
+ sourceShapeCastOp.source().getType().cast<VectorType>();
+ auto operandResultVectorType =
+ sourceShapeCastOp.result().getType().cast<VectorType>();
+
+ // Check if shape cast operations invert each other.
+ if (operandSourceVectorType != resultVectorType ||
+ operandResultVectorType != sourceVectorType)
+ return matchFailure();
+
+ rewriter.replaceOp(shapeCastOp, sourceShapeCastOp.source());
+ return matchSuccess();
+ }
+};
+
+// Pattern rewrite which forwards tuple elements to their users.
+// User(TupleGetOp(ExtractSlicesOp(InsertSlicesOp(TupleOp(Producer)))))
+// -> User(Producer)
+struct TupleGetFolderOp : public OpRewritePattern<vector::TupleGetOp> {
+ using OpRewritePattern<vector::TupleGetOp>::OpRewritePattern;
+
+ PatternMatchResult matchAndRewrite(vector::TupleGetOp tupleGetOp,
+ PatternRewriter &rewriter) const override {
+ // Return if 'tupleGetOp.vectors' arg was not defined by ExtractSlicesOp.
+ auto extractSlicesOp = dyn_cast_or_null<vector::ExtractSlicesOp>(
+ tupleGetOp.vectors().getDefiningOp());
+ if (!extractSlicesOp)
+ return matchFailure();
+
+ // Return if 'extractSlicesOp.vector' arg was not defined by InsertSlicesOp.
+ auto insertSlicesOp = dyn_cast_or_null<vector::InsertSlicesOp>(
+ extractSlicesOp.vector().getDefiningOp());
+ if (!insertSlicesOp)
+ return matchFailure();
+
+ // Return if 'insertSlicesOp.vectors' arg was not defined by TupleOp.
+ auto tupleOp = dyn_cast_or_null<vector::TupleOp>(
+ insertSlicesOp.vectors().getDefiningOp());
+ if (!tupleOp)
+ return matchFailure();
+
+ // Forward Value from 'tupleOp' at 'tupleGetOp.index'.
+ Value tupleValue = tupleOp.getOperand(tupleGetOp.getIndex());
+ rewriter.replaceOp(tupleGetOp, tupleValue);
+ return matchSuccess();
+ }
+};
+
+/// Progressive lowering of ExtractSlicesOp to tuple of StridedSliceOp.
+/// One:
+/// %x = vector.extract_slices %0
+/// is replaced by:
+/// %a = vector.strided_slice %0
+/// %b = vector.strided_slice %0
+/// ..
+/// %x = vector.tuple %a, %b, ..
+class ExtractSlicesOpLowering
+ : public OpRewritePattern<vector::ExtractSlicesOp> {
+public:
+ using OpRewritePattern<vector::ExtractSlicesOp>::OpRewritePattern;
+
+ PatternMatchResult matchAndRewrite(vector::ExtractSlicesOp op,
+ PatternRewriter &rewriter) const override {
+ auto loc = op.getLoc();
+
+ VectorType vectorType = op.getSourceVectorType();
+ auto shape = vectorType.getShape();
+
+ SmallVector<int64_t, 4> sizes;
+ op.getSizes(sizes);
+ SmallVector<int64_t, 4> strides;
+ op.getStrides(strides); // all-ones at the moment
+
+ // For each element in the tuple, generate the proper strided slice.
+ TupleType tupleType = op.getResultTupleType();
+ int64_t tupleSize = tupleType.size();
+ SmallVector<Value, 4> tupleValues(tupleSize);
+ auto sliceStrides = computeStrides(shape, sizes);
+ for (int64_t i = 0; i < tupleSize; ++i) {
+ auto vectorOffsets = delinearize(sliceStrides, i);
+ auto elementOffsets =
+ computeElementOffsetsFromVectorSliceOffsets(sizes, vectorOffsets);
+ auto sliceSizes = computeSliceSizes(shape, sizes, elementOffsets);
+ // Insert in tuple.
+ tupleValues[i] = rewriter.create<vector::StridedSliceOp>(
+ loc, op.vector(), elementOffsets, sliceSizes, strides);
+ }
+
+ rewriter.replaceOpWithNewOp<vector::TupleOp>(op, tupleType, tupleValues);
+ return matchSuccess();
+ }
+};
+
+/// Progressive lowering of InsertSlicesOp to series of InsertStridedSliceOp.
+/// One:
+/// %x = vector.insert_slices %0
+/// is replaced by:
+/// %r0 = vector.splat 0
+/// %t1 = vector.tuple_get %0, 0
+/// %r1 = vector.insert_strided_slice %r0, %t1
+/// %t2 = vector.tuple_get %0, 1
+/// %r2 = vector.insert_strided_slice %r1, %t2
+/// ..
+/// %x = ..
+class InsertSlicesOpLowering : public OpRewritePattern<vector::InsertSlicesOp> {
+public:
+ using OpRewritePattern<vector::InsertSlicesOp>::OpRewritePattern;
+
+ PatternMatchResult matchAndRewrite(vector::InsertSlicesOp op,
+ PatternRewriter &rewriter) const override {
+ auto loc = op.getLoc();
+
+ VectorType vectorType = op.getResultVectorType();
+ auto shape = vectorType.getShape();
+
+ SmallVector<int64_t, 4> sizes;
+ op.getSizes(sizes);
+ SmallVector<int64_t, 4> strides;
+ op.getStrides(strides); // all-ones at the moment
+
+ // Prepare result.
+ auto elemType = vectorType.getElementType();
+ Value zero = rewriter.create<ConstantOp>(loc, elemType,
+ rewriter.getZeroAttr(elemType));
+ Value result = rewriter.create<SplatOp>(loc, vectorType, zero);
+
+ // For each element in the tuple, extract the proper strided slice.
+ TupleType tupleType = op.getSourceTupleType();
+ int64_t tupleSize = tupleType.size();
+ auto sliceStrides = computeStrides(shape, sizes);
+ for (int64_t i = 0; i < tupleSize; ++i) {
+ auto vectorOffsets = delinearize(sliceStrides, i);
+ auto elementOffsets =
+ computeElementOffsetsFromVectorSliceOffsets(sizes, vectorOffsets);
+ // Extract from tuple into the result.
+ auto index = rewriter.getI64IntegerAttr(i);
+ auto tupleGet = rewriter.create<vector::TupleGetOp>(
+ loc, tupleType.getType(i), op.getOperand(), index);
+ result = rewriter.create<vector::InsertStridedSliceOp>(
+ loc, tupleGet, result, elementOffsets, strides);
+ }
+
+ rewriter.replaceOp(op, result);
+ return matchSuccess();
+ }
+};
+
+/// Progressive lowering of OuterProductOp.
+/// One:
+/// %x = vector.outerproduct %lhs, %rhs, %acc
+/// is replaced by:
+/// %z = zero-result
+/// %0 = vector.extract %lhs[0]
+/// %1 = vector.broadcast %0
+/// %2 = vector.extract %acc[0]
+/// %3 = vector.fma %1, %arg1, %2
+/// %4 = vector.insert %3, %z[0]
+/// ..
+/// %x = vector.insert %.., %..[N-1]
+///
+class OuterProductOpLowering : public OpRewritePattern<vector::OuterProductOp> {
+public:
+ using OpRewritePattern<vector::OuterProductOp>::OpRewritePattern;
+
+ PatternMatchResult matchAndRewrite(vector::OuterProductOp op,
+ PatternRewriter &rewriter) const override {
+ auto loc = op.getLoc();
+
+ VectorType rhsType = op.getOperandVectorTypeRHS();
+ VectorType resType = op.getVectorType();
+ Type eltType = resType.getElementType();
+ Value acc = (op.acc().empty()) ? nullptr : op.acc()[0];
+
+ Value zero = rewriter.create<ConstantOp>(loc, eltType,
+ rewriter.getZeroAttr(eltType));
+ Value result = rewriter.create<SplatOp>(loc, resType, zero);
+ for (int64_t d = 0, e = resType.getDimSize(0); d < e; ++d) {
+ auto pos = rewriter.getI64ArrayAttr(d);
+ Value x = rewriter.create<vector::ExtractOp>(loc, eltType, op.lhs(), pos);
+ Value b = rewriter.create<vector::BroadcastOp>(loc, rhsType, x);
+ Value m;
+ if (acc) {
+ Value z = rewriter.create<vector::ExtractOp>(loc, rhsType, acc, pos);
+ m = rewriter.create<vector::FMAOp>(loc, b, op.rhs(), z);
+ } else {
+ m = rewriter.create<MulFOp>(loc, b, op.rhs());
+ }
+ result = rewriter.create<vector::InsertOp>(loc, resType, m, result, pos);
+ }
+ rewriter.replaceOp(op, result);
+ return matchSuccess();
+ }
+};
+
+/// Progressive lowering of ContractionOp.
+/// One:
+/// %x = vector.contract with at least one free/batch dimension
+/// is replaced by:
+/// %a = vector.contract with one less free/batch dimension
+/// %b = vector.contract with one less free/batch dimension
+/// ..
+/// %x = combine %a %b ..
+/// until a pure contraction is reached (no free/batch dimensions),
+/// which is replaced by a fma/reduction op.
+///
+/// TODO(ajcbik): break down into transpose/reshape/cast ops
+/// when they become available to avoid code dup
+/// TODO(ajcbik): investigate lowering order impact on performance
+class ContractionOpLowering : public OpRewritePattern<vector::ContractionOp> {
+public:
+ using OpRewritePattern<vector::ContractionOp>::OpRewritePattern;
+
+ PatternMatchResult matchAndRewrite(vector::ContractionOp op,
+ PatternRewriter &rewriter) const override {
+ // TODO(ajcbik): implement masks
+ if (llvm::size(op.masks()) != 0)
+ return matchFailure();
+
+ // TODO(ntv, ajcbik): implement benefits, cost models, separate this out in
+ // a new pattern.
+ // TODO(ntv, fhahn): once row-major mode is available in LLVM's matrix
+ // intrinsics, use that.
+ if (lowerToLLVMMatrixIntrinsics &&
+ isColumnMajorMatmul(op.indexing_maps())) {
+ VectorType lhsType = op.getLhsType();
+ VectorType rhsType = op.getRhsType();
+ Type flattenedLHSType =
+ VectorType::get(lhsType.getNumElements(), lhsType.getElementType());
+ Type flattenedRHSType =
+ VectorType::get(rhsType.getNumElements(), rhsType.getElementType());
+ auto lhs = rewriter.create<vector::ShapeCastOp>(
+ op.getLoc(), flattenedLHSType, op.lhs());
+ auto rhs = rewriter.create<vector::ShapeCastOp>(
+ op.getLoc(), flattenedRHSType, op.rhs());
+
+ unsigned lhsRows = op.getLhsType().getShape()[0];
+ unsigned lhsColumns = op.getLhsType().getShape()[1];
+ unsigned rhsColumns = op.getRhsType().getShape()[1];
+ Value mul = rewriter.create<vector::MatmulOp>(
+ op.getLoc(), lhs, rhs, lhsRows, lhsColumns, rhsColumns);
+ mul = rewriter.create<vector::ShapeCastOp>(op.getLoc(),
+ op.acc().getType(), mul);
+ Type elementType = op.getLhsType().getElementType();
+ assert(elementType.isIntOrFloat());
+ if (elementType.isa<IntegerType>())
+ rewriter.replaceOpWithNewOp<AddIOp>(op, op.acc(), mul);
+ else
+ rewriter.replaceOpWithNewOp<AddFOp>(op, op.acc(), mul);
+ return matchSuccess();
+ }
+
+ // Find first batch dimension in LHS/RHS, and lower when found.
+ std::vector<std::pair<int64_t, int64_t>> batchDimMap = op.getBatchDimMap();
+ if (!batchDimMap.empty()) {
+ int64_t lhsIndex = batchDimMap[0].first;
+ int64_t rhsIndex = batchDimMap[0].second;
+ rewriter.replaceOp(op, lowerParallel(op, lhsIndex, rhsIndex, rewriter));
+ return matchSuccess();
+ }
+
+ // Collect contracting dimensions.
+ std::vector<std::pair<int64_t, int64_t>> contractingDimMap =
+ op.getContractingDimMap();
+ DenseSet<int64_t> lhsContractingDimSet;
+ DenseSet<int64_t> rhsContractingDimSet;
+ for (auto &dimPair : contractingDimMap) {
+ lhsContractingDimSet.insert(dimPair.first);
+ rhsContractingDimSet.insert(dimPair.second);
+ }
+
+ // Find first free dimension in LHS, and lower when found.
+ VectorType lhsType = op.getLhsType();
+ for (int64_t lhsIndex = 0, e = lhsType.getRank(); lhsIndex < e;
+ ++lhsIndex) {
+ if (lhsContractingDimSet.count(lhsIndex) == 0) {
+ rewriter.replaceOp(
+ op, lowerParallel(op, lhsIndex, /*rhsIndex=*/-1, rewriter));
+ return matchSuccess();
+ }
+ }
+
+ // Find first free dimension in RHS, and lower when found.
+ VectorType rhsType = op.getRhsType();
+ for (int64_t rhsIndex = 0, e = rhsType.getRank(); rhsIndex < e;
+ ++rhsIndex) {
+ if (rhsContractingDimSet.count(rhsIndex) == 0) {
+ rewriter.replaceOp(
+ op, lowerParallel(op, /*lhsIndex=*/-1, rhsIndex, rewriter));
+ return matchSuccess();
+ }
+ }
+
+ // Lower the first remaining reduction dimension.
+ if (!contractingDimMap.empty()) {
+ rewriter.replaceOp(op, lowerReduction(op, rewriter));
+ return matchSuccess();
+ }
+
+ return matchFailure();
+ }
+
+private:
+ // Lower one parallel dimension.
+ // TODO(ajcbik): consider reusing existing contract unrolling
+ Value lowerParallel(vector::ContractionOp op, int64_t lhsIndex,
+ int64_t rhsIndex, PatternRewriter &rewriter) const {
+ VectorType lhsType = op.getLhsType();
+ VectorType rhsType = op.getRhsType();
+ VectorType resType = op.getResultType().cast<VectorType>();
+ // Find the iterator type index and result index.
+ SmallVector<AffineMap, 4> iMap = op.getIndexingMaps();
+ int64_t iterIndex = -1;
+ int64_t dimSize = -1;
+ if (lhsIndex >= 0) {
+ iterIndex =
+ iMap[0].getResult(lhsIndex).cast<AffineDimExpr>().getPosition();
+ assert((rhsIndex < 0 || iterIndex == iMap[1]
+ .getResult(rhsIndex)
+ .cast<AffineDimExpr>()
+ .getPosition()) &&
+ "parallel index should be free in LHS or batch in LHS/RHS");
+ dimSize = lhsType.getDimSize(lhsIndex);
+ } else {
+ assert(rhsIndex >= 0 && "missing parallel index");
+ iterIndex =
+ iMap[1].getResult(rhsIndex).cast<AffineDimExpr>().getPosition();
+ dimSize = rhsType.getDimSize(rhsIndex);
+ }
+ assert(iterIndex >= 0 && "parallel index not listed in operand mapping");
+ Optional<int64_t> lookup = getResultIndex(iMap[2], iterIndex);
+ assert(lookup.hasValue() && "parallel index not listed in reduction");
+ int64_t resIndex = lookup.getValue();
+ // Construct new iterator types and affine map array attribute.
+ SmallVector<AffineMap, 4> lowIndexingMaps;
+ lowIndexingMaps.push_back(adjustMap(iMap[0], iterIndex, rewriter));
+ lowIndexingMaps.push_back(adjustMap(iMap[1], iterIndex, rewriter));
+ lowIndexingMaps.push_back(adjustMap(iMap[2], iterIndex, rewriter));
+ auto lowAffine = rewriter.getAffineMapArrayAttr(lowIndexingMaps);
+ auto lowIter =
+ rewriter.getArrayAttr(adjustIter(op.iterator_types(), iterIndex));
+ // Unroll into a series of lower dimensional vector.contract ops.
+ Location loc = op.getLoc();
+ Value result = zeroVector(loc, resType, rewriter);
+ for (int64_t d = 0; d < dimSize; ++d) {
+ auto lhs = reshapeLoad(loc, op.lhs(), lhsType, lhsIndex, d, rewriter);
+ auto rhs = reshapeLoad(loc, op.rhs(), rhsType, rhsIndex, d, rewriter);
+ auto acc = reshapeLoad(loc, op.acc(), resType, resIndex, d, rewriter);
+ Value lowContract = rewriter.create<vector::ContractionOp>(
+ loc, lhs, rhs, acc, lowAffine, lowIter);
+ result = reshapeStore(loc, lowContract, result, resType, resIndex, d,
+ rewriter);
+ }
+ return result;
+ }
+
+ // Lower one reduction dimension.
+ Value lowerReduction(vector::ContractionOp op,
+ PatternRewriter &rewriter) const {
+ auto loc = op.getLoc();
+ VectorType lhsType = op.getLhsType();
+ VectorType rhsType = op.getRhsType();
+ Type resType = op.getResultType();
+ assert(!resType.isa<VectorType>());
+ // Use iterator index 0.
+ int64_t iterIndex = 0;
+ SmallVector<AffineMap, 4> iMap = op.getIndexingMaps();
+ Optional<int64_t> lookupLhs = getResultIndex(iMap[0], iterIndex);
+ Optional<int64_t> lookupRhs = getResultIndex(iMap[1], iterIndex);
+ assert(lookupLhs.hasValue() && "missing LHS parallel index");
+ assert(lookupRhs.hasValue() && "missing RHS parallel index");
+ int64_t lhsIndex = lookupLhs.getValue();
+ int64_t rhsIndex = lookupRhs.getValue();
+ int64_t dimSize = lhsType.getDimSize(lhsIndex);
+ assert(dimSize == rhsType.getDimSize(rhsIndex) && "corrupt shape");
+ // Base case.
+ if (lhsType.getRank() == 1) {
+ assert(rhsType.getRank() == 1 && "corrupt contraction");
+ Value zero = zeroVector(loc, lhsType, rewriter);
+ Value fma = rewriter.create<vector::FMAOp>(loc, op.lhs(), op.rhs(), zero);
+ StringAttr kind = rewriter.getStringAttr("add");
+ return rewriter.create<vector::ReductionOp>(loc, resType, kind, fma,
+ op.acc());
+ }
+ // Construct new iterator types and affine map array attribute.
+ SmallVector<AffineMap, 4> lowIndexingMaps;
+ lowIndexingMaps.push_back(adjustMap(iMap[0], iterIndex, rewriter));
+ lowIndexingMaps.push_back(adjustMap(iMap[1], iterIndex, rewriter));
+ lowIndexingMaps.push_back(adjustMap(iMap[2], iterIndex, rewriter));
+ auto lowAffine = rewriter.getAffineMapArrayAttr(lowIndexingMaps);
+ auto lowIter =
+ rewriter.getArrayAttr(adjustIter(op.iterator_types(), iterIndex));
+ // Unroll into a series of lower dimensional vector.contract ops.
+ // By feeding the initial accumulator into the first contraction,
+ // and the result of each contraction into the next, eventually
+ // the sum of all reductions is computed.
+ Value result = op.acc();
+ for (int64_t d = 0; d < dimSize; ++d) {
+ auto lhs = reshapeLoad(loc, op.lhs(), lhsType, lhsIndex, d, rewriter);
+ auto rhs = reshapeLoad(loc, op.rhs(), rhsType, rhsIndex, d, rewriter);
+ result = rewriter.create<vector::ContractionOp>(loc, lhs, rhs, result,
+ lowAffine, lowIter);
+ }
+ return result;
+ }
+
+ // Helper method to construct a zero vector.
+ static Value zeroVector(Location loc, VectorType vType,
+ PatternRewriter &rewriter) {
+ Type eltType = vType.getElementType();
+ Value zero = rewriter.create<ConstantOp>(loc, eltType,
+ rewriter.getZeroAttr(eltType));
+ return rewriter.create<SplatOp>(loc, vType, zero);
+ }
+
+ // Helper to find an index in an affine map.
+ static Optional<int64_t> getResultIndex(AffineMap map, int64_t index) {
+ for (int64_t i = 0, e = map.getNumResults(); i < e; ++i) {
+ int64_t idx = map.getResult(i).cast<AffineDimExpr>().getPosition();
+ if (idx == index)
+ return i;
+ }
+ return None;
+ }
+
+ // Helper to construct iterator types with one index removed.
+ static SmallVector<Attribute, 4> adjustIter(ArrayAttr iteratorTypes,
+ int64_t index) {
+ SmallVector<Attribute, 4> results;
+ for (auto it : llvm::enumerate(iteratorTypes)) {
+ int64_t idx = it.index();
+ if (idx == index)
+ continue;
+ results.push_back(it.value());
+ }
+ return results;
+ }
+
+ // Helper to construct an affine map with one index removed.
+ static AffineMap adjustMap(AffineMap map, int64_t index,
+ PatternRewriter &rewriter) {
+ auto *ctx = rewriter.getContext();
+ SmallVector<AffineExpr, 4> results;
+ for (int64_t i = 0, e = map.getNumResults(); i < e; ++i) {
+ int64_t idx = map.getResult(i).cast<AffineDimExpr>().getPosition();
+ if (idx == index)
+ continue;
+ // Re-insert remaining indices, but renamed when occurring
+ // after the removed index.
+ auto targetExpr = getAffineDimExpr(idx < index ? idx : idx - 1, ctx);
+ results.push_back(targetExpr);
+ }
+ // The (...) -> () affine map has its own factory method.
+ return results.empty() ? AffineMap::get(map.getNumDims() - 1, 0, ctx)
+ : AffineMap::get(map.getNumDims() - 1, 0, results);
+ }
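+
+ // For example (illustrative): adjustMap((d0, d1, d2) -> (d0, d2),
+ // /*index=*/1) drops the d1 result and renumbers the remaining dims,
+ // yielding (d0, d1) -> (d0, d1).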
+
+ // Helper to drop dimension from vector type.
+ static Type adjustType(VectorType tp, int64_t index) {
+ int64_t rank = tp.getRank();
+ Type eltType = tp.getElementType();
+ if (rank == 1) {
+ assert(index == 0 && "index for scalar result out of bounds");
+ return eltType;
+ }
+ SmallVector<int64_t, 4> adjustedShape;
+ for (int64_t i = 0; i < rank; ++i) {
+ // Omit dimension at the given index.
+ if (i == index)
+ continue;
+ // Otherwise, add dimension back.
+ adjustedShape.push_back(tp.getDimSize(i));
+ }
+ return VectorType::get(adjustedShape, eltType);
+ }
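+
+ // For example (illustrative): adjustType(vector<2x3x4xf32>, /*index=*/1)
+ // drops the middle dimension and returns vector<2x4xf32>, while
+ // adjustType(vector<8xf32>, 0) returns the scalar element type f32.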
+
+ // Helper method to possibly drop a dimension in a load.
+ // TODO(ajcbik): use a reshaping vector load (and share lowering code)
+ static Value reshapeLoad(Location loc, Value val, VectorType type,
+ int64_t index, int64_t pos,
+ PatternRewriter &rewriter) {
+ if (index == -1)
+ return val;
+ Type lowType = adjustType(type, 0);
+ // At extraction dimension?
+ if (index == 0) {
+ auto posAttr = rewriter.getI64ArrayAttr(pos);
+ return rewriter.create<vector::ExtractOp>(loc, lowType, val, posAttr);
+ }
+ // Unroll leading dimensions.
+ VectorType vType = lowType.cast<VectorType>();
+ VectorType resType = adjustType(type, index).cast<VectorType>();
+ Value result = zeroVector(loc, resType, rewriter);
+ for (int64_t d = 0, e = resType.getDimSize(0); d < e; d++) {
+ auto posAttr = rewriter.getI64ArrayAttr(d);
+ Value ext = rewriter.create<vector::ExtractOp>(loc, vType, val, posAttr);
+ Value load = reshapeLoad(loc, ext, vType, index - 1, pos, rewriter);
+ result = rewriter.create<vector::InsertOp>(loc, resType, load, result,
+ posAttr);
+ }
+ return result;
+ }
+
+ // Helper method to possibly drop a dimension in a store.
+ // TODO(ajcbik): use a reshaping vector store (and share lowering code)
+ static Value reshapeStore(Location loc, Value val, Value result,
+ VectorType type, int64_t index, int64_t pos,
+ PatternRewriter &rewriter) {
+ // Unmodified?
+ if (index == -1)
+ return val;
+ // At insertion dimension?
+ if (index == 0) {
+ auto posAttr = rewriter.getI64ArrayAttr(pos);
+ return rewriter.create<vector::InsertOp>(loc, type, val, result, posAttr);
+ }
+ // Unroll leading dimensions.
+ Type lowType = adjustType(type, 0);
+ VectorType vType = lowType.cast<VectorType>();
+ Type insType = adjustType(vType, 0);
+ for (int64_t d = 0, e = type.getDimSize(0); d < e; d++) {
+ auto posAttr = rewriter.getI64ArrayAttr(d);
+ Value ext =
+ rewriter.create<vector::ExtractOp>(loc, vType, result, posAttr);
+ Value ins =
+ rewriter.create<vector::ExtractOp>(loc, insType, val, posAttr);
+ Value sto = reshapeStore(loc, ins, ext, vType, index - 1, pos, rewriter);
+ result =
+ rewriter.create<vector::InsertOp>(loc, type, sto, result, posAttr);
+ }
+ return result;
+ }
+};
+
+/// ShapeCastOp 2D -> 1D downcast serves the purpose of flattening 2-D to 1-D
+/// vectors progressively on the way to targeting llvm.matrix intrinsics.
+/// This iterates over the most major dimension of the 2-D vector and performs
+/// rewrites into:
+/// vector.extract from 2-D + vector.insert_strided_slice offset into 1-D
+class ShapeCastOp2DDownCastRewritePattern
+ : public OpRewritePattern<vector::ShapeCastOp> {
+public:
+ using OpRewritePattern<vector::ShapeCastOp>::OpRewritePattern;
+
+ PatternMatchResult matchAndRewrite(vector::ShapeCastOp op,
+ PatternRewriter &rewriter) const override {
+ auto sourceVectorType = op.getSourceVectorType();
+ auto resultVectorType = op.getResultVectorType();
+ if (sourceVectorType.getRank() != 2 || resultVectorType.getRank() != 1)
+ return matchFailure();
+
+ auto loc = op.getLoc();
+ auto elemType = sourceVectorType.getElementType();
+ Value zero = rewriter.create<ConstantOp>(loc, elemType,
+ rewriter.getZeroAttr(elemType));
+ Value desc = rewriter.create<SplatOp>(loc, resultVectorType, zero);
+ unsigned mostMinorVectorSize = sourceVectorType.getShape()[1];
+ for (int64_t i = 0, e = sourceVectorType.getShape().front(); i != e; ++i) {
+ Value vec = rewriter.create<vector::ExtractOp>(loc, op.source(), i);
+ desc = rewriter.create<vector::InsertStridedSliceOp>(
+ loc, vec, desc,
+ /*offsets=*/i * mostMinorVectorSize, /*strides=*/1);
+ }
+ rewriter.replaceOp(op, desc);
+ return matchSuccess();
+ }
+};
+
+/// ShapeCastOp 1D -> 2D upcast serves the purpose of unflattening 1-D back
+/// into 2-D vectors progressively after targeting llvm.matrix intrinsics.
+/// This iterates over the most major dimension of the 2-D vector and performs
+/// rewrites into:
+/// vector.strided_slice from 1-D + vector.insert into 2-D
+class ShapeCastOp2DUpCastRewritePattern
+ : public OpRewritePattern<vector::ShapeCastOp> {
+public:
+ using OpRewritePattern<vector::ShapeCastOp>::OpRewritePattern;
+
+ PatternMatchResult matchAndRewrite(vector::ShapeCastOp op,
+ PatternRewriter &rewriter) const override {
+ auto sourceVectorType = op.getSourceVectorType();
+ auto resultVectorType = op.getResultVectorType();
+ if (sourceVectorType.getRank() != 1 || resultVectorType.getRank() != 2)
+ return matchFailure();
+
+ auto loc = op.getLoc();
+ auto elemType = sourceVectorType.getElementType();
+ Value zero = rewriter.create<ConstantOp>(loc, elemType,
+ rewriter.getZeroAttr(elemType));
+ Value desc = rewriter.create<SplatOp>(loc, resultVectorType, zero);
+ unsigned mostMinorVectorSize = resultVectorType.getShape()[1];
+ for (int64_t i = 0, e = resultVectorType.getShape().front(); i != e; ++i) {
+ Value vec = rewriter.create<vector::StridedSliceOp>(
+ loc, op.source(), /*offsets=*/i * mostMinorVectorSize,
+ /*sizes=*/mostMinorVectorSize,
+ /*strides=*/1);
+ desc = rewriter.create<vector::InsertOp>(loc, vec, desc, i);
+ }
+ rewriter.replaceOp(op, desc);
+ return matchSuccess();
+ }
+};
+
+} // namespace
+
+// TODO(andydavis) Add pattern to rewrite ExtractSlices(ConstantMaskOp).
+// TODO(andydavis) Add this as DRR pattern.
+void mlir::vector::populateVectorToVectorTransformationPatterns(
+ OwningRewritePatternList &patterns, MLIRContext *context) {
+ patterns.insert<ShapeCastOpDecomposer, ShapeCastOpFolder, SplitTransferReadOp,
+ SplitTransferWriteOp, TupleGetFolderOp>(context);
+}
+
+void mlir::vector::populateVectorSlicesLoweringPatterns(
+ OwningRewritePatternList &patterns, MLIRContext *context) {
+ patterns.insert<ExtractSlicesOpLowering, InsertSlicesOpLowering>(context);
+}
+
+void mlir::vector::populateVectorContractLoweringPatterns(
+ OwningRewritePatternList &patterns, MLIRContext *context) {
+ patterns.insert<ContractionOpLowering, ShapeCastOp2DDownCastRewritePattern,
+ ShapeCastOp2DUpCastRewritePattern, OuterProductOpLowering>(
+ context);
+}
diff --git a/mlir/lib/Dialect/VectorOps/VectorUtils.cpp b/mlir/lib/Dialect/VectorOps/VectorUtils.cpp
new file mode 100644
index 000000000000..46a990080a4f
--- /dev/null
+++ b/mlir/lib/Dialect/VectorOps/VectorUtils.cpp
@@ -0,0 +1,278 @@
+//===- VectorUtils.cpp - MLIR Utilities for VectorOps ------------------===//
+//
+// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements utility methods for working with the VectorOps dialect.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/VectorOps/VectorUtils.h"
+#include "mlir/Analysis/LoopAnalysis.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/StandardOps/IR/Ops.h"
+#include "mlir/Dialect/VectorOps/VectorOps.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/IntegerSet.h"
+#include "mlir/IR/Operation.h"
+#include "mlir/Support/Functional.h"
+#include "mlir/Support/LLVM.h"
+#include "mlir/Support/MathExtras.h"
+#include "mlir/Support/STLExtras.h"
+
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/SetVector.h"
+
+using llvm::SetVector;
+
+namespace mlir {
+
+SmallVector<int64_t, 4> computeStrides(ArrayRef<int64_t> shape,
+ ArrayRef<int64_t> sizes) {
+ int64_t rank = shape.size();
+ // Compute the count for each dimension.
+ SmallVector<int64_t, 4> sliceDimCounts(rank);
+ for (int64_t r = 0; r < rank; ++r)
+ sliceDimCounts[r] = ceilDiv(shape[r], sizes[r]);
+ // Use that to compute the slice stride for each dimension.
+ SmallVector<int64_t, 4> sliceStrides(rank);
+ sliceStrides[rank - 1] = 1;
+ for (int64_t r = rank - 2; r >= 0; --r)
+ sliceStrides[r] = sliceStrides[r + 1] * sliceDimCounts[r + 1];
+ return sliceStrides;
+}
+
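+// For example (illustrative): shape = [4, 6, 8] with sizes = [2, 3, 4] gives
+// per-dimension slice counts [2, 2, 2] and slice strides [4, 2, 1]; with
+// those strides, delinearize(strides, 5) below recovers the vector offsets
+// [1, 0, 1].
+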
+SmallVector<int64_t, 4> delinearize(ArrayRef<int64_t> sliceStrides,
+ int64_t index) {
+ int64_t rank = sliceStrides.size();
+ SmallVector<int64_t, 4> vectorOffsets(rank);
+ for (int64_t r = 0; r < rank; ++r) {
+ assert(sliceStrides[r] > 0);
+ vectorOffsets[r] = index / sliceStrides[r];
+ index %= sliceStrides[r];
+ }
+ return vectorOffsets;
+}
+
+SmallVector<int64_t, 4>
+computeElementOffsetsFromVectorSliceOffsets(ArrayRef<int64_t> sizes,
+ ArrayRef<int64_t> vectorOffsets) {
+ return functional::zipMap([](int64_t v1, int64_t v2) { return v1 * v2; },
+ vectorOffsets, sizes);
+}
+
+SmallVector<int64_t, 4> computeSliceSizes(ArrayRef<int64_t> shape,
+ ArrayRef<int64_t> sizes,
+ ArrayRef<int64_t> elementOffsets) {
+ int64_t rank = shape.size();
+ SmallVector<int64_t, 4> sliceSizes(rank);
+ for (unsigned r = 0; r < rank; ++r)
+ sliceSizes[r] = std::min(sizes[r], shape[r] - elementOffsets[r]);
+ return sliceSizes;
+}
+
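+// For example (illustrative): shape = [5, 4], sizes = [2, 2] and
+// elementOffsets = [4, 0] clamp the trailing slice to sliceSizes = [1, 2].
+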
+Optional<SmallVector<int64_t, 4>> shapeRatio(ArrayRef<int64_t> superShape,
+ ArrayRef<int64_t> subShape) {
+ if (superShape.size() < subShape.size()) {
+ return Optional<SmallVector<int64_t, 4>>();
+ }
+
+ // Starting from the end, compute the integer divisors.
+ // Clear the boolean `divides` if integral division is not possible.
+ std::vector<int64_t> result;
+ result.reserve(superShape.size());
+ bool divides = true;
+ auto divide = [&divides, &result](int superSize, int subSize) {
+ assert(superSize > 0 && "superSize must be > 0");
+ assert(subSize > 0 && "subSize must be > 0");
+ divides &= (superSize % subSize == 0);
+ result.push_back(superSize / subSize);
+ };
+ functional::zipApply(
+ divide, SmallVector<int64_t, 8>{superShape.rbegin(), superShape.rend()},
+ SmallVector<int64_t, 8>{subShape.rbegin(), subShape.rend()});
+
+ // If integral division does not occur, return and let the caller decide.
+ if (!divides) {
+ return None;
+ }
+
+ // At this point we computed the ratio (in reverse) for the common
+ // size. Fill with the remaining entries from the super-vector shape (still in
+ // reverse).
+ int commonSize = subShape.size();
+ std::copy(superShape.rbegin() + commonSize, superShape.rend(),
+ std::back_inserter(result));
+
+ assert(result.size() == superShape.size() &&
+ "super to sub shape ratio is not of the same size as the super rank");
+
+ // Reverse again to get it back in the proper order and return.
+ return SmallVector<int64_t, 4>{result.rbegin(), result.rend()};
+}
+
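+// For example (illustrative): shapeRatio([4, 8, 16], [2, 4]) returns
+// [4, 4, 4] (16/4 and 8/2 for the common suffix, with the leading super
+// dimension copied through), while shapeRatio([4, 8, 15], [2, 4]) returns
+// None since 15 is not a multiple of 4.
+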
+Optional<SmallVector<int64_t, 4>> shapeRatio(VectorType superVectorType,
+ VectorType subVectorType) {
+ assert(superVectorType.getElementType() == subVectorType.getElementType() &&
+ "vector types must be of the same elemental type");
+ return shapeRatio(superVectorType.getShape(), subVectorType.getShape());
+}
+
+/// Constructs a permutation map from memref indices to vector dimension.
+///
+/// The implementation uses the knowledge of the mapping of enclosing loop to
+/// vector dimension. `enclosingLoopToVectorDim` carries this information as a
+/// map with:
+/// - keys representing "vectorized enclosing loops";
+/// - values representing the corresponding vector dimension.
+/// The algorithm traverses "vectorized enclosing loops" and extracts the
+/// at-most-one MemRef index that varies along said loop. This index is
+/// guaranteed to be at most one by construction: otherwise the MemRef is not
+/// vectorizable.
+/// If such a varying index is found, it is added to the permutation_map at the
+/// proper vector dimension.
+/// If no index is found to vary along the loop, 0 is added to the
+/// permutation_map instead, which corresponds to a vector broadcast along that
+/// dimension.
+///
+/// Returns an empty AffineMap if `enclosingLoopToVectorDim` is empty,
+/// signalling that no permutation map can be constructed given
+/// `enclosingLoopToVectorDim`.
+///
+/// Examples can be found in the documentation of `makePermutationMap`, in the
+/// header file.
+static AffineMap makePermutationMap(
+ ArrayRef<Value> indices,
+ const DenseMap<Operation *, unsigned> &enclosingLoopToVectorDim) {
+ if (enclosingLoopToVectorDim.empty())
+ return AffineMap();
+ MLIRContext *context =
+ enclosingLoopToVectorDim.begin()->getFirst()->getContext();
+ using functional::makePtrDynCaster;
+ using functional::map;
+ SmallVector<AffineExpr, 4> perm(enclosingLoopToVectorDim.size(),
+ getAffineConstantExpr(0, context));
+
+ for (auto kvp : enclosingLoopToVectorDim) {
+ assert(kvp.second < perm.size());
+ auto invariants = getInvariantAccesses(
+ cast<AffineForOp>(kvp.first).getInductionVar(), indices);
+ unsigned numIndices = indices.size();
+ unsigned countInvariantIndices = 0;
+ for (unsigned dim = 0; dim < numIndices; ++dim) {
+ if (!invariants.count(indices[dim])) {
+ assert(perm[kvp.second] == getAffineConstantExpr(0, context) &&
+ "permutationMap already has an entry along dim");
+ perm[kvp.second] = getAffineDimExpr(dim, context);
+ } else {
+ ++countInvariantIndices;
+ }
+ }
+ assert((countInvariantIndices == numIndices ||
+ countInvariantIndices == numIndices - 1) &&
+ "Vectorization prerequisite violated: at most 1 index may be "
+ "invariant wrt a vectorized loop");
+ }
+ return AffineMap::get(indices.size(), 0, perm);
+}
+
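+// For example (illustrative): for memref indices (%c0, %i) where the single
+// vectorized enclosing loop over %i maps to vector dimension 0, %c0 is
+// invariant and %i varies, so the resulting map is (d0, d1) -> (d1).
+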
+/// Implementation detail that walks up the parents and records the ones with
+/// the specified type.
+/// TODO(ntv): could also be implemented as a collect parents followed by a
+/// filter and made available outside this file.
+template <typename T>
+static SetVector<Operation *> getParentsOfType(Operation *op) {
+ SetVector<Operation *> res;
+ auto *current = op;
+ while (auto *parent = current->getParentOp()) {
+ if (auto typedParent = dyn_cast<T>(parent)) {
+ assert(res.count(parent) == 0 && "Already inserted");
+ res.insert(parent);
+ }
+ current = parent;
+ }
+ return res;
+}
+
+/// Returns the enclosing AffineForOps, from closest to farthest.
+static SetVector<Operation *> getEnclosingforOps(Operation *op) {
+ return getParentsOfType<AffineForOp>(op);
+}
+
+AffineMap
+makePermutationMap(Operation *op, ArrayRef<Value> indices,
+ const DenseMap<Operation *, unsigned> &loopToVectorDim) {
+ DenseMap<Operation *, unsigned> enclosingLoopToVectorDim;
+ auto enclosingLoops = getEnclosingforOps(op);
+ for (auto *forInst : enclosingLoops) {
+ auto it = loopToVectorDim.find(forInst);
+ if (it != loopToVectorDim.end()) {
+ enclosingLoopToVectorDim.insert(*it);
+ }
+ }
+ return makePermutationMap(indices, enclosingLoopToVectorDim);
+}
+
+bool matcher::operatesOnSuperVectorsOf(Operation &op,
+ VectorType subVectorType) {
+ // First, extract the vector type and distinguish between:
+ // a. ops that *must* lower a super-vector (i.e. vector.transfer_read,
+ // vector.transfer_write); and
+ // b. ops that *may* lower a super-vector (all other ops).
+ // The ops that *may* lower a super-vector only do so if the super-vector to
+ // sub-vector ratio exists. The ops that *must* lower a super-vector are
+ // explicitly checked for this property.
+ /// TODO(ntv): there should be a single function for all ops to do this so we
+ /// do not have to special case. Maybe a trait, or just a method, unclear atm.
+ bool mustDivide = false;
+ (void)mustDivide;
+ VectorType superVectorType;
+ if (auto read = dyn_cast<vector::TransferReadOp>(op)) {
+ superVectorType = read.getVectorType();
+ mustDivide = true;
+ } else if (auto write = dyn_cast<vector::TransferWriteOp>(op)) {
+ superVectorType = write.getVectorType();
+ mustDivide = true;
+ } else if (op.getNumResults() == 0) {
+ if (!isa<ReturnOp>(op)) {
+ op.emitError("NYI: assuming only return operations can have 0 "
+ " results at this point");
+ }
+ return false;
+ } else if (op.getNumResults() == 1) {
+ if (auto v = op.getResult(0).getType().dyn_cast<VectorType>()) {
+ superVectorType = v;
+ } else {
+ // Not a vector type.
+ return false;
+ }
+ } else {
+ // Not a vector.transfer and has more than 1 result, fail hard for now to
+ // wake us up when something changes.
+ op.emitError("NYI: operation has more than 1 result");
+ return false;
+ }
+
+ // Get the ratio.
+ auto ratio = shapeRatio(superVectorType, subVectorType);
+
+ // Sanity check.
+ assert((ratio.hasValue() || !mustDivide) &&
+ "vector.transfer operation in which super-vector size is not an"
+ " integer multiple of sub-vector size");
+
+ // This catches cases that are not strictly necessary to have multiplicity
+ // but still aren't divisible by the sub-vector shape.
+ // This could be useful information if we wanted to reshape at the level of
+ // the vector type (but we would have to look at the compute and distinguish
+ // between parallel, reduction, and possibly other cases).
+ if (!ratio.hasValue()) {
+ return false;
+ }
+
+ return true;
+}
+
+} // namespace mlir
diff --git a/mlir/lib/Transforms/CMakeLists.txt b/mlir/lib/Transforms/CMakeLists.txt
index 6d1d7b41d568..ac2bafb39e3a 100644
--- a/mlir/lib/Transforms/CMakeLists.txt
+++ b/mlir/lib/Transforms/CMakeLists.txt
@@ -1,8 +1,6 @@
add_subdirectory(Utils)
add_mlir_library(MLIRTransforms
- AffineDataCopyGeneration.cpp
- AffineLoopInvariantCodeMotion.cpp
Canonicalizer.cpp
CSE.cpp
DialectConversion.cpp
@@ -17,7 +15,6 @@ add_mlir_library(MLIRTransforms
MemRefDataFlowOpt.cpp
OpStats.cpp
PipelineDataTransfer.cpp
- SimplifyAffineStructures.cpp
StripDebugInfo.cpp
SymbolDCE.cpp
Vectorize.cpp
@@ -33,7 +30,7 @@ add_mlir_library(MLIRTransforms
target_link_libraries(MLIRTransforms
PUBLIC
- MLIRAffineOps
+ MLIRAffine
MLIRAnalysis
MLIRLoopLikeInterface
MLIRLoopOps
diff --git a/mlir/lib/Transforms/LoopFusion.cpp b/mlir/lib/Transforms/LoopFusion.cpp
index 378e91a214bd..2f08f95261f2 100644
--- a/mlir/lib/Transforms/LoopFusion.cpp
+++ b/mlir/lib/Transforms/LoopFusion.cpp
@@ -14,7 +14,7 @@
#include "mlir/Analysis/AffineStructures.h"
#include "mlir/Analysis/LoopAnalysis.h"
#include "mlir/Analysis/Utils.h"
-#include "mlir/Dialect/AffineOps/AffineOps.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/AffineMap.h"
#include "mlir/IR/Builders.h"
diff --git a/mlir/lib/Transforms/LoopTiling.cpp b/mlir/lib/Transforms/LoopTiling.cpp
index 0c411144df9c..49bcc5897c48 100644
--- a/mlir/lib/Transforms/LoopTiling.cpp
+++ b/mlir/lib/Transforms/LoopTiling.cpp
@@ -14,7 +14,7 @@
#include "mlir/Analysis/AffineStructures.h"
#include "mlir/Analysis/LoopAnalysis.h"
#include "mlir/Analysis/Utils.h"
-#include "mlir/Dialect/AffineOps/AffineOps.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/IR/Builders.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/LoopUtils.h"
diff --git a/mlir/lib/Transforms/LoopUnroll.cpp b/mlir/lib/Transforms/LoopUnroll.cpp
index 2083a1226879..ca1ed7729345 100644
--- a/mlir/lib/Transforms/LoopUnroll.cpp
+++ b/mlir/lib/Transforms/LoopUnroll.cpp
@@ -13,7 +13,7 @@
#include "mlir/Transforms/Passes.h"
#include "mlir/Analysis/LoopAnalysis.h"
-#include "mlir/Dialect/AffineOps/AffineOps.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/AffineMap.h"
#include "mlir/IR/Builders.h"
diff --git a/mlir/lib/Transforms/LoopUnrollAndJam.cpp b/mlir/lib/Transforms/LoopUnrollAndJam.cpp
index 4629e2c94bd4..7d4be3490114 100644
--- a/mlir/lib/Transforms/LoopUnrollAndJam.cpp
+++ b/mlir/lib/Transforms/LoopUnrollAndJam.cpp
@@ -35,7 +35,7 @@
#include "mlir/Transforms/Passes.h"
#include "mlir/Analysis/LoopAnalysis.h"
-#include "mlir/Dialect/AffineOps/AffineOps.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/AffineMap.h"
#include "mlir/IR/BlockAndValueMapping.h"
diff --git a/mlir/lib/Transforms/MemRefDataFlowOpt.cpp b/mlir/lib/Transforms/MemRefDataFlowOpt.cpp
index c1128c949baf..eaf5c744723c 100644
--- a/mlir/lib/Transforms/MemRefDataFlowOpt.cpp
+++ b/mlir/lib/Transforms/MemRefDataFlowOpt.cpp
@@ -16,7 +16,7 @@
#include "mlir/Analysis/AffineAnalysis.h"
#include "mlir/Analysis/Dominance.h"
#include "mlir/Analysis/Utils.h"
-#include "mlir/Dialect/AffineOps/AffineOps.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/Passes.h"
diff --git a/mlir/lib/Transforms/PipelineDataTransfer.cpp b/mlir/lib/Transforms/PipelineDataTransfer.cpp
index 39874b1bc44a..df7bafc4b90b 100644
--- a/mlir/lib/Transforms/PipelineDataTransfer.cpp
+++ b/mlir/lib/Transforms/PipelineDataTransfer.cpp
@@ -15,7 +15,7 @@
#include "mlir/Analysis/AffineAnalysis.h"
#include "mlir/Analysis/LoopAnalysis.h"
#include "mlir/Analysis/Utils.h"
-#include "mlir/Dialect/AffineOps/AffineOps.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/IR/Builders.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/LoopUtils.h"
diff --git a/mlir/lib/Transforms/Utils/CMakeLists.txt b/mlir/lib/Transforms/Utils/CMakeLists.txt
index 1e0442179bf4..e28a97c18231 100644
--- a/mlir/lib/Transforms/Utils/CMakeLists.txt
+++ b/mlir/lib/Transforms/Utils/CMakeLists.txt
@@ -16,7 +16,7 @@ add_mlir_library(MLIRTransformUtils
target_link_libraries(MLIRTransformUtils
PUBLIC
- MLIRAffineOps
+ MLIRAffine
MLIRAnalysis
MLIRLoopAnalysis
MLIRLoopOps
diff --git a/mlir/lib/Transforms/Utils/LoopFusionUtils.cpp b/mlir/lib/Transforms/Utils/LoopFusionUtils.cpp
index 47a3fcf3d519..9ed4283101af 100644
--- a/mlir/lib/Transforms/Utils/LoopFusionUtils.cpp
+++ b/mlir/lib/Transforms/Utils/LoopFusionUtils.cpp
@@ -16,7 +16,7 @@
#include "mlir/Analysis/AffineStructures.h"
#include "mlir/Analysis/LoopAnalysis.h"
#include "mlir/Analysis/Utils.h"
-#include "mlir/Dialect/AffineOps/AffineOps.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/AffineMap.h"
#include "mlir/IR/BlockAndValueMapping.h"
diff --git a/mlir/lib/Transforms/Utils/LoopUtils.cpp b/mlir/lib/Transforms/Utils/LoopUtils.cpp
index 96b4e72eff48..4f6f2b748fec 100644
--- a/mlir/lib/Transforms/Utils/LoopUtils.cpp
+++ b/mlir/lib/Transforms/Utils/LoopUtils.cpp
@@ -16,7 +16,7 @@
#include "mlir/Analysis/LoopAnalysis.h"
#include "mlir/Analysis/SliceAnalysis.h"
#include "mlir/Analysis/Utils.h"
-#include "mlir/Dialect/AffineOps/AffineOps.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/LoopOps/LoopOps.h"
#include "mlir/IR/AffineMap.h"
#include "mlir/IR/BlockAndValueMapping.h"
diff --git a/mlir/lib/Transforms/Utils/Utils.cpp b/mlir/lib/Transforms/Utils/Utils.cpp
index 844befecea36..4ab773eb38df 100644
--- a/mlir/lib/Transforms/Utils/Utils.cpp
+++ b/mlir/lib/Transforms/Utils/Utils.cpp
@@ -18,7 +18,7 @@
#include "mlir/Analysis/AffineStructures.h"
#include "mlir/Analysis/Dominance.h"
#include "mlir/Analysis/Utils.h"
-#include "mlir/Dialect/AffineOps/AffineOps.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/Function.h"
#include "mlir/IR/Module.h"
diff --git a/mlir/lib/Transforms/Vectorize.cpp b/mlir/lib/Transforms/Vectorize.cpp
index 75a7d4d5cf6b..b89702b50188 100644
--- a/mlir/lib/Transforms/Vectorize.cpp
+++ b/mlir/lib/Transforms/Vectorize.cpp
@@ -15,7 +15,7 @@
#include "mlir/Analysis/NestedMatcher.h"
#include "mlir/Analysis/SliceAnalysis.h"
#include "mlir/Analysis/Utils.h"
-#include "mlir/Dialect/AffineOps/AffineOps.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/Dialect/Vector/VectorOps.h"
#include "mlir/Dialect/Vector/VectorUtils.h"
diff --git a/mlir/test/Dialect/AffineOps/canonicalize.mlir b/mlir/test/Dialect/Affine/canonicalize.mlir
similarity index 100%
rename from mlir/test/Dialect/AffineOps/canonicalize.mlir
rename to mlir/test/Dialect/Affine/canonicalize.mlir
diff --git a/mlir/test/Dialect/AffineOps/dma.mlir b/mlir/test/Dialect/Affine/dma.mlir
similarity index 100%
rename from mlir/test/Dialect/AffineOps/dma.mlir
rename to mlir/test/Dialect/Affine/dma.mlir
diff --git a/mlir/test/Dialect/AffineOps/inlining.mlir b/mlir/test/Dialect/Affine/inlining.mlir
similarity index 100%
rename from mlir/test/Dialect/AffineOps/inlining.mlir
rename to mlir/test/Dialect/Affine/inlining.mlir
diff --git a/mlir/test/Dialect/AffineOps/invalid.mlir b/mlir/test/Dialect/Affine/invalid.mlir
similarity index 100%
rename from mlir/test/Dialect/AffineOps/invalid.mlir
rename to mlir/test/Dialect/Affine/invalid.mlir
diff --git a/mlir/test/Dialect/AffineOps/load-store-invalid.mlir b/mlir/test/Dialect/Affine/load-store-invalid.mlir
similarity index 100%
rename from mlir/test/Dialect/AffineOps/load-store-invalid.mlir
rename to mlir/test/Dialect/Affine/load-store-invalid.mlir
diff --git a/mlir/test/Dialect/AffineOps/load-store.mlir b/mlir/test/Dialect/Affine/load-store.mlir
similarity index 100%
rename from mlir/test/Dialect/AffineOps/load-store.mlir
rename to mlir/test/Dialect/Affine/load-store.mlir
diff --git a/mlir/test/Dialect/AffineOps/memref-stride-calculation.mlir b/mlir/test/Dialect/Affine/memref-stride-calculation.mlir
similarity index 100%
rename from mlir/test/Dialect/AffineOps/memref-stride-calculation.mlir
rename to mlir/test/Dialect/Affine/memref-stride-calculation.mlir
diff --git a/mlir/test/Dialect/AffineOps/ops.mlir b/mlir/test/Dialect/Affine/ops.mlir
similarity index 100%
rename from mlir/test/Dialect/AffineOps/ops.mlir
rename to mlir/test/Dialect/Affine/ops.mlir
diff --git a/mlir/test/EDSC/CMakeLists.txt b/mlir/test/EDSC/CMakeLists.txt
index c3c0cf35497b..6c2f5f9fd0be 100644
--- a/mlir/test/EDSC/CMakeLists.txt
+++ b/mlir/test/EDSC/CMakeLists.txt
@@ -6,7 +6,7 @@ llvm_update_compile_flags(mlir-edsc-builder-api-test)
target_link_libraries(mlir-edsc-builder-api-test
PRIVATE
- MLIRAffineOps
+ MLIRAffine
MLIREDSC
MLIRIR
MLIRLinalgEDSC
@@ -22,7 +22,7 @@ target_link_libraries(mlir-edsc-builder-api-test
target_include_directories(mlir-edsc-builder-api-test PRIVATE ..)
whole_archive_link(mlir-edsc-builder-api-test
- MLIRAffineOps
+ MLIRAffine
MLIRLinalgOps
MLIRLoopOps
MLIRStandardOps
diff --git a/mlir/test/EDSC/builder-api-test.cpp b/mlir/test/EDSC/builder-api-test.cpp
index b60d5894df03..a9bb0b62dbdf 100644
--- a/mlir/test/EDSC/builder-api-test.cpp
+++ b/mlir/test/EDSC/builder-api-test.cpp
@@ -8,7 +8,7 @@
// RUN: mlir-edsc-builder-api-test | FileCheck %s -dump-input-on-failure
-#include "mlir/Dialect/AffineOps/EDSC/Intrinsics.h"
+#include "mlir/Dialect/Affine/EDSC/Intrinsics.h"
#include "mlir/Dialect/Linalg/EDSC/Intrinsics.h"
#include "mlir/Dialect/LoopOps/EDSC/Builders.h"
#include "mlir/Dialect/StandardOps/EDSC/Intrinsics.h"
@@ -38,7 +38,7 @@ using namespace mlir::edsc::intrinsics;
static MLIRContext &globalContext() {
static bool init_once = []() {
- registerDialect<AffineOpsDialect>();
+ registerDialect<AffineDialect>();
registerDialect<linalg::LinalgDialect>();
registerDialect<loop::LoopOpsDialect>();
registerDialect<StandardOpsDialect>();
diff --git a/mlir/test/lib/Dialect/Affine/CMakeLists.txt b/mlir/test/lib/Dialect/Affine/CMakeLists.txt
new file mode 100644
index 000000000000..905d1c1adf3d
--- /dev/null
+++ b/mlir/test/lib/Dialect/Affine/CMakeLists.txt
@@ -0,0 +1,14 @@
+add_llvm_library(MLIRAffineTransformsTestPasses
+ TestAffineDataCopy.cpp
+
+ ADDITIONAL_HEADER_DIRS
+ ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/Affine
+ ${MLIR_MAIN_INCLUDE_DIR}/mlir/IR
+ )
+
+target_link_libraries(MLIRAffineTransformsTestPasses PRIVATE
+ MLIRIR
+ MLIRPass
+ MLIRAffineTransforms
+ MLIRSupport
+ )
diff --git a/mlir/test/lib/Transforms/TestAffineDataCopy.cpp b/mlir/test/lib/Dialect/Affine/TestAffineDataCopy.cpp
similarity index 98%
rename from mlir/test/lib/Transforms/TestAffineDataCopy.cpp
rename to mlir/test/lib/Dialect/Affine/TestAffineDataCopy.cpp
index 966df287359a..af380d8ca212 100644
--- a/mlir/test/lib/Transforms/TestAffineDataCopy.cpp
+++ b/mlir/test/lib/Dialect/Affine/TestAffineDataCopy.cpp
@@ -13,7 +13,7 @@
#include "mlir/Analysis/Passes.h"
#include "mlir/Analysis/Utils.h"
-#include "mlir/Dialect/AffineOps/AffineOps.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/LoopUtils.h"
#include "mlir/Transforms/Passes.h"
diff --git a/mlir/test/lib/Dialect/CMakeLists.txt b/mlir/test/lib/Dialect/CMakeLists.txt
index cc1766c6127a..160fe9c203da 100644
--- a/mlir/test/lib/Dialect/CMakeLists.txt
+++ b/mlir/test/lib/Dialect/CMakeLists.txt
@@ -1 +1,2 @@
+add_subdirectory(Affine)
add_subdirectory(SPIRV)
diff --git a/mlir/test/lib/Transforms/CMakeLists.txt b/mlir/test/lib/Transforms/CMakeLists.txt
index 61d1443869a9..e90099203b43 100644
--- a/mlir/test/lib/Transforms/CMakeLists.txt
+++ b/mlir/test/lib/Transforms/CMakeLists.txt
@@ -1,5 +1,4 @@
add_llvm_library(MLIRTestTransforms
- TestAffineDataCopy.cpp
TestAllReduceLowering.cpp
TestCallGraph.cpp
TestConstantFold.cpp
@@ -36,7 +35,7 @@ include_directories(${CMAKE_CURRENT_BINARY_DIR}/../DeclarativeTransforms)
target_link_libraries(MLIRTestTransforms
PUBLIC
- MLIRAffineOps
+ MLIRAffine
MLIRAnalysis
MLIREDSC
MLIRGPU
diff --git a/mlir/test/lib/Transforms/TestConstantFold.cpp b/mlir/test/lib/Transforms/TestConstantFold.cpp
index f45b0ae18d50..cc6ece7f7c46 100644
--- a/mlir/test/lib/Transforms/TestConstantFold.cpp
+++ b/mlir/test/lib/Transforms/TestConstantFold.cpp
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
-#include "mlir/Dialect/AffineOps/AffineOps.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/Function.h"
diff --git a/mlir/test/lib/Transforms/TestLoopFusion.cpp b/mlir/test/lib/Transforms/TestLoopFusion.cpp
index d650288836d1..8860b4adfce7 100644
--- a/mlir/test/lib/Transforms/TestLoopFusion.cpp
+++ b/mlir/test/lib/Transforms/TestLoopFusion.cpp
@@ -14,7 +14,7 @@
#include "mlir/Analysis/AffineStructures.h"
#include "mlir/Analysis/Passes.h"
#include "mlir/Analysis/Utils.h"
-#include "mlir/Dialect/AffineOps/AffineOps.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/IR/Builders.h"
#include "mlir/Pass/Pass.h"
diff --git a/mlir/test/lib/Transforms/TestMemRefBoundCheck.cpp b/mlir/test/lib/Transforms/TestMemRefBoundCheck.cpp
index e107bf81cfe2..a30631f139c1 100644
--- a/mlir/test/lib/Transforms/TestMemRefBoundCheck.cpp
+++ b/mlir/test/lib/Transforms/TestMemRefBoundCheck.cpp
@@ -16,7 +16,7 @@
#include "mlir/Analysis/AffineStructures.h"
#include "mlir/Analysis/Passes.h"
#include "mlir/Analysis/Utils.h"
-#include "mlir/Dialect/AffineOps/AffineOps.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/IR/Builders.h"
#include "mlir/Pass/Pass.h"
diff --git a/mlir/test/lib/Transforms/TestMemRefDependenceCheck.cpp b/mlir/test/lib/Transforms/TestMemRefDependenceCheck.cpp
index e2d0c873f959..3c80f5baf348 100644
--- a/mlir/test/lib/Transforms/TestMemRefDependenceCheck.cpp
+++ b/mlir/test/lib/Transforms/TestMemRefDependenceCheck.cpp
@@ -14,7 +14,7 @@
#include "mlir/Analysis/AffineStructures.h"
#include "mlir/Analysis/Passes.h"
#include "mlir/Analysis/Utils.h"
-#include "mlir/Dialect/AffineOps/AffineOps.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/IR/Builders.h"
#include "mlir/Pass/Pass.h"
diff --git a/mlir/test/lib/Transforms/TestParallelismDetection.cpp b/mlir/test/lib/Transforms/TestParallelismDetection.cpp
index 7c16a259723f..1041c78276b3 100644
--- a/mlir/test/lib/Transforms/TestParallelismDetection.cpp
+++ b/mlir/test/lib/Transforms/TestParallelismDetection.cpp
@@ -12,7 +12,7 @@
#include "mlir/Analysis/Passes.h"
#include "mlir/Analysis/Utils.h"
-#include "mlir/Dialect/AffineOps/AffineOps.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/IR/Builders.h"
#include "mlir/Pass/Pass.h"
diff --git a/mlir/test/lib/Transforms/TestVectorizationUtils.cpp b/mlir/test/lib/Transforms/TestVectorizationUtils.cpp
index 4ae4509bc56d..af42c90521d4 100644
--- a/mlir/test/lib/Transforms/TestVectorizationUtils.cpp
+++ b/mlir/test/lib/Transforms/TestVectorizationUtils.cpp
@@ -13,7 +13,7 @@
#include "mlir/Analysis/AffineAnalysis.h"
#include "mlir/Analysis/NestedMatcher.h"
#include "mlir/Analysis/SliceAnalysis.h"
-#include "mlir/Dialect/AffineOps/AffineOps.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Vector/VectorUtils.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/Diagnostics.h"
diff --git a/mlir/tools/mlir-opt/CMakeLists.txt b/mlir/tools/mlir-opt/CMakeLists.txt
index 9ade33aceabe..2ae36d466dd4 100644
--- a/mlir/tools/mlir-opt/CMakeLists.txt
+++ b/mlir/tools/mlir-opt/CMakeLists.txt
@@ -9,6 +9,7 @@ set(LIBS
${conversion_libs}
MLIRLoopOpsTransforms
MLIRLoopAnalysis
+ MLIRAffineTransformsTestPasses
MLIRAnalysis
MLIRDialect
MLIREDSC