[Mlir-commits] [mlir] 14d0735 - [MLIR][Affine][VectorOps] Utility to vectorize loop nest using strategy
Diego Caballero
llvmlistbot at llvm.org
Mon Sep 21 16:37:42 PDT 2020
Author: Diego Caballero
Date: 2020-09-21T16:28:28-07:00
New Revision: 14d0735d3453fb6403da916d7aee6a9f25af4147
URL: https://github.com/llvm/llvm-project/commit/14d0735d3453fb6403da916d7aee6a9f25af4147
DIFF: https://github.com/llvm/llvm-project/commit/14d0735d3453fb6403da916d7aee6a9f25af4147.diff
LOG: [MLIR][Affine][VectorOps] Utility to vectorize loop nest using strategy
This patch adds a utility based on SuperVectorizer to vectorize an
affine loop nest using a given vectorization strategy. This strategy allows
targeting specific loops for vectorization instead of relying on the
SuperVectorizer analysis to choose the right loops to vectorize.
Reviewed By: nicolasvasilache
Differential Revision: https://reviews.llvm.org/D85869
Added:
Modified:
mlir/include/mlir/Dialect/Affine/Utils.h
mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp
mlir/test/Dialect/Affine/SuperVectorize/vectorize_1d.mlir
mlir/test/Dialect/Affine/SuperVectorize/vectorize_2d.mlir
Removed:
################################################################################
diff --git a/mlir/include/mlir/Dialect/Affine/Utils.h b/mlir/include/mlir/Dialect/Affine/Utils.h
index 2e563d9e3ba4..c83955eb0891 100644
--- a/mlir/include/mlir/Dialect/Affine/Utils.h
+++ b/mlir/include/mlir/Dialect/Affine/Utils.h
@@ -14,6 +14,8 @@
#define MLIR_DIALECT_AFFINE_UTILS_H
#include "mlir/Support/LLVM.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
namespace mlir {
@@ -34,6 +36,47 @@ void affineParallelize(AffineForOp forOp);
/// significant code expansion in some cases.
LogicalResult hoistAffineIfOp(AffineIfOp ifOp, bool *folded = nullptr);
+/// Holds parameters to perform n-D vectorization on a single loop nest.
+/// For example, for the following loop nest:
+///
+/// func @vec2d(%in: memref<64x128x512xf32>, %out: memref<64x128x512xf32>) {
+/// affine.for %i0 = 0 to 64 {
+/// affine.for %i1 = 0 to 128 {
+/// affine.for %i2 = 0 to 512 {
+/// %ld = affine.load %in[%i0, %i1, %i2] : memref<64x128x512xf32>
+/// affine.store %ld, %out[%i0, %i1, %i2] : memref<64x128x512xf32>
+/// }
+/// }
+/// }
+/// return
+/// }
+///
+/// and VectorizationStrategy = 'vectorSizes = {8, 4}', 'loopToVectorDim =
+/// {{i1->0}, {i2->1}}', SuperVectorizer will generate:
+///
+/// func @vec2d(%arg0: memref<64x128x512xf32>, %arg1: memref<64x128x512xf32>) {
+/// affine.for %arg2 = 0 to 64 {
+/// affine.for %arg3 = 0 to 128 step 8 {
+/// affine.for %arg4 = 0 to 512 step 4 {
+/// %cst = constant 0.000000e+00 : f32
+/// %0 = vector.transfer_read %arg0[%arg2, %arg3, %arg4], %cst : ...
+/// vector.transfer_write %0, %arg1[%arg2, %arg3, %arg4] : ...
+/// }
+/// }
+/// }
+/// return
+/// }
+// TODO: Hoist to a VectorizationStrategy.cpp when appropriate.
+struct VectorizationStrategy {
+ // Vectorization factors to apply to each target vector dimension.
+  // Each factor will be applied to a different loop.
+ SmallVector<int64_t, 8> vectorSizes;
+ // Maps each AffineForOp vectorization candidate with its vector dimension.
+ // The candidate will be vectorized using the vectorization factor in
+ // 'vectorSizes' for that dimension.
+ DenseMap<Operation *, unsigned> loopToVectorDim;
+};
+
/// Vectorizes affine loops in 'loops' using the n-D vectorization factors in
/// 'vectorSizes'. By default, each vectorization factor is applied
/// inner-to-outer to the loops of each loop nest. 'fastestVaryingPattern' can
@@ -43,6 +86,45 @@ void vectorizeAffineLoops(
llvm::DenseSet<Operation *, DenseMapInfo<Operation *>> &loops,
ArrayRef<int64_t> vectorSizes, ArrayRef<int64_t> fastestVaryingPattern);
+/// External utility to vectorize affine loops from a single loop nest using an
+/// n-D vectorization strategy (see doc in VectorizationStrategy definition).
+/// Loops are provided in a 2D vector container. The first dimension represents
+/// the nesting level relative to the loops to be vectorized. The second
+/// dimension contains the loops. This means that:
+/// a) every loop in 'loops[i]' must have a parent loop in 'loops[i-1]',
+/// b) a loop in 'loops[i]' may or may not have a child loop in 'loops[i+1]'.
+///
+/// For example, for the following loop nest:
+///
+/// func @vec2d(%in0: memref<64x128x512xf32>, %in1: memref<64x128x128xf32>,
+/// %out0: memref<64x128x512xf32>,
+/// %out1: memref<64x128x128xf32>) {
+/// affine.for %i0 = 0 to 64 {
+/// affine.for %i1 = 0 to 128 {
+/// affine.for %i2 = 0 to 512 {
+/// %ld = affine.load %in0[%i0, %i1, %i2] : memref<64x128x512xf32>
+/// affine.store %ld, %out0[%i0, %i1, %i2] : memref<64x128x512xf32>
+/// }
+/// affine.for %i3 = 0 to 128 {
+/// %ld = affine.load %in1[%i0, %i1, %i3] : memref<64x128x128xf32>
+/// affine.store %ld, %out1[%i0, %i1, %i3] : memref<64x128x128xf32>
+/// }
+/// }
+/// }
+/// return
+/// }
+///
+/// loops = {{%i0}, {%i2, %i3}}, to vectorize the outermost and the two
+/// innermost loops;
+/// loops = {{%i1}, {%i2, %i3}}, to vectorize the middle and the two innermost
+/// loops;
+/// loops = {{%i2}}, to vectorize only the first innermost loop;
+/// loops = {{%i3}}, to vectorize only the second innermost loop;
+/// loops = {{%i1}}, to vectorize only the middle loop.
+LogicalResult
+vectorizeAffineLoopNest(const std::vector<SmallVector<AffineForOp, 2>> &loops,
+ const VectorizationStrategy &strategy);
+
/// Normalize a affine.parallel op so that lower bounds are 0 and steps are 1.
/// As currently implemented, this transformation cannot fail and will return
/// early if the op is already in a normalized form.
diff --git a/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp b/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp
index ee52fe44830c..5cc65ecc7ef7 100644
--- a/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp
+++ b/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp
@@ -254,8 +254,8 @@ using namespace vector;
/// interference);
/// 3. Then, for each pattern in order:
/// a. applying iterative rewriting of the loop and the load operations in
-/// DFS postorder. Rewriting is implemented by coarsening the loops and
-/// turning load operations into opaque vector.transfer_read ops;
+/// inner-to-outer order. Rewriting is implemented by coarsening the loops
+/// and turning load operations into opaque vector.transfer_read ops;
/// b. keeping track of the load operations encountered as "roots" and the
/// store operations as "terminals";
/// c. traversing the use-def chains starting from the roots and iteratively
@@ -584,17 +584,6 @@ Vectorize::Vectorize(ArrayRef<int64_t> virtualVectorSize) {
vectorSizes = virtualVectorSize;
}
-/////// TODO: Hoist to a VectorizationStrategy.cpp when appropriate.
-/////////
-namespace {
-
-struct VectorizationStrategy {
- SmallVector<int64_t, 8> vectorSizes;
- DenseMap<Operation *, unsigned> loopToVectorDim;
-};
-
-} // end anonymous namespace
-
static void vectorizeLoopIfProfitable(Operation *loop, unsigned depthInPattern,
unsigned patternDepth,
VectorizationStrategy *strategy) {
@@ -857,44 +846,44 @@ isVectorizableLoopPtrFactory(const DenseSet<Operation *> ¶llelLoops,
};
}
-/// Apply vectorization of `loop` according to `state`. This is only triggered
-/// if all vectorizations in `childrenMatches` have already succeeded
-/// recursively in DFS post-order.
+/// Apply vectorization of `loop` according to `state`. `loops` are processed in
+/// inner-to-outer order to ensure that all the children loops have already been
+/// vectorized before vectorizing the parent loop.
static LogicalResult
-vectorizeLoopsAndLoadsRecursively(NestedMatch oneMatch,
- VectorizationState *state) {
- auto *loopInst = oneMatch.getMatchedOperation();
- auto loop = cast<AffineForOp>(loopInst);
- auto childrenMatches = oneMatch.getMatchedChildren();
-
- // 1. DFS postorder recursion, if any of my children fails, I fail too.
- for (auto m : childrenMatches) {
- if (failed(vectorizeLoopsAndLoadsRecursively(m, state))) {
- return failure();
- }
- }
+vectorizeLoopsAndLoads(std::vector<SmallVector<AffineForOp, 2>> &loops,
+ VectorizationState *state) {
+ // Vectorize loops in inner-to-outer order. If any children fails, the parent
+ // will fail too.
+ for (auto &loopsInLevel : llvm::reverse(loops)) {
+ for (AffineForOp loop : loopsInLevel) {
+ // 1. This loop may have been omitted from vectorization for various
+ // reasons (e.g. due to the performance model or pattern depth > vector
+ // size).
+ auto it = state->strategy->loopToVectorDim.find(loop.getOperation());
+ if (it == state->strategy->loopToVectorDim.end())
+ continue;
- // 2. This loop may have been omitted from vectorization for various reasons
- // (e.g. due to the performance model or pattern depth > vector size).
- auto it = state->strategy->loopToVectorDim.find(loopInst);
- if (it == state->strategy->loopToVectorDim.end()) {
- return success();
+ // 2. Actual inner-to-outer transformation.
+ auto vectorDim = it->second;
+ assert(vectorDim < state->strategy->vectorSizes.size() &&
+ "vector dim overflow");
+ // a. get actual vector size
+ auto vectorSize = state->strategy->vectorSizes[vectorDim];
+ // b. loop transformation for early vectorization is still subject to
+ // exploratory tradeoffs (see top of the file). Apply coarsening,
+ // i.e.:
+ // | ub -> ub
+ // | step -> step * vectorSize
+ LLVM_DEBUG(dbgs() << "\n[early-vect] vectorizeForOp by " << vectorSize
+ << " : \n"
+ << loop);
+ if (failed(
+ vectorizeAffineForOp(loop, loop.getStep() * vectorSize, state)))
+ return failure();
+ } // end for.
}
- // 3. Actual post-order transformation.
- auto vectorDim = it->second;
- assert(vectorDim < state->strategy->vectorSizes.size() &&
- "vector dim overflow");
- // a. get actual vector size
- auto vectorSize = state->strategy->vectorSizes[vectorDim];
- // b. loop transformation for early vectorization is still subject to
- // exploratory tradeoffs (see top of the file). Apply coarsening, i.e.:
- // | ub -> ub
- // | step -> step * vectorSize
- LLVM_DEBUG(dbgs() << "\n[early-vect] vectorizeForOp by " << vectorSize
- << " : ");
- LLVM_DEBUG(loopInst->print(dbgs()));
- return vectorizeAffineForOp(loop, loop.getStep() * vectorSize, state);
+ return success();
}
/// Tries to transform a scalar constant into a vector splat of that constant.
@@ -1145,16 +1134,46 @@ static LogicalResult vectorizeNonTerminals(VectorizationState *state) {
return success();
}
-/// Vectorization is a recursive procedure where anything below can fail.
-/// The root match thus needs to maintain a clone for handling failure.
-/// Each root may succeed independently but will otherwise clean after itself if
-/// anything below it fails.
-static LogicalResult vectorizeRootMatch(NestedMatch m,
- VectorizationStrategy *strategy) {
- auto loop = cast<AffineForOp>(m.getMatchedOperation());
- OperationFolder folder(loop.getContext());
+/// Recursive implementation to convert all the nested loops in 'match' to a 2D
+/// vector container that preserves the relative nesting level of each loop with
+/// respect to the others in 'match'. 'currentLevel' is the nesting level that
+/// will be assigned to the loop in the current 'match'.
+static void
+getMatchedAffineLoopsRec(NestedMatch match, unsigned currentLevel,
+ std::vector<SmallVector<AffineForOp, 2>> &loops) {
+ // Add a new empty level to the output if it doesn't exist already.
+ assert(currentLevel <= loops.size() && "Unexpected currentLevel");
+ if (currentLevel == loops.size())
+ loops.push_back(SmallVector<AffineForOp, 2>());
+
+ // Add current match and recursively visit its children.
+ loops[currentLevel].push_back(cast<AffineForOp>(match.getMatchedOperation()));
+ for (auto childMatch : match.getMatchedChildren()) {
+ getMatchedAffineLoopsRec(childMatch, currentLevel + 1, loops);
+ }
+}
+
+/// Converts all the nested loops in 'match' to a 2D vector container that
+/// preserves the relative nesting level of each loop with respect to the others
+/// in 'match'. This means that every loop in 'loops[i]' will have a parent loop
+/// in 'loops[i-1]'. A loop in 'loops[i]' may or may not have a child loop in
+/// 'loops[i+1]'.
+static void
+getMatchedAffineLoops(NestedMatch match,
+ std::vector<SmallVector<AffineForOp, 2>> &loops) {
+ getMatchedAffineLoopsRec(match, /*currLoopDepth=*/0, loops);
+}
+
+/// Internal implementation to vectorize affine loops from a single loop nest
+/// using an n-D vectorization strategy.
+static LogicalResult
+vectorizeLoopNest(std::vector<SmallVector<AffineForOp, 2>> &loops,
+ const VectorizationStrategy &strategy) {
+ assert(loops[0].size() == 1 && "Expected single root loop");
+ AffineForOp rootLoop = loops[0][0];
+ OperationFolder folder(rootLoop.getContext());
VectorizationState state;
- state.strategy = strategy;
+ state.strategy = &strategy;
state.folder = &folder;
// Since patterns are recursive, they can very well intersect.
@@ -1164,7 +1183,7 @@ static LogicalResult vectorizeRootMatch(NestedMatch m,
// vectorizable. If a pattern is not vectorizable anymore, we just skip it.
// TODO: implement a non-greedy profitability analysis that keeps only
// non-intersecting patterns.
- if (!isVectorizableLoopBody(loop, vectorTransferPattern())) {
+ if (!isVectorizableLoopBody(rootLoop, vectorTransferPattern())) {
LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ loop is not vectorizable");
return failure();
}
@@ -1172,7 +1191,7 @@ static LogicalResult vectorizeRootMatch(NestedMatch m,
/// Sets up error handling for this root loop. This is how the root match
/// maintains a clone for handling failure and restores the proper state via
/// RAII.
- auto *loopInst = loop.getOperation();
+ auto *loopInst = rootLoop.getOperation();
OpBuilder builder(loopInst);
auto clonedLoop = cast<AffineForOp>(builder.clone(*loopInst));
struct Guard {
@@ -1187,17 +1206,17 @@ static LogicalResult vectorizeRootMatch(NestedMatch m,
}
AffineForOp loop;
AffineForOp clonedLoop;
- } guard{loop, clonedLoop};
+ } guard{rootLoop, clonedLoop};
//////////////////////////////////////////////////////////////////////////////
// Start vectorizing.
// From now on, any error triggers the scope guard above.
//////////////////////////////////////////////////////////////////////////////
- // 1. Vectorize all the loops matched by the pattern, recursively.
+ // 1. Vectorize all the loop candidates, in inner-to-outer order.
// This also vectorizes the roots (AffineLoadOp) as well as registers the
// terminals (AffineStoreOp) for post-processing vectorization (we need to
// wait for all use-def chains into them to be vectorized first).
- if (failed(vectorizeLoopsAndLoadsRecursively(m, &state))) {
+ if (failed(vectorizeLoopsAndLoads(loops, &state))) {
LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ failed root vectorizeLoop");
return guard.failure();
}
@@ -1229,38 +1248,25 @@ static LogicalResult vectorizeRootMatch(NestedMatch m,
return guard.success();
}
-/// Applies vectorization to the current Function by searching over a bunch of
-/// predetermined patterns.
-void Vectorize::runOnFunction() {
- FuncOp f = getFunction();
- if (!fastestVaryingPattern.empty() &&
- fastestVaryingPattern.size() != vectorSizes.size()) {
-    f.emitRemark("Fastest varying pattern specified with different size than "
- "the vector size.");
- return signalPassFailure();
- }
-
- DenseSet<Operation *> parallelLoops;
- f.walk([¶llelLoops](AffineForOp loop) {
- if (isLoopParallel(loop))
- parallelLoops.insert(loop);
- });
-
- vectorizeAffineLoops(f, parallelLoops, vectorSizes, fastestVaryingPattern);
+/// Vectorization is a recursive procedure where anything below can fail. The
+/// root match thus needs to maintain a clone for handling failure. Each root
+/// may succeed independently but will otherwise clean after itself if anything
+/// below it fails.
+static LogicalResult vectorizeRootMatch(NestedMatch m,
+ const VectorizationStrategy &strategy) {
+ std::vector<SmallVector<AffineForOp, 2>> loopsToVectorize;
+ getMatchedAffineLoops(m, loopsToVectorize);
+ return vectorizeLoopNest(loopsToVectorize, strategy);
}
-namespace mlir {
-
-/// Vectorizes affine loops in 'loops' using the n-D vectorization factors in
-/// 'vectorSizes'. By default, each vectorization factor is applied
-/// inner-to-outer to the loops of each loop nest. 'fastestVaryingPattern' can
-/// be optionally used to provide a different loop vectorization order.
-void vectorizeAffineLoops(Operation *parentOp, DenseSet<Operation *> &loops,
- ArrayRef<int64_t> vectorSizes,
- ArrayRef<int64_t> fastestVaryingPattern) {
- // Thread-safe RAII local context, BumpPtrAllocator freed on exit.
- NestedPatternContext mlContext;
-
+/// Internal implementation to vectorize affine loops in 'loops' using the n-D
+/// vectorization factors in 'vectorSizes'. By default, each vectorization
+/// factor is applied inner-to-outer to the loops of each loop nest.
+/// 'fastestVaryingPattern' can be optionally used to provide a different loop
+/// vectorization order.
+static void vectorizeLoops(Operation *parentOp, DenseSet<Operation *> &loops,
+ ArrayRef<int64_t> vectorSizes,
+ ArrayRef<int64_t> fastestVaryingPattern) {
for (auto &pat :
makePatterns(loops, vectorSizes.size(), fastestVaryingPattern)) {
LLVM_DEBUG(dbgs() << "\n******************************************");
@@ -1286,7 +1292,7 @@ void vectorizeAffineLoops(Operation *parentOp, DenseSet<Operation *> &loops,
&strategy);
// TODO: if pattern does not apply, report it; alter the
// cost/benefit.
- vectorizeRootMatch(m, &strategy);
+ vectorizeRootMatch(m, strategy);
// TODO: some diagnostics if failure to vectorize occurs.
}
}
@@ -1301,4 +1307,127 @@ std::unique_ptr<OperationPass<FuncOp>> createSuperVectorizePass() {
return std::make_unique<Vectorize>();
}
+/// Applies vectorization to the current function by searching over a bunch of
+/// predetermined patterns.
+void Vectorize::runOnFunction() {
+ FuncOp f = getFunction();
+ if (!fastestVaryingPattern.empty() &&
+ fastestVaryingPattern.size() != vectorSizes.size()) {
+    f.emitRemark("Fastest varying pattern specified with different size than "
+ "the vector size.");
+ return signalPassFailure();
+ }
+
+ DenseSet<Operation *> parallelLoops;
+ f.walk([¶llelLoops](AffineForOp loop) {
+ if (isLoopParallel(loop))
+ parallelLoops.insert(loop);
+ });
+
+ // Thread-safe RAII local context, BumpPtrAllocator freed on exit.
+ NestedPatternContext mlContext;
+ vectorizeLoops(f, parallelLoops, vectorSizes, fastestVaryingPattern);
+}
+
+/// Verify that affine loops in 'loops' meet the nesting criteria expected by
+/// SuperVectorizer:
+/// * There must be at least one loop.
+/// * There must be a single root loop (nesting level 0).
+/// * Each loop at a given nesting level must be nested in a loop from a
+/// previous nesting level.
+static void
+verifyLoopNesting(const std::vector<SmallVector<AffineForOp, 2>> &loops) {
+ assert(!loops.empty() && "Expected at least one loop");
+ assert(!loops[0].size() && "Expected only one root loop");
+
+ // Traverse loops outer-to-inner to check some invariants.
+ for (int i = 1, end = loops.size(); i < end; ++i) {
+ for (AffineForOp loop : loops[i]) {
+ // Check that each loop at this level is nested in one of the loops from
+ // the previous level.
+ bool parentFound = false;
+ for (AffineForOp maybeParent : loops[i - 1]) {
+ if (maybeParent.getOperation()->isProperAncestor(loop)) {
+ parentFound = true;
+ break;
+ }
+ }
+ assert(parentFound && "Child loop not nested in any parent loop");
+
+ // Check that each loop at this level is not nested in another loop from
+ // this level.
+ for (AffineForOp sibling : loops[i])
+ assert(!sibling.getOperation()->isProperAncestor(loop) &&
+ "Loops at the same level are nested");
+ }
+ }
+}
+
+namespace mlir {
+
+/// External utility to vectorize affine loops in 'loops' using the n-D
+/// vectorization factors in 'vectorSizes'. By default, each vectorization
+/// factor is applied inner-to-outer to the loops of each loop nest.
+/// 'fastestVaryingPattern' can be optionally used to provide a different loop
+/// vectorization order.
+void vectorizeAffineLoops(Operation *parentOp, DenseSet<Operation *> &loops,
+ ArrayRef<int64_t> vectorSizes,
+ ArrayRef<int64_t> fastestVaryingPattern) {
+ // Thread-safe RAII local context, BumpPtrAllocator freed on exit.
+ NestedPatternContext mlContext;
+ vectorizeLoops(parentOp, loops, vectorSizes, fastestVaryingPattern);
+}
+
+/// External utility to vectorize affine loops from a single loop nest using an
+/// n-D vectorization strategy (see doc in VectorizationStrategy definition).
+/// Loops are provided in a 2D vector container. The first dimension represents
+/// the nesting level relative to the loops to be vectorized. The second
+/// dimension contains the loops. This means that:
+/// a) every loop in 'loops[i]' must have a parent loop in 'loops[i-1]',
+/// b) a loop in 'loops[i]' may or may not have a child loop in 'loops[i+1]'.
+///
+/// For example, for the following loop nest:
+///
+/// func @vec2d(%in0: memref<64x128x512xf32>, %in1: memref<64x128x128xf32>,
+/// %out0: memref<64x128x512xf32>,
+/// %out1: memref<64x128x128xf32>) {
+/// affine.for %i0 = 0 to 64 {
+/// affine.for %i1 = 0 to 128 {
+/// affine.for %i2 = 0 to 512 {
+/// %ld = affine.load %in0[%i0, %i1, %i2] : memref<64x128x512xf32>
+/// affine.store %ld, %out0[%i0, %i1, %i2] : memref<64x128x512xf32>
+/// }
+/// affine.for %i3 = 0 to 128 {
+/// %ld = affine.load %in1[%i0, %i1, %i3] : memref<64x128x128xf32>
+/// affine.store %ld, %out1[%i0, %i1, %i3] : memref<64x128x128xf32>
+/// }
+/// }
+/// }
+/// return
+/// }
+///
+/// loops = {{%i0}, {%i2, %i3}}, to vectorize the outermost and the two
+/// innermost loops;
+/// loops = {{%i1}, {%i2, %i3}}, to vectorize the middle and the two innermost
+/// loops;
+/// loops = {{%i2}}, to vectorize only the first innermost loop;
+/// loops = {{%i3}}, to vectorize only the second innermost loop;
+/// loops = {{%i1}}, to vectorize only the middle loop.
+LogicalResult
+vectorizeAffineLoopNest(std::vector<SmallVector<AffineForOp, 2>> &loops,
+ const VectorizationStrategy &strategy) {
+ // Thread-safe RAII local context, BumpPtrAllocator freed on exit.
+ NestedPatternContext mlContext;
+ verifyLoopNesting(loops);
+ return vectorizeLoopNest(loops, strategy);
+}
+
+std::unique_ptr<OperationPass<FuncOp>>
+createSuperVectorizePass(ArrayRef<int64_t> virtualVectorSize) {
+ return std::make_unique<Vectorize>(virtualVectorSize);
+}
+std::unique_ptr<OperationPass<FuncOp>> createSuperVectorizePass() {
+ return std::make_unique<Vectorize>();
+}
+
} // namespace mlir
diff --git a/mlir/test/Dialect/Affine/SuperVectorize/vectorize_1d.mlir b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_1d.mlir
index 66429907205e..ca496b75432c 100644
--- a/mlir/test/Dialect/Affine/SuperVectorize/vectorize_1d.mlir
+++ b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_1d.mlir
@@ -1,7 +1,8 @@
// RUN: mlir-opt %s -affine-super-vectorize="virtual-vector-size=128 test-fastest-varying=0" | FileCheck %s
// Permutation maps used in vectorization.
-// CHECK: #[[$map_proj_d0d1_0:map[0-9]+]] = affine_map<(d0, d1) -> (0)>
+// CHECK-DAG: #[[$map_proj_d0d1_0:map[0-9]+]] = affine_map<(d0, d1) -> (0)>
+// CHECK-DAG: #[[$map_id1:map[0-9]+]] = affine_map<(d0) -> (d0)>
#map0 = affine_map<(d0) -> (d0)>
#mapadd1 = affine_map<(d0) -> (d0 + 1)>
@@ -26,8 +27,8 @@ func @vec1d_1(%A : memref<?x?xf32>, %B : memref<?x?x?xf32>) {
%P = dim %B, %c2 : memref<?x?x?xf32>
// CHECK: for {{.*}} step 128
-// CHECK-NEXT: %{{.*}} = affine.apply #map0(%[[C0]])
-// CHECK-NEXT: %{{.*}} = affine.apply #map0(%[[C0]])
+// CHECK-NEXT: %{{.*}} = affine.apply #[[$map_id1]](%[[C0]])
+// CHECK-NEXT: %{{.*}} = affine.apply #[[$map_id1]](%[[C0]])
// CHECK-NEXT: %{{.*}} = constant 0.0{{.*}}: f32
// CHECK-NEXT: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}} {permutation_map = #[[$map_proj_d0d1_0]]} : memref<?x?xf32>, vector<128xf32>
affine.for %i0 = 0 to %M { // vectorized due to scalar -> vector
@@ -331,8 +332,8 @@ func @vec_rejected_8(%A : memref<?x?xf32>, %B : memref<?x?x?xf32>) {
// CHECK: affine.for %{{.*}}{{[0-9]*}} = 0 to %{{[0-9]*}} {
// CHECK: for [[IV18:%[a-zA-Z0-9]+]] = 0 to [[ARG_M]] step 128
-// CHECK: %{{.*}} = affine.apply #map0(%{{.*}})
-// CHECK: %{{.*}} = affine.apply #map0(%{{.*}})
+// CHECK: %{{.*}} = affine.apply #[[$map_id1]](%{{.*}})
+// CHECK: %{{.*}} = affine.apply #[[$map_id1]](%{{.*}})
// CHECK: %{{.*}} = constant 0.0{{.*}}: f32
// CHECK: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}} {permutation_map = #[[$map_proj_d0d1_0]]} : memref<?x?xf32>, vector<128xf32>
affine.for %i17 = 0 to %M { // not vectorized, the 1-D pattern that matched %{{.*}} in DFS post-order prevents vectorizing %{{.*}}
@@ -360,8 +361,8 @@ func @vec_rejected_9(%A : memref<?x?xf32>, %B : memref<?x?x?xf32>) {
// CHECK: affine.for %{{.*}}{{[0-9]*}} = 0 to %{{[0-9]*}} {
// CHECK: for [[IV18:%[a-zA-Z0-9]+]] = 0 to [[ARG_M]] step 128
-// CHECK: %{{.*}} = affine.apply #map0(%{{.*}})
-// CHECK-NEXT: %{{.*}} = affine.apply #map0(%{{.*}})
+// CHECK: %{{.*}} = affine.apply #[[$map_id1]](%{{.*}})
+// CHECK-NEXT: %{{.*}} = affine.apply #[[$map_id1]](%{{.*}})
// CHECK-NEXT: %{{.*}} = constant 0.0{{.*}}: f32
// CHECK-NEXT: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}} {permutation_map = #[[$map_proj_d0d1_0]]} : memref<?x?xf32>, vector<128xf32>
affine.for %i17 = 0 to %M { // not vectorized, the 1-D pattern that matched %i18 in DFS post-order prevents vectorizing %{{.*}}
diff --git a/mlir/test/Dialect/Affine/SuperVectorize/vectorize_2d.mlir b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_2d.mlir
index f2cd769a7cc1..0cf945ee8199 100644
--- a/mlir/test/Dialect/Affine/SuperVectorize/vectorize_2d.mlir
+++ b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_2d.mlir
@@ -124,7 +124,7 @@ func @vectorize_matmul(%arg0: memref<?x?xf32>, %arg1: memref<?x?xf32>, %arg2: me
}
// VECT: affine.for %[[I2:.*]] = #[[$map_id1]](%[[C0]]) to #[[$map_id1]](%[[M]]) step 4 {
// VECT-NEXT: affine.for %[[I3:.*]] = #[[$map_id1]](%[[C0]]) to #[[$map_id1]](%[[N]]) step 8 {
- // VECT-NEXT: affine.for %[[I4:.*]] = #map5(%[[C0]]) to #[[$map_id1]](%[[K]]) {
+ // VECT-NEXT: affine.for %[[I4:.*]] = #[[$map_id1]](%[[C0]]) to #[[$map_id1]](%[[K]]) {
// VECT: %[[A:.*]] = vector.transfer_read %{{.*}}[%[[I4]], %[[I3]]], %{{.*}} {permutation_map = #[[$map_proj_d0d1_zerod1]]} : memref<?x?xf32>, vector<4x8xf32>
// VECT: %[[B:.*]] = vector.transfer_read %{{.*}}[%[[I2]], %[[I4]]], %{{.*}} {permutation_map = #[[$map_proj_d0d1_d0zero]]} : memref<?x?xf32>, vector<4x8xf32>
// VECT-NEXT: %[[C:.*]] = mulf %[[B]], %[[A]] : vector<4x8xf32>
More information about the Mlir-commits
mailing list