[Mlir-commits] [mlir] 0f24163 - [mlir] Replace vector-to-scf with progressive-vector-to-scf

Matthias Springer llvmlistbot at llvm.org
Thu May 13 07:27:40 PDT 2021


Author: Matthias Springer
Date: 2021-05-13T23:27:31+09:00
New Revision: 0f24163870e1a633c1d79377fdd188fe03769dd8

URL: https://github.com/llvm/llvm-project/commit/0f24163870e1a633c1d79377fdd188fe03769dd8
DIFF: https://github.com/llvm/llvm-project/commit/0f24163870e1a633c1d79377fdd188fe03769dd8.diff

LOG: [mlir] Replace vector-to-scf with progressive-vector-to-scf

Depends On D102388

Reviewed By: nicolasvasilache

Differential Revision: https://reviews.llvm.org/D102101

Added: 
    

Modified: 
    mlir/include/mlir/Conversion/VectorToSCF/VectorToSCF.h
    mlir/lib/Conversion/VectorToSCF/CMakeLists.txt
    mlir/lib/Conversion/VectorToSCF/VectorToSCF.cpp
    mlir/test/Conversion/VectorToSCF/unrolled-vector-to-loops.mlir
    mlir/test/Conversion/VectorToSCF/vector-to-loops.mlir
    mlir/test/Integration/Dialect/Vector/CPU/test-transfer-read-1d.mlir
    mlir/test/Integration/Dialect/Vector/CPU/test-transfer-read-2d.mlir
    mlir/test/Integration/Dialect/Vector/CPU/test-transfer-read-3d.mlir
    mlir/test/Integration/Dialect/Vector/CPU/test-transfer-read.mlir
    mlir/test/Integration/Dialect/Vector/CPU/test-transfer-to-loops.mlir
    mlir/test/lib/Transforms/TestVectorTransforms.cpp

Removed: 
    mlir/include/mlir/Conversion/VectorToSCF/ProgressiveVectorToSCF.h
    mlir/lib/Conversion/VectorToSCF/ProgressiveVectorToSCF.cpp
    mlir/test/Conversion/VectorToSCF/progressive-vector-to-loops.mlir


################################################################################
diff --git a/mlir/include/mlir/Conversion/VectorToSCF/ProgressiveVectorToSCF.h b/mlir/include/mlir/Conversion/VectorToSCF/ProgressiveVectorToSCF.h
deleted file mode 100644
index b69ec01a0d5c6..0000000000000
--- a/mlir/include/mlir/Conversion/VectorToSCF/ProgressiveVectorToSCF.h
+++ /dev/null
@@ -1,71 +0,0 @@
-//===- ProgressiveVectorToSCF.h - Convert vector to SCF dialect -*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef MLIR_CONVERSION_VECTORTOSCF_PROGRESSIVEVECTORTOSCF_H_
-#define MLIR_CONVERSION_VECTORTOSCF_PROGRESSIVEVECTORTOSCF_H_
-
-#include "mlir/IR/PatternMatch.h"
-
-namespace mlir {
-class MLIRContext;
-class Pass;
-class RewritePatternSet;
-
-/// When lowering an N-d vector transfer op to an (N-1)-d vector transfer op,
-/// a temporary buffer is created through which individual (N-1)-d vector are
-/// staged. This pattern can be applied multiple time, until the transfer op
-/// is 1-d.
-/// This is consistent with the lack of an LLVM instruction to dynamically
-/// index into an aggregate (see the Vector dialect lowering to LLVM deep dive).
-///
-/// An instruction such as:
-/// ```
-///    vector.transfer_write %vec, %A[%a, %b, %c] :
-///      vector<9x17x15xf32>, memref<?x?x?xf32>
-/// ```
-/// Lowers to pseudo-IR resembling (unpacking one dimension):
-/// ```
-///    %0 = alloca() : memref<vector<9x17x15xf32>>
-///    store %vec, %0[] : memref<vector<9x17x15xf32>>
-///    %1 = vector.type_cast %0 :
-///      memref<vector<9x17x15xf32>> to memref<9xvector<17x15xf32>>
-///    affine.for %I = 0 to 9 {
-///      %dim = dim %A, 0 : memref<?x?x?xf32>
-///      %add = affine.apply %I + %a
-///      %cmp = cmpi "slt", %add, %dim : index
-///      scf.if %cmp {
-///        %vec_2d = load %1[%I] : memref<9xvector<17x15xf32>>
-///        vector.transfer_write %vec_2d, %A[%add, %b, %c] :
-///          vector<17x15xf32>, memref<?x?x?xf32>
-/// ```
-///
-/// When applying the pattern a second time, the existing alloca() operation
-/// is reused and only a second vector.type_cast is added.
-
-struct ProgressiveVectorTransferToSCFOptions {
-  bool unroll = false;
-  ProgressiveVectorTransferToSCFOptions &setUnroll(bool u) {
-    unroll = u;
-    return *this;
-  }
-};
-
-/// Collect a set of patterns to convert from the Vector dialect to SCF + std.
-void populateProgressiveVectorToSCFConversionPatterns(
-    RewritePatternSet &patterns,
-    const ProgressiveVectorTransferToSCFOptions &options =
-        ProgressiveVectorTransferToSCFOptions());
-
-/// Create a pass to convert a subset of vector ops to SCF.
-std::unique_ptr<Pass> createProgressiveConvertVectorToSCFPass(
-    const ProgressiveVectorTransferToSCFOptions &options =
-        ProgressiveVectorTransferToSCFOptions());
-
-} // namespace mlir
-
-#endif // MLIR_CONVERSION_VECTORTOSCF_PROGRESSIVEVECTORTOSCF_H_

diff --git a/mlir/include/mlir/Conversion/VectorToSCF/VectorToSCF.h b/mlir/include/mlir/Conversion/VectorToSCF/VectorToSCF.h
index e8c7e651cc860..5a42b9a070f84 100644
--- a/mlir/include/mlir/Conversion/VectorToSCF/VectorToSCF.h
+++ b/mlir/include/mlir/Conversion/VectorToSCF/VectorToSCF.h
@@ -1,4 +1,4 @@
-//===- VectorToSCF.h - Utils to convert from the vector dialect -*- C++ -*-===//
+//===- VectorToSCF.h - Convert vector to SCF dialect ------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -15,57 +15,38 @@ namespace mlir {
 class MLIRContext;
 class Pass;
 class RewritePatternSet;
-using OwningRewritePatternList = RewritePatternSet;
 
-/// Control whether unrolling is used when lowering vector transfer ops to SCF.
+/// When lowering an N-d vector transfer op to an (N-1)-d vector transfer op,
+/// a temporary buffer is created through which individual (N-1)-d vectors are
+/// staged. This pattern can be applied multiple times, until the transfer op
+/// is 1-d.
+/// This is consistent with the lack of an LLVM instruction to dynamically
+/// index into an aggregate (see the Vector dialect lowering to LLVM deep dive).
 ///
-/// Case 1:
-/// =======
-/// When `unroll` is false, a temporary buffer is created through which
-/// individual 1-D vector are staged. this is consistent with the lack of an
-/// LLVM instruction to dynamically index into an aggregate (see the Vector
-/// dialect lowering to LLVM deep dive).
 /// An instruction such as:
 /// ```
-///    vector.transfer_write %vec, %A[%base, %base] :
-///      vector<17x15xf32>, memref<?x?xf32>
+///    vector.transfer_write %vec, %A[%a, %b, %c] :
+///      vector<9x17x15xf32>, memref<?x?x?xf32>
 /// ```
-/// Lowers to pseudo-IR resembling:
+/// Lowers to pseudo-IR resembling (unpacking one dimension):
 /// ```
-///    %0 = alloc() : memref<17xvector<15xf32>>
+///    %0 = alloca() : memref<vector<9x17x15xf32>>
+///    store %vec, %0[] : memref<vector<9x17x15xf32>>
 ///    %1 = vector.type_cast %0 :
-///      memref<17xvector<15xf32>> to memref<vector<17x15xf32>>
-///    store %vec, %1[] : memref<vector<17x15xf32>>
-///    %dim = dim %A, 0 : memref<?x?xf32>
-///    affine.for %I = 0 to 17 {
-///      %add = affine.apply %I + %base
+///      memref<vector<9x17x15xf32>> to memref<9xvector<17x15xf32>>
+///    affine.for %I = 0 to 9 {
+///      %dim = dim %A, 0 : memref<?x?x?xf32>
+///      %add = affine.apply %I + %a
 ///      %cmp = cmpi "slt", %add, %dim : index
 ///      scf.if %cmp {
-///        %vec_1d = load %0[%I] : memref<17xvector<15xf32>>
-///        vector.transfer_write %vec_1d, %A[%add, %base] :
-///          vector<15xf32>, memref<?x?xf32>
+///        %vec_2d = load %1[%I] : memref<9xvector<17x15xf32>>
+///        vector.transfer_write %vec_2d, %A[%add, %b, %c] :
+///          vector<17x15xf32>, memref<?x?x?xf32>
 /// ```
 ///
-/// Case 2:
-/// =======
-/// When `unroll` is true, the temporary buffer is skipped and static indices
-/// into aggregates can be used (see the Vector dialect lowering to LLVM deep
-/// dive).
-/// An instruction such as:
-/// ```
-///    vector.transfer_write %vec, %A[%base, %base] :
-///      vector<3x15xf32>, memref<?x?xf32>
-/// ```
-/// Lowers to pseudo-IR resembling:
-/// ```
-///    %0 = vector.extract %arg2[0] : vector<3x15xf32>
-///    vector.transfer_write %0, %arg0[%arg1, %arg1] : vector<15xf32>,
-///    memref<?x?xf32> %1 = affine.apply #map1()[%arg1] %2 = vector.extract
-///    %arg2[1] : vector<3x15xf32> vector.transfer_write %2, %arg0[%1, %arg1] :
-///    vector<15xf32>, memref<?x?xf32> %3 = affine.apply #map2()[%arg1] %4 =
-///    vector.extract %arg2[2] : vector<3x15xf32> vector.transfer_write %4,
-///    %arg0[%3, %arg1] : vector<15xf32>, memref<?x?xf32>
-/// ```
+/// When applying the pattern a second time, the existing alloca() operation
+/// is reused and only a second vector.type_cast is added.
+
 struct VectorTransferToSCFOptions {
   bool unroll = false;
   VectorTransferToSCFOptions &setUnroll(bool u) {
@@ -74,93 +55,6 @@ struct VectorTransferToSCFOptions {
   }
 };
 
-/// Implements lowering of TransferReadOp and TransferWriteOp to a
-/// proper abstraction for the hardware.
-///
-/// There are multiple cases.
-///
-/// Case A: Permutation Map does not permute or broadcast.
-/// ======================================================
-///
-/// Progressive lowering occurs to 1-D vector transfer ops according to the
-/// description in `VectorTransferToSCFOptions`.
-///
-/// Case B: Permutation Map permutes and/or broadcasts.
-/// ======================================================
-///
-/// This path will be progressively deprecated and folded into the case above by
-/// using vector broadcast and transpose operations.
-///
-/// This path only emits a simple loop nest that performs clipped pointwise
-/// copies from a remote to a locally allocated memory.
-///
-/// Consider the case:
-///
-/// ```mlir
-///    // Read the slice `%A[%i0, %i1:%i1+256, %i2:%i2+32]` into
-///    // vector<32x256xf32> and pad with %f0 to handle the boundary case:
-///    %f0 = constant 0.0f : f32
-///    scf.for %i0 = 0 to %0 {
-///      scf.for %i1 = 0 to %1 step %c256 {
-///        scf.for %i2 = 0 to %2 step %c32 {
-///          %v = vector.transfer_read %A[%i0, %i1, %i2], %f0
-///               {permutation_map: (d0, d1, d2) -> (d2, d1)} :
-///               memref<?x?x?xf32>, vector<32x256xf32>
-///    }}}
-/// ```
-///
-/// The rewriters construct loop and indices that access MemRef A in a pattern
-/// resembling the following (while guaranteeing an always full-tile
-/// abstraction):
-///
-/// ```mlir
-///    scf.for %d2 = 0 to %c256 {
-///      scf.for %d1 = 0 to %c32 {
-///        %s = %A[%i0, %i1 + %d1, %i2 + %d2] : f32
-///        %tmp[%d2, %d1] = %s
-///      }
-///    }
-/// ```
-///
-/// In the current state, only a clipping transfer is implemented by `clip`,
-/// which creates individual indexing expressions of the form:
-///
-/// ```mlir-dsc
-///    auto condMax = i + ii < N;
-///    auto max = std_select(condMax, i + ii, N - one)
-///    auto cond = i + ii < zero;
-///    std_select(cond, zero, max);
-/// ```
-///
-/// In the future, clipping should not be the only way and instead we should
-/// load vectors + mask them. Similarly on the write side, load/mask/store for
-/// implementing RMW behavior.
-///
-/// Lowers TransferOp into a combination of:
-///   1. local memory allocation;
-///   2. perfect loop nest over:
-///      a. scalar load/stores from local buffers (viewed as a scalar memref);
-///      a. scalar store/load to original memref (with clipping).
-///   3. vector_load/store
-///   4. local memory deallocation.
-/// Minor variations occur depending on whether a TransferReadOp or
-/// a TransferWriteOp is rewritten.
-template <typename TransferOpTy>
-struct VectorTransferRewriter : public RewritePattern {
-  explicit VectorTransferRewriter(VectorTransferToSCFOptions options,
-                                  MLIRContext *context);
-
-  /// Used for staging the transfer in a local buffer.
-  MemRefType tmpMemRefType(TransferOpTy transfer) const;
-
-  /// Performs the rewrite.
-  LogicalResult matchAndRewrite(Operation *op,
-                                PatternRewriter &rewriter) const override;
-
-  /// See description of `VectorTransferToSCFOptions`.
-  VectorTransferToSCFOptions options;
-};
-
 /// Collect a set of patterns to convert from the Vector dialect to SCF + std.
 void populateVectorToSCFConversionPatterns(
     RewritePatternSet &patterns,

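For readers of this change, here is a minimal usage sketch of the API kept in
VectorToSCF.h. The helper name and the surrounding pattern set are illustrative
assumptions, not part of the commit:

```c++
// Sketch only: `addVectorToSCFPatterns` is an illustrative helper, not part
// of this commit; the RewritePatternSet is assumed to come from the caller.
#include "mlir/Conversion/VectorToSCF/VectorToSCF.h"

static void addVectorToSCFPatterns(mlir::RewritePatternSet &patterns,
                                   bool unroll) {
  mlir::VectorTransferToSCFOptions options;
  // false: stage (N-1)-d vectors through a temporary alloca()'d buffer.
  // true:  fully unroll the generated loop instead, so no buffer is needed.
  options.setUnroll(unroll);
  mlir::populateVectorToSCFConversionPatterns(patterns, options);
}
```

With `unroll = false` the staged-buffer lowering documented in the comment
above is used; with `unroll = true` the unrolled variant is selected.
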
diff --git a/mlir/lib/Conversion/VectorToSCF/CMakeLists.txt b/mlir/lib/Conversion/VectorToSCF/CMakeLists.txt
index 1e61aa924c3e9..2a7ee5ea8a58d 100644
--- a/mlir/lib/Conversion/VectorToSCF/CMakeLists.txt
+++ b/mlir/lib/Conversion/VectorToSCF/CMakeLists.txt
@@ -1,5 +1,4 @@
 add_mlir_conversion_library(MLIRVectorToSCF
-  ProgressiveVectorToSCF.cpp
   VectorToSCF.cpp
 
   ADDITIONAL_HEADER_DIRS

diff --git a/mlir/lib/Conversion/VectorToSCF/ProgressiveVectorToSCF.cpp b/mlir/lib/Conversion/VectorToSCF/ProgressiveVectorToSCF.cpp
deleted file mode 100644
index 981322899a2ee..0000000000000
--- a/mlir/lib/Conversion/VectorToSCF/ProgressiveVectorToSCF.cpp
+++ /dev/null
@@ -1,1142 +0,0 @@
-//===- ProgressiveVectorToSCF.h - Convert vector to SCF dialect -*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements lowering of vector transfer operations to SCF.
-//
-//===----------------------------------------------------------------------===//
-
-#include <type_traits>
-
-#include "mlir/Conversion/VectorToSCF/ProgressiveVectorToSCF.h"
-
-#include "../PassDetail.h"
-#include "mlir/Dialect/Affine/EDSC/Intrinsics.h"
-#include "mlir/Dialect/MemRef/EDSC/Intrinsics.h"
-#include "mlir/Dialect/SCF/EDSC/Intrinsics.h"
-#include "mlir/Dialect/StandardOps/EDSC/Intrinsics.h"
-#include "mlir/Dialect/Vector/EDSC/Intrinsics.h"
-#include "mlir/Dialect/Vector/VectorOps.h"
-#include "mlir/Dialect/Vector/VectorUtils.h"
-#include "mlir/IR/Builders.h"
-#include "mlir/Pass/Pass.h"
-#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
-#include "mlir/Transforms/Passes.h"
-
-using namespace mlir;
-using namespace mlir::edsc;
-using namespace mlir::edsc::intrinsics;
-using vector::TransferReadOp;
-using vector::TransferWriteOp;
-
-namespace {
-
-/// Attribute name used for labeling transfer ops during progressive lowering.
-static const char kPassLabel[] = "__vector_to_scf_lowering__";
-
-/// Lower to 1D transfer ops. Target-specific lowering will lower those.
-static const int64_t kTargetRank = 1;
-
-/// Given a MemRefType with VectorType element type, unpack one dimension from
-/// the VectorType into the MemRefType.
-///
-/// E.g.: memref<9xvector<5x6xf32>> --> memref<9x5xvector<6xf32>>
-static MemRefType unpackOneDim(MemRefType type) {
-  auto vectorType = type.getElementType().dyn_cast<VectorType>();
-  auto memrefShape = type.getShape();
-  SmallVector<int64_t, 8> newMemrefShape;
-  newMemrefShape.append(memrefShape.begin(), memrefShape.end());
-  newMemrefShape.push_back(vectorType.getDimSize(0));
-  return MemRefType::get(newMemrefShape,
-                         VectorType::get(vectorType.getShape().drop_front(),
-                                         vectorType.getElementType()));
-}
-
-/// Helper data structure for data and mask buffers.
-struct BufferAllocs {
-  Value dataBuffer;
-  Value maskBuffer;
-};
-
-/// Allocate temporary buffers for data (vector) and mask (if present).
-/// TODO: Parallelism and threadlocal considerations.
-template <typename OpTy>
-static BufferAllocs allocBuffers(OpTy xferOp) {
-  auto &b = ScopedContext::getBuilderRef();
-  OpBuilder::InsertionGuard guard(b);
-  Operation *scope =
-      xferOp->template getParentWithTrait<OpTrait::AutomaticAllocationScope>();
-  assert(scope && "Expected op to be inside automatic allocation scope");
-  b.setInsertionPointToStart(&scope->getRegion(0).front());
-
-  BufferAllocs result;
-  auto bufferType = MemRefType::get({}, xferOp.getVectorType());
-  result.dataBuffer = memref_alloca(bufferType).value;
-
-  if (xferOp.mask()) {
-    auto maskType = MemRefType::get({}, xferOp.mask().getType());
-    Value maskBuffer = memref_alloca(maskType);
-    memref_store(xferOp.mask(), maskBuffer);
-    result.maskBuffer = memref_load(maskBuffer);
-  }
-
-  return result;
-}
-
-/// Given a vector transfer op, calculate which dimension of the `source`
-/// memref should be unpacked in the next application of TransferOpConversion.
-/// A return value of None indicates a broadcast.
-template <typename OpTy>
-static Optional<int64_t> unpackedDim(OpTy xferOp) {
-  auto map = xferOp.permutation_map();
-  if (auto expr = map.getResult(0).template dyn_cast<AffineDimExpr>()) {
-    return expr.getPosition();
-  }
-  assert(xferOp.isBroadcastDim(0) &&
-         "Expected AffineDimExpr or AffineConstantExpr");
-  return None;
-}
-
-/// Compute the permutation map for the new (N-1)-D vector transfer op. This
-/// map is identical to the current permutation map, but the first result is
-/// omitted.
-template <typename OpTy>
-static AffineMap unpackedPermutationMap(OpTy xferOp, OpBuilder &builder) {
-  auto map = xferOp.permutation_map();
-  return AffineMap::get(
-      map.getNumDims(), 0, map.getResults().drop_front(),
-      builder.getContext());
-}
-
-/// Calculate the indices for the new vector transfer op.
-///
-/// E.g.: transfer_read %A[%a, %b, %c, %d] ... : vector<5x4x3xf32> ...
-///       --> transfer_read %A[%a, %b + iv, %c, %d] ... vector<4x3f32>
-///                                 ^^^^^^
-///              `iv` is the iteration variable of the (new) surrounding loop.
-template <typename OpTy>
-static void getXferIndices(OpTy xferOp, Value iv,
-                           SmallVector<Value, 8> &indices) {
-  typename OpTy::Adaptor adaptor(xferOp);
-  // Corresponding memref dim of the vector dim that is unpacked.
-  auto dim = unpackedDim(xferOp);
-  auto prevIndices = adaptor.indices();
-  indices.append(prevIndices.begin(), prevIndices.end());
-
-  bool isBroadcast = !dim.hasValue();
-  if (!isBroadcast) {
-    using edsc::op::operator+;
-    indices[dim.getValue()] = adaptor.indices()[dim.getValue()] + iv;
-  }
-}
-
-static void maybeYieldValue(
-    bool hasRetVal, OpBuilder builder, Location loc, Value value) {
-  if (hasRetVal) {
-    builder.create<scf::YieldOp>(loc, value);
-  } else {
-    builder.create<scf::YieldOp>(loc);
-  }
-}
-
-/// Generates a boolean Value that is true if the iv-th bit in xferOp's mask
-/// is set to true. No such check is generated under following circumstances:
-/// * xferOp does not have a mask.
-/// * xferOp's mask is not 1D. (In case of (N>1)-D, a subvector of the mask is
-///   computed and attached to the new transfer op in the pattern.)
-/// * The to-be-unpacked dim of xferOp is a broadcast.
-template <typename OpTy>
-static Value generateMaskCheck(OpBuilder &builder, OpTy xferOp, Value iv) {
-  if (!xferOp.mask())
-    return Value();
-  if (xferOp.getMaskType().getRank() != 1)
-    return Value();
-  if (xferOp.isBroadcastDim(0))
-    return Value();
-
-  auto ivI32 = std_index_cast(IntegerType::get(builder.getContext(), 32), iv);
-  return vector_extract_element(xferOp.mask(), ivI32).value;
-}
-
-/// Helper function TransferOpConversion and TransferOp1dConversion.
-/// Generate an in-bounds check if the transfer op may go out-of-bounds on the
-/// specified dimension `dim` with the loop iteration variable `iv`.
-/// E.g., when unpacking dimension 0 from:
-/// ```
-/// %vec = vector.transfer_read %A[%a, %b] %cst
-///     : vector<5x4xf32>, memref<?x?xf32>
-/// ```
-/// An if check similar to this will be generated inside the loop:
-/// ```
-/// %d = memref.dim %A, %c0 : memref<?x?xf32>
-/// if (%a + iv < %d) {
-///   (in-bounds case)
-/// } else {
-///   (out-of-bounds case)
-/// }
-/// ```
-///
-/// If the transfer is 1D and has a mask, this function generates a more complex
-/// check also accounts for potentially masked out elements.
-///
-/// This function variant returns the value returned by `inBoundsCase` or
-/// `outOfBoundsCase`. The MLIR type of the return value must be specified in
-/// `resultTypes`.
-template <typename OpTy>
-static Value generateInBoundsCheck(
-    OpTy xferOp, Value iv, OpBuilder &builder, Optional<int64_t> dim,
-    TypeRange resultTypes,
-    function_ref<Value(OpBuilder &, Location)> inBoundsCase,
-    function_ref<Value(OpBuilder &, Location)> outOfBoundsCase = nullptr) {
-  bool hasRetVal = !resultTypes.empty();
-  Value cond; // Condition to be built...
-
-  // Condition check 1: Access in-bounds?
-  bool isBroadcast = !dim.hasValue();  // No in-bounds check for broadcasts.
-  if (!xferOp.isDimInBounds(0) && !isBroadcast) {
-    auto memrefDim =
-        memref_dim(xferOp.source(), std_constant_index(dim.getValue()));
-    using edsc::op::operator+;
-    auto memrefIdx = xferOp.indices()[dim.getValue()] + iv;
-    cond = std_cmpi_sgt(memrefDim.value, memrefIdx);
-  }
-
-  // Condition check 2: Masked in?
-  if (auto maskCond = generateMaskCheck(builder, xferOp, iv)) {
-    if (cond) {
-      cond = builder.create<AndOp>(xferOp.getLoc(), cond, maskCond);
-    } else {
-      cond = maskCond;
-    }
-  }
-
-  // If the condition is non-empty, generate an SCF::IfOp.
-  if (cond) {
-    auto check = builder.create<scf::IfOp>(
-        xferOp.getLoc(), resultTypes, cond,
-        /*thenBuilder=*/[&](OpBuilder &builder, Location loc) {
-      maybeYieldValue(hasRetVal, builder, loc, inBoundsCase(builder, loc));
-    }, /*elseBuilder=*/[&](OpBuilder &builder, Location loc) {
-      if (outOfBoundsCase) {
-        maybeYieldValue(hasRetVal, builder, loc, outOfBoundsCase(builder, loc));
-      } else {
-        builder.create<scf::YieldOp>(loc);
-      }
-    });
-
-    return hasRetVal ? check.getResult(0) : Value();
-  }
-
-  // Condition is empty, no need for an SCF::IfOp.
-  return inBoundsCase(builder, xferOp.getLoc());
-}
-
-/// In this function variant, `inBoundsCase` and `outOfBoundsCase` do not have
-/// a return value. Consequently, this function does not have a return value.
-template <typename OpTy>
-static void generateInBoundsCheck(
-    OpTy xferOp, Value iv, OpBuilder &builder, Optional<int64_t> dim,
-    function_ref<void(OpBuilder &, Location)> inBoundsCase,
-    function_ref<void(OpBuilder &, Location)> outOfBoundsCase = nullptr) {
-  generateInBoundsCheck(
-      xferOp, iv, builder, dim, /*resultTypes=*/TypeRange(),
-      /*inBoundsCase=*/[&](OpBuilder &builder, Location loc) {
-        inBoundsCase(builder, loc);
-        return Value();
-      },
-      /*outOfBoundsCase=*/[&](OpBuilder &builder, Location loc) {
-        if (outOfBoundsCase)
-            outOfBoundsCase(builder, loc);
-        return Value();
-      });
-}
-
-/// Given an ArrayAttr, return a copy where the first element is dropped.
-static ArrayAttr dropFirstElem(OpBuilder &builder, ArrayAttr attr) {
-  if (!attr)
-      return attr;
-  return ArrayAttr::get(builder.getContext(), attr.getValue().drop_front());
-}
-
-/// Add the pass label to a vector transfer op if its rank is not the target
-/// rank.
-template <typename OpTy>
-static void maybeApplyPassLabel(OpBuilder &builder, OpTy newXferOp) {
-  if (newXferOp.getVectorType().getRank() > kTargetRank)
-    newXferOp->setAttr(kPassLabel, builder.getUnitAttr());
-}
-
-/// Given a transfer op, find the memref from which the mask is loaded. This
-/// is similar to Strategy<TransferWriteOp>::getBuffer.
-template <typename OpTy>
-static Value getMaskBuffer(OpTy xferOp) {
-  assert(xferOp.mask() && "Expected that transfer op has mask");
-  auto loadOp = xferOp.mask().template getDefiningOp<memref::LoadOp>();
-  assert(loadOp && "Expected transfer op mask produced by LoadOp");
-  return loadOp.getMemRef();
-}
-
-/// Codegen strategy, depending on the operation.
-template <typename OpTy>
-struct Strategy;
-
-/// Code strategy for vector TransferReadOp.
-template<>
-struct Strategy<TransferReadOp> {
-  /// Find the StoreOp that is used for writing the current TransferReadOp's
-  /// result to the temporary buffer allocation.
-  static memref::StoreOp getStoreOp(TransferReadOp xferOp) {
-    assert(xferOp->hasOneUse() && "Expected exactly one use of TransferReadOp");
-    auto storeOp = dyn_cast<memref::StoreOp>(
-        (*xferOp->use_begin()).getOwner());
-    assert(storeOp && "Expected TransferReadOp result used by StoreOp");
-    return storeOp;
-  }
-
-  /// Find the temporary buffer allocation. All labeled TransferReadOps are
-  /// used like this, where %buf is either the buffer allocation or a type cast
-  /// of the buffer allocation:
-  /// ```
-  /// %vec = vector.transfer_read ... { __vector_to_scf_lowering__ } ...
-  /// memref.store %vec, %buf[...] ...
-  /// ```
-  static Value getBuffer(TransferReadOp xferOp) {
-    return getStoreOp(xferOp).getMemRef();
-  }
-
-  /// Retrieve the indices of the current StoreOp that stores into the buffer.
-  static void getBufferIndices(TransferReadOp xferOp,
-                               SmallVector<Value, 8> &indices) {
-    auto storeOp = getStoreOp(xferOp);
-    auto prevIndices = memref::StoreOpAdaptor(storeOp).indices();
-    indices.append(prevIndices.begin(), prevIndices.end());
-  }
-
-  /// Rewrite the TransferReadOp, assuming that there are no out-of-bounds
-  /// accesses on the to-be-unpacked dimension.
-  ///
-  /// 1. Generate a new (N-1)-d TransferReadOp using the loop iteration
-  ///    variable `iv`.
-  /// 2. Store the result into the (already `vector.type_cast`ed) buffer.
-  ///
-  /// E.g.:
-  /// ```
-  /// %vec = vector.transfer_read %A[%a+%i, %b, %c], %cst
-  ///     : memref<?x?x?xf32>, vector<4x3xf32>
-  /// memref.store %vec, %buf[%i] : memref<5xvector<4x3xf32>>
-  /// ```
-  /// Is rewritten to:
-  /// ```
-  /// %casted = vector.type_cast %buf
-  ///     : memref<5xvector<4x3xf32>> to memref<5x4xvector<3xf32>>
-  /// for %j = 0 to 4 {
-  ///   %vec = vector.transfer_read %A[%a+%i, %b+%j, %c], %cst
-  ///       : memref<?x?x?xf32>, vector<3xf32>
-  ///   memref.store %vec, %casted[%i, %j] : memref<5x4xvector<3xf32>>
-  /// }
-  /// ```
-  ///
-  /// Note: The loop and type cast are generated in TransferOpConversion.
-  ///       The original TransferReadOp and store op are deleted in `cleanup`.
-  /// Note: The `mask` operand is set in TransferOpConversion.
-  static TransferReadOp rewriteOp(OpBuilder &builder, TransferReadOp xferOp,
-                                  Value buffer, Value iv) {
-    SmallVector<Value, 8> storeIndices;
-    getBufferIndices(xferOp, storeIndices);
-    storeIndices.push_back(iv);
-
-    SmallVector<Value, 8> xferIndices;
-    getXferIndices(xferOp, iv, xferIndices);
-
-    auto bufferType = buffer.getType().dyn_cast<ShapedType>();
-    auto vecType = bufferType.getElementType().dyn_cast<VectorType>();
-    auto inBoundsAttr = dropFirstElem(builder, xferOp.in_boundsAttr());
-    auto newXfer = vector_transfer_read(
-        vecType, xferOp.source(), xferIndices,
-        AffineMapAttr::get(unpackedPermutationMap(xferOp, builder)),
-        xferOp.padding(), Value(), inBoundsAttr).value;
-
-    maybeApplyPassLabel(builder,
-                        dyn_cast<TransferReadOp>(newXfer.getDefiningOp()));
-
-    memref_store(newXfer, buffer, storeIndices);
-    return newXfer.getDefiningOp<TransferReadOp>();
-  }
-
-  /// Handle out-of-bounds accesses on the to-be-unpacked dimension: Write
-  /// padding value to the temporary buffer.
-  static void handleOutOfBoundsDim(
-      OpBuilder &/*builder*/, TransferReadOp xferOp, Value buffer,
-      Value iv) {
-    SmallVector<Value, 8> storeIndices;
-    getBufferIndices(xferOp, storeIndices);
-    storeIndices.push_back(iv);
-
-    auto bufferType = buffer.getType().dyn_cast<ShapedType>();
-    auto vecType = bufferType.getElementType().dyn_cast<VectorType>();
-    auto vec = std_splat(vecType, xferOp.padding());
-    memref_store(vec, buffer, storeIndices);
-  }
-
-  /// Cleanup after rewriting the op.
-  static void cleanup(PatternRewriter &rewriter, TransferReadOp xferOp) {
-    rewriter.eraseOp(getStoreOp(xferOp));
-    rewriter.eraseOp(xferOp);
-  }
-};
-
-/// Codegen strategy for vector TransferWriteOp.
-template<>
-struct Strategy<TransferWriteOp> {
-  /// Find the temporary buffer allocation. All labeled TransferWriteOps are
-  /// used like this, where %buf is either the buffer allocation or a type cast
-  /// of the buffer allocation:
-  /// ```
-  /// %vec = memref.load %buf[...] ...
-  /// vector.transfer_write %vec ... { __vector_to_scf_lowering__ } ...
-  /// ```
-  static Value getBuffer(TransferWriteOp xferOp) {
-    auto loadOp = xferOp.vector().getDefiningOp<memref::LoadOp>();
-    assert(loadOp && "Expected transfer op vector produced by LoadOp");
-    return loadOp.getMemRef();
-  }
-
-  /// Retrieve the indices of the current LoadOp that loads from the buffer.
-  static void getBufferIndices(TransferWriteOp xferOp,
-                               SmallVector<Value, 8> &indices) {
-    auto loadOp = xferOp.vector().getDefiningOp<memref::LoadOp>();
-    auto prevIndices = memref::LoadOpAdaptor(loadOp).indices();
-    indices.append(prevIndices.begin(), prevIndices.end());
-  }
-
-  /// Rewrite the TransferWriteOp, assuming that there are no out-of-bounds
-  /// accesses on the to-be-unpacked dimension.
-  ///
-  /// 1. Load an (N-1)-d vector from the (already `vector.type_cast`ed) buffer,
-  ///    using the loop iteration variable `iv`.
-  /// 2. Generate a new (N-1)-d TransferWriteOp, writing the loaded vector back
-  ///    to memory.
-  ///
-  /// Note: For more details, see comments on Strategy<TransferReadOp>.
-  static TransferWriteOp rewriteOp(OpBuilder &builder, TransferWriteOp xferOp,
-                                   Value buffer, Value iv) {
-    SmallVector<Value, 8> loadIndices;
-    getBufferIndices(xferOp, loadIndices);
-    loadIndices.push_back(iv);
-
-    SmallVector<Value, 8> xferIndices;
-    getXferIndices(xferOp, iv, xferIndices);
-
-    auto vec = memref_load(buffer, loadIndices);
-    auto inBoundsAttr = dropFirstElem(builder, xferOp.in_boundsAttr());
-    auto newXfer = vector_transfer_write(
-        Type(), vec, xferOp.source(), xferIndices,
-        AffineMapAttr::get(unpackedPermutationMap(xferOp, builder)),
-        Value(), inBoundsAttr);
-
-    maybeApplyPassLabel(builder, newXfer.op);
-
-    return newXfer;
-  }
-
-  /// Handle out-of-bounds accesses on the to-be-unpacked dimension.
-  static void handleOutOfBoundsDim(
-      OpBuilder &builder, TransferWriteOp xferOp, Value buffer,
-      Value iv) {}
-
-  /// Cleanup after rewriting the op.
-  static void cleanup(PatternRewriter &rewriter, TransferWriteOp xferOp) {
-    rewriter.eraseOp(xferOp);
-  }
-};
-
-template <typename OpTy>
-LogicalResult checkPrepareXferOp(OpTy xferOp) {
-  if (xferOp->hasAttr(kPassLabel))
-      return failure();
-  if (xferOp.getVectorType().getRank() <= kTargetRank)
-      return failure();
-  return success();
-}
-
-/// Prepare a TransferReadOp for progressive lowering.
-///
-/// 1. Allocate a temporary buffer.
-/// 2. Label the TransferReadOp, marking it eligible for progressive lowering.
-/// 3. Store the result of the TransferReadOp into the temporary buffer.
-/// 4. Load the result from the temporary buffer and replace all uses of the
-///    original TransferReadOp with this load.
-///
-/// E.g.:
-/// ```
-/// %vec = vector.transfer_read %A[%a, %b, %c], %cst
-///     : vector<5x4xf32>, memref<?x?x?xf32>
-/// ```
-/// is rewritten to:
-/// ```
-/// %0 = memref.alloca() : memref<vector<5x4xf32>>
-/// %1 = vector.transfer_read %A[%a, %b, %c], %cst
-///     { __vector_to_scf_lowering__ } : vector<5x4xf32>, memref<?x?x?xf32>
-/// memref.store %1, %0[] : memref<vector<5x4xf32>>
-/// %vec = memref.load %0[] : memref<vector<5x4xf32>>
-/// ```
-///
-/// Note: A second temporary buffer may be allocated for the `mask` operand.
-struct PrepareTransferReadConversion
-    : public OpRewritePattern<TransferReadOp> {
-  using OpRewritePattern<TransferReadOp>::OpRewritePattern;
-
-  LogicalResult matchAndRewrite(TransferReadOp xferOp,
-                                PatternRewriter &rewriter) const override {
-    if (checkPrepareXferOp(xferOp).failed())
-      return failure();
-
-    ScopedContext scope(rewriter, xferOp.getLoc());
-    auto buffers = allocBuffers(xferOp);
-    auto *newXfer = rewriter.clone(*xferOp.getOperation());
-    newXfer->setAttr(kPassLabel, rewriter.getUnitAttr());
-    if (xferOp.mask()) {
-      dyn_cast<TransferReadOp>(newXfer).maskMutable().assign(
-          buffers.maskBuffer);
-    }
-
-    memref_store(newXfer->getResult(0), buffers.dataBuffer);
-    rewriter.replaceOpWithNewOp<memref::LoadOp>(xferOp, buffers.dataBuffer);
-
-    return success();
-  }
-};
-
-/// Prepare a TransferWriteOp for progressive lowering.
-///
-/// 1. Allocate a temporary buffer.
-/// 2. Store the vector into the buffer.
-/// 3. Load the vector from the buffer again.
-/// 4. Use the loaded vector as a TransferWriteOp operand and label the op,
-///    marking it eligible for progressive lowering via TransferOpConversion.
-///
-/// E.g.:
-/// ```
-/// vector.transfer_write %vec, %A[%a, %b, %c]
-///     : vector<5x4xf32>, memref<?x?x?xf32>
-/// ```
-/// is rewritten to:
-/// ```
-/// %0 = memref.alloca() : memref<vector<5x4xf32>>
-/// memref.store %vec, %0[] : memref<vector<5x4xf32>>
-/// %1 = memref.load %0[] : memref<vector<5x4xf32>>
-/// vector.transfer_write %1, %A[%a, %b, %c] { __vector_to_scf_lowering__ }
-///     : vector<5x4xf32>, memref<?x?x?xf32>
-/// ```
-///
-/// Note: A second temporary buffer may be allocated for the `mask` operand.
-struct PrepareTransferWriteConversion
-    : public OpRewritePattern<TransferWriteOp> {
-  using OpRewritePattern<TransferWriteOp>::OpRewritePattern;
-
-  LogicalResult matchAndRewrite(TransferWriteOp xferOp,
-                                PatternRewriter &rewriter) const override {
-    if (checkPrepareXferOp(xferOp).failed())
-      return failure();
-
-    ScopedContext scope(rewriter, xferOp.getLoc());
-    auto buffers = allocBuffers(xferOp);
-    memref_store(xferOp.vector(), buffers.dataBuffer);
-    auto loadedVec = memref_load(buffers.dataBuffer);
-    rewriter.updateRootInPlace(xferOp, [&]() {
-      xferOp.vectorMutable().assign(loadedVec);
-      xferOp->setAttr(kPassLabel, rewriter.getUnitAttr());
-    });
-
-    if (xferOp.mask()) {
-      rewriter.updateRootInPlace(
-          xferOp, [&]() { xferOp.maskMutable().assign(buffers.maskBuffer); });
-    }
-
-    return success();
-  }
-};
-
-/// Progressive lowering of vector transfer ops: Unpack one dimension.
-///
-/// 1. Unpack one dimension from the current buffer type and cast the buffer
-///    to that new type. E.g.:
-///    ```
-///    %vec = memref.load %0[%1] : memref<5xvector<4x3xf32>>
-///    vector.transfer_write %vec ...
-///    ```
-///    The following cast is generated:
-///    ```
-///    %casted = vector.type_cast %0
-///        : memref<5xvector<4x3xf32>> to memref<5x4xvector<3xf32>>
-///    ```
-/// 2. Generate a for loop and rewrite the transfer op according to the
-///    corresponding Strategy<OpTy>. If the to-be-unpacked dimension can be
-///    out-of-bounds, generate an if-check and handle both cases separately.
-/// 3. Clean up according to the corresponding Strategy<OpTy>.
-template <typename OpTy>
-struct TransferOpConversion : public OpRewritePattern<OpTy> {
-  using OpRewritePattern<OpTy>::OpRewritePattern;
-
-  LogicalResult matchAndRewrite(OpTy xferOp,
-                                PatternRewriter &rewriter) const override {
-    if (!xferOp->hasAttr(kPassLabel))
-        return failure();
-
-    ScopedContext scope(rewriter, xferOp.getLoc());
-
-    // Find and cast data buffer. How the buffer can be found depends on OpTy.
-    auto dataBuffer = Strategy<OpTy>::getBuffer(xferOp);
-    auto dataBufferType = dataBuffer.getType().template dyn_cast<MemRefType>();
-    auto castedDataType = unpackOneDim(dataBufferType);
-    auto castedDataBuffer = vector_type_cast(castedDataType, dataBuffer);
-
-    // If the xferOp has a mask: Find and cast mask buffer.
-    Value castedMaskBuffer;
-    if (xferOp.mask()) {
-      auto maskBuffer = getMaskBuffer(xferOp);
-      auto maskBufferType =
-          maskBuffer.getType().template dyn_cast<MemRefType>();
-      if (xferOp.isBroadcastDim(0) || xferOp.getMaskType().getRank() == 1) {
-        // Do not unpack a dimension of the mask, if:
-        // * To-be-unpacked transfer op dimension is a broadcast.
-        // * Mask is 1D, i.e., the mask cannot be further unpacked.
-        //   (That means that all remaining dimensions of the transfer op must
-        //   be broadcasted.)
-        castedMaskBuffer = maskBuffer;
-      } else {
-        auto castedMaskType = unpackOneDim(maskBufferType);
-        castedMaskBuffer = vector_type_cast(castedMaskType, maskBuffer);
-      }
-    }
-
-    // Loop bounds and step.
-    auto lb = std_constant_index(0).value;
-    auto ub = std_constant_index(
-                  castedDataType.getDimSize(castedDataType.getRank() - 1))
-                  .value;
-    auto step = std_constant_index(1).value;
-
-    // Generate for loop.
-    rewriter.create<scf::ForOp>(
-        xferOp.getLoc(), lb, ub, step, ValueRange(),
-        [&](OpBuilder &b, Location loc, Value iv,
-            ValueRange /*loopState*/) {
-      ScopedContext scope(b, loc);
-      generateInBoundsCheck(
-          xferOp, iv, b, unpackedDim(xferOp),
-          /*inBoundsCase=*/
-          [&](OpBuilder &b, Location /*loc*/) {
-            // Create new transfer op.
-            OpTy newXfer =
-                Strategy<OpTy>::rewriteOp(b, xferOp, castedDataBuffer, iv);
-
-            // If old transfer op has a mask: Set mask on new transfer op.
-            // Special case: If the mask of the old transfer op is 1D and the
-            //               unpacked dim is not a broadcast, no mask is needed
-            //               on the new transfer op.
-            if (xferOp.mask() && (xferOp.isBroadcastDim(0) ||
-                                  xferOp.getMaskType().getRank() > 1)) {
-              OpBuilder::InsertionGuard guard(b);
-              b.setInsertionPoint(newXfer); // Insert load before newXfer.
-
-              SmallVector<Value, 8> loadIndices;
-              Strategy<OpTy>::getBufferIndices(xferOp, loadIndices);
-              // In case of broadcast: Use same indices to load from memref as
-              // before.
-              if (!xferOp.isBroadcastDim(0))
-                loadIndices.push_back(iv);
-
-              auto mask = memref_load(castedMaskBuffer, loadIndices);
-              rewriter.updateRootInPlace(
-                  newXfer, [&]() { newXfer.maskMutable().assign(mask); });
-            }
-          },
-          /*outOfBoundsCase=*/
-          [&](OpBuilder &b, Location /*loc*/) {
-            Strategy<OpTy>::handleOutOfBoundsDim(b, xferOp, castedDataBuffer,
-                                                 iv);
-          });
-      b.create<scf::YieldOp>(loc);
-    });
-
-    Strategy<OpTy>::cleanup(rewriter, xferOp);
-    return success();
-  }
-};
-
-/// If the original transfer op has a mask, compute the mask of the new transfer
-/// op (for the current iteration `i`) and assign it.
-template <typename OpTy>
-static void maybeAssignMask(OpBuilder &builder, OpTy xferOp, OpTy newXferOp,
-                            int64_t i) {
-  if (!xferOp.mask())
-    return;
-
-  if (xferOp.isBroadcastDim(0)) {
-    // To-be-unpacked dimension is a broadcast, which does not have a
-    // corresponding mask dimension. Mask attribute remains unchanged.
-    newXferOp.maskMutable().assign(xferOp.mask());
-    return;
-  }
-
-  if (xferOp.getMaskType().getRank() > 1) {
-    // Unpack one dimension of the mask.
-    OpBuilder::InsertionGuard guard(builder);
-    builder.setInsertionPoint(newXferOp); // Insert load before newXfer.
-
-    llvm::SmallVector<int64_t, 1> indices({i});
-    auto newMask = vector_extract(xferOp.mask(), indices).value;
-    newXferOp.maskMutable().assign(newMask);
-  }
-
-  // If we end up here: The mask of the old transfer op is 1D and the unpacked
-  // dim is not a broadcast, so no mask is needed on the new transfer op.
-  // `generateInBoundsCheck` will have evaluated the mask already.
-}
-
-/// Progressive lowering of vector TransferReadOp with unrolling: Unpack one
-/// dimension. This is similar to TransferOpConversion<TransferReadOp>, but no
-/// memref buffer is allocated and the SCF loop is fully unrolled.
-///
-/// ```
-/// E.g.:
-/// ```
-/// %vec = vector.transfer_read %A[%a, %b, %c], %padding
-///     : memref<?x?x?xf32>, vector<5x4xf32>
-/// ```
-/// is rewritten to IR such as (simplified):
-/// ```
-/// %v_init = splat %padding : vector<5x4xf32>
-/// %tmp0 = vector.transfer_read %A[%a, %b, %c], %padding
-///     : memref<?x?x?xf32>, vector<4xf32>
-/// %v0 = vector.insert %tmp0, %v_init[0] : vector<4xf32> into vector<5x4xf32>
-/// %tmp1 = vector.transfer_read %A[%a, %b + 1, %c], %padding
-///     : memref<?x?x?xf32>, vector<4xf32>
-/// %v1 = vector.insert %tmp1, %v0[1] : vector<4xf32> into vector<5x4xf32>
-/// ...
-/// %tmp4 = vector.transfer_read %A[%a, %b + 4, %c], %padding
-///     : memref<?x?x?xf32>, vector<4xf32>
-/// %vec = vector.insert %tmp1, %v3[4] : vector<4xf32> into vector<5x4xf32>
-/// ```
-///
-/// Note: As an optimization, if the result of the original TransferReadOp
-/// was directly inserted into another vector, no new %v_init vector is created.
-/// Instead, the new TransferReadOp results are inserted into that vector.
-struct UnrollTransferReadConversion : public OpRewritePattern<TransferReadOp> {
-  using OpRewritePattern<TransferReadOp>::OpRewritePattern;
-
-  /// Return the vector into which the newly created TransferReadOp results
-  /// are inserted.
-  Value getResultVector(TransferReadOp xferOp,
-                        PatternRewriter &rewriter) const {
-    if (auto insertOp = getInsertOp(xferOp))
-      return insertOp.dest();
-    return std_splat(xferOp.getVectorType(), xferOp.padding()).value;
-  }
-
-  /// If the result of the TransferReadOp has exactly one user, which is a
-  /// vector::InsertOp, return that operation.
-  vector::InsertOp getInsertOp(TransferReadOp xferOp) const {
-    if (xferOp->hasOneUse()) {
-      Operation *xferOpUser = *xferOp->getUsers().begin();
-      if (auto insertOp = dyn_cast<vector::InsertOp>(xferOpUser))
-        return insertOp;
-    }
-
-    return vector::InsertOp();
-  }
-
-  /// If the result of the TransferReadOp has exactly one user, which is a
-  /// vector::InsertOp, return that operation's indices.
-  void getInsertionIndices(TransferReadOp xferOp,
-                           SmallVector<int64_t, 8> &indices) const {
-    if (auto insertOp = getInsertOp(xferOp)) {
-      llvm::for_each(insertOp.position(), [&](Attribute attr) {
-        indices.push_back(attr.dyn_cast<IntegerAttr>().getInt());
-      });
-    }
-  }
-
-  /// Rewrite the op: Unpack one dimension. Can handle masks, out-of-bounds
-  /// accesses, and broadcasts and transposes in permutation maps.
-  LogicalResult matchAndRewrite(TransferReadOp xferOp,
-                                PatternRewriter &rewriter) const override {
-    if (xferOp.getVectorType().getRank() <= kTargetRank)
-      return failure();
-
-    ScopedContext scope(rewriter, xferOp.getLoc());
-    auto insertOp = getInsertOp(xferOp);
-    auto vec = getResultVector(xferOp, rewriter);
-    auto vecType = vec.getType().dyn_cast<VectorType>();
-    auto xferVecType = xferOp.getVectorType();
-    auto newXferVecType = VectorType::get(xferVecType.getShape().drop_front(),
-                                          xferVecType.getElementType());
-    int64_t dimSize = xferVecType.getShape()[0];
-
-    // Generate fully unrolled loop of transfer ops.
-    for (int64_t i = 0; i < dimSize; ++i) {
-      Value iv = std_constant_index(i);
-
-      vec = generateInBoundsCheck(
-          xferOp, iv, rewriter, unpackedDim(xferOp), TypeRange(vecType),
-          /*inBoundsCase=*/
-          [&](OpBuilder &b, Location loc) {
-            ScopedContext scope(b, loc);
-
-            // Indices for the new transfer op.
-            SmallVector<Value, 8> xferIndices;
-            getXferIndices(xferOp, iv, xferIndices);
-
-            // Indices for the new vector.insert op.
-            SmallVector<int64_t, 8> insertionIndices;
-            getInsertionIndices(xferOp, insertionIndices);
-            insertionIndices.push_back(i);
-
-            auto inBoundsAttr = dropFirstElem(b, xferOp.in_boundsAttr());
-            auto newXferOpVal =
-                vector_transfer_read(
-                    newXferVecType, xferOp.source(), xferIndices,
-                    AffineMapAttr::get(unpackedPermutationMap(xferOp, b)),
-                    xferOp.padding(), Value(), inBoundsAttr)
-                    .value;
-            auto newXferOp =
-                dyn_cast<TransferReadOp>(newXferOpVal.getDefiningOp());
-
-            maybeAssignMask(b, xferOp, newXferOp, i);
-
-            return vector_insert(newXferOp, vec, insertionIndices).value;
-          },
-          /*outOfBoundsCase=*/
-          [&](OpBuilder &b, Location loc) {
-            // Loop through original (unmodified) vector.
-            return vec;
-          });
-    }
-
-    if (insertOp) {
-      // Rewrite single user of the old TransferReadOp, which was an InsertOp.
-      rewriter.replaceOp(insertOp, vec);
-      rewriter.eraseOp(xferOp);
-    } else {
-      rewriter.replaceOp(xferOp, vec);
-    }
-
-    return success();
-  }
-};
-
-/// Progressive lowering of vector TransferWriteOp with unrolling: Unpack one
-/// dimension. This is similar to TransferOpConversion<TransferWriteOp>, but no
-/// memref buffer is allocated and the SCF loop is fully unrolled.
-///
-/// ```
-/// E.g.:
-/// ```
-/// vector.transfer_write %vec, %A[%a, %b, %c]
-///     : vector<5x4xf32>, memref<?x?x?xf32>
-/// ```
-/// is rewritten to IR such as (simplified):
-/// ```
-/// %v0 = vector.extract %vec[0] : vector<5x4xf32>
-/// vector.transfer_write %v0, %A[%a, %b, %c] : vector<4xf32>, memref<...>
-/// %v1 = vector.extract %vec[1] : vector<5x4xf32>
-/// vector.transfer_write %v1, %A[%a, %b + 1, %c] : vector<4xf32>, memref<...>
-/// ...
-/// %v4 = vector.extract %vec[4] : vector<5x4xf32>
-/// vector.transfer_write %v4, %A[%a, %b + 4, %c] : vector<4xf32>, memref<...>
-/// ```
-///
-/// Note: As an optimization, if the vector of the original TransferWriteOp
-/// was directly extracted from another vector via an ExtractOp `a`, extract
-/// the vectors for the newly generated TransferWriteOps from `a`'s input. By
-/// doing so, `a` may become dead, and the number of ExtractOps generated during
-/// recursive application of this pattern will be minimal.
-struct UnrollTransferWriteConversion
-    : public OpRewritePattern<TransferWriteOp> {
-  using OpRewritePattern<TransferWriteOp>::OpRewritePattern;
-
-  /// Return the vector from which newly generated ExtracOps will extract.
-  Value getDataVector(TransferWriteOp xferOp) const {
-    if (auto extractOp = getExtractOp(xferOp))
-      return extractOp.vector();
-    return xferOp.vector();
-  }
-
-  /// If the input of the given TransferWriteOp is an ExtractOp, return it.
-  vector::ExtractOp getExtractOp(TransferWriteOp xferOp) const {
-    if (auto *op = xferOp.vector().getDefiningOp())
-      return dyn_cast<vector::ExtractOp>(op);
-    return vector::ExtractOp();
-  }
-
-  /// If the input of the given TransferWriteOp is an ExtractOp, return its
-  /// indices.
-  void getExtractionIndices(TransferWriteOp xferOp,
-                            SmallVector<int64_t, 8> &indices) const {
-    if (auto extractOp = getExtractOp(xferOp)) {
-      llvm::for_each(extractOp.position(), [&](Attribute attr) {
-        indices.push_back(attr.dyn_cast<IntegerAttr>().getInt());
-      });
-    }
-  }
-
-  /// Rewrite the op: Unpack one dimension. Can handle masks, out-of-bounds
-  /// accesses, and broadcasts and transposes in permutation maps.
-  LogicalResult matchAndRewrite(TransferWriteOp xferOp,
-                                PatternRewriter &rewriter) const override {
-    if (xferOp.getVectorType().getRank() <= kTargetRank)
-      return failure();
-
-    ScopedContext scope(rewriter, xferOp.getLoc());
-    auto vec = getDataVector(xferOp);
-    auto xferVecType = xferOp.getVectorType();
-    int64_t dimSize = xferVecType.getShape()[0];
-
-    // Generate fully unrolled loop of transfer ops.
-    for (int64_t i = 0; i < dimSize; ++i) {
-      Value iv = std_constant_index(i);
-
-      generateInBoundsCheck(
-          xferOp, iv, rewriter, unpackedDim(xferOp),
-          /*inBoundsCase=*/[&](OpBuilder &b, Location loc) {
-            ScopedContext scope(b, loc);
-
-            // Indices for the new transfer op.
-            SmallVector<Value, 8> xferIndices;
-            getXferIndices(xferOp, iv, xferIndices);
-
-            // Indices for the new vector.extract op.
-            SmallVector<int64_t, 8> extractionIndices;
-            getExtractionIndices(xferOp, extractionIndices);
-            extractionIndices.push_back(i);
-
-            auto extracted = vector_extract(vec, extractionIndices).value;
-            auto inBoundsAttr = dropFirstElem(b, xferOp.in_boundsAttr());
-
-            auto newXferOp =
-                vector_transfer_write(
-                    Type(), extracted, xferOp.source(), xferIndices,
-                    AffineMapAttr::get(unpackedPermutationMap(xferOp, b)),
-                    Value(), inBoundsAttr)
-                    .op;
-
-            maybeAssignMask(b, xferOp, newXferOp, i);
-          });
-    }
-
-    rewriter.eraseOp(xferOp);
-    return success();
-  }
-};
-
-/// Compute the indices into the memref for the LoadOp/StoreOp generated as
-/// part of TransferOp1dConversion. Return the memref dimension on which
-/// the transfer is operating. A return value of None indicates a broadcast.
-template <typename OpTy>
-static Optional<int64_t> get1dMemrefIndices(
-    OpTy xferOp, Value iv, SmallVector<Value, 8> &memrefIndices) {
-  auto indices = xferOp.indices();
-  auto map = xferOp.permutation_map();
-
-  memrefIndices.append(indices.begin(), indices.end());
-  assert(map.getNumResults() == 1 &&
-         "Expected 1 permutation map result for 1D transfer");
-  if (auto expr = map.getResult(0).template dyn_cast<AffineDimExpr>()) {
-    auto dim = expr.getPosition();
-    using edsc::op::operator+;
-    memrefIndices[dim] = memrefIndices[dim] + iv;
-    return dim;
-  }
-
-  assert(xferOp.isBroadcastDim(0) &&
-         "Expected AffineDimExpr or AffineConstantExpr");
-  return None;
-}
-
-/// Codegen strategy for TransferOp1dConversion, depending on the
-/// operation.
-template <typename OpTy>
-struct Strategy1d;
-
-/// Codegen strategy for TransferReadOp.
-template <>
-struct Strategy1d<TransferReadOp> {
-  static void generateForLoopBody(
-      OpBuilder &builder, Location loc, TransferReadOp xferOp, Value iv,
-      ValueRange loopState) {
-    SmallVector<Value, 8> indices;
-    auto dim = get1dMemrefIndices(xferOp, iv, indices);
-    auto ivI32 = std_index_cast(
-        IntegerType::get(builder.getContext(), 32), iv);
-    auto vec = loopState[0];
-
-    // In case of out-of-bounds access, leave `vec` as is (was initialized with
-    // padding value).
-    auto nextVec = generateInBoundsCheck(
-        xferOp, iv, builder, dim, TypeRange(xferOp.getVectorType()),
-        /*inBoundsCase=*/[&](OpBuilder& /*b*/, Location loc) {
-      auto val = memref_load(xferOp.source(), indices);
-      return vector_insert_element(val, vec, ivI32.value).value;
-    }, /*outOfBoundsCase=*/[&](OpBuilder& /*b*/, Location loc) {
-      return vec;
-    });
-    builder.create<scf::YieldOp>(loc, nextVec);
-  }
-
-  static Value initialLoopState(TransferReadOp xferOp) {
-    // Inititalize vector with padding value.
-    return std_splat(xferOp.getVectorType(), xferOp.padding()).value;
-  }
-};
-
-/// Codegen strategy for TransferWriteOp.
-template <>
-struct Strategy1d<TransferWriteOp> {
-  static void generateForLoopBody(
-      OpBuilder &builder, Location loc, TransferWriteOp xferOp, Value iv,
-      ValueRange /*loopState*/) {
-    SmallVector<Value, 8> indices;
-    auto dim = get1dMemrefIndices(xferOp, iv, indices);
-    auto ivI32 = std_index_cast(
-        IntegerType::get(builder.getContext(), 32), iv);
-
-    // Nothing to do in case of out-of-bounds access.
-    generateInBoundsCheck(
-        xferOp, iv, builder, dim,
-        /*inBoundsCase=*/[&](OpBuilder& /*b*/, Location loc) {
-      auto val = vector_extract_element(xferOp.vector(), ivI32.value);
-      memref_store(val, xferOp.source(), indices);
-    });
-    builder.create<scf::YieldOp>(loc);
-  }
-
-  static Value initialLoopState(TransferWriteOp xferOp) {
-    return Value();
-  }
-};
-
-/// Return true if the last dimension of the MemRefType has unit stride.
-static bool isLastMemrefDimUnitStride(MemRefType type) {
-  int64_t offset;
-  SmallVector<int64_t, 4> strides;
-  auto successStrides = getStridesAndOffset(type, strides, offset);
-  return succeeded(successStrides) && strides.back() == 1;
-}
-
-/// Lower a 1D vector transfer op to SCF using scalar loads/stores. This is
-/// necessary in cases where a 1D vector transfer op cannot be lowered into
-/// vector load/stores due to non-unit strides or broadcasts:
-///
-/// * Transfer dimension is not the last memref dimension
-/// * Transfer dimension is a broadcast (i.e., scalar load + broadcast)
-/// * Memref has a layout map with non-unit stride on the last dimension
-///
-/// This pattern generates IR as follows:
-///
-/// 1. Generate a for loop iterating over each vector element.
-/// 2. Inside the loop, generate a InsertElementOp or ExtractElementOp,
-///    depending on OpTy.
-///
-/// TODO: In some cases (no masking, etc.), LLVM::MatrixColumnMajorLoadOp
-///       can be generated instead of TransferOp1dConversion. Add such a pattern
-///       to ConvertVectorToLLVM.
-///
-/// E.g.:
-/// ```
-/// vector.transfer_write %vec, %A[%a, %b]
-///    {permutation_map = affine_map<(d0, d1) -> (d0)>, in_bounds = [true]}
-///    : vector<9xf32>, memref<?x?xf32>
-/// ```
-/// Is rewritten to approximately the following pseudo-IR:
-/// ```
-/// for i = 0 to 9 {
-///   %t = vector.extractelement %vec[i] : vector<9xf32>
-///   memref.store %t, %arg0[%a + i, %b] : memref<?x?xf32>
-/// }
-/// ```
-template <typename OpTy>
-struct TransferOp1dConversion : public OpRewritePattern<OpTy> {
-  using OpRewritePattern<OpTy>::OpRewritePattern;
-
-  LogicalResult matchAndRewrite(OpTy xferOp,
-                                PatternRewriter &rewriter) const override {
-    ScopedContext scope(rewriter, xferOp.getLoc());
-    auto map = xferOp.permutation_map();
-    auto memRefType = xferOp.getShapedType().template dyn_cast<MemRefType>();
-
-    if (!memRefType)
-      return failure();
-    if (xferOp.getVectorType().getRank() != 1)
-      return failure();
-    if (map.isMinorIdentity() && isLastMemrefDimUnitStride(memRefType))
-      return failure(); // Handled by ConvertVectorToLLVM
-
-    // Loop bounds, step, state...
-    auto vecType = xferOp.getVectorType();
-    auto lb = std_constant_index(0);
-    auto ub = std_constant_index(vecType.getDimSize(0));
-    auto step = std_constant_index(1);
-    auto loopState = Strategy1d<OpTy>::initialLoopState(xferOp);
-
-    // Generate for loop.
-    rewriter.replaceOpWithNewOp<scf::ForOp>(
-        xferOp, lb, ub, step, loopState ? ValueRange(loopState) : ValueRange(),
-        [&](OpBuilder &builder, Location loc, Value iv, ValueRange loopState) {
-      ScopedContext nestedScope(builder, loc);
-      Strategy1d<OpTy>::generateForLoopBody(
-          builder, loc, xferOp, iv, loopState);
-    });
-
-    return success();
-  }
-};
-
-}  // namespace
-
-namespace mlir {
-
-void populateProgressiveVectorToSCFConversionPatterns(
-    RewritePatternSet &patterns,
-    const ProgressiveVectorTransferToSCFOptions &options) {
-  if (options.unroll) {
-    patterns.add<UnrollTransferReadConversion, UnrollTransferWriteConversion>(
-        patterns.getContext());
-  } else {
-    patterns.add<PrepareTransferReadConversion, PrepareTransferWriteConversion,
-                 TransferOpConversion<TransferReadOp>,
-                 TransferOpConversion<TransferWriteOp>>(patterns.getContext());
-  }
-
-  if (kTargetRank == 1) {
-    patterns.add<TransferOp1dConversion<TransferReadOp>,
-                 TransferOp1dConversion<TransferWriteOp>>(
-        patterns.getContext());
-  }
-}
-
-struct ConvertProgressiveVectorToSCFPass
-    : public ConvertVectorToSCFBase<ConvertProgressiveVectorToSCFPass> {
-  ConvertProgressiveVectorToSCFPass(
-      const ProgressiveVectorTransferToSCFOptions &opt)
-      : options(opt) {}
-
-  void runOnFunction() override {
-    RewritePatternSet patterns(getFunction().getContext());
-    populateProgressiveVectorToSCFConversionPatterns(patterns, options);
-    (void)applyPatternsAndFoldGreedily(getFunction(), std::move(patterns));
-  }
-
-  ProgressiveVectorTransferToSCFOptions options;
-};
-
-}  // namespace mlir
-
-std::unique_ptr<Pass> mlir::createProgressiveConvertVectorToSCFPass(
-    const ProgressiveVectorTransferToSCFOptions &options) {
-  return std::make_unique<ConvertProgressiveVectorToSCFPass>(options);
-}

diff  --git a/mlir/lib/Conversion/VectorToSCF/VectorToSCF.cpp b/mlir/lib/Conversion/VectorToSCF/VectorToSCF.cpp
index 4f13e7d8e5af5..5b5769c9ad066 100644
--- a/mlir/lib/Conversion/VectorToSCF/VectorToSCF.cpp
+++ b/mlir/lib/Conversion/VectorToSCF/VectorToSCF.cpp
@@ -1,4 +1,4 @@
-//===- VectorToSCF.cpp - Conversion from Vector to mix of SCF and Std -----===//
+//===- VectorToSCF.cpp - Convert vector to SCF dialect ----------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file implements target-dependent lowering of vector transfer operations.
+// This file implements lowering of vector transfer operations to SCF.
 //
 //===----------------------------------------------------------------------===//
 
@@ -17,16 +17,12 @@
 #include "../PassDetail.h"
 #include "mlir/Dialect/Affine/EDSC/Intrinsics.h"
 #include "mlir/Dialect/MemRef/EDSC/Intrinsics.h"
-#include "mlir/Dialect/SCF/EDSC/Builders.h"
 #include "mlir/Dialect/SCF/EDSC/Intrinsics.h"
 #include "mlir/Dialect/StandardOps/EDSC/Intrinsics.h"
 #include "mlir/Dialect/Vector/EDSC/Intrinsics.h"
 #include "mlir/Dialect/Vector/VectorOps.h"
 #include "mlir/Dialect/Vector/VectorUtils.h"
-#include "mlir/IR/AffineExpr.h"
-#include "mlir/IR/AffineMap.h"
 #include "mlir/IR/Builders.h"
-#include "mlir/IR/Matchers.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 #include "mlir/Transforms/Passes.h"
@@ -37,672 +33,1091 @@ using namespace mlir::edsc::intrinsics;
 using vector::TransferReadOp;
 using vector::TransferWriteOp;
 
-// Return a list of Values that correspond to multiple AffineApplyOp, one for
-// each result of `map`. Each `expr` in `map` is canonicalized and folded
-// greedily according to its operands.
-// TODO: factor out in a common location that both linalg and vector can use.
-static SmallVector<Value, 4>
-applyMapToValues(OpBuilder &b, Location loc, AffineMap map, ValueRange values) {
-  SmallVector<Value, 4> res;
-  res.reserve(map.getNumResults());
-  unsigned numDims = map.getNumDims(), numSym = map.getNumSymbols();
-  // For each `expr` in `map`, applies the `expr` to the values extracted from
-  // ranges. If the resulting application can be folded into a Value, the
-  // folding occurs eagerly. Otherwise, an affine.apply operation is emitted.
-  for (auto expr : map.getResults()) {
-    AffineMap map = AffineMap::get(numDims, numSym, expr);
-    SmallVector<Value, 4> operands(values.begin(), values.end());
-    fullyComposeAffineMapAndOperands(&map, &operands);
-    canonicalizeMapAndOperands(&map, &operands);
-    res.push_back(b.createOrFold<AffineApplyOp>(loc, map, operands));
-  }
-  return res;
+namespace {
+
+/// Attribute name used for labeling transfer ops during progressive lowering.
+static const char kPassLabel[] = "__vector_to_scf_lowering__";
+
+/// Lower to 1D transfer ops; target-specific lowering (e.g.,
+/// ConvertVectorToLLVM) handles those.
+static const int64_t kTargetRank = 1;
+
+/// Given a MemRefType with VectorType element type, unpack one dimension from
+/// the VectorType into the MemRefType.
+///
+/// E.g.: memref<9xvector<5x6xf32>> --> memref<9x5xvector<6xf32>>
+static MemRefType unpackOneDim(MemRefType type) {
+  auto vectorType = type.getElementType().dyn_cast<VectorType>();
+  auto memrefShape = type.getShape();
+  SmallVector<int64_t, 8> newMemrefShape;
+  newMemrefShape.append(memrefShape.begin(), memrefShape.end());
+  newMemrefShape.push_back(vectorType.getDimSize(0));
+  return MemRefType::get(newMemrefShape,
+                         VectorType::get(vectorType.getShape().drop_front(),
+                                         vectorType.getElementType()));
 }
 
-namespace {
-/// Helper class captures the common information needed to lower N>1-D vector
-/// transfer operations (read and write).
-/// On construction, this class opens an edsc::ScopedContext for simpler IR
-/// manipulation.
-/// In pseudo-IR, for an n-D vector_transfer_read such as:
+/// Helper data structure for data and mask buffers.
+struct BufferAllocs {
+  Value dataBuffer;
+  Value maskBuffer;
+};
+
+/// Allocate temporary buffers for data (vector) and mask (if present).
+/// TODO: Parallelism and threadlocal considerations.
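+///
+/// For illustration, assuming a transfer op on vector<5x4xf32> with a
+/// vector<5x4xi1> mask, the generated allocations resemble the following
+/// pseudo-IR (names are illustrative):
+/// ```
+/// %data = memref.alloca() : memref<vector<5x4xf32>>
+/// %maskBuf = memref.alloca() : memref<vector<5x4xi1>>
+/// memref.store %mask, %maskBuf[] : memref<vector<5x4xi1>>
+/// %maskVal = memref.load %maskBuf[] : memref<vector<5x4xi1>>
+/// ```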
+template <typename OpTy>
+static BufferAllocs allocBuffers(OpTy xferOp) {
+  auto &b = ScopedContext::getBuilderRef();
+  OpBuilder::InsertionGuard guard(b);
+  Operation *scope =
+      xferOp->template getParentWithTrait<OpTrait::AutomaticAllocationScope>();
+  assert(scope && "Expected op to be inside automatic allocation scope");
+  b.setInsertionPointToStart(&scope->getRegion(0).front());
+
+  BufferAllocs result;
+  auto bufferType = MemRefType::get({}, xferOp.getVectorType());
+  result.dataBuffer = memref_alloca(bufferType).value;
+
+  if (xferOp.mask()) {
+    auto maskType = MemRefType::get({}, xferOp.mask().getType());
+    Value maskBuffer = memref_alloca(maskType);
+    memref_store(xferOp.mask(), maskBuffer);
+    result.maskBuffer = memref_load(maskBuffer);
+  }
+
+  return result;
+}
+
+/// Given a vector transfer op, calculate which dimension of the `source`
+/// memref should be unpacked in the next application of TransferOpConversion.
+/// A return value of None indicates a broadcast.
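+///
+/// E.g. (illustrative):
+/// ```
+/// permutation_map = affine_map<(d0, d1, d2) -> (d2, d1)>  -->  dim 2
+/// permutation_map = affine_map<(d0, d1) -> (0, d1)>  -->  None (broadcast)
+/// ```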
+template <typename OpTy>
+static Optional<int64_t> unpackedDim(OpTy xferOp) {
+  auto map = xferOp.permutation_map();
+  if (auto expr = map.getResult(0).template dyn_cast<AffineDimExpr>()) {
+    return expr.getPosition();
+  }
+  assert(xferOp.isBroadcastDim(0) &&
+         "Expected AffineDimExpr or AffineConstantExpr");
+  return None;
+}
+
+/// Compute the permutation map for the new (N-1)-D vector transfer op. This
+/// map is identical to the current permutation map, but the first result is
+/// omitted.
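+///
+/// E.g. (illustrative):
+/// ```
+/// affine_map<(d0, d1, d2) -> (d2, d1)> --> affine_map<(d0, d1, d2) -> (d1)>
+/// ```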
+template <typename OpTy>
+static AffineMap unpackedPermutationMap(OpTy xferOp, OpBuilder &builder) {
+  auto map = xferOp.permutation_map();
+  return AffineMap::get(map.getNumDims(), 0, map.getResults().drop_front(),
+                        builder.getContext());
+}
+
+/// Calculate the indices for the new vector transfer op.
 ///
+/// E.g.: transfer_read %A[%a, %b, %c, %d] ... : vector<5x4x3xf32> ...
+///       --> transfer_read %A[%a, %b + iv, %c, %d] ... vector<4x3xf32>
+///                                 ^^^^^^
+///              `iv` is the iteration variable of the (new) surrounding loop.
+template <typename OpTy>
+static void getXferIndices(OpTy xferOp, Value iv,
+                           SmallVector<Value, 8> &indices) {
+  typename OpTy::Adaptor adaptor(xferOp);
+  // Corresponding memref dim of the vector dim that is unpacked.
+  auto dim = unpackedDim(xferOp);
+  auto prevIndices = adaptor.indices();
+  indices.append(prevIndices.begin(), prevIndices.end());
+
+  bool isBroadcast = !dim.hasValue();
+  if (!isBroadcast) {
+    using edsc::op::operator+;
+    indices[dim.getValue()] = adaptor.indices()[dim.getValue()] + iv;
+  }
+}
+
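+/// Generate an scf.yield op, yielding `value` only if `hasRetVal` is true.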
+static void maybeYieldValue(bool hasRetVal, OpBuilder builder, Location loc,
+                            Value value) {
+  if (hasRetVal) {
+    builder.create<scf::YieldOp>(loc, value);
+  } else {
+    builder.create<scf::YieldOp>(loc);
+  }
+}
+
+/// Generates a boolean Value that is true if the iv-th bit in xferOp's mask
+/// is set to true. No such check is generated under the following
+/// circumstances:
+/// * xferOp does not have a mask.
+/// * xferOp's mask is not 1D. (In case of (N>1)-D, a subvector of the mask is
+///   computed and attached to the new transfer op in the pattern.)
+/// * The to-be-unpacked dim of xferOp is a broadcast.
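+///
+/// Otherwise, pseudo-IR resembling the following is emitted (illustrative):
+/// ```
+/// %i = index_cast %iv : index to i32
+/// %m = vector.extractelement %mask[%i : i32] : vector<5xi1>
+/// ```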
+template <typename OpTy>
+static Value generateMaskCheck(OpBuilder &builder, OpTy xferOp, Value iv) {
+  if (!xferOp.mask())
+    return Value();
+  if (xferOp.getMaskType().getRank() != 1)
+    return Value();
+  if (xferOp.isBroadcastDim(0))
+    return Value();
+
+  auto ivI32 = std_index_cast(IntegerType::get(builder.getContext(), 32), iv);
+  return vector_extract_element(xferOp.mask(), ivI32).value;
+}
+
+/// Helper function for TransferOpConversion and TransferOp1dConversion.
+/// Generate an in-bounds check if the transfer op may go out-of-bounds on the
+/// specified dimension `dim` with the loop iteration variable `iv`.
+/// E.g., when unpacking dimension 0 from:
 /// ```
-///   vector_transfer_read(%m, %offsets, identity_map, %fill) :
-///     memref<(leading_dims) x (major_dims) x (minor_dims) x type>,
-///     vector<(major_dims) x (minor_dims) x type>
+/// %vec = vector.transfer_read %A[%a, %b], %cst
+///     : memref<?x?xf32>, vector<5x4xf32>
 /// ```
-///
-/// where rank(minor_dims) is the lower-level vector rank (e.g. 1 for LLVM or
-/// higher).
-///
-/// This is the entry point to emitting pseudo-IR resembling:
-///
+/// An if check similar to this will be generated inside the loop:
 /// ```
-///   %tmp = alloc(): memref<(major_dims) x vector<minor_dim x type>>
-///   for (%ivs_major, {0}, {vector_shape}, {1}) { // (N-1)-D loop nest
-///     if (any_of(%ivs_major + %offsets, <, major_dims)) {
-///       %v = vector_transfer_read(
-///         {%offsets_leading, %ivs_major + %offsets_major, %offsets_minor},
-///          %ivs_minor):
-///         memref<(leading_dims) x (major_dims) x (minor_dims) x type>,
-///         vector<(minor_dims) x type>;
-///       store(%v, %tmp);
-///     } else {
-///       %v = splat(vector<(minor_dims) x type>, %fill)
-///       store(%v, %tmp, %ivs_major);
-///     }
-///   }
-///   %res = load(%tmp, %0): memref<(major_dims) x vector<minor_dim x type>>):
-//      vector<(major_dims) x (minor_dims) x type>
+/// %d = memref.dim %A, %c0 : memref<?x?xf32>
+/// if (%a + iv < %d) {
+///   (in-bounds case)
+/// } else {
+///   (out-of-bounds case)
+/// }
 /// ```
 ///
-template <typename ConcreteOp>
-class NDTransferOpHelper {
-public:
-  NDTransferOpHelper(PatternRewriter &rewriter, ConcreteOp xferOp,
-                     const VectorTransferToSCFOptions &options)
-      : rewriter(rewriter), options(options), loc(xferOp.getLoc()),
-        scope(std::make_unique<ScopedContext>(rewriter, loc)), xferOp(xferOp),
-        op(xferOp.getOperation()) {
-    vectorType = xferOp.getVectorType();
-    // TODO: when we go to k > 1-D vectors adapt minorRank.
-    minorRank = 1;
-    majorRank = vectorType.getRank() - minorRank;
-    leadingRank = xferOp.getLeadingShapedRank();
-    majorVectorType =
-        VectorType::get(vectorType.getShape().take_front(majorRank),
-                        vectorType.getElementType());
-    minorVectorType =
-        VectorType::get(vectorType.getShape().take_back(minorRank),
-                        vectorType.getElementType());
-    /// Memref of minor vector type is used for individual transfers.
-    memRefMinorVectorType = MemRefType::get(
-        majorVectorType.getShape(), minorVectorType, {},
-        xferOp.getShapedType().template cast<MemRefType>().getMemorySpace());
-  }
-
-  LogicalResult doReplace();
-
-private:
-  /// Creates the loop nest on the "major" dimensions and calls the
-  /// `loopBodyBuilder` lambda in the context of the loop nest.
-  void
-  emitLoops(llvm::function_ref<void(ValueRange, ValueRange, ValueRange,
-                                    ValueRange, const MemRefBoundsCapture &)>
-                loopBodyBuilder);
-
-  /// Common state to lower vector transfer ops.
-  PatternRewriter &rewriter;
-  const VectorTransferToSCFOptions &options;
-  Location loc;
-  std::unique_ptr<ScopedContext> scope;
-  ConcreteOp xferOp;
-  Operation *op;
-  // A vector transfer copies data between:
-  //   - memref<(leading_dims) x (major_dims) x (minor_dims) x type>
-  //   - vector<(major_dims) x (minor_dims) x type>
-  unsigned minorRank;         // for now always 1
-  unsigned majorRank;         // vector rank - minorRank
-  unsigned leadingRank;       // memref rank - vector rank
-  VectorType vectorType;      // vector<(major_dims) x (minor_dims) x type>
-  VectorType majorVectorType; // vector<(major_dims) x type>
-  VectorType minorVectorType; // vector<(minor_dims) x type>
-  MemRefType memRefMinorVectorType; // memref<vector<(minor_dims) x type>>
-};
+/// If the transfer is 1D and has a mask, this function generates a more
+/// complex check that also accounts for potentially masked-out elements.
+///
+/// This function variant returns the value returned by `inBoundsCase` or
+/// `outOfBoundsCase`. The MLIR type of the return value must be specified in
+/// `resultTypes`.
+template <typename OpTy>
+static Value generateInBoundsCheck(
+    OpTy xferOp, Value iv, OpBuilder &builder, Optional<int64_t> dim,
+    TypeRange resultTypes,
+    function_ref<Value(OpBuilder &, Location)> inBoundsCase,
+    function_ref<Value(OpBuilder &, Location)> outOfBoundsCase = nullptr) {
+  bool hasRetVal = !resultTypes.empty();
+  Value cond; // Condition to be built...
 
-template <typename ConcreteOp>
-void NDTransferOpHelper<ConcreteOp>::emitLoops(
-    llvm::function_ref<void(ValueRange, ValueRange, ValueRange, ValueRange,
-                            const MemRefBoundsCapture &)>
-        loopBodyBuilder) {
-  /// Loop nest operates on the major dimensions
-  MemRefBoundsCapture memrefBoundsCapture(xferOp.source());
+  // Condition check 1: Access in-bounds?
+  bool isBroadcast = !dim.hasValue(); // No in-bounds check for broadcasts.
+  if (!xferOp.isDimInBounds(0) && !isBroadcast) {
+    auto memrefDim =
+        memref_dim(xferOp.source(), std_constant_index(dim.getValue()));
+    using edsc::op::operator+;
+    auto memrefIdx = xferOp.indices()[dim.getValue()] + iv;
+    cond = std_cmpi_sgt(memrefDim.value, memrefIdx);
+  }
 
-  if (options.unroll) {
-    auto shape = majorVectorType.getShape();
-    auto strides = computeStrides(shape);
-    unsigned numUnrolledInstances = computeMaxLinearIndex(shape);
-    ValueRange indices(xferOp.indices());
-    for (unsigned idx = 0; idx < numUnrolledInstances; ++idx) {
-      SmallVector<int64_t, 4> offsets = delinearize(strides, idx);
-      SmallVector<Value, 4> offsetValues =
-          llvm::to_vector<4>(llvm::map_range(offsets, [](int64_t off) -> Value {
-            return std_constant_index(off);
-          }));
-      loopBodyBuilder(offsetValues, indices.take_front(leadingRank),
-                      indices.drop_front(leadingRank).take_front(majorRank),
-                      indices.take_back(minorRank), memrefBoundsCapture);
+  // Condition check 2: Masked in?
+  if (auto maskCond = generateMaskCheck(builder, xferOp, iv)) {
+    if (cond) {
+      cond = builder.create<AndOp>(xferOp.getLoc(), cond, maskCond);
+    } else {
+      cond = maskCond;
     }
-  } else {
-    VectorBoundsCapture vectorBoundsCapture(majorVectorType);
-    auto majorLbs = vectorBoundsCapture.getLbs();
-    auto majorUbs = vectorBoundsCapture.getUbs();
-    auto majorSteps = vectorBoundsCapture.getSteps();
-    affineLoopNestBuilder(
-        majorLbs, majorUbs, majorSteps, [&](ValueRange majorIvs) {
-          ValueRange indices(xferOp.indices());
-          loopBodyBuilder(majorIvs, indices.take_front(leadingRank),
-                          indices.drop_front(leadingRank).take_front(majorRank),
-                          indices.take_back(minorRank), memrefBoundsCapture);
+  }
+
+  // If the condition is non-empty, generate an SCF::IfOp.
+  if (cond) {
+    auto check = builder.create<scf::IfOp>(
+        xferOp.getLoc(), resultTypes, cond,
+        /*thenBuilder=*/
+        [&](OpBuilder &builder, Location loc) {
+          maybeYieldValue(hasRetVal, builder, loc, inBoundsCase(builder, loc));
+        },
+        /*elseBuilder=*/
+        [&](OpBuilder &builder, Location loc) {
+          if (outOfBoundsCase) {
+            maybeYieldValue(hasRetVal, builder, loc,
+                            outOfBoundsCase(builder, loc));
+          } else {
+            builder.create<scf::YieldOp>(loc);
+          }
         });
+
+    return hasRetVal ? check.getResult(0) : Value();
   }
+
+  // Condition is empty, no need for an SCF::IfOp.
+  return inBoundsCase(builder, xferOp.getLoc());
 }
 
-static Optional<int64_t> extractConstantIndex(Value v) {
-  if (auto cstOp = v.getDefiningOp<ConstantIndexOp>())
-    return cstOp.getValue();
-  if (auto affineApplyOp = v.getDefiningOp<AffineApplyOp>())
-    if (affineApplyOp.getAffineMap().isSingleConstant())
-      return affineApplyOp.getAffineMap().getSingleConstantResult();
-  return None;
+/// In this function variant, `inBoundsCase` and `outOfBoundsCase` do not have
+/// a return value. Consequently, this function does not have a return value.
+template <typename OpTy>
+static void generateInBoundsCheck(
+    OpTy xferOp, Value iv, OpBuilder &builder, Optional<int64_t> dim,
+    function_ref<void(OpBuilder &, Location)> inBoundsCase,
+    function_ref<void(OpBuilder &, Location)> outOfBoundsCase = nullptr) {
+  generateInBoundsCheck(
+      xferOp, iv, builder, dim, /*resultTypes=*/TypeRange(),
+      /*inBoundsCase=*/
+      [&](OpBuilder &builder, Location loc) {
+        inBoundsCase(builder, loc);
+        return Value();
+      },
+      /*outOfBoundsCase=*/
+      [&](OpBuilder &builder, Location loc) {
+        if (outOfBoundsCase)
+          outOfBoundsCase(builder, loc);
+        return Value();
+      });
 }
 
-// Missing foldings of scf.if make it necessary to perform poor man's folding
-// eagerly, especially in the case of unrolling. In the future, this should go
-// away once scf.if folds properly.
-static Value onTheFlyFoldSLT(Value v, Value ub) {
-  using namespace mlir::edsc::op;
-  auto maybeCstV = extractConstantIndex(v);
-  auto maybeCstUb = extractConstantIndex(ub);
-  if (maybeCstV && maybeCstUb && *maybeCstV < *maybeCstUb)
-    return Value();
-  return slt(v, ub);
+/// Given an ArrayAttr, return a copy where the first element is dropped.
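+/// E.g. (illustrative): in_bounds = [true, false] --> in_bounds = [false].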
+static ArrayAttr dropFirstElem(OpBuilder &builder, ArrayAttr attr) {
+  if (!attr)
+    return attr;
+  return ArrayAttr::get(builder.getContext(), attr.getValue().drop_front());
 }
 
-///   1. Compute the indexings `majorIvs + majorOffsets` and save them in
-///      `majorIvsPlusOffsets`.
-///   2. Return a value of i1 that determines whether the first
-///   `majorIvs.rank()`
-///      dimensions `majorIvs + majorOffsets` are all within `memrefBounds`.
-static Value
-emitInBoundsCondition(PatternRewriter &rewriter,
-                      VectorTransferOpInterface xferOp, unsigned leadingRank,
-                      ValueRange majorIvs, ValueRange majorOffsets,
-                      const MemRefBoundsCapture &memrefBounds,
-                      SmallVectorImpl<Value> &majorIvsPlusOffsets) {
-  Value inBoundsCondition;
-  majorIvsPlusOffsets.reserve(majorIvs.size());
-  unsigned idx = 0;
-  SmallVector<Value, 4> bounds =
-      applyMapToValues(rewriter, xferOp.getLoc(), xferOp.permutation_map(),
-                       memrefBounds.getUbs());
-  for (auto it : llvm::zip(majorIvs, majorOffsets, bounds)) {
-    Value iv = std::get<0>(it), off = std::get<1>(it), ub = std::get<2>(it);
-    using namespace mlir::edsc::op;
-    majorIvsPlusOffsets.push_back(iv + off);
-    auto affineConstExpr =
-        xferOp.permutation_map().getResult(idx).dyn_cast<AffineConstantExpr>();
-    bool isBroadcast = affineConstExpr && affineConstExpr.getValue() == 0;
-    if (!xferOp.isDimInBounds(leadingRank + idx) && !isBroadcast) {
-      Value inBoundsCond = onTheFlyFoldSLT(majorIvsPlusOffsets.back(), ub);
-      if (inBoundsCond)
-        inBoundsCondition = (inBoundsCondition)
-                                ? (inBoundsCondition && inBoundsCond)
-                                : inBoundsCond;
-    }
-    ++idx;
-  }
-  return inBoundsCondition;
+/// Add the pass label to a vector transfer op if its rank is greater than
+/// the target rank.
+template <typename OpTy>
+static void maybeApplyPassLabel(OpBuilder &builder, OpTy newXferOp) {
+  if (newXferOp.getVectorType().getRank() > kTargetRank)
+    newXferOp->setAttr(kPassLabel, builder.getUnitAttr());
 }
 
-// TODO: Parallelism and threadlocal considerations.
-static Value setAllocAtFunctionEntry(MemRefType memRefMinorVectorType,
-                                     Operation *op) {
-  auto &b = ScopedContext::getBuilderRef();
-  OpBuilder::InsertionGuard guard(b);
-  Operation *scope =
-      op->getParentWithTrait<OpTrait::AutomaticAllocationScope>();
-  assert(scope && "Expected op to be inside automatic allocation scope");
-  b.setInsertionPointToStart(&scope->getRegion(0).front());
-  Value res = memref_alloca(memRefMinorVectorType);
-  return res;
+/// Given a transfer op, find the memref from which the mask is loaded. This
+/// is similar to Strategy<TransferWriteOp>::getBuffer.
+template <typename OpTy>
+static Value getMaskBuffer(OpTy xferOp) {
+  assert(xferOp.mask() && "Expected that transfer op has mask");
+  auto loadOp = xferOp.mask().template getDefiningOp<memref::LoadOp>();
+  assert(loadOp && "Expected transfer op mask produced by LoadOp");
+  return loadOp.getMemRef();
 }
 
+/// Codegen strategy, depending on the operation.
+template <typename OpTy>
+struct Strategy;
+
+/// Codegen strategy for vector TransferReadOp.
 template <>
-LogicalResult NDTransferOpHelper<TransferReadOp>::doReplace() {
-  Value alloc, result;
-  if (options.unroll)
-    result = std_splat(vectorType, xferOp.padding());
-  else
-    alloc = setAllocAtFunctionEntry(memRefMinorVectorType, op);
-
-  emitLoops([&](ValueRange majorIvs, ValueRange leadingOffsets,
-                ValueRange majorOffsets, ValueRange minorOffsets,
-                const MemRefBoundsCapture &memrefBounds) {
-    /// Lambda to load 1-D vector in the current loop ivs + offset context.
-    auto load1DVector = [&](ValueRange majorIvsPlusOffsets) -> Value {
-      SmallVector<Value, 8> indexing;
-      indexing.reserve(leadingRank + majorRank + minorRank);
-      indexing.append(leadingOffsets.begin(), leadingOffsets.end());
-      indexing.append(majorIvsPlusOffsets.begin(), majorIvsPlusOffsets.end());
-      indexing.append(minorOffsets.begin(), minorOffsets.end());
-      Value memref = xferOp.source();
-      auto map =
-          getTransferMinorIdentityMap(xferOp.getShapedType(), minorVectorType);
-      ArrayAttr inBounds;
-      if (xferOp.isDimInBounds(xferOp.getVectorType().getRank() - 1)) {
-        OpBuilder &b = ScopedContext::getBuilderRef();
-        inBounds = b.getBoolArrayAttr({true});
-      }
-      return vector_transfer_read(minorVectorType, memref, indexing,
-                                  AffineMapAttr::get(map), xferOp.padding(),
-                                  inBounds);
-    };
-
-    // 1. Compute the inBoundsCondition in the current loops ivs + offset
-    // context.
-    SmallVector<Value, 4> majorIvsPlusOffsets;
-    Value inBoundsCondition = emitInBoundsCondition(
-        rewriter, cast<VectorTransferOpInterface>(xferOp.getOperation()),
-        leadingRank, majorIvs, majorOffsets, memrefBounds, majorIvsPlusOffsets);
-
-    if (inBoundsCondition) {
-      // 2. If the condition is not null, we need an IfOp, which may yield
-      // if `options.unroll` is true.
-      SmallVector<Type, 1> resultType;
-      if (options.unroll)
-        resultType.push_back(vectorType);
-
-      // 3. If in-bounds, progressively lower to a 1-D transfer read, otherwise
-      // splat a 1-D vector.
-      ValueRange ifResults = conditionBuilder(
-          resultType, inBoundsCondition,
-          [&]() -> scf::ValueVector {
-            Value vector = load1DVector(majorIvsPlusOffsets);
-            // 3.a. If `options.unroll` is true, insert the 1-D vector in the
-            // aggregate. We must yield and merge with the `else` branch.
-            if (options.unroll) {
-              vector = vector_insert(vector, result, majorIvs);
-              return {vector};
-            }
-            // 3.b. Otherwise, just go through the temporary `alloc`.
-            memref_store(vector, alloc, majorIvs);
-            return {};
-          },
-          [&]() -> scf::ValueVector {
-            Value vector = std_splat(minorVectorType, xferOp.padding());
-            // 3.c. If `options.unroll` is true, insert the 1-D vector in the
-            // aggregate. We must yield and merge with the `then` branch.
-            if (options.unroll) {
-              vector = vector_insert(vector, result, majorIvs);
-              return {vector};
-            }
-            // 3.d. Otherwise, just go through the temporary `alloc`.
-            memref_store(vector, alloc, majorIvs);
-            return {};
-          });
+struct Strategy<TransferReadOp> {
+  /// Find the StoreOp that is used for writing the current TransferReadOp's
+  /// result to the temporary buffer allocation.
+  static memref::StoreOp getStoreOp(TransferReadOp xferOp) {
+    assert(xferOp->hasOneUse() && "Expected exactly one use of TransferReadOp");
+    auto storeOp = dyn_cast<memref::StoreOp>((*xferOp->use_begin()).getOwner());
+    assert(storeOp && "Expected TransferReadOp result used by StoreOp");
+    return storeOp;
+  }
 
-      if (!resultType.empty())
-        result = *ifResults.begin();
-    } else {
-      // 4. Guaranteed in-bounds, progressively lower to a 1-D transfer read.
-      Value loaded1D = load1DVector(majorIvsPlusOffsets);
-      // 5.a. If `options.unroll` is true, insert the 1-D vector in the
-      // aggregate.
-      if (options.unroll)
-        result = vector_insert(loaded1D, result, majorIvs);
-      // 5.b. Otherwise, just go through the temporary `alloc`.
-      else
-        memref_store(loaded1D, alloc, majorIvs);
-    }
-  });
+  /// Find the temporary buffer allocation. All labeled TransferReadOps are
+  /// used like this, where %buf is either the buffer allocation or a type cast
+  /// of the buffer allocation:
+  /// ```
+  /// %vec = vector.transfer_read ... { __vector_to_scf_lowering__ } ...
+  /// memref.store %vec, %buf[...] ...
+  /// ```
+  static Value getBuffer(TransferReadOp xferOp) {
+    return getStoreOp(xferOp).getMemRef();
+  }
 
-  assert((!options.unroll ^ (bool)result) &&
-         "Expected resulting Value iff unroll");
-  if (!result)
-    result =
-        memref_load(vector_type_cast(MemRefType::get({}, vectorType), alloc));
-  rewriter.replaceOp(op, result);
+  /// Retrieve the indices of the current StoreOp that stores into the buffer.
+  static void getBufferIndices(TransferReadOp xferOp,
+                               SmallVector<Value, 8> &indices) {
+    auto storeOp = getStoreOp(xferOp);
+    auto prevIndices = memref::StoreOpAdaptor(storeOp).indices();
+    indices.append(prevIndices.begin(), prevIndices.end());
+  }
 
-  return success();
-}
+  /// Rewrite the TransferReadOp, assuming that there are no out-of-bounds
+  /// accesses on the to-be-unpacked dimension.
+  ///
+  /// 1. Generate a new (N-1)-d TransferReadOp using the loop iteration
+  ///    variable `iv`.
+  /// 2. Store the result into the (already `vector.type_cast`ed) buffer.
+  ///
+  /// E.g.:
+  /// ```
+  /// %vec = vector.transfer_read %A[%a+%i, %b, %c], %cst
+  ///     : memref<?x?x?xf32>, vector<4x3xf32>
+  /// memref.store %vec, %buf[%i] : memref<5xvector<4x3xf32>>
+  /// ```
+  /// Is rewritten to:
+  /// ```
+  /// %casted = vector.type_cast %buf
+  ///     : memref<5xvector<4x3xf32>> to memref<5x4xvector<3xf32>>
+  /// for %j = 0 to 4 {
+  ///   %vec = vector.transfer_read %A[%a+%i, %b+%j, %c], %cst
+  ///       : memref<?x?x?xf32>, vector<3xf32>
+  ///   memref.store %vec, %casted[%i, %j] : memref<5x4xvector<3xf32>>
+  /// }
+  /// ```
+  ///
+  /// Note: The loop and type cast are generated in TransferOpConversion.
+  ///       The original TransferReadOp and store op are deleted in `cleanup`.
+  /// Note: The `mask` operand is set in TransferOpConversion.
+  static TransferReadOp rewriteOp(OpBuilder &builder, TransferReadOp xferOp,
+                                  Value buffer, Value iv) {
+    SmallVector<Value, 8> storeIndices;
+    getBufferIndices(xferOp, storeIndices);
+    storeIndices.push_back(iv);
+
+    SmallVector<Value, 8> xferIndices;
+    getXferIndices(xferOp, iv, xferIndices);
+
+    auto bufferType = buffer.getType().dyn_cast<ShapedType>();
+    auto vecType = bufferType.getElementType().dyn_cast<VectorType>();
+    auto inBoundsAttr = dropFirstElem(builder, xferOp.in_boundsAttr());
+    auto newXfer =
+        vector_transfer_read(
+            vecType, xferOp.source(), xferIndices,
+            AffineMapAttr::get(unpackedPermutationMap(xferOp, builder)),
+            xferOp.padding(), Value(), inBoundsAttr)
+            .value;
+
+    maybeApplyPassLabel(builder,
+                        dyn_cast<TransferReadOp>(newXfer.getDefiningOp()));
+
+    memref_store(newXfer, buffer, storeIndices);
+    return newXfer.getDefiningOp<TransferReadOp>();
+  }
+
+  /// Handle out-of-bounds accesses on the to-be-unpacked dimension: Write
+  /// the padding value to the temporary buffer.
+  static void handleOutOfBoundsDim(OpBuilder & /*builder*/,
+                                   TransferReadOp xferOp, Value buffer,
+                                   Value iv) {
+    SmallVector<Value, 8> storeIndices;
+    getBufferIndices(xferOp, storeIndices);
+    storeIndices.push_back(iv);
+
+    auto bufferType = buffer.getType().dyn_cast<ShapedType>();
+    auto vecType = bufferType.getElementType().dyn_cast<VectorType>();
+    auto vec = std_splat(vecType, xferOp.padding());
+    memref_store(vec, buffer, storeIndices);
+  }
+
+  /// Cleanup after rewriting the op.
+  static void cleanup(PatternRewriter &rewriter, TransferReadOp xferOp) {
+    rewriter.eraseOp(getStoreOp(xferOp));
+    rewriter.eraseOp(xferOp);
+  }
+};
 
+/// Codegen strategy for vector TransferWriteOp.
 template <>
-LogicalResult NDTransferOpHelper<TransferWriteOp>::doReplace() {
-  Value alloc;
-  if (!options.unroll) {
-    alloc = setAllocAtFunctionEntry(memRefMinorVectorType, op);
-    memref_store(xferOp.vector(),
-                 vector_type_cast(MemRefType::get({}, vectorType), alloc));
-  }
-
-  emitLoops([&](ValueRange majorIvs, ValueRange leadingOffsets,
-                ValueRange majorOffsets, ValueRange minorOffsets,
-                const MemRefBoundsCapture &memrefBounds) {
-    // Lower to 1-D vector_transfer_write and let recursion handle it.
-    auto emitTransferWrite = [&](ValueRange majorIvsPlusOffsets) {
-      SmallVector<Value, 8> indexing;
-      indexing.reserve(leadingRank + majorRank + minorRank);
-      indexing.append(leadingOffsets.begin(), leadingOffsets.end());
-      indexing.append(majorIvsPlusOffsets.begin(), majorIvsPlusOffsets.end());
-      indexing.append(minorOffsets.begin(), minorOffsets.end());
-      Value result;
-      // If `options.unroll` is true, extract the 1-D vector from the
-      // aggregate.
-      if (options.unroll)
-        result = vector_extract(xferOp.vector(), majorIvs);
-      else
-        result = memref_load(alloc, majorIvs);
-      auto map =
-          getTransferMinorIdentityMap(xferOp.getShapedType(), minorVectorType);
-      ArrayAttr inBounds;
-      if (xferOp.isDimInBounds(xferOp.getVectorType().getRank() - 1)) {
-        OpBuilder &b = ScopedContext::getBuilderRef();
-        inBounds = b.getBoolArrayAttr({true});
-      }
-      vector_transfer_write(result, xferOp.source(), indexing,
-                            AffineMapAttr::get(map), inBounds);
-    };
-
-    // 1. Compute the inBoundsCondition in the current loops ivs + offset
-    // context.
-    SmallVector<Value, 4> majorIvsPlusOffsets;
-    Value inBoundsCondition = emitInBoundsCondition(
-        rewriter, cast<VectorTransferOpInterface>(xferOp.getOperation()),
-        leadingRank, majorIvs, majorOffsets, memrefBounds, majorIvsPlusOffsets);
-
-    if (inBoundsCondition) {
-      // 2.a. If the condition is not null, we need an IfOp, to write
-      // conditionally. Progressively lower to a 1-D transfer write.
-      conditionBuilder(inBoundsCondition,
-                       [&] { emitTransferWrite(majorIvsPlusOffsets); });
-    } else {
-      // 2.b. Guaranteed in-bounds. Progressively lower to a 1-D transfer write.
-      emitTransferWrite(majorIvsPlusOffsets);
-    }
-  });
+struct Strategy<TransferWriteOp> {
+  /// Find the temporary buffer allocation. All labeled TransferWriteOps are
+  /// used like this, where %buf is either the buffer allocation or a type cast
+  /// of the buffer allocation:
+  /// ```
+  /// %vec = memref.load %buf[...] ...
+  /// vector.transfer_write %vec ... { __vector_to_scf_lowering__ } ...
+  /// ```
+  static Value getBuffer(TransferWriteOp xferOp) {
+    auto loadOp = xferOp.vector().getDefiningOp<memref::LoadOp>();
+    assert(loadOp && "Expected transfer op vector produced by LoadOp");
+    return loadOp.getMemRef();
+  }
+
+  /// Retrieve the indices of the current LoadOp that loads from the buffer.
+  static void getBufferIndices(TransferWriteOp xferOp,
+                               SmallVector<Value, 8> &indices) {
+    auto loadOp = xferOp.vector().getDefiningOp<memref::LoadOp>();
+    auto prevIndices = memref::LoadOpAdaptor(loadOp).indices();
+    indices.append(prevIndices.begin(), prevIndices.end());
+  }
+
+  /// Rewrite the TransferWriteOp, assuming that there are no out-of-bounds
+  /// accesses on the to-be-unpacked dimension.
+  ///
+  /// 1. Load an (N-1)-d vector from the (already `vector.type_cast`ed) buffer,
+  ///    using the loop iteration variable `iv`.
+  /// 2. Generate a new (N-1)-d TransferWriteOp, writing the loaded vector back
+  ///    to memory.
+  ///
+  /// Note: For more details, see comments on Strategy<TransferReadOp>.
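+  ///
+  /// E.g., mirroring the TransferReadOp example above (illustrative):
+  /// ```
+  /// %vec = memref.load %casted[%i, %j] : memref<5x4xvector<3xf32>>
+  /// vector.transfer_write %vec, %A[%a+%i, %b+%j, %c]
+  ///     : vector<3xf32>, memref<?x?x?xf32>
+  /// ```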
+  static TransferWriteOp rewriteOp(OpBuilder &builder, TransferWriteOp xferOp,
+                                   Value buffer, Value iv) {
+    SmallVector<Value, 8> loadIndices;
+    getBufferIndices(xferOp, loadIndices);
+    loadIndices.push_back(iv);
+
+    SmallVector<Value, 8> xferIndices;
+    getXferIndices(xferOp, iv, xferIndices);
 
-  rewriter.eraseOp(op);
+    auto vec = memref_load(buffer, loadIndices);
+    auto inBoundsAttr = dropFirstElem(builder, xferOp.in_boundsAttr());
+    auto newXfer = vector_transfer_write(
+        Type(), vec, xferOp.source(), xferIndices,
+        AffineMapAttr::get(unpackedPermutationMap(xferOp, builder)), Value(),
+        inBoundsAttr);
 
+    maybeApplyPassLabel(builder, newXfer.op);
+
+    return newXfer;
+  }
+
+  /// Handle out-of-bounds accesses on the to-be-unpacked dimension.
+  static void handleOutOfBoundsDim(OpBuilder &builder, TransferWriteOp xferOp,
+                                   Value buffer, Value iv) {}
+
+  /// Cleanup after rewriting the op.
+  static void cleanup(PatternRewriter &rewriter, TransferWriteOp xferOp) {
+    rewriter.eraseOp(xferOp);
+  }
+};
+
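+/// Return success if the transfer op is eligible for preparation (and thus
+/// progressive lowering): It must not carry the pass label yet and its
+/// vector rank must exceed the target rank.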
+template <typename OpTy>
+LogicalResult checkPrepareXferOp(OpTy xferOp) {
+  if (xferOp->hasAttr(kPassLabel))
+    return failure();
+  if (xferOp.getVectorType().getRank() <= kTargetRank)
+    return failure();
   return success();
 }
 
-} // namespace
+/// Prepare a TransferReadOp for progressive lowering.
+///
+/// 1. Allocate a temporary buffer.
+/// 2. Label the TransferReadOp, marking it eligible for progressive lowering.
+/// 3. Store the result of the TransferReadOp into the temporary buffer.
+/// 4. Load the result from the temporary buffer and replace all uses of the
+///    original TransferReadOp with this load.
+///
+/// E.g.:
+/// ```
+/// %vec = vector.transfer_read %A[%a, %b, %c], %cst
+///     : memref<?x?x?xf32>, vector<5x4xf32>
+/// ```
+/// is rewritten to:
+/// ```
+/// %0 = memref.alloca() : memref<vector<5x4xf32>>
+/// %1 = vector.transfer_read %A[%a, %b, %c], %cst
+///     { __vector_to_scf_lowering__ } : memref<?x?x?xf32>, vector<5x4xf32>
+/// memref.store %1, %0[] : memref<vector<5x4xf32>>
+/// %vec = memref.load %0[] : memref<vector<5x4xf32>>
+/// ```
+///
+/// Note: A second temporary buffer may be allocated for the `mask` operand.
+struct PrepareTransferReadConversion : public OpRewritePattern<TransferReadOp> {
+  using OpRewritePattern<TransferReadOp>::OpRewritePattern;
 
-/// Analyzes the `transfer` to find an access dimension along the fastest remote
-/// MemRef dimension. If such a dimension with coalescing properties is found,
-/// `pivs` and `vectorBoundsCapture` are swapped so that the invocation of
-/// LoopNestBuilder captures it in the innermost loop.
-template <typename TransferOpTy>
-static int computeCoalescedIndex(TransferOpTy transfer) {
-  // rank of the remote memory access, coalescing behavior occurs on the
-  // innermost memory dimension.
-  auto remoteRank = transfer.getShapedType().getRank();
-  // Iterate over the results expressions of the permutation map to determine
-  // the loop order for creating pointwise copies between remote and local
-  // memories.
-  int coalescedIdx = -1;
-  auto exprs = transfer.permutation_map().getResults();
-  for (auto en : llvm::enumerate(exprs)) {
-    auto dim = en.value().template dyn_cast<AffineDimExpr>();
-    if (!dim) {
-      continue;
+  LogicalResult matchAndRewrite(TransferReadOp xferOp,
+                                PatternRewriter &rewriter) const override {
+    if (checkPrepareXferOp(xferOp).failed())
+      return failure();
+
+    ScopedContext scope(rewriter, xferOp.getLoc());
+    auto buffers = allocBuffers(xferOp);
+    auto *newXfer = rewriter.clone(*xferOp.getOperation());
+    newXfer->setAttr(kPassLabel, rewriter.getUnitAttr());
+    if (xferOp.mask()) {
+      dyn_cast<TransferReadOp>(newXfer).maskMutable().assign(
+          buffers.maskBuffer);
     }
-    auto memRefDim = dim.getPosition();
-    if (memRefDim == remoteRank - 1) {
-      // memRefDim has coalescing properties, it should be swapped in the last
-      // position.
-      assert(coalescedIdx == -1 && "Unexpected > 1 coalesced indices");
-      coalescedIdx = en.index();
+
+    memref_store(newXfer->getResult(0), buffers.dataBuffer);
+    rewriter.replaceOpWithNewOp<memref::LoadOp>(xferOp, buffers.dataBuffer);
+
+    return success();
+  }
+};
+
+/// Prepare a TransferWriteOp for progressive lowering.
+///
+/// 1. Allocate a temporary buffer.
+/// 2. Store the vector into the buffer.
+/// 3. Load the vector from the buffer again.
+/// 4. Use the loaded vector as a TransferWriteOp operand and label the op,
+///    marking it eligible for progressive lowering via TransferOpConversion.
+///
+/// E.g.:
+/// ```
+/// vector.transfer_write %vec, %A[%a, %b, %c]
+///     : vector<5x4xf32>, memref<?x?x?xf32>
+/// ```
+/// is rewritten to:
+/// ```
+/// %0 = memref.alloca() : memref<vector<5x4xf32>>
+/// memref.store %vec, %0[] : memref<vector<5x4xf32>>
+/// %1 = memref.load %0[] : memref<vector<5x4xf32>>
+/// vector.transfer_write %1, %A[%a, %b, %c] { __vector_to_scf_lowering__ }
+///     : vector<5x4xf32>, memref<?x?x?xf32>
+/// ```
+///
+/// Note: A second temporary buffer may be allocated for the `mask` operand.
+struct PrepareTransferWriteConversion
+    : public OpRewritePattern<TransferWriteOp> {
+  using OpRewritePattern<TransferWriteOp>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(TransferWriteOp xferOp,
+                                PatternRewriter &rewriter) const override {
+    if (checkPrepareXferOp(xferOp).failed())
+      return failure();
+
+    ScopedContext scope(rewriter, xferOp.getLoc());
+    auto buffers = allocBuffers(xferOp);
+    memref_store(xferOp.vector(), buffers.dataBuffer);
+    auto loadedVec = memref_load(buffers.dataBuffer);
+    rewriter.updateRootInPlace(xferOp, [&]() {
+      xferOp.vectorMutable().assign(loadedVec);
+      xferOp->setAttr(kPassLabel, rewriter.getUnitAttr());
+    });
+
+    if (xferOp.mask()) {
+      rewriter.updateRootInPlace(
+          xferOp, [&]() { xferOp.maskMutable().assign(buffers.maskBuffer); });
     }
+
+    return success();
   }
-  return coalescedIdx;
-}
+};
 
-template <typename TransferOpTy>
-VectorTransferRewriter<TransferOpTy>::VectorTransferRewriter(
-    VectorTransferToSCFOptions options, MLIRContext *context)
-    : RewritePattern(TransferOpTy::getOperationName(), 1, context),
-      options(options) {}
-
-/// Used for staging the transfer in a local buffer.
-template <typename TransferOpTy>
-MemRefType VectorTransferRewriter<TransferOpTy>::tmpMemRefType(
-    TransferOpTy transfer) const {
-  auto vectorType = transfer.getVectorType();
-  return MemRefType::get(vectorType.getShape().drop_back(),
-                         VectorType::get(vectorType.getShape().take_back(),
-                                         vectorType.getElementType()),
-                         {}, 0);
-}
+/// Progressive lowering of vector transfer ops: Unpack one dimension.
+///
+/// 1. Unpack one dimension from the current buffer type and cast the buffer
+///    to that new type. E.g.:
+///    ```
+///    %vec = memref.load %0[%1] : memref<5xvector<4x3xf32>>
+///    vector.transfer_write %vec ...
+///    ```
+///    The following cast is generated:
+///    ```
+///    %casted = vector.type_cast %0
+///        : memref<5xvector<4x3xf32>> to memref<5x4xvector<3xf32>>
+///    ```
+/// 2. Generate a for loop and rewrite the transfer op according to the
+///    corresponding Strategy<OpTy>. If the to-be-unpacked dimension can be
+///    out-of-bounds, generate an if-check and handle both cases separately.
+/// 3. Clean up according to the corresponding Strategy<OpTy>.
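+///
+/// For the TransferWriteOp example from step 1, the generated loop resembles
+/// the following pseudo-IR (illustrative, in-bounds case only):
+/// ```
+/// scf.for %j = %c0 to %c4 step %c1 {
+///   %vec = memref.load %casted[%1, %j] : memref<5x4xvector<3xf32>>
+///   vector.transfer_write %vec, ... : vector<3xf32>, memref<...>
+/// }
+/// ```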
+template <typename OpTy>
+struct TransferOpConversion : public OpRewritePattern<OpTy> {
+  using OpRewritePattern<OpTy>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(OpTy xferOp,
+                                PatternRewriter &rewriter) const override {
+    if (!xferOp->hasAttr(kPassLabel))
+      return failure();
+
+    ScopedContext scope(rewriter, xferOp.getLoc());
+
+    // Find and cast data buffer. How the buffer can be found depends on OpTy.
+    auto dataBuffer = Strategy<OpTy>::getBuffer(xferOp);
+    auto dataBufferType = dataBuffer.getType().template dyn_cast<MemRefType>();
+    auto castedDataType = unpackOneDim(dataBufferType);
+    auto castedDataBuffer = vector_type_cast(castedDataType, dataBuffer);
 
-static void emitWithBoundsChecks(
-    PatternRewriter &rewriter, VectorTransferOpInterface transfer,
-    ValueRange ivs, const MemRefBoundsCapture &memRefBoundsCapture,
-    function_ref<void(ArrayRef<Value>)> inBoundsFun,
-    function_ref<void(ArrayRef<Value>)> outOfBoundsFun = nullptr) {
-  // Permute the incoming indices according to the permutation map.
-  SmallVector<Value, 4> indices =
-      applyMapToValues(rewriter, transfer.getLoc(), transfer.permutation_map(),
-                       transfer.indices());
-
-  // Generate a bounds check if necessary.
-  SmallVector<Value, 4> majorIvsPlusOffsets;
-  Value inBoundsCondition =
-      emitInBoundsCondition(rewriter, transfer, 0, ivs, indices,
-                            memRefBoundsCapture, majorIvsPlusOffsets);
-
-  // Apply the permutation map to the ivs. The permutation map may not use all
-  // the inputs.
-  SmallVector<Value, 4> scalarAccessExprs(transfer.indices().size());
-  for (unsigned memRefDim = 0; memRefDim < transfer.indices().size();
-       ++memRefDim) {
-    // Linear search on a small number of entries.
-    int loopIndex = -1;
-    auto exprs = transfer.permutation_map().getResults();
-    for (auto en : llvm::enumerate(exprs)) {
-      auto expr = en.value();
-      auto dim = expr.dyn_cast<AffineDimExpr>();
-      // Sanity check.
-      assert((dim || expr.cast<AffineConstantExpr>().getValue() == 0) &&
-             "Expected dim or 0 in permutationMap");
-      if (dim && memRefDim == dim.getPosition()) {
-        loopIndex = en.index();
-        break;
+    // If the xferOp has a mask: Find and cast mask buffer.
+    Value castedMaskBuffer;
+    if (xferOp.mask()) {
+      auto maskBuffer = getMaskBuffer(xferOp);
+      auto maskBufferType =
+          maskBuffer.getType().template dyn_cast<MemRefType>();
+      if (xferOp.isBroadcastDim(0) || xferOp.getMaskType().getRank() == 1) {
+        // Do not unpack a dimension of the mask, if:
+        // * To-be-unpacked transfer op dimension is a broadcast.
+        // * Mask is 1D, i.e., the mask cannot be further unpacked.
+        //   (That means that all remaining dimensions of the transfer op must
+        //   be broadcasted.)
+        castedMaskBuffer = maskBuffer;
+      } else {
+        auto castedMaskType = unpackOneDim(maskBufferType);
+        castedMaskBuffer = vector_type_cast(castedMaskType, maskBuffer);
       }
     }
 
-    using namespace edsc::op;
-    auto i = transfer.indices()[memRefDim];
-    scalarAccessExprs[memRefDim] = loopIndex < 0 ? i : i + ivs[loopIndex];
-  }
-
-  if (inBoundsCondition)
-    conditionBuilder(
-        /* scf.if */ inBoundsCondition, // {
-        [&] { inBoundsFun(scalarAccessExprs); },
-        // } else {
-        outOfBoundsFun ? [&] { outOfBoundsFun(scalarAccessExprs); }
-                       : function_ref<void()>()
-        // }
-    );
-  else
-    inBoundsFun(scalarAccessExprs);
+    // Loop bounds and step.
+    auto lb = std_constant_index(0).value;
+    auto ub = std_constant_index(
+                  castedDataType.getDimSize(castedDataType.getRank() - 1))
+                  .value;
+    auto step = std_constant_index(1).value;
+
+    // Generate for loop.
+    rewriter.create<scf::ForOp>(
+        xferOp.getLoc(), lb, ub, step, ValueRange(),
+        [&](OpBuilder &b, Location loc, Value iv, ValueRange /*loopState*/) {
+          ScopedContext scope(b, loc);
+          generateInBoundsCheck(
+              xferOp, iv, b, unpackedDim(xferOp),
+              /*inBoundsCase=*/
+              [&](OpBuilder &b, Location /*loc*/) {
+                // Create new transfer op.
+                OpTy newXfer =
+                    Strategy<OpTy>::rewriteOp(b, xferOp, castedDataBuffer, iv);
+
+                // If old transfer op has a mask: Set mask on new transfer op.
+                // Special case: If the mask of the old transfer op is 1D and
+                // the unpacked dim is not a broadcast, no mask is needed on
+                // the new transfer op.
+                if (xferOp.mask() && (xferOp.isBroadcastDim(0) ||
+                                      xferOp.getMaskType().getRank() > 1)) {
+                  OpBuilder::InsertionGuard guard(b);
+                  b.setInsertionPoint(newXfer); // Insert load before newXfer.
+
+                  SmallVector<Value, 8> loadIndices;
+                  Strategy<OpTy>::getBufferIndices(xferOp, loadIndices);
+                  // In case of broadcast: Use same indices to load from memref
+                  // as before.
+                  if (!xferOp.isBroadcastDim(0))
+                    loadIndices.push_back(iv);
+
+                  auto mask = memref_load(castedMaskBuffer, loadIndices);
+                  rewriter.updateRootInPlace(
+                      newXfer, [&]() { newXfer.maskMutable().assign(mask); });
+                }
+              },
+              /*outOfBoundsCase=*/
+              [&](OpBuilder &b, Location /*loc*/) {
+                Strategy<OpTy>::handleOutOfBoundsDim(b, xferOp,
+                                                     castedDataBuffer, iv);
+              });
+          b.create<scf::YieldOp>(loc);
+        });
+
+    Strategy<OpTy>::cleanup(rewriter, xferOp);
+    return success();
+  }
+};
+
+/// If the original transfer op has a mask, compute the mask of the new transfer
+/// op (for the current iteration `i`) and assign it.
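+///
+/// E.g. (illustrative): for a vector<5x4xi1> mask and iteration i = 2, the
+/// new mask is computed as:
+/// ```
+/// %newMask = vector.extract %mask[2] : vector<5x4xi1>
+/// ```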
+template <typename OpTy>
+static void maybeAssignMask(OpBuilder &builder, OpTy xferOp, OpTy newXferOp,
+                            int64_t i) {
+  if (!xferOp.mask())
+    return;
+
+  if (xferOp.isBroadcastDim(0)) {
+    // To-be-unpacked dimension is a broadcast, which does not have a
+    // corresponding mask dimension. Mask attribute remains unchanged.
+    newXferOp.maskMutable().assign(xferOp.mask());
+    return;
+  }
+
+  if (xferOp.getMaskType().getRank() > 1) {
+    // Unpack one dimension of the mask.
+    OpBuilder::InsertionGuard guard(builder);
+    builder.setInsertionPoint(newXferOp); // Insert extract before newXferOp.
+
+    llvm::SmallVector<int64_t, 1> indices({i});
+    auto newMask = vector_extract(xferOp.mask(), indices).value;
+    newXferOp.maskMutable().assign(newMask);
+  }
+
+  // If we end up here: The mask of the old transfer op is 1D and the unpacked
+  // dim is not a broadcast, so no mask is needed on the new transfer op.
+  // `generateInBoundsCheck` will have evaluated the mask already.
 }
 
-namespace mlir {
+/// Progressive lowering of vector TransferReadOp with unrolling: Unpack one
+/// dimension. This is similar to TransferOpConversion<TransferReadOp>, but no
+/// memref buffer is allocated and the SCF loop is fully unrolled.
+///
+/// E.g.:
+/// ```
+/// %vec = vector.transfer_read %A[%a, %b, %c], %padding
+///     : memref<?x?x?xf32>, vector<5x4xf32>
+/// ```
+/// is rewritten to IR such as (simplified):
+/// ```
+/// %v_init = splat %padding : vector<5x4xf32>
+/// %tmp0 = vector.transfer_read %A[%a, %b, %c], %padding
+///     : memref<?x?x?xf32>, vector<4xf32>
+/// %v0 = vector.insert %tmp0, %v_init[0] : vector<4xf32> into vector<5x4xf32>
+/// %tmp1 = vector.transfer_read %A[%a, %b + 1, %c], %padding
+///     : memref<?x?x?xf32>, vector<4xf32>
+/// %v1 = vector.insert %tmp1, %v0[1] : vector<4xf32> into vector<5x4xf32>
+/// ...
+/// %tmp4 = vector.transfer_read %A[%a, %b + 4, %c], %padding
+///     : memref<?x?x?xf32>, vector<4xf32>
+/// %vec = vector.insert %tmp4, %v3[4] : vector<4xf32> into vector<5x4xf32>
+/// ```
+///
+/// Note: As an optimization, if the result of the original TransferReadOp
+/// was directly inserted into another vector, no new %v_init vector is created.
+/// Instead, the new TransferReadOp results are inserted into that vector.
+struct UnrollTransferReadConversion : public OpRewritePattern<TransferReadOp> {
+  using OpRewritePattern<TransferReadOp>::OpRewritePattern;
+
+  /// Return the vector into which the newly created TransferReadOp results
+  /// are inserted.
+  Value getResultVector(TransferReadOp xferOp,
+                        PatternRewriter &rewriter) const {
+    if (auto insertOp = getInsertOp(xferOp))
+      return insertOp.dest();
+    return std_splat(xferOp.getVectorType(), xferOp.padding()).value;
+  }
+
+  /// If the result of the TransferReadOp has exactly one user, which is a
+  /// vector::InsertOp, return that operation.
+  vector::InsertOp getInsertOp(TransferReadOp xferOp) const {
+    if (xferOp->hasOneUse()) {
+      Operation *xferOpUser = *xferOp->getUsers().begin();
+      if (auto insertOp = dyn_cast<vector::InsertOp>(xferOpUser))
+        return insertOp;
+    }
+
+    return vector::InsertOp();
+  }
+
+  /// If the result of the TransferReadOp has exactly one user, which is a
+  /// vector::InsertOp, return that operation's indices.
+  void getInsertionIndices(TransferReadOp xferOp,
+                           SmallVector<int64_t, 8> &indices) const {
+    if (auto insertOp = getInsertOp(xferOp)) {
+      llvm::for_each(insertOp.position(), [&](Attribute attr) {
+        indices.push_back(attr.dyn_cast<IntegerAttr>().getInt());
+      });
+    }
+  }
+
+  /// Rewrite the op: Unpack one dimension. Can handle masks, out-of-bounds
+  /// accesses, and broadcasts and transposes in permutation maps.
+  LogicalResult matchAndRewrite(TransferReadOp xferOp,
+                                PatternRewriter &rewriter) const override {
+    if (xferOp.getVectorType().getRank() <= kTargetRank)
+      return failure();
+
+    ScopedContext scope(rewriter, xferOp.getLoc());
+    auto insertOp = getInsertOp(xferOp);
+    auto vec = getResultVector(xferOp, rewriter);
+    auto vecType = vec.getType().dyn_cast<VectorType>();
+    auto xferVecType = xferOp.getVectorType();
+    auto newXferVecType = VectorType::get(xferVecType.getShape().drop_front(),
+                                          xferVecType.getElementType());
+    int64_t dimSize = xferVecType.getShape()[0];
+
+    // Generate fully unrolled loop of transfer ops.
+    for (int64_t i = 0; i < dimSize; ++i) {
+      Value iv = std_constant_index(i);
+
+      vec = generateInBoundsCheck(
+          xferOp, iv, rewriter, unpackedDim(xferOp), TypeRange(vecType),
+          /*inBoundsCase=*/
+          [&](OpBuilder &b, Location loc) {
+            ScopedContext scope(b, loc);
+
+            // Indices for the new transfer op.
+            SmallVector<Value, 8> xferIndices;
+            getXferIndices(xferOp, iv, xferIndices);
+
+            // Indices for the new vector.insert op.
+            SmallVector<int64_t, 8> insertionIndices;
+            getInsertionIndices(xferOp, insertionIndices);
+            insertionIndices.push_back(i);
 
-/// Lowers TransferReadOp into a combination of:
-///   1. local memory allocation;
-///   2. perfect loop nest over:
-///      a. scalar load from local buffers (viewed as a scalar memref);
-///      a. scalar store to original memref (with padding).
-///   3. vector_load from local buffer (viewed as a memref<1 x vector>);
-///   4. local memory deallocation.
+            auto inBoundsAttr = dropFirstElem(b, xferOp.in_boundsAttr());
+            auto newXferOpVal =
+                vector_transfer_read(
+                    newXferVecType, xferOp.source(), xferIndices,
+                    AffineMapAttr::get(unpackedPermutationMap(xferOp, b)),
+                    xferOp.padding(), Value(), inBoundsAttr)
+                    .value;
+            auto newXferOp =
+                dyn_cast<TransferReadOp>(newXferOpVal.getDefiningOp());
+
+            maybeAssignMask(b, xferOp, newXferOp, i);
+
+            return vector_insert(newXferOp, vec, insertionIndices).value;
+          },
+          /*outOfBoundsCase=*/
+          [&](OpBuilder &b, Location loc) {
+            // Pass through the original (unmodified) vector.
+            return vec;
+          });
+    }
+
+    if (insertOp) {
+      // Rewrite single user of the old TransferReadOp, which was an InsertOp.
+      rewriter.replaceOp(insertOp, vec);
+      rewriter.eraseOp(xferOp);
+    } else {
+      rewriter.replaceOp(xferOp, vec);
+    }
+
+    return success();
+  }
+};
+
+/// Progressive lowering of vector TransferWriteOp with unrolling: Unpack one
+/// dimension. This is similar to TransferOpConversion<TransferWriteOp>, but no
+/// memref buffer is allocated and the SCF loop is fully unrolled.
+///
+/// E.g.:
+/// ```
+/// vector.transfer_write %vec, %A[%a, %b, %c]
+///     : vector<5x4xf32>, memref<?x?x?xf32>
+/// ```
+/// is rewritten to IR such as (simplified):
+/// ```
+/// %v0 = vector.extract %vec[0] : vector<5x4xf32>
+/// vector.transfer_write %v0, %A[%a, %b, %c] : vector<4xf32>, memref<...>
+/// %v1 = vector.extract %vec[1] : vector<5x4xf32>
+/// vector.transfer_write %v1, %A[%a, %b + 1, %c] : vector<4xf32>, memref<...>
+/// ...
+/// %v4 = vector.extract %vec[4] : vector<5x4xf32>
+/// vector.transfer_write %v4, %A[%a, %b + 4, %c] : vector<4xf32>, memref<...>
+/// ```
 ///
-/// Lowers the data transfer part of a TransferReadOp while ensuring no
-/// out-of-bounds accesses are possible. Out-of-bounds behavior is handled by
-/// padding.
+/// Note: As an optimization, if the vector of the original TransferWriteOp
+/// was directly extracted from another vector via an ExtractOp `a`, extract
+/// the vectors for the newly generated TransferWriteOps from `a`'s input. By
+/// doing so, `a` may become dead, and the number of ExtractOps generated during
+/// recursive application of this pattern will be minimal.
+struct UnrollTransferWriteConversion
+    : public OpRewritePattern<TransferWriteOp> {
+  using OpRewritePattern<TransferWriteOp>::OpRewritePattern;
 
-/// Performs the rewrite.
-template <>
-LogicalResult VectorTransferRewriter<TransferReadOp>::matchAndRewrite(
-    Operation *op, PatternRewriter &rewriter) const {
-  using namespace mlir::edsc::op;
+  /// Return the vector from which newly generated ExtractOps will extract.
+  Value getDataVector(TransferWriteOp xferOp) const {
+    if (auto extractOp = getExtractOp(xferOp))
+      return extractOp.vector();
+    return xferOp.vector();
+  }
 
-  TransferReadOp transfer = cast<TransferReadOp>(op);
-  if (transfer.mask())
-    return failure();
-  auto memRefType = transfer.getShapedType().dyn_cast<MemRefType>();
-  if (!memRefType)
-    return failure();
-  // Fall back to a loop if the fastest varying stride is not 1 or it is
-  // permuted.
-  int64_t offset;
-  SmallVector<int64_t, 4> strides;
-  auto successStrides = getStridesAndOffset(memRefType, strides, offset);
-  if (succeeded(successStrides) && strides.back() == 1 &&
-      transfer.permutation_map().isMinorIdentity()) {
-    // If > 1D, emit a bunch of loops around 1-D vector transfers.
-    if (transfer.getVectorType().getRank() > 1)
-      return NDTransferOpHelper<TransferReadOp>(rewriter, transfer, options)
-          .doReplace();
-    // If 1-D this is now handled by the target-specific lowering.
-    if (transfer.getVectorType().getRank() == 1)
+  /// If the input of the given TransferWriteOp is an ExtractOp, return it.
+  vector::ExtractOp getExtractOp(TransferWriteOp xferOp) const {
+    if (auto *op = xferOp.vector().getDefiningOp())
+      return dyn_cast<vector::ExtractOp>(op);
+    return vector::ExtractOp();
+  }
+
+  /// If the input of the given TransferWriteOp is an ExtractOp, return its
+  /// indices.
+  void getExtractionIndices(TransferWriteOp xferOp,
+                            SmallVector<int64_t, 8> &indices) const {
+    if (auto extractOp = getExtractOp(xferOp)) {
+      llvm::for_each(extractOp.position(), [&](Attribute attr) {
+        indices.push_back(attr.dyn_cast<IntegerAttr>().getInt());
+      });
+    }
+  }
+
+  /// Rewrite the op: Unpack one dimension. Can handle masks, out-of-bounds
+  /// accesses, and broadcasts and transposes in permutation maps.
+  LogicalResult matchAndRewrite(TransferWriteOp xferOp,
+                                PatternRewriter &rewriter) const override {
+    if (xferOp.getVectorType().getRank() <= kTargetRank)
       return failure();
+
+    ScopedContext scope(rewriter, xferOp.getLoc());
+    auto vec = getDataVector(xferOp);
+    auto xferVecType = xferOp.getVectorType();
+    int64_t dimSize = xferVecType.getShape()[0];
+
+    // Generate fully unrolled loop of transfer ops.
+    for (int64_t i = 0; i < dimSize; ++i) {
+      Value iv = std_constant_index(i);
+
+      generateInBoundsCheck(
+          xferOp, iv, rewriter, unpackedDim(xferOp),
+          /*inBoundsCase=*/[&](OpBuilder &b, Location loc) {
+            ScopedContext scope(b, loc);
+
+            // Indices for the new transfer op.
+            SmallVector<Value, 8> xferIndices;
+            getXferIndices(xferOp, iv, xferIndices);
+
+            // Indices for the new vector.extract op.
+            SmallVector<int64_t, 8> extractionIndices;
+            getExtractionIndices(xferOp, extractionIndices);
+            extractionIndices.push_back(i);
+
+            auto extracted = vector_extract(vec, extractionIndices).value;
+            auto inBoundsAttr = dropFirstElem(b, xferOp.in_boundsAttr());
+
+            auto newXferOp =
+                vector_transfer_write(
+                    Type(), extracted, xferOp.source(), xferIndices,
+                    AffineMapAttr::get(unpackedPermutationMap(xferOp, b)),
+                    Value(), inBoundsAttr)
+                    .op;
+
+            maybeAssignMask(b, xferOp, newXferOp, i);
+          });
+    }
+
+    rewriter.eraseOp(xferOp);
+    return success();
   }
+};
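
To make the ExtractOp note above concrete, consider a hypothetical input
(illustrative names, not taken from this patch) in which the written vector is
itself the result of a vector.extract:

func @write_slice(%big : vector<2x5x4xf32>, %A : memref<?x?x?xf32>,
                  %a : index, %b : index, %c : index) {
  // The written vector is a slice of %big.
  %s = vector.extract %big[1] : vector<2x5x4xf32>
  // Unrolling the leading dimension rewrites this into five transfer_writes
  // of vector<4xf32> whose operands are extracted as %big[1, 0] ... %big[1, 4]
  // directly, so %s may become dead.
  vector.transfer_write %s, %A[%a, %b, %c] : vector<5x4xf32>, memref<?x?x?xf32>
  return
}
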
 
-  // Conservative lowering to scalar load / stores.
-  // 1. Setup all the captures.
-  ScopedContext scope(rewriter, transfer.getLoc());
-  MemRefIndexedValue remote(transfer.source());
-  MemRefBoundsCapture memRefBoundsCapture(transfer.source());
-  VectorBoundsCapture vectorBoundsCapture(transfer.vector());
-  int coalescedIdx = computeCoalescedIndex(transfer);
-  // Swap the vectorBoundsCapture which will reorder loop bounds.
-  if (coalescedIdx >= 0)
-    vectorBoundsCapture.swapRanges(vectorBoundsCapture.rank() - 1,
-                                   coalescedIdx);
-
-  auto lbs = vectorBoundsCapture.getLbs();
-  auto ubs = vectorBoundsCapture.getUbs();
-  SmallVector<Value, 8> steps;
-  steps.reserve(vectorBoundsCapture.getSteps().size());
-  for (auto step : vectorBoundsCapture.getSteps())
-    steps.push_back(std_constant_index(step));
-
-  // 2. Emit alloc-copy-load-dealloc.
-  MLIRContext *ctx = op->getContext();
-  Value tmp = setAllocAtFunctionEntry(tmpMemRefType(transfer), transfer);
-  MemRefIndexedValue local(tmp);
-  loopNestBuilder(lbs, ubs, steps, [&](ValueRange loopIvs) {
-    auto ivsStorage = llvm::to_vector<8>(loopIvs);
-    // Swap the ivs which will reorder memory accesses.
-    if (coalescedIdx >= 0)
-      std::swap(ivsStorage.back(), ivsStorage[coalescedIdx]);
-
-    ArrayRef<Value> ivs(ivsStorage);
-    Value pos = std_index_cast(IntegerType::get(ctx, 32), ivs.back());
-    Value inVector = local(ivs.drop_back());
-    auto loadValue = [&](ArrayRef<Value> indices) {
-      Value vector = vector_insert_element(remote(indices), inVector, pos);
-      local(ivs.drop_back()) = vector;
-    };
-    auto loadPadding = [&](ArrayRef<Value>) {
-      Value vector = vector_insert_element(transfer.padding(), inVector, pos);
-      local(ivs.drop_back()) = vector;
-    };
-    emitWithBoundsChecks(
-        rewriter, cast<VectorTransferOpInterface>(transfer.getOperation()), ivs,
-        memRefBoundsCapture, loadValue, loadPadding);
-  });
-  Value vectorValue = memref_load(vector_type_cast(tmp));
-
-  // 3. Propagate.
-  rewriter.replaceOp(op, vectorValue);
-  return success();
+/// Compute the indices into the memref for the LoadOp/StoreOp generated as
+/// part of TransferOp1dConversion. Return the memref dimension on which
+/// the transfer is operating. A return value of None indicates a broadcast.
+template <typename OpTy>
+static Optional<int64_t>
+get1dMemrefIndices(OpTy xferOp, Value iv,
+                   SmallVector<Value, 8> &memrefIndices) {
+  auto indices = xferOp.indices();
+  auto map = xferOp.permutation_map();
+
+  memrefIndices.append(indices.begin(), indices.end());
+  assert(map.getNumResults() == 1 &&
+         "Expected 1 permutation map result for 1D transfer");
+  if (auto expr = map.getResult(0).template dyn_cast<AffineDimExpr>()) {
+    auto dim = expr.getPosition();
+    using edsc::op::operator+;
+    memrefIndices[dim] = memrefIndices[dim] + iv;
+    return dim;
+  }
+
+  assert(xferOp.isBroadcastDim(0) &&
+         "Expected AffineDimExpr or AffineConstantExpr");
+  return None;
 }
 
-/// Lowers TransferWriteOp into a combination of:
-///   1. local memory allocation;
-///   2. vector_store to local buffer (viewed as a memref<1 x vector>);
-///   3. perfect loop nest over:
-///      a. scalar load from local buffers (viewed as a scalar memref);
-///      a. scalar store to original memref (if in bounds).
-///   4. local memory deallocation.
-///
-/// More specifically, lowers the data transfer part while ensuring no
-/// out-of-bounds accesses are possible.
+/// Codegen strategy for TransferOp1dConversion, depending on the
+/// operation.
+template <typename OpTy>
+struct Strategy1d;
+
+/// Codegen strategy for TransferReadOp.
 template <>
-LogicalResult VectorTransferRewriter<TransferWriteOp>::matchAndRewrite(
-    Operation *op, PatternRewriter &rewriter) const {
-  using namespace edsc::op;
+struct Strategy1d<TransferReadOp> {
+  static void generateForLoopBody(OpBuilder &builder, Location loc,
+                                  TransferReadOp xferOp, Value iv,
+                                  ValueRange loopState) {
+    SmallVector<Value, 8> indices;
+    auto dim = get1dMemrefIndices(xferOp, iv, indices);
+    auto ivI32 = std_index_cast(IntegerType::get(builder.getContext(), 32), iv);
+    auto vec = loopState[0];
 
-  TransferWriteOp transfer = cast<TransferWriteOp>(op);
-  if (transfer.mask())
-    return failure();
-  auto memRefType = transfer.getShapedType().template dyn_cast<MemRefType>();
-  if (!memRefType)
-    return failure();
+    // In case of out-of-bounds access, leave `vec` as is (was initialized with
+    // padding value).
+    auto nextVec = generateInBoundsCheck(
+        xferOp, iv, builder, dim, TypeRange(xferOp.getVectorType()),
+        /*inBoundsCase=*/
+        [&](OpBuilder & /*b*/, Location loc) {
+          auto val = memref_load(xferOp.source(), indices);
+          return vector_insert_element(val, vec, ivI32.value).value;
+        },
+        /*outOfBoundsCase=*/
+        [&](OpBuilder & /*b*/, Location loc) { return vec; });
+    builder.create<scf::YieldOp>(loc, nextVec);
+  }
+
+  static Value initialLoopState(TransferReadOp xferOp) {
+    // Initialize vector with padding value.
+    return std_splat(xferOp.getVectorType(), xferOp.padding()).value;
+  }
+};
+
+/// Codegen strategy for TransferWriteOp.
+template <>
+struct Strategy1d<TransferWriteOp> {
+  static void generateForLoopBody(OpBuilder &builder, Location loc,
+                                  TransferWriteOp xferOp, Value iv,
+                                  ValueRange /*loopState*/) {
+    SmallVector<Value, 8> indices;
+    auto dim = get1dMemrefIndices(xferOp, iv, indices);
+    auto ivI32 = std_index_cast(IntegerType::get(builder.getContext(), 32), iv);
+
+    // Nothing to do in case of out-of-bounds access.
+    generateInBoundsCheck(
+        xferOp, iv, builder, dim,
+        /*inBoundsCase=*/[&](OpBuilder & /*b*/, Location loc) {
+          auto val = vector_extract_element(xferOp.vector(), ivI32.value);
+          memref_store(val, xferOp.source(), indices);
+        });
+    builder.create<scf::YieldOp>(loc);
+  }
 
-  // Fall back to a loop if the fastest varying stride is not 1 or it is
-  // permuted.
+  static Value initialLoopState(TransferWriteOp xferOp) { return Value(); }
+};
+
+/// Return true if the last dimension of the MemRefType has unit stride.
+static bool isLastMemrefDimUnitStride(MemRefType type) {
   int64_t offset;
   SmallVector<int64_t, 4> strides;
-  auto successStrides = getStridesAndOffset(memRefType, strides, offset);
-  if (succeeded(successStrides) && strides.back() == 1 &&
-      transfer.permutation_map().isMinorIdentity()) {
-    // If > 1D, emit a bunch of loops around 1-D vector transfers.
-    if (transfer.getVectorType().getRank() > 1)
-      return NDTransferOpHelper<TransferWriteOp>(rewriter, transfer, options)
-          .doReplace();
-    // If 1-D this is now handled by the target-specific lowering.
-    if (transfer.getVectorType().getRank() == 1)
+  auto successStrides = getStridesAndOffset(type, strides, offset);
+  return succeeded(successStrides) && strides.back() == 1;
+}
+
+/// Lower a 1D vector transfer op to SCF using scalar loads/stores. This is
+/// necessary in cases where a 1D vector transfer op cannot be lowered into
+/// vector load/stores due to non-unit strides or broadcasts:
+///
+/// * Transfer dimension is not the last memref dimension
+/// * Transfer dimension is a broadcast (i.e., scalar load + broadcast)
+/// * Memref has a layout map with non-unit stride on the last dimension
+///
+/// This pattern generates IR as follows:
+///
+/// 1. Generate a for loop iterating over each vector element.
+/// 2. Inside the loop, generate an InsertElementOp or ExtractElementOp,
+///    depending on OpTy.
+///
+/// TODO: In some cases (no masking, etc.), LLVM::MatrixColumnMajorLoadOp
+///       can be generated instead of TransferOp1dConversion. Add such a pattern
+///       to ConvertVectorToLLVM.
+///
+/// E.g.:
+/// ```
+/// vector.transfer_write %vec, %A[%a, %b]
+///    {permutation_map = affine_map<(d0, d1) -> (d0)>, in_bounds = [true]}
+///    : vector<9xf32>, memref<?x?xf32>
+/// ```
+/// is rewritten to approximately the following pseudo-IR:
+/// ```
+/// for i = 0 to 9 {
+///   %t = vector.extractelement %vec[i] : vector<9xf32>
+///   memref.store %t, %arg0[%a + i, %b] : memref<?x?xf32>
+/// }
+/// ```
+template <typename OpTy>
+struct TransferOp1dConversion : public OpRewritePattern<OpTy> {
+  using OpRewritePattern<OpTy>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(OpTy xferOp,
+                                PatternRewriter &rewriter) const override {
+    ScopedContext scope(rewriter, xferOp.getLoc());
+    auto map = xferOp.permutation_map();
+    auto memRefType = xferOp.getShapedType().template dyn_cast<MemRefType>();
+
+    if (!memRefType)
       return failure();
+    if (xferOp.getVectorType().getRank() != 1)
+      return failure();
+    if (map.isMinorIdentity() && isLastMemrefDimUnitStride(memRefType))
+      return failure(); // Handled by ConvertVectorToLLVM
+
+    // Loop bounds, step, state...
+    auto vecType = xferOp.getVectorType();
+    auto lb = std_constant_index(0);
+    auto ub = std_constant_index(vecType.getDimSize(0));
+    auto step = std_constant_index(1);
+    auto loopState = Strategy1d<OpTy>::initialLoopState(xferOp);
+
+    // Generate for loop.
+    rewriter.replaceOpWithNewOp<scf::ForOp>(
+        xferOp, lb, ub, step, loopState ? ValueRange(loopState) : ValueRange(),
+        [&](OpBuilder &builder, Location loc, Value iv, ValueRange loopState) {
+          ScopedContext nestedScope(builder, loc);
+          Strategy1d<OpTy>::generateForLoopBody(builder, loc, xferOp, iv,
+                                                loopState);
+        });
+
+    return success();
   }
+};
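
For instance, a 1-D read from a memref whose last dimension has a non-unit
stride (a hypothetical example with the same shape as the transfer_read_strided
test below) fails the minor-identity/unit-stride check and is therefore lowered
by this pattern to a scalar loop:

#strided = affine_map<(d0, d1) -> (d0 + d1 * 8)>

func @read_strided(%A : memref<8x4xf32, #strided>) -> vector<4xf32> {
  %c0 = constant 0 : index
  %f0 = constant 0.0 : f32
  // The last dimension of %A has stride 8, so this cannot become a plain
  // vector load; it becomes an scf.for over scalar memref.load ops.
  %0 = vector.transfer_read %A[%c0, %c0], %f0
      : memref<8x4xf32, #strided>, vector<4xf32>
  return %0 : vector<4xf32>
}
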
 
-  // 1. Setup all the captures.
-  ScopedContext scope(rewriter, transfer.getLoc());
-  MemRefIndexedValue remote(transfer.source());
-  MemRefBoundsCapture memRefBoundsCapture(transfer.source());
-  Value vectorValue(transfer.vector());
-  VectorBoundsCapture vectorBoundsCapture(transfer.vector());
-  int coalescedIdx = computeCoalescedIndex(transfer);
-  // Swap the vectorBoundsCapture which will reorder loop bounds.
-  if (coalescedIdx >= 0)
-    vectorBoundsCapture.swapRanges(vectorBoundsCapture.rank() - 1,
-                                   coalescedIdx);
-
-  auto lbs = vectorBoundsCapture.getLbs();
-  auto ubs = vectorBoundsCapture.getUbs();
-  SmallVector<Value, 8> steps;
-  steps.reserve(vectorBoundsCapture.getSteps().size());
-  for (auto step : vectorBoundsCapture.getSteps())
-    steps.push_back(std_constant_index(step));
-
-  // 2. Emit alloc-store-copy-dealloc.
-  Value tmp = setAllocAtFunctionEntry(tmpMemRefType(transfer), transfer);
-  MemRefIndexedValue local(tmp);
-  Value vec = vector_type_cast(tmp);
-  memref_store(vectorValue, vec);
-  loopNestBuilder(lbs, ubs, steps, [&](ValueRange loopIvs) {
-    auto ivsStorage = llvm::to_vector<8>(loopIvs);
-    // Swap the ivsStorage which will reorder memory accesses.
-    if (coalescedIdx >= 0)
-      std::swap(ivsStorage.back(), ivsStorage[coalescedIdx]);
-
-    ArrayRef<Value> ivs(ivsStorage);
-    Value pos =
-        std_index_cast(IntegerType::get(op->getContext(), 32), ivs.back());
-    auto storeValue = [&](ArrayRef<Value> indices) {
-      Value scalar = vector_extract_element(local(ivs.drop_back()), pos);
-      remote(indices) = scalar;
-    };
-    emitWithBoundsChecks(
-        rewriter, cast<VectorTransferOpInterface>(transfer.getOperation()), ivs,
-        memRefBoundsCapture, storeValue);
-  });
-
-  // 3. Erase.
-  rewriter.eraseOp(op);
-  return success();
-}
+} // namespace
+
+namespace mlir {
 
 void populateVectorToSCFConversionPatterns(
     RewritePatternSet &patterns, const VectorTransferToSCFOptions &options) {
-  patterns.add<VectorTransferRewriter<vector::TransferReadOp>,
-               VectorTransferRewriter<vector::TransferWriteOp>>(
-      options, patterns.getContext());
+  if (options.unroll) {
+    patterns.add<UnrollTransferReadConversion, UnrollTransferWriteConversion>(
+        patterns.getContext());
+  } else {
+    patterns.add<PrepareTransferReadConversion, PrepareTransferWriteConversion,
+                 TransferOpConversion<TransferReadOp>,
+                 TransferOpConversion<TransferWriteOp>>(patterns.getContext());
+  }
+
+  if (kTargetRank == 1) {
+    patterns.add<TransferOp1dConversion<TransferReadOp>,
+                 TransferOp1dConversion<TransferWriteOp>>(
+        patterns.getContext());
+  }
 }
 
 } // namespace mlir

diff  --git a/mlir/test/Conversion/VectorToSCF/progressive-vector-to-loops.mlir b/mlir/test/Conversion/VectorToSCF/progressive-vector-to-loops.mlir
deleted file mode 100644
index 75ee49d75fec8..0000000000000
--- a/mlir/test/Conversion/VectorToSCF/progressive-vector-to-loops.mlir
+++ /dev/null
@@ -1,467 +0,0 @@
-// RUN: mlir-opt %s -test-progressive-convert-vector-to-scf -split-input-file -allow-unregistered-dialect | FileCheck %s
-// RUN: mlir-opt %s -test-unrolled-progressive-convert-vector-to-scf -split-input-file -allow-unregistered-dialect | FileCheck %s --check-prefix=FULL-UNROLL
-
-// CHECK-LABEL: func @materialize_read_1d() {
-func @materialize_read_1d() {
-  %f0 = constant 0.0: f32
-  %A = memref.alloc () : memref<7x42xf32>
-  affine.for %i0 = 0 to 7 step 4 {
-    affine.for %i1 = 0 to 42 step 4 {
-      %f1 = vector.transfer_read %A[%i0, %i1], %f0 {permutation_map = affine_map<(d0, d1) -> (d0)>} : memref<7x42xf32>, vector<4xf32>
-      %ip1 = affine.apply affine_map<(d0) -> (d0 + 1)> (%i1)
-      %f2 = vector.transfer_read %A[%i0, %ip1], %f0 {permutation_map = affine_map<(d0, d1) -> (d0)>} : memref<7x42xf32>, vector<4xf32>
-      %ip2 = affine.apply affine_map<(d0) -> (d0 + 2)> (%i1)
-      %f3 = vector.transfer_read %A[%i0, %ip2], %f0 {permutation_map = affine_map<(d0, d1) -> (d0)>} : memref<7x42xf32>, vector<4xf32>
-      %ip3 = affine.apply affine_map<(d0) -> (d0 + 3)> (%i1)
-      %f4 = vector.transfer_read %A[%i0, %ip3], %f0 {permutation_map = affine_map<(d0, d1) -> (d0)>} : memref<7x42xf32>, vector<4xf32>
-      // Both accesses in the load must be clipped otherwise %i1 + 2 and %i1 + 3 will go out of bounds.
-      // CHECK: scf.if
-      // CHECK-NEXT: memref.load
-      // CHECK-NEXT: vector.insertelement
-      // CHECK-NEXT: scf.yield
-      // CHECK-NEXT: else
-      // CHECK-NEXT: scf.yield
-      // Add a dummy use to prevent dead code elimination from removing transfer
-      // read ops.
-      "dummy_use"(%f1, %f2, %f3, %f4) : (vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>) -> ()
-    }
-  }
-  return
-}
-
-// -----
-
-// CHECK-LABEL: func @materialize_read_1d_partially_specialized
-func @materialize_read_1d_partially_specialized(%dyn1 : index, %dyn2 : index, %dyn4 : index) {
-  %f0 = constant 0.0: f32
-  %A = memref.alloc (%dyn1, %dyn2, %dyn4) : memref<7x?x?x42x?xf32>
-  affine.for %i0 = 0 to 7 {
-    affine.for %i1 = 0 to %dyn1 {
-      affine.for %i2 = 0 to %dyn2 {
-        affine.for %i3 = 0 to 42 step 2 {
-          affine.for %i4 = 0 to %dyn4 {
-            %f1 = vector.transfer_read %A[%i0, %i1, %i2, %i3, %i4], %f0 {permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d3)>} : memref<7x?x?x42x?xf32>, vector<4xf32>
-            %i3p1 = affine.apply affine_map<(d0) -> (d0 + 1)> (%i3)
-            %f2 = vector.transfer_read %A[%i0, %i1, %i2, %i3p1, %i4], %f0 {permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d3)>} : memref<7x?x?x42x?xf32>, vector<4xf32>
-            // Add a dummy use to prevent dead code elimination from removing
-            // transfer read ops.
-            "dummy_use"(%f1, %f2) : (vector<4xf32>, vector<4xf32>) -> ()
-          }
-        }
-      }
-    }
-  }
-  // CHECK: %[[tensor:[0-9]+]] = memref.alloc
-  // CHECK-NOT: {{.*}} memref.dim %[[tensor]], %c0
-  // CHECK-NOT: {{.*}} memref.dim %[[tensor]], %c3
-  return
-}
-
-// -----
-
-// CHECK: #[[$ADD:map.*]] = affine_map<(d0, d1) -> (d0 + d1)>
-
-// CHECK-LABEL: func @materialize_read(%{{.*}}: index, %{{.*}}: index, %{{.*}}: index, %{{.*}}: index) {
-func @materialize_read(%M: index, %N: index, %O: index, %P: index) {
-  %f0 = constant 0.0: f32
-  // CHECK-DAG:  %[[ALLOC:.*]] = memref.alloca() : memref<vector<5x4x3xf32>>
-  // CHECK-DAG:  %[[C0:.*]] = constant 0 : index
-  // CHECK-DAG:  %[[C1:.*]] = constant 1 : index
-  // CHECK-DAG:  %[[C3:.*]] = constant 3 : index
-  // CHECK-DAG:  %[[C4:.*]] = constant 4 : index
-  // CHECK-DAG:  %[[C5:.*]] = constant 5 : index
-  // CHECK:      %{{.*}} = memref.alloc(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : memref<?x?x?x?xf32>
-  // CHECK-NEXT:  affine.for %[[I0:.*]] = 0 to %{{.*}} step 3 {
-  // CHECK-NEXT:    affine.for %[[I1:.*]] = 0 to %{{.*}} {
-  // CHECK-NEXT:      affine.for %[[I2:.*]] = 0 to %{{.*}} {
-  // CHECK-NEXT:        affine.for %[[I3:.*]] = 0 to %{{.*}} step 5 {
-  // CHECK:               scf.for %[[I4:.*]] = %[[C0]] to %[[C5]] step %[[C1]] {
-  // CHECK:                 scf.if
-  // CHECK:                   %[[L3:.*]] = affine.apply #[[$ADD]](%[[I3]], %[[I4]])
-  // CHECK:                   scf.for %[[I5:.*]] = %[[C0]] to %[[C4]] step %[[C1]] {
-  // CHECK:                     %[[VEC:.*]] = scf.for %[[I6:.*]] = %[[C0]] to %[[C3]] step %[[C1]] {{.*}} -> (vector<3xf32>) {
-  // CHECK:                       %[[L0:.*]] = affine.apply #[[$ADD]](%[[I0]], %[[I6]])
-  // CHECK:                       %[[VIDX:.*]] = index_cast %[[I6]]
-  // CHECK:                       scf.if {{.*}} -> (vector<3xf32>) {
-  // CHECK-NEXT:                    %[[SCAL:.*]] = memref.load %{{.*}}[%[[L0]], %[[I1]], %[[I2]], %[[L3]]] : memref<?x?x?x?xf32>
-  // CHECK-NEXT:                    %[[RVEC:.*]] = vector.insertelement %[[SCAL]], %{{.*}}[%[[VIDX]] : i32] : vector<3xf32>
-  // CHECK-NEXT:                    scf.yield
-  // CHECK-NEXT:                  } else {
-  // CHECK-NEXT:                    scf.yield
-  // CHECK-NEXT:                  }
-  // CHECK-NEXT:                  scf.yield
-  // CHECK-NEXT:                }
-  // CHECK-NEXT:                memref.store %[[VEC]], {{.*}} : memref<5x4xvector<3xf32>>
-  // CHECK-NEXT:              }
-  // CHECK-NEXT:            } else {
-  // CHECK-NEXT:              memref.store {{.*}} : memref<5xvector<4x3xf32>>
-  // CHECK-NEXT:            }
-  // CHECK-NEXT:          }
-  // CHECK-NEXT:          %[[LD:.*]] = memref.load %[[ALLOC]][] : memref<vector<5x4x3xf32>>
-  // CHECK-NEXT:          "dummy_use"(%[[LD]]) : (vector<5x4x3xf32>) -> ()
-  // CHECK-NEXT:        }
-  // CHECK-NEXT:      }
-  // CHECK-NEXT:    }
-  // CHECK-NEXT:  }
-  // CHECK-NEXT:  return
-  // CHECK-NEXT:}
-
-  // Check that I0 + I4 (of size 3) read from first index load(L0, ...) and write into last index store(..., I4)
-  // Check that I3 + I6 (of size 5) read from last index load(..., L3) and write into first index store(I6, ...)
-  // Other dimensions are just accessed with I1, I2 resp.
-  %A = memref.alloc (%M, %N, %O, %P) : memref<?x?x?x?xf32, 0>
-  affine.for %i0 = 0 to %M step 3 {
-    affine.for %i1 = 0 to %N {
-      affine.for %i2 = 0 to %O {
-        affine.for %i3 = 0 to %P step 5 {
-          %f = vector.transfer_read %A[%i0, %i1, %i2, %i3], %f0 {permutation_map = affine_map<(d0, d1, d2, d3) -> (d3, 0, d0)>} : memref<?x?x?x?xf32>, vector<5x4x3xf32>
-          // Add a dummy use to prevent dead code elimination from removing
-          // transfer read ops.
-          "dummy_use"(%f) : (vector<5x4x3xf32>) -> ()
-        }
-      }
-    }
-  }
-  return
-}
-
-// -----
-
-// CHECK: #[[$ADD:map.*]] = affine_map<(d0, d1) -> (d0 + d1)>
-
-// CHECK-LABEL:func @materialize_write(%{{.*}}: index, %{{.*}}: index, %{{.*}}: index, %{{.*}}: index) {
-func @materialize_write(%M: index, %N: index, %O: index, %P: index) {
-  // CHECK-DAG:  %[[ALLOC:.*]] = memref.alloca() : memref<vector<5x4x3xf32>>
-  // CHECK-DAG:  %{{.*}} = constant dense<1.000000e+00> : vector<5x4x3xf32>
-  // CHECK-DAG:  %[[C0:.*]] = constant 0 : index
-  // CHECK-DAG:  %[[C1:.*]] = constant 1 : index
-  // CHECK-DAG:  %[[C3:.*]] = constant 3 : index
-  // CHECK-DAG:  %[[C4:.*]] = constant 4 : index
-  // CHECK-DAG:  %[[C5:.*]] = constant 5 : index
-  // CHECK:      %{{.*}} = memref.alloc(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : memref<?x?x?x?xf32>
-  // CHECK-NEXT: affine.for %[[I0:.*]] = 0 to %{{.*}} step 3 {
-  // CHECK-NEXT:   affine.for %[[I1:.*]] = 0 to %{{.*}} step 4 {
-  // CHECK-NEXT:     affine.for %[[I2:.*]] = 0 to %{{.*}} {
-  // CHECK-NEXT:       affine.for %[[I3:.*]] = 0 to %{{.*}} step 5 {
-  // CHECK:              memref.store %{{.*}}, %[[ALLOC]][] : memref<vector<5x4x3xf32>>
-  // CHECK:              %[[VECTOR_VIEW1:.*]] = vector.type_cast %[[ALLOC]] : memref<vector<5x4x3xf32>> to memref<5xvector<4x3xf32>>
-  // CHECK:              scf.for %[[I4:.*]] = %[[C0]] to %[[C5]] step %[[C1]] {
-  // CHECK:                scf.if
-  // CHECK:                  %[[S3:.*]] = affine.apply #[[$ADD]](%[[I3]], %[[I4]])
-  // CHECK:                  %[[VECTOR_VIEW2:.*]] = vector.type_cast %[[VECTOR_VIEW1]] : memref<5xvector<4x3xf32>> to memref<5x4xvector<3xf32>>
-  // CHECK:                  scf.for %[[I5:.*]] = %[[C0]] to %[[C4]] step %[[C1]] {
-  // CHECK:                    scf.if
-  // CHECK:                      %[[S1:.*]] = affine.apply #[[$ADD]](%[[I1]], %[[I5]])
-  // CHECK:                      %[[VEC:.*]] = memref.load %[[VECTOR_VIEW2]][%[[I4]], %[[I5]]] : memref<5x4xvector<3xf32>>
-  // CHECK:                      scf.for %[[I6:.*]] = %[[C0]] to %[[C3]] step %[[C1]] {
-  // CHECK:                        %[[S0:.*]] = affine.apply #[[$ADD]](%[[I0]], %[[I6]])
-  // CHECK:                        %[[VIDX:.*]] = index_cast %[[I6]]
-  // CHECK:                        scf.if
-  // CHECK:                          %[[SCAL:.*]] = vector.extractelement %[[VEC]][%[[VIDX]] : i32] : vector<3xf32>
-  // CHECK:                          memref.store %[[SCAL]], {{.*}}[%[[S0]], %[[S1]], %[[I2]], %[[S3]]] : memref<?x?x?x?xf32>
-  // CHECK:                        }
-  // CHECK:                      }
-  // CHECK:                    }
-  // CHECK:                  }
-  // CHECK:                }
-  // CHECK:              }
-  // CHECK:            }
-  // CHECK:          }
-  // CHECK:        }
-  // CHECK:      }
-  // CHECK:      return
-
-  // Check that I0 + I4 (of size 3) read from last index load(..., I4) and write into first index store(S0, ...)
-  // Check that I1 + I5 (of size 4) read from second index load(..., I5, ...) and write into second index store(..., S1, ...)
-  // Check that I3 + I6 (of size 5) read from first index load(I6, ...) and write into last index store(..., S3)
-  // Other dimension is just accessed with I2.
-  %A = memref.alloc (%M, %N, %O, %P) : memref<?x?x?x?xf32, 0>
-  %f1 = constant dense<1.000000e+00> : vector<5x4x3xf32>
-  affine.for %i0 = 0 to %M step 3 {
-    affine.for %i1 = 0 to %N step 4 {
-      affine.for %i2 = 0 to %O {
-        affine.for %i3 = 0 to %P step 5 {
-          vector.transfer_write %f1, %A[%i0, %i1, %i2, %i3] {permutation_map = affine_map<(d0, d1, d2, d3) -> (d3, d1, d0)>} : vector<5x4x3xf32>, memref<?x?x?x?xf32>
-        }
-      }
-    }
-  }
-  return
-}
-
-// -----
-
-// CHECK-DAG: #[[$MAP0:.*]] = affine_map<(d0)[s0] -> (d0 + s0)>
-
-// FULL-UNROLL-DAG: #[[$MAP1:.*]] = affine_map<()[s0] -> (s0 + 1)>
-// FULL-UNROLL-DAG: #[[$MAP2:.*]] = affine_map<()[s0] -> (s0 + 2)>
-
-
-// CHECK-LABEL: transfer_read_progressive(
-//  CHECK-SAME:   %[[A:[a-zA-Z0-9]+]]: memref<?x?xf32>,
-//  CHECK-SAME:   %[[base:[a-zA-Z0-9]+]]: index
-
-// FULL-UNROLL-LABEL: transfer_read_progressive(
-//  FULL-UNROLL-SAME:   %[[A:[a-zA-Z0-9]+]]: memref<?x?xf32>,
-//  FULL-UNROLL-SAME:   %[[base:[a-zA-Z0-9]+]]: index
-
-func @transfer_read_progressive(%A : memref<?x?xf32>, %base: index) -> vector<3x15xf32> {
-  %f7 = constant 7.0: f32
-  // CHECK-DAG: %[[C7:.*]] = constant 7.000000e+00 : f32
-  // CHECK-DAG: %[[C0:.*]] = constant 0 : index
-  // CHECK-DAG: %[[C1:.*]] = constant 1 : index
-  // CHECK-DAG: %[[C3:.*]] = constant 3 : index
-  // CHECK-DAG: %[[splat:.*]] = constant dense<7.000000e+00> : vector<15xf32>
-  // CHECK-DAG: %[[alloc:.*]] = memref.alloca() : memref<vector<3x15xf32>>
-  // CHECK:     %[[alloc_casted:.*]] = vector.type_cast %[[alloc]] : memref<vector<3x15xf32>> to memref<3xvector<15xf32>>
-  // CHECK:     scf.for %[[I:.*]] = %[[C0]] to %[[C3]]
-  // CHECK:       %[[dim:.*]] = memref.dim %[[A]], %[[C0]] : memref<?x?xf32>
-  // CHECK:       %[[add:.*]] = affine.apply #[[$MAP0]](%[[I]])[%[[base]]]
-  // CHECK:       %[[cond1:.*]] = cmpi sgt, %[[dim]], %[[add]] : index
-  // CHECK:       scf.if %[[cond1]] {
-  // CHECK:         %[[vec_1d:.*]] = vector.transfer_read %[[A]][%{{.*}}, %[[base]]], %[[C7]] : memref<?x?xf32>, vector<15xf32>
-  // CHECK:         memref.store %[[vec_1d]], %[[alloc_casted]][%[[I]]] : memref<3xvector<15xf32>>
-  // CHECK:       } else {
-  // CHECK:         store %[[splat]], %[[alloc_casted]][%[[I]]] : memref<3xvector<15xf32>>
-  // CHECK:       }
-  // CHECK:     }
-  // CHECK:     %[[cst:.*]] = memref.load %[[alloc]][] : memref<vector<3x15xf32>>
-
-  // FULL-UNROLL: %[[C7:.*]] = constant 7.000000e+00 : f32
-  // FULL-UNROLL: %[[VEC0:.*]] = constant dense<7.000000e+00> : vector<3x15xf32>
-  // FULL-UNROLL: %[[C0:.*]] = constant 0 : index
-  // FULL-UNROLL: %[[DIM:.*]] = memref.dim %[[A]], %[[C0]] : memref<?x?xf32>
-  // FULL-UNROLL: cmpi sgt, %[[DIM]], %[[base]] : index
-  // FULL-UNROLL: %[[VEC1:.*]] = scf.if %{{.*}} -> (vector<3x15xf32>) {
-  // FULL-UNROLL:   vector.transfer_read %[[A]][%[[base]], %[[base]]], %[[C7]] : memref<?x?xf32>, vector<15xf32>
-  // FULL-UNROLL:   vector.insert %{{.*}}, %[[VEC0]] [0] : vector<15xf32> into vector<3x15xf32>
-  // FULL-UNROLL:   scf.yield %{{.*}} : vector<3x15xf32>
-  // FULL-UNROLL: } else {
-  // FULL-UNROLL:   scf.yield %{{.*}} : vector<3x15xf32>
-  // FULL-UNROLL: }
-  // FULL-UNROLL: affine.apply #[[$MAP1]]()[%[[base]]]
-  // FULL-UNROLL: cmpi sgt, %{{.*}}, %{{.*}} : index
-  // FULL-UNROLL: %[[VEC2:.*]] = scf.if %{{.*}} -> (vector<3x15xf32>) {
-  // FULL-UNROLL:   vector.transfer_read %[[A]][%{{.*}}, %[[base]]], %[[C7]] : memref<?x?xf32>, vector<15xf32>
-  // FULL-UNROLL:   vector.insert %{{.*}}, %[[VEC1]] [1] : vector<15xf32> into vector<3x15xf32>
-  // FULL-UNROLL:   scf.yield %{{.*}} : vector<3x15xf32>
-  // FULL-UNROLL: } else {
-  // FULL-UNROLL:   scf.yield %{{.*}} : vector<3x15xf32>
-  // FULL-UNROLL: }
-  // FULL-UNROLL: affine.apply #[[$MAP2]]()[%[[base]]]
-  // FULL-UNROLL: cmpi sgt, %{{.*}}, %{{.*}} : index
-  // FULL-UNROLL: %[[VEC3:.*]] = scf.if %{{.*}} -> (vector<3x15xf32>) {
-  // FULL-UNROLL:   vector.transfer_read %[[A]][%{{.*}}, %[[base]]], %[[C7]] : memref<?x?xf32>, vector<15xf32>
-  // FULL-UNROLL:   vector.insert %{{.*}}, %[[VEC2]] [2] : vector<15xf32> into vector<3x15xf32>
-  // FULL-UNROLL:   scf.yield %{{.*}} : vector<3x15xf32>
-  // FULL-UNROLL: } else {
-  // FULL-UNROLL:   scf.yield %{{.*}} : vector<3x15xf32>
-  // FULL-UNROLL: }
-
-  %f = vector.transfer_read %A[%base, %base], %f7 :
-    memref<?x?xf32>, vector<3x15xf32>
-
-  return %f: vector<3x15xf32>
-}
-
-// -----
-
-// CHECK-DAG: #[[$MAP0:.*]] = affine_map<(d0)[s0] -> (d0 + s0)>
-
-// FULL-UNROLL-DAG: #[[$MAP1:.*]] = affine_map<()[s0] -> (s0 + 1)>
-// FULL-UNROLL-DAG: #[[$MAP2:.*]] = affine_map<()[s0] -> (s0 + 2)>
-
-// CHECK-LABEL: transfer_write_progressive(
-//  CHECK-SAME:   %[[A:[a-zA-Z0-9]+]]: memref<?x?xf32>,
-//  CHECK-SAME:   %[[base:[a-zA-Z0-9]+]]: index,
-//  CHECK-SAME:   %[[vec:[a-zA-Z0-9]+]]: vector<3x15xf32>
-// FULL-UNROLL-LABEL: transfer_write_progressive(
-//  FULL-UNROLL-SAME:   %[[A:[a-zA-Z0-9]+]]: memref<?x?xf32>,
-//  FULL-UNROLL-SAME:   %[[base:[a-zA-Z0-9]+]]: index,
-//  FULL-UNROLL-SAME:   %[[vec:[a-zA-Z0-9]+]]: vector<3x15xf32>
-func @transfer_write_progressive(%A : memref<?x?xf32>, %base: index, %vec: vector<3x15xf32>) {
-  // CHECK-DAG: %[[C0:.*]] = constant 0 : index
-  // CHECK-DAG: %[[C1:.*]] = constant 1 : index
-  // CHECK-DAG: %[[C3:.*]] = constant 3 : index
-  // CHECK:     %[[alloc:.*]] = memref.alloca() : memref<vector<3x15xf32>>
-  // CHECK:     memref.store %[[vec]], %[[alloc]][] : memref<vector<3x15xf32>>
-  // CHECK:     %[[vmemref:.*]] = vector.type_cast %[[alloc]] : memref<vector<3x15xf32>> to memref<3xvector<15xf32>>
-  // CHECK:     scf.for %[[I:.*]] = %[[C0]] to %[[C3]]
-  // CHECK:       %[[dim:.*]] = memref.dim %[[A]], %[[C0]] : memref<?x?xf32>
-  // CHECK:       %[[add:.*]] = affine.apply #[[$MAP0]](%[[I]])[%[[base]]]
-  // CHECK:       %[[cmp:.*]] = cmpi sgt, %[[dim]], %[[add]] : index
-  // CHECK:       scf.if %[[cmp]] {
-  // CHECK:         %[[vec_1d:.*]] = memref.load %[[vmemref]][%[[I]]] : memref<3xvector<15xf32>>
-  // CHECK:         vector.transfer_write %[[vec_1d]], %[[A]][{{.*}}, %[[base]]] : vector<15xf32>, memref<?x?xf32>
-  // CHECK:       }
-  // CHECK:     }
-
-  // FULL-UNROLL: %[[C0:.*]] = constant 0 : index
-  // FULL-UNROLL: %[[DIM:.*]] = memref.dim %[[A]], %[[C0]] : memref<?x?xf32>
-  // FULL-UNROLL: %[[CMP0:.*]] = cmpi sgt, %[[DIM]], %[[base]] : index
-  // FULL-UNROLL: scf.if %[[CMP0]] {
-  // FULL-UNROLL:   %[[V0:.*]] = vector.extract %[[vec]][0] : vector<3x15xf32>
-  // FULL-UNROLL:   vector.transfer_write %[[V0]], %[[A]][%[[base]], %[[base]]] : vector<15xf32>, memref<?x?xf32>
-  // FULL-UNROLL: }
-  // FULL-UNROLL: %[[I1:.*]] = affine.apply #[[$MAP1]]()[%[[base]]]
-  // FULL-UNROLL: %[[CMP1:.*]] = cmpi sgt, %{{.*}}, %[[I1]] : index
-  // FULL-UNROLL: scf.if %[[CMP1]] {
-  // FULL-UNROLL:   %[[V1:.*]] = vector.extract %[[vec]][1] : vector<3x15xf32>
-  // FULL-UNROLL:   vector.transfer_write %[[V1]], %[[A]][%{{.*}}, %[[base]]] : vector<15xf32>, memref<?x?xf32>
-  // FULL-UNROLL: }
-  // FULL-UNROLL: %[[I2:.*]] = affine.apply #[[$MAP2]]()[%[[base]]]
-  // FULL-UNROLL: %[[CMP2:.*]] = cmpi sgt, %{{.*}}, %[[I2]] : index
-  // FULL-UNROLL: scf.if %[[CMP2]] {
-  // FULL-UNROLL:   %[[V2:.*]] = vector.extract %[[vec]][2] : vector<3x15xf32>
-  // FULL-UNROLL:   vector.transfer_write %[[V2]], %[[A]][%{{.*}}, %[[base]]] : vector<15xf32>, memref<?x?xf32>
-  // FULL-UNROLL: }
-
-  vector.transfer_write %vec, %A[%base, %base] :
-    vector<3x15xf32>, memref<?x?xf32>
-  return
-}
-
-// -----
-
-// CHECK-DAG: #[[$MAP0:.*]] = affine_map<(d0)[s0] -> (d0 + s0)>
-
-// FULL-UNROLL-DAG: #[[$MAP1:.*]] = affine_map<()[s0] -> (s0 + 1)>
-// FULL-UNROLL-DAG: #[[$MAP2:.*]] = affine_map<()[s0] -> (s0 + 2)>
-
-// CHECK-LABEL: transfer_write_progressive_inbounds(
-//  CHECK-SAME:   %[[A:[a-zA-Z0-9]+]]: memref<?x?xf32>,
-//  CHECK-SAME:   %[[base:[a-zA-Z0-9]+]]: index,
-//  CHECK-SAME:   %[[vec:[a-zA-Z0-9]+]]: vector<3x15xf32>
-// FULL-UNROLL-LABEL: transfer_write_progressive_inbounds(
-//  FULL-UNROLL-SAME:   %[[A:[a-zA-Z0-9]+]]: memref<?x?xf32>,
-//  FULL-UNROLL-SAME:   %[[base:[a-zA-Z0-9]+]]: index,
-//  FULL-UNROLL-SAME:   %[[vec:[a-zA-Z0-9]+]]: vector<3x15xf32>
-func @transfer_write_progressive_inbounds(%A : memref<?x?xf32>, %base: index, %vec: vector<3x15xf32>) {
-  // CHECK-NOT:    scf.if
-  // CHECK-DAG:  %[[C0:.*]] = constant 0 : index
-  // CHECK-DAG:  %[[C3:.*]] = constant 3 : index
-  // CHECK:      %[[alloc:.*]] = memref.alloca() : memref<vector<3x15xf32>>
-  // CHECK-NEXT: memref.store %[[vec]], %[[alloc]][] : memref<vector<3x15xf32>>
-  // CHECK-NEXT: %[[vmemref:.*]] = vector.type_cast %[[alloc]] : memref<vector<3x15xf32>> to memref<3xvector<15xf32>>
-  // CHECK-NEXT: scf.for %[[I:.*]] = %[[C0]] to %[[C3]]
-  // CHECK-NEXT:   %[[add:.*]] = affine.apply #[[$MAP0]](%[[I]])[%[[base]]]
-  // CHECK-NEXT:   %[[vec_1d:.*]] = memref.load %[[vmemref]][%[[I]]] : memref<3xvector<15xf32>>
-  // CHECK-NEXT:   vector.transfer_write %[[vec_1d]], %[[A]][%[[add]], %[[base]]] {in_bounds = [true]} : vector<15xf32>, memref<?x?xf32>
-
-  // FULL-UNROLL: %[[VEC0:.*]] = vector.extract %[[vec]][0] : vector<3x15xf32>
-  // FULL-UNROLL: vector.transfer_write %[[VEC0]], %[[A]][%[[base]], %[[base]]] {in_bounds = [true]} : vector<15xf32>, memref<?x?xf32>
-  // FULL-UNROLL: %[[I1:.*]] = affine.apply #[[$MAP1]]()[%[[base]]]
-  // FULL-UNROLL: %[[VEC1:.*]] = vector.extract %[[vec]][1] : vector<3x15xf32>
-  // FULL-UNROLL: vector.transfer_write %2, %[[A]][%[[I1]], %[[base]]] {in_bounds = [true]} : vector<15xf32>, memref<?x?xf32>
-  // FULL-UNROLL: %[[I2:.*]] = affine.apply #[[$MAP2]]()[%[[base]]]
-  // FULL-UNROLL: %[[VEC2:.*]] = vector.extract %[[vec]][2] : vector<3x15xf32>
-  // FULL-UNROLL: vector.transfer_write %[[VEC2:.*]], %[[A]][%[[I2]], %[[base]]] {in_bounds = [true]} : vector<15xf32>, memref<?x?xf32>
-  vector.transfer_write %vec, %A[%base, %base] {in_bounds = [true, true]} :
-    vector<3x15xf32>, memref<?x?xf32>
-  return
-}
-
-// -----
-
-// FULL-UNROLL-LABEL: transfer_read_simple
-func @transfer_read_simple(%A : memref<2x2xf32>) -> vector<2x2xf32> {
-  %c0 = constant 0 : index
-  %f0 = constant 0.0 : f32
-  // FULL-UNROLL-DAG: %[[VC0:.*]] = constant dense<0.000000e+00> : vector<2x2xf32>
-  // FULL-UNROLL-DAG: %[[C0:.*]] = constant 0 : index
-  // FULL-UNROLL-DAG: %[[C1:.*]] = constant 1 : index
-  // FULL-UNROLL: %[[V0:.*]] = vector.transfer_read %{{.*}}[%[[C0]], %[[C0]]]
-  // FULL-UNROLL: %[[RES0:.*]] = vector.insert %[[V0]], %[[VC0]] [0] : vector<2xf32> into vector<2x2xf32>
-  // FULL-UNROLL: %[[V1:.*]] = vector.transfer_read %{{.*}}[%[[C1]], %[[C0]]]
-  // FULL-UNROLL: %[[RES1:.*]] = vector.insert %[[V1]], %[[RES0]] [1] : vector<2xf32> into vector<2x2xf32>
-  %0 = vector.transfer_read %A[%c0, %c0], %f0 : memref<2x2xf32>, vector<2x2xf32>
-  return %0 : vector<2x2xf32>
-}
-
-func @transfer_read_minor_identity(%A : memref<?x?x?x?xf32>) -> vector<3x3xf32> {
-  %c0 = constant 0 : index
-  %f0 = constant 0.0 : f32
-  %0 = vector.transfer_read %A[%c0, %c0, %c0, %c0], %f0
-    { permutation_map = affine_map<(d0, d1, d2, d3) -> (d2, d3)> }
-      : memref<?x?x?x?xf32>, vector<3x3xf32>
-  return %0 : vector<3x3xf32>
-}
-
-// CHECK-LABEL: transfer_read_minor_identity(
-//  CHECK-SAME: %[[A:.*]]: memref<?x?x?x?xf32>) -> vector<3x3xf32>
-//  CHECK-DAG:    %[[c0:.*]] = constant 0 : index
-//  CHECK-DAG:    %[[c1:.*]] = constant 1 : index
-//  CHECK-DAG:    %[[c2:.*]] = constant 2 : index
-//  CHECK-DAG:    %[[c3:.*]] = constant 3 : index
-//  CHECK-DAG:    %[[f0:.*]] = constant 0.000000e+00 : f32
-//  CHECK-DAG:    %[[cst0:.*]] = constant dense<0.000000e+00> : vector<3xf32>
-//  CHECK:        %[[m:.*]] = memref.alloca() : memref<vector<3x3xf32>>
-//  CHECK:        %[[cast:.*]] = vector.type_cast %[[m]] : memref<vector<3x3xf32>> to memref<3xvector<3xf32>>
-//  CHECK:        scf.for %[[arg1:.*]] = %[[c0]] to %[[c3]]
-//  CHECK:          %[[d:.*]] = memref.dim %[[A]], %[[c2]] : memref<?x?x?x?xf32>
-//  CHECK:          %[[cmp:.*]] = cmpi sgt, %[[d]], %[[arg1]] : index
-//  CHECK:          scf.if %[[cmp]] {
-//  CHECK:            %[[tr:.*]] = vector.transfer_read %[[A]][%c0, %c0, %[[arg1]], %c0], %[[f0]] : memref<?x?x?x?xf32>, vector<3xf32>
-//  CHECK:            memref.store %[[tr]], %[[cast]][%[[arg1]]] : memref<3xvector<3xf32>>
-//  CHECK:          } else {
-//  CHECK:            memref.store %[[cst0]], %[[cast]][%[[arg1]]] : memref<3xvector<3xf32>>
-//  CHECK:          }
-//  CHECK:        }
-//  CHECK:        %[[ret:.*]]  = memref.load %[[m]][] : memref<vector<3x3xf32>>
-//  CHECK:        return %[[ret]] : vector<3x3xf32>
-
-func @transfer_write_minor_identity(%A : vector<3x3xf32>, %B : memref<?x?x?x?xf32>) {
-  %c0 = constant 0 : index
-  %f0 = constant 0.0 : f32
-  vector.transfer_write %A, %B[%c0, %c0, %c0, %c0]
-    { permutation_map = affine_map<(d0, d1, d2, d3) -> (d2, d3)> }
-      : vector<3x3xf32>, memref<?x?x?x?xf32>
-  return
-}
-
-// CHECK-LABEL: transfer_write_minor_identity(
-// CHECK-SAME:      %[[A:.*]]: vector<3x3xf32>,
-// CHECK-SAME:      %[[B:.*]]: memref<?x?x?x?xf32>)
-// CHECK-DAG:     %[[c0:.*]] = constant 0 : index
-// CHECK-DAG:     %[[c1:.*]] = constant 1 : index
-// CHECK-DAG:     %[[c2:.*]] = constant 2 : index
-// CHECK-DAG:     %[[c3:.*]] = constant 3 : index
-// CHECK:         %[[m:.*]] = memref.alloca() : memref<vector<3x3xf32>>
-// CHECK:         memref.store %[[A]], %[[m]][] : memref<vector<3x3xf32>>
-// CHECK:         %[[cast:.*]] = vector.type_cast %[[m]] : memref<vector<3x3xf32>> to memref<3xvector<3xf32>>
-// CHECK:         scf.for %[[arg2:.*]] = %[[c0]] to %[[c3]]
-// CHECK:           %[[d:.*]] = memref.dim %[[B]], %[[c2]] : memref<?x?x?x?xf32>
-// CHECK:           %[[cmp:.*]] = cmpi sgt, %[[d]], %[[arg2]] : index
-// CHECK:           scf.if %[[cmp]] {
-// CHECK:             %[[tmp:.*]] = memref.load %[[cast]][%[[arg2]]] : memref<3xvector<3xf32>>
-// CHECK:             vector.transfer_write %[[tmp]], %[[B]][%[[c0]], %[[c0]], %[[arg2]], %[[c0]]] : vector<3xf32>, memref<?x?x?x?xf32>
-// CHECK:           }
-// CHECK:         }
-// CHECK:         return
-
-
-// -----
-
-func @transfer_read_strided(%A : memref<8x4xf32, affine_map<(d0, d1) -> (d0 + d1 * 8)>>) -> vector<4xf32> {
-  %c0 = constant 0 : index
-  %f0 = constant 0.0 : f32
-  %0 = vector.transfer_read %A[%c0, %c0], %f0
-      : memref<8x4xf32, affine_map<(d0, d1) -> (d0 + d1 * 8)>>, vector<4xf32>
-  return %0 : vector<4xf32>
-}
-
-// CHECK-LABEL: transfer_read_strided(
-// CHECK: scf.for
-// CHECK: memref.load
-
-func @transfer_write_strided(%A : vector<4xf32>, %B : memref<8x4xf32, affine_map<(d0, d1) -> (d0 + d1 * 8)>>) {
-  %c0 = constant 0 : index
-  vector.transfer_write %A, %B[%c0, %c0] :
-    vector<4xf32>, memref<8x4xf32, affine_map<(d0, d1) -> (d0 + d1 * 8)>>
-  return
-}
-
-// CHECK-LABEL: transfer_write_strided(
-// CHECK: scf.for
-// CHECK: store
-

diff  --git a/mlir/test/Conversion/VectorToSCF/unrolled-vector-to-loops.mlir b/mlir/test/Conversion/VectorToSCF/unrolled-vector-to-loops.mlir
index f90d20a518a65..bd74ff05c2c32 100644
--- a/mlir/test/Conversion/VectorToSCF/unrolled-vector-to-loops.mlir
+++ b/mlir/test/Conversion/VectorToSCF/unrolled-vector-to-loops.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -test-unrolled-progressive-convert-vector-to-scf -split-input-file -allow-unregistered-dialect | FileCheck %s
+// RUN: mlir-opt %s -convert-vector-to-scf=full-unroll=true -split-input-file -allow-unregistered-dialect | FileCheck %s
 
 // CHECK-LABEL: func @transfer_read_inbounds
 func @transfer_read_inbounds(%A : memref<?x?x?xf32>) -> (vector<2x3x4xf32>) {

diff  --git a/mlir/test/Conversion/VectorToSCF/vector-to-loops.mlir b/mlir/test/Conversion/VectorToSCF/vector-to-loops.mlir
index d84f84c5ade6e..3dce006ab7833 100644
--- a/mlir/test/Conversion/VectorToSCF/vector-to-loops.mlir
+++ b/mlir/test/Conversion/VectorToSCF/vector-to-loops.mlir
@@ -18,10 +18,9 @@ func @materialize_read_1d() {
       // CHECK: scf.if
       // CHECK-NEXT: memref.load
       // CHECK-NEXT: vector.insertelement
-      // CHECK-NEXT: store
+      // CHECK-NEXT: scf.yield
       // CHECK-NEXT: else
-      // CHECK-NEXT: vector.insertelement
-      // CHECK-NEXT: store
+      // CHECK-NEXT: scf.yield
       // Add a dummy use to prevent dead code elimination from removing transfer
       // read ops.
       "dummy_use"(%f1, %f2, %f3, %f4) : (vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>) -> ()
@@ -65,37 +64,40 @@ func @materialize_read_1d_partially_specialized(%dyn1 : index, %dyn2 : index, %d
 // CHECK-LABEL: func @materialize_read(%{{.*}}: index, %{{.*}}: index, %{{.*}}: index, %{{.*}}: index) {
 func @materialize_read(%M: index, %N: index, %O: index, %P: index) {
   %f0 = constant 0.0: f32
-  // CHECK-DAG:  %[[ALLOC:.*]] = memref.alloca() : memref<5x4xvector<3xf32>>
+  // CHECK-DAG:  %[[ALLOC:.*]] = memref.alloca() : memref<vector<5x4x3xf32>>
   // CHECK-DAG:  %[[C0:.*]] = constant 0 : index
   // CHECK-DAG:  %[[C1:.*]] = constant 1 : index
   // CHECK-DAG:  %[[C3:.*]] = constant 3 : index
   // CHECK-DAG:  %[[C4:.*]] = constant 4 : index
   // CHECK-DAG:  %[[C5:.*]] = constant 5 : index
-  //     CHECK:  %{{.*}} = memref.alloc(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : memref<?x?x?x?xf32>
+  // CHECK:      %{{.*}} = memref.alloc(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : memref<?x?x?x?xf32>
   // CHECK-NEXT:  affine.for %[[I0:.*]] = 0 to %{{.*}} step 3 {
   // CHECK-NEXT:    affine.for %[[I1:.*]] = 0 to %{{.*}} {
   // CHECK-NEXT:      affine.for %[[I2:.*]] = 0 to %{{.*}} {
   // CHECK-NEXT:        affine.for %[[I3:.*]] = 0 to %{{.*}} step 5 {
-  // CHECK-NEXT:          scf.for %[[I4:.*]] = %[[C0]] to %[[C3]] step %[[C1]] {
-  // CHECK-NEXT:            scf.for %[[I5:.*]] = %[[C0]] to %[[C4]] step %[[C1]] {
-  // CHECK-NEXT:              scf.for %[[I6:.*]] = %[[C0]] to %[[C5]] step %[[C1]] {
-  // CHECK:                     %[[VIDX:.*]] = index_cast %[[I4]]
-  // CHECK:                     %[[VEC:.*]] = memref.load %[[ALLOC]][%[[I6]], %[[I5]]] : memref<5x4xvector<3xf32>>
-  // CHECK:                     %[[L0:.*]] = affine.apply #[[$ADD]](%[[I0]], %[[I4]])
-  // CHECK:                     %[[L3:.*]] = affine.apply #[[$ADD]](%[[I3]], %[[I6]])
-  // CHECK-NEXT:                scf.if
-  // CHECK-NEXT:                  %[[SCAL:.*]] = memref.load %{{.*}}[%[[L0]], %[[I1]], %[[I2]], %[[L3]]] : memref<?x?x?x?xf32>
-  // CHECK-NEXT:                  %[[RVEC:.*]] = vector.insertelement %[[SCAL]], %[[VEC]][%[[VIDX]] : i32] : vector<3xf32>
-  // CHECK-NEXT:                  store %[[RVEC]], %[[ALLOC]][%[[I6]], %[[I5]]] : memref<5x4xvector<3xf32>>
-  // CHECK-NEXT:                } else {
-  // CHECK-NEXT:                  %[[CVEC:.*]] = vector.insertelement
-  // CHECK-NEXT:                  store %[[CVEC]], %[[ALLOC]][%[[I6]], %[[I5]]] : memref<5x4xvector<3xf32>>
+  // CHECK:               scf.for %[[I4:.*]] = %[[C0]] to %[[C5]] step %[[C1]] {
+  // CHECK:                 scf.if
+  // CHECK:                   %[[L3:.*]] = affine.apply #[[$ADD]](%[[I3]], %[[I4]])
+  // CHECK:                   scf.for %[[I5:.*]] = %[[C0]] to %[[C4]] step %[[C1]] {
+  // CHECK:                     %[[VEC:.*]] = scf.for %[[I6:.*]] = %[[C0]] to %[[C3]] step %[[C1]] {{.*}} -> (vector<3xf32>) {
+  // CHECK:                       %[[L0:.*]] = affine.apply #[[$ADD]](%[[I0]], %[[I6]])
+  // CHECK:                       %[[VIDX:.*]] = index_cast %[[I6]]
+  // CHECK:                       scf.if {{.*}} -> (vector<3xf32>) {
+  // CHECK-NEXT:                    %[[SCAL:.*]] = memref.load %{{.*}}[%[[L0]], %[[I1]], %[[I2]], %[[L3]]] : memref<?x?x?x?xf32>
+  // CHECK-NEXT:                    %[[RVEC:.*]] = vector.insertelement %[[SCAL]], %{{.*}}[%[[VIDX]] : i32] : vector<3xf32>
+  // CHECK-NEXT:                    scf.yield
+  // CHECK-NEXT:                  } else {
+  // CHECK-NEXT:                    scf.yield
+  // CHECK-NEXT:                  }
+  // CHECK-NEXT:                  scf.yield
   // CHECK-NEXT:                }
+  // CHECK-NEXT:                memref.store %[[VEC]], {{.*}} : memref<5x4xvector<3xf32>>
   // CHECK-NEXT:              }
+  // CHECK-NEXT:            } else {
+  // CHECK-NEXT:              memref.store {{.*}} : memref<5xvector<4x3xf32>>
   // CHECK-NEXT:            }
   // CHECK-NEXT:          }
-  // CHECK-NEXT:          %[[ALLOC_CAST:.*]] = vector.type_cast %[[ALLOC]] : memref<5x4xvector<3xf32>> to memref<vector<5x4x3xf32>>
-  // CHECK-NEXT:          %[[LD:.*]] = memref.load %[[ALLOC_CAST]][] : memref<vector<5x4x3xf32>>
+  // CHECK-NEXT:          %[[LD:.*]] = memref.load %[[ALLOC]][] : memref<vector<5x4x3xf32>>
   // CHECK-NEXT:          "dummy_use"(%[[LD]]) : (vector<5x4x3xf32>) -> ()
   // CHECK-NEXT:        }
   // CHECK-NEXT:      }
@@ -129,42 +131,46 @@ func @materialize_read(%M: index, %N: index, %O: index, %P: index) {
 
 // CHECK-LABEL:func @materialize_write(%{{.*}}: index, %{{.*}}: index, %{{.*}}: index, %{{.*}}: index) {
 func @materialize_write(%M: index, %N: index, %O: index, %P: index) {
-  // CHECK-DAG:  %[[ALLOC:.*]] = memref.alloca() : memref<5x4xvector<3xf32>>
+  // CHECK-DAG:  %[[ALLOC:.*]] = memref.alloca() : memref<vector<5x4x3xf32>>
   // CHECK-DAG:  %{{.*}} = constant dense<1.000000e+00> : vector<5x4x3xf32>
   // CHECK-DAG:  %[[C0:.*]] = constant 0 : index
   // CHECK-DAG:  %[[C1:.*]] = constant 1 : index
   // CHECK-DAG:  %[[C3:.*]] = constant 3 : index
   // CHECK-DAG:  %[[C4:.*]] = constant 4 : index
   // CHECK-DAG:  %[[C5:.*]] = constant 5 : index
-  //     CHECK:  %{{.*}} = memref.alloc(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : memref<?x?x?x?xf32>
-  // CHECK-NEXT:  affine.for %[[I0:.*]] = 0 to %{{.*}} step 3 {
-  // CHECK-NEXT:    affine.for %[[I1:.*]] = 0 to %{{.*}} step 4 {
-  // CHECK-NEXT:      affine.for %[[I2:.*]] = 0 to %{{.*}} {
-  // CHECK-NEXT:        affine.for %[[I3:.*]] = 0 to %{{.*}} step 5 {
-  // CHECK-NEXT:          %[[VECTOR_VIEW:.*]] = vector.type_cast {{.*}} : memref<5x4xvector<3xf32>>
-  //      CHECK:          store %{{.*}}, {{.*}} : memref<vector<5x4x3xf32>>
-  // CHECK-NEXT:          scf.for %[[I4:.*]] = %[[C0]] to %[[C3]] step %[[C1]] {
-  // CHECK-NEXT:            scf.for %[[I5:.*]] = %[[C0]] to %[[C4]] step %[[C1]] {
-  // CHECK-NEXT:              scf.for %[[I6:.*]] = %[[C0]] to %[[C5]] step %[[C1]] {
-  // CHECK:                     %[[VIDX:.*]] = index_cast %[[I4]]
-  // CHECK:                     %[[S0:.*]] = affine.apply #[[$ADD]](%[[I0]], %[[I4]])
-  // CHECK:                     %[[S1:.*]] = affine.apply #[[$ADD]](%[[I1]], %[[I5]])
-  // CHECK:                     %[[S3:.*]] = affine.apply #[[$ADD]](%[[I3]], %[[I6]])
-  // CHECK-NEXT:                scf.if
-  // CHECK-NEXT:                  %[[VEC:.*]] = memref.load {{.*}}[%[[I6]], %[[I5]]] : memref<5x4xvector<3xf32>>
-  // CHECK-NEXT:                  %[[SCAL:.*]] = vector.extractelement %[[VEC]][%[[VIDX]] : i32] : vector<3xf32>
-  //      CHECK:                  store %[[SCAL]], {{.*}}[%[[S0]], %[[S1]], %[[I2]], %[[S3]]] : memref<?x?x?x?xf32>
-  // CHECK-NEXT:                }
-  // CHECK-NEXT:              }
-  // CHECK-NEXT:            }
-  // CHECK-NEXT:          }
-  // CHECK-NEXT:        }
-  // CHECK-NEXT:      }
-  // CHECK-NEXT:    }
-  // CHECK-NEXT:  }
-  // CHECK-NEXT:  return
-  // CHECK-NEXT:}
-  //
+  // CHECK:      %{{.*}} = memref.alloc(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : memref<?x?x?x?xf32>
+  // CHECK-NEXT: affine.for %[[I0:.*]] = 0 to %{{.*}} step 3 {
+  // CHECK-NEXT:   affine.for %[[I1:.*]] = 0 to %{{.*}} step 4 {
+  // CHECK-NEXT:     affine.for %[[I2:.*]] = 0 to %{{.*}} {
+  // CHECK-NEXT:       affine.for %[[I3:.*]] = 0 to %{{.*}} step 5 {
+  // CHECK:              memref.store %{{.*}}, %[[ALLOC]][] : memref<vector<5x4x3xf32>>
+  // CHECK:              %[[VECTOR_VIEW1:.*]] = vector.type_cast %[[ALLOC]] : memref<vector<5x4x3xf32>> to memref<5xvector<4x3xf32>>
+  // CHECK:              scf.for %[[I4:.*]] = %[[C0]] to %[[C5]] step %[[C1]] {
+  // CHECK:                scf.if
+  // CHECK:                  %[[S3:.*]] = affine.apply #[[$ADD]](%[[I3]], %[[I4]])
+  // CHECK:                  %[[VECTOR_VIEW2:.*]] = vector.type_cast %[[VECTOR_VIEW1]] : memref<5xvector<4x3xf32>> to memref<5x4xvector<3xf32>>
+  // CHECK:                  scf.for %[[I5:.*]] = %[[C0]] to %[[C4]] step %[[C1]] {
+  // CHECK:                    scf.if
+  // CHECK:                      %[[S1:.*]] = affine.apply #[[$ADD]](%[[I1]], %[[I5]])
+  // CHECK:                      %[[VEC:.*]] = memref.load %[[VECTOR_VIEW2]][%[[I4]], %[[I5]]] : memref<5x4xvector<3xf32>>
+  // CHECK:                      scf.for %[[I6:.*]] = %[[C0]] to %[[C3]] step %[[C1]] {
+  // CHECK:                        %[[S0:.*]] = affine.apply #[[$ADD]](%[[I0]], %[[I6]])
+  // CHECK:                        %[[VIDX:.*]] = index_cast %[[I6]]
+  // CHECK:                        scf.if
+  // CHECK:                          %[[SCAL:.*]] = vector.extractelement %[[VEC]][%[[VIDX]] : i32] : vector<3xf32>
+  // CHECK:                          memref.store %[[SCAL]], {{.*}}[%[[S0]], %[[S1]], %[[I2]], %[[S3]]] : memref<?x?x?x?xf32>
+  // CHECK:                        }
+  // CHECK:                      }
+  // CHECK:                    }
+  // CHECK:                  }
+  // CHECK:                }
+  // CHECK:              }
+  // CHECK:            }
+  // CHECK:          }
+  // CHECK:        }
+  // CHECK:      }
+  // CHECK:      return
+
   // Check that I0 + I4 (of size 3) read from last index load(..., I4) and write into first index store(S0, ...)
   // Check that I1 + I5 (of size 4) read from second index load(..., I5, ...) and write into second index store(..., S1, ...)
   // Check that I3 + I6 (of size 5) read from first index load(I6, ...) and write into last index store(..., S3)
@@ -203,53 +209,52 @@ func @transfer_read_progressive(%A : memref<?x?xf32>, %base: index) -> vector<3x
   %f7 = constant 7.0: f32
   // CHECK-DAG: %[[C7:.*]] = constant 7.000000e+00 : f32
   // CHECK-DAG: %[[C0:.*]] = constant 0 : index
+  // CHECK-DAG: %[[C1:.*]] = constant 1 : index
+  // CHECK-DAG: %[[C3:.*]] = constant 3 : index
   // CHECK-DAG: %[[splat:.*]] = constant dense<7.000000e+00> : vector<15xf32>
-  // CHECK-DAG: %[[alloc:.*]] = memref.alloca() : memref<3xvector<15xf32>>
-  // CHECK-DAG: %[[dim:.*]] = memref.dim %[[A]], %[[C0]] : memref<?x?xf32>
-  // CHECK: affine.for %[[I:.*]] = 0 to 3 {
-  // CHECK:   %[[add:.*]] = affine.apply #[[$MAP0]](%[[I]])[%[[base]]]
-  // CHECK:   %[[cond1:.*]] = cmpi slt, %[[add]], %[[dim]] : index
-  // CHECK:   scf.if %[[cond1]] {
-  // CHECK:     %[[vec_1d:.*]] = vector.transfer_read %[[A]][%[[add]], %[[base]]], %[[C7]] : memref<?x?xf32>, vector<15xf32>
-  // CHECK:     store %[[vec_1d]], %[[alloc]][%[[I]]] : memref<3xvector<15xf32>>
-  // CHECK:   } else {
-  // CHECK:     store %[[splat]], %[[alloc]][%[[I]]] : memref<3xvector<15xf32>>
-  // CHECK:   }
-  // CHECK: %[[vmemref:.*]] = vector.type_cast %[[alloc]] : memref<3xvector<15xf32>> to memref<vector<3x15xf32>>
-  // CHECK: %[[cst:.*]] = memref.load %[[vmemref]][] : memref<vector<3x15xf32>>
+  // CHECK-DAG: %[[alloc:.*]] = memref.alloca() : memref<vector<3x15xf32>>
+  // CHECK:     %[[alloc_casted:.*]] = vector.type_cast %[[alloc]] : memref<vector<3x15xf32>> to memref<3xvector<15xf32>>
+  // CHECK:     scf.for %[[I:.*]] = %[[C0]] to %[[C3]]
+  // CHECK:       %[[dim:.*]] = memref.dim %[[A]], %[[C0]] : memref<?x?xf32>
+  // CHECK:       %[[add:.*]] = affine.apply #[[$MAP0]](%[[I]])[%[[base]]]
+  // CHECK:       %[[cond1:.*]] = cmpi sgt, %[[dim]], %[[add]] : index
+  // CHECK:       scf.if %[[cond1]] {
+  // CHECK:         %[[vec_1d:.*]] = vector.transfer_read %[[A]][%{{.*}}, %[[base]]], %[[C7]] : memref<?x?xf32>, vector<15xf32>
+  // CHECK:         memref.store %[[vec_1d]], %[[alloc_casted]][%[[I]]] : memref<3xvector<15xf32>>
+  // CHECK:       } else {
+  // CHECK:         store %[[splat]], %[[alloc_casted]][%[[I]]] : memref<3xvector<15xf32>>
+  // CHECK:       }
+  // CHECK:     }
+  // CHECK:     %[[cst:.*]] = memref.load %[[alloc]][] : memref<vector<3x15xf32>>
 
   // FULL-UNROLL: %[[C7:.*]] = constant 7.000000e+00 : f32
   // FULL-UNROLL: %[[VEC0:.*]] = constant dense<7.000000e+00> : vector<3x15xf32>
   // FULL-UNROLL: %[[C0:.*]] = constant 0 : index
-  // FULL-UNROLL: %[[SPLAT:.*]] = constant dense<7.000000e+00> : vector<15xf32>
   // FULL-UNROLL: %[[DIM:.*]] = memref.dim %[[A]], %[[C0]] : memref<?x?xf32>
-  // FULL-UNROLL: cmpi slt, %[[base]], %[[DIM]] : index
+  // FULL-UNROLL: cmpi sgt, %[[DIM]], %[[base]] : index
   // FULL-UNROLL: %[[VEC1:.*]] = scf.if %{{.*}} -> (vector<3x15xf32>) {
   // FULL-UNROLL:   vector.transfer_read %[[A]][%[[base]], %[[base]]], %[[C7]] : memref<?x?xf32>, vector<15xf32>
   // FULL-UNROLL:   vector.insert %{{.*}}, %[[VEC0]] [0] : vector<15xf32> into vector<3x15xf32>
   // FULL-UNROLL:   scf.yield %{{.*}} : vector<3x15xf32>
   // FULL-UNROLL: } else {
-  // FULL-UNROLL:   vector.insert %{{.*}}, %[[VEC0]] [0] : vector<15xf32> into vector<3x15xf32>
   // FULL-UNROLL:   scf.yield %{{.*}} : vector<3x15xf32>
   // FULL-UNROLL: }
   // FULL-UNROLL: affine.apply #[[$MAP1]]()[%[[base]]]
-  // FULL-UNROLL: cmpi slt, %{{.*}}, %[[DIM]] : index
+  // FULL-UNROLL: cmpi sgt, %{{.*}}, %{{.*}} : index
   // FULL-UNROLL: %[[VEC2:.*]] = scf.if %{{.*}} -> (vector<3x15xf32>) {
   // FULL-UNROLL:   vector.transfer_read %[[A]][%{{.*}}, %[[base]]], %[[C7]] : memref<?x?xf32>, vector<15xf32>
   // FULL-UNROLL:   vector.insert %{{.*}}, %[[VEC1]] [1] : vector<15xf32> into vector<3x15xf32>
   // FULL-UNROLL:   scf.yield %{{.*}} : vector<3x15xf32>
   // FULL-UNROLL: } else {
-  // FULL-UNROLL:   vector.insert %{{.*}}, %[[VEC1]] [1] : vector<15xf32> into vector<3x15xf32>
   // FULL-UNROLL:   scf.yield %{{.*}} : vector<3x15xf32>
   // FULL-UNROLL: }
   // FULL-UNROLL: affine.apply #[[$MAP2]]()[%[[base]]]
-  // FULL-UNROLL: cmpi slt, %{{.*}}, %[[DIM]] : index
+  // FULL-UNROLL: cmpi sgt, %{{.*}}, %{{.*}} : index
   // FULL-UNROLL: %[[VEC3:.*]] = scf.if %{{.*}} -> (vector<3x15xf32>) {
   // FULL-UNROLL:   vector.transfer_read %[[A]][%{{.*}}, %[[base]]], %[[C7]] : memref<?x?xf32>, vector<15xf32>
   // FULL-UNROLL:   vector.insert %{{.*}}, %[[VEC2]] [2] : vector<15xf32> into vector<3x15xf32>
   // FULL-UNROLL:   scf.yield %{{.*}} : vector<3x15xf32>
   // FULL-UNROLL: } else {
-  // FULL-UNROLL:   vector.insert %{{.*}}, %[[VEC2]] [2] : vector<15xf32> into vector<3x15xf32>
   // FULL-UNROLL:   scf.yield %{{.*}} : vector<3x15xf32>
   // FULL-UNROLL: }
 
@@ -275,37 +280,40 @@ func @transfer_read_progressive(%A : memref<?x?xf32>, %base: index) -> vector<3x
 //  FULL-UNROLL-SAME:   %[[base:[a-zA-Z0-9]+]]: index,
 //  FULL-UNROLL-SAME:   %[[vec:[a-zA-Z0-9]+]]: vector<3x15xf32>
 func @transfer_write_progressive(%A : memref<?x?xf32>, %base: index, %vec: vector<3x15xf32>) {
-  // CHECK: %[[C0:.*]] = constant 0 : index
-  // CHECK: %[[alloc:.*]] = memref.alloca() : memref<3xvector<15xf32>>
-  // CHECK: %[[vmemref:.*]] = vector.type_cast %[[alloc]] : memref<3xvector<15xf32>> to memref<vector<3x15xf32>>
-  // CHECK: store %[[vec]], %[[vmemref]][] : memref<vector<3x15xf32>>
-  // CHECK: %[[dim:.*]] = memref.dim %[[A]], %[[C0]] : memref<?x?xf32>
-  // CHECK: affine.for %[[I:.*]] = 0 to 3 {
-  // CHECK:   %[[add:.*]] = affine.apply #[[$MAP0]](%[[I]])[%[[base]]]
-  // CHECK:   %[[cmp:.*]] = cmpi slt, %[[add]], %[[dim]] : index
-  // CHECK:   scf.if %[[cmp]] {
-  // CHECK:     %[[vec_1d:.*]] = memref.load %0[%[[I]]] : memref<3xvector<15xf32>>
-  // CHECK:     vector.transfer_write %[[vec_1d]], %[[A]][%[[add]], %[[base]]] : vector<15xf32>, memref<?x?xf32>
-  // CHECK:   }
+  // CHECK-DAG: %[[C0:.*]] = constant 0 : index
+  // CHECK-DAG: %[[C1:.*]] = constant 1 : index
+  // CHECK-DAG: %[[C3:.*]] = constant 3 : index
+  // CHECK:     %[[alloc:.*]] = memref.alloca() : memref<vector<3x15xf32>>
+  // CHECK:     memref.store %[[vec]], %[[alloc]][] : memref<vector<3x15xf32>>
+  // CHECK:     %[[vmemref:.*]] = vector.type_cast %[[alloc]] : memref<vector<3x15xf32>> to memref<3xvector<15xf32>>
+  // CHECK:     scf.for %[[I:.*]] = %[[C0]] to %[[C3]]
+  // CHECK:       %[[dim:.*]] = memref.dim %[[A]], %[[C0]] : memref<?x?xf32>
+  // CHECK:       %[[add:.*]] = affine.apply #[[$MAP0]](%[[I]])[%[[base]]]
+  // CHECK:       %[[cmp:.*]] = cmpi sgt, %[[dim]], %[[add]] : index
+  // CHECK:       scf.if %[[cmp]] {
+  // CHECK:         %[[vec_1d:.*]] = memref.load %[[vmemref]][%[[I]]] : memref<3xvector<15xf32>>
+  // CHECK:         vector.transfer_write %[[vec_1d]], %[[A]][{{.*}}, %[[base]]] : vector<15xf32>, memref<?x?xf32>
+  // CHECK:       }
+  // CHECK:     }
 
   // FULL-UNROLL: %[[C0:.*]] = constant 0 : index
   // FULL-UNROLL: %[[DIM:.*]] = memref.dim %[[A]], %[[C0]] : memref<?x?xf32>
-  // FULL-UNROLL: %[[CMP0:.*]] = cmpi slt, %[[base]], %[[DIM]] : index
+  // FULL-UNROLL: %[[CMP0:.*]] = cmpi sgt, %[[DIM]], %[[base]] : index
   // FULL-UNROLL: scf.if %[[CMP0]] {
   // FULL-UNROLL:   %[[V0:.*]] = vector.extract %[[vec]][0] : vector<3x15xf32>
   // FULL-UNROLL:   vector.transfer_write %[[V0]], %[[A]][%[[base]], %[[base]]] : vector<15xf32>, memref<?x?xf32>
   // FULL-UNROLL: }
   // FULL-UNROLL: %[[I1:.*]] = affine.apply #[[$MAP1]]()[%[[base]]]
-  // FULL-UNROLL: %[[CMP1:.*]] = cmpi slt, %[[I1]], %[[DIM]] : index
+  // FULL-UNROLL: %[[CMP1:.*]] = cmpi sgt, %{{.*}}, %[[I1]] : index
   // FULL-UNROLL: scf.if %[[CMP1]] {
   // FULL-UNROLL:   %[[V1:.*]] = vector.extract %[[vec]][1] : vector<3x15xf32>
-  // FULL-UNROLL:   vector.transfer_write %[[V1]], %[[A]][%[[I1]], %[[base]]] : vector<15xf32>, memref<?x?xf32>
+  // FULL-UNROLL:   vector.transfer_write %[[V1]], %[[A]][%{{.*}}, %[[base]]] : vector<15xf32>, memref<?x?xf32>
   // FULL-UNROLL: }
   // FULL-UNROLL: %[[I2:.*]] = affine.apply #[[$MAP2]]()[%[[base]]]
-  // FULL-UNROLL: %[[CMP2:.*]] = cmpi slt, %[[I2]], %[[DIM]] : index
+  // FULL-UNROLL: %[[CMP2:.*]] = cmpi sgt, %{{.*}}, %[[I2]] : index
   // FULL-UNROLL: scf.if %[[CMP2]] {
   // FULL-UNROLL:   %[[V2:.*]] = vector.extract %[[vec]][2] : vector<3x15xf32>
-  // FULL-UNROLL:   vector.transfer_write %[[V2]], %[[A]][%[[I2]], %[[base]]] : vector<15xf32>, memref<?x?xf32>
+  // FULL-UNROLL:   vector.transfer_write %[[V2]], %[[A]][%{{.*}}, %[[base]]] : vector<15xf32>, memref<?x?xf32>
   // FULL-UNROLL: }
 
   vector.transfer_write %vec, %A[%base, %base] :
@@ -330,12 +338,14 @@ func @transfer_write_progressive(%A : memref<?x?xf32>, %base: index, %vec: vecto
 //  FULL-UNROLL-SAME:   %[[vec:[a-zA-Z0-9]+]]: vector<3x15xf32>
 func @transfer_write_progressive_inbounds(%A : memref<?x?xf32>, %base: index, %vec: vector<3x15xf32>) {
   // CHECK-NOT:    scf.if
-  // CHECK-NEXT: %[[alloc:.*]] = memref.alloca() : memref<3xvector<15xf32>>
-  // CHECK-NEXT: %[[vmemref:.*]] = vector.type_cast %[[alloc]] : memref<3xvector<15xf32>> to memref<vector<3x15xf32>>
-  // CHECK-NEXT: store %[[vec]], %[[vmemref]][] : memref<vector<3x15xf32>>
-  // CHECK-NEXT: affine.for %[[I:.*]] = 0 to 3 {
+  // CHECK-DAG:  %[[C0:.*]] = constant 0 : index
+  // CHECK-DAG:  %[[C3:.*]] = constant 3 : index
+  // CHECK:      %[[alloc:.*]] = memref.alloca() : memref<vector<3x15xf32>>
+  // CHECK-NEXT: memref.store %[[vec]], %[[alloc]][] : memref<vector<3x15xf32>>
+  // CHECK-NEXT: %[[vmemref:.*]] = vector.type_cast %[[alloc]] : memref<vector<3x15xf32>> to memref<3xvector<15xf32>>
+  // CHECK-NEXT: scf.for %[[I:.*]] = %[[C0]] to %[[C3]]
   // CHECK-NEXT:   %[[add:.*]] = affine.apply #[[$MAP0]](%[[I]])[%[[base]]]
-  // CHECK-NEXT:   %[[vec_1d:.*]] = memref.load %0[%[[I]]] : memref<3xvector<15xf32>>
+  // CHECK-NEXT:   %[[vec_1d:.*]] = memref.load %[[vmemref]][%[[I]]] : memref<3xvector<15xf32>>
   // CHECK-NEXT:   vector.transfer_write %[[vec_1d]], %[[A]][%[[add]], %[[base]]] {in_bounds = [true]} : vector<15xf32>, memref<?x?xf32>
 
   // FULL-UNROLL: %[[VEC0:.*]] = vector.extract %[[vec]][0] : vector<3x15xf32>
@@ -378,25 +388,27 @@ func @transfer_read_minor_identity(%A : memref<?x?x?x?xf32>) -> vector<3x3xf32>
 }
 
 // CHECK-LABEL: transfer_read_minor_identity(
-//  CHECK-SAME:   %[[A:.*]]: memref<?x?x?x?xf32>) -> vector<3x3xf32>
-//  CHECK-DAG:   %[[c0:.*]] = constant 0 : index
-//  CHECK-DAG:   %[[f0:.*]] = constant 0.000000e+00 : f32
-//       CHECK-DAG:   %[[c2:.*]] = constant 2 : index
-//       CHECK-DAG:   %[[cst0:.*]] = constant dense<0.000000e+00> : vector<3xf32>
-//       CHECK:   %[[m:.*]] = memref.alloca() : memref<3xvector<3xf32>>
-//       CHECK:   %[[d:.*]] = memref.dim %[[A]], %[[c2]] : memref<?x?x?x?xf32>
-//       CHECK:   affine.for %[[arg1:.*]] = 0 to 3 {
-//       CHECK:      %[[cmp:.*]] = cmpi slt, %[[arg1]], %[[d]] : index
-//       CHECK:      scf.if %[[cmp]] {
-//       CHECK:        %[[tr:.*]] = vector.transfer_read %[[A]][%c0, %c0, %[[arg1]], %c0], %[[f0]] : memref<?x?x?x?xf32>, vector<3xf32>
-//       CHECK:        store %[[tr]], %[[m]][%[[arg1]]] : memref<3xvector<3xf32>>
-//       CHECK:      } else {
-//       CHECK:        store %[[cst0]], %[[m]][%[[arg1]]] : memref<3xvector<3xf32>>
-//       CHECK:      }
-//       CHECK:    }
-//       CHECK:    %[[cast:.*]] = vector.type_cast %[[m]] : memref<3xvector<3xf32>> to memref<vector<3x3xf32>>
-//       CHECK:    %[[ret:.*]]  = memref.load %[[cast]][] : memref<vector<3x3xf32>>
-//       CHECK:    return %[[ret]] : vector<3x3xf32>
+//  CHECK-SAME: %[[A:.*]]: memref<?x?x?x?xf32>) -> vector<3x3xf32>
+//  CHECK-DAG:    %[[c0:.*]] = constant 0 : index
+//  CHECK-DAG:    %[[c1:.*]] = constant 1 : index
+//  CHECK-DAG:    %[[c2:.*]] = constant 2 : index
+//  CHECK-DAG:    %[[c3:.*]] = constant 3 : index
+//  CHECK-DAG:    %[[f0:.*]] = constant 0.000000e+00 : f32
+//  CHECK-DAG:    %[[cst0:.*]] = constant dense<0.000000e+00> : vector<3xf32>
+//  CHECK:        %[[m:.*]] = memref.alloca() : memref<vector<3x3xf32>>
+//  CHECK:        %[[cast:.*]] = vector.type_cast %[[m]] : memref<vector<3x3xf32>> to memref<3xvector<3xf32>>
+//  CHECK:        scf.for %[[arg1:.*]] = %[[c0]] to %[[c3]]
+//  CHECK:          %[[d:.*]] = memref.dim %[[A]], %[[c2]] : memref<?x?x?x?xf32>
+//  CHECK:          %[[cmp:.*]] = cmpi sgt, %[[d]], %[[arg1]] : index
+//  CHECK:          scf.if %[[cmp]] {
+//  CHECK:            %[[tr:.*]] = vector.transfer_read %[[A]][%c0, %c0, %[[arg1]], %c0], %[[f0]] : memref<?x?x?x?xf32>, vector<3xf32>
+//  CHECK:            memref.store %[[tr]], %[[cast]][%[[arg1]]] : memref<3xvector<3xf32>>
+//  CHECK:          } else {
+//  CHECK:            memref.store %[[cst0]], %[[cast]][%[[arg1]]] : memref<3xvector<3xf32>>
+//  CHECK:          }
+//  CHECK:        }
+//  CHECK:        %[[ret:.*]]  = memref.load %[[m]][] : memref<vector<3x3xf32>>
+//  CHECK:        return %[[ret]] : vector<3x3xf32>
 
 func @transfer_write_minor_identity(%A : vector<3x3xf32>, %B : memref<?x?x?x?xf32>) {
   %c0 = constant 0 : index
@@ -408,22 +420,25 @@ func @transfer_write_minor_identity(%A : vector<3x3xf32>, %B : memref<?x?x?x?xf3
 }
 
 // CHECK-LABEL: transfer_write_minor_identity(
-//  CHECK-SAME:   %[[A:.*]]: vector<3x3xf32>,
-//  CHECK-SAME:   %[[B:.*]]: memref<?x?x?x?xf32>)
-//       CHECK-DAG:   %[[c2:.*]] = constant 2 : index
-//       CHECK-DAG:   %[[c0:.*]] = constant 0 : index
-//       CHECK:   %[[m:.*]] = memref.alloca() : memref<3xvector<3xf32>>
-//       CHECK:   %[[cast:.*]] = vector.type_cast %[[m]] : memref<3xvector<3xf32>> to memref<vector<3x3xf32>>
-//       CHECK:   store %[[A]], %[[cast]][] : memref<vector<3x3xf32>>
-//       CHECK:   %[[d:.*]] = memref.dim %[[B]], %[[c2]] : memref<?x?x?x?xf32>
-//       CHECK:   affine.for %[[arg2:.*]] = 0 to 3 {
-//       CHECK:      %[[cmp:.*]] = cmpi slt, %[[arg2]], %[[d]] : index
-//       CHECK:      scf.if %[[cmp]] {
-//       CHECK:        %[[tmp:.*]] = memref.load %[[m]][%[[arg2]]] : memref<3xvector<3xf32>>
-//       CHECK:        vector.transfer_write %[[tmp]], %[[B]][%[[c0]], %[[c0]], %[[arg2]], %[[c0]]] : vector<3xf32>, memref<?x?x?x?xf32>
-//       CHECK:      }
-//       CHECK:    }
-//       CHECK:    return
+// CHECK-SAME:      %[[A:.*]]: vector<3x3xf32>,
+// CHECK-SAME:      %[[B:.*]]: memref<?x?x?x?xf32>)
+// CHECK-DAG:     %[[c0:.*]] = constant 0 : index
+// CHECK-DAG:     %[[c1:.*]] = constant 1 : index
+// CHECK-DAG:     %[[c2:.*]] = constant 2 : index
+// CHECK-DAG:     %[[c3:.*]] = constant 3 : index
+// CHECK:         %[[m:.*]] = memref.alloca() : memref<vector<3x3xf32>>
+// CHECK:         memref.store %[[A]], %[[m]][] : memref<vector<3x3xf32>>
+// CHECK:         %[[cast:.*]] = vector.type_cast %[[m]] : memref<vector<3x3xf32>> to memref<3xvector<3xf32>>
+// CHECK:         scf.for %[[arg2:.*]] = %[[c0]] to %[[c3]]
+// CHECK:           %[[d:.*]] = memref.dim %[[B]], %[[c2]] : memref<?x?x?x?xf32>
+// CHECK:           %[[cmp:.*]] = cmpi sgt, %[[d]], %[[arg2]] : index
+// CHECK:           scf.if %[[cmp]] {
+// CHECK:             %[[tmp:.*]] = memref.load %[[cast]][%[[arg2]]] : memref<3xvector<3xf32>>
+// CHECK:             vector.transfer_write %[[tmp]], %[[B]][%[[c0]], %[[c0]], %[[arg2]], %[[c0]]] : vector<3xf32>, memref<?x?x?x?xf32>
+// CHECK:           }
+// CHECK:         }
+// CHECK:         return
+
 
 // -----
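For readers following the CHECK-line churn above: the unified pass now stages the full vector through a buffer of the complete vector type and type-casts it down to an unpacked view, rather than allocating the unpacked memref directly, and it emits scf.for loops with explicit constant bounds instead of affine.for (with the bounds comparison flipped to cmpi sgt). A minimal sketch of the IR this corresponds to for the 2-d read case, reconstructed from the CHECK lines (the function name, SSA value names, and the spelled-out affine map are illustrative, not the exact output of the pass):

```
// Sketch of what -convert-vector-to-scf now produces for the 2-d
// transfer_read in @transfer_read_progressive (names are illustrative).
func @transfer_read_progressive_lowered(%A : memref<?x?xf32>, %base : index)
    -> vector<3x15xf32> {
  %c0 = constant 0 : index
  %c1 = constant 1 : index
  %c3 = constant 3 : index
  %pad = constant 7.000000e+00 : f32
  %splat = constant dense<7.000000e+00> : vector<15xf32>
  // The result is staged through a buffer of the full vector type, which is
  // then viewed as a memref of 1-d vectors.
  %alloc = memref.alloca() : memref<vector<3x15xf32>>
  %casted = vector.type_cast %alloc
      : memref<vector<3x15xf32>> to memref<3xvector<15xf32>>
  scf.for %i = %c0 to %c3 step %c1 {
    %dim = memref.dim %A, %c0 : memref<?x?xf32>
    %row = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%i)[%base]
    %in_bounds = cmpi sgt, %dim, %row : index
    scf.if %in_bounds {
      // In-bounds rows are read as 1-d vectors and stored into the buffer.
      %v = vector.transfer_read %A[%row, %base], %pad
          : memref<?x?xf32>, vector<15xf32>
      memref.store %v, %casted[%i] : memref<3xvector<15xf32>>
    } else {
      // Out-of-bounds rows are filled with the padding splat.
      memref.store %splat, %casted[%i] : memref<3xvector<15xf32>>
    }
  }
  %result = memref.load %alloc[] : memref<vector<3x15xf32>>
  return %result : vector<3x15xf32>
}
```

The transfer_write path mirrors this: the full vector is first stored into the memref<vector<3x15xf32>> buffer, and the loop then loads 1-d slices out of the type-cast view and writes them with vector.transfer_write under the same bounds check. The FULL-UNROLL variant keeps the result in SSA form instead, chaining scf.if regions that vector.insert each 1-d read into the accumulated vector; its else branches now simply yield the unmodified accumulator, which is why the vector.insert lines disappear from them above.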
 

diff  --git a/mlir/test/Integration/Dialect/Vector/CPU/test-transfer-read-1d.mlir b/mlir/test/Integration/Dialect/Vector/CPU/test-transfer-read-1d.mlir
index d7dc9d6f1e594..20216cc6ba6e1 100644
--- a/mlir/test/Integration/Dialect/Vector/CPU/test-transfer-read-1d.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/test-transfer-read-1d.mlir
@@ -1,9 +1,9 @@
-// RUN: mlir-opt %s -test-progressive-convert-vector-to-scf -lower-affine -convert-scf-to-std -convert-vector-to-llvm -convert-std-to-llvm | \
+// RUN: mlir-opt %s -convert-vector-to-scf -lower-affine -convert-scf-to-std -convert-vector-to-llvm -convert-std-to-llvm | \
 // RUN: mlir-cpu-runner -e entry -entry-point-result=void  \
 // RUN:   -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \
 // RUN: FileCheck %s
 
-// RUN: mlir-opt %s -test-unrolled-progressive-convert-vector-to-scf -lower-affine -convert-scf-to-std -convert-vector-to-llvm -convert-std-to-llvm | \
+// RUN: mlir-opt %s -convert-vector-to-scf=full-unroll=true -lower-affine -convert-scf-to-std -convert-vector-to-llvm -convert-std-to-llvm | \
 // RUN: mlir-cpu-runner -e entry -entry-point-result=void  \
 // RUN:   -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \
 // RUN: FileCheck %s

diff  --git a/mlir/test/Integration/Dialect/Vector/CPU/test-transfer-read-2d.mlir b/mlir/test/Integration/Dialect/Vector/CPU/test-transfer-read-2d.mlir
index 1fc11fab85286..03cdc3dd8e329 100644
--- a/mlir/test/Integration/Dialect/Vector/CPU/test-transfer-read-2d.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/test-transfer-read-2d.mlir
@@ -1,9 +1,9 @@
-// RUN: mlir-opt %s -test-progressive-convert-vector-to-scf -lower-affine -convert-scf-to-std -convert-vector-to-llvm -convert-std-to-llvm | \
+// RUN: mlir-opt %s -convert-vector-to-scf -lower-affine -convert-scf-to-std -convert-vector-to-llvm -convert-std-to-llvm | \
 // RUN: mlir-cpu-runner -e entry -entry-point-result=void  \
 // RUN:   -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \
 // RUN: FileCheck %s
 
-// RUN: mlir-opt %s -test-unrolled-progressive-convert-vector-to-scf -lower-affine -convert-scf-to-std -convert-vector-to-llvm -convert-std-to-llvm | \
+// RUN: mlir-opt %s -convert-vector-to-scf=full-unroll=true -lower-affine -convert-scf-to-std -convert-vector-to-llvm -convert-std-to-llvm | \
 // RUN: mlir-cpu-runner -e entry -entry-point-result=void  \
 // RUN:   -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \
 // RUN: FileCheck %s

diff  --git a/mlir/test/Integration/Dialect/Vector/CPU/test-transfer-read-3d.mlir b/mlir/test/Integration/Dialect/Vector/CPU/test-transfer-read-3d.mlir
index 6de89a6cd6ac5..00da9278d50c7 100644
--- a/mlir/test/Integration/Dialect/Vector/CPU/test-transfer-read-3d.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/test-transfer-read-3d.mlir
@@ -1,10 +1,10 @@
-// RUN: mlir-opt %s -test-progressive-convert-vector-to-scf -lower-affine -convert-scf-to-std -convert-vector-to-llvm -convert-std-to-llvm | \
+// RUN: mlir-opt %s -convert-vector-to-scf -lower-affine -convert-scf-to-std -convert-vector-to-llvm -convert-std-to-llvm | \
 // RUN: mlir-cpu-runner -e entry -entry-point-result=void  \
 // RUN:   -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \
 // RUN: FileCheck %s
 
-// RUN: mlir-opt %s -test-unrolled-progressive-convert-vector-to-scf -lower-affine -convert-scf-to-std -convert-vector-to-llvm -convert-std-to-llvm | \
-// RUN: mlir-cpu-runner -e entry -entry-point-result=void \
+// RUN: mlir-opt %s -convert-vector-to-scf=full-unroll=true -lower-affine -convert-scf-to-std -convert-vector-to-llvm -convert-std-to-llvm | \
+// RUN: mlir-cpu-runner -e entry -entry-point-result=void  \
 // RUN:   -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \
 // RUN: FileCheck %s
 

diff  --git a/mlir/test/Integration/Dialect/Vector/CPU/test-transfer-read.mlir b/mlir/test/Integration/Dialect/Vector/CPU/test-transfer-read.mlir
index bed94f02920ab..98d8132ec5ca5 100644
--- a/mlir/test/Integration/Dialect/Vector/CPU/test-transfer-read.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/test-transfer-read.mlir
@@ -1,4 +1,9 @@
-// RUN: mlir-opt %s -convert-scf-to-std -convert-vector-to-llvm -convert-std-to-llvm | \
+// RUN: mlir-opt %s -convert-vector-to-scf -lower-affine -convert-scf-to-std -convert-vector-to-llvm -convert-std-to-llvm | \
+// RUN: mlir-cpu-runner -e entry -entry-point-result=void  \
+// RUN:   -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \
+// RUN: FileCheck %s
+
+// RUN: mlir-opt %s -convert-vector-to-scf=full-unroll=true -lower-affine -convert-scf-to-std -convert-vector-to-llvm -convert-std-to-llvm | \
 // RUN: mlir-cpu-runner -e entry -entry-point-result=void  \
 // RUN:   -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \
 // RUN: FileCheck %s

diff  --git a/mlir/test/Integration/Dialect/Vector/CPU/test-transfer-to-loops.mlir b/mlir/test/Integration/Dialect/Vector/CPU/test-transfer-to-loops.mlir
index 9488534d3e93d..5fdaeafe54482 100644
--- a/mlir/test/Integration/Dialect/Vector/CPU/test-transfer-to-loops.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/test-transfer-to-loops.mlir
@@ -3,7 +3,7 @@
 // RUN:   -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext,%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \
 // RUN: FileCheck %s
 
-// RUN: mlir-opt %s -test-progressive-convert-vector-to-scf -lower-affine -convert-scf-to-std -convert-vector-to-llvm -convert-std-to-llvm | \
+// RUN: mlir-opt %s -convert-vector-to-scf=full-unroll=true -lower-affine -convert-scf-to-std -convert-vector-to-llvm -convert-std-to-llvm | \
 // RUN: mlir-cpu-runner -e main -entry-point-result=void  \
 // RUN:   -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext,%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \
 // RUN: FileCheck %s

diff  --git a/mlir/test/lib/Transforms/TestVectorTransforms.cpp b/mlir/test/lib/Transforms/TestVectorTransforms.cpp
index d1ac5e1b994fe..d60f32d5f6cdf 100644
--- a/mlir/test/lib/Transforms/TestVectorTransforms.cpp
+++ b/mlir/test/lib/Transforms/TestVectorTransforms.cpp
@@ -9,7 +9,6 @@
 #include <type_traits>
 
 #include "mlir/Analysis/SliceAnalysis.h"
-#include "mlir/Conversion/VectorToSCF/ProgressiveVectorToSCF.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Linalg/IR/LinalgOps.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
@@ -390,23 +389,6 @@ struct TestVectorMultiReductionLoweringPatterns
   }
 };
 
-template <bool Unroll>
-struct TestProgressiveVectorToSCFLoweringPatterns
-    : public PassWrapper<TestProgressiveVectorToSCFLoweringPatterns<Unroll>,
-                         FunctionPass> {
-  void getDependentDialects(DialectRegistry &registry) const override {
-    registry.insert<memref::MemRefDialect, scf::SCFDialect, AffineDialect>();
-  }
-  void runOnFunction() override {
-    RewritePatternSet patterns(&this->getContext());
-    ProgressiveVectorTransferToSCFOptions options;
-    options.unroll = Unroll;
-    populateProgressiveVectorToSCFConversionPatterns(patterns, options);
-    (void)applyPatternsAndFoldGreedily(this->getFunction(),
-                                       std::move(patterns));
-  }
-};
-
 } // end anonymous namespace
 
 namespace mlir {
@@ -454,19 +436,6 @@ void registerTestVectorConversions() {
       "test-vector-transfer-lowering-patterns",
       "Test conversion patterns to lower transfer ops to other vector ops");
 
-  PassRegistration<TestProgressiveVectorToSCFLoweringPatterns<
-      /*Unroll=*/false>>
-      transferOpToSCF("test-progressive-convert-vector-to-scf",
-                      "Test conversion patterns to progressively lower "
-                      "transfer ops to SCF");
-
-  PassRegistration<TestProgressiveVectorToSCFLoweringPatterns<
-      /*Unroll=*/true>>
-      transferOpToSCFUnrolled(
-          "test-unrolled-progressive-convert-vector-to-scf",
-          "Test conversion patterns to progressively lower transfer ops to SCF"
-          "(unrolled variant)");
-
   PassRegistration<TestVectorMultiReductionLoweringPatterns>
       multiDimReductionOpLoweringPass(
           "test-vector-multi-reduction-lowering-patterns",


        

