[Mlir-commits] [mlir] d80b04a - [mlir][Affine][Vector] Support vectorizing reduction loops

Sergei Grechanik llvmlistbot at llvm.org
Wed May 5 09:13:17 PDT 2021


Author: Sergei Grechanik
Date: 2021-05-05T09:03:59-07:00
New Revision: d80b04ab0015b218b613f8fe59506d45739817b8

URL: https://github.com/llvm/llvm-project/commit/d80b04ab0015b218b613f8fe59506d45739817b8
DIFF: https://github.com/llvm/llvm-project/commit/d80b04ab0015b218b613f8fe59506d45739817b8.diff

LOG: [mlir][Affine][Vector] Support vectorizing reduction loops

This patch adds support for vectorizing loops with 'iter_args'
implementing known reductions along the vector dimension. Compared to
the non-vector-dimension case, two additional things are done during
vectorization of such loops (see the sketch below):
- The resulting vector returned from the loop is reduced to a scalar
  using `vector.reduction`.
- In some cases a mask is applied to the vector yielded at the end of
  the loop to prevent garbage values from being written to the
  accumulator.
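
For concreteness, a minimal C++ sketch (not code from the patch itself) of
the first step, built only from the helpers this patch adds to the Standard
and Vector dialects; the function and argument names are hypothetical:

```c++
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/Dialect/Vector/VectorOps.h"
#include "mlir/IR/Builders.h"

using namespace mlir;

// Hypothetical helper: collapse the vector accumulator produced by a
// vectorized reduction loop into a scalar and fold the original scalar
// initial value back in (the pass skips the second step when `origInit`
// is already the neutral element of the reduction).
static Value finalizeReduction(AtomicRMWKind kind, Value vectorAcc,
                               Value origInit, OpBuilder &builder,
                               Location loc) {
  // Emits e.g. vector.reduction "add"/"mul" : vector<...> into scalar.
  Value reduced = vector::getVectorReductionOp(kind, builder, loc, vectorAcc);
  // Emits the matching scalar op (addf/addi/mulf/muli) to combine the
  // reduced value with the original initial value.
  return getReductionOp(kind, builder, loc, reduced, origInit);
}
```

The second step, masking, is the `select` over the yielded vector shown in
the SuperVectorize.cpp example below; it is only needed when the trip count
is unknown or not divisible by the vector size.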

Vectorization of reduction loops is disabled by default. To enable it, a
map from loops to arrays of reduction descriptors should be passed
explicitly to `vectorizeAffineLoops`, or `vectorize-reductions=true`
should be passed to the SuperVectorize pass (see the usage sketch below).
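
For the programmatic route, a minimal usage sketch (hypothetical driver and
variable names; it mirrors what the pass itself does when
`vectorize-reductions=true` is set):

```c++
#include "mlir/Analysis/AffineAnalysis.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/Utils.h"
#include "llvm/ADT/DenseSet.h"

using namespace mlir;

// Hypothetical driver: collect parallel loops, remember which of them carry
// supported reductions, and hand both to vectorizeAffineLoops().
static void vectorizeWithReductions(Operation *parentOp, int64_t vectorSize) {
  DenseSet<Operation *> parallelLoops;
  ReductionLoopMap reductionLoops;
  parentOp->walk([&](AffineForOp loop) {
    SmallVector<LoopReduction, 2> reductions;
    if (isLoopParallel(loop, &reductions)) {
      parallelLoops.insert(loop);
      // Only loops that actually perform supported reductions need an entry.
      if (!reductions.empty())
        reductionLoops[loop] = reductions;
    }
  });
  // Reduction vectorization is currently limited to 1-D vectorization, hence
  // a single vector size.
  vectorizeAffineLoops(parentOp, parallelLoops, {vectorSize},
                       /*fastestVaryingPattern=*/{}, reductionLoops);
}
```

On the command line the same behavior is exercised by the new tests via
`-affine-super-vectorize="virtual-vector-size=128 test-fastest-varying=0
vectorize-reductions=true"`.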

Current limitations:
- Loops with a non-unit step size are not supported.
- n-D vectorization with n > 1 is not supported.

Reviewed By: nicolasvasilache

Differential Revision: https://reviews.llvm.org/D100694

Added: 
    mlir/test/Dialect/Affine/SuperVectorize/vectorize_reduction.mlir
    mlir/test/Dialect/Affine/SuperVectorize/vectorize_reduction_2d.mlir

Modified: 
    mlir/include/mlir/Analysis/AffineAnalysis.h
    mlir/include/mlir/Dialect/Affine/Passes.td
    mlir/include/mlir/Dialect/Affine/Utils.h
    mlir/include/mlir/Dialect/StandardOps/IR/Ops.h
    mlir/include/mlir/Dialect/Vector/VectorOps.h
    mlir/lib/Analysis/AffineAnalysis.cpp
    mlir/lib/Conversion/AffineToStandard/AffineToStandard.cpp
    mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp
    mlir/lib/Dialect/StandardOps/IR/Ops.cpp
    mlir/lib/Dialect/Vector/VectorOps.cpp
    mlir/test/Dialect/Affine/SuperVectorize/vectorize_1d.mlir

Removed: 
    


################################################################################
diff  --git a/mlir/include/mlir/Analysis/AffineAnalysis.h b/mlir/include/mlir/Analysis/AffineAnalysis.h
index 2a4bd478a513c..a0fae59939052 100644
--- a/mlir/include/mlir/Analysis/AffineAnalysis.h
+++ b/mlir/include/mlir/Analysis/AffineAnalysis.h
@@ -47,6 +47,11 @@ bool isLoopParallel(
     AffineForOp forOp,
     SmallVectorImpl<LoopReduction> *parallelReductions = nullptr);
 
+/// Returns true if `forOp' doesn't have memory dependences preventing
+/// parallelization. This function doesn't check iter_args and should be used
+/// only as a building block for full parallel-checking functions.
+bool isLoopMemoryParallel(AffineForOp forOp);
+
 /// Returns in `affineApplyOps`, the sequence of those AffineApplyOp
 /// Operations that are reachable via a search starting from `operands` and
 /// ending at those operands that are not the result of an AffineApplyOp.

diff  --git a/mlir/include/mlir/Dialect/Affine/Passes.td b/mlir/include/mlir/Dialect/Affine/Passes.td
index 45c28bbace4ae..837ebb4f0340e 100644
--- a/mlir/include/mlir/Dialect/Affine/Passes.td
+++ b/mlir/include/mlir/Dialect/Affine/Passes.td
@@ -112,7 +112,11 @@ def AffineVectorize : FunctionPass<"affine-super-vectorize"> {
                "Specify a 1-D, 2-D or 3-D pattern of fastest varying memory "
                "dimensions to match. See defaultPatterns in Vectorize.cpp for "
                "a description and examples. This is used for testing purposes",
-               "llvm::cl::ZeroOrMore, llvm::cl::MiscFlags::CommaSeparated">
+               "llvm::cl::ZeroOrMore, llvm::cl::MiscFlags::CommaSeparated">,
+    Option<"vectorizeReductions", "vectorize-reductions", "bool",
+           /*default=*/"false",
+           "Vectorize known reductions expressed via iter_args. "
+           "Switched off by default.">
   ];
 }
 

diff  --git a/mlir/include/mlir/Dialect/Affine/Utils.h b/mlir/include/mlir/Dialect/Affine/Utils.h
index 676b394398106..693af77c6e86b 100644
--- a/mlir/include/mlir/Dialect/Affine/Utils.h
+++ b/mlir/include/mlir/Dialect/Affine/Utils.h
@@ -13,6 +13,7 @@
 #ifndef MLIR_DIALECT_AFFINE_UTILS_H
 #define MLIR_DIALECT_AFFINE_UTILS_H
 
+#include "mlir/Analysis/AffineAnalysis.h"
 #include "mlir/IR/AffineExpr.h"
 #include "mlir/Support/LLVM.h"
 #include "llvm/ADT/DenseMap.h"
@@ -27,6 +28,8 @@ struct LogicalResult;
 struct LoopReduction;
 class Operation;
 
+using ReductionLoopMap = DenseMap<Operation *, SmallVector<LoopReduction, 2>>;
+
 /// Replaces parallel affine.for op with 1-d affine.parallel op.
 /// mlir::isLoopParallel detects the parallel affine.for ops.
 /// Parallelizes the specified reductions. Parallelization will fail in presence
@@ -81,16 +84,23 @@ struct VectorizationStrategy {
   // The candidate will be vectorized using the vectorization factor in
   // 'vectorSizes' for that dimension.
   DenseMap<Operation *, unsigned> loopToVectorDim;
+  // Maps loops that implement vectorizable reductions to the corresponding
+  // reduction descriptors.
+  ReductionLoopMap reductionLoops;
 };
 
 /// Vectorizes affine loops in 'loops' using the n-D vectorization factors in
 /// 'vectorSizes'. By default, each vectorization factor is applied
 /// inner-to-outer to the loops of each loop nest. 'fastestVaryingPattern' can
+/// be optionally used to provide a different loop vectorization order.
+/// If `reductionLoops` is not empty, the given reduction loops may be
+/// vectorized along the reduction dimension.
+/// TODO: Vectorizing reductions is supported only for 1-D vectorization.
 void vectorizeAffineLoops(
     Operation *parentOp,
     llvm::DenseSet<Operation *, DenseMapInfo<Operation *>> &loops,
-    ArrayRef<int64_t> vectorSizes, ArrayRef<int64_t> fastestVaryingPattern);
+    ArrayRef<int64_t> vectorSizes, ArrayRef<int64_t> fastestVaryingPattern,
+    const ReductionLoopMap &reductionLoops = ReductionLoopMap());
 
 /// External utility to vectorize affine loops from a single loop nest using an
 /// n-D vectorization strategy (see doc in VectorizationStrategy definition).

diff  --git a/mlir/include/mlir/Dialect/StandardOps/IR/Ops.h b/mlir/include/mlir/Dialect/StandardOps/IR/Ops.h
index ce3087a176b3c..56478b2734848 100644
--- a/mlir/include/mlir/Dialect/StandardOps/IR/Ops.h
+++ b/mlir/include/mlir/Dialect/StandardOps/IR/Ops.h
@@ -124,6 +124,20 @@ bool applyCmpPredicate(CmpFPredicate predicate, const APFloat &lhs,
 /// Ignore integer bitwitdh and type mismatch that come from the fact there is
 /// no IndexAttr and that IndexType have no bitwidth.
 bool isEqualConstantIntOrValue(OpFoldResult ofr1, OpFoldResult ofr2);
+
+/// Returns the identity value attribute associated with an AtomicRMWKind op.
+Attribute getIdentityValueAttr(AtomicRMWKind kind, Type resultType,
+                               OpBuilder &builder, Location loc);
+
+/// Returns the identity value associated with an AtomicRMWKind op.
+Value getIdentityValue(AtomicRMWKind op, Type resultType, OpBuilder &builder,
+                       Location loc);
+
+/// Returns the value obtained by applying the reduction operation kind
+/// associated with a binary AtomicRMWKind op to `lhs` and `rhs`.
+Value getReductionOp(AtomicRMWKind op, OpBuilder &builder, Location loc,
+                     Value lhs, Value rhs);
+
 } // end namespace mlir
 
 #endif // MLIR_DIALECT_IR_STANDARDOPS_IR_OPS_H

diff  --git a/mlir/include/mlir/Dialect/Vector/VectorOps.h b/mlir/include/mlir/Dialect/Vector/VectorOps.h
index 20c6e9601166c..d0e65a1d1c943 100644
--- a/mlir/include/mlir/Dialect/Vector/VectorOps.h
+++ b/mlir/include/mlir/Dialect/Vector/VectorOps.h
@@ -13,6 +13,7 @@
 #ifndef MLIR_DIALECT_VECTOR_VECTOROPS_H
 #define MLIR_DIALECT_VECTOR_VECTOROPS_H
 
+#include "mlir/Dialect/StandardOps/IR/Ops.h"
 #include "mlir/IR/AffineMap.h"
 #include "mlir/IR/Attributes.h"
 #include "mlir/IR/BuiltinTypes.h"
@@ -192,6 +193,11 @@ IntegerType getVectorSubscriptType(Builder &builder);
 /// the integer type required for subscripts in the vector dialect.
 ArrayAttr getVectorSubscriptAttr(Builder &b, ArrayRef<int64_t> values);
 
+/// Returns the value obtained by reducing the vector into a scalar using the
+/// operation kind associated with a binary AtomicRMWKind op.
+Value getVectorReductionOp(AtomicRMWKind op, OpBuilder &builder, Location loc,
+                           Value vector);
+
 namespace impl {
 /// Build the default minor identity map suitable for a vector transfer. This
 /// also handles the case memref<... x vector<...>> -> vector<...> in which the

diff  --git a/mlir/lib/Analysis/AffineAnalysis.cpp b/mlir/lib/Analysis/AffineAnalysis.cpp
index 396ab89b5b526..a4a9d51e91591 100644
--- a/mlir/lib/Analysis/AffineAnalysis.cpp
+++ b/mlir/lib/Analysis/AffineAnalysis.cpp
@@ -70,6 +70,8 @@ static Value getSupportedReduction(AffineForOp forOp, unsigned pos,
     return nullptr;
   if (!forOp.getRegionIterArgs()[pos].hasOneUse())
     return nullptr;
+  if (!yielded.hasOneUse())
+    return nullptr;
 
   Optional<AtomicRMWKind> maybeKind =
       TypeSwitch<Operation *, Optional<AtomicRMWKind>>(definition)
@@ -123,6 +125,14 @@ bool mlir::isLoopParallel(AffineForOp forOp,
       return false;
   }
 
+  // Check memory dependences.
+  return isLoopMemoryParallel(forOp);
+}
+
+/// Returns true if `forOp' doesn't have memory dependences preventing
+/// parallelization. This function doesn't check iter_args and should be used
+/// only as a building block for full parallel-checking functions.
+bool mlir::isLoopMemoryParallel(AffineForOp forOp) {
   // Collect all load and store ops in loop nest rooted at 'forOp'.
   SmallVector<Operation *, 8> loadAndStoreOps;
   auto walkResult = forOp.walk([&](Operation *op) -> WalkResult {

diff  --git a/mlir/lib/Conversion/AffineToStandard/AffineToStandard.cpp b/mlir/lib/Conversion/AffineToStandard/AffineToStandard.cpp
index e3aebfbe17889..dd3ff5fefa3bc 100644
--- a/mlir/lib/Conversion/AffineToStandard/AffineToStandard.cpp
+++ b/mlir/lib/Conversion/AffineToStandard/AffineToStandard.cpp
@@ -367,49 +367,6 @@ class AffineForLowering : public OpRewritePattern<AffineForOp> {
   }
 };
 
-/// Returns the identity value associated with an AtomicRMWKind op.
-static Value getIdentityValue(AtomicRMWKind op, Type resultType,
-                              OpBuilder &builder, Location loc) {
-  switch (op) {
-  case AtomicRMWKind::addf:
-    return builder.create<ConstantOp>(loc, builder.getFloatAttr(resultType, 0));
-  case AtomicRMWKind::addi:
-    return builder.create<ConstantOp>(loc,
-                                      builder.getIntegerAttr(resultType, 0));
-  case AtomicRMWKind::mulf:
-    return builder.create<ConstantOp>(loc, builder.getFloatAttr(resultType, 1));
-  case AtomicRMWKind::muli:
-    return builder.create<ConstantOp>(loc,
-                                      builder.getIntegerAttr(resultType, 1));
-  // TODO: Add remaining reduction operations.
-  default:
-    (void)emitOptionalError(loc, "Reduction operation type not supported");
-    break;
-  }
-  return nullptr;
-}
-
-/// Return the value obtained by applying the reduction operation kind
-/// associated with a binary AtomicRMWKind op to `lhs` and `rhs`.
-static Value getReductionOp(AtomicRMWKind op, OpBuilder &builder, Location loc,
-                            Value lhs, Value rhs) {
-  switch (op) {
-  case AtomicRMWKind::addf:
-    return builder.create<AddFOp>(loc, lhs, rhs);
-  case AtomicRMWKind::addi:
-    return builder.create<AddIOp>(loc, lhs, rhs);
-  case AtomicRMWKind::mulf:
-    return builder.create<MulFOp>(loc, lhs, rhs);
-  case AtomicRMWKind::muli:
-    return builder.create<MulIOp>(loc, lhs, rhs);
-  // TODO: Add remaining reduction operations.
-  default:
-    (void)emitOptionalError(loc, "Reduction operation type not supported");
-    break;
-  }
-  return nullptr;
-}
-
 /// Convert an `affine.parallel` (loop nest) operation into a `scf.parallel`
 /// operation.
 class AffineParallelLowering : public OpRewritePattern<AffineParallelOp> {

diff  --git a/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp b/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp
index 551a8d44042ed..570f2922d5809 100644
--- a/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp
+++ b/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp
@@ -21,6 +21,7 @@
 #include "mlir/Dialect/Vector/VectorUtils.h"
 #include "mlir/IR/BlockAndValueMapping.h"
 #include "mlir/Support/LLVM.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/Support/Debug.h"
 
 using namespace mlir;
@@ -324,7 +325,7 @@ using namespace vector;
 /// Unsupported cases, extensions, and work in progress (help welcome :-) ):
 /// ========================================================================
 ///   1. lowering to concrete vector types for various HW;
-///   2. reduction support;
+///   2. reduction support for n-D vectorization and non-unit steps;
 ///   3. non-effecting padding during vector.transfer_read and filter during
 ///      vector.transfer_write;
 ///   4. misalignment support vector.transfer_read / vector.transfer_write
@@ -487,6 +488,73 @@ using namespace vector;
 ///
 /// Of course, much more intricate n-D imperfectly-nested patterns can be
 /// vectorized too and specified in a fully declarative fashion.
+///
+/// Reduction:
+/// ==========
+/// Vectorizing reduction loops along the reduction dimension is supported if:
+/// - the reduction kind is supported,
+/// - the vectorization is 1-D, and
+/// - the step size of the loop is equal to one.
+///
+/// Compared to the non-vector-dimension case, two additional things are done
+/// during vectorization of such loops:
+/// - The resulting vector returned from the loop is reduced to a scalar using
+///   `vector.reduction`.
+/// - In some cases a mask is applied to the vector yielded at the end of the
+///   loop to prevent garbage values from being written to the accumulator.
+///
+/// Reduction vectorization is switched off by default; it can be enabled by
+/// passing a map from loops to reductions to utility functions, or by passing
+/// `vectorize-reductions=true` to the vectorization pass.
+///
+/// Consider the following example:
+/// ```mlir
+/// func @vecred(%in: memref<512xf32>) -> f32 {
+///   %cst = constant 0.000000e+00 : f32
+///   %sum = affine.for %i = 0 to 500 iter_args(%part_sum = %cst) -> (f32) {
+///     %ld = affine.load %in[%i] : memref<512xf32>
+///     %cos = math.cos %ld : f32
+///     %add = addf %part_sum, %cos : f32
+///     affine.yield %add : f32
+///   }
+///   return %sum : f32
+/// }
+/// ```
+///
+/// The -affine-super-vectorize pass with the following arguments:
+/// ```
+/// -affine-super-vectorize="virtual-vector-size=128 test-fastest-varying=0 \
+///                    vectorize-reductions=true"
+/// ```
+/// produces the following output:
+/// ```mlir
+/// #map = affine_map<(d0) -> (-d0 + 500)>
+/// func @vecred(%arg0: memref<512xf32>) -> f32 {
+///   %cst = constant 0.000000e+00 : f32
+///   %cst_0 = constant dense<0.000000e+00> : vector<128xf32>
+///   %0 = affine.for %arg1 = 0 to 500 step 128 iter_args(%arg2 = %cst_0)
+///           -> (vector<128xf32>) {
+///     // %2 is the number of iterations left in the original loop.
+///     %2 = affine.apply #map(%arg1)
+///     %3 = vector.create_mask %2 : vector<128xi1>
+///     %cst_1 = constant 0.000000e+00 : f32
+///     %4 = vector.transfer_read %arg0[%arg1], %cst_1 :
+///                     memref<512xf32>, vector<128xf32>
+///     %5 = math.cos %4 : vector<128xf32>
+///     %6 = addf %arg2, %5 : vector<128xf32>
+///     // We filter out the effect of the last 12 elements using the mask.
+///     %7 = select %3, %6, %arg2 : vector<128xi1>, vector<128xf32>
+///     affine.yield %7 : vector<128xf32>
+///   }
+///   %1 = vector.reduction "add", %0 : vector<128xf32> into f32
+///   return %1 : f32
+/// }
+/// ```
+///
+/// Note that because of loop misalignment we needed to apply a mask to prevent
+/// the last 12 elements from affecting the final result. The mask is full of ones
+/// in every iteration except for the last one, in which it has the form
+/// `11...100...0` with 116 ones and 12 zeros.
 
 #define DEBUG_TYPE "early-vect"
 
@@ -644,6 +712,17 @@ struct VectorizationState {
   void registerValueScalarReplacement(BlockArgument replaced,
                                       BlockArgument replacement);
 
+  /// Registers the scalar replacement of a scalar result returned from a
+  /// reduction loop. 'replacement' must be scalar.
+  ///
+  /// This utility is used to register the replacement for scalar results of
+  /// vectorized reduction loops with iter_args.
+  ///
+  /// Example 2:
+  ///   * 'replaced': %0 = affine.for %i = 0 to 512 iter_args(%x = ...) -> (f32)
+  ///   * 'replacement': %1 = vector.reduction "add" %0 : vector<4xf32> into f32
+  void registerLoopResultScalarReplacement(Value replaced, Value replacement);
+
   /// Returns in 'replacedVals' the scalar replacement for values in
   /// 'inputVals'.
   void getScalarValueReplacementsFor(ValueRange inputVals,
@@ -663,10 +742,16 @@ struct VectorizationState {
   // Maps input scalar values to their new scalar counterparts in the vector
   // loop nest.
   BlockAndValueMapping valueScalarReplacement;
+  // Maps results of reduction loops to their new scalar counterparts.
+  DenseMap<Value, Value> loopResultScalarReplacement;
 
   // Maps the newly created vector loops to their vector dimension.
   DenseMap<Operation *, unsigned> vecLoopToVecDim;
 
+  // Maps the new vectorized loops to the corresponding vector masks, if
+  // required.
+  DenseMap<Operation *, Value> vecLoopToMask;
+
   // The strategy drives which loop to vectorize by which amount.
   const VectorizationStrategy *strategy;
 
@@ -761,6 +846,26 @@ void VectorizationState::registerValueScalarReplacement(
   registerValueScalarReplacementImpl(replaced, replacement);
 }
 
+/// Registers the scalar replacement of a scalar result returned from a
+/// reduction loop. 'replacement' must be scalar.
+///
+/// This utility is used to register the replacement for scalar results of
+/// vectorized reduction loops with iter_args.
+///
+/// Example 2:
+///   * 'replaced': %0 = affine.for %i = 0 to 512 iter_args(%x = ...) -> (f32)
+///   * 'replacement': %1 = vector.reduction "add" %0 : vector<4xf32> into f32
+void VectorizationState::registerLoopResultScalarReplacement(
+    Value replaced, Value replacement) {
+  assert(isa<AffineForOp>(replaced.getDefiningOp()));
+  assert(loopResultScalarReplacement.count(replaced) == 0 &&
+         "already registered");
+  LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ will replace a result of the loop "
+                       "with scalar: "
+                    << replacement);
+  loopResultScalarReplacement[replaced] = replacement;
+}
+
 void VectorizationState::registerValueScalarReplacementImpl(Value replaced,
                                                             Value replacement) {
   assert(!valueScalarReplacement.contains(replaced) &&
@@ -850,6 +955,101 @@ static ConstantOp vectorizeConstant(ConstantOp constOp,
   return newConstOp;
 }
 
+/// Creates a constant vector filled with the neutral elements of the given
+/// reduction. The scalar type of vector elements will be taken from
+/// `oldOperand`.
+static ConstantOp createInitialVector(AtomicRMWKind reductionKind,
+                                      Value oldOperand,
+                                      VectorizationState &state) {
+  Type scalarTy = oldOperand.getType();
+  if (!VectorType::isValidElementType(scalarTy))
+    return nullptr;
+
+  Attribute valueAttr = getIdentityValueAttr(
+      reductionKind, scalarTy, state.builder, oldOperand.getLoc());
+  auto vecTy = getVectorType(scalarTy, state.strategy);
+  auto vecAttr = DenseElementsAttr::get(vecTy, valueAttr);
+  auto newConstOp =
+      state.builder.create<ConstantOp>(oldOperand.getLoc(), vecAttr);
+
+  return newConstOp;
+}
+
+/// Creates a mask used to filter out garbage elements in the last iteration
+/// of unaligned loops. If a mask is not required then `nullptr` is returned.
+/// The mask will be a vector of booleans representing meaningful vector
+/// elements in the current iteration. It is filled with ones for each iteration
+/// except for the last one, where it has the form `11...100...0` with the
+/// number of ones equal to the number of meaningful elements (i.e. the number
+/// of iterations that would be left in the original loop).
+static Value createMask(AffineForOp vecForOp, VectorizationState &state) {
+  assert(state.strategy->vectorSizes.size() == 1 &&
+         "Creating a mask non-1-D vectors is not supported.");
+  assert(vecForOp.getStep() == state.strategy->vectorSizes[0] &&
+         "Creating a mask for loops with non-unit original step size is not "
+         "supported.");
+
+  // Check if we have already created the mask.
+  if (Value mask = state.vecLoopToMask.lookup(vecForOp))
+    return mask;
+
+  // If the loop has constant bounds and the original number of iterations is
+  // divisible by the vector size then we don't need a mask.
+  if (vecForOp.hasConstantBounds()) {
+    int64_t originalTripCount =
+        vecForOp.getConstantUpperBound() - vecForOp.getConstantLowerBound();
+    if (originalTripCount % vecForOp.getStep() == 0)
+      return nullptr;
+  }
+
+  OpBuilder::InsertionGuard guard(state.builder);
+  state.builder.setInsertionPointToStart(vecForOp.getBody());
+
+  // We generate the mask using the `vector.create_mask` operation which accepts
+  // the number of meaningful elements (i.e. the length of the prefix of 1s).
+  // To compute the number of meaningful elements we subtract the current value
+  // of the iteration variable from the upper bound of the loop. Example:
+  //
+  //     // 500 is the upper bound of the loop
+  //     #map = affine_map<(d0) -> (500 - d0)>
+  //     %elems_left = affine.apply #map(%iv)
+  //     %mask = vector.create_mask %elems_left : vector<128xi1>
+
+  Location loc = vecForOp.getLoc();
+
+  // First we get the upper bound of the loop using `affine.apply` or
+  // `affine.min`.
+  AffineMap ubMap = vecForOp.getUpperBoundMap();
+  Value ub;
+  if (ubMap.getNumResults() == 1)
+    ub = state.builder.create<AffineApplyOp>(loc, vecForOp.getUpperBoundMap(),
+                                             vecForOp.getUpperBoundOperands());
+  else
+    ub = state.builder.create<AffineMinOp>(loc, vecForOp.getUpperBoundMap(),
+                                           vecForOp.getUpperBoundOperands());
+  // Then we compute the number of (original) iterations left in the loop.
+  AffineExpr subExpr =
+      state.builder.getAffineDimExpr(0) - state.builder.getAffineDimExpr(1);
+  Value itersLeft =
+      makeComposedAffineApply(state.builder, loc, AffineMap::get(2, 0, subExpr),
+                              {ub, vecForOp.getInductionVar()});
+  // If the affine maps were successfully composed then `ub` is unneeded.
+  if (ub.use_empty())
+    ub.getDefiningOp()->erase();
+  // Finally we create the mask.
+  Type maskTy = VectorType::get(state.strategy->vectorSizes,
+                                state.builder.getIntegerType(1));
+  Value mask =
+      state.builder.create<vector::CreateMaskOp>(loc, maskTy, itersLeft);
+
+  LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ creating a mask:\n"
+                    << itersLeft << "\n"
+                    << mask << "\n");
+
+  state.vecLoopToMask[vecForOp] = mask;
+  return mask;
+}
+
 /// Returns true if the provided value is vector uniform given the vectorization
 /// strategy.
 // TODO: For now, only values that are invariants to all the loops in the
@@ -1023,26 +1223,40 @@ static Operation *vectorizeAffineStore(AffineStoreOp storeOp,
   return transfer;
 }
 
+/// Returns true if `value` is a constant equal to the neutral element of the
+/// given vectorizable reduction.
+static bool isNeutralElementConst(AtomicRMWKind reductionKind, Value value,
+                                  VectorizationState &state) {
+  Type scalarTy = value.getType();
+  if (!VectorType::isValidElementType(scalarTy))
+    return false;
+  Attribute valueAttr = getIdentityValueAttr(reductionKind, scalarTy,
+                                             state.builder, value.getLoc());
+  if (auto constOp = dyn_cast_or_null<ConstantOp>(value.getDefiningOp()))
+    return constOp.value() == valueAttr;
+  return false;
+}
+
 /// Vectorizes a loop with the vectorization strategy in 'state'. A new loop is
 /// created and registered as replacement for the scalar loop. The builder's
 /// insertion point is set to the new loop's body so that subsequent vectorized
 /// operations are inserted into the new loop. If the loop is a vector
 /// dimension, the step of the newly created loop will reflect the vectorization
 /// factor used to vectorized that dimension.
-// TODO: Add support for 'iter_args'. Related operands and results will be
-// vectorized at this point.
 static Operation *vectorizeAffineForOp(AffineForOp forOp,
                                        VectorizationState &state) {
   const VectorizationStrategy &strategy = *state.strategy;
   auto loopToVecDimIt = strategy.loopToVectorDim.find(forOp);
   bool isLoopVecDim = loopToVecDimIt != strategy.loopToVectorDim.end();
 
-  // We only support 'iter_args' when the loop is not one of the vector
-  // dimensions.
-  // TODO: Support vector dimension loops. They require special handling:
-  // generate horizontal reduction, last-value extraction, etc.
-  if (forOp.getNumIterOperands() > 0 && isLoopVecDim)
+  // TODO: Vectorization of reduction loops is not supported for non-unit steps.
+  if (isLoopVecDim && forOp.getNumIterOperands() > 0 && forOp.getStep() != 1) {
+    LLVM_DEBUG(
+        dbgs()
+        << "\n[early-vect]+++++ unsupported step size for reduction loop: "
+        << forOp.getStep() << "\n");
     return nullptr;
+  }
 
   // If we are vectorizing a vector dimension, compute a new step for the new
   // vectorized loop using the vectorization factor for the vector dimension.
@@ -1057,10 +1271,31 @@ static Operation *vectorizeAffineForOp(AffineForOp forOp,
     newStep = forOp.getStep();
   }
 
+  // Get information about reduction kinds.
+  ArrayRef<LoopReduction> reductions;
+  if (isLoopVecDim && forOp.getNumIterOperands() > 0) {
+    auto it = strategy.reductionLoops.find(forOp);
+    assert(it != strategy.reductionLoops.end() &&
+           "Reduction descriptors not found when vectorizing a reduction loop");
+    reductions = it->second;
+    assert(reductions.size() == forOp.getNumIterOperands() &&
+           "The size of reductions array must match the number of iter_args");
+  }
+
   // Vectorize 'iter_args'.
   SmallVector<Value, 8> vecIterOperands;
-  for (auto operand : forOp.getIterOperands())
-    vecIterOperands.push_back(vectorizeOperand(operand, state));
+  if (!isLoopVecDim) {
+    for (auto operand : forOp.getIterOperands())
+      vecIterOperands.push_back(vectorizeOperand(operand, state));
+  } else {
+    // For reduction loops we need to pass a vector of neutral elements as an
+    // initial value of the accumulator. We will add the original initial value
+    // later.
+    for (auto redAndOperand : llvm::zip(reductions, forOp.getIterOperands())) {
+      vecIterOperands.push_back(createInitialVector(
+          std::get<0>(redAndOperand).kind, std::get<1>(redAndOperand), state));
+    }
+  }
 
   auto vecForOp = state.builder.create<AffineForOp>(
       forOp.getLoc(), forOp.getLowerBoundOperands(), forOp.getLowerBoundMap(),
@@ -1075,13 +1310,16 @@ static Operation *vectorizeAffineForOp(AffineForOp forOp,
   // Register loop-related replacements:
   //   1) The new vectorized loop is registered as vector replacement of the
   //      scalar loop.
-  //      TODO: Support reductions along the vector dimension.
   //   2) The new iv of the vectorized loop is registered as scalar replacement
   //      since a scalar copy of the iv will prevail in the vectorized loop.
   //      TODO: A vector replacement will also be added in the future when
   //      vectorization of linear ops is supported.
   //   3) The new 'iter_args' region arguments are registered as vector
   //      replacements since they have been vectorized.
+  //   4) If the loop performs a reduction along the vector dimension, a
+  //      `vector.reduction` or similar op is inserted for each resulting value
+  //      of the loop and its scalar value replaces the corresponding scalar
+  //      result of the loop.
   state.registerOpVectorReplacement(forOp, vecForOp);
   state.registerValueScalarReplacement(forOp.getInductionVar(),
                                        vecForOp.getInductionVar());
@@ -1090,12 +1328,37 @@ static Operation *vectorizeAffineForOp(AffineForOp forOp,
     state.registerBlockArgVectorReplacement(std::get<0>(iterTuple),
                                             std::get<1>(iterTuple));
 
+  if (isLoopVecDim) {
+    for (unsigned i = 0; i < vecForOp.getNumIterOperands(); ++i) {
+      // First, we reduce the vector returned from the loop into a scalar.
+      Value reducedRes =
+          getVectorReductionOp(reductions[i].kind, state.builder,
+                               vecForOp.getLoc(), vecForOp.getResult(i));
+      LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ creating a vector reduction: "
+                        << reducedRes);
+      // Then we combine it with the original (scalar) initial value unless it
+      // is equal to the neutral element of the reduction.
+      Value origInit = forOp.getOperand(forOp.getNumControlOperands() + i);
+      Value finalRes = reducedRes;
+      if (!isNeutralElementConst(reductions[i].kind, origInit, state))
+        finalRes = getReductionOp(reductions[i].kind, state.builder,
+                                  reducedRes.getLoc(), reducedRes, origInit);
+      state.registerLoopResultScalarReplacement(forOp.getResult(i), finalRes);
+    }
+  }
+
   if (isLoopVecDim)
     state.vecLoopToVecDim[vecForOp] = loopToVecDimIt->second;
 
   // Change insertion point so that upcoming vectorized instructions are
   // inserted into the vectorized loop's body.
   state.builder.setInsertionPointToStart(vecForOp.getBody());
+
+  // If this is a reduction loop then we may need to create a mask to filter out
+  // garbage in the last iteration.
+  if (isLoopVecDim && forOp.getNumIterOperands() > 0)
+    createMask(vecForOp, state);
+
   return vecForOp;
 }
 
@@ -1133,11 +1396,35 @@ static Operation *widenOp(Operation *op, VectorizationState &state) {
 
 /// Vectorizes a yield operation by widening its types. The builder's insertion
 /// point is set after the vectorized parent op to continue vectorizing the
-/// operations after the parent op.
+/// operations after the parent op. When vectorizing a reduction loop a mask may
+/// be used to prevent adding garbage values to the accumulator.
 static Operation *vectorizeAffineYieldOp(AffineYieldOp yieldOp,
                                          VectorizationState &state) {
   Operation *newYieldOp = widenOp(yieldOp, state);
   Operation *newParentOp = state.builder.getInsertionBlock()->getParentOp();
+
+  // If there is a mask for this loop then we must prevent garbage values from
+  // being added to the accumulator by inserting `select` operations, for
+  // example:
+  //
+  //   %res = addf %acc, %val : vector<128xf32>
+  //   %res_masked = select %mask, %res, %acc : vector<128xi1>, vector<128xf32>
+  //   affine.yield %res_masked : vector<128xf32>
+  //
+  if (Value mask = state.vecLoopToMask.lookup(newParentOp)) {
+    state.builder.setInsertionPoint(newYieldOp);
+    for (unsigned i = 0; i < newYieldOp->getNumOperands(); ++i) {
+      Value result = newYieldOp->getOperand(i);
+      Value iterArg = cast<AffineForOp>(newParentOp).getRegionIterArgs()[i];
+      Value maskedResult = state.builder.create<SelectOp>(result.getLoc(), mask,
+                                                          result, iterArg);
+      LLVM_DEBUG(
+          dbgs() << "\n[early-vect]+++++ masking a yielded vector value: "
+                 << maskedResult);
+      newYieldOp->setOperand(i, maskedResult);
+    }
+  }
+
   state.builder.setInsertionPointAfter(newParentOp);
   return newYieldOp;
 }
@@ -1241,8 +1528,12 @@ vectorizeLoopNest(std::vector<SmallVector<AffineForOp, 2>> &loops,
   auto opVecResult = rootLoop.walk<WalkOrder::PreOrder>([&](Operation *op) {
     LLVM_DEBUG(dbgs() << "[early-vect]+++++ Vectorizing: " << *op);
     Operation *vectorOp = vectorizeOneOperation(op, state);
-    if (!vectorOp)
+    if (!vectorOp) {
+      LLVM_DEBUG(
+          dbgs() << "[early-vect]+++++ failed vectorizing the operation: "
+                 << *op << "\n");
       return WalkResult::interrupt();
+    }
 
     return WalkResult::advance();
   });
@@ -1258,6 +1549,11 @@ vectorizeLoopNest(std::vector<SmallVector<AffineForOp, 2>> &loops,
     return failure();
   }
 
+  // Replace results of reduction loops with the scalar values computed using
+  // `vector.reduction` or similar ops.
+  for (auto resPair : state.loopResultScalarReplacement)
+    resPair.first.replaceAllUsesWith(resPair.second);
+
   assert(state.opVectorReplacement.count(rootLoop) == 1 &&
          "Expected vector replacement for loop nest");
   LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ success vectorizing pattern");
@@ -1325,10 +1621,15 @@ static void computeIntersectionBuckets(
 /// vectorization factors in 'vectorSizes'. By default, each vectorization
 /// factor is applied inner-to-outer to the loops of each loop nest.
 /// 'fastestVaryingPattern' can be optionally used to provide a different loop
-/// vectorization order.
+/// vectorization order. `reductionLoops` can be provided to specify loops which
+/// can be vectorized along the reduction dimension.
 static void vectorizeLoops(Operation *parentOp, DenseSet<Operation *> &loops,
                            ArrayRef<int64_t> vectorSizes,
-                           ArrayRef<int64_t> fastestVaryingPattern) {
+                           ArrayRef<int64_t> fastestVaryingPattern,
+                           const ReductionLoopMap &reductionLoops) {
+  assert((reductionLoops.empty() || vectorSizes.size() == 1) &&
+         "Vectorizing reductions is supported only for 1-D vectors");
+
   // Compute 1-D, 2-D or 3-D loop pattern to be matched on the target loops.
   Optional<NestedPattern> pattern =
       makePattern(loops, vectorSizes.size(), fastestVaryingPattern);
@@ -1359,6 +1660,7 @@ static void vectorizeLoops(Operation *parentOp, DenseSet<Operation *> &loops,
       VectorizationStrategy strategy;
       // TODO: depending on profitability, elect to reduce the vector size.
       strategy.vectorSizes.assign(vectorSizes.begin(), vectorSizes.end());
+      strategy.reductionLoops = reductionLoops;
       if (failed(analyzeProfitability(match.getMatchedChildren(), 1,
                                       patternDepth, &strategy))) {
         continue;
@@ -1396,15 +1698,37 @@ void Vectorize::runOnFunction() {
     return signalPassFailure();
   }
 
+  if (vectorizeReductions && vectorSizes.size() != 1) {
+    f.emitError("Vectorizing reductions is supported only for 1-D vectors.");
+    return signalPassFailure();
+  }
+
   DenseSet<Operation *> parallelLoops;
-  f.walk([&parallelLoops](AffineForOp loop) {
-    if (isLoopParallel(loop))
-      parallelLoops.insert(loop);
-  });
+  ReductionLoopMap reductionLoops;
+
+  // If 'vectorize-reductions=true' is provided, we also populate the
+  // `reductionLoops` map.
+  if (vectorizeReductions) {
+    f.walk([&parallelLoops, &reductionLoops](AffineForOp loop) {
+      SmallVector<LoopReduction, 2> reductions;
+      if (isLoopParallel(loop, &reductions)) {
+        parallelLoops.insert(loop);
+        // If it's not a reduction loop, adding it to the map is not necessary.
+        if (!reductions.empty())
+          reductionLoops[loop] = reductions;
+      }
+    });
+  } else {
+    f.walk([&parallelLoops](AffineForOp loop) {
+      if (isLoopParallel(loop))
+        parallelLoops.insert(loop);
+    });
+  }
 
   // Thread-safe RAII local context, BumpPtrAllocator freed on exit.
   NestedPatternContext mlContext;
-  vectorizeLoops(f, parallelLoops, vectorSizes, fastestVaryingPattern);
+  vectorizeLoops(f, parallelLoops, vectorSizes, fastestVaryingPattern,
+                 reductionLoops);
 }
 
 /// Verify that affine loops in 'loops' meet the nesting criteria expected by
@@ -1452,12 +1776,17 @@ namespace mlir {
 /// factor is applied inner-to-outer to the loops of each loop nest.
 /// 'fastestVaryingPattern' can be optionally used to provide a different loop
 /// vectorization order.
+/// If `reductionLoops` is not empty, the given reduction loops may be
+/// vectorized along the reduction dimension.
+/// TODO: Vectorizing reductions is supported only for 1-D vectorization.
 void vectorizeAffineLoops(Operation *parentOp, DenseSet<Operation *> &loops,
                           ArrayRef<int64_t> vectorSizes,
-                          ArrayRef<int64_t> fastestVaryingPattern) {
+                          ArrayRef<int64_t> fastestVaryingPattern,
+                          const ReductionLoopMap &reductionLoops) {
   // Thread-safe RAII local context, BumpPtrAllocator freed on exit.
   NestedPatternContext mlContext;
-  vectorizeLoops(parentOp, loops, vectorSizes, fastestVaryingPattern);
+  vectorizeLoops(parentOp, loops, vectorSizes, fastestVaryingPattern,
+                 reductionLoops);
 }
 
 /// External utility to vectorize affine loops from a single loop nest using an

diff  --git a/mlir/lib/Dialect/StandardOps/IR/Ops.cpp b/mlir/lib/Dialect/StandardOps/IR/Ops.cpp
index 04464af904c09..32a9ff93c30a1 100644
--- a/mlir/lib/Dialect/StandardOps/IR/Ops.cpp
+++ b/mlir/lib/Dialect/StandardOps/IR/Ops.cpp
@@ -408,6 +408,53 @@ static LogicalResult verify(AtomicRMWOp op) {
   return success();
 }
 
+/// Returns the identity value attribute associated with an AtomicRMWKind op.
+Attribute mlir::getIdentityValueAttr(AtomicRMWKind kind, Type resultType,
+                                     OpBuilder &builder, Location loc) {
+  switch (kind) {
+  case AtomicRMWKind::addf:
+  case AtomicRMWKind::addi:
+    return builder.getZeroAttr(resultType);
+  case AtomicRMWKind::muli:
+    return builder.getIntegerAttr(resultType, 1);
+  case AtomicRMWKind::mulf:
+    return builder.getFloatAttr(resultType, 1);
+  // TODO: Add remaining reduction operations.
+  default:
+    (void)emitOptionalError(loc, "Reduction operation type not supported");
+    break;
+  }
+  return nullptr;
+}
+
+/// Returns the identity value associated with an AtomicRMWKind op.
+Value mlir::getIdentityValue(AtomicRMWKind op, Type resultType,
+                             OpBuilder &builder, Location loc) {
+  Attribute attr = getIdentityValueAttr(op, resultType, builder, loc);
+  return builder.create<ConstantOp>(loc, attr);
+}
+
+/// Return the value obtained by applying the reduction operation kind
+/// associated with a binary AtomicRMWKind op to `lhs` and `rhs`.
+Value mlir::getReductionOp(AtomicRMWKind op, OpBuilder &builder, Location loc,
+                           Value lhs, Value rhs) {
+  switch (op) {
+  case AtomicRMWKind::addf:
+    return builder.create<AddFOp>(loc, lhs, rhs);
+  case AtomicRMWKind::addi:
+    return builder.create<AddIOp>(loc, lhs, rhs);
+  case AtomicRMWKind::mulf:
+    return builder.create<MulFOp>(loc, lhs, rhs);
+  case AtomicRMWKind::muli:
+    return builder.create<MulIOp>(loc, lhs, rhs);
+  // TODO: Add remaining reduction operations.
+  default:
+    (void)emitOptionalError(loc, "Reduction operation type not supported");
+    break;
+  }
+  return nullptr;
+}
+
 //===----------------------------------------------------------------------===//
 // GenericAtomicRMWOp
 //===----------------------------------------------------------------------===//

diff  --git a/mlir/lib/Dialect/Vector/VectorOps.cpp b/mlir/lib/Dialect/Vector/VectorOps.cpp
index 5190482f576b9..0088a3e6c9a49 100644
--- a/mlir/lib/Dialect/Vector/VectorOps.cpp
+++ b/mlir/lib/Dialect/Vector/VectorOps.cpp
@@ -333,6 +333,28 @@ static void print(OpAsmPrinter &p, ReductionOp op) {
   p << " : " << op.vector().getType() << " into " << op.dest().getType();
 }
 
+Value mlir::vector::getVectorReductionOp(AtomicRMWKind op, OpBuilder &builder,
+                                         Location loc, Value vector) {
+  Type scalarType = vector.getType().cast<ShapedType>().getElementType();
+  switch (op) {
+  case AtomicRMWKind::addf:
+  case AtomicRMWKind::addi:
+    return builder.create<vector::ReductionOp>(vector.getLoc(), scalarType,
+                                               builder.getStringAttr("add"),
+                                               vector, ValueRange{});
+  case AtomicRMWKind::mulf:
+  case AtomicRMWKind::muli:
+    return builder.create<vector::ReductionOp>(vector.getLoc(), scalarType,
+                                               builder.getStringAttr("mul"),
+                                               vector, ValueRange{});
+  // TODO: Add remaining reduction operations.
+  default:
+    (void)emitOptionalError(loc, "Reduction operation type not supported");
+    break;
+  }
+  return nullptr;
+}
+
 //===----------------------------------------------------------------------===//
 // ContractionOp
 //===----------------------------------------------------------------------===//

diff  --git a/mlir/test/Dialect/Affine/SuperVectorize/vectorize_1d.mlir b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_1d.mlir
index 09ee6d5136110..47e00f1faf4a5 100644
--- a/mlir/test/Dialect/Affine/SuperVectorize/vectorize_1d.mlir
+++ b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_1d.mlir
@@ -590,3 +590,24 @@ func @vec_no_vecdim_last_value(%in: memref<128x256xf32>, %out: memref<256xf32>)
 // CHECK:         }
 // CHECK:         vector.transfer_write %[[last_val]], %{{.*}} : vector<128xf32>, memref<256xf32>
 // CHECK:       }
+
+// -----
+
+// The inner reduction loop '%j' is not vectorized if we do not request
+// reduction vectorization.
+
+func @vec_vecdim_reduction_rejected(%in: memref<256x512xf32>, %out: memref<256xf32>) {
+ %cst = constant 0.000000e+00 : f32
+ affine.for %i = 0 to 256 {
+   %final_red = affine.for %j = 0 to 512 iter_args(%red_iter = %cst) -> (f32) {
+     %ld = affine.load %in[%i, %j] : memref<256x512xf32>
+     %add = addf %red_iter, %ld : f32
+     affine.yield %add : f32
+   }
+   affine.store %final_red, %out[%i] : memref<256xf32>
+ }
+ return
+}
+
+// CHECK-LABEL: @vec_vecdim_reduction_rejected
+// CHECK-NOT: vector

diff  --git a/mlir/test/Dialect/Affine/SuperVectorize/vectorize_reduction.mlir b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_reduction.mlir
new file mode 100644
index 0000000000000..0670c91b205f9
--- /dev/null
+++ b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_reduction.mlir
@@ -0,0 +1,468 @@
+// RUN: mlir-opt %s -affine-super-vectorize="virtual-vector-size=128 test-fastest-varying=0 vectorize-reductions=true" -split-input-file | FileCheck %s
+
+// The inner reduction loop '%j' is vectorized.
+
+func @vecdim_reduction(%in: memref<256x512xf32>, %out: memref<256xf32>) {
+ %cst = constant 0.000000e+00 : f32
+ affine.for %i = 0 to 256 {
+   %final_red = affine.for %j = 0 to 512 iter_args(%red_iter = %cst) -> (f32) {
+     %ld = affine.load %in[%i, %j] : memref<256x512xf32>
+     %add = addf %red_iter, %ld : f32
+     affine.yield %add : f32
+   }
+   affine.store %final_red, %out[%i] : memref<256xf32>
+ }
+ return
+}
+
+// CHECK-LABEL: @vecdim_reduction
+// CHECK:       affine.for %{{.*}} = 0 to 256 {
+// CHECK:         %[[vzero:.*]] = constant dense<0.000000e+00> : vector<128xf32>
+// CHECK:         %[[vred:.*]] = affine.for %{{.*}} = 0 to 512 step 128 iter_args(%[[red_iter:.*]] = %[[vzero]]) -> (vector<128xf32>) {
+// CHECK:           %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xf32>, vector<128xf32>
+// CHECK:           %[[add:.*]] = addf %[[red_iter]], %[[ld]] : vector<128xf32>
+// CHECK:           affine.yield %[[add]] : vector<128xf32>
+// CHECK:         }
+// CHECK:         %[[final_sum:.*]] = vector.reduction "add", %[[vred:.*]] : vector<128xf32> into f32
+// CHECK:         affine.store %[[final_sum]], %{{.*}} : memref<256xf32>
+// CHECK:       }
+
+// -----
+
+// The inner reduction loop '%j' is vectorized. (The order of addf's operands is
+// different than in the previous test case).
+
+func @vecdim_reduction_comm(%in: memref<256x512xf32>, %out: memref<256xf32>) {
+ %cst = constant 0.000000e+00 : f32
+ affine.for %i = 0 to 256 {
+   %final_red = affine.for %j = 0 to 512 iter_args(%red_iter = %cst) -> (f32) {
+     %ld = affine.load %in[%i, %j] : memref<256x512xf32>
+     %add = addf %ld, %red_iter : f32
+     affine.yield %add : f32
+   }
+   affine.store %final_red, %out[%i] : memref<256xf32>
+ }
+ return
+}
+
+// CHECK-LABEL: @vecdim_reduction_comm
+// CHECK:       affine.for %{{.*}} = 0 to 256 {
+// CHECK:         %[[vzero:.*]] = constant dense<0.000000e+00> : vector<128xf32>
+// CHECK:         %[[vred:.*]] = affine.for %{{.*}} = 0 to 512 step 128 iter_args(%[[red_iter:.*]] = %[[vzero]]) -> (vector<128xf32>) {
+// CHECK:           %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xf32>, vector<128xf32>
+// CHECK:           %[[add:.*]] = addf %[[ld]], %[[red_iter]] : vector<128xf32>
+// CHECK:           affine.yield %[[add]] : vector<128xf32>
+// CHECK:         }
+// CHECK:         %[[final_sum:.*]] = vector.reduction "add", %[[vred:.*]] : vector<128xf32> into f32
+// CHECK:         affine.store %[[final_sum]], %{{.*}} : memref<256xf32>
+// CHECK:       }
+
+// -----
+
+// The inner reduction loop '%j' is vectorized. Transforming the input before
+// performing the accumulation doesn't cause any problem.
+
+func @vecdim_reduction_expsin(%in: memref<256x512xf32>, %out: memref<256xf32>) {
+ %cst = constant 0.000000e+00 : f32
+ affine.for %i = 0 to 256 {
+   %final_red = affine.for %j = 0 to 512 iter_args(%red_iter = %cst) -> (f32) {
+     %ld = affine.load %in[%i, %j] : memref<256x512xf32>
+     %sin = math.sin %ld : f32
+     %exp = math.exp %sin : f32
+     %add = addf %red_iter, %exp : f32
+     affine.yield %add : f32
+   }
+   affine.store %final_red, %out[%i] : memref<256xf32>
+ }
+ return
+}
+
+// CHECK-LABEL: @vecdim_reduction_expsin
+// CHECK:       affine.for %{{.*}} = 0 to 256 {
+// CHECK:         %[[vzero:.*]] = constant dense<0.000000e+00> : vector<128xf32>
+// CHECK:         %[[vred:.*]] = affine.for %{{.*}} = 0 to 512 step 128 iter_args(%[[red_iter:.*]] = %[[vzero]]) -> (vector<128xf32>) {
+// CHECK:           %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xf32>, vector<128xf32>
+// CHECK:           %[[sin:.*]] = math.sin %[[ld]]
+// CHECK:           %[[exp:.*]] = math.exp %[[sin]]
+// CHECK:           %[[add:.*]] = addf %[[red_iter]], %[[exp]] : vector<128xf32>
+// CHECK:           affine.yield %[[add]] : vector<128xf32>
+// CHECK:         }
+// CHECK:         %[[final_sum:.*]] = vector.reduction "add", %[[vred:.*]] : vector<128xf32> into f32
+// CHECK:         affine.store %[[final_sum]], %{{.*}} : memref<256xf32>
+// CHECK:       }
+
+// -----
+
+// Two reductions at the same time. The inner reduction loop '%j' is vectorized.
+
+func @two_vecdim_reductions(%in: memref<256x512xf32>, %out_sum: memref<256xf32>, %out_prod: memref<256xf32>) {
+ %cst = constant 1.000000e+00 : f32
+ affine.for %i = 0 to 256 {
+   // Note that we pass the same constant '1.0' as initial values for both
+   // reductions.
+   %sum, %prod = affine.for %j = 0 to 512 iter_args(%part_sum = %cst, %part_prod = %cst) -> (f32, f32) {
+     %ld = affine.load %in[%i, %j] : memref<256x512xf32>
+     %add = addf %part_sum, %ld : f32
+     %mul = mulf %part_prod, %ld : f32
+     affine.yield %add, %mul : f32, f32
+   }
+   affine.store %sum, %out_sum[%i] : memref<256xf32>
+   affine.store %prod, %out_prod[%i] : memref<256xf32>
+ }
+ return
+}
+
+// CHECK-LABEL: @two_vecdim_reductions
+// CHECK:       %[[cst:.*]] = constant 1.000000e+00 : f32
+// CHECK:       affine.for %{{.*}} = 0 to 256 {
+// CHECK:         %[[vzero:.*]] = constant dense<0.000000e+00> : vector<128xf32>
+// CHECK:         %[[vone:.*]] = constant dense<1.000000e+00> : vector<128xf32>
+// CHECK:         %[[vred:.*]]:2 = affine.for %{{.*}} = 0 to 512 step 128 iter_args(%[[part_sum:.*]] = %[[vzero]], %[[part_prod:.*]] = %[[vone]]) -> (vector<128xf32>, vector<128xf32>) {
+// CHECK:           %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xf32>, vector<128xf32>
+// CHECK:           %[[add:.*]] = addf %[[part_sum]], %[[ld]] : vector<128xf32>
+// CHECK:           %[[mul:.*]] = mulf %[[part_prod]], %[[ld]] : vector<128xf32>
+// CHECK:           affine.yield %[[add]], %[[mul]] : vector<128xf32>, vector<128xf32>
+// CHECK:         }
+// CHECK:         %[[nonfinal_sum:.*]] = vector.reduction "add", %[[vred:.*]]#0 : vector<128xf32> into f32
+// Note that to compute the final sum we need to add the original initial value
+// (%cst) since it is not zero.
+// CHECK:         %[[final_sum:.*]] = addf %[[nonfinal_sum]], %[[cst]] : f32
+// For the final product we don't need to do this additional step because the
+// initial value equals 1 (the neutral element for multiplication).
+// CHECK:         %[[final_prod:.*]] = vector.reduction "mul", %[[vred:.*]]#1 : vector<128xf32> into f32
+// CHECK:         affine.store %[[final_sum]], %{{.*}} : memref<256xf32>
+// CHECK:         affine.store %[[final_prod]], %{{.*}} : memref<256xf32>
+// CHECK:       }
+
+// -----
+
+// The integer case.
+
+func @two_vecdim_reductions_int(%in: memref<256x512xi64>, %out_sum: memref<256xi64>, %out_prod: memref<256xi64>) {
+ %cst0 = constant 0 : i64
+ %cst1 = constant 1 : i64
+ affine.for %i = 0 to 256 {
+   %sum, %prod = affine.for %j = 0 to 512 iter_args(%part_sum = %cst0, %part_prod = %cst1) -> (i64, i64) {
+     %ld = affine.load %in[%i, %j] : memref<256x512xi64>
+     %add = addi %part_sum, %ld : i64
+     %mul = muli %part_prod, %ld : i64
+     affine.yield %add, %mul : i64, i64
+   }
+   affine.store %sum, %out_sum[%i] : memref<256xi64>
+   affine.store %prod, %out_prod[%i] : memref<256xi64>
+ }
+ return
+}
+
+// CHECK-LABEL: @two_vecdim_reductions
+// CHECK:       affine.for %{{.*}} = 0 to 256 {
+// CHECK:         %[[vzero:.*]] = constant dense<0> : vector<128xi64>
+// CHECK:         %[[vone:.*]] = constant dense<1> : vector<128xi64>
+// CHECK:         %[[vred:.*]]:2 = affine.for %{{.*}} = 0 to 512 step 128 iter_args(%[[part_sum:.*]] = %[[vzero]], %[[part_prod:.*]] = %[[vone]]) -> (vector<128xi64>, vector<128xi64>) {
+// CHECK:           %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xi64>, vector<128xi64>
+// CHECK:           %[[add:.*]] = addi %[[part_sum]], %[[ld]] : vector<128xi64>
+// CHECK:           %[[mul:.*]] = muli %[[part_prod]], %[[ld]] : vector<128xi64>
+// CHECK:           affine.yield %[[add]], %[[mul]] : vector<128xi64>, vector<128xi64>
+// CHECK:         }
+// CHECK:         %[[final_sum:.*]] = vector.reduction "add", %[[vred:.*]]#0 : vector<128xi64> into i64
+// CHECK:         %[[final_prod:.*]] = vector.reduction "mul", %[[vred:.*]]#1 : vector<128xi64> into i64
+// CHECK:         affine.store %[[final_sum]], %{{.*}} : memref<256xi64>
+// CHECK:         affine.store %[[final_prod]], %{{.*}} : memref<256xi64>
+// CHECK:       }
+
+// -----
+
+// The outer reduction loop '%j' is vectorized.
+
+func @vecdim_reduction_nested(%in: memref<256x512xf32>, %out: memref<1xf32>) {
+ %cst = constant 0.000000e+00 : f32
+ %outer_red = affine.for %j = 0 to 512 iter_args(%outer_iter = %cst) -> (f32) {
+   %inner_red = affine.for %i = 0 to 256 iter_args(%inner_iter = %cst) -> (f32) {
+     %ld = affine.load %in[%i, %j] : memref<256x512xf32>
+     %add = addf %inner_iter, %ld : f32
+     affine.yield %add : f32
+   }
+   %outer_add = addf %outer_iter, %inner_red : f32
+   affine.yield %outer_add : f32
+ }
+ affine.store %outer_red, %out[0] : memref<1xf32>
+ return
+}
+
+// CHECK-LABEL: @vecdim_reduction_nested
+// CHECK:       %[[vzero:.*]] = constant dense<0.000000e+00> : vector<128xf32>
+// CHECK:       %[[outer_red:.*]] = affine.for %{{.*}} = 0 to 512 step 128 iter_args(%[[outer_iter:.*]] = %[[vzero]]) -> (vector<128xf32>) {
+// CHECK:         %[[vzero:.*]] = constant dense<0.000000e+00> : vector<128xf32>
+// CHECK:         %[[inner_red:.*]] = affine.for %{{.*}} = 0 to 256 iter_args(%[[inner_iter:.*]] = %[[vzero]]) -> (vector<128xf32>) {
+// CHECK:           %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xf32>, vector<128xf32>
+// CHECK:           %[[add:.*]] = addf %[[inner_iter]], %[[ld]] : vector<128xf32>
+// CHECK:           affine.yield %[[add]] : vector<128xf32>
+// CHECK:         }
+// CHECK:         %[[outer_add:.*]] = addf %[[outer_iter]], %[[inner_red]] : vector<128xf32>
+// CHECK:         affine.yield %[[outer_add]] : vector<128xf32>
+// CHECK:       }
+// CHECK:       %[[final_sum:.*]] = vector.reduction "add", %[[outer_red:.*]] : vector<128xf32> into f32
+// CHECK:       affine.store %[[final_sum]], %{{.*}} : memref<1xf32>
+
+// -----
+
+// The inner reduction loop '%j' computes partial sums as a side effect and
+// is not vectorized.
+
+func @vecdim_partial_sums_1_rejected(%in: memref<256x512xf32>, %out_sum: memref<256xf32>, %out_prod: memref<256xf32>, %out_partsum: memref<256x512xf32>) {
+ %cst = constant 1.000000e+00 : f32
+ affine.for %i = 0 to 256 {
+   %sum, %prod = affine.for %j = 0 to 512 iter_args(%part_sum = %cst, %part_prod = %cst) -> (f32, f32) {
+     %ld = affine.load %in[%i, %j] : memref<256x512xf32>
+     %add = addf %part_sum, %ld : f32
+     %mul = mulf %part_prod, %ld : f32
+     affine.store %add, %out_partsum[%i, %j] : memref<256x512xf32>
+     affine.yield %add, %mul : f32, f32
+   }
+   affine.store %sum, %out_sum[%i] : memref<256xf32>
+   affine.store %prod, %out_prod[%i] : memref<256xf32>
+ }
+ return
+}
+
+// CHECK-LABEL: @vecdim_partial_sums_1_rejected
+// CHECK-NOT:   vector
+
+// -----
+
+// The inner reduction loop '%j' computes partial sums as a side effect and
+// is not vectorized.
+
+func @vecdim_partial_sums_2_rejected(%in: memref<256x512xf32>, %out_sum: memref<256xf32>, %out_prod: memref<256xf32>, %out_partsum: memref<256x512xf32>) {
+ %cst = constant 1.000000e+00 : f32
+ affine.for %i = 0 to 256 {
+   %sum, %prod = affine.for %j = 0 to 512 iter_args(%part_sum = %cst, %part_prod = %cst) -> (f32, f32) {
+     affine.store %part_sum, %out_partsum[%i, %j] : memref<256x512xf32>
+     %ld = affine.load %in[%i, %j] : memref<256x512xf32>
+     %add = addf %part_sum, %ld : f32
+     %mul = mulf %part_prod, %ld : f32
+     affine.yield %add, %mul : f32, f32
+   }
+   affine.store %sum, %out_sum[%i] : memref<256xf32>
+   affine.store %prod, %out_prod[%i] : memref<256xf32>
+ }
+ return
+}
+
+// CHECK-LABEL: @vecdim_partial_sums_2_rejected
+// CHECK-NOT:   vector
+
+// -----
+
+// The inner reduction loop '%j' performs an unknown reduction operation and is
+// not vectorized.
+
+func @vecdim_unknown_reduction_rejected(%in: memref<256x512xf32>, %out: memref<256xf32>) {
+ %cst = constant 1.000000e+00 : f32
+ %final_red = affine.for %j = 0 to 512 iter_args(%red_iter = %cst) -> (f32) {
+   %add = addf %red_iter, %red_iter : f32
+   affine.yield %add : f32
+ }
+ affine.store %final_red, %out[0] : memref<256xf32>
+ return
+}
+
+// CHECK-LABEL: @vecdim_unknown_reduction_rejected
+// CHECK-NOT:   vector
+
+// -----
+
+// The inner reduction loop '%j' performs no operation on its iteration
+// argument, so it is not recognized as a standard reduction and is not vectorized.
+
+func @vecdim_none_reduction_rejected(%in: memref<256x512xf32>, %out: memref<256xf32>) {
+ %cst = constant 1.000000e+00 : f32
+ %final_red = affine.for %j = 0 to 512 iter_args(%red_iter = %cst) -> (f32) {
+   affine.yield %red_iter : f32
+ }
+ affine.store %final_red, %out[0] : memref<256xf32>
+ return
+}
+
+// CHECK-LABEL: @vecdim_none_reduction_rejected
+// CHECK-NOT:   vector
+
+// -----
+
+// The number of iterations is not divisible by the vector size, so a mask has
+// to be applied to the last update of the accumulator.
+
+func @vecdim_reduction_masked(%in: memref<256x512xf32>, %out: memref<256xf32>) {
+ %cst = constant 0.000000e+00 : f32
+ affine.for %i = 0 to 256 {
+   %final_red = affine.for %j = 0 to 500 iter_args(%red_iter = %cst) -> (f32) {
+     %ld = affine.load %in[%i, %j] : memref<256x512xf32>
+     %add = addf %red_iter, %ld : f32
+     affine.yield %add : f32
+   }
+   affine.store %final_red, %out[%i] : memref<256xf32>
+ }
+ return
+}
+
+// CHECK:       #[[$map0:.*]] = affine_map<([[d0:.*]]) -> (-[[d0]] + 500)>
+// CHECK-LABEL: @vecdim_reduction_masked
+// CHECK:       affine.for %{{.*}} = 0 to 256 {
+// CHECK:         %[[vzero:.*]] = constant dense<0.000000e+00> : vector<128xf32>
+// CHECK:         %[[vred:.*]] = affine.for %[[iv:.*]] = 0 to 500 step 128 iter_args(%[[red_iter:.*]] = %[[vzero]]) -> (vector<128xf32>) {
+// CHECK:           %[[elems_left:.*]] = affine.apply #[[$map0]](%[[iv]])
+// CHECK:           %[[mask:.*]] = vector.create_mask %[[elems_left]] : vector<128xi1>
+// CHECK:           %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xf32>, vector<128xf32>
+// CHECK:           %[[add:.*]] = addf %[[red_iter]], %[[ld]] : vector<128xf32>
+// CHECK:           %[[new_acc:.*]] = select %[[mask]], %[[add]], %[[red_iter]] : vector<128xi1>, vector<128xf32>
+// CHECK:           affine.yield %[[new_acc]] : vector<128xf32>
+// CHECK:         }
+// CHECK:         %[[final_sum:.*]] = vector.reduction "add", %[[vred:.*]] : vector<128xf32> into f32
+// CHECK:         affine.store %[[final_sum]], %{{.*}} : memref<256xf32>
+// CHECK:       }
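+
+// Note: with a trip count of 500 and vector width 128 the loop runs at
+// %j = 0, 128, 256 and 384; only the last iteration is partial, with
+// 500 - 384 = 116 valid elements. The vector load still reads a full 128
+// elements (the row is 512 wide, so it stays in bounds), and the `select`
+// keeps the 12 out-of-range lanes at their previous accumulator values.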
+
+// -----
+
+// The number of iterations is not known, so a mask has to be applied.
+
+func @vecdim_reduction_masked_unknown_ub(%in: memref<256x512xf32>, %out: memref<256xf32>, %bnd: index) {
+ %cst = constant 0.000000e+00 : f32
+ affine.for %i = 0 to 256 {
+   %final_red = affine.for %j = 0 to %bnd iter_args(%red_iter = %cst) -> (f32) {
+     %ld = affine.load %in[%i, %j] : memref<256x512xf32>
+     %add = addf %red_iter, %ld : f32
+     affine.yield %add : f32
+   }
+   affine.store %final_red, %out[%i] : memref<256xf32>
+ }
+ return
+}
+
+// CHECK:       #[[$map1:.*]] = affine_map<([[d0:.*]]){{\[}}[[s0:.*]]{{\]}} -> (-[[d0]] + [[s0]])>
+// CHECK-LABEL: @vecdim_reduction_masked_unknown_ub
+// CHECK:       affine.for %{{.*}} = 0 to 256 {
+// CHECK:         %[[vzero:.*]] = constant dense<0.000000e+00> : vector<128xf32>
+// CHECK:         %[[vred:.*]] = affine.for %[[iv:.*]] = 0 to %[[bnd:.*]] step 128 iter_args(%[[red_iter:.*]] = %[[vzero]]) -> (vector<128xf32>) {
+// CHECK:           %[[elems_left:.*]] = affine.apply #[[$map1]](%[[iv]])[%[[bnd]]]
+// CHECK:           %[[mask:.*]] = vector.create_mask %[[elems_left]] : vector<128xi1>
+// CHECK:           %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xf32>, vector<128xf32>
+// CHECK:           %[[add:.*]] = addf %[[red_iter]], %[[ld]] : vector<128xf32>
+// CHECK:           %[[new_acc:.*]] = select %[[mask]], %[[add]], %[[red_iter]] : vector<128xi1>, vector<128xf32>
+// CHECK:           affine.yield %[[new_acc]] : vector<128xf32>
+// CHECK:         }
+// CHECK:         %[[final_sum:.*]] = vector.reduction "add", %[[vred:.*]] : vector<128xf32> into f32
+// CHECK:         affine.store %[[final_sum]], %{{.*}} : memref<256xf32>
+// CHECK:       }
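+
+// Note: with a symbolic bound the remaining element count is %bnd - %j
+// (#map1, with the bound passed as a symbol); the final partial vector is
+// handled by the same create_mask/select pattern as in the constant-bound
+// case above.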
+
+// -----
+
+// The lower bound is nonzero, but the number of iterations is divisible by the
+// vector size, so masking is not needed.
+
+func @vecdim_reduction_nonzero_lb(%in: memref<256x512xf32>, %out: memref<256xf32>) {
+ %cst = constant 0.000000e+00 : f32
+ affine.for %i = 0 to 256 {
+   %final_red = affine.for %j = 127 to 511 iter_args(%red_iter = %cst) -> (f32) {
+     %ld = affine.load %in[%i, %j] : memref<256x512xf32>
+     %add = addf %red_iter, %ld : f32
+     affine.yield %add : f32
+   }
+   affine.store %final_red, %out[%i] : memref<256xf32>
+ }
+ return
+}
+
+// CHECK-LABEL: @vecdim_reduction_nonzero_lb
+// CHECK:         %{{.*}} = affine.for %{{.*}} = 127 to 511 step 128 iter_args({{.*}}) -> (vector<128xf32>) {
+// CHECK-NOT:     vector.create_mask
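+
+// Note: the trip count is 511 - 127 = 384 = 3 * 128, an exact multiple of the
+// vector width, so every iteration processes a full vector and no mask is
+// needed.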
+
+// -----
+
+// The lower bound is unknown, so we need to create a mask.
+
+func @vecdim_reduction_masked_unknown_lb(%in: memref<256x512xf32>, %out: memref<256xf32>, %lb: index) {
+ %cst = constant 0.000000e+00 : f32
+ affine.for %i = 0 to 256 {
+   %final_red = affine.for %j = %lb to 512 iter_args(%red_iter = %cst) -> (f32) {
+     %ld = affine.load %in[%i, %j] : memref<256x512xf32>
+     %add = addf %red_iter, %ld : f32
+     affine.yield %add : f32
+   }
+   affine.store %final_red, %out[%i] : memref<256xf32>
+ }
+ return
+}
+
+// CHECK:       #[[$map2:.*]] = affine_map<([[d0:.*]]) -> (-[[d0]] + 512)>
+// CHECK-LABEL: @vecdim_reduction_masked_unknown_lb
+// CHECK:         %{{.*}} = affine.for %[[iv:.*]] = %[[lb:.*]] to 512 step 128 iter_args(%[[red_iter:.*]] = {{.*}}) -> (vector<128xf32>) {
+// CHECK:           %[[elems_left:.*]] = affine.apply #[[$map2]](%[[iv]])
+// CHECK:           %[[mask:.*]] = vector.create_mask %[[elems_left]] : vector<128xi1>
+// CHECK:           %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xf32>, vector<128xf32>
+// CHECK:           %[[add:.*]] = addf %[[red_iter]], %[[ld]] : vector<128xf32>
+// CHECK:           %[[new_acc:.*]] = select %[[mask]], %[[add]], %[[red_iter]] : vector<128xi1>, vector<128xf32>
+// CHECK:           affine.yield %[[new_acc]] : vector<128xf32>
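+
+// Note: even though the upper bound is the constant 512, the trip count
+// 512 - %lb is unknown, so the remaining element count 512 - %j (#map2) is
+// recomputed and used to mask the accumulator update in every iteration.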
+
+// -----
+
+// The upper bound is a minimum expression.
+
+func @vecdim_reduction_complex_ub(%in: memref<256x512xf32>, %out: memref<256xf32>, %M: index, %N: index) {
+ %cst = constant 0.000000e+00 : f32
+ affine.for %i = 0 to 256 {
+   %final_red = affine.for %j = 0 to min affine_map<(d0, d1) -> (d0, d1*2)>(%M, %N) iter_args(%red_iter = %cst) -> (f32) {
+     %ld = affine.load %in[%i, %j] : memref<256x512xf32>
+     %add = addf %red_iter, %ld : f32
+     affine.yield %add : f32
+   }
+   affine.store %final_red, %out[%i] : memref<256xf32>
+ }
+ return
+}
+
+// CHECK:       #[[$map3:.*]] = affine_map<([[d0:.*]], [[d1:.*]]) -> ([[d0]], [[d1]] * 2)>
+// CHECK:       #[[$map3_sub:.*]] = affine_map<([[d0:.*]], [[d1:.*]]) -> ([[d0]] - [[d1]])>
+// CHECK-LABEL: @vecdim_reduction_complex_ub
+// CHECK:         %{{.*}} = affine.for %[[iv:.*]] = 0 to min #[[$map3]](%[[M:.*]], %[[N:.*]]) step 128 iter_args(%[[red_iter:.*]] = {{.*}}) -> (vector<128xf32>) {
+// CHECK:           %[[ub:.*]] = affine.min #[[$map3]](%[[M]], %[[N]])
+// CHECK:           %[[elems_left:.*]] = affine.apply #[[$map3_sub]](%[[ub]], %[[iv]])
+// CHECK:           %[[mask:.*]] = vector.create_mask %[[elems_left]] : vector<128xi1>
+// CHECK:           %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xf32>, vector<128xf32>
+// CHECK:           %[[add:.*]] = addf %[[red_iter]], %[[ld]] : vector<128xf32>
+// CHECK:           %[[new_acc:.*]] = select %[[mask]], %[[add]], %[[red_iter]] : vector<128xi1>, vector<128xf32>
+// CHECK:           affine.yield %[[new_acc]] : vector<128xf32>
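+
+// Note: the affine.min upper bound is re-evaluated inside the vector loop and
+// the remaining element count is its value minus the current induction
+// variable (#map3_sub), which then feeds the mask.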
+
+// -----
+
+// The same mask is applied to both reductions.
+
+func @vecdim_two_reductions_masked(%in: memref<256x512xf32>, %out: memref<512xf32>) {
+ %cst = constant 0.000000e+00 : f32
+ affine.for %i = 0 to 256 {
+   %final_sum, %final_expsum = affine.for %j = 0 to 500 iter_args(%sum_iter = %cst, %expsum_iter = %cst) -> (f32, f32) {
+     %ld = affine.load %in[%i, %j] : memref<256x512xf32>
+     %exp = math.exp %ld : f32
+     %add = addf %sum_iter, %ld : f32
+     %eadd = addf %expsum_iter, %exp : f32
+     affine.yield %add, %eadd : f32, f32
+   }
+   affine.store %final_sum, %out[2*%i] : memref<512xf32>
+   affine.store %final_expsum, %out[2*%i + 1] : memref<512xf32>
+ }
+ return
+}
+
+// CHECK:       #[[$map4:.*]] = affine_map<([[d0:.*]]) -> (-[[d0]] + 500)>
+// CHECK-LABEL: @vecdim_two_reductions_masked
+// CHECK:       affine.for %{{.*}} = 0 to 256 {
+// CHECK:         %{{.*}} = affine.for %[[iv:.*]] = 0 to 500 step 128 iter_args(%[[sum_iter:.*]] = {{.*}}, %[[esum_iter:.*]] = {{.*}}) -> (vector<128xf32>, vector<128xf32>) {
+// CHECK:           %[[elems_left:.*]] = affine.apply #[[$map4]](%[[iv]])
+// CHECK:           %[[mask:.*]] = vector.create_mask %[[elems_left]] : vector<128xi1>
+// CHECK:           %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xf32>, vector<128xf32>
+// CHECK:           %[[exp:.*]] = math.exp %[[ld]] : vector<128xf32>
+// CHECK:           %[[add:.*]] = addf %[[sum_iter]], %[[ld]] : vector<128xf32>
+// CHECK:           %[[eadd:.*]] = addf %[[esum_iter]], %[[exp]] : vector<128xf32>
+// CHECK:           %[[new_acc:.*]] = select %[[mask]], %[[add]], %[[sum_iter]] : vector<128xi1>, vector<128xf32>
+// CHECK:           %[[new_eacc:.*]] = select %[[mask]], %[[eadd]], %[[esum_iter]] : vector<128xi1>, vector<128xf32>
+// CHECK:           affine.yield %[[new_acc]], %[[new_eacc]] : vector<128xf32>
+// CHECK:         }
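+
+// Note: a single mask guards both accumulators; each updated partial result
+// is merged with its own iter_arg through a separate `select` before being
+// yielded.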

diff  --git a/mlir/test/Dialect/Affine/SuperVectorize/vectorize_reduction_2d.mlir b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_reduction_2d.mlir
new file mode 100644
index 0000000000000..e24c254e67626
--- /dev/null
+++ b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_reduction_2d.mlir
@@ -0,0 +1,24 @@
+// RUN: mlir-opt %s -affine-super-vectorize="virtual-vector-size=32,256 test-fastest-varying=1,0 vectorize-reductions=true" -verify-diagnostics
+
+// TODO: Vectorization of reduction loops along the reduction dimension is not
+//       supported for higher-rank vectors yet, so we are just checking that an
+//       error message is produced.
+
+// expected-error @+1 {{Vectorizing reductions is supported only for 1-D vectors}}
+func @vecdim_reduction_2d(%in: memref<256x512x1024xf32>, %out: memref<256xf32>) {
+ %cst = constant 0.000000e+00 : f32
+ affine.for %i = 0 to 256 {
+   %sum_j = affine.for %j = 0 to 512 iter_args(%red_iter_j = %cst) -> (f32) {
+     %sum_k = affine.for %k = 0 to 1024 iter_args(%red_iter_k = %cst) -> (f32) {
+       %ld = affine.load %in[%i, %j, %k] : memref<256x512x1024xf32>
+       %add = addf %red_iter_k, %ld : f32
+       affine.yield %add : f32
+     }
+     %add = addf %red_iter_j, %sum_k : f32
+     affine.yield %add : f32
+   }
+   affine.store %sum_j, %out[%i] : memref<256xf32>
+ }
+ return
+}
+
