[Mlir-commits] [mlir] [mlir][Linalg] Refine how broadcast dims are treated (PR #99015)

Thu Jul 25 13:14:59 PDT 2024

https://github.com/banach-space updated https://github.com/llvm/llvm-project/pull/99015

>From 54676c303f2e3f754c42c10b85c896ba1478f4f0 Mon Sep 17 00:00:00 2001
From: Andrzej Warzynski <andrzej.warzynski at arm.com>
Date: Fri, 12 Jul 2024 16:52:55 +0000
Subject: [PATCH 1/3] [mlir][Linalg] Refine how broadcast dims are treated

This PR fixes how broadcast dims (identified as "zero" results in
permutation maps) corresponding to a reduction iterator are vectorised
in the case of generic Ops. Here's an example:

```mlir
  #map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
  #map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, 0)>

  func.func @generic_with_reduction_and_broadcast(%arg0: tensor<1x12x197x197xf32>) -> (tensor<1x12x197x1xf32>) {
    %0 = tensor.empty() : tensor<1x12x197x1xf32>

    %1 = linalg.generic {indexing_maps = [#map, #map1],
                        iterator_types = ["parallel", "parallel", "parallel", "reduction"]}
      ins(%arg0 : tensor<1x12x197x197xf32>)
      outs(%0 : tensor<1x12x197x1xf32>) {

    ^bb0(%in: f32, %out: f32):
      %818 = arith.addf %in, %out : f32
      linalg.yield %818 : f32
    } -> tensor<1x12x197x1xf32>
    return %1 : tensor<1x12x197x1xf32>
  }
```

This is a perfectly valid Generic Op, but currently triggers two issues
in the vectoriser. The root cause is this map:

```mlir
  #map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, 0)>
```

This map triggers an assert in `reindexIndexingMap` - this hook
incorrectly assumes that every result in the input map is a `dim`
expression and that there are no constants. That's not the case in this
example. `reindexIndexingMap` is extended to allow maps like the one
above. For now, only constant "zero" results are allowed. This can be
extended in the future once a good motivating example is available.

Separately, the permutation map highlighted above "breaks" mask
calculation (at the moment, masks are always computed, even in the
presence of static shapes). When applying the following permutation:
```mlir
  (d0, d1, d2, d3) -> (d0, d1, d2, 0)
```

to these canonical shapes (corresponding to the example above):
```
  (1, 12, 197, 197)
```
we end up with the following error:
```bash
error: vector types must have positive constant sizes but got 1, 12, 197, 0
```

The error makes sense and indicates that we should update the
permutation map above to:
```
  (d0, d1, d2, d3) -> (d0, d1, d2)
```

This would correctly give the following vector type:
```
  vector<1x12x197xi1>
```

Fixes #97247
---
 mlir/include/mlir/IR/AffineMap.h              |  4 ++
 .../Linalg/Transforms/Vectorization.cpp       | 18 +++++++-
 mlir/lib/IR/AffineMap.cpp                     | 23 ++++++++++
 .../Linalg/vectorization-with-patterns.mlir   | 40 +++++++++++++++++
 mlir/test/Dialect/Linalg/vectorization.mlir   | 45 +++++++++++++++++++
 5 files changed, 128 insertions(+), 2 deletions(-)

diff --git a/mlir/include/mlir/IR/AffineMap.h b/mlir/include/mlir/IR/AffineMap.h
index 676da6d1764970..035a9bcdfb3f84 100644
--- a/mlir/include/mlir/IR/AffineMap.h
+++ b/mlir/include/mlir/IR/AffineMap.h
@@ -354,6 +354,10 @@ class AffineMap {
   /// returns the resulting values. `this` must be symbol-less.
   SmallVector<int64_t, 4> compose(ArrayRef<int64_t> values) const;
 
+  size_t numOfZeroResults() const;
+
+  AffineMap dropZeros();
+
   /// Returns true if the AffineMap represents a subset (i.e. a projection) of a
   /// symbol-less permutation map. `allowZeroInResults` allows projected
   /// permutation maps with constant zero result expressions.
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
index 7f7168eb86832d..e44c0ffec2bc93 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
@@ -476,7 +476,7 @@ static AffineMap reindexIndexingMap(AffineMap map) {
   assert(map.isProjectedPermutation(/*allowZeroInResults=*/true) &&
          "expected projected permutation");
   auto res = compressUnusedDims(map);
-  assert(res.getNumDims() == res.getNumResults() &&
+  assert(res.getNumDims() == (res.getNumResults() - res.numOfZeroResults()) &&
          "expected reindexed map with same number of dims and results");
   return res;
 }
@@ -629,7 +629,21 @@ static Value buildVectorWrite(RewriterBase &rewriter, Value value,
         loc, value, outputOperand->get(), ValueRange{});
   }
 
-  write = state.maskOperation(rewriter, write, linalgOp, opOperandMap);
+  // The operand map may contain "zero" results, e.g.:
+  //    (d0, d1, d2, d3) -> (d0, d1, d2, 0)
+  // When applied to canonical vector shapes like these:
+  //    (1, 16, 16, 4)
+  // we would get:
+  //    (1, 16, 16, 0)
+  // Instead, we should extract the following map:
+  //    (d0, d1, d2, d3) -> (d0, d1, d2)
+  // This way, the corresponding vector/mask type will be:
+  //    vector<1x16x16xty>
+  // rather than:
+  //    vector<1x16x16x0xty>
+  auto opOperantMapWithoutZeros = opOperandMap.dropZeros();
+  write =
+      state.maskOperation(rewriter, write, linalgOp, opOperantMapWithoutZeros);
 
   // If masked, set in-bounds to true. Masking guarantees that the access will
   // be in-bounds.
diff --git a/mlir/lib/IR/AffineMap.cpp b/mlir/lib/IR/AffineMap.cpp
index 859fb8ebc10e8c..110a8d603f917e 100644
--- a/mlir/lib/IR/AffineMap.cpp
+++ b/mlir/lib/IR/AffineMap.cpp
@@ -553,6 +553,18 @@ AffineMap AffineMap::dropResults(const llvm::SmallBitVector &positions) const {
   return AffineMap::get(getNumDims(), getNumSymbols(), exprs, getContext());
 }
 
+AffineMap AffineMap::dropZeros() {
+  auto exprs = llvm::to_vector<4>(getResults());
+  SmallVector<AffineExpr, 8> newExprs;
+
+  for (auto expr : getResults()) {
+    auto constExpr = dyn_cast<AffineConstantExpr>(expr);
+    if (!constExpr)
+      newExprs.push_back(expr);
+  }
+  return AffineMap::get(getNumDims(), getNumSymbols(), newExprs, getContext());
+}
+
 AffineMap AffineMap::compose(AffineMap map) const {
   assert(getNumDims() == map.getNumResults() && "Number of results mismatch");
   // Prepare `map` by concatenating the symbols and rewriting its exprs.
@@ -592,6 +604,17 @@ SmallVector<int64_t, 4> AffineMap::compose(ArrayRef<int64_t> values) const {
   return res;
 }
 
+size_t AffineMap::numOfZeroResults() const {
+  size_t res = 0;
+  for (auto expr : getResults()) {
+    auto constExpr = dyn_cast<AffineConstantExpr>(expr);
+    if (constExpr && constExpr.getValue() == 0)
+      res++;
+  }
+
+  return res;
+}
+
 bool AffineMap::isProjectedPermutation(bool allowZeroInResults) const {
   if (getNumSymbols() > 0)
     return false;
diff --git a/mlir/test/Dialect/Linalg/vectorization-with-patterns.mlir b/mlir/test/Dialect/Linalg/vectorization-with-patterns.mlir
index d7ff1ded9d9332..bf015ef409b81c 100644
--- a/mlir/test/Dialect/Linalg/vectorization-with-patterns.mlir
+++ b/mlir/test/Dialect/Linalg/vectorization-with-patterns.mlir
@@ -1899,3 +1899,43 @@ module attributes {transform.with_named_sequence} {
 //       CHECK:     %[[VAL_8:.*]] = vector.transpose %[[VAL_7]], [1, 0] : vector<1x4xf32> to vector<4x1xf32>
 //       CHECK:     vector.transfer_write %[[VAL_8]], %{{.*}} {in_bounds = [true, true]} : vector<4x1xf32>, tensor<4x1xf32>
 //       CHECK:     vector.transfer_write %[[VAL_7]], %{{.*}} {in_bounds = [true, true]} : vector<1x4xf32>, tensor<1x4xf32>
+
+// -----
+
+// Extracted from: https://github.com/llvm/llvm-project/issues/97247
+
+#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
+#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, 0)>
+
+func.func @generic_with_reduction_and_broadcast(%arg0: tensor<1x12x197x197xf32>) -> (tensor<1x12x197x1xf32>) {
+  %0 = tensor.empty() : tensor<1x12x197x1xf32>
+  %1 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%arg0 : tensor<1x12x197x197xf32>) outs(%0 : tensor<1x12x197x1xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %818 = arith.addf %in, %out : f32
+    linalg.yield %818 : f32
+  } -> tensor<1x12x197x1xf32>
+  return %1 : tensor<1x12x197x1xf32>
+}
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["linalg.generic"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+    %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
+    %2 = transform.structured.vectorize_children_and_apply_patterns %1 : (!transform.any_op) -> !transform.any_op
+    transform.yield
+  }
+}
+
+// CHECK: #[[$ATTR_32:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
+
+// CHECK-LABEL:   func.func @generic_with_reduction_and_broadcast(
+// CHECK-SAME:                                                    %[[VAL_0:.*]]: tensor<1x12x197x197xf32>) -> tensor<1x12x197x1xf32> {
+// CHECK:           %[[VAL_1:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK:           %[[VAL_2:.*]] = arith.constant 0 : index
+// CHECK:           %[[VAL_3:.*]] = tensor.empty() : tensor<1x12x197x1xf32>
+// CHECK:           %[[VAL_4:.*]] = vector.transfer_read %[[VAL_0]]{{\[}}%[[VAL_2]], %[[VAL_2]], %[[VAL_2]], %[[VAL_2]]], %[[VAL_1]] {in_bounds = [true, true, true, true]} : tensor<1x12x197x197xf32>, vector<1x12x197x197xf32>
+// CHECK:           %[[VAL_5:.*]] = vector.transfer_read %[[VAL_3]]{{\[}}%[[VAL_2]], %[[VAL_2]], %[[VAL_2]], %[[VAL_2]]], %[[VAL_1]] {in_bounds = [true, true, true], permutation_map = #[[$ATTR_32]]} : tensor<1x12x197x1xf32>, vector<1x12x197xf32>
+// CHECK:           %[[VAL_6:.*]] = vector.multi_reduction <add>, %[[VAL_4]], %[[VAL_5]] [3] : vector<1x12x197x197xf32> to vector<1x12x197xf32>
+// CHECK:           %[[VAL_7:.*]] = vector.broadcast %[[VAL_6]] : vector<1x12x197xf32> to vector<1x1x12x197xf32>
+// CHECK:           %[[VAL_8:.*]] = vector.transpose %[[VAL_7]], [1, 2, 3, 0] : vector<1x1x12x197xf32> to vector<1x12x197x1xf32>
+// CHECK:           %[[VAL_9:.*]] = vector.transfer_write %[[VAL_8]], %[[VAL_3]]{{\[}}%[[VAL_2]], %[[VAL_2]], %[[VAL_2]], %[[VAL_2]]] {in_bounds = [true, true, true, true]} : vector<1x12x197x1xf32>, tensor<1x12x197x1xf32>
+// CHECK:           return %[[VAL_9]] : tensor<1x12x197x1xf32>
diff --git a/mlir/test/Dialect/Linalg/vectorization.mlir b/mlir/test/Dialect/Linalg/vectorization.mlir
index 783149971f0d60..0e2b2458d29cdb 100644
--- a/mlir/test/Dialect/Linalg/vectorization.mlir
+++ b/mlir/test/Dialect/Linalg/vectorization.mlir
@@ -147,6 +147,51 @@ module attributes {transform.with_named_sequence} {
 
 // -----
 
+#map = affine_map<(d0, d1) -> (d0, d1)>
+#map1 = affine_map<(d0, d1) -> (d0, 0)>
+
+func.func @dynamic_generic_with_reduction_and_broadcast(%arg0: tensor<?x?xf32>, %init: tensor<?x?xf32>) -> (tensor<?x?xf32>) {
+  %0 = linalg.generic { indexing_maps = [#map, #map1],
+                        iterator_types = ["parallel", "reduction"]}
+    ins(%arg0 : tensor<?x?xf32>)
+    outs(%init : tensor<?x?xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %1 = arith.addf %in, %out : f32
+    linalg.yield %1 : f32
+  } -> tensor<?x?xf32>
+  return %0 : tensor<?x?xf32>
+}
+// CHECK: #[[$MAP:.+]] = affine_map<(d0, d1) -> (d0)>
+
+// CHECK-LABEL:   func.func @dynamic_generic_with_reduction_and_broadcast(
+// CHECK-SAME:      %[[VAL_0:.*]]: tensor<?x?xf32>,
+// CHECK-SAME:      %[[VAL_1:.*]]: tensor<?x?xf32>) -> tensor<?x?xf32> {
+// CHECK:           %[[VAL_2:.*]] = arith.constant 0 : index
+// CHECK:           %[[VAL_3:.*]] = tensor.dim %[[VAL_0]], %[[VAL_2]] : tensor<?x?xf32>
+// CHECK:           %[[VAL_4:.*]] = arith.constant 1 : index
+// CHECK:           %[[VAL_5:.*]] = tensor.dim %[[VAL_0]], %[[VAL_4]] : tensor<?x?xf32>
+// CHECK:           %[[VAL_6:.*]] = arith.constant 0 : index
+// CHECK:           %[[VAL_7:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK:           %[[VAL_8:.*]] = vector.create_mask %[[VAL_3]], %[[VAL_5]] : vector<4x4xi1>
+// CHECK:           %[[VAL_9:.*]] = vector.mask %[[VAL_8]] { vector.transfer_read %[[VAL_0]]{{\[}}%[[VAL_6]], %[[VAL_6]]], %[[VAL_7]] {in_bounds = [true, true]} : tensor<?x?xf32>, vector<4x4xf32> } : vector<4x4xi1> -> vector<4x4xf32>
+// CHECK:           %[[VAL_10:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK:           %[[VAL_11:.*]] = vector.create_mask %[[VAL_3]] : vector<4xi1>
+// CHECK:           %[[VAL_12:.*]] = vector.mask %[[VAL_11]] { vector.transfer_read %[[VAL_1]]{{\[}}%[[VAL_6]], %[[VAL_6]]], %[[VAL_10]] {in_bounds = [true], permutation_map = #[[$MAP]]} : tensor<?x?xf32>, vector<4xf32> } : vector<4xi1> -> vector<4xf32>
+// CHECK:           %[[VAL_13:.*]] = vector.mask %[[VAL_8]] { vector.multi_reduction <add>, %[[VAL_9]], %[[VAL_12]] [1] : vector<4x4xf32> to vector<4xf32> } : vector<4x4xi1> -> vector<4xf32>
+// CHECK:           %[[VAL_14:.*]] = arith.constant 0 : index
+// CHECK:           %[[VAL_15:.*]] = vector.mask %[[VAL_11]] { vector.transfer_write %[[VAL_13]], %[[VAL_1]]{{\[}}%[[VAL_14]], %[[VAL_14]]] {in_bounds = [true], permutation_map = #[[$MAP]]} : vector<4xf32>, tensor<?x?xf32> } : vector<4xi1> -> tensor<?x?xf32>
+// CHECK:           return %[[VAL_15]] : tensor<?x?xf32>
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    transform.structured.vectorize %0 vector_sizes [4, 4] : !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
 func.func @vectorize_dynamic_2d_transpose(%arg0: tensor<?x?xf32>,
                                           %arg1: tensor<?x?xf32>,
                                           %arg2: tensor<?x?xf32>) -> tensor<?x?xf32> {

>From 9c5c154d8d2ee3433ea7d8fc109804db6a36ac18 Mon Sep 17 00:00:00 2001
From: Andrzej Warzynski <andrzej.warzynski at arm.com>
Date: Mon, 22 Jul 2024 07:41:50 +0000
Subject: [PATCH 2/3] fixup! [mlir][Linalg] Refine how broadcast dims are
 treated

Addressing PR comments
---
 mlir/include/mlir/IR/AffineMap.h              | 18 +++++++++++--
 .../Linalg/Transforms/Vectorization.cpp       |  5 ++--
 mlir/lib/IR/AffineMap.cpp                     | 26 +++++++++----------
 3 files changed, 32 insertions(+), 17 deletions(-)

diff --git a/mlir/include/mlir/IR/AffineMap.h b/mlir/include/mlir/IR/AffineMap.h
index 035a9bcdfb3f84..e30950bbf292d6 100644
--- a/mlir/include/mlir/IR/AffineMap.h
+++ b/mlir/include/mlir/IR/AffineMap.h
@@ -354,9 +354,23 @@ class AffineMap {
   /// returns the resulting values. `this` must be symbol-less.
   SmallVector<int64_t, 4> compose(ArrayRef<int64_t> values) const;
 
-  size_t numOfZeroResults() const;
+  /// Returns the number of "zero" results (constant values == 0) in this map.
+  ///
+  /// Example:
+  ///   * For `(d0, d1) -> (d0, d1, 0)` returns 1
+  ///   * For `(d0, d1, d2) -> (d0, d1)` returns 0
+  ///   * For `(d0, d1, d2) -> (d0, 0, d1, 0, d2)` returns 2
+  size_t getNumOfZeroResults() const;
 
-  AffineMap dropZeros();
+  /// Returns the AffineMap resulting from removing "zero" results (constant
+  /// values == 0) from this map.
+  ///
+  /// Example:
+  ///   * For `(d0, d1) -> (d0, d1, 0)` returns `(d0, d1) -> (d0, d1)`
+  ///   * For `(d0, d1, d2) -> (d0, d1)` returns `(d0, d1, d2) -> (d0, d1)`
+  ///   * For `(d0, d1, d2) -> (d0, 0, d1, 0, d2)` returns
+  ///     `(d0, d1, d2) -> (d0, d1, d2)`
+  AffineMap dropZeroResults();
 
   /// Returns true if the AffineMap represents a subset (i.e. a projection) of a
   /// symbol-less permutation map. `allowZeroInResults` allows projected
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
index e44c0ffec2bc93..655623344a5f8d 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
@@ -476,7 +476,8 @@ static AffineMap reindexIndexingMap(AffineMap map) {
   assert(map.isProjectedPermutation(/*allowZeroInResults=*/true) &&
          "expected projected permutation");
   auto res = compressUnusedDims(map);
-  assert(res.getNumDims() == (res.getNumResults() - res.numOfZeroResults()) &&
+  assert(res.getNumDims() ==
+             (res.getNumResults() - res.getNumOfZeroResults()) &&
          "expected reindexed map with same number of dims and results");
   return res;
 }
@@ -641,7 +642,7 @@ static Value buildVectorWrite(RewriterBase &rewriter, Value value,
   //    vector<1x16x16xty>
   // rather than:
   //    vector<1x16x16x0xty>
-  auto opOperantMapWithoutZeros = opOperandMap.dropZeros();
+  AffineMap opOperantMapWithoutZeros = opOperandMap.dropZeroResults();
   write =
       state.maskOperation(rewriter, write, linalgOp, opOperantMapWithoutZeros);
 
diff --git a/mlir/lib/IR/AffineMap.cpp b/mlir/lib/IR/AffineMap.cpp
index 110a8d603f917e..59f6e723dbd974 100644
--- a/mlir/lib/IR/AffineMap.cpp
+++ b/mlir/lib/IR/AffineMap.cpp
@@ -553,18 +553,6 @@ AffineMap AffineMap::dropResults(const llvm::SmallBitVector &positions) const {
   return AffineMap::get(getNumDims(), getNumSymbols(), exprs, getContext());
 }
 
-AffineMap AffineMap::dropZeros() {
-  auto exprs = llvm::to_vector<4>(getResults());
-  SmallVector<AffineExpr, 8> newExprs;
-
-  for (auto expr : getResults()) {
-    auto constExpr = dyn_cast<AffineConstantExpr>(expr);
-    if (!constExpr)
-      newExprs.push_back(expr);
-  }
-  return AffineMap::get(getNumDims(), getNumSymbols(), newExprs, getContext());
-}
-
 AffineMap AffineMap::compose(AffineMap map) const {
   assert(getNumDims() == map.getNumResults() && "Number of results mismatch");
   // Prepare `map` by concatenating the symbols and rewriting its exprs.
@@ -604,7 +592,7 @@ SmallVector<int64_t, 4> AffineMap::compose(ArrayRef<int64_t> values) const {
   return res;
 }
 
-size_t AffineMap::numOfZeroResults() const {
+size_t AffineMap::getNumOfZeroResults() const {
   size_t res = 0;
   for (auto expr : getResults()) {
     auto constExpr = dyn_cast<AffineConstantExpr>(expr);
@@ -615,6 +603,18 @@ size_t AffineMap::numOfZeroResults() const {
   return res;
 }
 
+AffineMap AffineMap::dropZeroResults() {
+  auto exprs = llvm::to_vector(getResults());
+  SmallVector<AffineExpr> newExprs;
+
+  for (auto expr : getResults()) {
+    auto constExpr = dyn_cast<AffineConstantExpr>(expr);
+    if (!constExpr || constExpr.getValue() != 0)
+      newExprs.push_back(expr);
+  }
+  return AffineMap::get(getNumDims(), getNumSymbols(), newExprs, getContext());
+}
+
 bool AffineMap::isProjectedPermutation(bool allowZeroInResults) const {
   if (getNumSymbols() > 0)
     return false;

>From 8dbb4ce7d3a8e40df0cb52f4e069e7df26bbbe78 Mon Sep 17 00:00:00 2001
From: Andrzej Warzynski <andrzej.warzynski at arm.com>
Date: Thu, 25 Jul 2024 20:13:06 +0000
Subject: [PATCH 3/3] fixup! [mlir][Linalg] Refine how broadcast dims are
 treated

* Move the logic to remove zero results from indexing maps to `maskOperation`
* Rename the map parameter of `maskOperation` from `maybeMaskingMap` to
  `maybeIndexingMap` - the actual input is always an indexing map
  extracted from the corresponding linalg Op
* Remove the duplicated comment for `maskOperation`
---
 .../Linalg/Transforms/Vectorization.cpp       | 53 ++++++++-----------
 1 file changed, 21 insertions(+), 32 deletions(-)

diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
index 655623344a5f8d..7a1bd689a54ae8 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
@@ -224,10 +224,10 @@ struct VectorizationState {
   /// Masks an operation with the canonical vector mask if the operation needs
   /// masking. Returns the masked operation or the original operation if masking
   /// is not needed. If provided, the canonical mask for this operation is
-  /// permuted using `maybeMaskingMap`.
+  /// permuted using `maybeIndexingMap`.
   Operation *
   maskOperation(RewriterBase &rewriter, Operation *opToMask, LinalgOp linalgOp,
-                std::optional<AffineMap> maybeMaskingMap = std::nullopt);
+                std::optional<AffineMap> maybeIndexingMap = std::nullopt);
 
 private:
   /// Initializes the iteration space static sizes using the Linalg op
@@ -422,16 +422,28 @@ Value VectorizationState::getOrCreateMaskFor(
   return mask;
 }
 
-/// Masks an operation with the canonical vector mask if the operation needs
-/// masking. Returns the masked operation or the original operation if masking
-/// is not needed. If provided, the canonical mask for this operation is
-/// permuted using `maybeMaskingMap`.
 Operation *
 VectorizationState::maskOperation(RewriterBase &rewriter, Operation *opToMask,
                                   LinalgOp linalgOp,
-                                  std::optional<AffineMap> maybeMaskingMap) {
+                                  std::optional<AffineMap> maybeIndexingMap) {
   LDBG("Trying to mask: " << *opToMask << "\n");
 
+  std::optional<AffineMap> maybeMaskingMap = std::nullopt;
+  // The Operand indexing map may contain "zero" results, e.g.:
+  //    (d0, d1, d2, d3) -> (d0, d1, d2, 0)
+  // When applied to canonical vector shapes like these:
+  //    (1, 16, 16, 4)
+  // we would get:
+  //    (1, 16, 16, 0)
+  // Instead, we should extract the following permutation map for masking:
+  //    (d0, d1, d2, d3) -> (d0, d1, d2)
+  // This way, the corresponding vector/mask type will be:
+  //    vector<1x16x16xty>
+  // rather than:
+  //    vector<1x16x16x0xty>
+  if (maybeIndexingMap)
+    maybeMaskingMap = maybeIndexingMap->dropZeroResults();
+
   // Create or retrieve mask for this operation.
   Value mask =
       getOrCreateMaskFor(rewriter, opToMask, linalgOp, maybeMaskingMap);
@@ -630,21 +642,8 @@ static Value buildVectorWrite(RewriterBase &rewriter, Value value,
         loc, value, outputOperand->get(), ValueRange{});
   }
 
-  // The operand map may contain "zero" results, e.g.:
-  //    (d0, d1, d2, d3) -> (d0, d1, d2, 0)
-  // When applied to canonical vector shapes like these:
-  //    (1, 16, 16, 4)
-  // we would get:
-  //    (1, 16, 16, 0)
-  // Instead, we should extract the following map:
-  //    (d0, d1, d2, d3) -> (d0, d1, d2)
-  // This way, the corresponding vector/mask type will be:
-  //    vector<1x16x16xty>
-  // rather than:
-  //    vector<1x16x16x0xty>
-  AffineMap opOperantMapWithoutZeros = opOperandMap.dropZeroResults();
   write =
-      state.maskOperation(rewriter, write, linalgOp, opOperantMapWithoutZeros);
+      state.maskOperation(rewriter, write, linalgOp, opOperandMap);
 
   // If masked, set in-bounds to true. Masking guarantees that the access will
   // be in-bounds.
@@ -1330,16 +1329,6 @@ vectorizeAsLinalgGeneric(RewriterBase &rewriter, VectorizationState &state,
     // permutation map and masking map.
     AffineMap indexingMap = linalgOp.getMatchingIndexingMap(opOperand);
 
-    // Remove zeros from indexing map to use it as masking map.
-    SmallVector<int64_t> zeroPos;
-    auto results = indexingMap.getResults();
-    for (const auto &result : llvm::enumerate(results)) {
-      if (isa<AffineConstantExpr>(result.value())) {
-        zeroPos.push_back(result.index());
-      }
-    }
-    AffineMap maskingMap = indexingMap.dropResults(zeroPos);
-
     AffineMap readMap;
     VectorType readType;
     Type elemType = getElementTypeOrSelf(opOperand->get());
@@ -1369,7 +1358,7 @@ vectorizeAsLinalgGeneric(RewriterBase &rewriter, VectorizationState &state,
     Operation *read = rewriter.create<vector::TransferReadOp>(
         loc, readType, opOperand->get(), indices, readMap,
         ArrayRef<bool>(inBounds));
-    read = state.maskOperation(rewriter, read, linalgOp, maskingMap);
+    read = state.maskOperation(rewriter, read, linalgOp, indexingMap);
     Value readValue = read->getResult(0);
 
     // 3.b. If masked, set in-bounds to true. Masking guarantees that the access