[Mlir-commits] [mlir] [mlir][Vector] Fix mask lowering in_bounds and all-true mask elimination (PR #189477)

Mon Mar 30 13:53:06 PDT 2026

https://github.com/bjacob updated https://github.com/llvm/llvm-project/pull/189477

>From 578f8ea72e3a42e994ea77699b52a9c2b7a48416 Mon Sep 17 00:00:00 2001
From: Benoit Jacob <benoit.jacob at amd.com>
Date: Mon, 30 Mar 2026 17:12:24 +0000
Subject: [PATCH] [mlir][Vector] Fix mask lowering in_bounds and all-true mask
 elimination

When lowering vector.mask around transfer read/write, set in_bounds to
false on the new transfer instead of preserving the region body's attribute.
After the mask becomes a transfer operand, keeping in_bounds=true was
misleading for inactive lanes (padding / OOB semantics) and could combine
unsoundly with foldTransferFullMask + in_bounds.

When folding create_mask to all-true for unknown dimensions, require the
lower and upper scalable bounds to agree before treating the dimension as
constant-sized. Using only a lower bound is unsound when the extent can
vary at runtime (e.g. a dynamic slice that is full on most iterations but
partial on the last).

Signed-off-by: Benoit Jacob <benoit.jacob at amd.com>
---
 .../Vector/Transforms/LowerVectorMask.cpp     | 20 +++++++++--
 .../Transforms/VectorMaskElimination.cpp      | 36 +++++++++++++++----
 ...compose-masked-vectorize-and-cleanups.mlir |  8 ++---
 mlir/test/Dialect/Vector/eliminate-masks.mlir | 20 +++++++++++
 .../Dialect/Vector/lower-vector-mask.mlir     | 30 +++++++++++++++-
 .../vector-mask-lowering-transforms.mlir      |  2 +-
 6 files changed, 101 insertions(+), 15 deletions(-)

diff --git a/mlir/lib/Dialect/Vector/Transforms/LowerVectorMask.cpp b/mlir/lib/Dialect/Vector/Transforms/LowerVectorMask.cpp
index 7730c4e7c950a..818d95000ae42 100644
--- a/mlir/lib/Dialect/Vector/Transforms/LowerVectorMask.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/LowerVectorMask.cpp
@@ -221,11 +221,22 @@ struct MaskedTransferReadOpPattern
       return rewriter.notifyMatchFailure(
           maskingOp, "Can't lower passthru to vector.transfer_read");
 
+    // The nested transfer often has in_bounds=true because vectorization set
+    // it when the read lived under vector.mask. After peeling the mask onto
+    // the transfer_read's mask operand, keeping in_bounds=true is misleading:
+    // masked-out lanes rely on padding for out-of-bounds semantics, and
+    // foldTransferFullMask + in_bounds can collapse to an unmasked read that
+    // assumes a full in-bounds super-vector load. Use out-of-bounds defaults
+    // here; canonicalization can still promote dims later when provably safe
+    // after the mask is folded away.
+    auto inBoundsAttr = rewriter.getBoolArrayAttr(
+        SmallVector<bool>(readOp.getVectorType().getRank(), false));
+
     // Replace the `vector.mask` operation.
     rewriter.replaceOpWithNewOp<TransferReadOp>(
         maskingOp.getOperation(), readOp.getVectorType(), readOp.getBase(),
         readOp.getIndices(), readOp.getPermutationMap(), readOp.getPadding(),
-        maskingOp.getMask(), readOp.getInBounds());
+        maskingOp.getMask(), inBoundsAttr);
     return success();
   }
 };
@@ -243,11 +254,16 @@ struct MaskedTransferWriteOpPattern
     Type resultType =
         writeOp.getResult() ? writeOp.getResult().getType() : Type();
 
+    // See MaskedTransferReadOpPattern: do not preserve in_bounds from the
+    // region body once the mask becomes a transfer operand.
+    auto inBoundsAttr = rewriter.getBoolArrayAttr(
+        SmallVector<bool>(writeOp.getVectorType().getRank(), false));
+
     // Replace the `vector.mask` operation.
     rewriter.replaceOpWithNewOp<TransferWriteOp>(
         maskingOp.getOperation(), resultType, writeOp.getVector(),
         writeOp.getBase(), writeOp.getIndices(), writeOp.getPermutationMap(),
-        maskingOp.getMask(), writeOp.getInBounds());
+        maskingOp.getMask(), inBoundsAttr);
     return success();
   }
 };
diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorMaskElimination.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorMaskElimination.cpp
index 6f75ce7a04511..4194cdfa77b5f 100644
--- a/mlir/lib/Dialect/Vector/Transforms/VectorMaskElimination.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/VectorMaskElimination.cpp
@@ -52,28 +52,50 @@ LogicalResult resolveAllTrueCreateMaskOp(IRRewriter &rewriter,
   }
 
   for (auto [i, dimSize] : unknownDims) {
-    // Compute the lower bound for the unknown dimension (i.e. the smallest
-    // value it could be).
+    // Compute lower and upper bounds for the unknown dimension. We need both
+    // to agree (same constant or same scalable expression) before treating the
+    // mask as all-true: using only a lower bound is unsound when the value can
+    // vary at runtime (e.g. tensor.dim of a dynamic slice that is full-sized
+    // on most iterations but partial on the last). The lower bound analysis
+    // may then report the full size even though the upper bound analysis (or
+    // the differing tight range) shows the dimension is not a single constant.
     FailureOr<ConstantOrScalableBound> dimLowerBound =
         vector::ScalableValueBoundsConstraintSet::computeScalableBound(
             dimSize, {}, vscaleRange.vscaleMin, vscaleRange.vscaleMax,
             presburger::BoundType::LB);
     if (failed(dimLowerBound))
       return failure();
-    auto dimLowerBoundSize = dimLowerBound->getSize();
+    FailureOr<ConstantOrScalableBound::BoundSize> dimLowerBoundSize =
+        dimLowerBound->getSize();
     if (failed(dimLowerBoundSize))
       return failure();
+
+    FailureOr<ConstantOrScalableBound> dimUpperBound =
+        vector::ScalableValueBoundsConstraintSet::computeScalableBound(
+            dimSize, {}, vscaleRange.vscaleMin, vscaleRange.vscaleMax,
+            presburger::BoundType::UB);
+    if (failed(dimUpperBound))
+      return failure();
+    FailureOr<ConstantOrScalableBound::BoundSize> dimUpperBoundSize =
+        dimUpperBound->getSize();
+    if (failed(dimUpperBoundSize))
+      return failure();
+
+    if (dimLowerBoundSize->scalable != dimUpperBoundSize->scalable ||
+        dimLowerBoundSize->baseSize != dimUpperBoundSize->baseSize)
+      return failure();
+
     if (dimLowerBoundSize->scalable) {
-      // 1. The lower bound, LB, is scalable. If LB is < the mask dim size then
-      // this dim is not all-true.
+      // 1. The bound is scalable. If it is < the mask dim size then this dim
+      // is not all-true.
       if (dimLowerBoundSize->baseSize < maskTypeDimSizes[i])
         return failure();
     } else {
-      // 2. The lower bound, LB, is a constant.
+      // 2. The bound is a constant.
       // - If the mask dim size is scalable then this dim is not all-true.
       if (maskTypeDimScalableFlags[i])
         return failure();
-      // - If LB < the _fixed-size_ mask dim size then this dim is not all-true.
+      // - If the constant < the _fixed-size_ mask dim size then not all-true.
       if (dimLowerBoundSize->baseSize < maskTypeDimSizes[i])
         return failure();
     }
diff --git a/mlir/test/Dialect/Linalg/transform-op-compose-masked-vectorize-and-cleanups.mlir b/mlir/test/Dialect/Linalg/transform-op-compose-masked-vectorize-and-cleanups.mlir
index 61fe3da34e1d5..0669437e9aaa3 100644
--- a/mlir/test/Dialect/Linalg/transform-op-compose-masked-vectorize-and-cleanups.mlir
+++ b/mlir/test/Dialect/Linalg/transform-op-compose-masked-vectorize-and-cleanups.mlir
@@ -4,16 +4,16 @@
 func.func @masked_matmul(%module: memref<?x?xf32>, %arg1: memref<?x?xf32>, %arg2: memref<?x?xf32>) {
 
   //      CHECK: %[[MLHS:.*]] = vector.create_mask {{.*}} : vector<8x8xi1>
-  //      CHECK: %[[LHS:.*]] = vector.transfer_read %{{.*}}, %[[MLHS]] {in_bounds = [true, true]} : memref<?x?xf32, strided<[?, 1], offset: ?>>, vector<8x8xf32>
+  //      CHECK: %[[LHS:.*]] = vector.transfer_read %{{.*}}, %[[MLHS]] : memref<?x?xf32, strided<[?, 1], offset: ?>>, vector<8x8xf32>
   //      CHECK: %[[MRHS:.*]] = vector.create_mask {{.*}} : vector<8x8xi1>
-  //      CHECK: %[[RHS:.*]] = vector.transfer_read %{{.*}}, %[[MRHS]] {in_bounds = [true, true]} : memref<?x?xf32, strided<[?, 1], offset: ?>>, vector<8x8xf32>
+  //      CHECK: %[[RHS:.*]] = vector.transfer_read %{{.*}}, %[[MRHS]] : memref<?x?xf32, strided<[?, 1], offset: ?>>, vector<8x8xf32>
   //      CHECK: %[[MACC:.*]] = vector.create_mask {{.*}} : vector<8x8xi1>
-  //      CHECK: %[[ACC:.*]] = vector.transfer_read {{.*}}, %[[MACC]] {in_bounds = [true, true]} : memref<?x?xf32, strided<[?, 1], offset: ?>>, vector<8x8xf32>
+  //      CHECK: %[[ACC:.*]] = vector.transfer_read {{.*}}, %[[MACC]] : memref<?x?xf32, strided<[?, 1], offset: ?>>, vector<8x8xf32>
   //      CHECK: %[[MRES:.*]] = vector.create_mask {{.*}} : vector<8x8x8xi1>
   //      CHECK: %[[RES:.*]] = vector.mask %[[MRES]] { vector.contract
   // CHECK-SAME:   : vector<8x8xf32>, vector<8x8xf32> into vector<8x8xf32>
   // CHECK-SAME:   : vector<8x8x8xi1> -> vector<8x8xf32>
-  //      CHECK: vector.transfer_write %[[RES]], %{{.*}}, %[[MACC]] {in_bounds = [true, true]} : vector<8x8xf32>, memref<?x?xf32, strided<[?, 1], offset: ?>>
+  //      CHECK: vector.transfer_write %[[RES]], %{{.*}}, %[[MACC]] : vector<8x8xf32>, memref<?x?xf32, strided<[?, 1], offset: ?>>
   linalg.matmul ins(%module, %arg1 : memref<?x?xf32>, memref<?x?xf32>) outs(%arg2 : memref<?x?xf32>)
   return
 }
diff --git a/mlir/test/Dialect/Vector/eliminate-masks.mlir b/mlir/test/Dialect/Vector/eliminate-masks.mlir
index 88be7e529bb9e..fa03e1e90ee7e 100644
--- a/mlir/test/Dialect/Vector/eliminate-masks.mlir
+++ b/mlir/test/Dialect/Vector/eliminate-masks.mlir
@@ -169,3 +169,23 @@ func.func @negative_value_bounds_scalable_dim_not_all_true(%tensor: tensor<2x100
   "test.some_use"(%mask) : (vector<3x[4]xi1>) -> ()
   return
 }
+
+// -----
+
+// tensor.dim of a dynamic extent can match the vector lane count on some paths
+// but not others; LB-only analysis may incorrectly treat create_mask as
+// all-true. Require matching LB and UB before folding to constant_mask.
+//
+// CHECK-LABEL: @negative_dynamic_extent_dim_not_provably_constant
+// CHECK-NOT: vector.constant_mask
+// CHECK: %[[MASK:.*]] = vector.create_mask
+// CHECK: "test.some_use"(%[[MASK]]) : (vector<8x8xi1>) -> ()
+func.func @negative_dynamic_extent_dim_not_provably_constant(%t : tensor<8x?xf32>) {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c8 = arith.constant 8 : index
+  %dim = tensor.dim %t, %c1 : tensor<8x?xf32>
+  %mask = vector.create_mask %c8, %dim : vector<8x8xi1>
+  "test.some_use"(%mask) : (vector<8x8xi1>) -> ()
+  return
+}
diff --git a/mlir/test/Dialect/Vector/lower-vector-mask.mlir b/mlir/test/Dialect/Vector/lower-vector-mask.mlir
index a8a1164e2f762..ddcc961378c3c 100644
--- a/mlir/test/Dialect/Vector/lower-vector-mask.mlir
+++ b/mlir/test/Dialect/Vector/lower-vector-mask.mlir
@@ -75,7 +75,35 @@ func.func @vector_gather(%arg0: tensor<64xf32>, %arg1: tensor<3xf32>) -> tensor<
 // CHECK:           %[[VAL_5:.*]] = arith.constant 3 : index
 // CHECK:           %[[VAL_6:.*]] = vector.create_mask %[[VAL_5]] : vector<4xi1>
 // CHECK:           %[[VAL_7:.*]] = vector.gather %[[VAL_0]][%[[VAL_4]]] [%[[VAL_3]]], %[[VAL_6]], %[[VAL_2]] : tensor<64xf32>, vector<4xindex>, vector<4xi1>, vector<4xf32> into vector<4xf32>
-// CHECK:           %[[VAL_8:.*]] = vector.transfer_write %[[VAL_7]], %[[VAL_1]][%[[VAL_4]]], %[[VAL_6]] {in_bounds = [true]} : vector<4xf32>, tensor<3xf32>
+// CHECK:           %[[VAL_8:.*]] = vector.transfer_write %[[VAL_7]], %[[VAL_1]][%[[VAL_4]]], %[[VAL_6]] : vector<4xf32>, tensor<3xf32>
+
+// -----
+
+// Like linalg vectorization of a reduction tile: `tensor<8x?xf32>` with a
+// `vector.create_mask` over the dynamic minor dim and `vector.mask` around a
+// 2-D `vector.transfer_read` (same shape as the IREE 1600x625 / 8-wide tail repro).
+//
+// CHECK-LABEL: func.func @masked_2d_transfer_read_dynamic_minor_dim
+// CHECK-SAME:      %[[TILE:.*]]: tensor<8x?xf32>) -> vector<8x8xf32>
+// CHECK-NOT:       vector.mask
+// CHECK:           %[[DIM:.*]] = tensor.dim %[[TILE]], %{{.*}} : tensor<8x?xf32>
+// CHECK:           %[[MASK:.*]] = vector.create_mask %{{.*}}, %[[DIM]] : vector<8x8xi1>
+// CHECK:           %{{.*}} = vector.transfer_read %[[TILE]]{{\[}}%{{.*}}, %{{.*}}], %{{.*}}, %[[MASK]]{{.*}} : tensor<8x?xf32>, vector<8x8xf32>
+// CHECK:           return %{{.*}} : vector<8x8xf32>
+// CHECK:         }
+func.func @masked_2d_transfer_read_dynamic_minor_dim(%tile: tensor<8x?xf32>) -> vector<8x8xf32> {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c8 = arith.constant 8 : index
+  %cst = arith.constant 0.0 : f32
+  %dim1 = tensor.dim %tile, %c1 : tensor<8x?xf32>
+  %mask = vector.create_mask %c8, %dim1 : vector<8x8xi1>
+  %v = vector.mask %mask {
+    vector.transfer_read %tile[%c0, %c0], %cst {in_bounds = [true, true]}
+      : tensor<8x?xf32>, vector<8x8xf32>
+  } : vector<8x8xi1> -> vector<8x8xf32>
+  return %v : vector<8x8xf32>
+}
 
 // -----
 
diff --git a/mlir/test/Dialect/Vector/vector-mask-lowering-transforms.mlir b/mlir/test/Dialect/Vector/vector-mask-lowering-transforms.mlir
index b5eb6e63f5a8d..7fd592c0f607b 100644
--- a/mlir/test/Dialect/Vector/vector-mask-lowering-transforms.mlir
+++ b/mlir/test/Dialect/Vector/vector-mask-lowering-transforms.mlir
@@ -108,7 +108,7 @@ func.func @transfer_read_3d(
   %f0 = arith.constant 0.0 : f32
   //      CHECK: %[[mask:.*]] = vector.create_mask
   //  CHECK-NOT: vector.mask
-  //      CHECK: vector.transfer_read {{.*}}, %[[mask]] {in_bounds = [true, true, true]}
+  //      CHECK: vector.transfer_read {{.*}}, %[[mask]]
   // CHECK-SAME:   : tensor<?x?x?xf32>, vector<2x1x7xf32>
   %0 = vector.create_mask %arg0, %arg1, %arg2 : vector<2x1x7xi1>
   %1 = vector.mask %0 {