[Mlir-commits] [mlir] [MLIR][Vector] Fix vector.create_mask i32 overflow for large index values (PR #188782)

Mon Apr 13 08:43:44 PDT 2026

https://github.com/joker-eph updated https://github.com/llvm/llvm-project/pull/188782

>From 7dcb4a5a4a173ced57789b6e69e868658c643f8b Mon Sep 17 00:00:00 2001
From: Mehdi Amini <joker.eph at gmail.com>
Date: Thu, 26 Mar 2026 07:57:44 -0700
Subject: [PATCH] [MLIR][Vector] Fix vector.create_mask i32 overflow for large
 index values
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When lowering `vector.create_mask` for 1-D fixed-size vectors with
`force-32bit-vector-indices=true`, the mask bound (an `index`-typed value)
was cast directly to `i32`. For index values larger than INT32_MAX, this
truncation keeps only the low 32 bits, yielding an unrelated (possibly
zero or negative) i32 value and therefore a wrong mask. For example, 2^51
truncates to 0, which makes every vector comparison false — every mask
element becomes 0 even though the bound is larger than the vector
dimension.

Fix: in `buildVectorComparison`, cap the bound at `dim` (an upper clamp
only; the lower end is left untouched) using `arith.minsi` in index type
before the `i32` cast. If the bound is >= dim, all elements should be true
regardless; capping at `dim` (which fits in i32 since it is a compile-time
vector dimension size) preserves those semantics without overflow.

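The fix applies to both `vector.create_mask` lowering and
`vector.transfer_read/write` out-of-bounds mask generation, both of which
call `buildVectorComparison`.

The scalable-vector path in `VectorCreateMaskOpConversion` cannot clamp to
`dim`, because the runtime length (vscale * dim) is unknown at compile
time. There the bound is capped at INT32_MAX (again with `arith.minsi` in
index type) before the `i32` cast.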

Fixes #113689

Assisted-by: Claude Code
---
 .../VectorToLLVM/ConvertVectorToLLVM.cpp      | 16 ++++++++++++--
 .../Vector/Transforms/VectorTransforms.cpp    | 11 ++++++++++
 .../VectorToLLVM/vector-mask-to-llvm.mlir     |  8 +++++--
 .../VectorToLLVM/vector-to-llvm.mlir          |  8 +++++--
 .../VectorToLLVM/vector-xfer-to-llvm.mlir     | 22 ++++++++++---------
 .../Dialect/Vector/CPU/create-mask.mlir       | 16 ++++++++++++++
 6 files changed, 65 insertions(+), 16 deletions(-)

diff --git a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
index 49f1815a8d941..43e0824fef6cd 100644
--- a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
+++ b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
@@ -1510,8 +1510,20 @@ class VectorCreateMaskOpConversion
         rewriter, loc,
         LLVM::getVectorType(idxType, dstType.getShape()[0],
                             /*isScalable=*/true));
-    auto bound = getValueOrCreateCastToIndexLike(rewriter, loc, idxType,
-                                                 adaptor.getOperands()[0]);
+    Value maskBound = adaptor.getOperands()[0];
+    // When using 32-bit indices, cap the bound at INT32_MAX in index type
+    // before casting. For scalable vectors the runtime size (vscale * dim) is
+    // unknown at compile time, so we can't clamp to `dim` as in the fixed-size
+    // path. Clamping to INT32_MAX is safe because any realistic scalable vector
+    // size fits well below this limit, so a bound >= vscale*dim still produces
+    // an all-true mask after the comparison.
+    if (force32BitVectorIndices) {
+      Value maxBound =
+          arith::ConstantIndexOp::create(rewriter, loc, (1LL << 31) - 1);
+      maskBound = arith::MinSIOp::create(rewriter, loc, maskBound, maxBound);
+    }
+    auto bound =
+        getValueOrCreateCastToIndexLike(rewriter, loc, idxType, maskBound);
     Value bounds = BroadcastOp::create(rewriter, loc, indices.getType(), bound);
     Value comp = arith::CmpIOp::create(rewriter, loc, arith::CmpIPredicate::slt,
                                        indices, bounds);
diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp
index c694f4f58faa1..8e6341f0f938b 100644
--- a/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp
@@ -1361,6 +1361,17 @@ static Value buildVectorComparison(PatternRewriter &rewriter, Operation *op,
     indices = arith::AddIOp::create(rewriter, loc, ov, indices);
   }
   // Construct the vector comparison.
+  // When using 32-bit indices, cap `b` at `dim` in index type before casting.
+  // This prevents i32 signed overflow for large index values: if `b >= dim`,
+  // all elements are set, so capping to `dim` (which fits in i32 since vector
+  // dimensions are compile-time constants) is semantically equivalent and
+  // avoids truncation artifacts (e.g., 2^51 wrapping to 0 in i32).
+  // Note: when dim == 0 (0-D vector), this cap is skipped. The correct cap
+  // for 0-D would be 1 (single element), not 0, and is not currently handled.
+  if (force32BitVectorIndices && dim > 0) {
+    Value dimCst = arith::ConstantIndexOp::create(rewriter, loc, dim);
+    b = arith::MinSIOp::create(rewriter, loc, b, dimCst);
+  }
   Value bound = getValueOrCreateCastToIndexLike(rewriter, loc, idxType, b);
   Value bounds =
       vector::BroadcastOp::create(rewriter, loc, indices.getType(), bound);
diff --git a/mlir/test/Conversion/VectorToLLVM/vector-mask-to-llvm.mlir b/mlir/test/Conversion/VectorToLLVM/vector-mask-to-llvm.mlir
index 91e5358622b69..a468be92be1f4 100644
--- a/mlir/test/Conversion/VectorToLLVM/vector-mask-to-llvm.mlir
+++ b/mlir/test/Conversion/VectorToLLVM/vector-mask-to-llvm.mlir
@@ -4,7 +4,9 @@
 // CMP32-LABEL: @genbool_var_1d(
 // CMP32-SAME: %[[ARG:.*]]: index)
 // CMP32: %[[T0:.*]] = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]> : vector<11xi32>
-// CMP32: %[[T1:.*]] = arith.index_cast %[[ARG]] : index to i32
+// CMP32: %[[DIM:.*]] = arith.constant 11 : index
+// CMP32: %[[CLAMPED:.*]] = arith.minsi %[[ARG]], %[[DIM]] : index
+// CMP32: %[[T1:.*]] = arith.index_cast %[[CLAMPED]] : index to i32
 // CMP32: %[[T2:.*]] = llvm.insertelement %[[T1]], %{{.*}}[%{{.*}} : i32] : vector<11xi32>
 // CMP32: %[[T3:.*]] = llvm.shufflevector %[[T2]], %{{.*}} [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<11xi32>
 // CMP32: %[[T4:.*]] = arith.cmpi sgt, %[[T3]], %[[T0]] : vector<11xi32>
@@ -27,7 +29,9 @@ func.func @genbool_var_1d(%arg0: index) -> vector<11xi1> {
 // CMP32-LABEL: @genbool_var_1d_scalable(
 // CMP32-SAME: %[[ARG:.*]]: index)
 // CMP32: %[[T0:.*]] = llvm.intr.stepvector : vector<[11]xi32>
-// CMP32: %[[T1:.*]] = arith.index_cast %[[ARG]] : index to i32
+// CMP32: %[[MAX:.*]] = arith.constant 2147483647 : index
+// CMP32: %[[CLAMPED:.*]] = arith.minsi %[[ARG]], %[[MAX]] : index
+// CMP32: %[[T1:.*]] = arith.index_cast %[[CLAMPED]] : index to i32
 // CMP32: %[[T2:.*]] = llvm.insertelement %[[T1]], %{{.*}}[%{{.*}} : i32] : vector<[11]xi32>
 // CMP32: %[[T3:.*]] = llvm.shufflevector %[[T2]], %{{.*}} [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<[11]xi32>
 // CMP32: %[[T4:.*]] = arith.cmpi slt, %[[T0]], %[[T3]] : vector<[11]xi32>
diff --git a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir
index d669a3bac3336..ebfda961c24e2 100644
--- a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir
+++ b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir
@@ -1560,7 +1560,9 @@ func.func @create_mask_1d(%num_elems : index) -> vector<4xi1> {
 // CHECK-LABEL: func @create_mask_1d
 // CHECK-SAME: %[[NUM_ELEMS:.*]]: index
 // CHECK:  %[[INDICES:.*]] = arith.constant dense<[0, 1, 2, 3]> : vector<4xi32>
-// CHECK:  %[[NUM_ELEMS_i32:.*]] = arith.index_cast %[[NUM_ELEMS]] : index to i32
+// CHECK:  %[[DIM:.*]] = arith.constant 4 : index
+// CHECK:  %[[CLAMPED:.*]] = arith.minsi %[[NUM_ELEMS]], %[[DIM]] : index
+// CHECK:  %[[NUM_ELEMS_i32:.*]] = arith.index_cast %[[CLAMPED]] : index to i32
 // CHECK:  %[[BOUNDS_INSERT:.*]] = llvm.insertelement %[[NUM_ELEMS_i32]]
 // CHECK:  %[[BOUNDS:.*]] = llvm.shufflevector %[[BOUNDS_INSERT]]
 // CHECK:  %[[RESULT:.*]] = arith.cmpi sgt, %[[BOUNDS]], %[[INDICES]] : vector<4xi32>
@@ -1576,7 +1578,9 @@ func.func @create_mask_1d_scalable(%num_elems : index) -> vector<[4]xi1> {
 // CHECK-LABEL: func @create_mask_1d_scalable
 // CHECK-SAME: %[[NUM_ELEMS:.*]]: index
 // CHECK:  %[[INDICES:.*]] = llvm.intr.stepvector : vector<[4]xi32>
-// CHECK:  %[[NUM_ELEMS_i32:.*]] = arith.index_cast %[[NUM_ELEMS]] : index to i32
+// CHECK:  %[[MAX:.*]] = arith.constant 2147483647 : index
+// CHECK:  %[[CLAMPED:.*]] = arith.minsi %[[NUM_ELEMS]], %[[MAX]] : index
+// CHECK:  %[[NUM_ELEMS_i32:.*]] = arith.index_cast %[[CLAMPED]] : index to i32
 // CHECK:  %[[BOUNDS_INSERT:.*]] = llvm.insertelement %[[NUM_ELEMS_i32]], {{.*}} : vector<[4]xi32>
 // CHECK:  %[[BOUNDS:.*]] = llvm.shufflevector %[[BOUNDS_INSERT]], {{.*}} : vector<[4]xi32>
 // CHECK:  %[[RESULT:.*]] = arith.cmpi slt, %[[INDICES]], %[[BOUNDS]] : vector<[4]xi32>
diff --git a/mlir/test/Conversion/VectorToLLVM/vector-xfer-to-llvm.mlir b/mlir/test/Conversion/VectorToLLVM/vector-xfer-to-llvm.mlir
index d3f6d7eca90b4..18deadd0d7a79 100644
--- a/mlir/test/Conversion/VectorToLLVM/vector-xfer-to-llvm.mlir
+++ b/mlir/test/Conversion/VectorToLLVM/vector-xfer-to-llvm.mlir
@@ -28,13 +28,12 @@ func.func @transfer_read_write_1d(%A : memref<?xf32>, %base: index) -> vector<17
 //
 // 4. Create bound vector to compute in-bound mask:
 //    [ 0 .. vector_length - 1 ] < [ dim - offset .. dim - offset ]
-//       CHECK: %[[btrunc:.*]] = arith.index_cast %[[BOUND]] :
-//  CMP32-SAME: index to i32
-//  CMP64-SAME: index to i64
+//    Note: for 32-bit indices, the bound is first clamped via arith.minsi to
+//    prevent i32 overflow for large index values.
+//       CHECK: %[[btrunc:.*]] = arith.index_cast %{{.*}} : index to [[$IDX_TYPE]]
 //       CHECK: %[[boundVecInsert:.*]] = llvm.insertelement %[[btrunc]]
 //       CHECK: %[[boundVect:.*]] = llvm.shufflevector %[[boundVecInsert]]
 //       CHECK: %[[mask:.*]] = arith.cmpi sgt, %[[boundVect]], %[[linearIndex]] : vector<17x[[$IDX_TYPE]]>
-//  CMP64-SAME: : vector<17xi64>
 //
 // 5. Bitcast to vector form.
 //       CHECK: %[[gep:.*]] = llvm.getelementptr %{{.*}} :
@@ -51,8 +50,7 @@ func.func @transfer_read_write_1d(%A : memref<?xf32>, %base: index) -> vector<17
 //
 // 2. Create bound vector to compute in-bound mask:
 //    [ 0 .. vector_length - 1 ] < [ dim - offset .. dim - offset ]
-//       CHECK: %[[btrunc_b:.*]] = arith.index_cast %[[BOUND_b]]
-//  CMP32-SAME: index to i32
+//       CHECK: %[[btrunc_b:.*]] = arith.index_cast %{{.*}} : index to [[$IDX_TYPE]]
 //       CHECK: %[[boundVecInsert_b:.*]] = llvm.insertelement %[[btrunc_b]]
 //       CHECK: %[[boundVect_b:.*]] = llvm.shufflevector %[[boundVecInsert_b]]
 //       CHECK: %[[mask_b:.*]] = arith.cmpi sgt, %[[boundVect_b]],
@@ -93,7 +91,9 @@ func.func @transfer_read_write_1d_scalable(%A : memref<?xf32>, %base: index) ->
 //
 // 4. Create bound vector to compute in-bound mask:
 //    [ 0 .. vector_length - 1 ] < [ dim - offset .. dim - offset ]
-//       CHECK: %[[btrunc:.*]] = arith.index_cast %[[BOUND]] : index to [[$IDX_TYPE]]
+//    Note: for 32-bit indices, the bound is first clamped via arith.minsi to
+//    prevent i32 overflow for large index values.
+//       CHECK: %[[btrunc:.*]] = arith.index_cast %{{.*}} : index to [[$IDX_TYPE]]
 //       CHECK: %[[boundVecInsert:.*]] = llvm.insertelement %[[btrunc]]
 //       CHECK: %[[boundVect:.*]] = llvm.shufflevector %[[boundVecInsert]]
 //       CHECK: %[[mask:.*]] = arith.cmpi slt, %[[linearIndex]], %[[boundVect]]
@@ -117,7 +117,7 @@ func.func @transfer_read_write_1d_scalable(%A : memref<?xf32>, %base: index) ->
 //
 // 3. Create bound vector to compute in-bound mask:
 //    [ 0 .. vector_length - 1 ] < [ dim - offset .. dim - offset ]
-//       CHECK: %[[btrunc_b:.*]] = arith.index_cast %[[BOUND_b]] : index to [[$IDX_TYPE]]
+//       CHECK: %[[btrunc_b:.*]] = arith.index_cast %{{.*}} : index to [[$IDX_TYPE]]
 //       CHECK: %[[boundVecInsert_b:.*]] = llvm.insertelement %[[btrunc_b]]
 //       CHECK: %[[boundVect_b:.*]] = llvm.shufflevector %[[boundVecInsert_b]]
 //       CHECK: %[[mask_b:.*]] = arith.cmpi slt, %[[linearIndex_b]],
@@ -200,7 +200,8 @@ func.func @transfer_read_2d_to_1d(%A : memref<?x?xf32>, %base0: index, %base1: i
 //
 // Create bound vector to compute in-bound mask:
 //    [ 0 .. vector_length - 1 ] < [ dim - offset .. dim - offset ]
-//       CHECK: %[[btrunc:.*]] = arith.index_cast %[[BOUND]] : index to [[$IDX_TYPE]]
+//    Note: for 32-bit indices, the bound is first clamped via arith.minsi.
+//       CHECK: %[[btrunc:.*]] = arith.index_cast %{{.*}} : index to [[$IDX_TYPE]]
 //       CHECK: %[[boundVecInsert:.*]] = llvm.insertelement %[[btrunc]]
 //       CHECK: %[[boundVect:.*]] = llvm.shufflevector %[[boundVecInsert]]
 //       CHECK: %[[mask:.*]] = arith.cmpi sgt, %[[boundVect]], %[[linearIndex]]
@@ -225,7 +226,8 @@ func.func @transfer_read_2d_to_1d_scalable(%A : memref<?x?xf32>, %base0: index,
 //
 // Create bound vector to compute in-bound mask:
 //    [ 0 .. vector_length - 1 ] < [ dim - offset .. dim - offset ]
-//       CHECK: %[[btrunc:.*]] = arith.index_cast %[[BOUND]] : index to [[$IDX_TYPE]]
+//    Note: for 32-bit indices, the bound is first clamped via arith.minsi.
+//       CHECK: %[[btrunc:.*]] = arith.index_cast %{{.*}} : index to [[$IDX_TYPE]]
 //       CHECK: %[[boundVecInsert:.*]] = llvm.insertelement %[[btrunc]]
 //       CHECK: %[[boundVect:.*]] = llvm.shufflevector %[[boundVecInsert]]
 //       CHECK: %[[mask:.*]] = arith.cmpi slt, %[[linearIndex]], %[[boundVect]]
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/create-mask.mlir b/mlir/test/Integration/Dialect/Vector/CPU/create-mask.mlir
index e5d6fa5bbaf0c..c80c5adfb3722 100644
--- a/mlir/test/Integration/Dialect/Vector/CPU/create-mask.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/create-mask.mlir
@@ -33,6 +33,15 @@ func.func @entry() {
   // CHECK: ( 1, 1, 1, 1, 1 )
   // CHECK: ( 1, 1, 1, 1, 1 )
 
+  // Verify that bounds larger than INT32_MAX produce an all-true mask.
+  // With force-32bit-vector-indices (the default), a naive truncating cast of
+  // 2^51 to i32 wraps to 0 and produces an all-false mask; the fix clamps the
+  // bound to `dim` before casting.
+  %large = arith.constant 2251799813685248 : index  // 2^51
+  %9 = func.call @create_mask_large_bound(%large) : (index) -> vector<5xi1>
+  vector.print %9 : vector<5xi1>
+  // CHECK: ( 1, 1, 1, 1, 1 )
+
   //
   // 2-D.
   //
@@ -112,3 +121,10 @@ func.func @entry() {
 
   return
 }
+
+// Helper for the large-bound overflow regression test: takes a runtime index
+// so that vector.create_mask cannot be constant-folded before lowering.
+func.func @create_mask_large_bound(%n : index) -> vector<5xi1> {
+  %0 = vector.create_mask %n : vector<5xi1>
+  return %0 : vector<5xi1>
+}