[Mlir-commits] [mlir] [MLIR][Vector] Fix i32 overflow in vector mask lowering for large index bounds (PR #188782)

Thu Apr 16 03:57:39 PDT 2026

https://github.com/joker-eph updated https://github.com/llvm/llvm-project/pull/188782

>From 1d1b0f1e4def486959405b92587df11068308344 Mon Sep 17 00:00:00 2001
From: Mehdi Amini <joker.eph at gmail.com>
Date: Thu, 26 Mar 2026 07:57:44 -0700
Subject: [PATCH] [MLIR][Vector] Fix i32 overflow in vector mask lowering for
 large index bounds

When lowering vector masks with `force-32bit-vector-indices=true`, the
mask bound (an `index`-typed value) was cast directly to `i32`. For values
larger than INT32_MAX (e.g., 2^51), the truncating cast wraps to a small
or negative i32, turning an all-true mask into an all-false one.

Fix both the fixed-size vector path (`buildVectorComparison` in
VectorTransforms.cpp) and the scalable vector path
(`VectorCreateMaskOpConversion` in ConvertVectorToLLVM.cpp) uniformly:
clamp the bound to INT32_MAX via `arith.minsi` in index type before the
cast. For fixed-size vectors `dim` would be a tighter bound (any b >= dim
already implies all-true), but INT32_MAX is used for consistency across
both paths.

Add a LIT regression test with a 2^51 bound and an integration test that
executes the lowered code to confirm the mask is all-true at runtime.

Fixes #113689

Assisted-by: Claude Code
---
 .../VectorToLLVM/ConvertVectorToLLVM.cpp      | 16 ++++++++++++--
 .../Vector/Transforms/VectorTransforms.cpp    | 10 +++++++++
 .../VectorToLLVM/vector-mask-to-llvm.mlir     |  8 +++++--
 .../VectorToLLVM/vector-to-llvm.mlir          | 12 +++++++---
 .../VectorToLLVM/vector-xfer-to-llvm.mlir     | 22 ++++++++++---------
 .../Dialect/Vector/CPU/create-mask.mlir       | 16 ++++++++++++++
 6 files changed, 67 insertions(+), 17 deletions(-)

diff --git a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
index 49f1815a8d941..43e0824fef6cd 100644
--- a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
+++ b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
@@ -1510,8 +1510,20 @@ class VectorCreateMaskOpConversion
         rewriter, loc,
         LLVM::getVectorType(idxType, dstType.getShape()[0],
                             /*isScalable=*/true));
-    auto bound = getValueOrCreateCastToIndexLike(rewriter, loc, idxType,
-                                                 adaptor.getOperands()[0]);
+    Value maskBound = adaptor.getOperands()[0];
+    // When using 32-bit indices, cap the bound at INT32_MAX in index type
+    // before casting. For scalable vectors the runtime size (vscale * dim) is
+    // unknown at compile time, so we can't clamp to `dim` as in the fixed-size
+    // path. Clamping to INT32_MAX is safe because any realistic scalable vector
+    // size fits well below this limit, so a bound >= vscale*dim still produces
+    // an all-true mask after the comparison.
+    if (force32BitVectorIndices) {
+      Value maxBound =
+          arith::ConstantIndexOp::create(rewriter, loc, (1LL << 31) - 1);
+      maskBound = arith::MinSIOp::create(rewriter, loc, maskBound, maxBound);
+    }
+    auto bound =
+        getValueOrCreateCastToIndexLike(rewriter, loc, idxType, maskBound);
     Value bounds = BroadcastOp::create(rewriter, loc, indices.getType(), bound);
     Value comp = arith::CmpIOp::create(rewriter, loc, arith::CmpIPredicate::slt,
                                        indices, bounds);
diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp
index a2d59010a2901..752610efc6992 100644
--- a/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp
@@ -1361,6 +1361,16 @@ static Value buildVectorComparison(PatternRewriter &rewriter, Operation *op,
     indices = arith::AddIOp::create(rewriter, loc, ov, indices);
   }
   // Construct the vector comparison.
+  // When using 32-bit indices, cap `b` at INT32_MAX before casting to prevent
+  // signed overflow for large index values (e.g., 2^51 wrapping to 0 in i32).
+  // Note: for fixed-size vectors, `dim` is a tighter bound (since any b >= dim
+  // already implies all-true), but we use INT32_MAX for uniformity with the
+  // scalable-vector path.
+  if (force32BitVectorIndices) {
+    Value maxBound =
+        arith::ConstantIndexOp::create(rewriter, loc, (1LL << 31) - 1);
+    b = arith::MinSIOp::create(rewriter, loc, b, maxBound);
+  }
   Value bound = getValueOrCreateCastToIndexLike(rewriter, loc, idxType, b);
   Value bounds =
       vector::BroadcastOp::create(rewriter, loc, indices.getType(), bound);
diff --git a/mlir/test/Conversion/VectorToLLVM/vector-mask-to-llvm.mlir b/mlir/test/Conversion/VectorToLLVM/vector-mask-to-llvm.mlir
index 91e5358622b69..5e3929931582a 100644
--- a/mlir/test/Conversion/VectorToLLVM/vector-mask-to-llvm.mlir
+++ b/mlir/test/Conversion/VectorToLLVM/vector-mask-to-llvm.mlir
@@ -4,7 +4,9 @@
 // CMP32-LABEL: @genbool_var_1d(
 // CMP32-SAME: %[[ARG:.*]]: index)
 // CMP32: %[[T0:.*]] = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]> : vector<11xi32>
-// CMP32: %[[T1:.*]] = arith.index_cast %[[ARG]] : index to i32
+// CMP32: %[[MAX:.*]] = arith.constant 2147483647 : index
+// CMP32: %[[CLAMPED:.*]] = arith.minsi %[[ARG]], %[[MAX]] : index
+// CMP32: %[[T1:.*]] = arith.index_cast %[[CLAMPED]] : index to i32
 // CMP32: %[[T2:.*]] = llvm.insertelement %[[T1]], %{{.*}}[%{{.*}} : i32] : vector<11xi32>
 // CMP32: %[[T3:.*]] = llvm.shufflevector %[[T2]], %{{.*}} [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<11xi32>
 // CMP32: %[[T4:.*]] = arith.cmpi sgt, %[[T3]], %[[T0]] : vector<11xi32>
@@ -27,7 +29,9 @@ func.func @genbool_var_1d(%arg0: index) -> vector<11xi1> {
 // CMP32-LABEL: @genbool_var_1d_scalable(
 // CMP32-SAME: %[[ARG:.*]]: index)
 // CMP32: %[[T0:.*]] = llvm.intr.stepvector : vector<[11]xi32>
-// CMP32: %[[T1:.*]] = arith.index_cast %[[ARG]] : index to i32
+// CMP32: %[[MAX:.*]] = arith.constant 2147483647 : index
+// CMP32: %[[CLAMPED:.*]] = arith.minsi %[[ARG]], %[[MAX]] : index
+// CMP32: %[[T1:.*]] = arith.index_cast %[[CLAMPED]] : index to i32
 // CMP32: %[[T2:.*]] = llvm.insertelement %[[T1]], %{{.*}}[%{{.*}} : i32] : vector<[11]xi32>
 // CMP32: %[[T3:.*]] = llvm.shufflevector %[[T2]], %{{.*}} [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<[11]xi32>
 // CMP32: %[[T4:.*]] = arith.cmpi slt, %[[T0]], %[[T3]] : vector<[11]xi32>
diff --git a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir
index d669a3bac3336..77f60b3172296 100644
--- a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir
+++ b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir
@@ -1544,7 +1544,9 @@ func.func @create_mask_0d(%num_elems : index) -> vector<i1> {
 // CHECK-LABEL: func @create_mask_0d
 // CHECK-SAME: %[[NUM_ELEMS:.*]]: index
 // CHECK:  %[[INDICES:.*]] = arith.constant dense<0> : vector<i32>
-// CHECK:  %[[NUM_ELEMS_i32:.*]] = arith.index_cast %[[NUM_ELEMS]] : index to i32
+// CHECK:  %[[MAX:.*]] = arith.constant 2147483647 : index
+// CHECK:  %[[CLAMPED:.*]] = arith.minsi %[[NUM_ELEMS]], %[[MAX]] : index
+// CHECK:  %[[NUM_ELEMS_i32:.*]] = arith.index_cast %[[CLAMPED]] : index to i32
 // CHECK:  %[[BOUNDS:.*]] = llvm.insertelement %[[NUM_ELEMS_i32]]
 // CHECK:  %[[BOUNDS_CAST:.*]] = builtin.unrealized_conversion_cast %[[BOUNDS]] : vector<1xi32> to vector<i32>
 // CHECK:  %[[RESULT:.*]] = arith.cmpi sgt, %[[BOUNDS_CAST]], %[[INDICES]] : vector<i32>
@@ -1560,7 +1562,9 @@ func.func @create_mask_1d(%num_elems : index) -> vector<4xi1> {
 // CHECK-LABEL: func @create_mask_1d
 // CHECK-SAME: %[[NUM_ELEMS:.*]]: index
 // CHECK:  %[[INDICES:.*]] = arith.constant dense<[0, 1, 2, 3]> : vector<4xi32>
-// CHECK:  %[[NUM_ELEMS_i32:.*]] = arith.index_cast %[[NUM_ELEMS]] : index to i32
+// CHECK:  %[[MAX:.*]] = arith.constant 2147483647 : index
+// CHECK:  %[[CLAMPED:.*]] = arith.minsi %[[NUM_ELEMS]], %[[MAX]] : index
+// CHECK:  %[[NUM_ELEMS_i32:.*]] = arith.index_cast %[[CLAMPED]] : index to i32
 // CHECK:  %[[BOUNDS_INSERT:.*]] = llvm.insertelement %[[NUM_ELEMS_i32]]
 // CHECK:  %[[BOUNDS:.*]] = llvm.shufflevector %[[BOUNDS_INSERT]]
 // CHECK:  %[[RESULT:.*]] = arith.cmpi sgt, %[[BOUNDS]], %[[INDICES]] : vector<4xi32>
@@ -1576,7 +1580,9 @@ func.func @create_mask_1d_scalable(%num_elems : index) -> vector<[4]xi1> {
 // CHECK-LABEL: func @create_mask_1d_scalable
 // CHECK-SAME: %[[NUM_ELEMS:.*]]: index
 // CHECK:  %[[INDICES:.*]] = llvm.intr.stepvector : vector<[4]xi32>
-// CHECK:  %[[NUM_ELEMS_i32:.*]] = arith.index_cast %[[NUM_ELEMS]] : index to i32
+// CHECK:  %[[MAX:.*]] = arith.constant 2147483647 : index
+// CHECK:  %[[CLAMPED:.*]] = arith.minsi %[[NUM_ELEMS]], %[[MAX]] : index
+// CHECK:  %[[NUM_ELEMS_i32:.*]] = arith.index_cast %[[CLAMPED]] : index to i32
 // CHECK:  %[[BOUNDS_INSERT:.*]] = llvm.insertelement %[[NUM_ELEMS_i32]], {{.*}} : vector<[4]xi32>
 // CHECK:  %[[BOUNDS:.*]] = llvm.shufflevector %[[BOUNDS_INSERT]], {{.*}} : vector<[4]xi32>
 // CHECK:  %[[RESULT:.*]] = arith.cmpi slt, %[[INDICES]], %[[BOUNDS]] : vector<[4]xi32>
diff --git a/mlir/test/Conversion/VectorToLLVM/vector-xfer-to-llvm.mlir b/mlir/test/Conversion/VectorToLLVM/vector-xfer-to-llvm.mlir
index d3f6d7eca90b4..18deadd0d7a79 100644
--- a/mlir/test/Conversion/VectorToLLVM/vector-xfer-to-llvm.mlir
+++ b/mlir/test/Conversion/VectorToLLVM/vector-xfer-to-llvm.mlir
@@ -28,13 +28,12 @@ func.func @transfer_read_write_1d(%A : memref<?xf32>, %base: index) -> vector<17
 //
 // 4. Create bound vector to compute in-bound mask:
 //    [ 0 .. vector_length - 1 ] < [ dim - offset .. dim - offset ]
-//       CHECK: %[[btrunc:.*]] = arith.index_cast %[[BOUND]] :
-//  CMP32-SAME: index to i32
-//  CMP64-SAME: index to i64
+//    Note: for 32-bit indices, the bound is first clamped via arith.minsi to
+//    prevent i32 overflow for large index values.
+//       CHECK: %[[btrunc:.*]] = arith.index_cast %{{.*}} : index to [[$IDX_TYPE]]
 //       CHECK: %[[boundVecInsert:.*]] = llvm.insertelement %[[btrunc]]
 //       CHECK: %[[boundVect:.*]] = llvm.shufflevector %[[boundVecInsert]]
 //       CHECK: %[[mask:.*]] = arith.cmpi sgt, %[[boundVect]], %[[linearIndex]] : vector<17x[[$IDX_TYPE]]>
-//  CMP64-SAME: : vector<17xi64>
 //
 // 5. Bitcast to vector form.
 //       CHECK: %[[gep:.*]] = llvm.getelementptr %{{.*}} :
@@ -51,8 +50,7 @@ func.func @transfer_read_write_1d(%A : memref<?xf32>, %base: index) -> vector<17
 //
 // 2. Create bound vector to compute in-bound mask:
 //    [ 0 .. vector_length - 1 ] < [ dim - offset .. dim - offset ]
-//       CHECK: %[[btrunc_b:.*]] = arith.index_cast %[[BOUND_b]]
-//  CMP32-SAME: index to i32
+//       CHECK: %[[btrunc_b:.*]] = arith.index_cast %{{.*}} : index to [[$IDX_TYPE]]
 //       CHECK: %[[boundVecInsert_b:.*]] = llvm.insertelement %[[btrunc_b]]
 //       CHECK: %[[boundVect_b:.*]] = llvm.shufflevector %[[boundVecInsert_b]]
 //       CHECK: %[[mask_b:.*]] = arith.cmpi sgt, %[[boundVect_b]],
@@ -93,7 +91,9 @@ func.func @transfer_read_write_1d_scalable(%A : memref<?xf32>, %base: index) ->
 //
 // 4. Create bound vector to compute in-bound mask:
 //    [ 0 .. vector_length - 1 ] < [ dim - offset .. dim - offset ]
-//       CHECK: %[[btrunc:.*]] = arith.index_cast %[[BOUND]] : index to [[$IDX_TYPE]]
+//    Note: for 32-bit indices, the bound is first clamped via arith.minsi to
+//    prevent i32 overflow for large index values.
+//       CHECK: %[[btrunc:.*]] = arith.index_cast %{{.*}} : index to [[$IDX_TYPE]]
 //       CHECK: %[[boundVecInsert:.*]] = llvm.insertelement %[[btrunc]]
 //       CHECK: %[[boundVect:.*]] = llvm.shufflevector %[[boundVecInsert]]
 //       CHECK: %[[mask:.*]] = arith.cmpi slt, %[[linearIndex]], %[[boundVect]]
@@ -117,7 +117,7 @@ func.func @transfer_read_write_1d_scalable(%A : memref<?xf32>, %base: index) ->
 //
 // 3. Create bound vector to compute in-bound mask:
 //    [ 0 .. vector_length - 1 ] < [ dim - offset .. dim - offset ]
-//       CHECK: %[[btrunc_b:.*]] = arith.index_cast %[[BOUND_b]] : index to [[$IDX_TYPE]]
+//       CHECK: %[[btrunc_b:.*]] = arith.index_cast %{{.*}} : index to [[$IDX_TYPE]]
 //       CHECK: %[[boundVecInsert_b:.*]] = llvm.insertelement %[[btrunc_b]]
 //       CHECK: %[[boundVect_b:.*]] = llvm.shufflevector %[[boundVecInsert_b]]
 //       CHECK: %[[mask_b:.*]] = arith.cmpi slt, %[[linearIndex_b]],
@@ -200,7 +200,8 @@ func.func @transfer_read_2d_to_1d(%A : memref<?x?xf32>, %base0: index, %base1: i
 //
 // Create bound vector to compute in-bound mask:
 //    [ 0 .. vector_length - 1 ] < [ dim - offset .. dim - offset ]
-//       CHECK: %[[btrunc:.*]] = arith.index_cast %[[BOUND]] : index to [[$IDX_TYPE]]
+//    Note: for 32-bit indices, the bound is first clamped via arith.minsi.
+//       CHECK: %[[btrunc:.*]] = arith.index_cast %{{.*}} : index to [[$IDX_TYPE]]
 //       CHECK: %[[boundVecInsert:.*]] = llvm.insertelement %[[btrunc]]
 //       CHECK: %[[boundVect:.*]] = llvm.shufflevector %[[boundVecInsert]]
 //       CHECK: %[[mask:.*]] = arith.cmpi sgt, %[[boundVect]], %[[linearIndex]]
@@ -225,7 +226,8 @@ func.func @transfer_read_2d_to_1d_scalable(%A : memref<?x?xf32>, %base0: index,
 //
 // Create bound vector to compute in-bound mask:
 //    [ 0 .. vector_length - 1 ] < [ dim - offset .. dim - offset ]
-//       CHECK: %[[btrunc:.*]] = arith.index_cast %[[BOUND]] : index to [[$IDX_TYPE]]
+//    Note: for 32-bit indices, the bound is first clamped via arith.minsi.
+//       CHECK: %[[btrunc:.*]] = arith.index_cast %{{.*}} : index to [[$IDX_TYPE]]
 //       CHECK: %[[boundVecInsert:.*]] = llvm.insertelement %[[btrunc]]
 //       CHECK: %[[boundVect:.*]] = llvm.shufflevector %[[boundVecInsert]]
 //       CHECK: %[[mask:.*]] = arith.cmpi slt, %[[linearIndex]], %[[boundVect]]
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/create-mask.mlir b/mlir/test/Integration/Dialect/Vector/CPU/create-mask.mlir
index e5d6fa5bbaf0c..6ef9ff32fc52d 100644
--- a/mlir/test/Integration/Dialect/Vector/CPU/create-mask.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/create-mask.mlir
@@ -33,6 +33,15 @@ func.func @entry() {
   // CHECK: ( 1, 1, 1, 1, 1 )
   // CHECK: ( 1, 1, 1, 1, 1 )
 
+  // Verify that bounds larger than INT32_MAX produce an all-true mask.
+  // With force-32bit-vector-indices (the default), a naive truncating cast of
+  // 2^51 to i32 wraps to 0 and produces an all-false mask; the fix clamps the
+  // bound to INT32_MAX before casting.
+  %large = arith.constant 2251799813685248 : index  // 2^51
+  %9 = func.call @create_mask_large_bound(%large) : (index) -> vector<5xi1>
+  vector.print %9 : vector<5xi1>
+  // CHECK: ( 1, 1, 1, 1, 1 )
+
   //
   // 2-D.
   //
@@ -112,3 +121,10 @@ func.func @entry() {
 
   return
 }
+
+// Helper for the large-bound overflow regression test: takes a runtime index
+// so that vector.create_mask cannot be constant-folded before lowering.
+func.func @create_mask_large_bound(%n : index) -> vector<5xi1> {
+  %0 = vector.create_mask %n : vector<5xi1>
+  return %0 : vector<5xi1>
+}