[llvm] [InstCombineCompares] Try to "strengthen" compares based on known bits. (PR #79405)
Mikhail Gudim via llvm-commits
llvm-commits at lists.llvm.org
Wed Jan 31 23:33:26 PST 2024
https://github.com/mgudim updated https://github.com/llvm/llvm-project/pull/79405
From 20e1c91d73bd5d140f4a5985410536294cf1320f Mon Sep 17 00:00:00 2001
From: Mikhail Gudim <mgudim at gmail.com>
Date: Tue, 16 Jan 2024 03:58:34 -0500
Subject: [PATCH 1/7] [InstCombineCompares] Try to "strengthen" compares based
on known bits.
For example, replace `icmp ugt %x, 14` with `icmp ugt %x, 15` when
it is known that the two least significant bits of `%x` are zero.
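As a rough illustration (not part of the patch), the unsigned case above boils
down to rounding the lower bound of the compare's range up to the next value
allowed by the known bits. A minimal standalone sketch, assuming an LLVM build
so that llvm/ADT/APInt.h and APIntOps::RoundingUDiv are available:

```cpp
// Sketch only: with the two low bits of %x known to be zero, the smallest
// value allowed by `icmp ugt %x, 14` is rounded up to the next multiple of 4,
// and the strict compare is rebuilt against that bound minus one.
#include "llvm/ADT/APInt.h"
#include <cassert>
using namespace llvm;

int main() {
  unsigned BitWidth = 16;
  APInt Min(BitWidth, 15);   // `ugt 14` means %x >= 15.
  APInt PowOf2(BitWidth, 4); // two known-zero low bits => multiple of 4
  APInt Refined =
      PowOf2 * APIntOps::RoundingUDiv(Min, PowOf2, APInt::Rounding::UP);
  assert(Refined == 16);     // smallest multiple of 4 that is >= 15
  assert(Refined - 1 == 15); // so `ugt 14` becomes `ugt 15`
  return 0;
}
```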
---
.../InstCombine/InstCombineCompares.cpp | 72 ++++++++++++++++
llvm/test/Transforms/InstCombine/icmp.ll | 82 +++++++++++++++----
2 files changed, 138 insertions(+), 16 deletions(-)
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
index 8c0fd66225513..c81e229e1c59b 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -6357,6 +6357,78 @@ Instruction *InstCombinerImpl::foldICmpUsingKnownBits(ICmpInst &I) {
(Op0Known.One.isNegative() && Op1Known.One.isNegative())))
return new ICmpInst(I.getUnsignedPredicate(), Op0, Op1);
+ // Try to "strengthen" the RHS of the compare based on known bits.
+ // For example, replace `icmp ugt %x, 14` with `icmp ugt %x, 15` when
+ // it is known that the two least significant bits of `%x` are zero.
+ if (Op1Known.isConstant() && Op0Known.Zero.isMask()) {
+ APInt RHSConst = Op1Known.getConstant();
+ ConstantRange Op0PredRange =
+ ConstantRange::makeExactICmpRegion(Pred, RHSConst);
+ int KnownZeroMaskLength = BitWidth - Op0Known.Zero.countLeadingZeros();
+ if (KnownZeroMaskLength > 0) {
+ APInt PowOf2(BitWidth, 1 << KnownZeroMaskLength);
+ APInt Op0PredMin(BitWidth, 0);
+ APInt Op0PredMax(BitWidth, 0);
+ APInt Op0MinRefinedByKnownBits(BitWidth, 0);
+ APInt Op0MaxRefinedByKnownBits(BitWidth, 0);
+ APInt NewLower(BitWidth, 0);
+ APInt NewUpper(BitWidth, 0);
+ bool ImprovedLower = false;
+ bool ImprovedUpper = false;
+ if (I.isSigned()) {
+ Op0PredMin = Op0PredRange.getSignedMin();
+ Op0PredMax = Op0PredRange.getSignedMax();
+ // Compute the smallest number satisfying the known-bits constraint
+ // which is greater than or equal to Op0PredMin.
+ Op0MinRefinedByKnownBits =
+ PowOf2 *
+ APIntOps::RoundingSDiv(Op0PredMin, PowOf2, APInt::Rounding::UP);
+ // Compute the largest number satisfying the known-bits constraint
+ // which is less than or equal to Op0PredMax.
+ Op0MaxRefinedByKnownBits =
+ PowOf2 *
+ APIntOps::RoundingSDiv(Op0PredMax, PowOf2, APInt::Rounding::DOWN);
+ NewLower = APIntOps::smax(Op0MinRefinedByKnownBits, Op0PredMin);
+ NewUpper = APIntOps::smin(Op0MaxRefinedByKnownBits, Op0PredMax);
+ ImprovedLower = NewLower.sgt(Op0PredMin);
+ ImprovedUpper = NewUpper.slt(Op0PredMax);
+ } else {
+ Op0PredMin = Op0PredRange.getUnsignedMin();
+ Op0PredMax = Op0PredRange.getUnsignedMax();
+ Op0MinRefinedByKnownBits =
+ PowOf2 *
+ APIntOps::RoundingUDiv(Op0PredMin, PowOf2, APInt::Rounding::UP);
+ Op0MaxRefinedByKnownBits =
+ PowOf2 *
+ APIntOps::RoundingUDiv(Op0PredMax, PowOf2, APInt::Rounding::DOWN);
+ NewLower = APIntOps::umax(Op0MinRefinedByKnownBits, Op0PredMin);
+ NewUpper = APIntOps::umin(Op0MaxRefinedByKnownBits, Op0PredMax);
+ ImprovedLower = NewLower.ugt(Op0PredMin);
+ ImprovedUpper = NewUpper.ult(Op0PredMax);
+ }
+
+ // Non-strict inequalities should have been canonicalized to strict ones
+ // by now.
+ switch (Pred) {
+ default:
+ break;
+ case ICmpInst::ICMP_ULT:
+ case ICmpInst::ICMP_SLT: {
+ if (ImprovedUpper)
+ return new ICmpInst(Pred, Op0,
+ ConstantInt::get(Op1->getType(), NewUpper + 1));
+ break;
+ }
+ case ICmpInst::ICMP_UGT:
+ case ICmpInst::ICMP_SGT: {
+ if (ImprovedLower)
+ return new ICmpInst(Pred, Op0,
+ ConstantInt::get(Op1->getType(), NewLower - 1));
+ break;
+ }
+ }
+ }
+ }
return nullptr;
}
diff --git a/llvm/test/Transforms/InstCombine/icmp.ll b/llvm/test/Transforms/InstCombine/icmp.ll
index 10ab1fe118348..7b96f908c69c1 100644
--- a/llvm/test/Transforms/InstCombine/icmp.ll
+++ b/llvm/test/Transforms/InstCombine/icmp.ll
@@ -1490,8 +1490,8 @@ define <2 x i1> @test70vec(<2 x i32> %X) {
define i1 @icmp_sext16trunc(i32 %x) {
; CHECK-LABEL: @icmp_sext16trunc(
-; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[X:%.*]] to i16
-; CHECK-NEXT: [[CMP:%.*]] = icmp slt i16 [[TMP1]], 36
+; CHECK-NEXT: [[SEXT1:%.*]] = shl i32 [[X:%.*]], 16
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[SEXT1]], 2293761
; CHECK-NEXT: ret i1 [[CMP]]
;
%trunc = trunc i32 %x to i16
@@ -1502,8 +1502,8 @@ define i1 @icmp_sext16trunc(i32 %x) {
define i1 @icmp_sext8trunc(i32 %x) {
; CHECK-LABEL: @icmp_sext8trunc(
-; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[X:%.*]] to i8
-; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[TMP1]], 36
+; CHECK-NEXT: [[SEXT1:%.*]] = shl i32 [[X:%.*]], 24
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[SEXT1]], 587202561
; CHECK-NEXT: ret i1 [[CMP]]
;
%trunc = trunc i32 %x to i8
@@ -1515,8 +1515,8 @@ define i1 @icmp_sext8trunc(i32 %x) {
; Vectors should fold the same way.
define <2 x i1> @icmp_sext8trunc_vec(<2 x i32> %x) {
; CHECK-LABEL: @icmp_sext8trunc_vec(
-; CHECK-NEXT: [[TMP1:%.*]] = trunc <2 x i32> [[X:%.*]] to <2 x i8>
-; CHECK-NEXT: [[CMP:%.*]] = icmp slt <2 x i8> [[TMP1]], <i8 36, i8 36>
+; CHECK-NEXT: [[TMP1:%.*]] = shl <2 x i32> [[X:%.*]], <i32 24, i32 24>
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt <2 x i32> [[TMP1]], <i32 587202561, i32 587202561>
; CHECK-NEXT: ret <2 x i1> [[CMP]]
;
%trunc = trunc <2 x i32> %x to <2 x i8>
@@ -1527,8 +1527,8 @@ define <2 x i1> @icmp_sext8trunc_vec(<2 x i32> %x) {
define i1 @icmp_shl16(i32 %x) {
; CHECK-LABEL: @icmp_shl16(
-; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[X:%.*]] to i16
-; CHECK-NEXT: [[CMP:%.*]] = icmp slt i16 [[TMP1]], 36
+; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[X:%.*]], 16
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[SHL]], 2293761
; CHECK-NEXT: ret i1 [[CMP]]
;
%shl = shl i32 %x, 16
@@ -1541,7 +1541,7 @@ define i1 @icmp_shl16(i32 %x) {
define i1 @icmp_shl17(i32 %x) {
; CHECK-LABEL: @icmp_shl17(
; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[X:%.*]], 17
-; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[SHL]], 2359296
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[SHL]], 2228225
; CHECK-NEXT: ret i1 [[CMP]]
;
%shl = shl i32 %x, 17
@@ -1551,8 +1551,8 @@ define i1 @icmp_shl17(i32 %x) {
define <2 x i1> @icmp_shl16_vec(<2 x i32> %x) {
; CHECK-LABEL: @icmp_shl16_vec(
-; CHECK-NEXT: [[TMP1:%.*]] = trunc <2 x i32> [[X:%.*]] to <2 x i16>
-; CHECK-NEXT: [[CMP:%.*]] = icmp slt <2 x i16> [[TMP1]], <i16 36, i16 36>
+; CHECK-NEXT: [[SHL:%.*]] = shl <2 x i32> [[X:%.*]], <i32 16, i32 16>
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt <2 x i32> [[SHL]], <i32 2293761, i32 2293761>
; CHECK-NEXT: ret <2 x i1> [[CMP]]
;
%shl = shl <2 x i32> %x, <i32 16, i32 16>
@@ -1562,8 +1562,8 @@ define <2 x i1> @icmp_shl16_vec(<2 x i32> %x) {
define i1 @icmp_shl24(i32 %x) {
; CHECK-LABEL: @icmp_shl24(
-; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[X:%.*]] to i8
-; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[TMP1]], 36
+; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[X:%.*]], 24
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[SHL]], 587202561
; CHECK-NEXT: ret i1 [[CMP]]
;
%shl = shl i32 %x, 24
@@ -2199,7 +2199,7 @@ define i1 @icmp_ashr_and_overshift(i8 %X) {
define i1 @icmp_and_ashr_neg_and_legal(i8 %x) {
; CHECK-LABEL: @icmp_and_ashr_neg_and_legal(
; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[X:%.*]], -32
-; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[TMP1]], 16
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[TMP1]], 1
; CHECK-NEXT: ret i1 [[CMP]]
;
%ashr = ashr i8 %x, 4
@@ -2225,7 +2225,7 @@ define i1 @icmp_and_ashr_mixed_and_shiftout(i8 %x) {
define i1 @icmp_and_ashr_neg_cmp_slt_legal(i8 %x) {
; CHECK-LABEL: @icmp_and_ashr_neg_cmp_slt_legal(
; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[X:%.*]], -32
-; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[TMP1]], -64
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[TMP1]], -95
; CHECK-NEXT: ret i1 [[CMP]]
;
%ashr = ashr i8 %x, 4
@@ -2239,7 +2239,7 @@ define i1 @icmp_and_ashr_neg_cmp_slt_shiftout(i8 %x) {
; CHECK-LABEL: @icmp_and_ashr_neg_cmp_slt_shiftout(
; CHECK-NEXT: [[ASHR:%.*]] = ashr i8 [[X:%.*]], 4
; CHECK-NEXT: [[AND:%.*]] = and i8 [[ASHR]], -2
-; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[AND]], -68
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[AND]], -69
; CHECK-NEXT: ret i1 [[CMP]]
;
%ashr = ashr i8 %x, 4
@@ -5183,3 +5183,53 @@ entry:
%cmp = icmp eq i8 %add2, %add1
ret i1 %cmp
}
+
+define i1 @tighten_icmp_using_known_bits_ugt(i16 %a) {
+; CHECK-LABEL: @tighten_icmp_using_known_bits_ugt(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i16 [[A:%.*]], 15
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+entry:
+ %and_ = and i16 %a, 65532
+ %cmp = icmp ugt i16 %and_, 14
+ ret i1 %cmp
+}
+
+define i1 @tighten_icmp_using_known_bits_ult(i16 %a) {
+; CHECK-LABEL: @tighten_icmp_using_known_bits_ult(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[AND_:%.*]] = and i16 [[A:%.*]], -4
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i16 [[AND_]], 17
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+entry:
+ %and_ = and i16 %a, 65532
+ %cmp = icmp ult i16 %and_, 18
+ ret i1 %cmp
+}
+
+define i1 @tighten_icmp_using_known_bits_sgt(i16 %a) {
+; CHECK-LABEL: @tighten_icmp_using_known_bits_sgt(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i16 [[A:%.*]], -1
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+entry:
+ %and_ = and i16 %a, 65520
+ %cmp = icmp sgt i16 %and_, -15
+ ret i1 %cmp
+}
+
+define i1 @tighten_icmp_using_known_bits_slt(i16 %a) {
+; CHECK-LABEL: @tighten_icmp_using_known_bits_slt(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[AND_:%.*]] = and i16 [[A:%.*]], -4
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i16 [[AND_]], -15
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+entry:
+ %and_ = and i16 %a, 65532
+ %cmp = icmp slt i16 %and_, -14
+ ret i1 %cmp
+}
From ec9315d8239327b56bb0cd433ab75cc999149dff Mon Sep 17 00:00:00 2001
From: Mikhail Gudim <mgudim at gmail.com>
Date: Thu, 25 Jan 2024 00:15:03 -0500
Subject: [PATCH 2/7] Updated tests.
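Most of the new constants in these tests follow mechanically from the fold
above. For instance, in indexed-gep-compares.ll the compared index has its two
low bits known to be zero, so `icmp sgt i32 %rhs.idx, 400` is strengthened to
`icmp sgt i32 %rhs.idx, 403`. A rough sketch of that signed rounding
(illustrative only, assuming LLVM's APInt utilities; not part of the patch):

```cpp
// Sketch only: an index with two known-zero low bits that must satisfy
// `sgt 400` really has to reach the next multiple of 4, i.e. 404,
// so the compare becomes `sgt 403`.
#include "llvm/ADT/APInt.h"
#include <cassert>
using namespace llvm;

int main() {
  APInt Min(32, 401);  // `sgt 400` means the index is >= 401.
  APInt PowOf2(32, 4); // two known-zero low bits => multiple of 4
  APInt Refined =
      PowOf2 * APIntOps::RoundingSDiv(Min, PowOf2, APInt::Rounding::UP);
  assert(Refined == 404);
  assert(Refined - 1 == 403); // `sgt 400` -> `sgt 403`
  return 0;
}
```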
---
.../InstCombine/2007-10-31-RangeCrash.ll | 2 +-
.../InstCombine/assume-loop-align.ll | 2 +-
.../InstCombine/fold-signbit-test-power2.ll | 8 +++--
llvm/test/Transforms/InstCombine/icmp-mul.ll | 2 +-
llvm/test/Transforms/InstCombine/icmp-or.ll | 2 +-
.../Transforms/InstCombine/icmp-shl-nsw.ll | 4 +--
.../InstCombine/indexed-gep-compares.ll | 10 +++---
.../test/Transforms/InstCombine/opaque-ptr.ll | 4 +--
llvm/test/Transforms/InstCombine/pr17827.ll | 18 +++++-----
llvm/test/Transforms/InstCombine/pr27343.ll | 4 +--
llvm/test/Transforms/InstCombine/select.ll | 2 +-
llvm/test/Transforms/InstCombine/shift.ll | 12 +++----
.../InstCombine/shl-unsigned-cmp-const.ll | 34 +++++++++----------
13 files changed, 54 insertions(+), 50 deletions(-)
diff --git a/llvm/test/Transforms/InstCombine/2007-10-31-RangeCrash.ll b/llvm/test/Transforms/InstCombine/2007-10-31-RangeCrash.ll
index 8b472aa5af090..d4ebeba0c86ea 100644
--- a/llvm/test/Transforms/InstCombine/2007-10-31-RangeCrash.ll
+++ b/llvm/test/Transforms/InstCombine/2007-10-31-RangeCrash.ll
@@ -17,7 +17,7 @@ define i32 @test() {
; CHECK-NEXT: br label [[BB51_I_I]]
; CHECK: bb51.i.i:
; CHECK-NEXT: [[X_0_I_I]] = phi i32 [ [[TMP50_I_I]], [[BB27_I_I:%.*]] ], [ 0, [[BB_I]] ]
-; CHECK-NEXT: [[TMP54_I_I:%.*]] = icmp slt i32 [[X_0_I_I]], 0
+; CHECK-NEXT: [[TMP54_I_I:%.*]] = icmp slt i32 [[X_0_I_I]], -1
; CHECK-NEXT: br i1 [[TMP54_I_I]], label [[BB27_I_I]], label [[BB57_I_I:%.*]]
; CHECK: bb57.i.i:
; CHECK-NEXT: ret i32 0
diff --git a/llvm/test/Transforms/InstCombine/assume-loop-align.ll b/llvm/test/Transforms/InstCombine/assume-loop-align.ll
index e7eb18c61b6bb..79af1b0fede4b 100644
--- a/llvm/test/Transforms/InstCombine/assume-loop-align.ll
+++ b/llvm/test/Transforms/InstCombine/assume-loop-align.ll
@@ -28,7 +28,7 @@ define void @foo(ptr %a, ptr %b) #0 {
; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX5]], align 4
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 16
; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
-; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP1]], 1648
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP1]], 1633
; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]]
; CHECK: for.end:
; CHECK-NEXT: ret void
diff --git a/llvm/test/Transforms/InstCombine/fold-signbit-test-power2.ll b/llvm/test/Transforms/InstCombine/fold-signbit-test-power2.ll
index f5024664f58c3..fc176f00486c9 100644
--- a/llvm/test/Transforms/InstCombine/fold-signbit-test-power2.ll
+++ b/llvm/test/Transforms/InstCombine/fold-signbit-test-power2.ll
@@ -26,7 +26,9 @@ define i1 @pow2_or_zero_is_negative(i8 %x) {
define i1 @pow2_or_zero_is_negative_commute(i8 %A) {
; CHECK-LABEL: @pow2_or_zero_is_negative_commute(
; CHECK-NEXT: [[X:%.*]] = mul i8 [[A:%.*]], 42
-; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[X]], -128
+; CHECK-NEXT: [[NEG:%.*]] = sub i8 0, [[X]]
+; CHECK-NEXT: [[POW2_OR_ZERO:%.*]] = and i8 [[X]], [[NEG]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[POW2_OR_ZERO]], -1
; CHECK-NEXT: ret i1 [[CMP]]
;
%x = mul i8 42, %A ; thwart complexity-based canonicalization
@@ -54,7 +56,9 @@ define <2 x i1> @pow2_or_zero_is_negative_vec(<2 x i8> %x) {
define <2 x i1> @pow2_or_zero_is_negative_vec_commute(<2 x i8> %A) {
; CHECK-LABEL: @pow2_or_zero_is_negative_vec_commute(
; CHECK-NEXT: [[X:%.*]] = mul <2 x i8> [[A:%.*]], <i8 42, i8 42>
-; CHECK-NEXT: [[CMP:%.*]] = icmp eq <2 x i8> [[X]], <i8 -128, i8 -128>
+; CHECK-NEXT: [[NEG:%.*]] = sub <2 x i8> zeroinitializer, [[X]]
+; CHECK-NEXT: [[POW2_OR_ZERO:%.*]] = and <2 x i8> [[X]], [[NEG]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt <2 x i8> [[POW2_OR_ZERO]], <i8 -1, i8 -1>
; CHECK-NEXT: ret <2 x i1> [[CMP]]
;
%x = mul <2 x i8> <i8 42, i8 42>, %A ; thwart complexity-based canonicalization
diff --git a/llvm/test/Transforms/InstCombine/icmp-mul.ll b/llvm/test/Transforms/InstCombine/icmp-mul.ll
index 7f76a94f395b6..6c6befcfe39e4 100644
--- a/llvm/test/Transforms/InstCombine/icmp-mul.ll
+++ b/llvm/test/Transforms/InstCombine/icmp-mul.ll
@@ -969,7 +969,7 @@ define i1 @mul_of_pow2_no_lz_other_op(i32 %x, i8 %y) {
; CHECK-NEXT: [[B:%.*]] = and i32 [[X:%.*]], 2
; CHECK-NEXT: [[S:%.*]] = sext i8 [[Y:%.*]] to i32
; CHECK-NEXT: [[M:%.*]] = mul nuw nsw i32 [[B]], [[S]]
-; CHECK-NEXT: [[R:%.*]] = icmp sgt i32 [[M]], 254
+; CHECK-NEXT: [[R:%.*]] = icmp sgt i32 [[M]], 255
; CHECK-NEXT: ret i1 [[R]]
;
%b = and i32 %x, 2
diff --git a/llvm/test/Transforms/InstCombine/icmp-or.ll b/llvm/test/Transforms/InstCombine/icmp-or.ll
index 922845c1e7e2d..587df66417eb0 100644
--- a/llvm/test/Transforms/InstCombine/icmp-or.ll
+++ b/llvm/test/Transforms/InstCombine/icmp-or.ll
@@ -308,7 +308,7 @@ define i1 @decrement_sgt_n1_commute_use1(i8 %px) {
; CHECK-NEXT: [[X:%.*]] = mul i8 [[PX:%.*]], 42
; CHECK-NEXT: [[DEC:%.*]] = add i8 [[X]], -1
; CHECK-NEXT: call void @use(i8 [[DEC]])
-; CHECK-NEXT: [[R:%.*]] = icmp sgt i8 [[X]], 0
+; CHECK-NEXT: [[R:%.*]] = icmp sgt i8 [[X]], 1
; CHECK-NEXT: ret i1 [[R]]
;
%x = mul i8 %px, 42 ; thwart complexity-based canonicalization
diff --git a/llvm/test/Transforms/InstCombine/icmp-shl-nsw.ll b/llvm/test/Transforms/InstCombine/icmp-shl-nsw.ll
index 6b9ea1f8ef97e..5b827c839a4e4 100644
--- a/llvm/test/Transforms/InstCombine/icmp-shl-nsw.ll
+++ b/llvm/test/Transforms/InstCombine/icmp-shl-nsw.ll
@@ -136,7 +136,7 @@ define i1 @icmp_sgt6(i8 %x) {
define i1 @icmp_sgt7(i8 %x) {
; CHECK-LABEL: @icmp_sgt7(
-; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i8 [[X:%.*]], 62
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[X:%.*]], 63
; CHECK-NEXT: ret i1 [[CMP]]
;
%shl = shl nsw i8 %x, 1
@@ -224,7 +224,7 @@ define i1 @icmp_sle1(i8 %x) {
define i1 @icmp_sle2(i8 %x) {
; CHECK-LABEL: @icmp_sle2(
-; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[X:%.*]], -63
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[X:%.*]], -64
; CHECK-NEXT: ret i1 [[CMP]]
;
%shl = shl nsw i8 %x, 1
diff --git a/llvm/test/Transforms/InstCombine/indexed-gep-compares.ll b/llvm/test/Transforms/InstCombine/indexed-gep-compares.ll
index 2b5b3fce70535..bd32a9270f224 100644
--- a/llvm/test/Transforms/InstCombine/indexed-gep-compares.ll
+++ b/llvm/test/Transforms/InstCombine/indexed-gep-compares.ll
@@ -11,7 +11,7 @@ define ptr @test1(ptr %A, i32 %Offset) {
; CHECK: bb:
; CHECK-NEXT: [[RHS_IDX:%.*]] = phi i32 [ [[RHS_ADD:%.*]], [[BB]] ], [ [[TMP_IDX]], [[ENTRY:%.*]] ]
; CHECK-NEXT: [[RHS_ADD]] = add nsw i32 [[RHS_IDX]], 4
-; CHECK-NEXT: [[COND:%.*]] = icmp sgt i32 [[RHS_IDX]], 400
+; CHECK-NEXT: [[COND:%.*]] = icmp sgt i32 [[RHS_IDX]], 403
; CHECK-NEXT: br i1 [[COND]], label [[BB2:%.*]], label [[BB]]
; CHECK: bb2:
; CHECK-NEXT: [[RHS_PTR:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i32 [[RHS_IDX]]
@@ -40,7 +40,7 @@ define ptr @test2(i32 %A, i32 %Offset) {
; CHECK: bb:
; CHECK-NEXT: [[RHS_IDX:%.*]] = phi i32 [ [[RHS_ADD:%.*]], [[BB]] ], [ [[TMP_IDX]], [[ENTRY:%.*]] ]
; CHECK-NEXT: [[RHS_ADD]] = add nsw i32 [[RHS_IDX]], 4
-; CHECK-NEXT: [[COND:%.*]] = icmp sgt i32 [[RHS_IDX]], 400
+; CHECK-NEXT: [[COND:%.*]] = icmp sgt i32 [[RHS_IDX]], 403
; CHECK-NEXT: br i1 [[COND]], label [[BB2:%.*]], label [[BB]]
; CHECK: bb2:
; CHECK-NEXT: [[A_PTR:%.*]] = inttoptr i32 [[A:%.*]] to ptr
@@ -164,7 +164,7 @@ define ptr @test4(i16 %A, i32 %Offset) {
; CHECK: bb:
; CHECK-NEXT: [[RHS_IDX:%.*]] = phi i32 [ [[RHS_ADD:%.*]], [[BB]] ], [ [[TMP_IDX]], [[ENTRY:%.*]] ]
; CHECK-NEXT: [[RHS_ADD]] = add nsw i32 [[RHS_IDX]], 4
-; CHECK-NEXT: [[COND:%.*]] = icmp sgt i32 [[RHS_IDX]], 400
+; CHECK-NEXT: [[COND:%.*]] = icmp sgt i32 [[RHS_IDX]], 403
; CHECK-NEXT: br i1 [[COND]], label [[BB2:%.*]], label [[BB]]
; CHECK: bb2:
; CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[A:%.*]] to i32
@@ -203,7 +203,7 @@ define ptr @test5(i32 %Offset) personality ptr @__gxx_personality_v0 {
; CHECK: bb:
; CHECK-NEXT: [[RHS_IDX:%.*]] = phi i32 [ [[RHS_ADD:%.*]], [[BB]] ], [ [[TMP_IDX]], [[CONT]] ]
; CHECK-NEXT: [[RHS_ADD]] = add nsw i32 [[RHS_IDX]], 4
-; CHECK-NEXT: [[COND:%.*]] = icmp sgt i32 [[RHS_IDX]], 400
+; CHECK-NEXT: [[COND:%.*]] = icmp sgt i32 [[RHS_IDX]], 403
; CHECK-NEXT: br i1 [[COND]], label [[BB2:%.*]], label [[BB]]
; CHECK: bb2:
; CHECK-NEXT: [[RHS_PTR:%.*]] = getelementptr inbounds i8, ptr [[A]], i32 [[RHS_IDX]]
@@ -248,7 +248,7 @@ define ptr @test6(i32 %Offset) personality ptr @__gxx_personality_v0 {
; CHECK: bb:
; CHECK-NEXT: [[RHS_IDX:%.*]] = phi i32 [ [[RHS_ADD:%.*]], [[BB]] ], [ [[TMP_IDX]], [[CONT]] ]
; CHECK-NEXT: [[RHS_ADD]] = add nsw i32 [[RHS_IDX]], 4
-; CHECK-NEXT: [[COND:%.*]] = icmp sgt i32 [[RHS_IDX]], 400
+; CHECK-NEXT: [[COND:%.*]] = icmp sgt i32 [[RHS_IDX]], 403
; CHECK-NEXT: br i1 [[COND]], label [[BB2:%.*]], label [[BB]]
; CHECK: bb2:
; CHECK-NEXT: [[A_PTR:%.*]] = inttoptr i32 [[A]] to ptr
diff --git a/llvm/test/Transforms/InstCombine/opaque-ptr.ll b/llvm/test/Transforms/InstCombine/opaque-ptr.ll
index 4d38e2cd37c95..f92c27cd6b07d 100644
--- a/llvm/test/Transforms/InstCombine/opaque-ptr.ll
+++ b/llvm/test/Transforms/InstCombine/opaque-ptr.ll
@@ -387,7 +387,7 @@ define ptr @indexed_compare(ptr %A, i64 %offset) {
; CHECK: bb:
; CHECK-NEXT: [[RHS_IDX:%.*]] = phi i64 [ [[RHS_ADD:%.*]], [[BB]] ], [ [[TMP_IDX]], [[ENTRY:%.*]] ]
; CHECK-NEXT: [[RHS_ADD]] = add nsw i64 [[RHS_IDX]], 4
-; CHECK-NEXT: [[COND:%.*]] = icmp sgt i64 [[RHS_IDX]], 400
+; CHECK-NEXT: [[COND:%.*]] = icmp sgt i64 [[RHS_IDX]], 403
; CHECK-NEXT: br i1 [[COND]], label [[BB2:%.*]], label [[BB]]
; CHECK: bb2:
; CHECK-NEXT: [[RHS_PTR:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[RHS_IDX]]
@@ -416,7 +416,7 @@ define ptr @indexed_compare_different_types(ptr %A, i64 %offset) {
; CHECK: bb:
; CHECK-NEXT: [[RHS_IDX:%.*]] = phi i64 [ [[RHS_ADD:%.*]], [[BB]] ], [ [[TMP_IDX]], [[ENTRY:%.*]] ]
; CHECK-NEXT: [[RHS_ADD]] = add nsw i64 [[RHS_IDX]], 4
-; CHECK-NEXT: [[COND:%.*]] = icmp sgt i64 [[RHS_IDX]], 800
+; CHECK-NEXT: [[COND:%.*]] = icmp sgt i64 [[RHS_IDX]], 803
; CHECK-NEXT: br i1 [[COND]], label [[BB2:%.*]], label [[BB]]
; CHECK: bb2:
; CHECK-NEXT: [[RHS_PTR:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[RHS_IDX]]
diff --git a/llvm/test/Transforms/InstCombine/pr17827.ll b/llvm/test/Transforms/InstCombine/pr17827.ll
index 6c6110aa073a5..d87909a283495 100644
--- a/llvm/test/Transforms/InstCombine/pr17827.ll
+++ b/llvm/test/Transforms/InstCombine/pr17827.ll
@@ -6,7 +6,7 @@ define i1 @test_shift_and_cmp_not_changed1(i8 %p) {
; CHECK-LABEL: @test_shift_and_cmp_not_changed1(
; CHECK-NEXT: [[SHLP:%.*]] = shl i8 [[P:%.*]], 5
; CHECK-NEXT: [[ANDP:%.*]] = and i8 [[SHLP]], -64
-; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[ANDP]], 32
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[ANDP]], 1
; CHECK-NEXT: ret i1 [[CMP]]
;
%shlp = shl i8 %p, 5
@@ -20,7 +20,7 @@ define i1 @test_shift_and_cmp_not_changed2(i8 %p) {
; CHECK-LABEL: @test_shift_and_cmp_not_changed2(
; CHECK-NEXT: [[SHLP:%.*]] = ashr i8 [[P:%.*]], 5
; CHECK-NEXT: [[ANDP:%.*]] = and i8 [[SHLP]], -64
-; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[ANDP]], 32
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[ANDP]], 1
; CHECK-NEXT: ret i1 [[CMP]]
;
%shlp = ashr i8 %p, 5
@@ -35,7 +35,7 @@ define i1 @test_shift_and_cmp_changed1(i8 %p, i8 %q) {
; CHECK-LABEL: @test_shift_and_cmp_changed1(
; CHECK-NEXT: [[ANDP:%.*]] = shl i8 [[P:%.*]], 5
; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[ANDP]], -64
-; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[TMP1]], 32
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[TMP1]], 1
; CHECK-NEXT: ret i1 [[CMP]]
;
%andp = and i8 %p, 6
@@ -51,7 +51,7 @@ define <2 x i1> @test_shift_and_cmp_changed1_vec(<2 x i8> %p, <2 x i8> %q) {
; CHECK-LABEL: @test_shift_and_cmp_changed1_vec(
; CHECK-NEXT: [[ANDP:%.*]] = shl <2 x i8> [[P:%.*]], <i8 5, i8 5>
; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i8> [[ANDP]], <i8 -64, i8 -64>
-; CHECK-NEXT: [[CMP:%.*]] = icmp slt <2 x i8> [[TMP1]], <i8 32, i8 32>
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt <2 x i8> [[TMP1]], <i8 1, i8 1>
; CHECK-NEXT: ret <2 x i1> [[CMP]]
;
%andp = and <2 x i8> %p, <i8 6, i8 6>
@@ -66,8 +66,8 @@ define <2 x i1> @test_shift_and_cmp_changed1_vec(<2 x i8> %p, <2 x i8> %q) {
; Unsigned compare allows a transformation to compare against 0.
define i1 @test_shift_and_cmp_changed2(i8 %p) {
; CHECK-LABEL: @test_shift_and_cmp_changed2(
-; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[P:%.*]], 6
-; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[TMP1]], 0
+; CHECK-NEXT: [[SHLP:%.*]] = shl i8 [[P:%.*]], 5
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i8 [[SHLP]], 33
; CHECK-NEXT: ret i1 [[CMP]]
;
%shlp = shl i8 %p, 5
@@ -78,8 +78,8 @@ define i1 @test_shift_and_cmp_changed2(i8 %p) {
define <2 x i1> @test_shift_and_cmp_changed2_vec(<2 x i8> %p) {
; CHECK-LABEL: @test_shift_and_cmp_changed2_vec(
-; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i8> [[P:%.*]], <i8 6, i8 6>
-; CHECK-NEXT: [[CMP:%.*]] = icmp eq <2 x i8> [[TMP1]], zeroinitializer
+; CHECK-NEXT: [[SHLP:%.*]] = shl <2 x i8> [[P:%.*]], <i8 5, i8 5>
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult <2 x i8> [[SHLP]], <i8 33, i8 33>
; CHECK-NEXT: ret <2 x i1> [[CMP]]
;
%shlp = shl <2 x i8> %p, <i8 5, i8 5>
@@ -93,7 +93,7 @@ define i1 @test_shift_and_cmp_changed3(i8 %p) {
; CHECK-LABEL: @test_shift_and_cmp_changed3(
; CHECK-NEXT: [[SHLP:%.*]] = shl nsw i8 [[P:%.*]], 5
; CHECK-NEXT: [[ANDP:%.*]] = and i8 [[SHLP]], -64
-; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[ANDP]], 32
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[ANDP]], 1
; CHECK-NEXT: ret i1 [[CMP]]
;
%shlp = shl nsw i8 %p, 5
diff --git a/llvm/test/Transforms/InstCombine/pr27343.ll b/llvm/test/Transforms/InstCombine/pr27343.ll
index e67d0b34056bf..f16affde2ce41 100644
--- a/llvm/test/Transforms/InstCombine/pr27343.ll
+++ b/llvm/test/Transforms/InstCombine/pr27343.ll
@@ -6,7 +6,7 @@ define i32 @__isnan(float %x) alwaysinline nounwind optsize {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[DOTCAST:%.*]] = bitcast float [[X:%.*]] to i32
; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[DOTCAST]], 1
-; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i32 [[SHL]], -16777216
+; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i32 [[SHL]], -16777215
; CHECK-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
; CHECK-NEXT: ret i32 [[CONV]]
;
@@ -24,7 +24,7 @@ entry:
define i1 @icmp_shl7(i32 %x) {
; CHECK-LABEL: @icmp_shl7(
; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[X:%.*]], 7
-; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[SHL]], 4608
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[SHL]], 4481
; CHECK-NEXT: ret i1 [[CMP]]
;
%shl = shl i32 %x, 7
diff --git a/llvm/test/Transforms/InstCombine/select.ll b/llvm/test/Transforms/InstCombine/select.ll
index c5f1b77c6d740..65b2c978c36c0 100644
--- a/llvm/test/Transforms/InstCombine/select.ll
+++ b/llvm/test/Transforms/InstCombine/select.ll
@@ -3423,7 +3423,7 @@ define <vscale x 2 x i32> @scalable_sign_bits(<vscale x 2 x i8> %x) {
define <vscale x 2 x i1> @scalable_non_zero(<vscale x 2 x i32> %x) {
; CHECK-LABEL: @scalable_non_zero(
; CHECK-NEXT: [[A:%.*]] = or <vscale x 2 x i32> [[X:%.*]], shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> poison, i32 1, i32 0), <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT: [[CMP:%.*]] = icmp ule <vscale x 2 x i32> [[A]], shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> poison, i32 56, i64 0), <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT: [[CMP:%.*]] = icmp ule <vscale x 2 x i32> [[A]], shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> poison, i32 55, i64 0), <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer)
; CHECK-NEXT: ret <vscale x 2 x i1> [[CMP]]
;
%a = or <vscale x 2 x i32> %x, shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> poison, i32 1, i32 0), <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer)
diff --git a/llvm/test/Transforms/InstCombine/shift.ll b/llvm/test/Transforms/InstCombine/shift.ll
index d783adbe93863..2c2d6c921e55d 100644
--- a/llvm/test/Transforms/InstCombine/shift.ll
+++ b/llvm/test/Transforms/InstCombine/shift.ll
@@ -515,8 +515,8 @@ define i32 @test32(i32 %A, i32 %B, i32 %C) {
define i1 @test33(i32 %X) {
; CHECK-LABEL: @test33(
-; CHECK-NEXT: [[I1_MASK:%.*]] = and i32 [[X:%.*]], 16777216
-; CHECK-NEXT: [[I2:%.*]] = icmp ne i32 [[I1_MASK]], 0
+; CHECK-NEXT: [[I1:%.*]] = shl i32 [[X:%.*]], 7
+; CHECK-NEXT: [[I2:%.*]] = icmp slt i32 [[I1]], -127
; CHECK-NEXT: ret i1 [[I2]]
;
%i1 = shl i32 %X, 7
@@ -526,8 +526,8 @@ define i1 @test33(i32 %X) {
define <2 x i1> @test33vec(<2 x i32> %X) {
; CHECK-LABEL: @test33vec(
-; CHECK-NEXT: [[I1_MASK:%.*]] = and <2 x i32> [[X:%.*]], <i32 16777216, i32 16777216>
-; CHECK-NEXT: [[I2:%.*]] = icmp ne <2 x i32> [[I1_MASK]], zeroinitializer
+; CHECK-NEXT: [[I1:%.*]] = shl <2 x i32> [[X:%.*]], <i32 7, i32 7>
+; CHECK-NEXT: [[I2:%.*]] = icmp slt <2 x i32> [[I1]], <i32 -127, i32 -127>
; CHECK-NEXT: ret <2 x i1> [[I2]]
;
%i1 = shl <2 x i32> %X, <i32 7, i32 7>
@@ -658,8 +658,8 @@ define i8 @test39(i32 %a0) {
; CHECK-NEXT: [[I51:%.*]] = xor i8 [[I50]], [[I5]]
; CHECK-NEXT: [[TMP0:%.*]] = lshr exact i8 [[I5]], 3
; CHECK-NEXT: [[I54:%.*]] = and i8 [[TMP0]], 16
-; CHECK-NEXT: [[I551:%.*]] = or disjoint i8 [[I54]], [[I51]]
-; CHECK-NEXT: ret i8 [[I551]]
+; CHECK-NEXT: [[I55:%.*]] = or disjoint i8 [[I54]], [[I51]]
+; CHECK-NEXT: ret i8 [[I55]]
;
entry:
%i4 = trunc i32 %a0 to i8
diff --git a/llvm/test/Transforms/InstCombine/shl-unsigned-cmp-const.ll b/llvm/test/Transforms/InstCombine/shl-unsigned-cmp-const.ll
index 25b26770c366d..9e1473a621d27 100644
--- a/llvm/test/Transforms/InstCombine/shl-unsigned-cmp-const.ll
+++ b/llvm/test/Transforms/InstCombine/shl-unsigned-cmp-const.ll
@@ -9,8 +9,8 @@
; C2 Shift amount smaller than C1 trailing zeros.
define i1 @scalar_i8_shl_ult_const_1(i8 %x) {
; CHECK-LABEL: @scalar_i8_shl_ult_const_1(
-; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[X:%.*]], 6
-; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[TMP1]], 0
+; CHECK-NEXT: [[SHL:%.*]] = shl i8 [[X:%.*]], 5
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i8 [[SHL]], 33
; CHECK-NEXT: ret i1 [[CMP]]
;
%shl = shl i8 %x, 5
@@ -45,8 +45,8 @@ define i1 @scalar_i8_shl_ult_const_3(i8 %x) {
; C2 Shift amount smaller than C1 trailing zeros.
define i1 @scalar_i16_shl_ult_const(i16 %x) {
; CHECK-LABEL: @scalar_i16_shl_ult_const(
-; CHECK-NEXT: [[TMP1:%.*]] = and i16 [[X:%.*]], 252
-; CHECK-NEXT: [[CMP:%.*]] = icmp eq i16 [[TMP1]], 0
+; CHECK-NEXT: [[SHL:%.*]] = shl i16 [[X:%.*]], 8
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i16 [[SHL]], 769
; CHECK-NEXT: ret i1 [[CMP]]
;
%shl = shl i16 %x, 8
@@ -56,8 +56,8 @@ define i1 @scalar_i16_shl_ult_const(i16 %x) {
define i1 @scalar_i32_shl_ult_const(i32 %x) {
; CHECK-LABEL: @scalar_i32_shl_ult_const(
-; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[X:%.*]], 2097088
-; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP1]], 0
+; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[X:%.*]], 11
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[SHL]], 129025
; CHECK-NEXT: ret i1 [[CMP]]
;
%shl = shl i32 %x, 11
@@ -67,8 +67,8 @@ define i1 @scalar_i32_shl_ult_const(i32 %x) {
define i1 @scalar_i64_shl_ult_const(i64 %x) {
; CHECK-LABEL: @scalar_i64_shl_ult_const(
-; CHECK-NEXT: [[TMP1:%.*]] = and i64 [[X:%.*]], 549755813632
-; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[TMP1]], 0
+; CHECK-NEXT: [[SHL:%.*]] = shl i64 [[X:%.*]], 25
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[SHL]], 8556380161
; CHECK-NEXT: ret i1 [[CMP]]
;
%shl = shl i64 %x, 25
@@ -91,8 +91,8 @@ define i1 @scalar_i8_shl_uge_const(i8 %x) {
; Check 'ule' predicate
define i1 @scalar_i8_shl_ule_const(i8 %x) {
; CHECK-LABEL: @scalar_i8_shl_ule_const(
-; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[X:%.*]], 6
-; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[TMP1]], 0
+; CHECK-NEXT: [[SHL:%.*]] = shl i8 [[X:%.*]], 5
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i8 [[SHL]], 33
; CHECK-NEXT: ret i1 [[CMP]]
;
%shl = shl i8 %x, 5
@@ -116,8 +116,8 @@ define i1 @scalar_i8_shl_ugt_const(i8 %x) {
define <4 x i1> @vector_4xi32_shl_ult_const(<4 x i32> %x) {
; CHECK-LABEL: @vector_4xi32_shl_ult_const(
-; CHECK-NEXT: [[TMP1:%.*]] = and <4 x i32> [[X:%.*]], <i32 2097088, i32 2097088, i32 2097088, i32 2097088>
-; CHECK-NEXT: [[CMP:%.*]] = icmp eq <4 x i32> [[TMP1]], zeroinitializer
+; CHECK-NEXT: [[SHL:%.*]] = shl <4 x i32> [[X:%.*]], <i32 11, i32 11, i32 11, i32 11>
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult <4 x i32> [[SHL]], <i32 129025, i32 129025, i32 129025, i32 129025>
; CHECK-NEXT: ret <4 x i1> [[CMP]]
;
%shl = shl <4 x i32> %x, <i32 11, i32 11, i32 11, i32 11>
@@ -173,8 +173,8 @@ define <4 x i1> @vector_4xi32_shl_uge_const(<4 x i32> %x) {
; Check 'ule' predicate
define <4 x i1> @vector_4xi32_shl_ule_const(<4 x i32> %x) {
; CHECK-LABEL: @vector_4xi32_shl_ule_const(
-; CHECK-NEXT: [[TMP1:%.*]] = and <4 x i32> [[X:%.*]], <i32 2097088, i32 2097088, i32 2097088, i32 2097088>
-; CHECK-NEXT: [[CMP:%.*]] = icmp eq <4 x i32> [[TMP1]], zeroinitializer
+; CHECK-NEXT: [[SHL:%.*]] = shl <4 x i32> [[X:%.*]], <i32 11, i32 11, i32 11, i32 11>
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult <4 x i32> [[SHL]], <i32 129025, i32 129025, i32 129025, i32 129025>
; CHECK-NEXT: ret <4 x i1> [[CMP]]
;
%shl = shl <4 x i32> %x, <i32 11, i32 11, i32 11, i32 11>
@@ -201,7 +201,7 @@ define i1 @scalar_i8_shl_ult_const_extra_use_shl(i8 %x, ptr %p) {
; CHECK-LABEL: @scalar_i8_shl_ult_const_extra_use_shl(
; CHECK-NEXT: [[SHL:%.*]] = shl i8 [[X:%.*]], 5
; CHECK-NEXT: store i8 [[SHL]], ptr [[P:%.*]], align 1
-; CHECK-NEXT: [[CMP:%.*]] = icmp ult i8 [[SHL]], 64
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i8 [[SHL]], 33
; CHECK-NEXT: ret i1 [[CMP]]
;
%shl = shl i8 %x, 5
@@ -216,7 +216,7 @@ define i1 @scalar_i8_shl_ult_const_extra_use_shl(i8 %x, ptr %p) {
define i1 @scalar_i8_shl_slt_const(i8 %x) {
; CHECK-LABEL: @scalar_i8_shl_slt_const(
; CHECK-NEXT: [[SHL:%.*]] = shl i8 [[X:%.*]], 5
-; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[SHL]], 64
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[SHL]], 33
; CHECK-NEXT: ret i1 [[CMP]]
;
%shl = shl i8 %x, 5
@@ -227,7 +227,7 @@ define i1 @scalar_i8_shl_slt_const(i8 %x) {
define i1 @scalar_i8_shl_ugt_const_not_power_of_2(i8 %x) {
; CHECK-LABEL: @scalar_i8_shl_ugt_const_not_power_of_2(
; CHECK-NEXT: [[SHL:%.*]] = shl i8 [[X:%.*]], 5
-; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i8 [[SHL]], 66
+; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i8 [[SHL]], 95
; CHECK-NEXT: ret i1 [[CMP]]
;
%shl = shl i8 %x, 5
From 91b7eb1269a04bfed17e7b8081e9dcf847f62fef Mon Sep 17 00:00:00 2001
From: Mikhail Gudim <mgudim at gmail.com>
Date: Thu, 25 Jan 2024 03:17:37 -0500
Subject: [PATCH 3/7] Put the transformation in a separate function. Added a
 check to prevent breaking the "sign bit check" pattern.
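Background for the new bail-out (illustrative, not part of the patch): compares
such as `icmp slt i8 %x, 0` are canonical sign-bit tests that later folds
match, e.g. the `(x & -x) <s 0 -> x == INT_MIN` fold exercised by
fold-signbit-test-power2.ll, which the first version of the patch regressed.
The sketch below uses a hypothetical helper name to show the kind of compare
that must be left untouched; the patch itself relies on
InstCombiner::isSignBitCheck.

```cpp
// Rough sketch (not the LLVM helper): a compare is a "sign bit check" when it
// only asks whether the sign bit of the LHS is set or clear. Such compares
// are left alone so folds like (x & -x) <s 0 -> x == INT_MIN keep applying.
#include "llvm/ADT/APInt.h"
#include "llvm/IR/InstrTypes.h"
#include <cassert>
using namespace llvm;

static bool looksLikeSignBitCheck(CmpInst::Predicate Pred, const APInt &RHS) {
  switch (Pred) {
  case CmpInst::ICMP_SLT: return RHS.isZero();           // x <s 0
  case CmpInst::ICMP_SGT: return RHS.isAllOnes();        // x >s -1
  case CmpInst::ICMP_UGT: return RHS.isMaxSignedValue(); // x >u SMAX
  case CmpInst::ICMP_ULT: return RHS.isMinSignedValue(); // x <u SMIN
  default: return false;
  }
}

int main() {
  assert(looksLikeSignBitCheck(CmpInst::ICMP_SLT, APInt(8, 0)));
  assert(!looksLikeSignBitCheck(CmpInst::ICMP_SLT, APInt::getAllOnes(8)));
  return 0;
}
```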
---
.../InstCombine/InstCombineCompares.cpp | 166 ++++++++++--------
.../InstCombine/fold-signbit-test-power2.ll | 8 +-
2 files changed, 97 insertions(+), 77 deletions(-)
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
index c81e229e1c59b..8fbc3277a3fa9 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -6100,6 +6100,99 @@ bool InstCombinerImpl::replacedSelectWithOperand(SelectInst *SI,
return false;
}
+// Try to "strengthen" the RHS of the compare based on known bits.
+// For example, replace `icmp ugt %x, 14` with `icmp ugt %x, 15` when
+// it is known that the two least significant bits of `%x` are zero.
+static Instruction* strengthenICmpUsingKnownBits(
+ ICmpInst &I,
+ KnownBits Op0Known,
+ KnownBits Op1Known,
+ unsigned BitWidth
+) {
+ if (!BitWidth)
+ return nullptr;
+ if (!(Op1Known.isConstant() && Op0Known.Zero.isMask()))
+ return nullptr;
+
+ Value *Op0 = I.getOperand(0);
+ ICmpInst::Predicate Pred = I.getPredicate();
+ Type *Ty = Op0->getType();
+ APInt RHSConst = Op1Known.getConstant();
+ bool TrueIfSigned = false;
+ // Don't break the SignBitCheck pattern.
+ if (InstCombiner::isSignBitCheck(Pred, RHSConst, TrueIfSigned))
+ return nullptr;
+
+ ConstantRange Op0PredRange =
+ ConstantRange::makeExactICmpRegion(Pred, RHSConst);
+ int KnownZeroMaskLength = BitWidth - Op0Known.Zero.countLeadingZeros();
+ if (KnownZeroMaskLength == 0)
+ return nullptr;
+
+ APInt PowOf2(BitWidth, 1 << KnownZeroMaskLength);
+ APInt Op0MinAccordingToPred(BitWidth, 0);
+ APInt Op0MaxAccordingToPred(BitWidth, 0);
+ APInt Op0MinRefinedByKnownBits(BitWidth, 0);
+ APInt Op0MaxRefinedByKnownBits(BitWidth, 0);
+ APInt NewLower(BitWidth, 0);
+ APInt NewUpper(BitWidth, 0);
+ bool ImprovedLower = false;
+ bool ImprovedUpper = false;
+ if (I.isSigned()) {
+ Op0MinAccordingToPred = Op0PredRange.getSignedMin();
+ Op0MaxAccordingToPred = Op0PredRange.getSignedMax();
+ // Compute the smallest number satisfying the known-bits constraint
+ // which is greater than or equal to Op0MinAccordingToPred.
+ Op0MinRefinedByKnownBits =
+ PowOf2 *
+ APIntOps::RoundingSDiv(Op0MinAccordingToPred, PowOf2, APInt::Rounding::UP);
+ // Compute the largest number satisfying the known-bits constraint
+ // which is less than or equal to Op0MaxAccordingToPred.
+ Op0MaxRefinedByKnownBits =
+ PowOf2 *
+ APIntOps::RoundingSDiv(Op0MaxAccordingToPred, PowOf2, APInt::Rounding::DOWN);
+ NewLower = APIntOps::smax(Op0MinRefinedByKnownBits, Op0MinAccordingToPred);
+ NewUpper = APIntOps::smin(Op0MaxRefinedByKnownBits, Op0MaxAccordingToPred);
+ ImprovedLower = NewLower.sgt(Op0MinAccordingToPred);
+ ImprovedUpper = NewUpper.slt(Op0MaxAccordingToPred);
+ } else {
+ Op0MinAccordingToPred = Op0PredRange.getUnsignedMin();
+ Op0MaxAccordingToPred = Op0PredRange.getUnsignedMax();
+ Op0MinRefinedByKnownBits =
+ PowOf2 *
+ APIntOps::RoundingUDiv(Op0MinAccordingToPred, PowOf2, APInt::Rounding::UP);
+ Op0MaxRefinedByKnownBits =
+ PowOf2 *
+ APIntOps::RoundingUDiv(Op0MaxAccordingToPred, PowOf2, APInt::Rounding::DOWN);
+ NewLower = APIntOps::umax(Op0MinRefinedByKnownBits, Op0MinAccordingToPred);
+ NewUpper = APIntOps::umin(Op0MaxRefinedByKnownBits, Op0MaxAccordingToPred);
+ ImprovedLower = NewLower.ugt(Op0MinAccordingToPred);
+ ImprovedUpper = NewUpper.ult(Op0MaxAccordingToPred);
+ }
+
+ // Non-strict inequalities should have been canonicalized to strict ones
+ // by now.
+ switch (Pred) {
+ default:
+ break;
+ case ICmpInst::ICMP_ULT:
+ case ICmpInst::ICMP_SLT: {
+ if (ImprovedUpper)
+ return new ICmpInst(Pred, Op0,
+ ConstantInt::get(Ty, NewUpper + 1));
+ break;
+ }
+ case ICmpInst::ICMP_UGT:
+ case ICmpInst::ICMP_SGT: {
+ if (ImprovedLower)
+ return new ICmpInst(Pred, Op0,
+ ConstantInt::get(Ty, NewLower - 1));
+ break;
+ }
+ }
+ return nullptr;
+}
+
/// Try to fold the comparison based on range information we can get by checking
/// whether bits are known to be zero or one in the inputs.
Instruction *InstCombinerImpl::foldICmpUsingKnownBits(ICmpInst &I) {
@@ -6357,78 +6450,9 @@ Instruction *InstCombinerImpl::foldICmpUsingKnownBits(ICmpInst &I) {
(Op0Known.One.isNegative() && Op1Known.One.isNegative())))
return new ICmpInst(I.getUnsignedPredicate(), Op0, Op1);
- // Try to "strengthen" the RHS of the compare based on known bits.
- // For example, replace `icmp ugt %x, 14` with `icmp ugt %x, 15` when
- // it is known that the two least significant bits of `%x` are zero.
- if (Op1Known.isConstant() && Op0Known.Zero.isMask()) {
- APInt RHSConst = Op1Known.getConstant();
- ConstantRange Op0PredRange =
- ConstantRange::makeExactICmpRegion(Pred, RHSConst);
- int KnownZeroMaskLength = BitWidth - Op0Known.Zero.countLeadingZeros();
- if (KnownZeroMaskLength > 0) {
- APInt PowOf2(BitWidth, 1 << KnownZeroMaskLength);
- APInt Op0PredMin(BitWidth, 0);
- APInt Op0PredMax(BitWidth, 0);
- APInt Op0MinRefinedByKnownBits(BitWidth, 0);
- APInt Op0MaxRefinedByKnownBits(BitWidth, 0);
- APInt NewLower(BitWidth, 0);
- APInt NewUpper(BitWidth, 0);
- bool ImprovedLower = false;
- bool ImprovedUpper = false;
- if (I.isSigned()) {
- Op0PredMin = Op0PredRange.getSignedMin();
- Op0PredMax = Op0PredRange.getSignedMax();
- // Compute the smallest number satisfying the known-bits constraint
- // which is greater than or equal to Op0PredMin.
- Op0MinRefinedByKnownBits =
- PowOf2 *
- APIntOps::RoundingSDiv(Op0PredMin, PowOf2, APInt::Rounding::UP);
- // Compute the largest number satisfying the known-bits constraint
- // which is less than or equal to Op0PredMax.
- Op0MaxRefinedByKnownBits =
- PowOf2 *
- APIntOps::RoundingSDiv(Op0PredMax, PowOf2, APInt::Rounding::DOWN);
- NewLower = APIntOps::smax(Op0MinRefinedByKnownBits, Op0PredMin);
- NewUpper = APIntOps::smin(Op0MaxRefinedByKnownBits, Op0PredMax);
- ImprovedLower = NewLower.sgt(Op0PredMin);
- ImprovedUpper = NewUpper.slt(Op0PredMax);
- } else {
- Op0PredMin = Op0PredRange.getUnsignedMin();
- Op0PredMax = Op0PredRange.getUnsignedMax();
- Op0MinRefinedByKnownBits =
- PowOf2 *
- APIntOps::RoundingUDiv(Op0PredMin, PowOf2, APInt::Rounding::UP);
- Op0MaxRefinedByKnownBits =
- PowOf2 *
- APIntOps::RoundingUDiv(Op0PredMax, PowOf2, APInt::Rounding::DOWN);
- NewLower = APIntOps::umax(Op0MinRefinedByKnownBits, Op0PredMin);
- NewUpper = APIntOps::umin(Op0MaxRefinedByKnownBits, Op0PredMax);
- ImprovedLower = NewLower.ugt(Op0PredMin);
- ImprovedUpper = NewUpper.ult(Op0PredMax);
- }
+ if (Instruction *Res = strengthenICmpUsingKnownBits(I, Op0Known, Op1Known, BitWidth))
+ return Res;
- // Non-strict inequalities should have been canonicalized to strict ones
- // by now.
- switch (Pred) {
- default:
- break;
- case ICmpInst::ICMP_ULT:
- case ICmpInst::ICMP_SLT: {
- if (ImprovedUpper)
- return new ICmpInst(Pred, Op0,
- ConstantInt::get(Op1->getType(), NewUpper + 1));
- break;
- }
- case ICmpInst::ICMP_UGT:
- case ICmpInst::ICMP_SGT: {
- if (ImprovedLower)
- return new ICmpInst(Pred, Op0,
- ConstantInt::get(Op1->getType(), NewLower - 1));
- break;
- }
- }
- }
- }
return nullptr;
}
diff --git a/llvm/test/Transforms/InstCombine/fold-signbit-test-power2.ll b/llvm/test/Transforms/InstCombine/fold-signbit-test-power2.ll
index fc176f00486c9..f5024664f58c3 100644
--- a/llvm/test/Transforms/InstCombine/fold-signbit-test-power2.ll
+++ b/llvm/test/Transforms/InstCombine/fold-signbit-test-power2.ll
@@ -26,9 +26,7 @@ define i1 @pow2_or_zero_is_negative(i8 %x) {
define i1 @pow2_or_zero_is_negative_commute(i8 %A) {
; CHECK-LABEL: @pow2_or_zero_is_negative_commute(
; CHECK-NEXT: [[X:%.*]] = mul i8 [[A:%.*]], 42
-; CHECK-NEXT: [[NEG:%.*]] = sub i8 0, [[X]]
-; CHECK-NEXT: [[POW2_OR_ZERO:%.*]] = and i8 [[X]], [[NEG]]
-; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[POW2_OR_ZERO]], -1
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[X]], -128
; CHECK-NEXT: ret i1 [[CMP]]
;
%x = mul i8 42, %A ; thwart complexity-based canonicalization
@@ -56,9 +54,7 @@ define <2 x i1> @pow2_or_zero_is_negative_vec(<2 x i8> %x) {
define <2 x i1> @pow2_or_zero_is_negative_vec_commute(<2 x i8> %A) {
; CHECK-LABEL: @pow2_or_zero_is_negative_vec_commute(
; CHECK-NEXT: [[X:%.*]] = mul <2 x i8> [[A:%.*]], <i8 42, i8 42>
-; CHECK-NEXT: [[NEG:%.*]] = sub <2 x i8> zeroinitializer, [[X]]
-; CHECK-NEXT: [[POW2_OR_ZERO:%.*]] = and <2 x i8> [[X]], [[NEG]]
-; CHECK-NEXT: [[CMP:%.*]] = icmp slt <2 x i8> [[POW2_OR_ZERO]], <i8 -1, i8 -1>
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq <2 x i8> [[X]], <i8 -128, i8 -128>
; CHECK-NEXT: ret <2 x i1> [[CMP]]
;
%x = mul <2 x i8> <i8 42, i8 42>, %A ; thwart complexity-based canonicalization
From be9c0c715550793b4cc57925377763dd0a661a26 Mon Sep 17 00:00:00 2001
From: Mikhail Gudim <mgudim at gmail.com>
Date: Thu, 25 Jan 2024 03:23:54 -0500
Subject: [PATCH 4/7] Updated
test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll
---
.../sve-interleaved-masked-accesses.ll | 280 +++++++++---------
1 file changed, 143 insertions(+), 137 deletions(-)
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll
index 3ba91360850e7..65ee0734ebafc 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll
@@ -28,45 +28,45 @@ define dso_local void @masked_strided1(ptr noalias nocapture readonly %p, ptr no
; SCALAR_TAIL_FOLDING-NEXT: [[CONV:%.*]] = zext i8 [[GUARD]] to i32
; SCALAR_TAIL_FOLDING-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
; SCALAR_TAIL_FOLDING-NEXT: [[TMP1:%.*]] = shl i32 [[TMP0]], 4
-; SCALAR_TAIL_FOLDING-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i32 [[TMP1]], 1024
+; SCALAR_TAIL_FOLDING-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i32 [[TMP1]], 1039
; SCALAR_TAIL_FOLDING-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; SCALAR_TAIL_FOLDING: vector.ph:
; SCALAR_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
; SCALAR_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = shl i32 [[TMP2]], 4
; SCALAR_TAIL_FOLDING-NEXT: [[N_MOD_VF:%.*]] = urem i32 1024, [[TMP3]]
; SCALAR_TAIL_FOLDING-NEXT: [[N_VEC:%.*]] = sub nuw nsw i32 1024, [[N_MOD_VF]]
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP19:%.*]] = call i32 @llvm.vscale.i32()
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP20:%.*]] = shl i32 [[TMP19]], 4
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = call i32 @llvm.vscale.i32()
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = shl i32 [[TMP5]], 4
-; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP6]], i64 0
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call i32 @llvm.vscale.i32()
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = shl i32 [[TMP4]], 4
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = call i32 @llvm.vscale.i32()
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = shl i32 [[TMP7]], 4
+; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP8]], i64 0
; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[DOTSPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0
; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
; SCALAR_TAIL_FOLDING-NEXT: br label [[VECTOR_BODY:%.*]]
; SCALAR_TAIL_FOLDING: vector.body:
; SCALAR_TAIL_FOLDING-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP4]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = shl i32 [[INDEX]], 1
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP9]]
-; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 32 x i1> @llvm.experimental.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
-; SCALAR_TAIL_FOLDING-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.masked.load.nxv32i8.p0(ptr [[TMP10]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK]], <vscale x 32 x i8> poison)
+; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP6]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = shl i32 [[INDEX]], 1
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = sext i32 [[TMP10]] to i64
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP11]]
+; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 32 x i1> @llvm.experimental.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP9]], <vscale x 16 x i1> [[TMP9]])
+; SCALAR_TAIL_FOLDING-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.masked.load.nxv32i8.p0(ptr [[TMP12]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK]], <vscale x 32 x i8> poison)
; SCALAR_TAIL_FOLDING-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.experimental.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[WIDE_MASKED_VEC]])
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 0
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 1
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = or disjoint i32 [[TMP8]], 1
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[TMP11]], <vscale x 16 x i8> [[TMP12]])
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP14]]
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = sext i32 [[TMP13]] to i64
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP16]]
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP17]], i64 -1
-; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.experimental.vector.interleave2.nxv32i8(<vscale x 16 x i8> [[TMP14]], <vscale x 16 x i8> [[TMP15]])
-; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK1:%.*]] = call <vscale x 32 x i1> @llvm.experimental.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
-; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv32i8.p0(<vscale x 32 x i8> [[INTERLEAVED_VEC]], ptr [[TMP18]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK1]])
-; SCALAR_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP20]]
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 0
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 1
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = or disjoint i32 [[TMP10]], 1
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[TMP13]], <vscale x 16 x i8> [[TMP14]])
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP16]]
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = sext i32 [[TMP15]] to i64
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP18]]
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[TMP19]], i64 -1
+; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.experimental.vector.interleave2.nxv32i8(<vscale x 16 x i8> [[TMP16]], <vscale x 16 x i8> [[TMP17]])
+; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK1:%.*]] = call <vscale x 32 x i1> @llvm.experimental.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP9]], <vscale x 16 x i1> [[TMP9]])
+; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv32i8.p0(<vscale x 32 x i8> [[INTERLEAVED_VEC]], ptr [[TMP20]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK1]])
+; SCALAR_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP5]]
; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[DOTSPLAT]]
; SCALAR_TAIL_FOLDING-NEXT: [[TMP21:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; SCALAR_TAIL_FOLDING-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -111,16 +111,18 @@ define dso_local void @masked_strided1(ptr noalias nocapture readonly %p, ptr no
; PREDICATED_TAIL_FOLDING-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; PREDICATED_TAIL_FOLDING: vector.ph:
; PREDICATED_TAIL_FOLDING-NEXT: [[CONV:%.*]] = zext i8 [[GUARD]] to i32
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP19:%.*]] = call i32 @llvm.vscale.i32()
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP20:%.*]] = shl i32 [[TMP19]], 4
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP1:%.*]] = shl i32 [[TMP0]], 4
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 @llvm.usub.sat.i32(i32 1024, i32 [[TMP1]])
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = shl i32 [[TMP2]], 4
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = sub i32 1024, [[TMP3]]
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP3]], 1009
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i32 [[TMP4]], i32 0
; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 0, i32 1024)
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call i32 @llvm.vscale.i32()
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = shl i32 [[TMP4]], 4
-; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP5]], i64 0
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = call i32 @llvm.vscale.i32()
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = shl i32 [[TMP8]], 4
+; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP9]], i64 0
; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[DOTSPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0
; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
@@ -128,31 +130,31 @@ define dso_local void @masked_strided1(ptr noalias nocapture readonly %p, ptr no
; PREDICATED_TAIL_FOLDING: vector.body:
; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP3]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i1> zeroinitializer
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = shl i32 [[INDEX]], 1
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = sext i32 [[TMP7]] to i64
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP8]]
-; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 32 x i1> @llvm.experimental.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP10]], <vscale x 16 x i1> [[TMP10]])
-; PREDICATED_TAIL_FOLDING-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.masked.load.nxv32i8.p0(ptr [[TMP9]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK]], <vscale x 32 x i8> poison)
+; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP7]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = shl i32 [[INDEX]], 1
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = sext i32 [[TMP11]] to i64
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP12]]
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP10]], <vscale x 16 x i1> zeroinitializer
+; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 32 x i1> @llvm.experimental.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP14]], <vscale x 16 x i1> [[TMP14]])
+; PREDICATED_TAIL_FOLDING-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.masked.load.nxv32i8.p0(ptr [[TMP13]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK]], <vscale x 32 x i8> poison)
; PREDICATED_TAIL_FOLDING-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.experimental.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[WIDE_MASKED_VEC]])
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 0
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 1
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = or disjoint i32 [[TMP7]], 1
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[TMP11]], <vscale x 16 x i8> [[TMP12]])
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP14]]
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = sext i32 [[TMP13]] to i64
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP16]]
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP17]], i64 -1
-; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.experimental.vector.interleave2.nxv32i8(<vscale x 16 x i8> [[TMP14]], <vscale x 16 x i8> [[TMP15]])
-; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK1:%.*]] = call <vscale x 32 x i1> @llvm.experimental.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP10]], <vscale x 16 x i1> [[TMP10]])
-; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv32i8.p0(<vscale x 32 x i8> [[INTERLEAVED_VEC]], ptr [[TMP18]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK1]])
-; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP20]]
-; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP2]])
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 0
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 1
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = or disjoint i32 [[TMP11]], 1
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[TMP15]], <vscale x 16 x i8> [[TMP16]])
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP19:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP18]]
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP20:%.*]] = sext i32 [[TMP17]] to i64
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP20]]
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[TMP21]], i64 -1
+; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.experimental.vector.interleave2.nxv32i8(<vscale x 16 x i8> [[TMP18]], <vscale x 16 x i8> [[TMP19]])
+; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK1:%.*]] = call <vscale x 32 x i1> @llvm.experimental.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP14]], <vscale x 16 x i1> [[TMP14]])
+; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv32i8.p0(<vscale x 32 x i8> [[INTERLEAVED_VEC]], ptr [[TMP22]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK1]])
+; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]]
+; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP6]])
; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[DOTSPLAT]]
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP21:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
-; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP21]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP0:![0-9]+]]
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP23:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP23]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP0:![0-9]+]]
; PREDICATED_TAIL_FOLDING: middle.block:
; PREDICATED_TAIL_FOLDING-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; PREDICATED_TAIL_FOLDING: scalar.ph:
@@ -220,36 +222,36 @@ define dso_local void @masked_strided2(ptr noalias nocapture readnone %p, ptr no
; SCALAR_TAIL_FOLDING-NEXT: [[CONV:%.*]] = zext i8 [[GUARD]] to i32
; SCALAR_TAIL_FOLDING-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
; SCALAR_TAIL_FOLDING-NEXT: [[TMP1:%.*]] = shl i32 [[TMP0]], 4
-; SCALAR_TAIL_FOLDING-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i32 [[TMP1]], 1024
+; SCALAR_TAIL_FOLDING-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i32 [[TMP1]], 1039
; SCALAR_TAIL_FOLDING-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; SCALAR_TAIL_FOLDING: vector.ph:
; SCALAR_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
; SCALAR_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = shl i32 [[TMP2]], 4
; SCALAR_TAIL_FOLDING-NEXT: [[N_MOD_VF:%.*]] = urem i32 1024, [[TMP3]]
; SCALAR_TAIL_FOLDING-NEXT: [[N_VEC:%.*]] = sub nuw nsw i32 1024, [[N_MOD_VF]]
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = shl i32 [[TMP14]], 4
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = call i32 @llvm.vscale.i32()
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = shl i32 [[TMP5]], 4
-; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP6]], i64 0
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call i32 @llvm.vscale.i32()
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = shl i32 [[TMP4]], 4
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = call i32 @llvm.vscale.i32()
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = shl i32 [[TMP7]], 4
+; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP8]], i64 0
; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[DOTSPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0
; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
; SCALAR_TAIL_FOLDING-NEXT: br label [[VECTOR_BODY:%.*]]
; SCALAR_TAIL_FOLDING: vector.body:
; SCALAR_TAIL_FOLDING-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP4]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = shl nuw nsw <vscale x 16 x i32> [[VEC_IND]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 1, i64 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64>
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP8]]
-; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 1, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x ptr> [[TMP9]], i32 1, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer))
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = or disjoint <vscale x 16 x i32> [[TMP7]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 1, i64 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = zext nneg <vscale x 16 x i32> [[TMP11]] to <vscale x 16 x i64>
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP12]]
-; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 2, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x ptr> [[TMP13]], i32 1, <vscale x 16 x i1> [[TMP10]])
-; SCALAR_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP15]]
+; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP6]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = shl nuw nsw <vscale x 16 x i32> [[VEC_IND]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 1, i64 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = zext nneg <vscale x 16 x i32> [[TMP9]] to <vscale x 16 x i64>
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP10]]
+; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 1, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x ptr> [[TMP11]], i32 1, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer))
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = or disjoint <vscale x 16 x i32> [[TMP9]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 1, i64 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = zext nneg <vscale x 16 x i32> [[TMP13]] to <vscale x 16 x i64>
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP14]]
+; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 2, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x ptr> [[TMP15]], i32 1, <vscale x 16 x i1> [[TMP12]])
+; SCALAR_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP5]]
; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[DOTSPLAT]]
; SCALAR_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; SCALAR_TAIL_FOLDING-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
@@ -286,16 +288,18 @@ define dso_local void @masked_strided2(ptr noalias nocapture readnone %p, ptr no
; PREDICATED_TAIL_FOLDING-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; PREDICATED_TAIL_FOLDING: vector.ph:
; PREDICATED_TAIL_FOLDING-NEXT: [[CONV:%.*]] = zext i8 [[GUARD]] to i32
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = shl i32 [[TMP14]], 4
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP1:%.*]] = shl i32 [[TMP0]], 4
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 @llvm.usub.sat.i32(i32 1024, i32 [[TMP1]])
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = shl i32 [[TMP2]], 4
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = sub i32 1024, [[TMP3]]
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP3]], 1009
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i32 [[TMP4]], i32 0
; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 0, i32 1024)
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call i32 @llvm.vscale.i32()
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = shl i32 [[TMP4]], 4
-; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP5]], i64 0
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = call i32 @llvm.vscale.i32()
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = shl i32 [[TMP8]], 4
+; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP9]], i64 0
; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[DOTSPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0
; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
@@ -303,22 +307,22 @@ define dso_local void @masked_strided2(ptr noalias nocapture readnone %p, ptr no
; PREDICATED_TAIL_FOLDING: vector.body:
; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP3]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = shl nuw nsw <vscale x 16 x i32> [[VEC_IND]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 1, i64 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = zext nneg <vscale x 16 x i32> [[TMP6]] to <vscale x 16 x i64>
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP7]]
-; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 1, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x ptr> [[TMP8]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP9]], <vscale x 16 x i1> zeroinitializer
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = or disjoint <vscale x 16 x i32> [[TMP6]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 1, i64 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
+; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP7]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = shl nuw nsw <vscale x 16 x i32> [[VEC_IND]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 1, i64 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP11]]
-; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 2, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x ptr> [[TMP12]], i32 1, <vscale x 16 x i1> [[TMP13]])
-; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP15]]
-; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP2]])
+; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 1, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x ptr> [[TMP12]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = or disjoint <vscale x 16 x i32> [[TMP10]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 1, i64 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = zext nneg <vscale x 16 x i32> [[TMP14]] to <vscale x 16 x i64>
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP15]]
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP13]], <vscale x 16 x i1> zeroinitializer
+; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 2, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x ptr> [[TMP16]], i32 1, <vscale x 16 x i1> [[TMP17]])
+; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]]
+; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP6]])
; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[DOTSPLAT]]
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
-; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP16]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP4:![0-9]+]]
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP18]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP4:![0-9]+]]
; PREDICATED_TAIL_FOLDING: middle.block:
; PREDICATED_TAIL_FOLDING-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; PREDICATED_TAIL_FOLDING: scalar.ph:
@@ -382,19 +386,19 @@ define dso_local void @masked_strided3(ptr noalias nocapture readnone %p, ptr no
; SCALAR_TAIL_FOLDING-NEXT: [[CONV3:%.*]] = zext i8 [[GUARD2]] to i32
; SCALAR_TAIL_FOLDING-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
; SCALAR_TAIL_FOLDING-NEXT: [[TMP1:%.*]] = shl i32 [[TMP0]], 4
-; SCALAR_TAIL_FOLDING-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i32 [[TMP1]], 1024
+; SCALAR_TAIL_FOLDING-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i32 [[TMP1]], 1039
; SCALAR_TAIL_FOLDING-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; SCALAR_TAIL_FOLDING: vector.ph:
; SCALAR_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
; SCALAR_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = shl i32 [[TMP2]], 4
; SCALAR_TAIL_FOLDING-NEXT: [[N_MOD_VF:%.*]] = urem i32 1024, [[TMP3]]
; SCALAR_TAIL_FOLDING-NEXT: [[N_VEC:%.*]] = sub nuw nsw i32 1024, [[N_MOD_VF]]
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = call i32 @llvm.vscale.i32()
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = shl i32 [[TMP15]], 4
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = call i32 @llvm.vscale.i32()
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = shl i32 [[TMP5]], 4
-; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP6]], i64 0
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call i32 @llvm.vscale.i32()
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = shl i32 [[TMP4]], 4
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = call i32 @llvm.vscale.i32()
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = shl i32 [[TMP7]], 4
+; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP8]], i64 0
; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[DOTSPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0
; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
@@ -403,18 +407,18 @@ define dso_local void @masked_strided3(ptr noalias nocapture readnone %p, ptr no
; SCALAR_TAIL_FOLDING-NEXT: br label [[VECTOR_BODY:%.*]]
; SCALAR_TAIL_FOLDING: vector.body:
; SCALAR_TAIL_FOLDING-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP4]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = shl nuw nsw <vscale x 16 x i32> [[VEC_IND]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 1, i64 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64>
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP9]]
-; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 1, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x ptr> [[TMP10]], i32 1, <vscale x 16 x i1> [[TMP8]])
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = or disjoint <vscale x 16 x i32> [[TMP7]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 1, i64 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = zext nneg <vscale x 16 x i32> [[TMP12]] to <vscale x 16 x i64>
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP13]]
-; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 2, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x ptr> [[TMP14]], i32 1, <vscale x 16 x i1> [[TMP11]])
-; SCALAR_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP16]]
+; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP6]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = shl nuw nsw <vscale x 16 x i32> [[VEC_IND]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 1, i64 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = zext nneg <vscale x 16 x i32> [[TMP9]] to <vscale x 16 x i64>
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP11]]
+; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 1, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x ptr> [[TMP12]], i32 1, <vscale x 16 x i1> [[TMP10]])
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = or disjoint <vscale x 16 x i32> [[TMP9]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 1, i64 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = zext nneg <vscale x 16 x i32> [[TMP14]] to <vscale x 16 x i64>
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP15]]
+; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 2, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x ptr> [[TMP16]], i32 1, <vscale x 16 x i1> [[TMP13]])
+; SCALAR_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP5]]
; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[DOTSPLAT]]
; SCALAR_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; SCALAR_TAIL_FOLDING-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
@@ -457,16 +461,18 @@ define dso_local void @masked_strided3(ptr noalias nocapture readnone %p, ptr no
; PREDICATED_TAIL_FOLDING: vector.ph:
; PREDICATED_TAIL_FOLDING-NEXT: [[CONV3:%.*]] = zext i8 [[GUARD2]] to i32
; PREDICATED_TAIL_FOLDING-NEXT: [[CONV:%.*]] = zext i8 [[GUARD1]] to i32
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = call i32 @llvm.vscale.i32()
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = shl i32 [[TMP16]], 4
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP1:%.*]] = shl i32 [[TMP0]], 4
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 @llvm.usub.sat.i32(i32 1024, i32 [[TMP1]])
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = shl i32 [[TMP2]], 4
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = sub i32 1024, [[TMP3]]
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP3]], 1009
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i32 [[TMP4]], i32 0
; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 0, i32 1024)
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call i32 @llvm.vscale.i32()
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = shl i32 [[TMP4]], 4
-; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP5]], i64 0
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = call i32 @llvm.vscale.i32()
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = shl i32 [[TMP8]], 4
+; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP9]], i64 0
; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[DOTSPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0
; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
@@ -476,24 +482,24 @@ define dso_local void @masked_strided3(ptr noalias nocapture readnone %p, ptr no
; PREDICATED_TAIL_FOLDING: vector.body:
; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP3]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = shl nuw nsw <vscale x 16 x i32> [[VEC_IND]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 1, i64 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> zeroinitializer
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = zext nneg <vscale x 16 x i32> [[TMP6]] to <vscale x 16 x i64>
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP8]]
-; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 1, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x ptr> [[TMP9]], i32 1, <vscale x 16 x i1> [[TMP10]])
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP11]], <vscale x 16 x i1> zeroinitializer
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = or disjoint <vscale x 16 x i32> [[TMP6]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 1, i64 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = zext nneg <vscale x 16 x i32> [[TMP12]] to <vscale x 16 x i64>
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP13]]
-; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 2, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x ptr> [[TMP14]], i32 1, <vscale x 16 x i1> [[TMP15]])
-; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP17]]
-; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP2]])
+; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP7]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = shl nuw nsw <vscale x 16 x i32> [[VEC_IND]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 1, i64 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP12]]
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP11]], <vscale x 16 x i1> zeroinitializer
+; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 1, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x ptr> [[TMP13]], i32 1, <vscale x 16 x i1> [[TMP14]])
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = or disjoint <vscale x 16 x i32> [[TMP10]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 1, i64 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = zext nneg <vscale x 16 x i32> [[TMP16]] to <vscale x 16 x i64>
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP17]]
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP19:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP15]], <vscale x 16 x i1> zeroinitializer
+; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 2, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x ptr> [[TMP18]], i32 1, <vscale x 16 x i1> [[TMP19]])
+; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]]
+; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP6]])
; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[DOTSPLAT]]
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
-; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP18]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP6:![0-9]+]]
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP20:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP20]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP6:![0-9]+]]
; PREDICATED_TAIL_FOLDING: middle.block:
; PREDICATED_TAIL_FOLDING-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; PREDICATED_TAIL_FOLDING: scalar.ph:
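The 1024 -> 1039 changes in the MIN_ITERS_CHECK lines above are the strengthening at work: [[TMP1]] is vscale shifted left by 4, so its low four bits are known zero, and the smallest such value greater than 1024 is 1040, which makes `icmp ugt ..., 1024` and `icmp ugt ..., 1039` equivalent. A minimal standalone sketch of that fold (the function and value names are illustrative, not taken from the test file):

define i1 @min_iters_check(i32 %vscale) {
  %vf = shl i32 %vscale, 4          ; low four bits of %vf are known zero
  %cmp = icmp ugt i32 %vf, 1024     ; values of %vf above 1024 start at 1040
  ret i1 %cmp
}

; expected with this patch, matching the checks above:
;   %cmp = icmp ugt i32 %vf, 1039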
>From 4488bcb6c5cbd74b71e60f9e4928635358b5c009 Mon Sep 17 00:00:00 2001
From: Mikhail Gudim <mgudim at gmail.com>
Date: Thu, 25 Jan 2024 03:27:58 -0500
Subject: [PATCH 5/7] Format.
---
.../InstCombine/InstCombineCompares.cpp | 35 +++++++++----------
1 file changed, 16 insertions(+), 19 deletions(-)
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
index 8fbc3277a3fa9..cc09b505b5b41 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -6103,12 +6103,10 @@ bool InstCombinerImpl::replacedSelectWithOperand(SelectInst *SI,
// Try to "strengthen" the RHS of compare based on known bits.
// For example, replace `icmp ugt %x, 14` with `icmp ugt %x, 15` when
// it is known that the two least significant bits of `%x` is zero.
-static Instruction* strengthenICmpUsingKnownBits(
- ICmpInst &I,
- KnownBits Op0Known,
- KnownBits Op1Known,
- unsigned BitWidth
-) {
+static Instruction *strengthenICmpUsingKnownBits(ICmpInst &I,
+ KnownBits Op0Known,
+ KnownBits Op1Known,
+ unsigned BitWidth) {
if (!BitWidth)
return nullptr;
if (!(Op1Known.isConstant() && Op0Known.Zero.isMask()))
@@ -6144,13 +6142,13 @@ static Instruction* strengthenICmpUsingKnownBits(
// Compute the smallest number satisfying the known-bits constraint
// which is greater than or equal to Op0MinAccordingToPred.
Op0MinRefinedByKnownBits =
- PowOf2 *
- APIntOps::RoundingSDiv(Op0MinAccordingToPred, PowOf2, APInt::Rounding::UP);
+ PowOf2 * APIntOps::RoundingSDiv(Op0MinAccordingToPred, PowOf2,
+ APInt::Rounding::UP);
// Compute the largest number satisfying the known-bits constraint
// which is less than or equal to Op0MaxAccordingToPred.
Op0MaxRefinedByKnownBits =
- PowOf2 *
- APIntOps::RoundingSDiv(Op0MaxAccordingToPred, PowOf2, APInt::Rounding::DOWN);
+ PowOf2 * APIntOps::RoundingSDiv(Op0MaxAccordingToPred, PowOf2,
+ APInt::Rounding::DOWN);
NewLower = APIntOps::smax(Op0MinRefinedByKnownBits, Op0MinAccordingToPred);
NewUpper = APIntOps::smin(Op0MaxRefinedByKnownBits, Op0MaxAccordingToPred);
ImprovedLower = NewLower.sgt(Op0MinAccordingToPred);
@@ -6159,11 +6157,11 @@ static Instruction* strengthenICmpUsingKnownBits(
Op0MinAccordingToPred = Op0PredRange.getUnsignedMin();
Op0MaxAccordingToPred = Op0PredRange.getUnsignedMax();
Op0MinRefinedByKnownBits =
- PowOf2 *
- APIntOps::RoundingUDiv(Op0MinAccordingToPred, PowOf2, APInt::Rounding::UP);
+ PowOf2 * APIntOps::RoundingUDiv(Op0MinAccordingToPred, PowOf2,
+ APInt::Rounding::UP);
Op0MaxRefinedByKnownBits =
- PowOf2 *
- APIntOps::RoundingUDiv(Op0MaxAccordingToPred, PowOf2, APInt::Rounding::DOWN);
+ PowOf2 * APIntOps::RoundingUDiv(Op0MaxAccordingToPred, PowOf2,
+ APInt::Rounding::DOWN);
NewLower = APIntOps::umax(Op0MinRefinedByKnownBits, Op0MinAccordingToPred);
NewUpper = APIntOps::umin(Op0MaxRefinedByKnownBits, Op0MaxAccordingToPred);
ImprovedLower = NewLower.ugt(Op0MinAccordingToPred);
@@ -6178,15 +6176,13 @@ static Instruction* strengthenICmpUsingKnownBits(
case ICmpInst::ICMP_ULT:
case ICmpInst::ICMP_SLT: {
if (ImprovedUpper)
- return new ICmpInst(Pred, Op0,
- ConstantInt::get(Ty, NewUpper + 1));
+ return new ICmpInst(Pred, Op0, ConstantInt::get(Ty, NewUpper + 1));
break;
}
case ICmpInst::ICMP_UGT:
case ICmpInst::ICMP_SGT: {
if (ImprovedLower)
- return new ICmpInst(Pred, Op0,
- ConstantInt::get(Ty, NewLower - 1));
+ return new ICmpInst(Pred, Op0, ConstantInt::get(Ty, NewLower - 1));
break;
}
}
@@ -6450,7 +6446,8 @@ Instruction *InstCombinerImpl::foldICmpUsingKnownBits(ICmpInst &I) {
(Op0Known.One.isNegative() && Op1Known.One.isNegative())))
return new ICmpInst(I.getUnsignedPredicate(), Op0, Op1);
- if (Instruction * Res = strengthenICmpUsingKnownBits(I, Op0Known, Op1Known, BitWidth))
+ if (Instruction *Res =
+ strengthenICmpUsingKnownBits(I, Op0Known, Op1Known, BitWidth))
return Res;
return nullptr;
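For the signed path in the function above, RoundingSDiv rounds the endpoints of the predicate range to the nearest admissible multiple of the known power of two before NewLower/NewUpper are compared against the original bounds. A small sketch, assuming only that the two low bits of %x are known zero (whether this exact form survives other shl/icmp folds in a full InstCombine run is not guaranteed):

define i1 @signed_example(i32 %a) {
  %x = shl i32 %a, 2            ; low two bits of %x are known zero
  %cmp = icmp slt i32 %x, -5    ; admissible values below -5 are -8, -12, ...
  ret i1 %cmp
}

; with NewUpper rounded down to -8, the bound can be tightened to:
;   %cmp = icmp slt i32 %x, -7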
>From 70fc18cdc2bc240ee89cfbaef7eb8f9fd222d6b6 Mon Sep 17 00:00:00 2001
From: Mikhail Gudim <mgudim at gmail.com>
Date: Fri, 26 Jan 2024 23:16:21 -0500
Subject: [PATCH 6/7] Updated tests.
---
.../InstCombine/2007-10-31-RangeCrash.ll | 2 +-
llvm/test/Transforms/InstCombine/shift.ll | 8 ++--
.../sve-interleaved-masked-accesses.ll | 44 +++++++++----------
3 files changed, 27 insertions(+), 27 deletions(-)
diff --git a/llvm/test/Transforms/InstCombine/2007-10-31-RangeCrash.ll b/llvm/test/Transforms/InstCombine/2007-10-31-RangeCrash.ll
index d4ebeba0c86ea..8b472aa5af090 100644
--- a/llvm/test/Transforms/InstCombine/2007-10-31-RangeCrash.ll
+++ b/llvm/test/Transforms/InstCombine/2007-10-31-RangeCrash.ll
@@ -17,7 +17,7 @@ define i32 @test() {
; CHECK-NEXT: br label [[BB51_I_I]]
; CHECK: bb51.i.i:
; CHECK-NEXT: [[X_0_I_I]] = phi i32 [ [[TMP50_I_I]], [[BB27_I_I:%.*]] ], [ 0, [[BB_I]] ]
-; CHECK-NEXT: [[TMP54_I_I:%.*]] = icmp slt i32 [[X_0_I_I]], -1
+; CHECK-NEXT: [[TMP54_I_I:%.*]] = icmp slt i32 [[X_0_I_I]], 0
; CHECK-NEXT: br i1 [[TMP54_I_I]], label [[BB27_I_I]], label [[BB57_I_I:%.*]]
; CHECK: bb57.i.i:
; CHECK-NEXT: ret i32 0
diff --git a/llvm/test/Transforms/InstCombine/shift.ll b/llvm/test/Transforms/InstCombine/shift.ll
index 2c2d6c921e55d..dda06351482fb 100644
--- a/llvm/test/Transforms/InstCombine/shift.ll
+++ b/llvm/test/Transforms/InstCombine/shift.ll
@@ -515,8 +515,8 @@ define i32 @test32(i32 %A, i32 %B, i32 %C) {
define i1 @test33(i32 %X) {
; CHECK-LABEL: @test33(
-; CHECK-NEXT: [[I1:%.*]] = shl i32 [[X:%.*]], 7
-; CHECK-NEXT: [[I2:%.*]] = icmp slt i32 [[I1]], -127
+; CHECK-NEXT: [[I1_MASK:%.*]] = and i32 [[X:%.*]], 16777216
+; CHECK-NEXT: [[I2:%.*]] = icmp ne i32 [[I1_MASK]], 0
; CHECK-NEXT: ret i1 [[I2]]
;
%i1 = shl i32 %X, 7
@@ -526,8 +526,8 @@ define i1 @test33(i32 %X) {
define <2 x i1> @test33vec(<2 x i32> %X) {
; CHECK-LABEL: @test33vec(
-; CHECK-NEXT: [[I1:%.*]] = shl <2 x i32> [[X:%.*]], <i32 7, i32 7>
-; CHECK-NEXT: [[I2:%.*]] = icmp slt <2 x i32> [[I1]], <i32 -127, i32 -127>
+; CHECK-NEXT: [[I1_MASK:%.*]] = and <2 x i32> [[X:%.*]], <i32 16777216, i32 16777216>
+; CHECK-NEXT: [[I2:%.*]] = icmp ne <2 x i32> [[I1_MASK]], zeroinitializer
; CHECK-NEXT: ret <2 x i1> [[I2]]
;
%i1 = shl <2 x i32> %X, <i32 7, i32 7>
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll
index 65ee0734ebafc..082d8146bfce2 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll
@@ -132,23 +132,23 @@ define dso_local void @masked_strided1(ptr noalias nocapture readonly %p, ptr no
; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP7]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = shl i32 [[INDEX]], 1
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = sext i32 [[TMP11]] to i64
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP12]]
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP10]], <vscale x 16 x i1> zeroinitializer
-; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 32 x i1> @llvm.experimental.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP14]], <vscale x 16 x i1> [[TMP14]])
-; PREDICATED_TAIL_FOLDING-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.masked.load.nxv32i8.p0(ptr [[TMP13]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK]], <vscale x 32 x i8> poison)
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP10]], <vscale x 16 x i1> zeroinitializer
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = shl i32 [[INDEX]], 1
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = sext i32 [[TMP12]] to i64
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP13]]
+; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 32 x i1> @llvm.experimental.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP11]], <vscale x 16 x i1> [[TMP11]])
+; PREDICATED_TAIL_FOLDING-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.masked.load.nxv32i8.p0(ptr [[TMP14]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK]], <vscale x 32 x i8> poison)
; PREDICATED_TAIL_FOLDING-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.experimental.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[WIDE_MASKED_VEC]])
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 0
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 1
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = or disjoint i32 [[TMP11]], 1
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = or disjoint i32 [[TMP12]], 1
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[TMP15]], <vscale x 16 x i8> [[TMP16]])
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP19:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP18]]
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP20:%.*]] = sext i32 [[TMP17]] to i64
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP20]]
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[TMP21]], i64 -1
; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.experimental.vector.interleave2.nxv32i8(<vscale x 16 x i8> [[TMP18]], <vscale x 16 x i8> [[TMP19]])
-; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK1:%.*]] = call <vscale x 32 x i1> @llvm.experimental.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP14]], <vscale x 16 x i1> [[TMP14]])
+; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK1:%.*]] = call <vscale x 32 x i1> @llvm.experimental.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP11]], <vscale x 16 x i1> [[TMP11]])
; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv32i8.p0(<vscale x 32 x i8> [[INTERLEAVED_VEC]], ptr [[TMP22]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK1]])
; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]]
; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP6]])
@@ -313,11 +313,11 @@ define dso_local void @masked_strided2(ptr noalias nocapture readnone %p, ptr no
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP11]]
; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 1, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x ptr> [[TMP12]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = or disjoint <vscale x 16 x i32> [[TMP10]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 1, i64 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = zext nneg <vscale x 16 x i32> [[TMP14]] to <vscale x 16 x i64>
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP15]]
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP13]], <vscale x 16 x i1> zeroinitializer
-; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 2, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x ptr> [[TMP16]], i32 1, <vscale x 16 x i1> [[TMP17]])
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP13]], <vscale x 16 x i1> zeroinitializer
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = or disjoint <vscale x 16 x i32> [[TMP10]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 1, i64 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = zext nneg <vscale x 16 x i32> [[TMP15]] to <vscale x 16 x i64>
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP16]]
+; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 2, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x ptr> [[TMP17]], i32 1, <vscale x 16 x i1> [[TMP14]])
; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]]
; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP6]])
; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[DOTSPLAT]]
@@ -485,16 +485,16 @@ define dso_local void @masked_strided3(ptr noalias nocapture readnone %p, ptr no
; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP7]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = shl nuw nsw <vscale x 16 x i32> [[VEC_IND]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 1, i64 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP12]]
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP11]], <vscale x 16 x i1> zeroinitializer
-; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 1, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x ptr> [[TMP13]], i32 1, <vscale x 16 x i1> [[TMP14]])
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP11]], <vscale x 16 x i1> zeroinitializer
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP13]]
+; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 1, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x ptr> [[TMP14]], i32 1, <vscale x 16 x i1> [[TMP12]])
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = or disjoint <vscale x 16 x i32> [[TMP10]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 1, i64 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = zext nneg <vscale x 16 x i32> [[TMP16]] to <vscale x 16 x i64>
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP17]]
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP19:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP15]], <vscale x 16 x i1> zeroinitializer
-; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 2, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x ptr> [[TMP18]], i32 1, <vscale x 16 x i1> [[TMP19]])
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP15]], <vscale x 16 x i1> zeroinitializer
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = or disjoint <vscale x 16 x i32> [[TMP10]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 1, i64 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = zext nneg <vscale x 16 x i32> [[TMP17]] to <vscale x 16 x i64>
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP18]]
+; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 2, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x ptr> [[TMP19]], i32 1, <vscale x 16 x i1> [[TMP16]])
; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]]
; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP6]])
; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[DOTSPLAT]]
>From ff6f79797e023b91c9cba0e5b9dcc6cf67662580 Mon Sep 17 00:00:00 2001
From: Mikhail Gudim <mgudim at gmail.com>
Date: Thu, 1 Feb 2024 02:08:11 -0500
Subject: [PATCH 7/7] Added a check not to break any "select patterns".
---
.../Transforms/InstCombine/InstCombineCompares.cpp | 13 +++++++++++++
1 file changed, 13 insertions(+)
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
index cc09b505b5b41..b42c32809e92a 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -20,6 +20,7 @@
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/Utils/Local.h"
+#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/DataLayout.h"
@@ -6117,10 +6118,22 @@ static Instruction *strengthenICmpUsingKnownBits(ICmpInst &I,
Type *Ty = Op0->getType();
APInt RHSConst = Op1Known.getConstant();
bool TrueIfSigned = false;
+
// Don't break the SignBitCheck pattern;
if (InstCombiner::isSignBitCheck(Pred, RHSConst, TrueIfSigned))
return nullptr;
+ for (const Use &U : I.uses()) {
+ const Instruction *UI = cast<Instruction>(U.getUser());
+ // Don't break any select patterns.
+ Value* LHS;
+ Value* RHS;
+ if (const SelectInst *Sel = dyn_cast<SelectInst>(UI)) {
+ if (matchSelectPattern(Sel, LHS, RHS).Flavor != SPF_UNKNOWN)
+ return nullptr;
+ }
+ }
+
ConstantRange Op0PredRange =
ConstantRange::makeExactICmpRegion(Pred, RHSConst);
int KnownZeroMaskLength = BitWidth - Op0Known.Zero.countLeadingZeros();
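To see what the new guard protects, consider an icmp whose only user is a select forming a signed-max idiom (the names below are illustrative):

define i32 @clamp_low(i32 %a) {
  %x = shl i32 %a, 2                    ; low two bits of %x are known zero
  %cmp = icmp sgt i32 %x, 6
  %max = select i1 %cmp, i32 %x, i32 6  ; matchSelectPattern sees smax(%x, 6)
  ret i32 %max
}

Without the check, the compare would be strengthened to `icmp sgt i32 %x, 7`; the select would still compute the same values, but it would no longer match the SPF_SMAX pattern, so later min/max folds could be lost. The loop over I.uses() above detects the select user and returns nullptr instead.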