[llvm] [InstCombineCompares] Try to "strengthen" compares based on known bits. (PR #79405)
Mikhail Gudim via llvm-commits
llvm-commits at lists.llvm.org
Mon Feb 5 19:50:06 PST 2024
https://github.com/mgudim updated https://github.com/llvm/llvm-project/pull/79405
>From 20e1c91d73bd5d140f4a5985410536294cf1320f Mon Sep 17 00:00:00 2001
From: Mikhail Gudim <mgudim at gmail.com>
Date: Tue, 16 Jan 2024 03:58:34 -0500
Subject: [PATCH 1/8] [InstCombineCompares] Try to "strengthen" compares based
on known bits.
For example, replace `icmp ugt %x, 14` with `icmp ugt %x, 15` when
it is known that the two least significant bits of `%x` are zero.
---
.../InstCombine/InstCombineCompares.cpp | 72 ++++++++++++++++
llvm/test/Transforms/InstCombine/icmp.ll | 82 +++++++++++++++----
2 files changed, 138 insertions(+), 16 deletions(-)
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
index 8c0fd662255130..c81e229e1c59bd 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -6357,6 +6357,78 @@ Instruction *InstCombinerImpl::foldICmpUsingKnownBits(ICmpInst &I) {
(Op0Known.One.isNegative() && Op1Known.One.isNegative())))
return new ICmpInst(I.getUnsignedPredicate(), Op0, Op1);
+ // Try to "strengthen" the RHS of compare based on known bits.
+ // For example, replace `icmp ugt %x, 14` with `icmp ugt %x, 15` when
+ // it is known that the two least significant bits of `%x` is zero.
+ if (Op1Known.isConstant() && Op0Known.Zero.isMask()) {
+ APInt RHSConst = Op1Known.getConstant();
+ ConstantRange Op0PredRange =
+ ConstantRange::makeExactICmpRegion(Pred, RHSConst);
+ int KnownZeroMaskLength = BitWidth - Op0Known.Zero.countLeadingZeros();
+ if (KnownZeroMaskLength > 0) {
+ APInt PowOf2(BitWidth, 1 << KnownZeroMaskLength);
+ APInt Op0PredMin(BitWidth, 0);
+ APInt Op0PredMax(BitWidth, 0);
+ APInt Op0MinRefinedByKnownBits(BitWidth, 0);
+ APInt Op0MaxRefinedByKnownBits(BitWidth, 0);
+ APInt NewLower(BitWidth, 0);
+ APInt NewUpper(BitWidth, 0);
+ bool ImprovedLower = false;
+ bool ImprovedUpper = false;
+ if (I.isSigned()) {
+ Op0PredMin = Op0PredRange.getSignedMin();
+ Op0PredMax = Op0PredRange.getSignedMax();
+ // Compute the smallest number satisfying the known-bits constrained
+ // which is at greater or equal Op0PredMin.
+ Op0MinRefinedByKnownBits =
+ PowOf2 *
+ APIntOps::RoundingSDiv(Op0PredMin, PowOf2, APInt::Rounding::UP);
+ // Compute the largest number satisfying the known-bits constrained
+ // which is at less or equal Op0PredMax.
+ Op0MaxRefinedByKnownBits =
+ PowOf2 *
+ APIntOps::RoundingSDiv(Op0PredMax, PowOf2, APInt::Rounding::DOWN);
+ NewLower = APIntOps::smax(Op0MinRefinedByKnownBits, Op0PredMin);
+ NewUpper = APIntOps::smin(Op0MaxRefinedByKnownBits, Op0PredMax);
+ ImprovedLower = NewLower.sgt(Op0PredMin);
+ ImprovedUpper = NewUpper.slt(Op0PredMax);
+ } else {
+ Op0PredMin = Op0PredRange.getUnsignedMin();
+ Op0PredMax = Op0PredRange.getUnsignedMax();
+ Op0MinRefinedByKnownBits =
+ PowOf2 *
+ APIntOps::RoundingUDiv(Op0PredMin, PowOf2, APInt::Rounding::UP);
+ Op0MaxRefinedByKnownBits =
+ PowOf2 *
+ APIntOps::RoundingUDiv(Op0PredMax, PowOf2, APInt::Rounding::DOWN);
+ NewLower = APIntOps::umax(Op0MinRefinedByKnownBits, Op0PredMin);
+ NewUpper = APIntOps::umin(Op0MaxRefinedByKnownBits, Op0PredMax);
+ ImprovedLower = NewLower.ugt(Op0PredMin);
+ ImprovedUpper = NewUpper.ult(Op0PredMax);
+ }
+
+ // Non-strict inequalities should have been canonicalized to strict ones
+ // by now.
+ switch (Pred) {
+ default:
+ break;
+ case ICmpInst::ICMP_ULT:
+ case ICmpInst::ICMP_SLT: {
+ if (ImprovedUpper)
+ return new ICmpInst(Pred, Op0,
+ ConstantInt::get(Op1->getType(), NewUpper + 1));
+ break;
+ }
+ case ICmpInst::ICMP_UGT:
+ case ICmpInst::ICMP_SGT: {
+ if (ImprovedLower)
+ return new ICmpInst(Pred, Op0,
+ ConstantInt::get(Op1->getType(), NewLower - 1));
+ break;
+ }
+ }
+ }
+ }
return nullptr;
}
diff --git a/llvm/test/Transforms/InstCombine/icmp.ll b/llvm/test/Transforms/InstCombine/icmp.ll
index 10ab1fe118348c..7b96f908c69c1f 100644
--- a/llvm/test/Transforms/InstCombine/icmp.ll
+++ b/llvm/test/Transforms/InstCombine/icmp.ll
@@ -1490,8 +1490,8 @@ define <2 x i1> @test70vec(<2 x i32> %X) {
define i1 @icmp_sext16trunc(i32 %x) {
; CHECK-LABEL: @icmp_sext16trunc(
-; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[X:%.*]] to i16
-; CHECK-NEXT: [[CMP:%.*]] = icmp slt i16 [[TMP1]], 36
+; CHECK-NEXT: [[SEXT1:%.*]] = shl i32 [[X:%.*]], 16
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[SEXT1]], 2293761
; CHECK-NEXT: ret i1 [[CMP]]
;
%trunc = trunc i32 %x to i16
@@ -1502,8 +1502,8 @@ define i1 @icmp_sext16trunc(i32 %x) {
define i1 @icmp_sext8trunc(i32 %x) {
; CHECK-LABEL: @icmp_sext8trunc(
-; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[X:%.*]] to i8
-; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[TMP1]], 36
+; CHECK-NEXT: [[SEXT1:%.*]] = shl i32 [[X:%.*]], 24
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[SEXT1]], 587202561
; CHECK-NEXT: ret i1 [[CMP]]
;
%trunc = trunc i32 %x to i8
@@ -1515,8 +1515,8 @@ define i1 @icmp_sext8trunc(i32 %x) {
; Vectors should fold the same way.
define <2 x i1> @icmp_sext8trunc_vec(<2 x i32> %x) {
; CHECK-LABEL: @icmp_sext8trunc_vec(
-; CHECK-NEXT: [[TMP1:%.*]] = trunc <2 x i32> [[X:%.*]] to <2 x i8>
-; CHECK-NEXT: [[CMP:%.*]] = icmp slt <2 x i8> [[TMP1]], <i8 36, i8 36>
+; CHECK-NEXT: [[TMP1:%.*]] = shl <2 x i32> [[X:%.*]], <i32 24, i32 24>
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt <2 x i32> [[TMP1]], <i32 587202561, i32 587202561>
; CHECK-NEXT: ret <2 x i1> [[CMP]]
;
%trunc = trunc <2 x i32> %x to <2 x i8>
@@ -1527,8 +1527,8 @@ define <2 x i1> @icmp_sext8trunc_vec(<2 x i32> %x) {
define i1 @icmp_shl16(i32 %x) {
; CHECK-LABEL: @icmp_shl16(
-; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[X:%.*]] to i16
-; CHECK-NEXT: [[CMP:%.*]] = icmp slt i16 [[TMP1]], 36
+; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[X:%.*]], 16
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[SHL]], 2293761
; CHECK-NEXT: ret i1 [[CMP]]
;
%shl = shl i32 %x, 16
@@ -1541,7 +1541,7 @@ define i1 @icmp_shl16(i32 %x) {
define i1 @icmp_shl17(i32 %x) {
; CHECK-LABEL: @icmp_shl17(
; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[X:%.*]], 17
-; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[SHL]], 2359296
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[SHL]], 2228225
; CHECK-NEXT: ret i1 [[CMP]]
;
%shl = shl i32 %x, 17
@@ -1551,8 +1551,8 @@ define i1 @icmp_shl17(i32 %x) {
define <2 x i1> @icmp_shl16_vec(<2 x i32> %x) {
; CHECK-LABEL: @icmp_shl16_vec(
-; CHECK-NEXT: [[TMP1:%.*]] = trunc <2 x i32> [[X:%.*]] to <2 x i16>
-; CHECK-NEXT: [[CMP:%.*]] = icmp slt <2 x i16> [[TMP1]], <i16 36, i16 36>
+; CHECK-NEXT: [[SHL:%.*]] = shl <2 x i32> [[X:%.*]], <i32 16, i32 16>
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt <2 x i32> [[SHL]], <i32 2293761, i32 2293761>
; CHECK-NEXT: ret <2 x i1> [[CMP]]
;
%shl = shl <2 x i32> %x, <i32 16, i32 16>
@@ -1562,8 +1562,8 @@ define <2 x i1> @icmp_shl16_vec(<2 x i32> %x) {
define i1 @icmp_shl24(i32 %x) {
; CHECK-LABEL: @icmp_shl24(
-; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[X:%.*]] to i8
-; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[TMP1]], 36
+; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[X:%.*]], 24
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[SHL]], 587202561
; CHECK-NEXT: ret i1 [[CMP]]
;
%shl = shl i32 %x, 24
@@ -2199,7 +2199,7 @@ define i1 @icmp_ashr_and_overshift(i8 %X) {
define i1 @icmp_and_ashr_neg_and_legal(i8 %x) {
; CHECK-LABEL: @icmp_and_ashr_neg_and_legal(
; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[X:%.*]], -32
-; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[TMP1]], 16
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[TMP1]], 1
; CHECK-NEXT: ret i1 [[CMP]]
;
%ashr = ashr i8 %x, 4
@@ -2225,7 +2225,7 @@ define i1 @icmp_and_ashr_mixed_and_shiftout(i8 %x) {
define i1 @icmp_and_ashr_neg_cmp_slt_legal(i8 %x) {
; CHECK-LABEL: @icmp_and_ashr_neg_cmp_slt_legal(
; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[X:%.*]], -32
-; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[TMP1]], -64
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[TMP1]], -95
; CHECK-NEXT: ret i1 [[CMP]]
;
%ashr = ashr i8 %x, 4
@@ -2239,7 +2239,7 @@ define i1 @icmp_and_ashr_neg_cmp_slt_shiftout(i8 %x) {
; CHECK-LABEL: @icmp_and_ashr_neg_cmp_slt_shiftout(
; CHECK-NEXT: [[ASHR:%.*]] = ashr i8 [[X:%.*]], 4
; CHECK-NEXT: [[AND:%.*]] = and i8 [[ASHR]], -2
-; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[AND]], -68
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[AND]], -69
; CHECK-NEXT: ret i1 [[CMP]]
;
%ashr = ashr i8 %x, 4
@@ -5183,3 +5183,53 @@ entry:
%cmp = icmp eq i8 %add2, %add1
ret i1 %cmp
}
+
+define i1 @tighten_icmp_using_known_bits_ugt(i16 %a) {
+; CHECK-LABEL: @tighten_icmp_using_known_bits_ugt(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i16 [[A:%.*]], 15
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+entry:
+ %and_ = and i16 %a, 65532
+ %cmp = icmp ugt i16 %and_, 14
+ ret i1 %cmp
+}
+
+define i1 @tighten_icmp_using_known_bits_ult(i16 %a) {
+; CHECK-LABEL: @tighten_icmp_using_known_bits_ult(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[AND_:%.*]] = and i16 [[A:%.*]], -4
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i16 [[AND_]], 17
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+entry:
+ %and_ = and i16 %a, 65532
+ %cmp = icmp ult i16 %and_, 18
+ ret i1 %cmp
+}
+
+define i1 @tighten_icmp_using_known_bits_sgt(i16 %a) {
+; CHECK-LABEL: @tighten_icmp_using_known_bits_sgt(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i16 [[A:%.*]], -1
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+entry:
+ %and_ = and i16 %a, 65520
+ %cmp = icmp sgt i16 %and_, -15
+ ret i1 %cmp
+}
+
+define i1 @tighten_icmp_using_known_bits_slt(i16 %a) {
+; CHECK-LABEL: @tighten_icmp_using_known_bits_slt(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[AND_:%.*]] = and i16 [[A:%.*]], -4
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i16 [[AND_]], -15
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+entry:
+ %and_ = and i16 %a, 65532
+ %cmp = icmp slt i16 %and_, -14
+ ret i1 %cmp
+}
>From ec9315d8239327b56bb0cd433ab75cc999149dff Mon Sep 17 00:00:00 2001
From: Mikhail Gudim <mgudim at gmail.com>
Date: Thu, 25 Jan 2024 00:15:03 -0500
Subject: [PATCH 2/8] Updated tests.
---
.../InstCombine/2007-10-31-RangeCrash.ll | 2 +-
.../InstCombine/assume-loop-align.ll | 2 +-
.../InstCombine/fold-signbit-test-power2.ll | 8 +++--
llvm/test/Transforms/InstCombine/icmp-mul.ll | 2 +-
llvm/test/Transforms/InstCombine/icmp-or.ll | 2 +-
.../Transforms/InstCombine/icmp-shl-nsw.ll | 4 +--
.../InstCombine/indexed-gep-compares.ll | 10 +++---
.../test/Transforms/InstCombine/opaque-ptr.ll | 4 +--
llvm/test/Transforms/InstCombine/pr17827.ll | 18 +++++-----
llvm/test/Transforms/InstCombine/pr27343.ll | 4 +--
llvm/test/Transforms/InstCombine/select.ll | 2 +-
llvm/test/Transforms/InstCombine/shift.ll | 12 +++----
.../InstCombine/shl-unsigned-cmp-const.ll | 34 +++++++++----------
13 files changed, 54 insertions(+), 50 deletions(-)
diff --git a/llvm/test/Transforms/InstCombine/2007-10-31-RangeCrash.ll b/llvm/test/Transforms/InstCombine/2007-10-31-RangeCrash.ll
index 8b472aa5af0902..d4ebeba0c86ea6 100644
--- a/llvm/test/Transforms/InstCombine/2007-10-31-RangeCrash.ll
+++ b/llvm/test/Transforms/InstCombine/2007-10-31-RangeCrash.ll
@@ -17,7 +17,7 @@ define i32 @test() {
; CHECK-NEXT: br label [[BB51_I_I]]
; CHECK: bb51.i.i:
; CHECK-NEXT: [[X_0_I_I]] = phi i32 [ [[TMP50_I_I]], [[BB27_I_I:%.*]] ], [ 0, [[BB_I]] ]
-; CHECK-NEXT: [[TMP54_I_I:%.*]] = icmp slt i32 [[X_0_I_I]], 0
+; CHECK-NEXT: [[TMP54_I_I:%.*]] = icmp slt i32 [[X_0_I_I]], -1
; CHECK-NEXT: br i1 [[TMP54_I_I]], label [[BB27_I_I]], label [[BB57_I_I:%.*]]
; CHECK: bb57.i.i:
; CHECK-NEXT: ret i32 0
diff --git a/llvm/test/Transforms/InstCombine/assume-loop-align.ll b/llvm/test/Transforms/InstCombine/assume-loop-align.ll
index e7eb18c61b6bb0..79af1b0fede4b1 100644
--- a/llvm/test/Transforms/InstCombine/assume-loop-align.ll
+++ b/llvm/test/Transforms/InstCombine/assume-loop-align.ll
@@ -28,7 +28,7 @@ define void @foo(ptr %a, ptr %b) #0 {
; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX5]], align 4
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 16
; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
-; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP1]], 1648
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP1]], 1633
; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]]
; CHECK: for.end:
; CHECK-NEXT: ret void
diff --git a/llvm/test/Transforms/InstCombine/fold-signbit-test-power2.ll b/llvm/test/Transforms/InstCombine/fold-signbit-test-power2.ll
index f5024664f58c3e..fc176f00486c9c 100644
--- a/llvm/test/Transforms/InstCombine/fold-signbit-test-power2.ll
+++ b/llvm/test/Transforms/InstCombine/fold-signbit-test-power2.ll
@@ -26,7 +26,9 @@ define i1 @pow2_or_zero_is_negative(i8 %x) {
define i1 @pow2_or_zero_is_negative_commute(i8 %A) {
; CHECK-LABEL: @pow2_or_zero_is_negative_commute(
; CHECK-NEXT: [[X:%.*]] = mul i8 [[A:%.*]], 42
-; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[X]], -128
+; CHECK-NEXT: [[NEG:%.*]] = sub i8 0, [[X]]
+; CHECK-NEXT: [[POW2_OR_ZERO:%.*]] = and i8 [[X]], [[NEG]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[POW2_OR_ZERO]], -1
; CHECK-NEXT: ret i1 [[CMP]]
;
%x = mul i8 42, %A ; thwart complexity-based canonicalization
@@ -54,7 +56,9 @@ define <2 x i1> @pow2_or_zero_is_negative_vec(<2 x i8> %x) {
define <2 x i1> @pow2_or_zero_is_negative_vec_commute(<2 x i8> %A) {
; CHECK-LABEL: @pow2_or_zero_is_negative_vec_commute(
; CHECK-NEXT: [[X:%.*]] = mul <2 x i8> [[A:%.*]], <i8 42, i8 42>
-; CHECK-NEXT: [[CMP:%.*]] = icmp eq <2 x i8> [[X]], <i8 -128, i8 -128>
+; CHECK-NEXT: [[NEG:%.*]] = sub <2 x i8> zeroinitializer, [[X]]
+; CHECK-NEXT: [[POW2_OR_ZERO:%.*]] = and <2 x i8> [[X]], [[NEG]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt <2 x i8> [[POW2_OR_ZERO]], <i8 -1, i8 -1>
; CHECK-NEXT: ret <2 x i1> [[CMP]]
;
%x = mul <2 x i8> <i8 42, i8 42>, %A ; thwart complexity-based canonicalization
diff --git a/llvm/test/Transforms/InstCombine/icmp-mul.ll b/llvm/test/Transforms/InstCombine/icmp-mul.ll
index 7f76a94f395b60..6c6befcfe39e4f 100644
--- a/llvm/test/Transforms/InstCombine/icmp-mul.ll
+++ b/llvm/test/Transforms/InstCombine/icmp-mul.ll
@@ -969,7 +969,7 @@ define i1 @mul_of_pow2_no_lz_other_op(i32 %x, i8 %y) {
; CHECK-NEXT: [[B:%.*]] = and i32 [[X:%.*]], 2
; CHECK-NEXT: [[S:%.*]] = sext i8 [[Y:%.*]] to i32
; CHECK-NEXT: [[M:%.*]] = mul nuw nsw i32 [[B]], [[S]]
-; CHECK-NEXT: [[R:%.*]] = icmp sgt i32 [[M]], 254
+; CHECK-NEXT: [[R:%.*]] = icmp sgt i32 [[M]], 255
; CHECK-NEXT: ret i1 [[R]]
;
%b = and i32 %x, 2
diff --git a/llvm/test/Transforms/InstCombine/icmp-or.ll b/llvm/test/Transforms/InstCombine/icmp-or.ll
index 922845c1e7e2d8..587df66417eb0a 100644
--- a/llvm/test/Transforms/InstCombine/icmp-or.ll
+++ b/llvm/test/Transforms/InstCombine/icmp-or.ll
@@ -308,7 +308,7 @@ define i1 @decrement_sgt_n1_commute_use1(i8 %px) {
; CHECK-NEXT: [[X:%.*]] = mul i8 [[PX:%.*]], 42
; CHECK-NEXT: [[DEC:%.*]] = add i8 [[X]], -1
; CHECK-NEXT: call void @use(i8 [[DEC]])
-; CHECK-NEXT: [[R:%.*]] = icmp sgt i8 [[X]], 0
+; CHECK-NEXT: [[R:%.*]] = icmp sgt i8 [[X]], 1
; CHECK-NEXT: ret i1 [[R]]
;
%x = mul i8 %px, 42 ; thwart complexity-based canonicalization
diff --git a/llvm/test/Transforms/InstCombine/icmp-shl-nsw.ll b/llvm/test/Transforms/InstCombine/icmp-shl-nsw.ll
index 6b9ea1f8ef97e2..5b827c839a4e40 100644
--- a/llvm/test/Transforms/InstCombine/icmp-shl-nsw.ll
+++ b/llvm/test/Transforms/InstCombine/icmp-shl-nsw.ll
@@ -136,7 +136,7 @@ define i1 @icmp_sgt6(i8 %x) {
define i1 @icmp_sgt7(i8 %x) {
; CHECK-LABEL: @icmp_sgt7(
-; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i8 [[X:%.*]], 62
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[X:%.*]], 63
; CHECK-NEXT: ret i1 [[CMP]]
;
%shl = shl nsw i8 %x, 1
@@ -224,7 +224,7 @@ define i1 @icmp_sle1(i8 %x) {
define i1 @icmp_sle2(i8 %x) {
; CHECK-LABEL: @icmp_sle2(
-; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[X:%.*]], -63
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[X:%.*]], -64
; CHECK-NEXT: ret i1 [[CMP]]
;
%shl = shl nsw i8 %x, 1
diff --git a/llvm/test/Transforms/InstCombine/indexed-gep-compares.ll b/llvm/test/Transforms/InstCombine/indexed-gep-compares.ll
index 2b5b3fce705354..bd32a9270f2241 100644
--- a/llvm/test/Transforms/InstCombine/indexed-gep-compares.ll
+++ b/llvm/test/Transforms/InstCombine/indexed-gep-compares.ll
@@ -11,7 +11,7 @@ define ptr@test1(ptr %A, i32 %Offset) {
; CHECK: bb:
; CHECK-NEXT: [[RHS_IDX:%.*]] = phi i32 [ [[RHS_ADD:%.*]], [[BB]] ], [ [[TMP_IDX]], [[ENTRY:%.*]] ]
; CHECK-NEXT: [[RHS_ADD]] = add nsw i32 [[RHS_IDX]], 4
-; CHECK-NEXT: [[COND:%.*]] = icmp sgt i32 [[RHS_IDX]], 400
+; CHECK-NEXT: [[COND:%.*]] = icmp sgt i32 [[RHS_IDX]], 403
; CHECK-NEXT: br i1 [[COND]], label [[BB2:%.*]], label [[BB]]
; CHECK: bb2:
; CHECK-NEXT: [[RHS_PTR:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i32 [[RHS_IDX]]
@@ -40,7 +40,7 @@ define ptr@test2(i32 %A, i32 %Offset) {
; CHECK: bb:
; CHECK-NEXT: [[RHS_IDX:%.*]] = phi i32 [ [[RHS_ADD:%.*]], [[BB]] ], [ [[TMP_IDX]], [[ENTRY:%.*]] ]
; CHECK-NEXT: [[RHS_ADD]] = add nsw i32 [[RHS_IDX]], 4
-; CHECK-NEXT: [[COND:%.*]] = icmp sgt i32 [[RHS_IDX]], 400
+; CHECK-NEXT: [[COND:%.*]] = icmp sgt i32 [[RHS_IDX]], 403
; CHECK-NEXT: br i1 [[COND]], label [[BB2:%.*]], label [[BB]]
; CHECK: bb2:
; CHECK-NEXT: [[A_PTR:%.*]] = inttoptr i32 [[A:%.*]] to ptr
@@ -164,7 +164,7 @@ define ptr@test4(i16 %A, i32 %Offset) {
; CHECK: bb:
; CHECK-NEXT: [[RHS_IDX:%.*]] = phi i32 [ [[RHS_ADD:%.*]], [[BB]] ], [ [[TMP_IDX]], [[ENTRY:%.*]] ]
; CHECK-NEXT: [[RHS_ADD]] = add nsw i32 [[RHS_IDX]], 4
-; CHECK-NEXT: [[COND:%.*]] = icmp sgt i32 [[RHS_IDX]], 400
+; CHECK-NEXT: [[COND:%.*]] = icmp sgt i32 [[RHS_IDX]], 403
; CHECK-NEXT: br i1 [[COND]], label [[BB2:%.*]], label [[BB]]
; CHECK: bb2:
; CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[A:%.*]] to i32
@@ -203,7 +203,7 @@ define ptr@test5(i32 %Offset) personality ptr @__gxx_personality_v0 {
; CHECK: bb:
; CHECK-NEXT: [[RHS_IDX:%.*]] = phi i32 [ [[RHS_ADD:%.*]], [[BB]] ], [ [[TMP_IDX]], [[CONT]] ]
; CHECK-NEXT: [[RHS_ADD]] = add nsw i32 [[RHS_IDX]], 4
-; CHECK-NEXT: [[COND:%.*]] = icmp sgt i32 [[RHS_IDX]], 400
+; CHECK-NEXT: [[COND:%.*]] = icmp sgt i32 [[RHS_IDX]], 403
; CHECK-NEXT: br i1 [[COND]], label [[BB2:%.*]], label [[BB]]
; CHECK: bb2:
; CHECK-NEXT: [[RHS_PTR:%.*]] = getelementptr inbounds i8, ptr [[A]], i32 [[RHS_IDX]]
@@ -248,7 +248,7 @@ define ptr@test6(i32 %Offset) personality ptr @__gxx_personality_v0 {
; CHECK: bb:
; CHECK-NEXT: [[RHS_IDX:%.*]] = phi i32 [ [[RHS_ADD:%.*]], [[BB]] ], [ [[TMP_IDX]], [[CONT]] ]
; CHECK-NEXT: [[RHS_ADD]] = add nsw i32 [[RHS_IDX]], 4
-; CHECK-NEXT: [[COND:%.*]] = icmp sgt i32 [[RHS_IDX]], 400
+; CHECK-NEXT: [[COND:%.*]] = icmp sgt i32 [[RHS_IDX]], 403
; CHECK-NEXT: br i1 [[COND]], label [[BB2:%.*]], label [[BB]]
; CHECK: bb2:
; CHECK-NEXT: [[A_PTR:%.*]] = inttoptr i32 [[A]] to ptr
diff --git a/llvm/test/Transforms/InstCombine/opaque-ptr.ll b/llvm/test/Transforms/InstCombine/opaque-ptr.ll
index 4d38e2cd37c959..f92c27cd6b07df 100644
--- a/llvm/test/Transforms/InstCombine/opaque-ptr.ll
+++ b/llvm/test/Transforms/InstCombine/opaque-ptr.ll
@@ -387,7 +387,7 @@ define ptr @indexed_compare(ptr %A, i64 %offset) {
; CHECK: bb:
; CHECK-NEXT: [[RHS_IDX:%.*]] = phi i64 [ [[RHS_ADD:%.*]], [[BB]] ], [ [[TMP_IDX]], [[ENTRY:%.*]] ]
; CHECK-NEXT: [[RHS_ADD]] = add nsw i64 [[RHS_IDX]], 4
-; CHECK-NEXT: [[COND:%.*]] = icmp sgt i64 [[RHS_IDX]], 400
+; CHECK-NEXT: [[COND:%.*]] = icmp sgt i64 [[RHS_IDX]], 403
; CHECK-NEXT: br i1 [[COND]], label [[BB2:%.*]], label [[BB]]
; CHECK: bb2:
; CHECK-NEXT: [[RHS_PTR:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[RHS_IDX]]
@@ -416,7 +416,7 @@ define ptr @indexed_compare_different_types(ptr %A, i64 %offset) {
; CHECK: bb:
; CHECK-NEXT: [[RHS_IDX:%.*]] = phi i64 [ [[RHS_ADD:%.*]], [[BB]] ], [ [[TMP_IDX]], [[ENTRY:%.*]] ]
; CHECK-NEXT: [[RHS_ADD]] = add nsw i64 [[RHS_IDX]], 4
-; CHECK-NEXT: [[COND:%.*]] = icmp sgt i64 [[RHS_IDX]], 800
+; CHECK-NEXT: [[COND:%.*]] = icmp sgt i64 [[RHS_IDX]], 803
; CHECK-NEXT: br i1 [[COND]], label [[BB2:%.*]], label [[BB]]
; CHECK: bb2:
; CHECK-NEXT: [[RHS_PTR:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[RHS_IDX]]
diff --git a/llvm/test/Transforms/InstCombine/pr17827.ll b/llvm/test/Transforms/InstCombine/pr17827.ll
index 6c6110aa073a59..d87909a283495e 100644
--- a/llvm/test/Transforms/InstCombine/pr17827.ll
+++ b/llvm/test/Transforms/InstCombine/pr17827.ll
@@ -6,7 +6,7 @@ define i1 @test_shift_and_cmp_not_changed1(i8 %p) {
; CHECK-LABEL: @test_shift_and_cmp_not_changed1(
; CHECK-NEXT: [[SHLP:%.*]] = shl i8 [[P:%.*]], 5
; CHECK-NEXT: [[ANDP:%.*]] = and i8 [[SHLP]], -64
-; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[ANDP]], 32
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[ANDP]], 1
; CHECK-NEXT: ret i1 [[CMP]]
;
%shlp = shl i8 %p, 5
@@ -20,7 +20,7 @@ define i1 @test_shift_and_cmp_not_changed2(i8 %p) {
; CHECK-LABEL: @test_shift_and_cmp_not_changed2(
; CHECK-NEXT: [[SHLP:%.*]] = ashr i8 [[P:%.*]], 5
; CHECK-NEXT: [[ANDP:%.*]] = and i8 [[SHLP]], -64
-; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[ANDP]], 32
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[ANDP]], 1
; CHECK-NEXT: ret i1 [[CMP]]
;
%shlp = ashr i8 %p, 5
@@ -35,7 +35,7 @@ define i1 @test_shift_and_cmp_changed1(i8 %p, i8 %q) {
; CHECK-LABEL: @test_shift_and_cmp_changed1(
; CHECK-NEXT: [[ANDP:%.*]] = shl i8 [[P:%.*]], 5
; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[ANDP]], -64
-; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[TMP1]], 32
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[TMP1]], 1
; CHECK-NEXT: ret i1 [[CMP]]
;
%andp = and i8 %p, 6
@@ -51,7 +51,7 @@ define <2 x i1> @test_shift_and_cmp_changed1_vec(<2 x i8> %p, <2 x i8> %q) {
; CHECK-LABEL: @test_shift_and_cmp_changed1_vec(
; CHECK-NEXT: [[ANDP:%.*]] = shl <2 x i8> [[P:%.*]], <i8 5, i8 5>
; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i8> [[ANDP]], <i8 -64, i8 -64>
-; CHECK-NEXT: [[CMP:%.*]] = icmp slt <2 x i8> [[TMP1]], <i8 32, i8 32>
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt <2 x i8> [[TMP1]], <i8 1, i8 1>
; CHECK-NEXT: ret <2 x i1> [[CMP]]
;
%andp = and <2 x i8> %p, <i8 6, i8 6>
@@ -66,8 +66,8 @@ define <2 x i1> @test_shift_and_cmp_changed1_vec(<2 x i8> %p, <2 x i8> %q) {
; Unsigned compare allows a transformation to compare against 0.
define i1 @test_shift_and_cmp_changed2(i8 %p) {
; CHECK-LABEL: @test_shift_and_cmp_changed2(
-; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[P:%.*]], 6
-; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[TMP1]], 0
+; CHECK-NEXT: [[SHLP:%.*]] = shl i8 [[P:%.*]], 5
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i8 [[SHLP]], 33
; CHECK-NEXT: ret i1 [[CMP]]
;
%shlp = shl i8 %p, 5
@@ -78,8 +78,8 @@ define i1 @test_shift_and_cmp_changed2(i8 %p) {
define <2 x i1> @test_shift_and_cmp_changed2_vec(<2 x i8> %p) {
; CHECK-LABEL: @test_shift_and_cmp_changed2_vec(
-; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i8> [[P:%.*]], <i8 6, i8 6>
-; CHECK-NEXT: [[CMP:%.*]] = icmp eq <2 x i8> [[TMP1]], zeroinitializer
+; CHECK-NEXT: [[SHLP:%.*]] = shl <2 x i8> [[P:%.*]], <i8 5, i8 5>
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult <2 x i8> [[SHLP]], <i8 33, i8 33>
; CHECK-NEXT: ret <2 x i1> [[CMP]]
;
%shlp = shl <2 x i8> %p, <i8 5, i8 5>
@@ -93,7 +93,7 @@ define i1 @test_shift_and_cmp_changed3(i8 %p) {
; CHECK-LABEL: @test_shift_and_cmp_changed3(
; CHECK-NEXT: [[SHLP:%.*]] = shl nsw i8 [[P:%.*]], 5
; CHECK-NEXT: [[ANDP:%.*]] = and i8 [[SHLP]], -64
-; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[ANDP]], 32
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[ANDP]], 1
; CHECK-NEXT: ret i1 [[CMP]]
;
%shlp = shl nsw i8 %p, 5
diff --git a/llvm/test/Transforms/InstCombine/pr27343.ll b/llvm/test/Transforms/InstCombine/pr27343.ll
index e67d0b34056bf8..f16affde2ce41f 100644
--- a/llvm/test/Transforms/InstCombine/pr27343.ll
+++ b/llvm/test/Transforms/InstCombine/pr27343.ll
@@ -6,7 +6,7 @@ define i32 @__isnan(float %x) alwaysinline nounwind optsize {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[DOTCAST:%.*]] = bitcast float [[X:%.*]] to i32
; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[DOTCAST]], 1
-; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i32 [[SHL]], -16777216
+; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i32 [[SHL]], -16777215
; CHECK-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
; CHECK-NEXT: ret i32 [[CONV]]
;
@@ -24,7 +24,7 @@ entry:
define i1 @icmp_shl7(i32 %x) {
; CHECK-LABEL: @icmp_shl7(
; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[X:%.*]], 7
-; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[SHL]], 4608
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[SHL]], 4481
; CHECK-NEXT: ret i1 [[CMP]]
;
%shl = shl i32 %x, 7
diff --git a/llvm/test/Transforms/InstCombine/select.ll b/llvm/test/Transforms/InstCombine/select.ll
index c5f1b77c6d7404..65b2c978c36c0f 100644
--- a/llvm/test/Transforms/InstCombine/select.ll
+++ b/llvm/test/Transforms/InstCombine/select.ll
@@ -3423,7 +3423,7 @@ define <vscale x 2 x i32> @scalable_sign_bits(<vscale x 2 x i8> %x) {
define <vscale x 2 x i1> @scalable_non_zero(<vscale x 2 x i32> %x) {
; CHECK-LABEL: @scalable_non_zero(
; CHECK-NEXT: [[A:%.*]] = or <vscale x 2 x i32> [[X:%.*]], shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> poison, i32 1, i32 0), <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT: [[CMP:%.*]] = icmp ule <vscale x 2 x i32> [[A]], shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> poison, i32 56, i64 0), <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT: [[CMP:%.*]] = icmp ule <vscale x 2 x i32> [[A]], shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> poison, i32 55, i64 0), <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer)
; CHECK-NEXT: ret <vscale x 2 x i1> [[CMP]]
;
%a = or <vscale x 2 x i32> %x, shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> poison, i32 1, i32 0), <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer)
diff --git a/llvm/test/Transforms/InstCombine/shift.ll b/llvm/test/Transforms/InstCombine/shift.ll
index d783adbe938632..2c2d6c921e55d1 100644
--- a/llvm/test/Transforms/InstCombine/shift.ll
+++ b/llvm/test/Transforms/InstCombine/shift.ll
@@ -515,8 +515,8 @@ define i32 @test32(i32 %A, i32 %B, i32 %C) {
define i1 @test33(i32 %X) {
; CHECK-LABEL: @test33(
-; CHECK-NEXT: [[I1_MASK:%.*]] = and i32 [[X:%.*]], 16777216
-; CHECK-NEXT: [[I2:%.*]] = icmp ne i32 [[I1_MASK]], 0
+; CHECK-NEXT: [[I1:%.*]] = shl i32 [[X:%.*]], 7
+; CHECK-NEXT: [[I2:%.*]] = icmp slt i32 [[I1]], -127
; CHECK-NEXT: ret i1 [[I2]]
;
%i1 = shl i32 %X, 7
@@ -526,8 +526,8 @@ define i1 @test33(i32 %X) {
define <2 x i1> @test33vec(<2 x i32> %X) {
; CHECK-LABEL: @test33vec(
-; CHECK-NEXT: [[I1_MASK:%.*]] = and <2 x i32> [[X:%.*]], <i32 16777216, i32 16777216>
-; CHECK-NEXT: [[I2:%.*]] = icmp ne <2 x i32> [[I1_MASK]], zeroinitializer
+; CHECK-NEXT: [[I1:%.*]] = shl <2 x i32> [[X:%.*]], <i32 7, i32 7>
+; CHECK-NEXT: [[I2:%.*]] = icmp slt <2 x i32> [[I1]], <i32 -127, i32 -127>
; CHECK-NEXT: ret <2 x i1> [[I2]]
;
%i1 = shl <2 x i32> %X, <i32 7, i32 7>
@@ -658,8 +658,8 @@ define i8 @test39(i32 %a0) {
; CHECK-NEXT: [[I51:%.*]] = xor i8 [[I50]], [[I5]]
; CHECK-NEXT: [[TMP0:%.*]] = lshr exact i8 [[I5]], 3
; CHECK-NEXT: [[I54:%.*]] = and i8 [[TMP0]], 16
-; CHECK-NEXT: [[I551:%.*]] = or disjoint i8 [[I54]], [[I51]]
-; CHECK-NEXT: ret i8 [[I551]]
+; CHECK-NEXT: [[I55:%.*]] = or disjoint i8 [[I54]], [[I51]]
+; CHECK-NEXT: ret i8 [[I55]]
;
entry:
%i4 = trunc i32 %a0 to i8
diff --git a/llvm/test/Transforms/InstCombine/shl-unsigned-cmp-const.ll b/llvm/test/Transforms/InstCombine/shl-unsigned-cmp-const.ll
index 25b26770c366db..9e1473a621d27b 100644
--- a/llvm/test/Transforms/InstCombine/shl-unsigned-cmp-const.ll
+++ b/llvm/test/Transforms/InstCombine/shl-unsigned-cmp-const.ll
@@ -9,8 +9,8 @@
; C2 Shift amount smaller than C1 trailing zeros.
define i1 @scalar_i8_shl_ult_const_1(i8 %x) {
; CHECK-LABEL: @scalar_i8_shl_ult_const_1(
-; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[X:%.*]], 6
-; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[TMP1]], 0
+; CHECK-NEXT: [[SHL:%.*]] = shl i8 [[X:%.*]], 5
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i8 [[SHL]], 33
; CHECK-NEXT: ret i1 [[CMP]]
;
%shl = shl i8 %x, 5
@@ -45,8 +45,8 @@ define i1 @scalar_i8_shl_ult_const_3(i8 %x) {
; C2 Shift amount smaller than C1 trailing zeros.
define i1 @scalar_i16_shl_ult_const(i16 %x) {
; CHECK-LABEL: @scalar_i16_shl_ult_const(
-; CHECK-NEXT: [[TMP1:%.*]] = and i16 [[X:%.*]], 252
-; CHECK-NEXT: [[CMP:%.*]] = icmp eq i16 [[TMP1]], 0
+; CHECK-NEXT: [[SHL:%.*]] = shl i16 [[X:%.*]], 8
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i16 [[SHL]], 769
; CHECK-NEXT: ret i1 [[CMP]]
;
%shl = shl i16 %x, 8
@@ -56,8 +56,8 @@ define i1 @scalar_i16_shl_ult_const(i16 %x) {
define i1 @scalar_i32_shl_ult_const(i32 %x) {
; CHECK-LABEL: @scalar_i32_shl_ult_const(
-; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[X:%.*]], 2097088
-; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP1]], 0
+; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[X:%.*]], 11
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[SHL]], 129025
; CHECK-NEXT: ret i1 [[CMP]]
;
%shl = shl i32 %x, 11
@@ -67,8 +67,8 @@ define i1 @scalar_i32_shl_ult_const(i32 %x) {
define i1 @scalar_i64_shl_ult_const(i64 %x) {
; CHECK-LABEL: @scalar_i64_shl_ult_const(
-; CHECK-NEXT: [[TMP1:%.*]] = and i64 [[X:%.*]], 549755813632
-; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[TMP1]], 0
+; CHECK-NEXT: [[SHL:%.*]] = shl i64 [[X:%.*]], 25
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[SHL]], 8556380161
; CHECK-NEXT: ret i1 [[CMP]]
;
%shl = shl i64 %x, 25
@@ -91,8 +91,8 @@ define i1 @scalar_i8_shl_uge_const(i8 %x) {
; Check 'ule' predicate
define i1 @scalar_i8_shl_ule_const(i8 %x) {
; CHECK-LABEL: @scalar_i8_shl_ule_const(
-; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[X:%.*]], 6
-; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[TMP1]], 0
+; CHECK-NEXT: [[SHL:%.*]] = shl i8 [[X:%.*]], 5
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i8 [[SHL]], 33
; CHECK-NEXT: ret i1 [[CMP]]
;
%shl = shl i8 %x, 5
@@ -116,8 +116,8 @@ define i1 @scalar_i8_shl_ugt_const(i8 %x) {
define <4 x i1> @vector_4xi32_shl_ult_const(<4 x i32> %x) {
; CHECK-LABEL: @vector_4xi32_shl_ult_const(
-; CHECK-NEXT: [[TMP1:%.*]] = and <4 x i32> [[X:%.*]], <i32 2097088, i32 2097088, i32 2097088, i32 2097088>
-; CHECK-NEXT: [[CMP:%.*]] = icmp eq <4 x i32> [[TMP1]], zeroinitializer
+; CHECK-NEXT: [[SHL:%.*]] = shl <4 x i32> [[X:%.*]], <i32 11, i32 11, i32 11, i32 11>
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult <4 x i32> [[SHL]], <i32 129025, i32 129025, i32 129025, i32 129025>
; CHECK-NEXT: ret <4 x i1> [[CMP]]
;
%shl = shl <4 x i32> %x, <i32 11, i32 11, i32 11, i32 11>
@@ -173,8 +173,8 @@ define <4 x i1> @vector_4xi32_shl_uge_const(<4 x i32> %x) {
; Check 'ule' predicate
define <4 x i1> @vector_4xi32_shl_ule_const(<4 x i32> %x) {
; CHECK-LABEL: @vector_4xi32_shl_ule_const(
-; CHECK-NEXT: [[TMP1:%.*]] = and <4 x i32> [[X:%.*]], <i32 2097088, i32 2097088, i32 2097088, i32 2097088>
-; CHECK-NEXT: [[CMP:%.*]] = icmp eq <4 x i32> [[TMP1]], zeroinitializer
+; CHECK-NEXT: [[SHL:%.*]] = shl <4 x i32> [[X:%.*]], <i32 11, i32 11, i32 11, i32 11>
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult <4 x i32> [[SHL]], <i32 129025, i32 129025, i32 129025, i32 129025>
; CHECK-NEXT: ret <4 x i1> [[CMP]]
;
%shl = shl <4 x i32> %x, <i32 11, i32 11, i32 11, i32 11>
@@ -201,7 +201,7 @@ define i1 @scalar_i8_shl_ult_const_extra_use_shl(i8 %x, ptr %p) {
; CHECK-LABEL: @scalar_i8_shl_ult_const_extra_use_shl(
; CHECK-NEXT: [[SHL:%.*]] = shl i8 [[X:%.*]], 5
; CHECK-NEXT: store i8 [[SHL]], ptr [[P:%.*]], align 1
-; CHECK-NEXT: [[CMP:%.*]] = icmp ult i8 [[SHL]], 64
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i8 [[SHL]], 33
; CHECK-NEXT: ret i1 [[CMP]]
;
%shl = shl i8 %x, 5
@@ -216,7 +216,7 @@ define i1 @scalar_i8_shl_ult_const_extra_use_shl(i8 %x, ptr %p) {
define i1 @scalar_i8_shl_slt_const(i8 %x) {
; CHECK-LABEL: @scalar_i8_shl_slt_const(
; CHECK-NEXT: [[SHL:%.*]] = shl i8 [[X:%.*]], 5
-; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[SHL]], 64
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[SHL]], 33
; CHECK-NEXT: ret i1 [[CMP]]
;
%shl = shl i8 %x, 5
@@ -227,7 +227,7 @@ define i1 @scalar_i8_shl_slt_const(i8 %x) {
define i1 @scalar_i8_shl_ugt_const_not_power_of_2(i8 %x) {
; CHECK-LABEL: @scalar_i8_shl_ugt_const_not_power_of_2(
; CHECK-NEXT: [[SHL:%.*]] = shl i8 [[X:%.*]], 5
-; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i8 [[SHL]], 66
+; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i8 [[SHL]], 95
; CHECK-NEXT: ret i1 [[CMP]]
;
%shl = shl i8 %x, 5
>From 91b7eb1269a04bfed17e7b8081e9dcf847f62fef Mon Sep 17 00:00:00 2001
From: Mikhail Gudim <mgudim at gmail.com>
Date: Thu, 25 Jan 2024 03:17:37 -0500
Subject: [PATCH 3/8] Put the transformation in a separate function. Added a
check to prevent breaking the "sign bit check" pattern.
---
.../InstCombine/InstCombineCompares.cpp | 166 ++++++++++--------
.../InstCombine/fold-signbit-test-power2.ll | 8 +-
2 files changed, 97 insertions(+), 77 deletions(-)
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
index c81e229e1c59bd..8fbc3277a3fa9a 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -6100,6 +6100,99 @@ bool InstCombinerImpl::replacedSelectWithOperand(SelectInst *SI,
return false;
}
+// Try to "strengthen" the RHS of compare based on known bits.
+// For example, replace `icmp ugt %x, 14` with `icmp ugt %x, 15` when
+// it is known that the two least significant bits of `%x` is zero.
+static Instruction* strengthenICmpUsingKnownBits(
+ ICmpInst &I,
+ KnownBits Op0Known,
+ KnownBits Op1Known,
+ unsigned BitWidth
+) {
+ if (!BitWidth)
+ return nullptr;
+ if (!(Op1Known.isConstant() && Op0Known.Zero.isMask()))
+ return nullptr;
+
+ Value *Op0 = I.getOperand(0);
+ ICmpInst::Predicate Pred = I.getPredicate();
+ Type *Ty = Op0->getType();
+ APInt RHSConst = Op1Known.getConstant();
+ bool TrueIfSigned = false;
+ // Don't break the SignBitCheck pattern;
+ if (InstCombiner::isSignBitCheck(Pred, RHSConst, TrueIfSigned))
+ return nullptr;
+
+ ConstantRange Op0PredRange =
+ ConstantRange::makeExactICmpRegion(Pred, RHSConst);
+ int KnownZeroMaskLength = BitWidth - Op0Known.Zero.countLeadingZeros();
+ if (KnownZeroMaskLength == 0)
+ return nullptr;
+
+ APInt PowOf2(BitWidth, 1 << KnownZeroMaskLength);
+ APInt Op0MinAccordingToPred(BitWidth, 0);
+ APInt Op0MaxAccordingToPred(BitWidth, 0);
+ APInt Op0MinRefinedByKnownBits(BitWidth, 0);
+ APInt Op0MaxRefinedByKnownBits(BitWidth, 0);
+ APInt NewLower(BitWidth, 0);
+ APInt NewUpper(BitWidth, 0);
+ bool ImprovedLower = false;
+ bool ImprovedUpper = false;
+ if (I.isSigned()) {
+ Op0MinAccordingToPred = Op0PredRange.getSignedMin();
+ Op0MaxAccordingToPred = Op0PredRange.getSignedMax();
+ // Compute the smallest number satisfying the known-bits constrained
+ // which is at greater or equal Op0MinAccordingToPred.
+ Op0MinRefinedByKnownBits =
+ PowOf2 *
+ APIntOps::RoundingSDiv(Op0MinAccordingToPred, PowOf2, APInt::Rounding::UP);
+ // Compute the largest number satisfying the known-bits constrained
+ // which is at less or equal Op0MaxAccordingToPred.
+ Op0MaxRefinedByKnownBits =
+ PowOf2 *
+ APIntOps::RoundingSDiv(Op0MaxAccordingToPred, PowOf2, APInt::Rounding::DOWN);
+ NewLower = APIntOps::smax(Op0MinRefinedByKnownBits, Op0MinAccordingToPred);
+ NewUpper = APIntOps::smin(Op0MaxRefinedByKnownBits, Op0MaxAccordingToPred);
+ ImprovedLower = NewLower.sgt(Op0MinAccordingToPred);
+ ImprovedUpper = NewUpper.slt(Op0MaxAccordingToPred);
+ } else {
+ Op0MinAccordingToPred = Op0PredRange.getUnsignedMin();
+ Op0MaxAccordingToPred = Op0PredRange.getUnsignedMax();
+ Op0MinRefinedByKnownBits =
+ PowOf2 *
+ APIntOps::RoundingUDiv(Op0MinAccordingToPred, PowOf2, APInt::Rounding::UP);
+ Op0MaxRefinedByKnownBits =
+ PowOf2 *
+ APIntOps::RoundingUDiv(Op0MaxAccordingToPred, PowOf2, APInt::Rounding::DOWN);
+ NewLower = APIntOps::umax(Op0MinRefinedByKnownBits, Op0MinAccordingToPred);
+ NewUpper = APIntOps::umin(Op0MaxRefinedByKnownBits, Op0MaxAccordingToPred);
+ ImprovedLower = NewLower.ugt(Op0MinAccordingToPred);
+ ImprovedUpper = NewUpper.ult(Op0MaxAccordingToPred);
+ }
+
+ // Non-strict inequalities should have been canonicalized to strict ones
+ // by now.
+ switch (Pred) {
+ default:
+ break;
+ case ICmpInst::ICMP_ULT:
+ case ICmpInst::ICMP_SLT: {
+ if (ImprovedUpper)
+ return new ICmpInst(Pred, Op0,
+ ConstantInt::get(Ty, NewUpper + 1));
+ break;
+ }
+ case ICmpInst::ICMP_UGT:
+ case ICmpInst::ICMP_SGT: {
+ if (ImprovedLower)
+ return new ICmpInst(Pred, Op0,
+ ConstantInt::get(Ty, NewLower - 1));
+ break;
+ }
+ }
+ return nullptr;
+}
+
/// Try to fold the comparison based on range information we can get by checking
/// whether bits are known to be zero or one in the inputs.
Instruction *InstCombinerImpl::foldICmpUsingKnownBits(ICmpInst &I) {
@@ -6357,78 +6450,9 @@ Instruction *InstCombinerImpl::foldICmpUsingKnownBits(ICmpInst &I) {
(Op0Known.One.isNegative() && Op1Known.One.isNegative())))
return new ICmpInst(I.getUnsignedPredicate(), Op0, Op1);
- // Try to "strengthen" the RHS of compare based on known bits.
- // For example, replace `icmp ugt %x, 14` with `icmp ugt %x, 15` when
- // it is known that the two least significant bits of `%x` is zero.
- if (Op1Known.isConstant() && Op0Known.Zero.isMask()) {
- APInt RHSConst = Op1Known.getConstant();
- ConstantRange Op0PredRange =
- ConstantRange::makeExactICmpRegion(Pred, RHSConst);
- int KnownZeroMaskLength = BitWidth - Op0Known.Zero.countLeadingZeros();
- if (KnownZeroMaskLength > 0) {
- APInt PowOf2(BitWidth, 1 << KnownZeroMaskLength);
- APInt Op0PredMin(BitWidth, 0);
- APInt Op0PredMax(BitWidth, 0);
- APInt Op0MinRefinedByKnownBits(BitWidth, 0);
- APInt Op0MaxRefinedByKnownBits(BitWidth, 0);
- APInt NewLower(BitWidth, 0);
- APInt NewUpper(BitWidth, 0);
- bool ImprovedLower = false;
- bool ImprovedUpper = false;
- if (I.isSigned()) {
- Op0PredMin = Op0PredRange.getSignedMin();
- Op0PredMax = Op0PredRange.getSignedMax();
- // Compute the smallest number satisfying the known-bits constrained
- // which is at greater or equal Op0PredMin.
- Op0MinRefinedByKnownBits =
- PowOf2 *
- APIntOps::RoundingSDiv(Op0PredMin, PowOf2, APInt::Rounding::UP);
- // Compute the largest number satisfying the known-bits constrained
- // which is at less or equal Op0PredMax.
- Op0MaxRefinedByKnownBits =
- PowOf2 *
- APIntOps::RoundingSDiv(Op0PredMax, PowOf2, APInt::Rounding::DOWN);
- NewLower = APIntOps::smax(Op0MinRefinedByKnownBits, Op0PredMin);
- NewUpper = APIntOps::smin(Op0MaxRefinedByKnownBits, Op0PredMax);
- ImprovedLower = NewLower.sgt(Op0PredMin);
- ImprovedUpper = NewUpper.slt(Op0PredMax);
- } else {
- Op0PredMin = Op0PredRange.getUnsignedMin();
- Op0PredMax = Op0PredRange.getUnsignedMax();
- Op0MinRefinedByKnownBits =
- PowOf2 *
- APIntOps::RoundingUDiv(Op0PredMin, PowOf2, APInt::Rounding::UP);
- Op0MaxRefinedByKnownBits =
- PowOf2 *
- APIntOps::RoundingUDiv(Op0PredMax, PowOf2, APInt::Rounding::DOWN);
- NewLower = APIntOps::umax(Op0MinRefinedByKnownBits, Op0PredMin);
- NewUpper = APIntOps::umin(Op0MaxRefinedByKnownBits, Op0PredMax);
- ImprovedLower = NewLower.ugt(Op0PredMin);
- ImprovedUpper = NewUpper.ult(Op0PredMax);
- }
+ if (Instruction * Res = strengthenICmpUsingKnownBits(I, Op0Known, Op1Known, BitWidth))
+ return Res;
- // Non-strict inequalities should have been canonicalized to strict ones
- // by now.
- switch (Pred) {
- default:
- break;
- case ICmpInst::ICMP_ULT:
- case ICmpInst::ICMP_SLT: {
- if (ImprovedUpper)
- return new ICmpInst(Pred, Op0,
- ConstantInt::get(Op1->getType(), NewUpper + 1));
- break;
- }
- case ICmpInst::ICMP_UGT:
- case ICmpInst::ICMP_SGT: {
- if (ImprovedLower)
- return new ICmpInst(Pred, Op0,
- ConstantInt::get(Op1->getType(), NewLower - 1));
- break;
- }
- }
- }
- }
return nullptr;
}
diff --git a/llvm/test/Transforms/InstCombine/fold-signbit-test-power2.ll b/llvm/test/Transforms/InstCombine/fold-signbit-test-power2.ll
index fc176f00486c9c..f5024664f58c3e 100644
--- a/llvm/test/Transforms/InstCombine/fold-signbit-test-power2.ll
+++ b/llvm/test/Transforms/InstCombine/fold-signbit-test-power2.ll
@@ -26,9 +26,7 @@ define i1 @pow2_or_zero_is_negative(i8 %x) {
define i1 @pow2_or_zero_is_negative_commute(i8 %A) {
; CHECK-LABEL: @pow2_or_zero_is_negative_commute(
; CHECK-NEXT: [[X:%.*]] = mul i8 [[A:%.*]], 42
-; CHECK-NEXT: [[NEG:%.*]] = sub i8 0, [[X]]
-; CHECK-NEXT: [[POW2_OR_ZERO:%.*]] = and i8 [[X]], [[NEG]]
-; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[POW2_OR_ZERO]], -1
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[X]], -128
; CHECK-NEXT: ret i1 [[CMP]]
;
%x = mul i8 42, %A ; thwart complexity-based canonicalization
@@ -56,9 +54,7 @@ define <2 x i1> @pow2_or_zero_is_negative_vec(<2 x i8> %x) {
define <2 x i1> @pow2_or_zero_is_negative_vec_commute(<2 x i8> %A) {
; CHECK-LABEL: @pow2_or_zero_is_negative_vec_commute(
; CHECK-NEXT: [[X:%.*]] = mul <2 x i8> [[A:%.*]], <i8 42, i8 42>
-; CHECK-NEXT: [[NEG:%.*]] = sub <2 x i8> zeroinitializer, [[X]]
-; CHECK-NEXT: [[POW2_OR_ZERO:%.*]] = and <2 x i8> [[X]], [[NEG]]
-; CHECK-NEXT: [[CMP:%.*]] = icmp slt <2 x i8> [[POW2_OR_ZERO]], <i8 -1, i8 -1>
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq <2 x i8> [[X]], <i8 -128, i8 -128>
; CHECK-NEXT: ret <2 x i1> [[CMP]]
;
%x = mul <2 x i8> <i8 42, i8 42>, %A ; thwart complexity-based canonicalization
>From be9c0c715550793b4cc57925377763dd0a661a26 Mon Sep 17 00:00:00 2001
From: Mikhail Gudim <mgudim at gmail.com>
Date: Thu, 25 Jan 2024 03:23:54 -0500
Subject: [PATCH 4/8] updated
test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll
---
.../sve-interleaved-masked-accesses.ll | 280 +++++++++---------
1 file changed, 143 insertions(+), 137 deletions(-)
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll
index 3ba91360850e7f..65ee0734ebafc3 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll
@@ -28,45 +28,45 @@ define dso_local void @masked_strided1(ptr noalias nocapture readonly %p, ptr no
; SCALAR_TAIL_FOLDING-NEXT: [[CONV:%.*]] = zext i8 [[GUARD]] to i32
; SCALAR_TAIL_FOLDING-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
; SCALAR_TAIL_FOLDING-NEXT: [[TMP1:%.*]] = shl i32 [[TMP0]], 4
-; SCALAR_TAIL_FOLDING-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i32 [[TMP1]], 1024
+; SCALAR_TAIL_FOLDING-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i32 [[TMP1]], 1039
; SCALAR_TAIL_FOLDING-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; SCALAR_TAIL_FOLDING: vector.ph:
; SCALAR_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
; SCALAR_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = shl i32 [[TMP2]], 4
; SCALAR_TAIL_FOLDING-NEXT: [[N_MOD_VF:%.*]] = urem i32 1024, [[TMP3]]
; SCALAR_TAIL_FOLDING-NEXT: [[N_VEC:%.*]] = sub nuw nsw i32 1024, [[N_MOD_VF]]
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP19:%.*]] = call i32 @llvm.vscale.i32()
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP20:%.*]] = shl i32 [[TMP19]], 4
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = call i32 @llvm.vscale.i32()
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = shl i32 [[TMP5]], 4
-; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP6]], i64 0
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call i32 @llvm.vscale.i32()
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = shl i32 [[TMP4]], 4
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = call i32 @llvm.vscale.i32()
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = shl i32 [[TMP7]], 4
+; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP8]], i64 0
; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[DOTSPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0
; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
; SCALAR_TAIL_FOLDING-NEXT: br label [[VECTOR_BODY:%.*]]
; SCALAR_TAIL_FOLDING: vector.body:
; SCALAR_TAIL_FOLDING-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP4]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = shl i32 [[INDEX]], 1
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP9]]
-; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 32 x i1> @llvm.experimental.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
-; SCALAR_TAIL_FOLDING-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.masked.load.nxv32i8.p0(ptr [[TMP10]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK]], <vscale x 32 x i8> poison)
+; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP6]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = shl i32 [[INDEX]], 1
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = sext i32 [[TMP10]] to i64
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP11]]
+; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 32 x i1> @llvm.experimental.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP9]], <vscale x 16 x i1> [[TMP9]])
+; SCALAR_TAIL_FOLDING-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.masked.load.nxv32i8.p0(ptr [[TMP12]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK]], <vscale x 32 x i8> poison)
; SCALAR_TAIL_FOLDING-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.experimental.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[WIDE_MASKED_VEC]])
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 0
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 1
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = or disjoint i32 [[TMP8]], 1
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[TMP11]], <vscale x 16 x i8> [[TMP12]])
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP14]]
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = sext i32 [[TMP13]] to i64
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP16]]
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP17]], i64 -1
-; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.experimental.vector.interleave2.nxv32i8(<vscale x 16 x i8> [[TMP14]], <vscale x 16 x i8> [[TMP15]])
-; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK1:%.*]] = call <vscale x 32 x i1> @llvm.experimental.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
-; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv32i8.p0(<vscale x 32 x i8> [[INTERLEAVED_VEC]], ptr [[TMP18]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK1]])
-; SCALAR_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP20]]
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 0
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 1
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = or disjoint i32 [[TMP10]], 1
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[TMP13]], <vscale x 16 x i8> [[TMP14]])
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP16]]
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = sext i32 [[TMP15]] to i64
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP18]]
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[TMP19]], i64 -1
+; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.experimental.vector.interleave2.nxv32i8(<vscale x 16 x i8> [[TMP16]], <vscale x 16 x i8> [[TMP17]])
+; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK1:%.*]] = call <vscale x 32 x i1> @llvm.experimental.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP9]], <vscale x 16 x i1> [[TMP9]])
+; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv32i8.p0(<vscale x 32 x i8> [[INTERLEAVED_VEC]], ptr [[TMP20]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK1]])
+; SCALAR_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP5]]
; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[DOTSPLAT]]
; SCALAR_TAIL_FOLDING-NEXT: [[TMP21:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; SCALAR_TAIL_FOLDING-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -111,16 +111,18 @@ define dso_local void @masked_strided1(ptr noalias nocapture readonly %p, ptr no
; PREDICATED_TAIL_FOLDING-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; PREDICATED_TAIL_FOLDING: vector.ph:
; PREDICATED_TAIL_FOLDING-NEXT: [[CONV:%.*]] = zext i8 [[GUARD]] to i32
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP19:%.*]] = call i32 @llvm.vscale.i32()
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP20:%.*]] = shl i32 [[TMP19]], 4
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP1:%.*]] = shl i32 [[TMP0]], 4
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 @llvm.usub.sat.i32(i32 1024, i32 [[TMP1]])
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = shl i32 [[TMP2]], 4
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = sub i32 1024, [[TMP3]]
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP3]], 1009
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i32 [[TMP4]], i32 0
; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 0, i32 1024)
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call i32 @llvm.vscale.i32()
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = shl i32 [[TMP4]], 4
-; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP5]], i64 0
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = call i32 @llvm.vscale.i32()
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = shl i32 [[TMP8]], 4
+; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP9]], i64 0
; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[DOTSPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0
; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
@@ -128,31 +130,31 @@ define dso_local void @masked_strided1(ptr noalias nocapture readonly %p, ptr no
; PREDICATED_TAIL_FOLDING: vector.body:
; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP3]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i1> zeroinitializer
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = shl i32 [[INDEX]], 1
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = sext i32 [[TMP7]] to i64
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP8]]
-; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 32 x i1> @llvm.experimental.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP10]], <vscale x 16 x i1> [[TMP10]])
-; PREDICATED_TAIL_FOLDING-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.masked.load.nxv32i8.p0(ptr [[TMP9]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK]], <vscale x 32 x i8> poison)
+; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP7]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = shl i32 [[INDEX]], 1
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = sext i32 [[TMP11]] to i64
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP12]]
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP10]], <vscale x 16 x i1> zeroinitializer
+; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 32 x i1> @llvm.experimental.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP14]], <vscale x 16 x i1> [[TMP14]])
+; PREDICATED_TAIL_FOLDING-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.masked.load.nxv32i8.p0(ptr [[TMP13]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK]], <vscale x 32 x i8> poison)
; PREDICATED_TAIL_FOLDING-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.experimental.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[WIDE_MASKED_VEC]])
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 0
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 1
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = or disjoint i32 [[TMP7]], 1
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[TMP11]], <vscale x 16 x i8> [[TMP12]])
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP14]]
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = sext i32 [[TMP13]] to i64
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP16]]
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP17]], i64 -1
-; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.experimental.vector.interleave2.nxv32i8(<vscale x 16 x i8> [[TMP14]], <vscale x 16 x i8> [[TMP15]])
-; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK1:%.*]] = call <vscale x 32 x i1> @llvm.experimental.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP10]], <vscale x 16 x i1> [[TMP10]])
-; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv32i8.p0(<vscale x 32 x i8> [[INTERLEAVED_VEC]], ptr [[TMP18]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK1]])
-; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP20]]
-; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP2]])
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 0
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 1
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = or disjoint i32 [[TMP11]], 1
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[TMP15]], <vscale x 16 x i8> [[TMP16]])
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP19:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP18]]
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP20:%.*]] = sext i32 [[TMP17]] to i64
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP20]]
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[TMP21]], i64 -1
+; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.experimental.vector.interleave2.nxv32i8(<vscale x 16 x i8> [[TMP18]], <vscale x 16 x i8> [[TMP19]])
+; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK1:%.*]] = call <vscale x 32 x i1> @llvm.experimental.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP14]], <vscale x 16 x i1> [[TMP14]])
+; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv32i8.p0(<vscale x 32 x i8> [[INTERLEAVED_VEC]], ptr [[TMP22]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK1]])
+; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]]
+; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP6]])
; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[DOTSPLAT]]
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP21:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
-; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP21]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP0:![0-9]+]]
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP23:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP23]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP0:![0-9]+]]
; PREDICATED_TAIL_FOLDING: middle.block:
; PREDICATED_TAIL_FOLDING-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; PREDICATED_TAIL_FOLDING: scalar.ph:
@@ -220,36 +222,36 @@ define dso_local void @masked_strided2(ptr noalias nocapture readnone %p, ptr no
; SCALAR_TAIL_FOLDING-NEXT: [[CONV:%.*]] = zext i8 [[GUARD]] to i32
; SCALAR_TAIL_FOLDING-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
; SCALAR_TAIL_FOLDING-NEXT: [[TMP1:%.*]] = shl i32 [[TMP0]], 4
-; SCALAR_TAIL_FOLDING-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i32 [[TMP1]], 1024
+; SCALAR_TAIL_FOLDING-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i32 [[TMP1]], 1039
; SCALAR_TAIL_FOLDING-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; SCALAR_TAIL_FOLDING: vector.ph:
; SCALAR_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
; SCALAR_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = shl i32 [[TMP2]], 4
; SCALAR_TAIL_FOLDING-NEXT: [[N_MOD_VF:%.*]] = urem i32 1024, [[TMP3]]
; SCALAR_TAIL_FOLDING-NEXT: [[N_VEC:%.*]] = sub nuw nsw i32 1024, [[N_MOD_VF]]
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = shl i32 [[TMP14]], 4
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = call i32 @llvm.vscale.i32()
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = shl i32 [[TMP5]], 4
-; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP6]], i64 0
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call i32 @llvm.vscale.i32()
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = shl i32 [[TMP4]], 4
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = call i32 @llvm.vscale.i32()
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = shl i32 [[TMP7]], 4
+; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP8]], i64 0
; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[DOTSPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0
; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
; SCALAR_TAIL_FOLDING-NEXT: br label [[VECTOR_BODY:%.*]]
; SCALAR_TAIL_FOLDING: vector.body:
; SCALAR_TAIL_FOLDING-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP4]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = shl nuw nsw <vscale x 16 x i32> [[VEC_IND]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 1, i64 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64>
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP8]]
-; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 1, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x ptr> [[TMP9]], i32 1, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer))
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = or disjoint <vscale x 16 x i32> [[TMP7]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 1, i64 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = zext nneg <vscale x 16 x i32> [[TMP11]] to <vscale x 16 x i64>
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP12]]
-; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 2, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x ptr> [[TMP13]], i32 1, <vscale x 16 x i1> [[TMP10]])
-; SCALAR_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP15]]
+; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP6]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = shl nuw nsw <vscale x 16 x i32> [[VEC_IND]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 1, i64 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = zext nneg <vscale x 16 x i32> [[TMP9]] to <vscale x 16 x i64>
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP10]]
+; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 1, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x ptr> [[TMP11]], i32 1, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer))
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = or disjoint <vscale x 16 x i32> [[TMP9]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 1, i64 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = zext nneg <vscale x 16 x i32> [[TMP13]] to <vscale x 16 x i64>
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP14]]
+; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 2, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x ptr> [[TMP15]], i32 1, <vscale x 16 x i1> [[TMP12]])
+; SCALAR_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP5]]
; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[DOTSPLAT]]
; SCALAR_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; SCALAR_TAIL_FOLDING-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
@@ -286,16 +288,18 @@ define dso_local void @masked_strided2(ptr noalias nocapture readnone %p, ptr no
; PREDICATED_TAIL_FOLDING-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; PREDICATED_TAIL_FOLDING: vector.ph:
; PREDICATED_TAIL_FOLDING-NEXT: [[CONV:%.*]] = zext i8 [[GUARD]] to i32
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = shl i32 [[TMP14]], 4
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP1:%.*]] = shl i32 [[TMP0]], 4
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 @llvm.usub.sat.i32(i32 1024, i32 [[TMP1]])
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = shl i32 [[TMP2]], 4
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = sub i32 1024, [[TMP3]]
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP3]], 1009
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i32 [[TMP4]], i32 0
; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 0, i32 1024)
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call i32 @llvm.vscale.i32()
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = shl i32 [[TMP4]], 4
-; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP5]], i64 0
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = call i32 @llvm.vscale.i32()
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = shl i32 [[TMP8]], 4
+; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP9]], i64 0
; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[DOTSPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0
; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
@@ -303,22 +307,22 @@ define dso_local void @masked_strided2(ptr noalias nocapture readnone %p, ptr no
; PREDICATED_TAIL_FOLDING: vector.body:
; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP3]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = shl nuw nsw <vscale x 16 x i32> [[VEC_IND]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 1, i64 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = zext nneg <vscale x 16 x i32> [[TMP6]] to <vscale x 16 x i64>
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP7]]
-; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 1, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x ptr> [[TMP8]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP9]], <vscale x 16 x i1> zeroinitializer
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = or disjoint <vscale x 16 x i32> [[TMP6]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 1, i64 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
+; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP7]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = shl nuw nsw <vscale x 16 x i32> [[VEC_IND]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 1, i64 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP11]]
-; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 2, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x ptr> [[TMP12]], i32 1, <vscale x 16 x i1> [[TMP13]])
-; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP15]]
-; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP2]])
+; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 1, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x ptr> [[TMP12]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = or disjoint <vscale x 16 x i32> [[TMP10]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 1, i64 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = zext nneg <vscale x 16 x i32> [[TMP14]] to <vscale x 16 x i64>
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP15]]
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP13]], <vscale x 16 x i1> zeroinitializer
+; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 2, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x ptr> [[TMP16]], i32 1, <vscale x 16 x i1> [[TMP17]])
+; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]]
+; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP6]])
; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[DOTSPLAT]]
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
-; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP16]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP4:![0-9]+]]
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP18]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP4:![0-9]+]]
; PREDICATED_TAIL_FOLDING: middle.block:
; PREDICATED_TAIL_FOLDING-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; PREDICATED_TAIL_FOLDING: scalar.ph:
@@ -382,19 +386,19 @@ define dso_local void @masked_strided3(ptr noalias nocapture readnone %p, ptr no
; SCALAR_TAIL_FOLDING-NEXT: [[CONV3:%.*]] = zext i8 [[GUARD2]] to i32
; SCALAR_TAIL_FOLDING-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
; SCALAR_TAIL_FOLDING-NEXT: [[TMP1:%.*]] = shl i32 [[TMP0]], 4
-; SCALAR_TAIL_FOLDING-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i32 [[TMP1]], 1024
+; SCALAR_TAIL_FOLDING-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i32 [[TMP1]], 1039
; SCALAR_TAIL_FOLDING-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; SCALAR_TAIL_FOLDING: vector.ph:
; SCALAR_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
; SCALAR_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = shl i32 [[TMP2]], 4
; SCALAR_TAIL_FOLDING-NEXT: [[N_MOD_VF:%.*]] = urem i32 1024, [[TMP3]]
; SCALAR_TAIL_FOLDING-NEXT: [[N_VEC:%.*]] = sub nuw nsw i32 1024, [[N_MOD_VF]]
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = call i32 @llvm.vscale.i32()
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = shl i32 [[TMP15]], 4
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = call i32 @llvm.vscale.i32()
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = shl i32 [[TMP5]], 4
-; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP6]], i64 0
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call i32 @llvm.vscale.i32()
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = shl i32 [[TMP4]], 4
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = call i32 @llvm.vscale.i32()
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = shl i32 [[TMP7]], 4
+; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP8]], i64 0
; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[DOTSPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0
; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
@@ -403,18 +407,18 @@ define dso_local void @masked_strided3(ptr noalias nocapture readnone %p, ptr no
; SCALAR_TAIL_FOLDING-NEXT: br label [[VECTOR_BODY:%.*]]
; SCALAR_TAIL_FOLDING: vector.body:
; SCALAR_TAIL_FOLDING-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP4]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = shl nuw nsw <vscale x 16 x i32> [[VEC_IND]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 1, i64 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64>
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP9]]
-; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 1, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x ptr> [[TMP10]], i32 1, <vscale x 16 x i1> [[TMP8]])
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = or disjoint <vscale x 16 x i32> [[TMP7]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 1, i64 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = zext nneg <vscale x 16 x i32> [[TMP12]] to <vscale x 16 x i64>
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP13]]
-; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 2, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x ptr> [[TMP14]], i32 1, <vscale x 16 x i1> [[TMP11]])
-; SCALAR_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP16]]
+; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP6]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = shl nuw nsw <vscale x 16 x i32> [[VEC_IND]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 1, i64 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = zext nneg <vscale x 16 x i32> [[TMP9]] to <vscale x 16 x i64>
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP11]]
+; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 1, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x ptr> [[TMP12]], i32 1, <vscale x 16 x i1> [[TMP10]])
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = or disjoint <vscale x 16 x i32> [[TMP9]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 1, i64 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = zext nneg <vscale x 16 x i32> [[TMP14]] to <vscale x 16 x i64>
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP15]]
+; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 2, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x ptr> [[TMP16]], i32 1, <vscale x 16 x i1> [[TMP13]])
+; SCALAR_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP5]]
; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[DOTSPLAT]]
; SCALAR_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; SCALAR_TAIL_FOLDING-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
@@ -457,16 +461,18 @@ define dso_local void @masked_strided3(ptr noalias nocapture readnone %p, ptr no
; PREDICATED_TAIL_FOLDING: vector.ph:
; PREDICATED_TAIL_FOLDING-NEXT: [[CONV3:%.*]] = zext i8 [[GUARD2]] to i32
; PREDICATED_TAIL_FOLDING-NEXT: [[CONV:%.*]] = zext i8 [[GUARD1]] to i32
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = call i32 @llvm.vscale.i32()
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = shl i32 [[TMP16]], 4
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP1:%.*]] = shl i32 [[TMP0]], 4
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 @llvm.usub.sat.i32(i32 1024, i32 [[TMP1]])
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = shl i32 [[TMP2]], 4
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = sub i32 1024, [[TMP3]]
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP3]], 1009
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i32 [[TMP4]], i32 0
; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 0, i32 1024)
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call i32 @llvm.vscale.i32()
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = shl i32 [[TMP4]], 4
-; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP5]], i64 0
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = call i32 @llvm.vscale.i32()
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = shl i32 [[TMP8]], 4
+; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP9]], i64 0
; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[DOTSPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0
; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
@@ -476,24 +482,24 @@ define dso_local void @masked_strided3(ptr noalias nocapture readnone %p, ptr no
; PREDICATED_TAIL_FOLDING: vector.body:
; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP3]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = shl nuw nsw <vscale x 16 x i32> [[VEC_IND]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 1, i64 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> zeroinitializer
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = zext nneg <vscale x 16 x i32> [[TMP6]] to <vscale x 16 x i64>
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP8]]
-; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 1, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x ptr> [[TMP9]], i32 1, <vscale x 16 x i1> [[TMP10]])
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP11]], <vscale x 16 x i1> zeroinitializer
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = or disjoint <vscale x 16 x i32> [[TMP6]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 1, i64 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = zext nneg <vscale x 16 x i32> [[TMP12]] to <vscale x 16 x i64>
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP13]]
-; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 2, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x ptr> [[TMP14]], i32 1, <vscale x 16 x i1> [[TMP15]])
-; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP17]]
-; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP2]])
+; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP7]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = shl nuw nsw <vscale x 16 x i32> [[VEC_IND]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 1, i64 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP12]]
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP11]], <vscale x 16 x i1> zeroinitializer
+; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 1, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x ptr> [[TMP13]], i32 1, <vscale x 16 x i1> [[TMP14]])
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = or disjoint <vscale x 16 x i32> [[TMP10]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 1, i64 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = zext nneg <vscale x 16 x i32> [[TMP16]] to <vscale x 16 x i64>
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP17]]
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP19:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP15]], <vscale x 16 x i1> zeroinitializer
+; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 2, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x ptr> [[TMP18]], i32 1, <vscale x 16 x i1> [[TMP19]])
+; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]]
+; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP6]])
; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[DOTSPLAT]]
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
-; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP18]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP6:![0-9]+]]
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP20:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP20]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP6:![0-9]+]]
; PREDICATED_TAIL_FOLDING: middle.block:
; PREDICATED_TAIL_FOLDING-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; PREDICATED_TAIL_FOLDING: scalar.ph:
>From 4488bcb6c5cbd74b71e60f9e4928635358b5c009 Mon Sep 17 00:00:00 2001
From: Mikhail Gudim <mgudim at gmail.com>
Date: Thu, 25 Jan 2024 03:27:58 -0500
Subject: [PATCH 5/8] Format.
---
.../InstCombine/InstCombineCompares.cpp | 35 +++++++++----------
1 file changed, 16 insertions(+), 19 deletions(-)
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
index 8fbc3277a3fa9a..cc09b505b5b41e 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -6103,12 +6103,10 @@ bool InstCombinerImpl::replacedSelectWithOperand(SelectInst *SI,
// Try to "strengthen" the RHS of compare based on known bits.
// For example, replace `icmp ugt %x, 14` with `icmp ugt %x, 15` when
// it is known that the two least significant bits of `%x` are zero.
-static Instruction* strengthenICmpUsingKnownBits(
- ICmpInst &I,
- KnownBits Op0Known,
- KnownBits Op1Known,
- unsigned BitWidth
-) {
+static Instruction *strengthenICmpUsingKnownBits(ICmpInst &I,
+ KnownBits Op0Known,
+ KnownBits Op1Known,
+ unsigned BitWidth) {
if (!BitWidth)
return nullptr;
if (!(Op1Known.isConstant() && Op0Known.Zero.isMask()))
@@ -6144,13 +6142,13 @@ static Instruction* strengthenICmpUsingKnownBits(
// Compute the smallest number satisfying the known-bits constraint
// which is greater than or equal to Op0MinAccordingToPred.
Op0MinRefinedByKnownBits =
- PowOf2 *
- APIntOps::RoundingSDiv(Op0MinAccordingToPred, PowOf2, APInt::Rounding::UP);
+ PowOf2 * APIntOps::RoundingSDiv(Op0MinAccordingToPred, PowOf2,
+ APInt::Rounding::UP);
// Compute the largest number satisfying the known-bits constraint
// which is less than or equal to Op0MaxAccordingToPred.
Op0MaxRefinedByKnownBits =
- PowOf2 *
- APIntOps::RoundingSDiv(Op0MaxAccordingToPred, PowOf2, APInt::Rounding::DOWN);
+ PowOf2 * APIntOps::RoundingSDiv(Op0MaxAccordingToPred, PowOf2,
+ APInt::Rounding::DOWN);
NewLower = APIntOps::smax(Op0MinRefinedByKnownBits, Op0MinAccordingToPred);
NewUpper = APIntOps::smin(Op0MaxRefinedByKnownBits, Op0MaxAccordingToPred);
ImprovedLower = NewLower.sgt(Op0MinAccordingToPred);
@@ -6159,11 +6157,11 @@ static Instruction* strengthenICmpUsingKnownBits(
Op0MinAccordingToPred = Op0PredRange.getUnsignedMin();
Op0MaxAccordingToPred = Op0PredRange.getUnsignedMax();
Op0MinRefinedByKnownBits =
- PowOf2 *
- APIntOps::RoundingUDiv(Op0MinAccordingToPred, PowOf2, APInt::Rounding::UP);
+ PowOf2 * APIntOps::RoundingUDiv(Op0MinAccordingToPred, PowOf2,
+ APInt::Rounding::UP);
Op0MaxRefinedByKnownBits =
- PowOf2 *
- APIntOps::RoundingUDiv(Op0MaxAccordingToPred, PowOf2, APInt::Rounding::DOWN);
+ PowOf2 * APIntOps::RoundingUDiv(Op0MaxAccordingToPred, PowOf2,
+ APInt::Rounding::DOWN);
NewLower = APIntOps::umax(Op0MinRefinedByKnownBits, Op0MinAccordingToPred);
NewUpper = APIntOps::umin(Op0MaxRefinedByKnownBits, Op0MaxAccordingToPred);
ImprovedLower = NewLower.ugt(Op0MinAccordingToPred);
@@ -6178,15 +6176,13 @@ static Instruction* strengthenICmpUsingKnownBits(
case ICmpInst::ICMP_ULT:
case ICmpInst::ICMP_SLT: {
if (ImprovedUpper)
- return new ICmpInst(Pred, Op0,
- ConstantInt::get(Ty, NewUpper + 1));
+ return new ICmpInst(Pred, Op0, ConstantInt::get(Ty, NewUpper + 1));
break;
}
case ICmpInst::ICMP_UGT:
case ICmpInst::ICMP_SGT: {
if (ImprovedLower)
- return new ICmpInst(Pred, Op0,
- ConstantInt::get(Ty, NewLower - 1));
+ return new ICmpInst(Pred, Op0, ConstantInt::get(Ty, NewLower - 1));
break;
}
}
@@ -6450,7 +6446,8 @@ Instruction *InstCombinerImpl::foldICmpUsingKnownBits(ICmpInst &I) {
(Op0Known.One.isNegative() && Op1Known.One.isNegative())))
return new ICmpInst(I.getUnsignedPredicate(), Op0, Op1);
- if (Instruction * Res = strengthenICmpUsingKnownBits(I, Op0Known, Op1Known, BitWidth))
+ if (Instruction *Res =
+ strengthenICmpUsingKnownBits(I, Op0Known, Op1Known, BitWidth))
return Res;
return nullptr;
>From 70fc18cdc2bc240ee89cfbaef7eb8f9fd222d6b6 Mon Sep 17 00:00:00 2001
From: Mikhail Gudim <mgudim at gmail.com>
Date: Fri, 26 Jan 2024 23:16:21 -0500
Subject: [PATCH 6/8] Updated tests.
---
.../InstCombine/2007-10-31-RangeCrash.ll | 2 +-
llvm/test/Transforms/InstCombine/shift.ll | 8 ++--
.../sve-interleaved-masked-accesses.ll | 44 +++++++++----------
3 files changed, 27 insertions(+), 27 deletions(-)
diff --git a/llvm/test/Transforms/InstCombine/2007-10-31-RangeCrash.ll b/llvm/test/Transforms/InstCombine/2007-10-31-RangeCrash.ll
index d4ebeba0c86ea6..8b472aa5af0902 100644
--- a/llvm/test/Transforms/InstCombine/2007-10-31-RangeCrash.ll
+++ b/llvm/test/Transforms/InstCombine/2007-10-31-RangeCrash.ll
@@ -17,7 +17,7 @@ define i32 @test() {
; CHECK-NEXT: br label [[BB51_I_I]]
; CHECK: bb51.i.i:
; CHECK-NEXT: [[X_0_I_I]] = phi i32 [ [[TMP50_I_I]], [[BB27_I_I:%.*]] ], [ 0, [[BB_I]] ]
-; CHECK-NEXT: [[TMP54_I_I:%.*]] = icmp slt i32 [[X_0_I_I]], -1
+; CHECK-NEXT: [[TMP54_I_I:%.*]] = icmp slt i32 [[X_0_I_I]], 0
; CHECK-NEXT: br i1 [[TMP54_I_I]], label [[BB27_I_I]], label [[BB57_I_I:%.*]]
; CHECK: bb57.i.i:
; CHECK-NEXT: ret i32 0
diff --git a/llvm/test/Transforms/InstCombine/shift.ll b/llvm/test/Transforms/InstCombine/shift.ll
index 2c2d6c921e55d1..dda06351482fbf 100644
--- a/llvm/test/Transforms/InstCombine/shift.ll
+++ b/llvm/test/Transforms/InstCombine/shift.ll
@@ -515,8 +515,8 @@ define i32 @test32(i32 %A, i32 %B, i32 %C) {
define i1 @test33(i32 %X) {
; CHECK-LABEL: @test33(
-; CHECK-NEXT: [[I1:%.*]] = shl i32 [[X:%.*]], 7
-; CHECK-NEXT: [[I2:%.*]] = icmp slt i32 [[I1]], -127
+; CHECK-NEXT: [[I1_MASK:%.*]] = and i32 [[X:%.*]], 16777216
+; CHECK-NEXT: [[I2:%.*]] = icmp ne i32 [[I1_MASK]], 0
; CHECK-NEXT: ret i1 [[I2]]
;
%i1 = shl i32 %X, 7
@@ -526,8 +526,8 @@ define i1 @test33(i32 %X) {
define <2 x i1> @test33vec(<2 x i32> %X) {
; CHECK-LABEL: @test33vec(
-; CHECK-NEXT: [[I1:%.*]] = shl <2 x i32> [[X:%.*]], <i32 7, i32 7>
-; CHECK-NEXT: [[I2:%.*]] = icmp slt <2 x i32> [[I1]], <i32 -127, i32 -127>
+; CHECK-NEXT: [[I1_MASK:%.*]] = and <2 x i32> [[X:%.*]], <i32 16777216, i32 16777216>
+; CHECK-NEXT: [[I2:%.*]] = icmp ne <2 x i32> [[I1_MASK]], zeroinitializer
; CHECK-NEXT: ret <2 x i1> [[I2]]
;
%i1 = shl <2 x i32> %X, <i32 7, i32 7>
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll
index 65ee0734ebafc3..082d8146bfce28 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll
@@ -132,23 +132,23 @@ define dso_local void @masked_strided1(ptr noalias nocapture readonly %p, ptr no
; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP7]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = shl i32 [[INDEX]], 1
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = sext i32 [[TMP11]] to i64
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP12]]
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP10]], <vscale x 16 x i1> zeroinitializer
-; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 32 x i1> @llvm.experimental.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP14]], <vscale x 16 x i1> [[TMP14]])
-; PREDICATED_TAIL_FOLDING-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.masked.load.nxv32i8.p0(ptr [[TMP13]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK]], <vscale x 32 x i8> poison)
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP10]], <vscale x 16 x i1> zeroinitializer
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = shl i32 [[INDEX]], 1
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = sext i32 [[TMP12]] to i64
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP13]]
+; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 32 x i1> @llvm.experimental.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP11]], <vscale x 16 x i1> [[TMP11]])
+; PREDICATED_TAIL_FOLDING-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.masked.load.nxv32i8.p0(ptr [[TMP14]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK]], <vscale x 32 x i8> poison)
; PREDICATED_TAIL_FOLDING-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.experimental.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[WIDE_MASKED_VEC]])
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 0
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 1
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = or disjoint i32 [[TMP11]], 1
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = or disjoint i32 [[TMP12]], 1
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[TMP15]], <vscale x 16 x i8> [[TMP16]])
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP19:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP18]]
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP20:%.*]] = sext i32 [[TMP17]] to i64
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP20]]
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[TMP21]], i64 -1
; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.experimental.vector.interleave2.nxv32i8(<vscale x 16 x i8> [[TMP18]], <vscale x 16 x i8> [[TMP19]])
-; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK1:%.*]] = call <vscale x 32 x i1> @llvm.experimental.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP14]], <vscale x 16 x i1> [[TMP14]])
+; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK1:%.*]] = call <vscale x 32 x i1> @llvm.experimental.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP11]], <vscale x 16 x i1> [[TMP11]])
; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv32i8.p0(<vscale x 32 x i8> [[INTERLEAVED_VEC]], ptr [[TMP22]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK1]])
; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]]
; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP6]])
@@ -313,11 +313,11 @@ define dso_local void @masked_strided2(ptr noalias nocapture readnone %p, ptr no
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP11]]
; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 1, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x ptr> [[TMP12]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = or disjoint <vscale x 16 x i32> [[TMP10]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 1, i64 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = zext nneg <vscale x 16 x i32> [[TMP14]] to <vscale x 16 x i64>
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP15]]
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP13]], <vscale x 16 x i1> zeroinitializer
-; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 2, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x ptr> [[TMP16]], i32 1, <vscale x 16 x i1> [[TMP17]])
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP13]], <vscale x 16 x i1> zeroinitializer
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = or disjoint <vscale x 16 x i32> [[TMP10]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 1, i64 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = zext nneg <vscale x 16 x i32> [[TMP15]] to <vscale x 16 x i64>
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP16]]
+; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 2, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x ptr> [[TMP17]], i32 1, <vscale x 16 x i1> [[TMP14]])
; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]]
; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP6]])
; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[DOTSPLAT]]
@@ -485,16 +485,16 @@ define dso_local void @masked_strided3(ptr noalias nocapture readnone %p, ptr no
; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP7]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = shl nuw nsw <vscale x 16 x i32> [[VEC_IND]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 1, i64 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP12]]
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP11]], <vscale x 16 x i1> zeroinitializer
-; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 1, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x ptr> [[TMP13]], i32 1, <vscale x 16 x i1> [[TMP14]])
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP11]], <vscale x 16 x i1> zeroinitializer
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP13]]
+; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 1, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x ptr> [[TMP14]], i32 1, <vscale x 16 x i1> [[TMP12]])
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = or disjoint <vscale x 16 x i32> [[TMP10]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 1, i64 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = zext nneg <vscale x 16 x i32> [[TMP16]] to <vscale x 16 x i64>
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP17]]
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP19:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP15]], <vscale x 16 x i1> zeroinitializer
-; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 2, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x ptr> [[TMP18]], i32 1, <vscale x 16 x i1> [[TMP19]])
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP15]], <vscale x 16 x i1> zeroinitializer
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = or disjoint <vscale x 16 x i32> [[TMP10]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 1, i64 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = zext nneg <vscale x 16 x i32> [[TMP17]] to <vscale x 16 x i64>
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP18]]
+; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 2, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x ptr> [[TMP19]], i32 1, <vscale x 16 x i1> [[TMP16]])
; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]]
; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP6]])
; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[DOTSPLAT]]
>From 82ad843714ad6ed054c2a5e4fd1a73d94acb53a5 Mon Sep 17 00:00:00 2001
From: Mikhail Gudim <mgudim at gmail.com>
Date: Thu, 1 Feb 2024 02:08:11 -0500
Subject: [PATCH 7/8] Added a check not to break any "select patterns".
---
.../Transforms/InstCombine/InstCombineCompares.cpp | 13 +++++++++++++
1 file changed, 13 insertions(+)
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
index cc09b505b5b41e..c4d5bae87f6c1f 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -20,6 +20,7 @@
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/Utils/Local.h"
+#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/DataLayout.h"
@@ -6117,10 +6118,22 @@ static Instruction *strengthenICmpUsingKnownBits(ICmpInst &I,
Type *Ty = Op0->getType();
APInt RHSConst = Op1Known.getConstant();
bool TrueIfSigned = false;
+
// Don't break the SignBitCheck pattern;
if (InstCombiner::isSignBitCheck(Pred, RHSConst, TrueIfSigned))
return nullptr;
+ for (const Use &U : I.uses()) {
+ const Instruction *UI = cast<Instruction>(U.getUser());
+ // Don't break any select patterns.
+ const Value *LHS;
+ const Value *RHS;
+ if (const SelectInst *Sel = dyn_cast<SelectInst>(UI)) {
+ if (matchSelectPattern(Sel, LHS, RHS).Flavor != SPF_UNKNOWN)
+ return nullptr;
+ }
+ }
+
ConstantRange Op0PredRange =
ConstantRange::makeExactICmpRegion(Pred, RHSConst);
int KnownZeroMaskLength = BitWidth - Op0Known.Zero.countLeadingZeros();
>From fae6accd4344096be050ca59885ccb45bd56a0ac Mon Sep 17 00:00:00 2001
From: Mikhail Gudim <mgudim at gmail.com>
Date: Mon, 5 Feb 2024 22:48:58 -0500
Subject: [PATCH 8/8] Added test to check that select and "sign bit check"
patterns are not broken.
---
llvm/test/Transforms/InstCombine/icmp.ll | 84 +++++++++++++++++++++---
1 file changed, 76 insertions(+), 8 deletions(-)
diff --git a/llvm/test/Transforms/InstCombine/icmp.ll b/llvm/test/Transforms/InstCombine/icmp.ll
index 7b96f908c69c1f..e3a861c55649d8 100644
--- a/llvm/test/Transforms/InstCombine/icmp.ll
+++ b/llvm/test/Transforms/InstCombine/icmp.ll
@@ -5184,8 +5184,8 @@ entry:
ret i1 %cmp
}
-define i1 @tighten_icmp_using_known_bits_ugt(i16 %a) {
-; CHECK-LABEL: @tighten_icmp_using_known_bits_ugt(
+define i1 @strengthen_icmp_using_known_bits_ugt(i16 %a) {
+; CHECK-LABEL: @strengthen_icmp_using_known_bits_ugt(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i16 [[A:%.*]], 15
; CHECK-NEXT: ret i1 [[CMP]]
@@ -5196,8 +5196,8 @@ entry:
ret i1 %cmp
}
-define i1 @tighten_icmp_using_known_bits_ult(i16 %a) {
-; CHECK-LABEL: @tighten_icmp_using_known_bits_ult(
+define i1 @strengthen_icmp_using_known_bits_ult(i16 %a) {
+; CHECK-LABEL: @strengthen_icmp_using_known_bits_ult(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[AND_:%.*]] = and i16 [[A:%.*]], -4
; CHECK-NEXT: [[CMP:%.*]] = icmp ult i16 [[AND_]], 17
@@ -5209,8 +5209,8 @@ entry:
ret i1 %cmp
}
-define i1 @tighten_icmp_using_known_bits_sgt(i16 %a) {
-; CHECK-LABEL: @tighten_icmp_using_known_bits_sgt(
+define i1 @strengthen_icmp_using_known_bits_sgt(i16 %a) {
+; CHECK-LABEL: @strengthen_icmp_using_known_bits_sgt(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i16 [[A:%.*]], -1
; CHECK-NEXT: ret i1 [[CMP]]
@@ -5221,8 +5221,8 @@ entry:
ret i1 %cmp
}
-define i1 @tighten_icmp_using_known_bits_slt(i16 %a) {
-; CHECK-LABEL: @tighten_icmp_using_known_bits_slt(
+define i1 @strengthen_icmp_using_known_bits_slt(i16 %a) {
+; CHECK-LABEL: @strengthen_icmp_using_known_bits_slt(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[AND_:%.*]] = and i16 [[A:%.*]], -4
; CHECK-NEXT: [[CMP:%.*]] = icmp slt i16 [[AND_]], -15
@@ -5233,3 +5233,71 @@ entry:
%cmp = icmp slt i16 %and_, -14
ret i1 %cmp
}
+
+define i1 @dont_strengthen_icmp_in_sign_bit_check(i8 %a) {
+; CHECK-LABEL: @dont_strengthen_icmp_in_sign_bit_check(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[ICMP_:%.*]] = icmp sgt i8 [[A:%.*]], -1
+; CHECK-NEXT: ret i1 [[ICMP_]]
+;
+entry:
+ %shl_ = and i8 %a, 252
+ %icmp_ = icmp sgt i8 %shl_, -1
+ ret i1 %icmp_
+}
+
+define i8 @dont_strengthen_icmp_in_smin(i8 %a) {
+; CHECK-LABEL: @dont_strengthen_icmp_in_smin(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[SHL_:%.*]] = shl i8 [[A:%.*]], 2
+; CHECK-NEXT: [[SELECT_:%.*]] = call i8 @llvm.smin.i8(i8 [[SHL_]], i8 7)
+; CHECK-NEXT: ret i8 [[SELECT_]]
+;
+entry:
+ %shl_ = shl i8 %a, 2
+ %icmp_ = icmp slt i8 %shl_, 7
+ %select_ = select i1 %icmp_, i8 %shl_, i8 7
+ ret i8 %select_
+}
+
+define i8 @dont_strengthen_icmp_in_umin(i8 %a) {
+; CHECK-LABEL: @dont_strengthen_icmp_in_umin(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[SHL_:%.*]] = shl i8 [[A:%.*]], 2
+; CHECK-NEXT: [[SELECT_:%.*]] = call i8 @llvm.umin.i8(i8 [[SHL_]], i8 7)
+; CHECK-NEXT: ret i8 [[SELECT_]]
+;
+entry:
+ %shl_ = shl i8 %a, 2
+ %icmp_ = icmp ult i8 %shl_, 7
+ %select_ = select i1 %icmp_, i8 %shl_, i8 7
+ ret i8 %select_
+}
+
+define i8 @dont_strengthen_icmp_in_smax(i8 %a) {
+; CHECK-LABEL: @dont_strengthen_icmp_in_smax(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[SHL_:%.*]] = shl i8 [[A:%.*]], 2
+; CHECK-NEXT: [[SELECT_:%.*]] = call i8 @llvm.smax.i8(i8 [[SHL_]], i8 6)
+; CHECK-NEXT: ret i8 [[SELECT_]]
+;
+entry:
+ %shl_ = shl i8 %a, 2
+ %icmp_ = icmp sgt i8 %shl_, 6
+ %select_ = select i1 %icmp_, i8 %shl_, i8 6
+ ret i8 %select_
+}
+
+define i8 @dont_strengthen_icmp_in_umax(i8 %a) {
+; CHECK-LABEL: @dont_strengthen_icmp_in_umax(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[SHL_:%.*]] = shl i8 [[A:%.*]], 2
+; CHECK-NEXT: [[SELECT_:%.*]] = call i8 @llvm.umax.i8(i8 [[SHL_]], i8 6)
+; CHECK-NEXT: ret i8 [[SELECT_]]
+;
+entry:
+ %shl_ = shl i8 %a, 2
+ %icmp_ = icmp ugt i8 %shl_, 6
+ %select_ = select i1 %icmp_, i8 %shl_, i8 6
+ ret i8 %select_
+}
More information about the llvm-commits
mailing list