[llvm] [VectorCombine] Fold vector sign-bit checks (PR #175194)
Valeriy Savchenko via llvm-commits
llvm-commits at lists.llvm.org
Fri Jan 9 08:20:58 PST 2026
https://github.com/SavchenkoValeriy updated https://github.com/llvm/llvm-project/pull/175194
>From c500cd8e845f766c009bf3c4a35b7127d402fa29 Mon Sep 17 00:00:00 2001
From: Valeriy Savchenko <vsavchenko at apple.com>
Date: Tue, 6 Jan 2026 15:56:22 +0000
Subject: [PATCH] [VectorCombine] Fold vector sign-bit checks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Fold patterns that extract the sign bits of a vector, reduce them, and compare the result against a boundary value into a direct sign check on a reduction of the original vector.
```
icmp pred (reduce.{add,or,and,umax,umin}(lshr X, BitWidth-1)), C
-> icmp slt/sgt (reduce.{or,umax,and,umin}(X)), 0/-1
```
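A concrete instance, lifted from the tests added below (the folded function name is illustrative; the actual output reduction is target-dependent):
```
; Before (test or_eq_0 from this patch): "no lane of %x is negative".
define i1 @or_eq_0(<4 x i32> %x) {
  %shr = lshr <4 x i32> %x, splat (i32 31)
  %red = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %shr)
  %cmp = icmp eq i32 %red, 0
  ret i1 %cmp
}

; After: the shift is gone and the sign check moves onto a reduction of %x.
; The CHECK lines show reduce.or on x86 and reduce.umax on AArch64, whichever
; is cheaper for the target.
define i1 @or_eq_0_folded(<4 x i32> %x) {
  %red = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %x)
  %cmp = icmp sgt i32 %red, -1
  ret i1 %cmp
}
```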
When the comparison is against 0 or MAX (1 for boolean reductions, NumElts for add), the pattern reduces to one of four quantified predicates:
- ∀x: x < 0 (AllNeg)
- ∀x: x ≥ 0 (AllNonNeg)
- ∃x: x < 0 (AnyNeg)
- ∃x: x ≥ 0 (AnyNonNeg)
The transform eliminates the shift and, once the check determines whether an or/umax-style or an and/umin-style reduction is needed, picks the cheaper of the two candidates using the cost model.
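The add reduction is the case where MAX is the element count. For instance, the `add_eq_max` test ("all four lanes are negative", i.e. AllNeg) folds as sketched below; the snippet is lifted from the tests in this patch, and the and/umin choice is made by the cost model:
```
; Before: count the sign bits and compare against NumElts (4).
define i1 @add_eq_max(<4 x i32> %x) {
  %shr = lshr <4 x i32> %x, splat (i32 31)
  %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %shr)
  %cmp = icmp eq i32 %red, 4
  ret i1 %cmp
}

; After: the and-reduction is negative iff every lane of %x is negative.
; x86 keeps reduce.and; AArch64 prefers reduce.umin per its cost model.
define i1 @add_eq_max_folded(<4 x i32> %x) {
  %red = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %x)
  %cmp = icmp slt i32 %red, 0
  ret i1 %cmp
}
```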
The matrix of Alive2 proofs for every pair of {reduction, comparison}:
| Reduction | == 0 | != 0 | == MAX | != MAX |
|-----------|------|------|--------|--------|
| or | [proof](https://alive2.llvm.org/ce/z/_BWxJW) | [proof](https://alive2.llvm.org/ce/z/k3EiK6) | [proof](https://alive2.llvm.org/ce/z/a8cAjp) | [proof](https://alive2.llvm.org/ce/z/ci-HMt) |
| umax | [proof](https://alive2.llvm.org/ce/z/dWt28G) | [proof](https://alive2.llvm.org/ce/z/_MqxXC) | [proof](https://alive2.llvm.org/ce/z/KQebnF) | [proof](https://alive2.llvm.org/ce/z/mixEgN) |
| and | [proof](https://alive2.llvm.org/ce/z/JgYrLj) | [proof](https://alive2.llvm.org/ce/z/FZuPLy) | [proof](https://alive2.llvm.org/ce/z/bYCa8V) | [proof](https://alive2.llvm.org/ce/z/9fsLsN) |
| umin | [proof](https://alive2.llvm.org/ce/z/YnaSL-) | [proof](https://alive2.llvm.org/ce/z/rGrgoM) | [proof](https://alive2.llvm.org/ce/z/pb-ezQ) | [proof](https://alive2.llvm.org/ce/z/JkoqEi) |
| add | [proof](https://alive2.llvm.org/ce/z/d5w5CF) | [proof](https://alive2.llvm.org/ce/z/GUgQ2Z) | [proof](https://alive2.llvm.org/ce/z/HnstY8) | [proof](https://alive2.llvm.org/ce/z/j8z_3C) |
Additional Alive2 proofs for the normalized inequality predicates and the ashr variant exercised by the tests:
| Test | Proof |
|------|-------|
| or_slt_1 (slt 1 ≡ eq 0) | [proof](https://alive2.llvm.org/ce/z/Wdb_uN) |
| umax_sgt_0 (sgt 0 ≡ ne 0) | [proof](https://alive2.llvm.org/ce/z/nw6NZc) |
| and_slt_max (slt 1 ≡ ne 1) | [proof](https://alive2.llvm.org/ce/z/ZDMSXZ) |
| umin_sgt_max_minus_1 (sgt 0 ≡ eq 1) | [proof](https://alive2.llvm.org/ce/z/Uynf8P) |
| add_ult_max (ult 4 ≡ ne 4) | [proof](https://alive2.llvm.org/ce/z/pyDgTg) |
| add_ugt_max_minus_1 (ugt 3 ≡ eq 4) | [proof](https://alive2.llvm.org/ce/z/mHVXJk) |
| ashr_add_eq_0 (ashr instead of lshr) | [proof](https://alive2.llvm.org/ce/z/oa9Kgo) |
Alive2 proofs that the two candidate output reductions are interchangeable for each kind of check:
| Check | Equivalence | Proof |
|-----------------|-------------|-------|
| AnyNeg | or slt 0 ≡ umax slt 0 | [proof](https://alive2.llvm.org/ce/z/Do2tNQ) |
| AllNonNeg | or sgt -1 ≡ umax sgt -1 | [proof](https://alive2.llvm.org/ce/z/N4kZ8Z) |
| AllNeg | and slt 0 ≡ umin slt 0 | [proof](https://alive2.llvm.org/ce/z/4mNpMk) |
| AnyNonNeg | and sgt -1 ≡ umin sgt -1 | [proof](https://alive2.llvm.org/ce/z/2pVnyg) |
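The first row, spelled out in IR (a minimal illustration; the function names are mine): both forms answer "is any lane of %x negative?".
```
define i1 @any_neg_via_or(<4 x i32> %x) {
  %red = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %x)
  %cmp = icmp slt i32 %red, 0
  ret i1 %cmp
}

define i1 @any_neg_via_umax(<4 x i32> %x) {
  %red = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %x)
  %cmp = icmp slt i32 %red, 0
  ret i1 %cmp
}
```
The or of all lanes has its sign bit set exactly when some lane does, and the unsigned maximum is >= 2^31 exactly when some lane is, so both slt-0 checks agree.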
---
.../Transforms/Vectorize/VectorCombine.cpp | 214 ++++++
.../fold-signbit-reduction-cmp.ll | 635 ++++++++++++++++++
2 files changed, 849 insertions(+)
create mode 100644 llvm/test/Transforms/VectorCombine/fold-signbit-reduction-cmp.ll
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 243f685cf25e2..9a9e2ed24f17b 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -145,6 +145,7 @@ class VectorCombine {
bool foldShuffleFromReductions(Instruction &I);
bool foldShuffleChainsToReduce(Instruction &I);
bool foldCastFromReductions(Instruction &I);
+ bool foldSignBitReductionCmp(Instruction &I);
bool foldSelectShuffle(Instruction &I, bool FromReduction = false);
bool foldInterleaveIntrinsics(Instruction &I);
bool shrinkType(Instruction &I);
@@ -3806,6 +3807,216 @@ bool VectorCombine::foldCastFromReductions(Instruction &I) {
return true;
}
+/// Fold:
+/// icmp pred (reduce.{add,or,and,umax,umin}(signbit_extract(x))), C
+/// into:
+/// icmp sgt/slt (reduce.{or,umax,and,umin}(x)), -1/0
+///
+/// Sign-bit reductions produce values with known semantics:
+/// - reduce.{or,umax}: 0 if no element is negative, 1 if any is
+/// - reduce.{and,umin}: 1 if all elements are negative, 0 if any isn't
+/// - reduce.add: count of negative elements (0 to NumElts)
+///
+/// We transform to a direct sign check on reduce.{or,umax} or
+/// reduce.{and,umin} without explicit sign-bit extraction.
+///
+/// In spirit, it's similar to foldSignBitCheck in InstCombine.
+bool VectorCombine::foldSignBitReductionCmp(Instruction &I) {
+ CmpPredicate Pred;
+ Value *ReduceOp;
+ const APInt *CmpVal;
+ if (!match(&I, m_ICmp(Pred, m_Value(ReduceOp), m_APInt(CmpVal))))
+ return false;
+
+ auto *II = dyn_cast<IntrinsicInst>(ReduceOp);
+ if (!II || !II->hasOneUse())
+ return false;
+
+ Intrinsic::ID OrigIID = II->getIntrinsicID();
+ switch (OrigIID) {
+ case Intrinsic::vector_reduce_or:
+ case Intrinsic::vector_reduce_umax:
+ case Intrinsic::vector_reduce_and:
+ case Intrinsic::vector_reduce_umin:
+ case Intrinsic::vector_reduce_add:
+ break;
+ default:
+ return false;
+ }
+
+ Value *ReductionSrc = II->getArgOperand(0);
+ if (!ReductionSrc->hasOneUse())
+ return false;
+
+ auto *VecTy = dyn_cast<FixedVectorType>(ReductionSrc->getType());
+ if (!VecTy)
+ return false;
+
+ unsigned BitWidth = VecTy->getScalarSizeInBits();
+ unsigned NumElts = VecTy->getNumElements();
+
+ // Match sign-bit extraction: shr X, (bitwidth-1)
+ Value *X;
+ Constant *C;
+ if (!match(ReductionSrc, m_Shr(m_Value(X), m_Constant(C))) ||
+ !match(C, m_SpecificInt_ICMP(ICmpInst::ICMP_EQ,
+ APInt(BitWidth, BitWidth - 1))))
+ return false;
+
+ // MaxVal: 1 for or/and/umax/umin, NumElts for add
+ unsigned MaxVal = OrigIID == Intrinsic::vector_reduce_add ? NumElts : 1;
+
+ // In addition to the direct comparisons EQ 0, NE 0, EQ 1, NE 1, etc., we
+ // support inequalities that can be interpreted as EQ or NE given the rather
+ // narrow range of possible values of a sign-bit reduction.
+ bool IsEq;
+ uint64_t NormalizedCmpVal;
+ if (Pred == ICmpInst::ICMP_EQ) {
+ IsEq = true;
+ NormalizedCmpVal = CmpVal->getZExtValue();
+ } else if (Pred == ICmpInst::ICMP_NE) {
+ IsEq = false;
+ NormalizedCmpVal = CmpVal->getZExtValue();
+ } else if (Pred == ICmpInst::ICMP_SLT && CmpVal->isOne()) {
+ IsEq = true;
+ NormalizedCmpVal = 0; // slt 1 → eq 0
+ } else if (Pred == ICmpInst::ICMP_SGT && CmpVal->isZero()) {
+ IsEq = false;
+ NormalizedCmpVal = 0; // sgt 0 → ne 0
+ } else if ((Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_ULT) &&
+ *CmpVal == MaxVal) {
+ IsEq = false;
+ NormalizedCmpVal = MaxVal; // s/ult MaxVal → ne MaxVal
+ } else if ((Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_UGT) &&
+ *CmpVal == MaxVal - 1) {
+ IsEq = true;
+ NormalizedCmpVal = MaxVal; // s/ugt MaxVal-1 → eq MaxVal
+ } else {
+ return false;
+ }
+
+ bool TestsHigh = NormalizedCmpVal == MaxVal;
+ if (NormalizedCmpVal != 0 && !TestsHigh)
+ return false;
+
+ // For this fold we support four types of checks:
+ //
+ // 1. All lanes are negative - AllNeg
+ // 2. All lanes are non-negative - AllNonNeg
+ // 3. At least one negative lane - AnyNeg
+ // 4. At least one non-negative lane - AnyNonNeg
+ //
+ // For each case, we can generate the following code:
+ //
+ // 1. AllNeg - reduce.and/umin(X) < 0
+ // 2. AllNonNeg - reduce.or/umax(X) > -1
+ // 3. AnyNeg - reduce.or/umax(X) < 0
+ // 4. AnyNonNeg - reduce.and/umin(X) > -1
+ //
+ // The table below shows how every supported {reduction, comparison} pair
+ // maps onto these four checks.
+ //
+ // Reduction | == 0 | != 0 | == MAX | != MAX
+ // ------------+-----------+-----------+-----------+-----------
+ // or/umax | AllNonNeg | AnyNeg | AnyNeg | AllNonNeg
+ // and/umin | AnyNonNeg | AllNeg | AllNeg | AnyNonNeg
+ // add | AllNonNeg | AnyNeg | AllNeg | AnyNonNeg
+ //
+ // NOTE: MAX = 1 for or/and/umax/umin, and the vector size N for add
+ //
+ // For easier codegen and check inversion, we use the following encoding:
+ //
+ // 1. Bit-3 === requires or/umax (1) or and/umin (0) check
+ // 2. Bit-2 === checks < 0 (1) or > -1 (0)
+ // 3. Bit-1 === universal (1) or existential (0) check
+ //
+ // AnyNeg = 0b110: uses or/umax, checks negative, any-check
+ // AllNonNeg = 0b101: uses or/umax, checks non-neg, all-check
+ // AnyNonNeg = 0b000: uses and/umin, checks non-neg, any-check
+ // AllNeg = 0b011: uses and/umin, checks negative, all-check
+ //
+ // XOR with 0b011 inverts the check (swaps all/any and neg/non-neg).
+ //
+ enum CheckKind : unsigned {
+ AnyNonNeg = 0b000,
+ AllNeg = 0b011,
+ AllNonNeg = 0b101,
+ AnyNeg = 0b110,
+ };
+ // Return true if we fold this check into or/umax and false for and/umin
+ auto RequiresOr = [](CheckKind C) -> bool { return C & 0b100; };
+ // Return true if we should check if result is negative and false otherwise
+ auto IsNegativeCheck = [](CheckKind C) -> bool { return C & 0b010; };
+ // Logically invert the check
+ auto Invert = [](CheckKind C) { return CheckKind(C ^ 0b011); };
+
+ CheckKind Base;
+ switch (OrigIID) {
+ case Intrinsic::vector_reduce_or:
+ case Intrinsic::vector_reduce_umax:
+ Base = TestsHigh ? AnyNeg : AllNonNeg;
+ break;
+ case Intrinsic::vector_reduce_and:
+ case Intrinsic::vector_reduce_umin:
+ Base = TestsHigh ? AllNeg : AnyNonNeg;
+ break;
+ default: // vector_reduce_add
+ Base = TestsHigh ? AllNeg : AllNonNeg;
+ break;
+ }
+
+ CheckKind Check = IsEq ? Base : Invert(Base);
+
+ // Calculate old cost: shift + reduction
+ unsigned ShiftOpcode = cast<Instruction>(ReductionSrc)->getOpcode();
+ InstructionCost OldCost =
+ TTI.getArithmeticInstrCost(ShiftOpcode, VecTy, CostKind);
+ unsigned OrigReductionOpc = getArithmeticReductionInstruction(OrigIID);
+ if (OrigReductionOpc != Instruction::ICmp)
+ OldCost += TTI.getArithmeticReductionCost(OrigReductionOpc, VecTy,
+ std::nullopt, CostKind);
+ else
+ OldCost +=
+ TTI.getMinMaxReductionCost(OrigIID, VecTy, FastMathFlags(), CostKind);
+
+ auto PickCheaper = [&](Intrinsic::ID Arith, Intrinsic::ID MinMax) {
+ InstructionCost ArithCost =
+ TTI.getArithmeticReductionCost(getArithmeticReductionInstruction(Arith),
+ VecTy, std::nullopt, CostKind);
+ InstructionCost MinMaxCost =
+ TTI.getMinMaxReductionCost(MinMax, VecTy, FastMathFlags(), CostKind);
+ return ArithCost <= MinMaxCost ? std::make_pair(Arith, ArithCost)
+ : std::make_pair(MinMax, MinMaxCost);
+ };
+
+ // Choose output reduction based on encoding's MSB
+ auto [NewIID, NewCost] = RequiresOr(Check)
+ ? PickCheaper(Intrinsic::vector_reduce_or,
+ Intrinsic::vector_reduce_umax)
+ : PickCheaper(Intrinsic::vector_reduce_and,
+ Intrinsic::vector_reduce_umin);
+
+ LLVM_DEBUG(dbgs() << "Found sign-bit reduction cmp: " << I << "\n OldCost: "
+ << OldCost << " vs NewCost: " << NewCost << "\n");
+
+ if (NewCost > OldCost)
+ return false;
+
+ // Generate comparison based on encoding's neg bit: slt 0 for neg, sgt -1 for
+ // non-neg
+ Builder.SetInsertPoint(&I);
+ Type *ScalarTy = VecTy->getScalarType();
+ Value *NewReduce = Builder.CreateIntrinsic(ScalarTy, NewIID, {X});
+ Value *NewCmp = IsNegativeCheck(Check)
+ ? Builder.CreateICmpSLT(
+ NewReduce, ConstantInt::getNullValue(ScalarTy))
+ : Builder.CreateICmpSGT(
+ NewReduce, ConstantInt::getAllOnesValue(ScalarTy));
+
+ replaceValue(I, *NewCmp);
+ return true;
+}
+
/// Returns true if this ShuffleVectorInst eventually feeds into a
/// vector reduction intrinsic (e.g., vector_reduce_add) by only following
/// chains of shuffles and binary operators (in any combination/order).
@@ -4844,6 +5055,9 @@ bool VectorCombine::run() {
return true;
break;
case Instruction::ICmp:
+ if (foldSignBitReductionCmp(I))
+ return true;
+ [[fallthrough]];
case Instruction::FCmp:
if (foldExtractExtract(I))
return true;
diff --git a/llvm/test/Transforms/VectorCombine/fold-signbit-reduction-cmp.ll b/llvm/test/Transforms/VectorCombine/fold-signbit-reduction-cmp.ll
new file mode 100644
index 0000000000000..a598fa7bd22f9
--- /dev/null
+++ b/llvm/test/Transforms/VectorCombine/fold-signbit-reduction-cmp.ll
@@ -0,0 +1,635 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=vector-combine -mtriple=aarch64 < %s | FileCheck %s --check-prefixes=CHECK,AARCH64
+; RUN: opt -S -passes=vector-combine -mtriple=x86_64-- < %s | FileCheck %s --check-prefixes=CHECK,X86
+
+define i1 @or_eq_0(<4 x i32> %x) {
+; AARCH64-LABEL: define i1 @or_eq_0(
+; AARCH64-SAME: <4 x i32> [[X:%.*]]) {
+; AARCH64-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[X]])
+; AARCH64-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP1]], -1
+; AARCH64-NEXT: ret i1 [[CMP]]
+;
+; X86-LABEL: define i1 @or_eq_0(
+; X86-SAME: <4 x i32> [[X:%.*]]) {
+; X86-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[X]])
+; X86-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP1]], -1
+; X86-NEXT: ret i1 [[CMP]]
+;
+ %shr = lshr <4 x i32> %x, splat (i32 31)
+ %red = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %shr)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @or_ne_0(<4 x i32> %x) {
+; AARCH64-LABEL: define i1 @or_ne_0(
+; AARCH64-SAME: <4 x i32> [[X:%.*]]) {
+; AARCH64-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[X]])
+; AARCH64-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP1]], 0
+; AARCH64-NEXT: ret i1 [[CMP]]
+;
+; X86-LABEL: define i1 @or_ne_0(
+; X86-SAME: <4 x i32> [[X:%.*]]) {
+; X86-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[X]])
+; X86-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP1]], 0
+; X86-NEXT: ret i1 [[CMP]]
+;
+ %shr = lshr <4 x i32> %x, splat (i32 31)
+ %red = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %shr)
+ %cmp = icmp ne i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @or_eq_max(<4 x i32> %x) {
+; AARCH64-LABEL: define i1 @or_eq_max(
+; AARCH64-SAME: <4 x i32> [[X:%.*]]) {
+; AARCH64-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[X]])
+; AARCH64-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP1]], 0
+; AARCH64-NEXT: ret i1 [[CMP]]
+;
+; X86-LABEL: define i1 @or_eq_max(
+; X86-SAME: <4 x i32> [[X:%.*]]) {
+; X86-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[X]])
+; X86-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP1]], 0
+; X86-NEXT: ret i1 [[CMP]]
+;
+ %shr = lshr <4 x i32> %x, splat (i32 31)
+ %red = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %shr)
+ %cmp = icmp eq i32 %red, 1
+ ret i1 %cmp
+}
+
+define i1 @or_ne_max(<4 x i32> %x) {
+; AARCH64-LABEL: define i1 @or_ne_max(
+; AARCH64-SAME: <4 x i32> [[X:%.*]]) {
+; AARCH64-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[X]])
+; AARCH64-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP1]], -1
+; AARCH64-NEXT: ret i1 [[CMP]]
+;
+; X86-LABEL: define i1 @or_ne_max(
+; X86-SAME: <4 x i32> [[X:%.*]]) {
+; X86-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[X]])
+; X86-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP1]], -1
+; X86-NEXT: ret i1 [[CMP]]
+;
+ %shr = lshr <4 x i32> %x, splat (i32 31)
+ %red = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %shr)
+ %cmp = icmp ne i32 %red, 1
+ ret i1 %cmp
+}
+
+define i1 @umax_eq_0(<4 x i32> %x) {
+; AARCH64-LABEL: define i1 @umax_eq_0(
+; AARCH64-SAME: <4 x i32> [[X:%.*]]) {
+; AARCH64-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[X]])
+; AARCH64-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP1]], -1
+; AARCH64-NEXT: ret i1 [[CMP]]
+;
+; X86-LABEL: define i1 @umax_eq_0(
+; X86-SAME: <4 x i32> [[X:%.*]]) {
+; X86-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[X]])
+; X86-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP1]], -1
+; X86-NEXT: ret i1 [[CMP]]
+;
+ %shr = lshr <4 x i32> %x, splat (i32 31)
+ %red = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %shr)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @umax_ne_0(<4 x i32> %x) {
+; AARCH64-LABEL: define i1 @umax_ne_0(
+; AARCH64-SAME: <4 x i32> [[X:%.*]]) {
+; AARCH64-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[X]])
+; AARCH64-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP1]], 0
+; AARCH64-NEXT: ret i1 [[CMP]]
+;
+; X86-LABEL: define i1 @umax_ne_0(
+; X86-SAME: <4 x i32> [[X:%.*]]) {
+; X86-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[X]])
+; X86-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP1]], 0
+; X86-NEXT: ret i1 [[CMP]]
+;
+ %shr = lshr <4 x i32> %x, splat (i32 31)
+ %red = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %shr)
+ %cmp = icmp ne i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @umax_eq_max(<4 x i32> %x) {
+; AARCH64-LABEL: define i1 @umax_eq_max(
+; AARCH64-SAME: <4 x i32> [[X:%.*]]) {
+; AARCH64-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[X]])
+; AARCH64-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP1]], 0
+; AARCH64-NEXT: ret i1 [[CMP]]
+;
+; X86-LABEL: define i1 @umax_eq_max(
+; X86-SAME: <4 x i32> [[X:%.*]]) {
+; X86-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[X]])
+; X86-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP1]], 0
+; X86-NEXT: ret i1 [[CMP]]
+;
+ %shr = lshr <4 x i32> %x, splat (i32 31)
+ %red = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %shr)
+ %cmp = icmp eq i32 %red, 1
+ ret i1 %cmp
+}
+
+define i1 @umax_ne_max(<4 x i32> %x) {
+; AARCH64-LABEL: define i1 @umax_ne_max(
+; AARCH64-SAME: <4 x i32> [[X:%.*]]) {
+; AARCH64-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[X]])
+; AARCH64-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP1]], -1
+; AARCH64-NEXT: ret i1 [[CMP]]
+;
+; X86-LABEL: define i1 @umax_ne_max(
+; X86-SAME: <4 x i32> [[X:%.*]]) {
+; X86-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[X]])
+; X86-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP1]], -1
+; X86-NEXT: ret i1 [[CMP]]
+;
+ %shr = lshr <4 x i32> %x, splat (i32 31)
+ %red = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %shr)
+ %cmp = icmp ne i32 %red, 1
+ ret i1 %cmp
+}
+
+define i1 @and_eq_0(<4 x i32> %x) {
+; AARCH64-LABEL: define i1 @and_eq_0(
+; AARCH64-SAME: <4 x i32> [[X:%.*]]) {
+; AARCH64-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[X]])
+; AARCH64-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP1]], -1
+; AARCH64-NEXT: ret i1 [[CMP]]
+;
+; X86-LABEL: define i1 @and_eq_0(
+; X86-SAME: <4 x i32> [[X:%.*]]) {
+; X86-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[X]])
+; X86-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP1]], -1
+; X86-NEXT: ret i1 [[CMP]]
+;
+ %shr = lshr <4 x i32> %x, splat (i32 31)
+ %red = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %shr)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @and_ne_0(<4 x i32> %x) {
+; AARCH64-LABEL: define i1 @and_ne_0(
+; AARCH64-SAME: <4 x i32> [[X:%.*]]) {
+; AARCH64-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[X]])
+; AARCH64-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP1]], 0
+; AARCH64-NEXT: ret i1 [[CMP]]
+;
+; X86-LABEL: define i1 @and_ne_0(
+; X86-SAME: <4 x i32> [[X:%.*]]) {
+; X86-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[X]])
+; X86-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP1]], 0
+; X86-NEXT: ret i1 [[CMP]]
+;
+ %shr = lshr <4 x i32> %x, splat (i32 31)
+ %red = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %shr)
+ %cmp = icmp ne i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @and_eq_max(<4 x i32> %x) {
+; AARCH64-LABEL: define i1 @and_eq_max(
+; AARCH64-SAME: <4 x i32> [[X:%.*]]) {
+; AARCH64-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[X]])
+; AARCH64-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP1]], 0
+; AARCH64-NEXT: ret i1 [[CMP]]
+;
+; X86-LABEL: define i1 @and_eq_max(
+; X86-SAME: <4 x i32> [[X:%.*]]) {
+; X86-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[X]])
+; X86-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP1]], 0
+; X86-NEXT: ret i1 [[CMP]]
+;
+ %shr = lshr <4 x i32> %x, splat (i32 31)
+ %red = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %shr)
+ %cmp = icmp eq i32 %red, 1
+ ret i1 %cmp
+}
+
+define i1 @and_ne_max(<4 x i32> %x) {
+; AARCH64-LABEL: define i1 @and_ne_max(
+; AARCH64-SAME: <4 x i32> [[X:%.*]]) {
+; AARCH64-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[X]])
+; AARCH64-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP1]], -1
+; AARCH64-NEXT: ret i1 [[CMP]]
+;
+; X86-LABEL: define i1 @and_ne_max(
+; X86-SAME: <4 x i32> [[X:%.*]]) {
+; X86-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[X]])
+; X86-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP1]], -1
+; X86-NEXT: ret i1 [[CMP]]
+;
+ %shr = lshr <4 x i32> %x, splat (i32 31)
+ %red = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %shr)
+ %cmp = icmp ne i32 %red, 1
+ ret i1 %cmp
+}
+
+define i1 @umin_eq_0(<4 x i32> %x) {
+; AARCH64-LABEL: define i1 @umin_eq_0(
+; AARCH64-SAME: <4 x i32> [[X:%.*]]) {
+; AARCH64-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[X]])
+; AARCH64-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP1]], -1
+; AARCH64-NEXT: ret i1 [[CMP]]
+;
+; X86-LABEL: define i1 @umin_eq_0(
+; X86-SAME: <4 x i32> [[X:%.*]]) {
+; X86-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[X]])
+; X86-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP1]], -1
+; X86-NEXT: ret i1 [[CMP]]
+;
+ %shr = lshr <4 x i32> %x, splat (i32 31)
+ %red = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %shr)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @umin_ne_0(<4 x i32> %x) {
+; AARCH64-LABEL: define i1 @umin_ne_0(
+; AARCH64-SAME: <4 x i32> [[X:%.*]]) {
+; AARCH64-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[X]])
+; AARCH64-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP1]], 0
+; AARCH64-NEXT: ret i1 [[CMP]]
+;
+; X86-LABEL: define i1 @umin_ne_0(
+; X86-SAME: <4 x i32> [[X:%.*]]) {
+; X86-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[X]])
+; X86-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP1]], 0
+; X86-NEXT: ret i1 [[CMP]]
+;
+ %shr = lshr <4 x i32> %x, splat (i32 31)
+ %red = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %shr)
+ %cmp = icmp ne i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @umin_eq_max(<4 x i32> %x) {
+; AARCH64-LABEL: define i1 @umin_eq_max(
+; AARCH64-SAME: <4 x i32> [[X:%.*]]) {
+; AARCH64-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[X]])
+; AARCH64-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP1]], 0
+; AARCH64-NEXT: ret i1 [[CMP]]
+;
+; X86-LABEL: define i1 @umin_eq_max(
+; X86-SAME: <4 x i32> [[X:%.*]]) {
+; X86-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[X]])
+; X86-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP1]], 0
+; X86-NEXT: ret i1 [[CMP]]
+;
+ %shr = lshr <4 x i32> %x, splat (i32 31)
+ %red = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %shr)
+ %cmp = icmp eq i32 %red, 1
+ ret i1 %cmp
+}
+
+define i1 @umin_ne_max(<4 x i32> %x) {
+; AARCH64-LABEL: define i1 @umin_ne_max(
+; AARCH64-SAME: <4 x i32> [[X:%.*]]) {
+; AARCH64-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[X]])
+; AARCH64-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP1]], -1
+; AARCH64-NEXT: ret i1 [[CMP]]
+;
+; X86-LABEL: define i1 @umin_ne_max(
+; X86-SAME: <4 x i32> [[X:%.*]]) {
+; X86-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[X]])
+; X86-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP1]], -1
+; X86-NEXT: ret i1 [[CMP]]
+;
+ %shr = lshr <4 x i32> %x, splat (i32 31)
+ %red = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %shr)
+ %cmp = icmp ne i32 %red, 1
+ ret i1 %cmp
+}
+
+define i1 @add_eq_0(<4 x i32> %x) {
+; AARCH64-LABEL: define i1 @add_eq_0(
+; AARCH64-SAME: <4 x i32> [[X:%.*]]) {
+; AARCH64-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[X]])
+; AARCH64-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP1]], -1
+; AARCH64-NEXT: ret i1 [[CMP]]
+;
+; X86-LABEL: define i1 @add_eq_0(
+; X86-SAME: <4 x i32> [[X:%.*]]) {
+; X86-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[X]])
+; X86-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP1]], -1
+; X86-NEXT: ret i1 [[CMP]]
+;
+ %shr = lshr <4 x i32> %x, splat (i32 31)
+ %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %shr)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @add_ne_0(<4 x i32> %x) {
+; AARCH64-LABEL: define i1 @add_ne_0(
+; AARCH64-SAME: <4 x i32> [[X:%.*]]) {
+; AARCH64-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[X]])
+; AARCH64-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP1]], 0
+; AARCH64-NEXT: ret i1 [[CMP]]
+;
+; X86-LABEL: define i1 @add_ne_0(
+; X86-SAME: <4 x i32> [[X:%.*]]) {
+; X86-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[X]])
+; X86-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP1]], 0
+; X86-NEXT: ret i1 [[CMP]]
+;
+ %shr = lshr <4 x i32> %x, splat (i32 31)
+ %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %shr)
+ %cmp = icmp ne i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @add_eq_max(<4 x i32> %x) {
+; AARCH64-LABEL: define i1 @add_eq_max(
+; AARCH64-SAME: <4 x i32> [[X:%.*]]) {
+; AARCH64-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[X]])
+; AARCH64-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP1]], 0
+; AARCH64-NEXT: ret i1 [[CMP]]
+;
+; X86-LABEL: define i1 @add_eq_max(
+; X86-SAME: <4 x i32> [[X:%.*]]) {
+; X86-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[X]])
+; X86-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP1]], 0
+; X86-NEXT: ret i1 [[CMP]]
+;
+ %shr = lshr <4 x i32> %x, splat (i32 31)
+ %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %shr)
+ %cmp = icmp eq i32 %red, 4
+ ret i1 %cmp
+}
+
+define i1 @add_ne_max(<4 x i32> %x) {
+; AARCH64-LABEL: define i1 @add_ne_max(
+; AARCH64-SAME: <4 x i32> [[X:%.*]]) {
+; AARCH64-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[X]])
+; AARCH64-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP1]], -1
+; AARCH64-NEXT: ret i1 [[CMP]]
+;
+; X86-LABEL: define i1 @add_ne_max(
+; X86-SAME: <4 x i32> [[X:%.*]]) {
+; X86-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[X]])
+; X86-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP1]], -1
+; X86-NEXT: ret i1 [[CMP]]
+;
+ %shr = lshr <4 x i32> %x, splat (i32 31)
+ %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %shr)
+ %cmp = icmp ne i32 %red, 4
+ ret i1 %cmp
+}
+
+define i1 @or_slt_1(<4 x i32> %x) {
+; AARCH64-LABEL: define i1 @or_slt_1(
+; AARCH64-SAME: <4 x i32> [[X:%.*]]) {
+; AARCH64-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[X]])
+; AARCH64-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP1]], -1
+; AARCH64-NEXT: ret i1 [[CMP]]
+;
+; X86-LABEL: define i1 @or_slt_1(
+; X86-SAME: <4 x i32> [[X:%.*]]) {
+; X86-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[X]])
+; X86-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP1]], -1
+; X86-NEXT: ret i1 [[CMP]]
+;
+ %shr = lshr <4 x i32> %x, splat (i32 31)
+ %red = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %shr)
+ %cmp = icmp slt i32 %red, 1
+ ret i1 %cmp
+}
+
+define i1 @umax_sgt_0(<4 x i32> %x) {
+; AARCH64-LABEL: define i1 @umax_sgt_0(
+; AARCH64-SAME: <4 x i32> [[X:%.*]]) {
+; AARCH64-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[X]])
+; AARCH64-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP1]], 0
+; AARCH64-NEXT: ret i1 [[CMP]]
+;
+; X86-LABEL: define i1 @umax_sgt_0(
+; X86-SAME: <4 x i32> [[X:%.*]]) {
+; X86-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[X]])
+; X86-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP1]], 0
+; X86-NEXT: ret i1 [[CMP]]
+;
+ %shr = lshr <4 x i32> %x, splat (i32 31)
+ %red = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %shr)
+ %cmp = icmp sgt i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @and_slt_max(<4 x i32> %x) {
+; AARCH64-LABEL: define i1 @and_slt_max(
+; AARCH64-SAME: <4 x i32> [[X:%.*]]) {
+; AARCH64-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[X]])
+; AARCH64-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP1]], -1
+; AARCH64-NEXT: ret i1 [[CMP]]
+;
+; X86-LABEL: define i1 @and_slt_max(
+; X86-SAME: <4 x i32> [[X:%.*]]) {
+; X86-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[X]])
+; X86-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP1]], -1
+; X86-NEXT: ret i1 [[CMP]]
+;
+ %shr = lshr <4 x i32> %x, splat (i32 31)
+ %red = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %shr)
+ %cmp = icmp slt i32 %red, 1
+ ret i1 %cmp
+}
+
+define i1 @umin_sgt_max_minus_1(<4 x i32> %x) {
+; AARCH64-LABEL: define i1 @umin_sgt_max_minus_1(
+; AARCH64-SAME: <4 x i32> [[X:%.*]]) {
+; AARCH64-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[X]])
+; AARCH64-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP1]], 0
+; AARCH64-NEXT: ret i1 [[CMP]]
+;
+; X86-LABEL: define i1 @umin_sgt_max_minus_1(
+; X86-SAME: <4 x i32> [[X:%.*]]) {
+; X86-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[X]])
+; X86-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP1]], 0
+; X86-NEXT: ret i1 [[CMP]]
+;
+ %shr = lshr <4 x i32> %x, splat (i32 31)
+ %red = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %shr)
+ %cmp = icmp sgt i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @add_ult_max(<4 x i32> %x) {
+; AARCH64-LABEL: define i1 @add_ult_max(
+; AARCH64-SAME: <4 x i32> [[X:%.*]]) {
+; AARCH64-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[X]])
+; AARCH64-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP1]], -1
+; AARCH64-NEXT: ret i1 [[CMP]]
+;
+; X86-LABEL: define i1 @add_ult_max(
+; X86-SAME: <4 x i32> [[X:%.*]]) {
+; X86-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[X]])
+; X86-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP1]], -1
+; X86-NEXT: ret i1 [[CMP]]
+;
+ %shr = lshr <4 x i32> %x, splat (i32 31)
+ %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %shr)
+ %cmp = icmp ult i32 %red, 4
+ ret i1 %cmp
+}
+
+define i1 @add_ugt_max_minus_1(<4 x i32> %x) {
+; AARCH64-LABEL: define i1 @add_ugt_max_minus_1(
+; AARCH64-SAME: <4 x i32> [[X:%.*]]) {
+; AARCH64-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[X]])
+; AARCH64-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP1]], 0
+; AARCH64-NEXT: ret i1 [[CMP]]
+;
+; X86-LABEL: define i1 @add_ugt_max_minus_1(
+; X86-SAME: <4 x i32> [[X:%.*]]) {
+; X86-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[X]])
+; X86-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP1]], 0
+; X86-NEXT: ret i1 [[CMP]]
+;
+ %shr = lshr <4 x i32> %x, splat (i32 31)
+ %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %shr)
+ %cmp = icmp ugt i32 %red, 3
+ ret i1 %cmp
+}
+
+define i1 @ashr_add_eq_0(<4 x i32> %x) {
+; AARCH64-LABEL: define i1 @ashr_add_eq_0(
+; AARCH64-SAME: <4 x i32> [[X:%.*]]) {
+; AARCH64-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[X]])
+; AARCH64-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP1]], -1
+; AARCH64-NEXT: ret i1 [[CMP]]
+;
+; X86-LABEL: define i1 @ashr_add_eq_0(
+; X86-SAME: <4 x i32> [[X:%.*]]) {
+; X86-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[X]])
+; X86-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP1]], -1
+; X86-NEXT: ret i1 [[CMP]]
+;
+ %shr = ashr <4 x i32> %x, splat (i32 31)
+ %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %shr)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @or_eq_0_v8i16(<8 x i16> %x) {
+; AARCH64-LABEL: define i1 @or_eq_0_v8i16(
+; AARCH64-SAME: <8 x i16> [[X:%.*]]) {
+; AARCH64-NEXT: [[TMP1:%.*]] = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> [[X]])
+; AARCH64-NEXT: [[CMP:%.*]] = icmp sgt i16 [[TMP1]], -1
+; AARCH64-NEXT: ret i1 [[CMP]]
+;
+; X86-LABEL: define i1 @or_eq_0_v8i16(
+; X86-SAME: <8 x i16> [[X:%.*]]) {
+; X86-NEXT: [[TMP1:%.*]] = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> [[X]])
+; X86-NEXT: [[CMP:%.*]] = icmp sgt i16 [[TMP1]], -1
+; X86-NEXT: ret i1 [[CMP]]
+;
+ %shr = lshr <8 x i16> %x, splat (i16 15)
+ %red = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> %shr)
+ %cmp = icmp eq i16 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @and_eq_max_v2i64(<2 x i64> %x) {
+; AARCH64-LABEL: define i1 @and_eq_max_v2i64(
+; AARCH64-SAME: <2 x i64> [[X:%.*]]) {
+; AARCH64-NEXT: [[TMP1:%.*]] = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> [[X]])
+; AARCH64-NEXT: [[CMP:%.*]] = icmp slt i64 [[TMP1]], 0
+; AARCH64-NEXT: ret i1 [[CMP]]
+;
+; X86-LABEL: define i1 @and_eq_max_v2i64(
+; X86-SAME: <2 x i64> [[X:%.*]]) {
+; X86-NEXT: [[TMP1:%.*]] = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> [[X]])
+; X86-NEXT: [[CMP:%.*]] = icmp slt i64 [[TMP1]], 0
+; X86-NEXT: ret i1 [[CMP]]
+;
+ %shr = lshr <2 x i64> %x, splat (i64 63)
+ %red = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> %shr)
+ %cmp = icmp eq i64 %red, 1
+ ret i1 %cmp
+}
+
+;============================================================================
+; Negative tests
+;============================================================================
+
+; negative: shift amount is not bitwidth-1
+define i1 @negative_wrong_shift(<4 x i32> %x) {
+; CHECK-LABEL: define i1 @negative_wrong_shift(
+; CHECK-SAME: <4 x i32> [[X:%.*]]) {
+; CHECK-NEXT: [[SHR:%.*]] = lshr <4 x i32> [[X]], splat (i32 30)
+; CHECK-NEXT: [[RED:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[SHR]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[RED]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %shr = lshr <4 x i32> %x, splat (i32 30)
+ %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %shr)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+; negative: comparison constant is neither 0 nor max
+define i1 @negative_wrong_cmp_const(<4 x i32> %x) {
+; CHECK-LABEL: define i1 @negative_wrong_cmp_const(
+; CHECK-SAME: <4 x i32> [[X:%.*]]) {
+; CHECK-NEXT: [[SHR:%.*]] = lshr <4 x i32> [[X]], splat (i32 31)
+; CHECK-NEXT: [[RED:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[SHR]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[RED]], 2
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %shr = lshr <4 x i32> %x, splat (i32 31)
+ %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %shr)
+ %cmp = icmp eq i32 %red, 2
+ ret i1 %cmp
+}
+
+; negative: shift has multiple uses
+define i1 @negative_multi_use_shift(<4 x i32> %x, ptr %p) {
+; CHECK-LABEL: define i1 @negative_multi_use_shift(
+; CHECK-SAME: <4 x i32> [[X:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT: [[SHR:%.*]] = lshr <4 x i32> [[X]], splat (i32 31)
+; CHECK-NEXT: store <4 x i32> [[SHR]], ptr [[P]], align 16
+; CHECK-NEXT: [[RED:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[SHR]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[RED]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %shr = lshr <4 x i32> %x, splat (i32 31)
+ store <4 x i32> %shr, ptr %p
+ %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %shr)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+; negative: sgt with wrong constant (not 0 or max-1)
+define i1 @negative_sgt_wrong_const(<4 x i32> %x) {
+; CHECK-LABEL: define i1 @negative_sgt_wrong_const(
+; CHECK-SAME: <4 x i32> [[X:%.*]]) {
+; CHECK-NEXT: [[SHR:%.*]] = lshr <4 x i32> [[X]], splat (i32 31)
+; CHECK-NEXT: [[RED:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[SHR]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[RED]], 1
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %shr = lshr <4 x i32> %x, splat (i32 31)
+ %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %shr)
+ %cmp = icmp sgt i32 %red, 1
+ ret i1 %cmp
+}
+
+; negative: slt with wrong constant (not 1 or max)
+define i1 @negative_slt_wrong_const(<4 x i32> %x) {
+; CHECK-LABEL: define i1 @negative_slt_wrong_const(
+; CHECK-SAME: <4 x i32> [[X:%.*]]) {
+; CHECK-NEXT: [[SHR:%.*]] = lshr <4 x i32> [[X]], splat (i32 31)
+; CHECK-NEXT: [[RED:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[SHR]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[RED]], 2
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %shr = lshr <4 x i32> %x, splat (i32 31)
+ %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %shr)
+ %cmp = icmp slt i32 %red, 2
+ ret i1 %cmp
+}