[llvm] [VectorCombine] Fold vector sign-bit checks (PR #175194)

Valeriy Savchenko via llvm-commits llvm-commits at lists.llvm.org
Fri Jan 9 08:20:58 PST 2026


https://github.com/SavchenkoValeriy updated https://github.com/llvm/llvm-project/pull/175194

From c500cd8e845f766c009bf3c4a35b7127d402fa29 Mon Sep 17 00:00:00 2001
From: Valeriy Savchenko <vsavchenko at apple.com>
Date: Tue, 6 Jan 2026 15:56:22 +0000
Subject: [PATCH] [VectorCombine] Fold vector sign-bit checks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fold patterns that extract the sign bits of a vector, reduce them, and compare the result against boundary values into a direct sign check on a reduction of the original vector.

```
icmp pred (reduce.{add,or,and,umax,umin}(lshr X, BitWidth-1)), C
    ->  icmp slt/sgt (reduce.{or,umax,and,umin}(X)), 0/-1
```

When the comparison is against 0 or MAX (1 for or/and/umax/umin, NumElts for add), the pattern reduces to one of four quantified predicates:
- ∀x: x < 0 (AllNeg)
- ∀x: x ≥ 0 (AllNonNeg)
- ∃x: x < 0 (AnyNeg)
- ∃x: x ≥ 0 (AnyNonNeg)

The transform eliminates the shift; the check kind determines whether the new reduction comes from the reduce.or/reduce.umax pair or the reduce.and/reduce.umin pair, and the cost model picks the cheaper intrinsic within that pair.
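For example, the `or_eq_0` case from the new tests goes from a shift, a reduction, and a compare down to a reduction and a scalar sign check (the reduce.or form is shown; on AArch64 the cost model picks reduce.umax instead):

```
  %shr = lshr <4 x i32> %x, splat (i32 31)
  %red = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %shr)
  %cmp = icmp eq i32 %red, 0
    ->
  %red = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %x)
  %cmp = icmp sgt i32 %red, -1
```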

The matrix of Alive2 proofs for every pair of {reduction, comparison}:

| Reduction | == 0 | != 0 | == MAX | != MAX |
|-----------|------|------|--------|--------|
| or        | [proof](https://alive2.llvm.org/ce/z/_BWxJW) | [proof](https://alive2.llvm.org/ce/z/k3EiK6) | [proof](https://alive2.llvm.org/ce/z/a8cAjp) | [proof](https://alive2.llvm.org/ce/z/ci-HMt) |
| umax      | [proof](https://alive2.llvm.org/ce/z/dWt28G) | [proof](https://alive2.llvm.org/ce/z/_MqxXC) | [proof](https://alive2.llvm.org/ce/z/KQebnF) | [proof](https://alive2.llvm.org/ce/z/mixEgN) |
| and       | [proof](https://alive2.llvm.org/ce/z/JgYrLj) | [proof](https://alive2.llvm.org/ce/z/FZuPLy) | [proof](https://alive2.llvm.org/ce/z/bYCa8V) | [proof](https://alive2.llvm.org/ce/z/9fsLsN) |
| umin      | [proof](https://alive2.llvm.org/ce/z/YnaSL-) | [proof](https://alive2.llvm.org/ce/z/rGrgoM) | [proof](https://alive2.llvm.org/ce/z/pb-ezQ) | [proof](https://alive2.llvm.org/ce/z/JkoqEi) |
| add       | [proof](https://alive2.llvm.org/ce/z/d5w5CF) | [proof](https://alive2.llvm.org/ce/z/GUgQ2Z) | [proof](https://alive2.llvm.org/ce/z/HnstY8) | [proof](https://alive2.llvm.org/ce/z/j8z_3C) |

Additional proofs for the tests that exercise non-canonical predicates:

| Test | Proof |
|------|-------|
| or_slt_1 (slt 1 ≡ eq 0) | [proof](https://alive2.llvm.org/ce/z/Wdb_uN) |
| umax_sgt_0 (sgt 0 ≡ ne 0) | [proof](https://alive2.llvm.org/ce/z/nw6NZc) |
| and_slt_max (slt 1 ≡ ne 1) | [proof](https://alive2.llvm.org/ce/z/ZDMSXZ) |
| umin_sgt_max_minus_1 (sgt 0 ≡ eq 1) | [proof](https://alive2.llvm.org/ce/z/Uynf8P) |
| add_ult_max (ult 4 ≡ ne 4) | [proof](https://alive2.llvm.org/ce/z/pyDgTg) |
| add_ugt_max_minus_1 (ugt 3 ≡ eq 4) | [proof](https://alive2.llvm.org/ce/z/mHVXJk) |
| ashr_add_eq_0 (ashr instead of lshr) | [proof](https://alive2.llvm.org/ce/z/oa9Kgo) |

Proofs that the or/umax and and/umin variants are interchangeable for each of the four checks:

| Check | Equivalence | Proof |
|-----------------|-------------|-------|
| AnyNeg | or slt 0 ≡ umax slt 0 | [proof](https://alive2.llvm.org/ce/z/Do2tNQ) |
| AllNonNeg | or sgt -1 ≡ umax sgt -1 | [proof](https://alive2.llvm.org/ce/z/N4kZ8Z) |
| AllNeg | and slt 0 ≡ umin slt 0 | [proof](https://alive2.llvm.org/ce/z/4mNpMk) |
| AnyNonNeg | and sgt -1 ≡ umin sgt -1 | [proof](https://alive2.llvm.org/ce/z/2pVnyg) |
---
 .../Transforms/Vectorize/VectorCombine.cpp    | 214 ++++++
 .../fold-signbit-reduction-cmp.ll             | 635 ++++++++++++++++++
 2 files changed, 849 insertions(+)
 create mode 100644 llvm/test/Transforms/VectorCombine/fold-signbit-reduction-cmp.ll

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 243f685cf25e2..9a9e2ed24f17b 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -145,6 +145,7 @@ class VectorCombine {
   bool foldShuffleFromReductions(Instruction &I);
   bool foldShuffleChainsToReduce(Instruction &I);
   bool foldCastFromReductions(Instruction &I);
+  bool foldSignBitReductionCmp(Instruction &I);
   bool foldSelectShuffle(Instruction &I, bool FromReduction = false);
   bool foldInterleaveIntrinsics(Instruction &I);
   bool shrinkType(Instruction &I);
@@ -3806,6 +3807,216 @@ bool VectorCombine::foldCastFromReductions(Instruction &I) {
   return true;
 }
 
+/// Fold:
+///   icmp pred (reduce.{add,or,and,umax,umin}(signbit_extract(x))), C
+/// into:
+///   icmp sgt/slt (reduce.{or,umax,and,umin}(x)), -1/0
+///
+/// Sign-bit reductions produce values with known semantics:
+///   - reduce.{or,umax}: 0 if no element is negative, 1 if any is
+///   - reduce.{and,umin}: 1 if all elements are negative, 0 if any isn't
+///   - reduce.add: count of negative elements (0 to NumElts)
+///
+/// We transform to a direct sign check on reduce.{or,umax} or
+/// reduce.{and,umin} without explicit sign-bit extraction.
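+/// For example:
+///   icmp eq (reduce.or (lshr X, BitWidth-1)), 0
+///     ->  icmp sgt (reduce.or X), -1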
+///
+/// In spirit, it's similar to foldSignBitCheck in InstCombine.
+bool VectorCombine::foldSignBitReductionCmp(Instruction &I) {
+  CmpPredicate Pred;
+  Value *ReduceOp;
+  const APInt *CmpVal;
+  if (!match(&I, m_ICmp(Pred, m_Value(ReduceOp), m_APInt(CmpVal))))
+    return false;
+
+  auto *II = dyn_cast<IntrinsicInst>(ReduceOp);
+  if (!II || !II->hasOneUse())
+    return false;
+
+  Intrinsic::ID OrigIID = II->getIntrinsicID();
+  switch (OrigIID) {
+  case Intrinsic::vector_reduce_or:
+  case Intrinsic::vector_reduce_umax:
+  case Intrinsic::vector_reduce_and:
+  case Intrinsic::vector_reduce_umin:
+  case Intrinsic::vector_reduce_add:
+    break;
+  default:
+    return false;
+  }
+
+  Value *ReductionSrc = II->getArgOperand(0);
+  if (!ReductionSrc->hasOneUse())
+    return false;
+
+  auto *VecTy = dyn_cast<FixedVectorType>(ReductionSrc->getType());
+  if (!VecTy)
+    return false;
+
+  unsigned BitWidth = VecTy->getScalarSizeInBits();
+  unsigned NumElts = VecTy->getNumElements();
+
+  // Match sign-bit extraction: shr X, (bitwidth-1)
+  Value *X;
+  Constant *C;
+  if (!match(ReductionSrc, m_Shr(m_Value(X), m_Constant(C))) ||
+      !match(C, m_SpecificInt_ICMP(ICmpInst::ICMP_EQ,
+                                   APInt(BitWidth, BitWidth - 1))))
+    return false;
+
+  // MaxVal: 1 for or/and/umax/umin, NumElts for add
+  unsigned MaxVal = OrigIID == Intrinsic::vector_reduce_add ? NumElts : 1;
+
+  // In addition to direct comparisons (EQ 0, NE 0, EQ MAX, NE MAX), we also
+  // support inequalities that can be interpreted as EQ or NE given the
+  // narrow range of possible values of a sign-bit reduction.
+  bool IsEq;
+  uint64_t NormalizedCmpVal;
+  if (Pred == ICmpInst::ICMP_EQ) {
+    IsEq = true;
+    NormalizedCmpVal = CmpVal->getZExtValue();
+  } else if (Pred == ICmpInst::ICMP_NE) {
+    IsEq = false;
+    NormalizedCmpVal = CmpVal->getZExtValue();
+  } else if (Pred == ICmpInst::ICMP_SLT && CmpVal->isOne()) {
+    IsEq = true;
+    NormalizedCmpVal = 0; // slt 1 → eq 0
+  } else if (Pred == ICmpInst::ICMP_SGT && CmpVal->isZero()) {
+    IsEq = false;
+    NormalizedCmpVal = 0; // sgt 0 → ne 0
+  } else if ((Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_ULT) &&
+             *CmpVal == MaxVal) {
+    IsEq = false;
+    NormalizedCmpVal = MaxVal; // s/ult MaxVal → ne MaxVal
+  } else if ((Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_UGT) &&
+             *CmpVal == MaxVal - 1) {
+    IsEq = true;
+    NormalizedCmpVal = MaxVal; // s/ugt MaxVal-1 → eq MaxVal
+  } else {
+    return false;
+  }
+
+  bool TestsHigh = NormalizedCmpVal == MaxVal;
+  if (NormalizedCmpVal != 0 && !TestsHigh)
+    return false;
+
+  // For this fold we support four types of checks:
+  //
+  //   1. All lanes are negative - AllNeg
+  //   2. All lanes are non-negative - AllNonNeg
+  //   3. At least one negative lane - AnyNeg
+  //   4. At least one non-negative lane - AnyNonNeg
+  //
+  // For each case, we can generate the following code:
+  //
+  //   1. AllNeg    - reduce.and/umin(X) < 0
+  //   2. AllNonNeg -  reduce.or/umax(X) > -1
+  //   3. AnyNeg    -  reduce.or/umax(X) < 0
+  //   4. AnyNonNeg - reduce.and/umin(X) > -1
+  //
+  // The table below shows the aggregation of all supported cases
+  // using these four cases.
+  //
+  //   Reduction   | == 0      | != 0      | == MAX    | != MAX
+  //   ------------+-----------+-----------+-----------+-----------
+  //   or/umax     | AllNonNeg | AnyNeg    | AnyNeg    | AllNonNeg
+  //   and/umin    | AnyNonNeg | AllNeg    | AllNeg    | AnyNonNeg
+  //   add         | AllNonNeg | AnyNeg    | AllNeg    | AnyNonNeg
+  //
+  // NOTE: MAX = 1 for or/and/umax/umin, and the vector size N for add
+  //
+  // For easier codegen and check inversion, we use the following encoding:
+  //
+  //   1. Bit 0b100 - requires or/umax (1) or and/umin (0) check
+  //   2. Bit 0b010 - checks < 0 (1) or > -1 (0)
+  //   3. Bit 0b001 - universal (1) or existential (0) check
+  //
+  //   AnyNeg    = 0b110: uses or/umax,  checks negative, any-check
+  //   AllNonNeg = 0b101: uses or/umax,  checks non-neg,  all-check
+  //   AnyNonNeg = 0b000: uses and/umin, checks non-neg,  any-check
+  //   AllNeg    = 0b011: uses and/umin, checks negative, all-check
+  //
+  // XOR with 0b011 inverts the check (swaps all/any and neg/non-neg).
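+  //   For example, AnyNeg ^ 0b011 == 0b110 ^ 0b011 == 0b101 == AllNonNeg:
+  //   "not (any lane is negative)" is "all lanes are non-negative".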
+  //
+  enum CheckKind : unsigned {
+    AnyNonNeg = 0b000,
+    AllNeg = 0b011,
+    AllNonNeg = 0b101,
+    AnyNeg = 0b110,
+  };
+  // Return true if we fold this check into or/umax and false for and/umin
+  auto RequiresOr = [](CheckKind C) -> bool { return C & 0b100; };
+  // Return true if we should check if result is negative and false otherwise
+  auto IsNegativeCheck = [](CheckKind C) -> bool { return C & 0b010; };
+  // Logically invert the check
+  auto Invert = [](CheckKind C) { return CheckKind(C ^ 0b011); };
+
+  CheckKind Base;
+  switch (OrigIID) {
+  case Intrinsic::vector_reduce_or:
+  case Intrinsic::vector_reduce_umax:
+    Base = TestsHigh ? AnyNeg : AllNonNeg;
+    break;
+  case Intrinsic::vector_reduce_and:
+  case Intrinsic::vector_reduce_umin:
+    Base = TestsHigh ? AllNeg : AnyNonNeg;
+    break;
+  default: // vector_reduce_add
+    Base = TestsHigh ? AllNeg : AllNonNeg;
+    break;
+  }
+
+  CheckKind Check = IsEq ? Base : Invert(Base);
+
+  // Calculate old cost: shift + reduction
+  unsigned ShiftOpcode = cast<Instruction>(ReductionSrc)->getOpcode();
+  InstructionCost OldCost =
+      TTI.getArithmeticInstrCost(ShiftOpcode, VecTy, CostKind);
+  unsigned OrigReductionOpc = getArithmeticReductionInstruction(OrigIID);
+  if (OrigReductionOpc != Instruction::ICmp)
+    OldCost += TTI.getArithmeticReductionCost(OrigReductionOpc, VecTy,
+                                              std::nullopt, CostKind);
+  else
+    OldCost +=
+        TTI.getMinMaxReductionCost(OrigIID, VecTy, FastMathFlags(), CostKind);
+
+  auto PickCheaper = [&](Intrinsic::ID Arith, Intrinsic::ID MinMax) {
+    InstructionCost ArithCost =
+        TTI.getArithmeticReductionCost(getArithmeticReductionInstruction(Arith),
+                                       VecTy, std::nullopt, CostKind);
+    InstructionCost MinMaxCost =
+        TTI.getMinMaxReductionCost(MinMax, VecTy, FastMathFlags(), CostKind);
+    return ArithCost <= MinMaxCost ? std::make_pair(Arith, ArithCost)
+                                   : std::make_pair(MinMax, MinMaxCost);
+  };
+
+  // Choose output reduction based on encoding's MSB
+  auto [NewIID, NewCost] = RequiresOr(Check)
+                               ? PickCheaper(Intrinsic::vector_reduce_or,
+                                             Intrinsic::vector_reduce_umax)
+                               : PickCheaper(Intrinsic::vector_reduce_and,
+                                             Intrinsic::vector_reduce_umin);
+
+  LLVM_DEBUG(dbgs() << "Found sign-bit reduction cmp: " << I << "\n  OldCost: "
+                    << OldCost << " vs NewCost: " << NewCost << "\n");
+
+  if (NewCost > OldCost)
+    return false;
+
+  // Generate comparison based on encoding's neg bit: slt 0 for neg, sgt -1 for
+  // non-neg
+  Builder.SetInsertPoint(&I);
+  Type *ScalarTy = VecTy->getScalarType();
+  Value *NewReduce = Builder.CreateIntrinsic(ScalarTy, NewIID, {X});
+  Value *NewCmp = IsNegativeCheck(Check)
+                      ? Builder.CreateICmpSLT(
+                            NewReduce, ConstantInt::getNullValue(ScalarTy))
+                      : Builder.CreateICmpSGT(
+                            NewReduce, ConstantInt::getAllOnesValue(ScalarTy));
+
+  replaceValue(I, *NewCmp);
+  return true;
+}
+
 /// Returns true if this ShuffleVectorInst eventually feeds into a
 /// vector reduction intrinsic (e.g., vector_reduce_add) by only following
 /// chains of shuffles and binary operators (in any combination/order).
@@ -4844,6 +5055,9 @@ bool VectorCombine::run() {
           return true;
         break;
       case Instruction::ICmp:
+        if (foldSignBitReductionCmp(I))
+          return true;
+        [[fallthrough]];
       case Instruction::FCmp:
         if (foldExtractExtract(I))
           return true;
diff --git a/llvm/test/Transforms/VectorCombine/fold-signbit-reduction-cmp.ll b/llvm/test/Transforms/VectorCombine/fold-signbit-reduction-cmp.ll
new file mode 100644
index 0000000000000..a598fa7bd22f9
--- /dev/null
+++ b/llvm/test/Transforms/VectorCombine/fold-signbit-reduction-cmp.ll
@@ -0,0 +1,635 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=vector-combine -mtriple=aarch64 < %s | FileCheck %s --check-prefixes=CHECK,AARCH64
+; RUN: opt -S -passes=vector-combine -mtriple=x86_64-- < %s | FileCheck %s --check-prefixes=CHECK,X86
+
+define i1 @or_eq_0(<4 x i32> %x) {
+; AARCH64-LABEL: define i1 @or_eq_0(
+; AARCH64-SAME: <4 x i32> [[X:%.*]]) {
+; AARCH64-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[X]])
+; AARCH64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP1]], -1
+; AARCH64-NEXT:    ret i1 [[CMP]]
+;
+; X86-LABEL: define i1 @or_eq_0(
+; X86-SAME: <4 x i32> [[X:%.*]]) {
+; X86-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[X]])
+; X86-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP1]], -1
+; X86-NEXT:    ret i1 [[CMP]]
+;
+  %shr = lshr <4 x i32> %x, splat (i32 31)
+  %red = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %shr)
+  %cmp = icmp eq i32 %red, 0
+  ret i1 %cmp
+}
+
+define i1 @or_ne_0(<4 x i32> %x) {
+; AARCH64-LABEL: define i1 @or_ne_0(
+; AARCH64-SAME: <4 x i32> [[X:%.*]]) {
+; AARCH64-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[X]])
+; AARCH64-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP1]], 0
+; AARCH64-NEXT:    ret i1 [[CMP]]
+;
+; X86-LABEL: define i1 @or_ne_0(
+; X86-SAME: <4 x i32> [[X:%.*]]) {
+; X86-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[X]])
+; X86-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP1]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+  %shr = lshr <4 x i32> %x, splat (i32 31)
+  %red = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %shr)
+  %cmp = icmp ne i32 %red, 0
+  ret i1 %cmp
+}
+
+define i1 @or_eq_max(<4 x i32> %x) {
+; AARCH64-LABEL: define i1 @or_eq_max(
+; AARCH64-SAME: <4 x i32> [[X:%.*]]) {
+; AARCH64-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[X]])
+; AARCH64-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP1]], 0
+; AARCH64-NEXT:    ret i1 [[CMP]]
+;
+; X86-LABEL: define i1 @or_eq_max(
+; X86-SAME: <4 x i32> [[X:%.*]]) {
+; X86-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[X]])
+; X86-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP1]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+  %shr = lshr <4 x i32> %x, splat (i32 31)
+  %red = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %shr)
+  %cmp = icmp eq i32 %red, 1
+  ret i1 %cmp
+}
+
+define i1 @or_ne_max(<4 x i32> %x) {
+; AARCH64-LABEL: define i1 @or_ne_max(
+; AARCH64-SAME: <4 x i32> [[X:%.*]]) {
+; AARCH64-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[X]])
+; AARCH64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP1]], -1
+; AARCH64-NEXT:    ret i1 [[CMP]]
+;
+; X86-LABEL: define i1 @or_ne_max(
+; X86-SAME: <4 x i32> [[X:%.*]]) {
+; X86-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[X]])
+; X86-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP1]], -1
+; X86-NEXT:    ret i1 [[CMP]]
+;
+  %shr = lshr <4 x i32> %x, splat (i32 31)
+  %red = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %shr)
+  %cmp = icmp ne i32 %red, 1
+  ret i1 %cmp
+}
+
+define i1 @umax_eq_0(<4 x i32> %x) {
+; AARCH64-LABEL: define i1 @umax_eq_0(
+; AARCH64-SAME: <4 x i32> [[X:%.*]]) {
+; AARCH64-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[X]])
+; AARCH64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP1]], -1
+; AARCH64-NEXT:    ret i1 [[CMP]]
+;
+; X86-LABEL: define i1 @umax_eq_0(
+; X86-SAME: <4 x i32> [[X:%.*]]) {
+; X86-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[X]])
+; X86-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP1]], -1
+; X86-NEXT:    ret i1 [[CMP]]
+;
+  %shr = lshr <4 x i32> %x, splat (i32 31)
+  %red = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %shr)
+  %cmp = icmp eq i32 %red, 0
+  ret i1 %cmp
+}
+
+define i1 @umax_ne_0(<4 x i32> %x) {
+; AARCH64-LABEL: define i1 @umax_ne_0(
+; AARCH64-SAME: <4 x i32> [[X:%.*]]) {
+; AARCH64-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[X]])
+; AARCH64-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP1]], 0
+; AARCH64-NEXT:    ret i1 [[CMP]]
+;
+; X86-LABEL: define i1 @umax_ne_0(
+; X86-SAME: <4 x i32> [[X:%.*]]) {
+; X86-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[X]])
+; X86-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP1]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+  %shr = lshr <4 x i32> %x, splat (i32 31)
+  %red = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %shr)
+  %cmp = icmp ne i32 %red, 0
+  ret i1 %cmp
+}
+
+define i1 @umax_eq_max(<4 x i32> %x) {
+; AARCH64-LABEL: define i1 @umax_eq_max(
+; AARCH64-SAME: <4 x i32> [[X:%.*]]) {
+; AARCH64-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[X]])
+; AARCH64-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP1]], 0
+; AARCH64-NEXT:    ret i1 [[CMP]]
+;
+; X86-LABEL: define i1 @umax_eq_max(
+; X86-SAME: <4 x i32> [[X:%.*]]) {
+; X86-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[X]])
+; X86-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP1]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+  %shr = lshr <4 x i32> %x, splat (i32 31)
+  %red = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %shr)
+  %cmp = icmp eq i32 %red, 1
+  ret i1 %cmp
+}
+
+define i1 @umax_ne_max(<4 x i32> %x) {
+; AARCH64-LABEL: define i1 @umax_ne_max(
+; AARCH64-SAME: <4 x i32> [[X:%.*]]) {
+; AARCH64-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[X]])
+; AARCH64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP1]], -1
+; AARCH64-NEXT:    ret i1 [[CMP]]
+;
+; X86-LABEL: define i1 @umax_ne_max(
+; X86-SAME: <4 x i32> [[X:%.*]]) {
+; X86-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[X]])
+; X86-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP1]], -1
+; X86-NEXT:    ret i1 [[CMP]]
+;
+  %shr = lshr <4 x i32> %x, splat (i32 31)
+  %red = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %shr)
+  %cmp = icmp ne i32 %red, 1
+  ret i1 %cmp
+}
+
+define i1 @and_eq_0(<4 x i32> %x) {
+; AARCH64-LABEL: define i1 @and_eq_0(
+; AARCH64-SAME: <4 x i32> [[X:%.*]]) {
+; AARCH64-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[X]])
+; AARCH64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP1]], -1
+; AARCH64-NEXT:    ret i1 [[CMP]]
+;
+; X86-LABEL: define i1 @and_eq_0(
+; X86-SAME: <4 x i32> [[X:%.*]]) {
+; X86-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[X]])
+; X86-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP1]], -1
+; X86-NEXT:    ret i1 [[CMP]]
+;
+  %shr = lshr <4 x i32> %x, splat (i32 31)
+  %red = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %shr)
+  %cmp = icmp eq i32 %red, 0
+  ret i1 %cmp
+}
+
+define i1 @and_ne_0(<4 x i32> %x) {
+; AARCH64-LABEL: define i1 @and_ne_0(
+; AARCH64-SAME: <4 x i32> [[X:%.*]]) {
+; AARCH64-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[X]])
+; AARCH64-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP1]], 0
+; AARCH64-NEXT:    ret i1 [[CMP]]
+;
+; X86-LABEL: define i1 @and_ne_0(
+; X86-SAME: <4 x i32> [[X:%.*]]) {
+; X86-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[X]])
+; X86-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP1]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+  %shr = lshr <4 x i32> %x, splat (i32 31)
+  %red = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %shr)
+  %cmp = icmp ne i32 %red, 0
+  ret i1 %cmp
+}
+
+define i1 @and_eq_max(<4 x i32> %x) {
+; AARCH64-LABEL: define i1 @and_eq_max(
+; AARCH64-SAME: <4 x i32> [[X:%.*]]) {
+; AARCH64-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[X]])
+; AARCH64-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP1]], 0
+; AARCH64-NEXT:    ret i1 [[CMP]]
+;
+; X86-LABEL: define i1 @and_eq_max(
+; X86-SAME: <4 x i32> [[X:%.*]]) {
+; X86-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[X]])
+; X86-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP1]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+  %shr = lshr <4 x i32> %x, splat (i32 31)
+  %red = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %shr)
+  %cmp = icmp eq i32 %red, 1
+  ret i1 %cmp
+}
+
+define i1 @and_ne_max(<4 x i32> %x) {
+; AARCH64-LABEL: define i1 @and_ne_max(
+; AARCH64-SAME: <4 x i32> [[X:%.*]]) {
+; AARCH64-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[X]])
+; AARCH64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP1]], -1
+; AARCH64-NEXT:    ret i1 [[CMP]]
+;
+; X86-LABEL: define i1 @and_ne_max(
+; X86-SAME: <4 x i32> [[X:%.*]]) {
+; X86-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[X]])
+; X86-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP1]], -1
+; X86-NEXT:    ret i1 [[CMP]]
+;
+  %shr = lshr <4 x i32> %x, splat (i32 31)
+  %red = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %shr)
+  %cmp = icmp ne i32 %red, 1
+  ret i1 %cmp
+}
+
+define i1 @umin_eq_0(<4 x i32> %x) {
+; AARCH64-LABEL: define i1 @umin_eq_0(
+; AARCH64-SAME: <4 x i32> [[X:%.*]]) {
+; AARCH64-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[X]])
+; AARCH64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP1]], -1
+; AARCH64-NEXT:    ret i1 [[CMP]]
+;
+; X86-LABEL: define i1 @umin_eq_0(
+; X86-SAME: <4 x i32> [[X:%.*]]) {
+; X86-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[X]])
+; X86-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP1]], -1
+; X86-NEXT:    ret i1 [[CMP]]
+;
+  %shr = lshr <4 x i32> %x, splat (i32 31)
+  %red = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %shr)
+  %cmp = icmp eq i32 %red, 0
+  ret i1 %cmp
+}
+
+define i1 @umin_ne_0(<4 x i32> %x) {
+; AARCH64-LABEL: define i1 @umin_ne_0(
+; AARCH64-SAME: <4 x i32> [[X:%.*]]) {
+; AARCH64-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[X]])
+; AARCH64-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP1]], 0
+; AARCH64-NEXT:    ret i1 [[CMP]]
+;
+; X86-LABEL: define i1 @umin_ne_0(
+; X86-SAME: <4 x i32> [[X:%.*]]) {
+; X86-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[X]])
+; X86-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP1]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+  %shr = lshr <4 x i32> %x, splat (i32 31)
+  %red = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %shr)
+  %cmp = icmp ne i32 %red, 0
+  ret i1 %cmp
+}
+
+define i1 @umin_eq_max(<4 x i32> %x) {
+; AARCH64-LABEL: define i1 @umin_eq_max(
+; AARCH64-SAME: <4 x i32> [[X:%.*]]) {
+; AARCH64-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[X]])
+; AARCH64-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP1]], 0
+; AARCH64-NEXT:    ret i1 [[CMP]]
+;
+; X86-LABEL: define i1 @umin_eq_max(
+; X86-SAME: <4 x i32> [[X:%.*]]) {
+; X86-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[X]])
+; X86-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP1]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+  %shr = lshr <4 x i32> %x, splat (i32 31)
+  %red = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %shr)
+  %cmp = icmp eq i32 %red, 1
+  ret i1 %cmp
+}
+
+define i1 @umin_ne_max(<4 x i32> %x) {
+; AARCH64-LABEL: define i1 @umin_ne_max(
+; AARCH64-SAME: <4 x i32> [[X:%.*]]) {
+; AARCH64-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[X]])
+; AARCH64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP1]], -1
+; AARCH64-NEXT:    ret i1 [[CMP]]
+;
+; X86-LABEL: define i1 @umin_ne_max(
+; X86-SAME: <4 x i32> [[X:%.*]]) {
+; X86-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[X]])
+; X86-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP1]], -1
+; X86-NEXT:    ret i1 [[CMP]]
+;
+  %shr = lshr <4 x i32> %x, splat (i32 31)
+  %red = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %shr)
+  %cmp = icmp ne i32 %red, 1
+  ret i1 %cmp
+}
+
+define i1 @add_eq_0(<4 x i32> %x) {
+; AARCH64-LABEL: define i1 @add_eq_0(
+; AARCH64-SAME: <4 x i32> [[X:%.*]]) {
+; AARCH64-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[X]])
+; AARCH64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP1]], -1
+; AARCH64-NEXT:    ret i1 [[CMP]]
+;
+; X86-LABEL: define i1 @add_eq_0(
+; X86-SAME: <4 x i32> [[X:%.*]]) {
+; X86-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[X]])
+; X86-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP1]], -1
+; X86-NEXT:    ret i1 [[CMP]]
+;
+  %shr = lshr <4 x i32> %x, splat (i32 31)
+  %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %shr)
+  %cmp = icmp eq i32 %red, 0
+  ret i1 %cmp
+}
+
+define i1 @add_ne_0(<4 x i32> %x) {
+; AARCH64-LABEL: define i1 @add_ne_0(
+; AARCH64-SAME: <4 x i32> [[X:%.*]]) {
+; AARCH64-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[X]])
+; AARCH64-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP1]], 0
+; AARCH64-NEXT:    ret i1 [[CMP]]
+;
+; X86-LABEL: define i1 @add_ne_0(
+; X86-SAME: <4 x i32> [[X:%.*]]) {
+; X86-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[X]])
+; X86-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP1]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+  %shr = lshr <4 x i32> %x, splat (i32 31)
+  %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %shr)
+  %cmp = icmp ne i32 %red, 0
+  ret i1 %cmp
+}
+
+define i1 @add_eq_max(<4 x i32> %x) {
+; AARCH64-LABEL: define i1 @add_eq_max(
+; AARCH64-SAME: <4 x i32> [[X:%.*]]) {
+; AARCH64-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[X]])
+; AARCH64-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP1]], 0
+; AARCH64-NEXT:    ret i1 [[CMP]]
+;
+; X86-LABEL: define i1 @add_eq_max(
+; X86-SAME: <4 x i32> [[X:%.*]]) {
+; X86-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[X]])
+; X86-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP1]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+  %shr = lshr <4 x i32> %x, splat (i32 31)
+  %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %shr)
+  %cmp = icmp eq i32 %red, 4
+  ret i1 %cmp
+}
+
+define i1 @add_ne_max(<4 x i32> %x) {
+; AARCH64-LABEL: define i1 @add_ne_max(
+; AARCH64-SAME: <4 x i32> [[X:%.*]]) {
+; AARCH64-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[X]])
+; AARCH64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP1]], -1
+; AARCH64-NEXT:    ret i1 [[CMP]]
+;
+; X86-LABEL: define i1 @add_ne_max(
+; X86-SAME: <4 x i32> [[X:%.*]]) {
+; X86-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[X]])
+; X86-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP1]], -1
+; X86-NEXT:    ret i1 [[CMP]]
+;
+  %shr = lshr <4 x i32> %x, splat (i32 31)
+  %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %shr)
+  %cmp = icmp ne i32 %red, 4
+  ret i1 %cmp
+}
+
+define i1 @or_slt_1(<4 x i32> %x) {
+; AARCH64-LABEL: define i1 @or_slt_1(
+; AARCH64-SAME: <4 x i32> [[X:%.*]]) {
+; AARCH64-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[X]])
+; AARCH64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP1]], -1
+; AARCH64-NEXT:    ret i1 [[CMP]]
+;
+; X86-LABEL: define i1 @or_slt_1(
+; X86-SAME: <4 x i32> [[X:%.*]]) {
+; X86-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[X]])
+; X86-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP1]], -1
+; X86-NEXT:    ret i1 [[CMP]]
+;
+  %shr = lshr <4 x i32> %x, splat (i32 31)
+  %red = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %shr)
+  %cmp = icmp slt i32 %red, 1
+  ret i1 %cmp
+}
+
+define i1 @umax_sgt_0(<4 x i32> %x) {
+; AARCH64-LABEL: define i1 @umax_sgt_0(
+; AARCH64-SAME: <4 x i32> [[X:%.*]]) {
+; AARCH64-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[X]])
+; AARCH64-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP1]], 0
+; AARCH64-NEXT:    ret i1 [[CMP]]
+;
+; X86-LABEL: define i1 @umax_sgt_0(
+; X86-SAME: <4 x i32> [[X:%.*]]) {
+; X86-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[X]])
+; X86-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP1]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+  %shr = lshr <4 x i32> %x, splat (i32 31)
+  %red = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %shr)
+  %cmp = icmp sgt i32 %red, 0
+  ret i1 %cmp
+}
+
+define i1 @and_slt_max(<4 x i32> %x) {
+; AARCH64-LABEL: define i1 @and_slt_max(
+; AARCH64-SAME: <4 x i32> [[X:%.*]]) {
+; AARCH64-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[X]])
+; AARCH64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP1]], -1
+; AARCH64-NEXT:    ret i1 [[CMP]]
+;
+; X86-LABEL: define i1 @and_slt_max(
+; X86-SAME: <4 x i32> [[X:%.*]]) {
+; X86-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[X]])
+; X86-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP1]], -1
+; X86-NEXT:    ret i1 [[CMP]]
+;
+  %shr = lshr <4 x i32> %x, splat (i32 31)
+  %red = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %shr)
+  %cmp = icmp slt i32 %red, 1
+  ret i1 %cmp
+}
+
+define i1 @umin_sgt_max_minus_1(<4 x i32> %x) {
+; AARCH64-LABEL: define i1 @umin_sgt_max_minus_1(
+; AARCH64-SAME: <4 x i32> [[X:%.*]]) {
+; AARCH64-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[X]])
+; AARCH64-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP1]], 0
+; AARCH64-NEXT:    ret i1 [[CMP]]
+;
+; X86-LABEL: define i1 @umin_sgt_max_minus_1(
+; X86-SAME: <4 x i32> [[X:%.*]]) {
+; X86-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[X]])
+; X86-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP1]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+  %shr = lshr <4 x i32> %x, splat (i32 31)
+  %red = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %shr)
+  %cmp = icmp sgt i32 %red, 0
+  ret i1 %cmp
+}
+
+define i1 @add_ult_max(<4 x i32> %x) {
+; AARCH64-LABEL: define i1 @add_ult_max(
+; AARCH64-SAME: <4 x i32> [[X:%.*]]) {
+; AARCH64-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[X]])
+; AARCH64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP1]], -1
+; AARCH64-NEXT:    ret i1 [[CMP]]
+;
+; X86-LABEL: define i1 @add_ult_max(
+; X86-SAME: <4 x i32> [[X:%.*]]) {
+; X86-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[X]])
+; X86-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP1]], -1
+; X86-NEXT:    ret i1 [[CMP]]
+;
+  %shr = lshr <4 x i32> %x, splat (i32 31)
+  %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %shr)
+  %cmp = icmp ult i32 %red, 4
+  ret i1 %cmp
+}
+
+define i1 @add_ugt_max_minus_1(<4 x i32> %x) {
+; AARCH64-LABEL: define i1 @add_ugt_max_minus_1(
+; AARCH64-SAME: <4 x i32> [[X:%.*]]) {
+; AARCH64-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[X]])
+; AARCH64-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP1]], 0
+; AARCH64-NEXT:    ret i1 [[CMP]]
+;
+; X86-LABEL: define i1 @add_ugt_max_minus_1(
+; X86-SAME: <4 x i32> [[X:%.*]]) {
+; X86-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[X]])
+; X86-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP1]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+  %shr = lshr <4 x i32> %x, splat (i32 31)
+  %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %shr)
+  %cmp = icmp ugt i32 %red, 3
+  ret i1 %cmp
+}
+
+define i1 @ashr_add_eq_0(<4 x i32> %x) {
+; AARCH64-LABEL: define i1 @ashr_add_eq_0(
+; AARCH64-SAME: <4 x i32> [[X:%.*]]) {
+; AARCH64-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[X]])
+; AARCH64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP1]], -1
+; AARCH64-NEXT:    ret i1 [[CMP]]
+;
+; X86-LABEL: define i1 @ashr_add_eq_0(
+; X86-SAME: <4 x i32> [[X:%.*]]) {
+; X86-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[X]])
+; X86-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP1]], -1
+; X86-NEXT:    ret i1 [[CMP]]
+;
+  %shr = ashr <4 x i32> %x, splat (i32 31)
+  %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %shr)
+  %cmp = icmp eq i32 %red, 0
+  ret i1 %cmp
+}
+
+define i1 @or_eq_0_v8i16(<8 x i16> %x) {
+; AARCH64-LABEL: define i1 @or_eq_0_v8i16(
+; AARCH64-SAME: <8 x i16> [[X:%.*]]) {
+; AARCH64-NEXT:    [[TMP1:%.*]] = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> [[X]])
+; AARCH64-NEXT:    [[CMP:%.*]] = icmp sgt i16 [[TMP1]], -1
+; AARCH64-NEXT:    ret i1 [[CMP]]
+;
+; X86-LABEL: define i1 @or_eq_0_v8i16(
+; X86-SAME: <8 x i16> [[X:%.*]]) {
+; X86-NEXT:    [[TMP1:%.*]] = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> [[X]])
+; X86-NEXT:    [[CMP:%.*]] = icmp sgt i16 [[TMP1]], -1
+; X86-NEXT:    ret i1 [[CMP]]
+;
+  %shr = lshr <8 x i16> %x, splat (i16 15)
+  %red = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> %shr)
+  %cmp = icmp eq i16 %red, 0
+  ret i1 %cmp
+}
+
+define i1 @and_eq_max_v2i64(<2 x i64> %x) {
+; AARCH64-LABEL: define i1 @and_eq_max_v2i64(
+; AARCH64-SAME: <2 x i64> [[X:%.*]]) {
+; AARCH64-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> [[X]])
+; AARCH64-NEXT:    [[CMP:%.*]] = icmp slt i64 [[TMP1]], 0
+; AARCH64-NEXT:    ret i1 [[CMP]]
+;
+; X86-LABEL: define i1 @and_eq_max_v2i64(
+; X86-SAME: <2 x i64> [[X:%.*]]) {
+; X86-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> [[X]])
+; X86-NEXT:    [[CMP:%.*]] = icmp slt i64 [[TMP1]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+  %shr = lshr <2 x i64> %x, splat (i64 63)
+  %red = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> %shr)
+  %cmp = icmp eq i64 %red, 1
+  ret i1 %cmp
+}
+
+;============================================================================
+; Negative tests
+;============================================================================
+
+; negative: shift amount is not bitwidth-1
+define i1 @negative_wrong_shift(<4 x i32> %x) {
+; CHECK-LABEL: define i1 @negative_wrong_shift(
+; CHECK-SAME: <4 x i32> [[X:%.*]]) {
+; CHECK-NEXT:    [[SHR:%.*]] = lshr <4 x i32> [[X]], splat (i32 30)
+; CHECK-NEXT:    [[RED:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[SHR]])
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[RED]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %shr = lshr <4 x i32> %x, splat (i32 30)
+  %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %shr)
+  %cmp = icmp eq i32 %red, 0
+  ret i1 %cmp
+}
+
+; negative: comparison constant is neither 0 nor max
+define i1 @negative_wrong_cmp_const(<4 x i32> %x) {
+; CHECK-LABEL: define i1 @negative_wrong_cmp_const(
+; CHECK-SAME: <4 x i32> [[X:%.*]]) {
+; CHECK-NEXT:    [[SHR:%.*]] = lshr <4 x i32> [[X]], splat (i32 31)
+; CHECK-NEXT:    [[RED:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[SHR]])
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[RED]], 2
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %shr = lshr <4 x i32> %x, splat (i32 31)
+  %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %shr)
+  %cmp = icmp eq i32 %red, 2
+  ret i1 %cmp
+}
+
+; negative: shift has multiple uses
+define i1 @negative_multi_use_shift(<4 x i32> %x, ptr %p) {
+; CHECK-LABEL: define i1 @negative_multi_use_shift(
+; CHECK-SAME: <4 x i32> [[X:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT:    [[SHR:%.*]] = lshr <4 x i32> [[X]], splat (i32 31)
+; CHECK-NEXT:    store <4 x i32> [[SHR]], ptr [[P]], align 16
+; CHECK-NEXT:    [[RED:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[SHR]])
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[RED]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %shr = lshr <4 x i32> %x, splat (i32 31)
+  store <4 x i32> %shr, ptr %p
+  %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %shr)
+  %cmp = icmp eq i32 %red, 0
+  ret i1 %cmp
+}
+
+; negative: sgt with wrong constant (not 0 or max-1)
+define i1 @negative_sgt_wrong_const(<4 x i32> %x) {
+; CHECK-LABEL: define i1 @negative_sgt_wrong_const(
+; CHECK-SAME: <4 x i32> [[X:%.*]]) {
+; CHECK-NEXT:    [[SHR:%.*]] = lshr <4 x i32> [[X]], splat (i32 31)
+; CHECK-NEXT:    [[RED:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[SHR]])
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[RED]], 1
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %shr = lshr <4 x i32> %x, splat (i32 31)
+  %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %shr)
+  %cmp = icmp sgt i32 %red, 1
+  ret i1 %cmp
+}
+
+; negative: slt with wrong constant (not 1 or max)
+define i1 @negative_slt_wrong_const(<4 x i32> %x) {
+; CHECK-LABEL: define i1 @negative_slt_wrong_const(
+; CHECK-SAME: <4 x i32> [[X:%.*]]) {
+; CHECK-NEXT:    [[SHR:%.*]] = lshr <4 x i32> [[X]], splat (i32 31)
+; CHECK-NEXT:    [[RED:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[SHR]])
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[RED]], 2
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %shr = lshr <4 x i32> %x, splat (i32 31)
+  %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %shr)
+  %cmp = icmp slt i32 %red, 2
+  ret i1 %cmp
+}


