[llvm] [InstCombine] Fold vector.reduce.OP(F(X)) == 0 -> OP(X) == 0 (PR #173069)
Valeriy Savchenko via llvm-commits
llvm-commits at lists.llvm.org
Wed Jan 7 04:43:19 PST 2026
https://github.com/SavchenkoValeriy updated https://github.com/llvm/llvm-project/pull/173069
From 95eca9011fdae3987e1ad2bc585dd4ce58e69587 Mon Sep 17 00:00:00 2001
From: Valeriy Savchenko <vsavchenko at apple.com>
Date: Tue, 16 Dec 2025 14:52:59 +0000
Subject: [PATCH] [VectorCombine] Fold vector.reduce.OP(F(X)) == 0 -> OP(X) == 0
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
This commit introduces a pattern for the following fold:
vector.reduce.OP f(X_i) == 0 -> vector.reduce.OP X_i == 0
In order to decide on this fold, we use the following properties:
1. OP X_i == 0 <=> \forall i \in [1, N] X_i == 0
1'. OP X_i == 0 <=> \exists j \in [1, N] X_j == 0
2. f(x) == 0 <=> x == 0
From 1 and 2 (or 1' and 2), we can infer that
OP f(X_i) == 0 <=> OP X_i == 0.
For some of the OPs and fs, we need domain constraints on X
to ensure that properties 1 (or 1') and 2 hold.
In this change we support the following operations f:
1. f(x) = shl nuw x, y for arbitrary y
2. f(x) = mul nuw x, c for a well-defined (non-poison) constant c != 0
3. f(x) = zext x
4. f(x) = sext x
5. f(x) = neg x
And the following reductions OP:
a. OR X_i - has property 1 for every X
b. UMAX X_i - has property 1 for every X
c. UMIN X_i - has property 1' for every X
d. SMAX X_i - has property 1 for X >= 0
e. SMIN X_i - has property 1' for X >= 0
f. ADD X_i - has property 1 for X >= 0 && ADD X_i doesn't sign wrap
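As a sanity check (illustrative only, not part of the patch), the interplay of properties 1/1' and 2 can be verified exhaustively for small bit widths. The following Python sketch models `shl nuw`, `mul nuw`, and `zext` as zero-preserving lane operations (`None` models a violated `nuw`, where the fold does not apply) and checks the fold against the `or`/`umax`/`umin` reductions:

```python
from functools import reduce
from itertools import product

BITS = 4
MASK = (1 << BITS) - 1

# Zero-preserving operations f (f(x) == 0 <=> x == 0); None models a
# violated `nuw` flag, in which case the fold does not apply.
fs = {
    "shl1 nuw": lambda x: (x << 1) if (x << 1) <= MASK else None,
    "mul3 nuw": lambda x: (x * 3) if (x * 3) <= MASK else None,
    "zext": lambda x: x,  # widening never changes whether a lane is zero
}
# Reductions with property 1 (or 1' for umin).
ops = {
    "or": lambda v: reduce(lambda a, b: a | b, v),
    "umax": max,
    "umin": min,
}

checked = 0
for (fname, f), (opname, op) in product(fs.items(), ops.items()):
    for vec in product(range(MASK + 1), repeat=2):
        fvec = [f(x) for x in vec]
        if any(y is None for y in fvec):
            continue  # nuw would wrap; the pattern does not match here
        assert (op(fvec) == 0) == (op(vec) == 0), (fname, opname, vec)
        checked += 1
print(f"fold verified on {checked} (f, OP, vector) cases")
```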
The matrix of Alive2 proofs for every pair of {f,OP}:
| OP\f | zext | sext | neg | mul | shl |
|------|------|------|-----|-----|-----|
| or | [proof](https://alive2.llvm.org/ce/z/EqHAPd) | [proof](https://alive2.llvm.org/ce/z/DS3eP2) | [proof](https://alive2.llvm.org/ce/z/65A5x9) | [proof](https://alive2.llvm.org/ce/z/TVPpUf) | [proof](https://alive2.llvm.org/ce/z/kj--vH) |
| umin | [proof](https://alive2.llvm.org/ce/z/AK39LL) | [proof](https://alive2.llvm.org/ce/z/xEPH2S) | [proof](https://alive2.llvm.org/ce/z/N-ubNr) | [proof](https://alive2.llvm.org/ce/z/dgUEH4) | [proof](https://alive2.llvm.org/ce/z/2TUNDu) |
| umax | [proof](https://alive2.llvm.org/ce/z/Cy_DJS) | [proof](https://alive2.llvm.org/ce/z/f42bGQ) | [proof](https://alive2.llvm.org/ce/z/ReUx4M) | [proof](https://alive2.llvm.org/ce/z/qSsvdG) | [proof](https://alive2.llvm.org/ce/z/cE3Qgw) |
| smin | [proof](https://alive2.llvm.org/ce/z/j5TwTA) | [proof](https://alive2.llvm.org/ce/z/DhNxPQ) | — | [proof](https://alive2.llvm.org/ce/z/m03AOt) | [proof](https://alive2.llvm.org/ce/z/bp58Q3) |
| smax | [proof](https://alive2.llvm.org/ce/z/3zmbRn) | [proof](https://alive2.llvm.org/ce/z/6FTfRJ) | — | [proof](https://alive2.llvm.org/ce/z/KDfKEW) | [proof](https://alive2.llvm.org/ce/z/dajm7T) |
| add | [proof](https://alive2.llvm.org/ce/z/3kt7BB) | [proof](https://alive2.llvm.org/ce/z/cyqzQH) | — | [proof](https://alive2.llvm.org/ce/z/n_oGjT) | [proof](https://alive2.llvm.org/ce/z/67bkJm) |
Proofs for known bits:
* Leading zeros - [v4i32](https://alive2.llvm.org/ce/z/w--S2D), [v16i8](https://alive2.llvm.org/ce/z/hEdVks)
* Leading ones - [v4i16](https://alive2.llvm.org/ce/z/RyPdBS), [v16i8](https://alive2.llvm.org/ce/z/UTFFt9)
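The leading-zeros bound used for the ADD case can likewise be checked exhaustively on a toy configuration. This Python sketch (illustrative only; it mirrors the `bit_width(N - 1)` computation from the patch on a hypothetical 6-bit element type) confirms that requiring more than that many known leading zeros per lane prevents the summation from wrapping, so property 1 holds:

```python
from itertools import product

BITS = 6                    # small element width so the check is exhaustive
N = 4                       # number of vector lanes
LOST = (N - 1).bit_length() # adding N lanes loses at most this many bits
# "More than LOST known leading zeros" bounds every lane below this limit:
LIMIT = 1 << (BITS - LOST - 1)

for vec in product(range(LIMIT), repeat=N):
    total = sum(vec)
    assert total < (1 << BITS)  # the iBITS addition cannot wrap...
    assert (total == 0) == all(x == 0 for x in vec)  # ...so property 1 holds
print("reduce.add(X) == 0 iff all lanes are 0 under the leading-zeros bound")
```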
---
.../Transforms/Vectorize/VectorCombine.cpp | 195 ++++++
.../VectorCombine/icmp-vector-reduce.ll | 600 ++++++++++++++++++
2 files changed, 795 insertions(+)
create mode 100644 llvm/test/Transforms/VectorCombine/icmp-vector-reduce.ll
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 243f685cf25e2..15efe552a6d7c 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -145,6 +145,7 @@ class VectorCombine {
bool foldShuffleFromReductions(Instruction &I);
bool foldShuffleChainsToReduce(Instruction &I);
bool foldCastFromReductions(Instruction &I);
+ bool foldICmpEqZeroVectorReduce(Instruction &I);
bool foldSelectShuffle(Instruction &I, bool FromReduction = false);
bool foldInterleaveIntrinsics(Instruction &I);
bool shrinkType(Instruction &I);
@@ -3806,6 +3807,197 @@ bool VectorCombine::foldCastFromReductions(Instruction &I) {
return true;
}
+/// Check whether the constant contains a null or poison element.
+static bool containsNullOrPoison(Constant *C) {
+ const auto IsNullOrPoison = [](Constant *X) {
+ return X->isNullValue() || isa<UndefValue>(X) || isa<PoisonValue>(X);
+ };
+
+ if (auto *VecC = dyn_cast<ConstantVector>(C)) {
+ for (unsigned I = 0; I < VecC->getNumOperands(); ++I) {
+ if (IsNullOrPoison(VecC->getOperand(I)))
+ return true;
+ }
+ return false;
+ }
+
+ if (auto *DataVec = dyn_cast<ConstantDataVector>(C)) {
+ for (unsigned I = 0; I < DataVec->getNumElements(); ++I) {
+ if (IsNullOrPoison(DataVec->getElementAsConstant(I)))
+ return true;
+ }
+ return false;
+ }
+
+ return IsNullOrPoison(C);
+}
+
+bool VectorCombine::foldICmpEqZeroVectorReduce(Instruction &I) {
+ // vector.reduce.OP f(X_i) == 0 -> vector.reduce.OP X_i == 0
+ //
+ // We can prove it for cases when:
+ //
+ // 1. OP X_i == 0 <=> \forall i \in [1, N] X_i == 0
+ // 1'. OP X_i == 0 <=> \exists j \in [1, N] X_j == 0
+ // 2. f(x) == 0 <=> x == 0
+ //
+ // From 1 and 2 (or 1' and 2), we can infer that
+ //
+ // OP f(X_i) == 0 <=> OP X_i == 0.
+ //
+  // For some of the OPs and fs, we need domain constraints on X
+  // to ensure that properties 1 (or 1') and 2 hold.
+ CmpPredicate Pred;
+ Value *Op;
+ if (!match(&I, m_ICmp(Pred, m_Value(Op), m_Zero())) ||
+ !ICmpInst::isEquality(Pred))
+ return false;
+
+ auto *II = dyn_cast<IntrinsicInst>(Op);
+ if (!II)
+ return false;
+
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::vector_reduce_add:
+ case Intrinsic::vector_reduce_or:
+ case Intrinsic::vector_reduce_umin:
+ case Intrinsic::vector_reduce_umax:
+ case Intrinsic::vector_reduce_smin:
+ case Intrinsic::vector_reduce_smax:
+ break;
+ default:
+ return false;
+ }
+
+ Value *InnerOp = II->getArgOperand(0);
+
+ // TODO: fixed vector type might be too restrictive
+ if (!II->hasOneUse() || !InnerOp->hasOneUse() ||
+ !isa<FixedVectorType>(InnerOp->getType()))
+ return false;
+
+ Value *X = nullptr;
+ Constant *C = nullptr;
+
+ // Check for zero-preserving operations where f(x) = 0 <=> x = 0
+ //
+ // 1. f(x) = shl nuw x, y for arbitrary y
+ // 2. f(x) = mul nuw x, c for defined c != 0
+ // 3. f(x) = zext x
+ // 4. f(x) = sext x
+ // 5. f(x) = neg x
+ //
+ if (!(match(InnerOp, m_NUWShl(m_Value(X),
+ m_Value())) || // Case 1
+ (match(InnerOp, m_NUWMul(m_Value(X), m_Constant(C))) &&
+ !containsNullOrPoison(C)) || // Case 2
+ match(InnerOp, m_ZExt(m_Value(X))) || // Case 3
+ match(InnerOp, m_SExt(m_Value(X))) || // Case 4
+ match(InnerOp, m_Neg(m_Value(X))) // Case 5
+ ))
+ return false;
+
+ SimplifyQuery S = SQ.getWithInstruction(&I);
+ assert(isa<FixedVectorType>(X->getType()) && "Unexpected type");
+ auto *XTy = cast<FixedVectorType>(X->getType());
+
+ // Check for domain constraints for all supported reductions.
+ //
+ // a. OR X_i - has property 1 for every X
+ // b. UMAX X_i - has property 1 for every X
+ // c. UMIN X_i - has property 1' for every X
+ // d. SMAX X_i - has property 1 for X >= 0
+ // e. SMIN X_i - has property 1' for X >= 0
+ // f. ADD X_i - has property 1 for X >= 0 && ADD X_i doesn't sign wrap
+ //
+  // In order for the proof to work, we need 1 (or 1') to hold for both
+  // OP f(X_i) and OP X_i, which is why we check the constraints twice below.
+ //
+  // NOTE: ADD X_i satisfies property 1 in the mirror case as well, i.e. when
+  //       X <= 0 && ADD X_i doesn't sign wrap. However, due to the nature
+  //       of known bits, we cannot reasonably track the fact "either 0
+  //       or negative".
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::vector_reduce_add: {
+ // We need to check that both X_i and f(X_i) have enough leading
+ // zeros to not overflow.
+ KnownBits KnownX = computeKnownBits(X, S);
+ KnownBits KnownFX = computeKnownBits(InnerOp, S);
+ unsigned NumElems = XTy->getNumElements();
+ // Adding N elements loses at most bit_width(N-1) leading bits.
+ unsigned LostBits = NumElems > 1 ? llvm::bit_width(NumElems - 1) : 0;
+ unsigned LeadingZerosX = KnownX.countMinLeadingZeros();
+ unsigned LeadingZerosFX = KnownFX.countMinLeadingZeros();
+ // Need at least one leading zero left after summation to ensure no overflow
+ if (LeadingZerosX <= LostBits || LeadingZerosFX <= LostBits)
+ return false;
+
+ // We are not checking whether X or f(X) are positive explicitly because
+ // we implicitly checked for it when we checked if both cases have enough
+ // leading zeros to not wrap addition.
+ break;
+ }
+ case Intrinsic::vector_reduce_smin:
+ case Intrinsic::vector_reduce_smax:
+ // Check whether X >= 0 and f(X) >= 0
+ if (!isKnownNonNegative(InnerOp, S) || !isKnownNonNegative(X, S))
+ return false;
+
+ break;
+ default:
+ break;
+  }
+
+ // For zext/sext, check if the transform is profitable using cost model.
+ // For other operations (shl, mul, neg), we're removing an instruction
+ // while keeping the same reduction type, so it's always profitable.
+ if (isa<ZExtInst>(InnerOp) || isa<SExtInst>(InnerOp)) {
+ auto *FXTy = cast<FixedVectorType>(InnerOp->getType());
+ Intrinsic::ID IID = II->getIntrinsicID();
+
+ InstructionCost ExtCost = TTI.getCastInstrCost(
+ cast<CastInst>(InnerOp)->getOpcode(), FXTy, XTy,
+ TTI::CastContextHint::None, CostKind, cast<CastInst>(InnerOp));
+
+ InstructionCost OldReduceCost, NewReduceCost;
+ switch (IID) {
+ case Intrinsic::vector_reduce_add:
+ case Intrinsic::vector_reduce_or:
+ OldReduceCost = TTI.getArithmeticReductionCost(
+ getArithmeticReductionInstruction(IID), FXTy, std::nullopt, CostKind);
+ NewReduceCost = TTI.getArithmeticReductionCost(
+ getArithmeticReductionInstruction(IID), XTy, std::nullopt, CostKind);
+ break;
+ case Intrinsic::vector_reduce_umin:
+ case Intrinsic::vector_reduce_umax:
+ case Intrinsic::vector_reduce_smin:
+ case Intrinsic::vector_reduce_smax:
+ OldReduceCost =
+ TTI.getMinMaxReductionCost(IID, FXTy, FastMathFlags(), CostKind);
+ NewReduceCost =
+ TTI.getMinMaxReductionCost(IID, XTy, FastMathFlags(), CostKind);
+ break;
+ default:
+ llvm_unreachable("Unexpected reduction");
+ }
+
+ InstructionCost OldCost = ExtCost + OldReduceCost;
+ InstructionCost NewCost = NewReduceCost;
+
+ if (NewCost >= OldCost)
+ return false;
+ }
+
+ // Since we support zext and sext as f, we might change the scalar type
+ // of the intrinsic.
+ Type *Ty = XTy->getScalarType();
+ Value *NewReduce = Builder.CreateIntrinsic(Ty, II->getIntrinsicID(), {X});
+ Value *NewCmp =
+ Builder.CreateICmp(Pred, NewReduce, ConstantInt::getNullValue(Ty));
+ replaceValue(I, *NewCmp);
+ return true;
+}
+
/// Returns true if this ShuffleVectorInst eventually feeds into a
/// vector reduction intrinsic (e.g., vector_reduce_add) by only following
/// chains of shuffles and binary operators (in any combination/order).
@@ -4844,6 +5036,9 @@ bool VectorCombine::run() {
return true;
break;
case Instruction::ICmp:
+ if (foldICmpEqZeroVectorReduce(I))
+ return true;
+ [[fallthrough]];
case Instruction::FCmp:
if (foldExtractExtract(I))
return true;
diff --git a/llvm/test/Transforms/VectorCombine/icmp-vector-reduce.ll b/llvm/test/Transforms/VectorCombine/icmp-vector-reduce.ll
new file mode 100644
index 0000000000000..193d1cb18c5a8
--- /dev/null
+++ b/llvm/test/Transforms/VectorCombine/icmp-vector-reduce.ll
@@ -0,0 +1,600 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -passes=vector-combine -S < %s | FileCheck %s
+
+define i1 @or_zext(<4 x i16> %x) {
+; CHECK-LABEL: define i1 @or_zext(
+; CHECK-SAME: <4 x i16> [[X:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[X]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i16 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %zext = zext <4 x i16> %x to <4 x i32>
+ %red = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %zext)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @or_zext_i3(<4 x i3> %x) {
+; CHECK-LABEL: define i1 @or_zext_i3(
+; CHECK-SAME: <4 x i3> [[X:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call i3 @llvm.vector.reduce.or.v4i3(<4 x i3> [[X]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i3 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %zext = zext <4 x i3> %x to <4 x i32>
+ %red = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %zext)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @or_sext(<4 x i16> %x) {
+; CHECK-LABEL: define i1 @or_sext(
+; CHECK-SAME: <4 x i16> [[X:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[X]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i16 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %sext = sext <4 x i16> %x to <4 x i32>
+ %red = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %sext)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @or_neg(<4 x i32> %x) {
+; CHECK-LABEL: define i1 @or_neg(
+; CHECK-SAME: <4 x i32> [[X:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[X]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %neg = sub <4 x i32> zeroinitializer, %x
+ %red = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %neg)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @or_mul(<4 x i32> %x) {
+; CHECK-LABEL: define i1 @or_mul(
+; CHECK-SAME: <4 x i32> [[X:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[X]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %mul = mul nuw <4 x i32> %x, splat (i32 7)
+ %red = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %mul)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @or_shl(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: define i1 @or_shl(
+; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[X]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %shl = shl nuw <4 x i32> %x, %y
+ %red = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %shl)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @umin_zext(<4 x i16> %x) {
+; CHECK-LABEL: define i1 @umin_zext(
+; CHECK-SAME: <4 x i16> [[X:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> [[X]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i16 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %zext = zext <4 x i16> %x to <4 x i32>
+ %red = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %zext)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @umin_sext(<4 x i16> %x) {
+; CHECK-LABEL: define i1 @umin_sext(
+; CHECK-SAME: <4 x i16> [[X:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> [[X]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i16 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %sext = sext <4 x i16> %x to <4 x i32>
+ %red = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %sext)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @umin_neg(<4 x i32> %x) {
+; CHECK-LABEL: define i1 @umin_neg(
+; CHECK-SAME: <4 x i32> [[X:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[X]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %neg = sub <4 x i32> zeroinitializer, %x
+ %red = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %neg)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @umin_mul(<4 x i32> %x) {
+; CHECK-LABEL: define i1 @umin_mul(
+; CHECK-SAME: <4 x i32> [[X:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[X]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %mul = mul nuw <4 x i32> %x, splat (i32 7)
+ %red = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %mul)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @umin_shl(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: define i1 @umin_shl(
+; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[X]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %shl = shl nuw <4 x i32> %x, %y
+ %red = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %shl)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @umax_zext(<4 x i16> %x) {
+; CHECK-LABEL: define i1 @umax_zext(
+; CHECK-SAME: <4 x i16> [[X:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> [[X]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i16 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %zext = zext <4 x i16> %x to <4 x i32>
+ %red = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %zext)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @umax_sext(<4 x i16> %x) {
+; CHECK-LABEL: define i1 @umax_sext(
+; CHECK-SAME: <4 x i16> [[X:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> [[X]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i16 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %sext = sext <4 x i16> %x to <4 x i32>
+ %red = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %sext)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @umax_neg(<4 x i32> %x) {
+; CHECK-LABEL: define i1 @umax_neg(
+; CHECK-SAME: <4 x i32> [[X:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[X]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %neg = sub <4 x i32> zeroinitializer, %x
+ %red = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %neg)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @umax_mul(<4 x i32> %x) {
+; CHECK-LABEL: define i1 @umax_mul(
+; CHECK-SAME: <4 x i32> [[X:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[X]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %mul = mul nuw <4 x i32> %x, splat (i32 7)
+ %red = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %mul)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @umax_shl(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: define i1 @umax_shl(
+; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[X]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %shl = shl nuw <4 x i32> %x, %y
+ %red = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %shl)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @smin_zext(<4 x i16> %x) {
+; CHECK-LABEL: define i1 @smin_zext(
+; CHECK-SAME: <4 x i16> [[X:%.*]]) {
+; CHECK-NEXT: [[AND:%.*]] = and <4 x i16> [[X]], splat (i16 32767)
+; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> [[AND]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i16 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %and = and <4 x i16> %x, splat (i16 32767)
+ %zext = zext <4 x i16> %and to <4 x i32>
+ %red = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %zext)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @smin_sext(<4 x i16> %x) {
+; CHECK-LABEL: define i1 @smin_sext(
+; CHECK-SAME: <4 x i16> [[X:%.*]]) {
+; CHECK-NEXT: [[AND:%.*]] = and <4 x i16> [[X]], splat (i16 32767)
+; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> [[AND]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i16 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %and = and <4 x i16> %x, splat (i16 32767)
+ %sext = sext <4 x i16> %and to <4 x i32>
+ %red = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %sext)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+; neg is incompatible with smin constraints, expected not to combine
+define i1 @smin_neg(<4 x i16> %x) {
+; CHECK-LABEL: define i1 @smin_neg(
+; CHECK-SAME: <4 x i16> [[X:%.*]]) {
+; CHECK-NEXT: [[ZEXT:%.*]] = zext <4 x i16> [[X]] to <4 x i32>
+; CHECK-NEXT: [[NEG:%.*]] = sub nsw <4 x i32> zeroinitializer, [[ZEXT]]
+; CHECK-NEXT: [[RED:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[NEG]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[RED]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %zext = zext <4 x i16> %x to <4 x i32>
+ %neg = sub nsw <4 x i32> zeroinitializer, %zext
+ %red = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %neg)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @smin_mul(<4 x i16> %x) {
+; CHECK-LABEL: define i1 @smin_mul(
+; CHECK-SAME: <4 x i16> [[X:%.*]]) {
+; CHECK-NEXT: [[ZEXT:%.*]] = zext <4 x i16> [[X]] to <4 x i32>
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[ZEXT]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %zext = zext <4 x i16> %x to <4 x i32>
+ %mul = mul nuw nsw <4 x i32> %zext, splat (i32 7)
+ %red = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %mul)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @smin_shl(<4 x i16> %x, <4 x i32> %y) {
+; CHECK-LABEL: define i1 @smin_shl(
+; CHECK-SAME: <4 x i16> [[X:%.*]], <4 x i32> [[Y:%.*]]) {
+; CHECK-NEXT: [[ZEXT:%.*]] = zext <4 x i16> [[X]] to <4 x i32>
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[ZEXT]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %zext = zext <4 x i16> %x to <4 x i32>
+ %ymasked = and <4 x i32> %y, splat (i32 7)
+ %shl = shl nuw nsw <4 x i32> %zext, %ymasked
+ %red = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %shl)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @smax_zext(<4 x i16> %x) {
+; CHECK-LABEL: define i1 @smax_zext(
+; CHECK-SAME: <4 x i16> [[X:%.*]]) {
+; CHECK-NEXT: [[AND:%.*]] = and <4 x i16> [[X]], splat (i16 32767)
+; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> [[AND]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i16 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %and = and <4 x i16> %x, splat (i16 32767)
+ %zext = zext <4 x i16> %and to <4 x i32>
+ %red = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %zext)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @smax_sext(<4 x i16> %x) {
+; CHECK-LABEL: define i1 @smax_sext(
+; CHECK-SAME: <4 x i16> [[X:%.*]]) {
+; CHECK-NEXT: [[AND:%.*]] = and <4 x i16> [[X]], splat (i16 32767)
+; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> [[AND]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i16 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %and = and <4 x i16> %x, splat (i16 32767)
+ %sext = sext <4 x i16> %and to <4 x i32>
+ %red = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %sext)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+; neg is incompatible with smax constraints, expected not to combine
+define i1 @smax_neg(<4 x i16> %x) {
+; CHECK-LABEL: define i1 @smax_neg(
+; CHECK-SAME: <4 x i16> [[X:%.*]]) {
+; CHECK-NEXT: [[ZEXT:%.*]] = zext <4 x i16> [[X]] to <4 x i32>
+; CHECK-NEXT: [[NEG:%.*]] = sub nsw <4 x i32> zeroinitializer, [[ZEXT]]
+; CHECK-NEXT: [[RED:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[NEG]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[RED]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %zext = zext <4 x i16> %x to <4 x i32>
+ %neg = sub nsw <4 x i32> zeroinitializer, %zext
+ %red = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %neg)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @smax_mul(<4 x i16> %x) {
+; CHECK-LABEL: define i1 @smax_mul(
+; CHECK-SAME: <4 x i16> [[X:%.*]]) {
+; CHECK-NEXT: [[ZEXT:%.*]] = zext <4 x i16> [[X]] to <4 x i32>
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[ZEXT]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %zext = zext <4 x i16> %x to <4 x i32>
+ %mul = mul nuw nsw <4 x i32> %zext, splat (i32 7)
+ %red = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %mul)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @smax_shl(<4 x i16> %x, <4 x i32> %y) {
+; CHECK-LABEL: define i1 @smax_shl(
+; CHECK-SAME: <4 x i16> [[X:%.*]], <4 x i32> [[Y:%.*]]) {
+; CHECK-NEXT: [[ZEXT:%.*]] = zext <4 x i16> [[X]] to <4 x i32>
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[ZEXT]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %zext = zext <4 x i16> %x to <4 x i32>
+ %ymasked = and <4 x i32> %y, splat (i32 7)
+ %shl = shl nuw nsw <4 x i32> %zext, %ymasked
+ %red = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %shl)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @add_zext(<4 x i16> %x) {
+; CHECK-LABEL: define i1 @add_zext(
+; CHECK-SAME: <4 x i16> [[X:%.*]]) {
+; CHECK-NEXT: [[AND:%.*]] = and <4 x i16> [[X]], splat (i16 8191)
+; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> [[AND]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i16 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %and = and <4 x i16> %x, splat (i16 8191)
+ %zext = zext <4 x i16> %and to <4 x i32>
+ %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %zext)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @add_sext(<4 x i16> %x) {
+; CHECK-LABEL: define i1 @add_sext(
+; CHECK-SAME: <4 x i16> [[X:%.*]]) {
+; CHECK-NEXT: [[AND:%.*]] = and <4 x i16> [[X]], splat (i16 8191)
+; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> [[AND]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i16 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %and = and <4 x i16> %x, splat (i16 8191)
+ %sext = sext <4 x i16> %and to <4 x i32>
+ %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %sext)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+; neg is incompatible with add constraints, expected not to combine
+define i1 @add_neg(<4 x i16> %x) {
+; CHECK-LABEL: define i1 @add_neg(
+; CHECK-SAME: <4 x i16> [[X:%.*]]) {
+; CHECK-NEXT: [[ZEXT:%.*]] = zext <4 x i16> [[X]] to <4 x i32>
+; CHECK-NEXT: [[NEG:%.*]] = sub nsw <4 x i32> zeroinitializer, [[ZEXT]]
+; CHECK-NEXT: [[RED:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[NEG]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[RED]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %zext = zext <4 x i16> %x to <4 x i32>
+ %neg = sub nsw <4 x i32> zeroinitializer, %zext
+ %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %neg)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @add_mul(<4 x i16> %x) {
+; CHECK-LABEL: define i1 @add_mul(
+; CHECK-SAME: <4 x i16> [[X:%.*]]) {
+; CHECK-NEXT: [[ZEXT:%.*]] = zext <4 x i16> [[X]] to <4 x i32>
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[ZEXT]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %zext = zext <4 x i16> %x to <4 x i32>
+ %mul = mul nuw <4 x i32> %zext, splat (i32 7)
+ %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %mul)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @add_shl(<4 x i16> %x, <4 x i32> %y) {
+; CHECK-LABEL: define i1 @add_shl(
+; CHECK-SAME: <4 x i16> [[X:%.*]], <4 x i32> [[Y:%.*]]) {
+; CHECK-NEXT: [[ZEXT:%.*]] = zext <4 x i16> [[X]] to <4 x i32>
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[ZEXT]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %zext = zext <4 x i16> %x to <4 x i32>
+ %ymasked = and <4 x i32> %y, splat (i32 7)
+ %shl = shl nuw <4 x i32> %zext, %ymasked
+ %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %shl)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @add_shl_ne(<4 x i16> %x, <4 x i32> %y) {
+; CHECK-LABEL: define i1 @add_shl_ne(
+; CHECK-SAME: <4 x i16> [[X:%.*]], <4 x i32> [[Y:%.*]]) {
+; CHECK-NEXT: [[ZEXT:%.*]] = zext <4 x i16> [[X]] to <4 x i32>
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[ZEXT]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %zext = zext <4 x i16> %x to <4 x i32>
+ %ymasked = and <4 x i32> %y, splat (i32 7)
+ %shl = shl nuw <4 x i32> %zext, %ymasked
+ %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %shl)
+ %cmp = icmp ne i32 %red, 0
+ ret i1 %cmp
+}
+
+; x may be negative, expected not to combine
+define i1 @add_shl_negative(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: define i1 @add_shl_negative(
+; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]]) {
+; CHECK-NEXT: [[YMASKED:%.*]] = and <4 x i32> [[Y]], splat (i32 7)
+; CHECK-NEXT: [[SHL:%.*]] = shl nsw <4 x i32> [[X]], [[YMASKED]]
+; CHECK-NEXT: [[RED:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[SHL]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[RED]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %ymasked = and <4 x i32> %y, splat (i32 7)
+ %shl = shl nsw <4 x i32> %x, %ymasked
+ %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %shl)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+; comparison with non-zero, expected not to combine
+define i1 @add_shl_nonzero_cmp(<4 x i16> %x, <4 x i32> %y) {
+; CHECK-LABEL: define i1 @add_shl_nonzero_cmp(
+; CHECK-SAME: <4 x i16> [[X:%.*]], <4 x i32> [[Y:%.*]]) {
+; CHECK-NEXT: [[ZEXT:%.*]] = zext <4 x i16> [[X]] to <4 x i32>
+; CHECK-NEXT: [[YMASKED:%.*]] = and <4 x i32> [[Y]], splat (i32 7)
+; CHECK-NEXT: [[SHL:%.*]] = shl nsw <4 x i32> [[ZEXT]], [[YMASKED]]
+; CHECK-NEXT: [[RED:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[SHL]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[RED]], 42
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %zext = zext <4 x i16> %x to <4 x i32>
+ %ymasked = and <4 x i32> %y, splat (i32 7)
+ %shl = shl nsw <4 x i32> %zext, %ymasked
+ %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %shl)
+ %cmp = icmp eq i32 %red, 42
+ ret i1 %cmp
+}
+
+; shl has multiple uses, expected not to combine
+define i1 @add_shl_multiuse(<4 x i16> %x, <4 x i32> %y, ptr %p) {
+; CHECK-LABEL: define i1 @add_shl_multiuse(
+; CHECK-SAME: <4 x i16> [[X:%.*]], <4 x i32> [[Y:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT: [[ZEXT:%.*]] = zext <4 x i16> [[X]] to <4 x i32>
+; CHECK-NEXT: [[YMASKED:%.*]] = and <4 x i32> [[Y]], splat (i32 7)
+; CHECK-NEXT: [[SHL:%.*]] = shl nsw <4 x i32> [[ZEXT]], [[YMASKED]]
+; CHECK-NEXT: store <4 x i32> [[SHL]], ptr [[P]], align 16
+; CHECK-NEXT: [[RED:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[SHL]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[RED]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %zext = zext <4 x i16> %x to <4 x i32>
+ %ymasked = and <4 x i32> %y, splat (i32 7)
+ %shl = shl nsw <4 x i32> %zext, %ymasked
+ store <4 x i32> %shl, ptr %p
+ %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %shl)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+; shift amount unbounded, expected not to combine
+define i1 @add_shl_unbounded(<4 x i16> %x, <4 x i32> %y) {
+; CHECK-LABEL: define i1 @add_shl_unbounded(
+; CHECK-SAME: <4 x i16> [[X:%.*]], <4 x i32> [[Y:%.*]]) {
+; CHECK-NEXT: [[ZEXT:%.*]] = zext <4 x i16> [[X]] to <4 x i32>
+; CHECK-NEXT: [[SHL:%.*]] = shl nsw <4 x i32> [[ZEXT]], [[Y]]
+; CHECK-NEXT: [[RED:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[SHL]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[RED]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %zext = zext <4 x i16> %x to <4 x i32>
+ %shl = shl nsw <4 x i32> %zext, %y
+ %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %shl)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @add_mul_nonsplat(<4 x i16> %x) {
+; CHECK-LABEL: define i1 @add_mul_nonsplat(
+; CHECK-SAME: <4 x i16> [[X:%.*]]) {
+; CHECK-NEXT: [[ZEXT:%.*]] = zext <4 x i16> [[X]] to <4 x i32>
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[ZEXT]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %zext = zext <4 x i16> %x to <4 x i32>
+ %mul = mul nuw <4 x i32> %zext, <i32 1, i32 2, i32 3, i32 4>
+ %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %mul)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+; constant has zero lane, expected not to combine
+define i1 @add_mul_zero_lane(<4 x i16> %x) {
+; CHECK-LABEL: define i1 @add_mul_zero_lane(
+; CHECK-SAME: <4 x i16> [[X:%.*]]) {
+; CHECK-NEXT: [[ZEXT:%.*]] = zext <4 x i16> [[X]] to <4 x i32>
+; CHECK-NEXT: [[MUL:%.*]] = mul nuw <4 x i32> [[ZEXT]], <i32 1, i32 0, i32 3, i32 4>
+; CHECK-NEXT: [[RED:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[MUL]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[RED]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %zext = zext <4 x i16> %x to <4 x i32>
+ %mul = mul nuw <4 x i32> %zext, <i32 1, i32 0, i32 3, i32 4>
+ %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %mul)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+; constant has poison lane, expected not to combine
+define i1 @add_mul_poison(<4 x i16> %x) {
+; CHECK-LABEL: define i1 @add_mul_poison(
+; CHECK-SAME: <4 x i16> [[X:%.*]]) {
+; CHECK-NEXT: [[ZEXT:%.*]] = zext <4 x i16> [[X]] to <4 x i32>
+; CHECK-NEXT: [[MUL:%.*]] = mul nuw <4 x i32> [[ZEXT]], <i32 1, i32 poison, i32 3, i32 4>
+; CHECK-NEXT: [[RED:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[MUL]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[RED]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %zext = zext <4 x i16> %x to <4 x i32>
+ %mul = mul nuw <4 x i32> %zext, <i32 1, i32 poison, i32 3, i32 4>
+ %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %mul)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+; constant has negative lane, expected not to combine
+define i1 @add_mul_neg_const(<4 x i16> %x) {
+; CHECK-LABEL: define i1 @add_mul_neg_const(
+; CHECK-SAME: <4 x i16> [[X:%.*]]) {
+; CHECK-NEXT: [[ZEXT:%.*]] = zext <4 x i16> [[X]] to <4 x i32>
+; CHECK-NEXT: [[MUL:%.*]] = mul nuw <4 x i32> [[ZEXT]], <i32 3, i32 -1, i32 2, i32 5>
+; CHECK-NEXT: [[RED:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[MUL]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[RED]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %zext = zext <4 x i16> %x to <4 x i32>
+ %mul = mul nuw <4 x i32> %zext, <i32 3, i32 -1, i32 2, i32 5>
+ %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %mul)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}