[llvm] [VectorCombine] Fold vector.reduce.OP(F(X)) == 0 -> OP(X) == 0 (PR #173069)
Valeriy Savchenko via llvm-commits
llvm-commits at lists.llvm.org
Fri Jan 23 10:09:46 PST 2026
https://github.com/SavchenkoValeriy updated https://github.com/llvm/llvm-project/pull/173069
>From 3c6f2a71e84151ecbdd9fc0b87fe58bf69cf0489 Mon Sep 17 00:00:00 2001
From: Valeriy Savchenko <vsavchenko at apple.com>
Date: Tue, 16 Dec 2025 14:52:59 +0000
Subject: [PATCH] [VectorCombine] Fold vector.reduce.OP(F(X)) == 0 -> OP(X) ==
0
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
This commit introduces a pattern to perform the following fold:
vector.reduce.OP f(X_i) == 0 -> vector.reduce.OP X_i == 0
In order to decide on this fold, we use the following properties:
1. OP X_i == 0 <=> \forall i \in [1, N] X_i == 0
1'. OP X_i == 0 <=> \exists j \in [1, N] X_j == 0
2. f(x) == 0 <=> x == 0
>From 1 and 2 (or 1' and 2), we can infer that
OP f(X_i) == 0 <=> OP X_i == 0.
For some of the OP's and f's, we need to have domain constraints on X
to ensure properties 1 (or 1') and 2.
In this change we support the following operations f:
1. f(x) = shl nuw x, y for arbitrary y
2. f(x) = mul nuw x, c for defined c != 0
3. f(x) = zext x
4. f(x) = sext x
5. f(x) = neg x
And the following reductions OP:
a. OR X_i - has property 1 for every X
b. UMAX X_i - has property 1 for every X
c. UMIN X_i - has property 1' for every X
d. SMAX X_i - has property 1 for X >= 0
e. SMIN X_i - has property 1' for X >= 0
f. ADD X_i - has property 1 for X >= 0 && ADD X_i doesn't sign wrap
The matrix of Alive2 proofs for every pair of {f,OP}:
| OP\f | zext | sext | neg | mul | shl |
|------|------|------|-----|-----|-----|
| or | [proof](https://alive2.llvm.org/ce/z/EqHAPd) | [proof](https://alive2.llvm.org/ce/z/DS3eP2) | [proof](https://alive2.llvm.org/ce/z/65A5x9) | [proof](https://alive2.llvm.org/ce/z/TVPpUf) | [proof](https://alive2.llvm.org/ce/z/kj--vH) |
| umin | [proof](https://alive2.llvm.org/ce/z/AK39LL) | [proof](https://alive2.llvm.org/ce/z/xEPH2S) | [proof](https://alive2.llvm.org/ce/z/N-ubNr) | [proof](https://alive2.llvm.org/ce/z/dgUEH4) | [proof](https://alive2.llvm.org/ce/z/2TUNDu) |
| umax | [proof](https://alive2.llvm.org/ce/z/Cy_DJS) | [proof](https://alive2.llvm.org/ce/z/f42bGQ) | [proof](https://alive2.llvm.org/ce/z/ReUx4M) | [proof](https://alive2.llvm.org/ce/z/qSsvdG) | [proof](https://alive2.llvm.org/ce/z/cE3Qgw) |
| smin | [proof](https://alive2.llvm.org/ce/z/j5TwTA) | [proof](https://alive2.llvm.org/ce/z/DhNxPQ) | — | [proof](https://alive2.llvm.org/ce/z/m03AOt) | [proof](https://alive2.llvm.org/ce/z/bp58Q3) |
| smax | [proof](https://alive2.llvm.org/ce/z/3zmbRn) | [proof](https://alive2.llvm.org/ce/z/6FTfRJ) | — | [proof](https://alive2.llvm.org/ce/z/KDfKEW) | [proof](https://alive2.llvm.org/ce/z/dajm7T) |
| add | [proof](https://alive2.llvm.org/ce/z/3kt7BB) | [proof](https://alive2.llvm.org/ce/z/cyqzQH) | — | [proof](https://alive2.llvm.org/ce/z/n_oGjT) | [proof](https://alive2.llvm.org/ce/z/67bkJm) |
Proofs for known bits:
* Leading zeros - [v4i32](https://alive2.llvm.org/ce/z/w--S2D), [v16i8](https://alive2.llvm.org/ce/z/hEdVks)
* Leading ones - [v4i16](https://alive2.llvm.org/ce/z/RyPdBS), [v16i8](https://alive2.llvm.org/ce/z/UTFFt9)
---
llvm/include/llvm/IR/PatternMatch.h | 9 +
.../Transforms/Vectorize/VectorCombine.cpp | 183 +++++
.../AArch64/icmp-vector-reduce.ll | 672 ++++++++++++++++++
.../VectorCombine/X86/icmp-vector-reduce.ll | 672 ++++++++++++++++++
4 files changed, 1536 insertions(+)
create mode 100644 llvm/test/Transforms/VectorCombine/AArch64/icmp-vector-reduce.ll
create mode 100644 llvm/test/Transforms/VectorCombine/X86/icmp-vector-reduce.ll
diff --git a/llvm/include/llvm/IR/PatternMatch.h b/llvm/include/llvm/IR/PatternMatch.h
index 9bed9f2207d79..3f56de3bf1fb0 100644
--- a/llvm/include/llvm/IR/PatternMatch.h
+++ b/llvm/include/llvm/IR/PatternMatch.h
@@ -606,6 +606,15 @@ inline cst_pred_ty<is_zero_int> m_ZeroInt() {
return cst_pred_ty<is_zero_int>();
}
+struct is_non_zero_int {
+ bool isValue(const APInt &C) const { return !C.isZero(); }
+};
+/// Match a non-zero integer or a vector with all non-zero elements.
+/// For vectors, this includes constants with undefined elements.
+inline cst_pred_ty<is_non_zero_int> m_NonZeroInt() {
+ return cst_pred_ty<is_non_zero_int>();
+}
+
struct is_zero {
template <typename ITy> bool match(ITy *V) const {
auto *C = dyn_cast<Constant>(V);
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index a103b8ddaf608..519c0b44bf223 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -33,6 +33,7 @@
#include "llvm/IR/Instructions.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include <numeric>
@@ -148,6 +149,7 @@ class VectorCombine {
bool foldShuffleChainsToReduce(Instruction &I);
bool foldCastFromReductions(Instruction &I);
bool foldSignBitReductionCmp(Instruction &I);
+ bool foldICmpEqZeroVectorReduce(Instruction &I);
bool foldSelectShuffle(Instruction &I, bool FromReduction = false);
bool foldInterleaveIntrinsics(Instruction &I);
bool shrinkType(Instruction &I);
@@ -4367,7 +4369,186 @@ bool VectorCombine::foldSignBitReductionCmp(Instruction &I) {
Value *NewReduce = Builder.CreateIntrinsic(ScalarTy, NewIID, {X});
Value *NewCmp = IsNegativeCheck(Check) ? Builder.CreateIsNeg(NewReduce)
: Builder.CreateIsNotNeg(NewReduce);
+ replaceValue(I, *NewCmp);
+ return true;
+}
+
+/// vector.reduce.OP f(X_i) == 0 -> vector.reduce.OP X_i == 0
+///
+/// We can prove it for cases when:
+///
+/// 1. OP X_i == 0 <=> \forall i \in [1, N] X_i == 0
+/// 1'. OP X_i == 0 <=> \exists j \in [1, N] X_j == 0
+/// 2. f(x) == 0 <=> x == 0
+///
+/// From 1 and 2 (or 1' and 2), we can infer that
+///
+/// OP f(X_i) == 0 <=> OP X_i == 0.
+///
+/// (1)
+/// OP f(X_i) == 0 <=> \forall i \in [1, N] f(X_i) == 0
+/// (2)
+/// <=> \forall i \in [1, N] X_i == 0
+/// (1)
+/// <=> OP(X_i) == 0
+///
+/// For some of the OP's and f's, we need to have domain constraints on X
+/// to ensure properties 1 (or 1') and 2.
+bool VectorCombine::foldICmpEqZeroVectorReduce(Instruction &I) {
+ CmpPredicate Pred;
+ Value *Op;
+ if (!match(&I, m_ICmp(Pred, m_Value(Op), m_Zero())) ||
+ !ICmpInst::isEquality(Pred))
+ return false;
+
+ auto *II = dyn_cast<IntrinsicInst>(Op);
+ if (!II)
+ return false;
+
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::vector_reduce_add:
+ case Intrinsic::vector_reduce_or:
+ case Intrinsic::vector_reduce_umin:
+ case Intrinsic::vector_reduce_umax:
+ case Intrinsic::vector_reduce_smin:
+ case Intrinsic::vector_reduce_smax:
+ break;
+ default:
+ return false;
+ }
+
+ Value *InnerOp = II->getArgOperand(0);
+ // TODO: fixed vector type might be too restrictive
+ if (!II->hasOneUse() || !isa<FixedVectorType>(InnerOp->getType()))
+ return false;
+
+ Value *X = nullptr;
+
+ // Check for zero-preserving operations where f(x) = 0 <=> x = 0
+ //
+ // 1. f(x) = shl nuw x, y for arbitrary y
+ // 2. f(x) = mul nuw x, c for defined c != 0
+ // 3. f(x) = zext x
+ // 4. f(x) = sext x
+ // 5. f(x) = neg x
+ //
+ if (!(match(InnerOp, m_NUWShl(m_Value(X), m_Value())) || // Case 1
+ match(InnerOp, m_NUWMul(m_Value(X), m_NonZeroInt())) || // Case 2
+ match(InnerOp, m_ZExt(m_Value(X))) || // Case 3
+ match(InnerOp, m_SExt(m_Value(X))) || // Case 4
+ match(InnerOp, m_Neg(m_Value(X))) // Case 5
+ ))
+ return false;
+
+ SimplifyQuery S = SQ.getWithInstruction(&I);
+ auto *XTy = cast<FixedVectorType>(X->getType());
+
+ // Check for domain constraints for all supported reductions.
+ //
+ // a. OR X_i - has property 1 for every X
+ // b. UMAX X_i - has property 1 for every X
+ // c. UMIN X_i - has property 1' for every X
+ // d. SMAX X_i - has property 1 for X >= 0
+ // e. SMIN X_i - has property 1' for X >= 0
+ // f. ADD X_i - has property 1 for X >= 0 && ADD X_i doesn't sign wrap
+ //
+ // In order for the proof to work, we need 1 (or 1') to be true for both
+ // OP f(X_i) and OP X_i and that's why below we check constraints twice.
+ //
+ // NOTE: ADD X_i holds property 1 for a mirror case as well, i.e. when
+ // X <= 0 && ADD X_i doesn't sign wrap. However, due to the nature
+ // of known bits, we can't reasonably hold knowledge of "either 0
+ // or negative".
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::vector_reduce_add: {
+ // We need to check that both X_i and f(X_i) have enough leading
+ // zeros to not overflow.
+ KnownBits KnownX = computeKnownBits(X, S);
+ KnownBits KnownFX = computeKnownBits(InnerOp, S);
+ unsigned NumElems = XTy->getNumElements();
+ // Adding N elements loses at most ceil(log2(N)) leading bits.
+ unsigned LostBits = Log2_32_Ceil(NumElems);
+ unsigned LeadingZerosX = KnownX.countMinLeadingZeros();
+ unsigned LeadingZerosFX = KnownFX.countMinLeadingZeros();
+ // Need at least one leading zero left after summation to ensure no overflow
+ if (LeadingZerosX <= LostBits || LeadingZerosFX <= LostBits)
+ return false;
+
+ // We are not checking whether X or f(X) are positive explicitly because
+ // we implicitly checked for it when we checked if both cases have enough
+ // leading zeros to not wrap addition.
+ break;
+ }
+ case Intrinsic::vector_reduce_smin:
+ case Intrinsic::vector_reduce_smax:
+ // Check whether X >= 0 and f(X) >= 0
+ if (!isKnownNonNegative(InnerOp, S) || !isKnownNonNegative(X, S))
+ return false;
+
+ break;
+ default:
+ break;
+ };
+
+ LLVM_DEBUG(dbgs() << "Found a reduction to 0 comparison with removable op: "
+ << *II << "\n");
+
+ // For zext/sext, check if the transform is profitable using cost model.
+ // For other operations (shl, mul, neg), we're removing an instruction
+ // while keeping the same reduction type, so it's always profitable.
+ if (isa<ZExtInst>(InnerOp) || isa<SExtInst>(InnerOp)) {
+ auto *FXTy = cast<FixedVectorType>(InnerOp->getType());
+ Intrinsic::ID IID = II->getIntrinsicID();
+
+ InstructionCost ExtCost = TTI.getCastInstrCost(
+ cast<CastInst>(InnerOp)->getOpcode(), FXTy, XTy,
+ TTI::CastContextHint::None, CostKind, cast<CastInst>(InnerOp));
+
+ InstructionCost OldReduceCost, NewReduceCost;
+ switch (IID) {
+ case Intrinsic::vector_reduce_add:
+ case Intrinsic::vector_reduce_or:
+ OldReduceCost = TTI.getArithmeticReductionCost(
+ getArithmeticReductionInstruction(IID), FXTy, std::nullopt, CostKind);
+ NewReduceCost = TTI.getArithmeticReductionCost(
+ getArithmeticReductionInstruction(IID), XTy, std::nullopt, CostKind);
+ break;
+ case Intrinsic::vector_reduce_umin:
+ case Intrinsic::vector_reduce_umax:
+ case Intrinsic::vector_reduce_smin:
+ case Intrinsic::vector_reduce_smax:
+ OldReduceCost =
+ TTI.getMinMaxReductionCost(IID, FXTy, FastMathFlags(), CostKind);
+ NewReduceCost =
+ TTI.getMinMaxReductionCost(IID, XTy, FastMathFlags(), CostKind);
+ break;
+ default:
+ llvm_unreachable("Unexpected reduction");
+ }
+
+ InstructionCost OldCost = OldReduceCost + ExtCost;
+ InstructionCost NewCost =
+ NewReduceCost + (InnerOp->hasOneUse() ? 0 : ExtCost);
+
+ LLVM_DEBUG(dbgs() << "Found a removable extension before reduction: "
+ << *InnerOp << "\n OldCost: " << OldCost
+ << " vs NewCost: " << NewCost << "\n");
+
+ // We consider transformation to still be potentially beneficial even
+ // when the costs are the same because we might remove a use from f(X)
+ // and unlock other optimizations. Equal costs would just mean that we
+ // didn't make it worse in the worst case.
+ if (NewCost > OldCost)
+ return false;
+ }
+
+ // Since we support zext and sext as f, we might change the scalar type
+ // of the intrinsic.
+ Type *Ty = XTy->getScalarType();
+ Value *NewReduce = Builder.CreateIntrinsic(Ty, II->getIntrinsicID(), {X});
+ Value *NewCmp =
+ Builder.CreateICmp(Pred, NewReduce, ConstantInt::getNullValue(Ty));
replaceValue(I, *NewCmp);
return true;
}
@@ -5406,6 +5587,8 @@ bool VectorCombine::run() {
case Instruction::ICmp:
if (foldSignBitReductionCmp(I))
return true;
+ if (foldICmpEqZeroVectorReduce(I))
+ return true;
[[fallthrough]];
case Instruction::FCmp:
if (foldExtractExtract(I))
diff --git a/llvm/test/Transforms/VectorCombine/AArch64/icmp-vector-reduce.ll b/llvm/test/Transforms/VectorCombine/AArch64/icmp-vector-reduce.ll
new file mode 100644
index 0000000000000..6fa19fe673b3a
--- /dev/null
+++ b/llvm/test/Transforms/VectorCombine/AArch64/icmp-vector-reduce.ll
@@ -0,0 +1,672 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -passes=vector-combine -S -mtriple=aarch64-linux-gnu < %s | FileCheck %s
+
+define i1 @or_zext(<4 x i16> %x) {
+; CHECK-LABEL: define i1 @or_zext(
+; CHECK-SAME: <4 x i16> [[X:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[X]])
+; CHECK-NEXT: [[RED:%.*]] = zext i16 [[TMP1]] to i32
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[RED]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %zext = zext <4 x i16> %x to <4 x i32>
+ %red = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %zext)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @or_zext_i3(<4 x i3> %x) {
+; CHECK-LABEL: define i1 @or_zext_i3(
+; CHECK-SAME: <4 x i3> [[X:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call i3 @llvm.vector.reduce.or.v4i3(<4 x i3> [[X]])
+; CHECK-NEXT: [[RED:%.*]] = zext i3 [[TMP1]] to i32
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[RED]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %zext = zext <4 x i3> %x to <4 x i32>
+ %red = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %zext)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @or_zext_v3_costly(<3 x i8> %x) {
+; CHECK-LABEL: define i1 @or_zext_v3_costly(
+; CHECK-SAME: <3 x i8> [[X:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.vector.reduce.or.v3i8(<3 x i8> [[X]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %zext = zext <3 x i8> %x to <3 x i32>
+ %red = call i32 @llvm.vector.reduce.or.v3i32(<3 x i32> %zext)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @or_sext(<4 x i16> %x) {
+; CHECK-LABEL: define i1 @or_sext(
+; CHECK-SAME: <4 x i16> [[X:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[X]])
+; CHECK-NEXT: [[RED:%.*]] = sext i16 [[TMP1]] to i32
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[RED]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %sext = sext <4 x i16> %x to <4 x i32>
+ %red = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %sext)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @or_neg(<4 x i32> %x) {
+; CHECK-LABEL: define i1 @or_neg(
+; CHECK-SAME: <4 x i32> [[X:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[X]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %neg = sub <4 x i32> zeroinitializer, %x
+ %red = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %neg)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @or_mul(<4 x i32> %x) {
+; CHECK-LABEL: define i1 @or_mul(
+; CHECK-SAME: <4 x i32> [[X:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[X]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %mul = mul nuw <4 x i32> %x, splat (i32 7)
+ %red = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %mul)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @or_shl(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: define i1 @or_shl(
+; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[X]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %shl = shl nuw <4 x i32> %x, %y
+ %red = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %shl)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @umin_zext(<4 x i16> %x) {
+; CHECK-LABEL: define i1 @umin_zext(
+; CHECK-SAME: <4 x i16> [[X:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> [[X]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i16 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %zext = zext <4 x i16> %x to <4 x i32>
+ %red = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %zext)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @umin_sext(<4 x i16> %x) {
+; CHECK-LABEL: define i1 @umin_sext(
+; CHECK-SAME: <4 x i16> [[X:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> [[X]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i16 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %sext = sext <4 x i16> %x to <4 x i32>
+ %red = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %sext)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @umin_neg(<4 x i32> %x) {
+; CHECK-LABEL: define i1 @umin_neg(
+; CHECK-SAME: <4 x i32> [[X:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[X]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %neg = sub <4 x i32> zeroinitializer, %x
+ %red = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %neg)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @umin_mul(<4 x i32> %x) {
+; CHECK-LABEL: define i1 @umin_mul(
+; CHECK-SAME: <4 x i32> [[X:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[X]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %mul = mul nuw <4 x i32> %x, splat (i32 7)
+ %red = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %mul)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @umin_shl(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: define i1 @umin_shl(
+; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[X]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %shl = shl nuw <4 x i32> %x, %y
+ %red = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %shl)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @umax_zext(<4 x i16> %x) {
+; CHECK-LABEL: define i1 @umax_zext(
+; CHECK-SAME: <4 x i16> [[X:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> [[X]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i16 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %zext = zext <4 x i16> %x to <4 x i32>
+ %red = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %zext)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @umax_sext(<4 x i16> %x) {
+; CHECK-LABEL: define i1 @umax_sext(
+; CHECK-SAME: <4 x i16> [[X:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> [[X]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i16 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %sext = sext <4 x i16> %x to <4 x i32>
+ %red = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %sext)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @umax_neg(<4 x i32> %x) {
+; CHECK-LABEL: define i1 @umax_neg(
+; CHECK-SAME: <4 x i32> [[X:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[X]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %neg = sub <4 x i32> zeroinitializer, %x
+ %red = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %neg)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @umax_mul(<4 x i32> %x) {
+; CHECK-LABEL: define i1 @umax_mul(
+; CHECK-SAME: <4 x i32> [[X:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[X]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %mul = mul nuw <4 x i32> %x, splat (i32 7)
+ %red = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %mul)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @umax_shl(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: define i1 @umax_shl(
+; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[X]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %shl = shl nuw <4 x i32> %x, %y
+ %red = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %shl)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @smin_zext(<4 x i16> %x) {
+; CHECK-LABEL: define i1 @smin_zext(
+; CHECK-SAME: <4 x i16> [[X:%.*]]) {
+; CHECK-NEXT: [[AND:%.*]] = and <4 x i16> [[X]], splat (i16 32767)
+; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> [[AND]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i16 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %and = and <4 x i16> %x, splat (i16 32767)
+ %zext = zext <4 x i16> %and to <4 x i32>
+ %red = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %zext)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @smin_sext(<4 x i16> %x) {
+; CHECK-LABEL: define i1 @smin_sext(
+; CHECK-SAME: <4 x i16> [[X:%.*]]) {
+; CHECK-NEXT: [[AND:%.*]] = and <4 x i16> [[X]], splat (i16 32767)
+; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> [[AND]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i16 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %and = and <4 x i16> %x, splat (i16 32767)
+ %sext = sext <4 x i16> %and to <4 x i32>
+ %red = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %sext)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+; neg is incompatible with smin constraints, expected not to combine
+define i1 @smin_neg(<4 x i16> %x) {
+; CHECK-LABEL: define i1 @smin_neg(
+; CHECK-SAME: <4 x i16> [[X:%.*]]) {
+; CHECK-NEXT: [[ZEXT:%.*]] = zext <4 x i16> [[X]] to <4 x i32>
+; CHECK-NEXT: [[NEG:%.*]] = sub nsw <4 x i32> zeroinitializer, [[ZEXT]]
+; CHECK-NEXT: [[RED:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[NEG]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[RED]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %zext = zext <4 x i16> %x to <4 x i32>
+ %neg = sub nsw <4 x i32> zeroinitializer, %zext
+ %red = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %neg)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @smin_mul(<4 x i16> %x) {
+; CHECK-LABEL: define i1 @smin_mul(
+; CHECK-SAME: <4 x i16> [[X:%.*]]) {
+; CHECK-NEXT: [[ZEXT:%.*]] = zext <4 x i16> [[X]] to <4 x i32>
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[ZEXT]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %zext = zext <4 x i16> %x to <4 x i32>
+ %mul = mul nuw nsw <4 x i32> %zext, splat (i32 7)
+ %red = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %mul)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @smin_shl(<4 x i16> %x, <4 x i32> %y) {
+; CHECK-LABEL: define i1 @smin_shl(
+; CHECK-SAME: <4 x i16> [[X:%.*]], <4 x i32> [[Y:%.*]]) {
+; CHECK-NEXT: [[ZEXT:%.*]] = zext <4 x i16> [[X]] to <4 x i32>
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[ZEXT]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %zext = zext <4 x i16> %x to <4 x i32>
+ %ymasked = and <4 x i32> %y, splat (i32 7)
+ %shl = shl nuw nsw <4 x i32> %zext, %ymasked
+ %red = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %shl)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @smax_zext(<4 x i16> %x) {
+; CHECK-LABEL: define i1 @smax_zext(
+; CHECK-SAME: <4 x i16> [[X:%.*]]) {
+; CHECK-NEXT: [[AND:%.*]] = and <4 x i16> [[X]], splat (i16 32767)
+; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> [[AND]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i16 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %and = and <4 x i16> %x, splat (i16 32767)
+ %zext = zext <4 x i16> %and to <4 x i32>
+ %red = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %zext)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @smax_sext(<4 x i16> %x) {
+; CHECK-LABEL: define i1 @smax_sext(
+; CHECK-SAME: <4 x i16> [[X:%.*]]) {
+; CHECK-NEXT: [[AND:%.*]] = and <4 x i16> [[X]], splat (i16 32767)
+; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> [[AND]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i16 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %and = and <4 x i16> %x, splat (i16 32767)
+ %sext = sext <4 x i16> %and to <4 x i32>
+ %red = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %sext)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+; neg is incompatible with smax constraints, expected not to combine
+define i1 @smax_neg(<4 x i16> %x) {
+; CHECK-LABEL: define i1 @smax_neg(
+; CHECK-SAME: <4 x i16> [[X:%.*]]) {
+; CHECK-NEXT: [[ZEXT:%.*]] = zext <4 x i16> [[X]] to <4 x i32>
+; CHECK-NEXT: [[NEG:%.*]] = sub nsw <4 x i32> zeroinitializer, [[ZEXT]]
+; CHECK-NEXT: [[RED:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[NEG]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[RED]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %zext = zext <4 x i16> %x to <4 x i32>
+ %neg = sub nsw <4 x i32> zeroinitializer, %zext
+ %red = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %neg)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @smax_mul(<4 x i16> %x) {
+; CHECK-LABEL: define i1 @smax_mul(
+; CHECK-SAME: <4 x i16> [[X:%.*]]) {
+; CHECK-NEXT: [[ZEXT:%.*]] = zext <4 x i16> [[X]] to <4 x i32>
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[ZEXT]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %zext = zext <4 x i16> %x to <4 x i32>
+ %mul = mul nuw nsw <4 x i32> %zext, splat (i32 7)
+ %red = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %mul)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @smax_shl(<4 x i16> %x, <4 x i32> %y) {
+; CHECK-LABEL: define i1 @smax_shl(
+; CHECK-SAME: <4 x i16> [[X:%.*]], <4 x i32> [[Y:%.*]]) {
+; CHECK-NEXT: [[ZEXT:%.*]] = zext <4 x i16> [[X]] to <4 x i32>
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[ZEXT]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %zext = zext <4 x i16> %x to <4 x i32>
+ %ymasked = and <4 x i32> %y, splat (i32 7)
+ %shl = shl nuw nsw <4 x i32> %zext, %ymasked
+ %red = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %shl)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @add_zext(<4 x i16> %x) {
+; CHECK-LABEL: define i1 @add_zext(
+; CHECK-SAME: <4 x i16> [[X:%.*]]) {
+; CHECK-NEXT: [[AND:%.*]] = and <4 x i16> [[X]], splat (i16 8191)
+; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> [[AND]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i16 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %and = and <4 x i16> %x, splat (i16 8191)
+ %zext = zext <4 x i16> %and to <4 x i32>
+ %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %zext)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @add_sext(<4 x i16> %x) {
+; CHECK-LABEL: define i1 @add_sext(
+; CHECK-SAME: <4 x i16> [[X:%.*]]) {
+; CHECK-NEXT: [[AND:%.*]] = and <4 x i16> [[X]], splat (i16 8191)
+; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> [[AND]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i16 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %and = and <4 x i16> %x, splat (i16 8191)
+ %sext = sext <4 x i16> %and to <4 x i32>
+ %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %sext)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+; neg is incompatible with add constraints, expected not to combine
+define i1 @add_neg(<4 x i16> %x) {
+; CHECK-LABEL: define i1 @add_neg(
+; CHECK-SAME: <4 x i16> [[X:%.*]]) {
+; CHECK-NEXT: [[ZEXT:%.*]] = zext <4 x i16> [[X]] to <4 x i32>
+; CHECK-NEXT: [[NEG:%.*]] = sub nsw <4 x i32> zeroinitializer, [[ZEXT]]
+; CHECK-NEXT: [[RED:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[NEG]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[RED]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %zext = zext <4 x i16> %x to <4 x i32>
+ %neg = sub nsw <4 x i32> zeroinitializer, %zext
+ %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %neg)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @add_mul(<4 x i16> %x) {
+; CHECK-LABEL: define i1 @add_mul(
+; CHECK-SAME: <4 x i16> [[X:%.*]]) {
+; CHECK-NEXT: [[ZEXT:%.*]] = zext <4 x i16> [[X]] to <4 x i32>
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[ZEXT]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %zext = zext <4 x i16> %x to <4 x i32>
+ %mul = mul nuw <4 x i32> %zext, splat (i32 7)
+ %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %mul)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @add_shl(<4 x i16> %x, <4 x i32> %y) {
+; CHECK-LABEL: define i1 @add_shl(
+; CHECK-SAME: <4 x i16> [[X:%.*]], <4 x i32> [[Y:%.*]]) {
+; CHECK-NEXT: [[ZEXT:%.*]] = zext <4 x i16> [[X]] to <4 x i32>
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[ZEXT]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %zext = zext <4 x i16> %x to <4 x i32>
+ %ymasked = and <4 x i32> %y, splat (i32 7)
+ %shl = shl nuw <4 x i32> %zext, %ymasked
+ %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %shl)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @add_shl_ne(<4 x i16> %x, <4 x i32> %y) {
+; CHECK-LABEL: define i1 @add_shl_ne(
+; CHECK-SAME: <4 x i16> [[X:%.*]], <4 x i32> [[Y:%.*]]) {
+; CHECK-NEXT: [[ZEXT:%.*]] = zext <4 x i16> [[X]] to <4 x i32>
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[ZEXT]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %zext = zext <4 x i16> %x to <4 x i32>
+ %ymasked = and <4 x i32> %y, splat (i32 7)
+ %shl = shl nuw <4 x i32> %zext, %ymasked
+ %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %shl)
+ %cmp = icmp ne i32 %red, 0
+ ret i1 %cmp
+}
+
+; x may be negative, expected not to combine
+define i1 @add_shl_negative(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: define i1 @add_shl_negative(
+; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]]) {
+; CHECK-NEXT: [[YMASKED:%.*]] = and <4 x i32> [[Y]], splat (i32 7)
+; CHECK-NEXT: [[SHL:%.*]] = shl nsw <4 x i32> [[X]], [[YMASKED]]
+; CHECK-NEXT: [[RED:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[SHL]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[RED]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %ymasked = and <4 x i32> %y, splat (i32 7)
+ %shl = shl nsw <4 x i32> %x, %ymasked
+ %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %shl)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+; comparison with non-zero, expected not to combine
+define i1 @add_shl_nonzero_cmp(<4 x i16> %x, <4 x i32> %y) {
+; CHECK-LABEL: define i1 @add_shl_nonzero_cmp(
+; CHECK-SAME: <4 x i16> [[X:%.*]], <4 x i32> [[Y:%.*]]) {
+; CHECK-NEXT: [[ZEXT:%.*]] = zext <4 x i16> [[X]] to <4 x i32>
+; CHECK-NEXT: [[YMASKED:%.*]] = and <4 x i32> [[Y]], splat (i32 7)
+; CHECK-NEXT: [[SHL:%.*]] = shl nsw <4 x i32> [[ZEXT]], [[YMASKED]]
+; CHECK-NEXT: [[RED:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[SHL]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[RED]], 42
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %zext = zext <4 x i16> %x to <4 x i32>
+ %ymasked = and <4 x i32> %y, splat (i32 7)
+ %shl = shl nsw <4 x i32> %zext, %ymasked
+ %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %shl)
+ %cmp = icmp eq i32 %red, 42
+ ret i1 %cmp
+}
+
+; shl has multiple uses, expected not to combine
+define i1 @add_shl_multiuse(<4 x i16> %x, <4 x i32> %y, ptr %p) {
+; CHECK-LABEL: define i1 @add_shl_multiuse(
+; CHECK-SAME: <4 x i16> [[X:%.*]], <4 x i32> [[Y:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT: [[ZEXT:%.*]] = zext <4 x i16> [[X]] to <4 x i32>
+; CHECK-NEXT: [[YMASKED:%.*]] = and <4 x i32> [[Y]], splat (i32 7)
+; CHECK-NEXT: [[SHL:%.*]] = shl nsw <4 x i32> [[ZEXT]], [[YMASKED]]
+; CHECK-NEXT: store <4 x i32> [[SHL]], ptr [[P]], align 16
+; CHECK-NEXT: [[RED:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[SHL]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[RED]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %zext = zext <4 x i16> %x to <4 x i32>
+ %ymasked = and <4 x i32> %y, splat (i32 7)
+ %shl = shl nsw <4 x i32> %zext, %ymasked
+ store <4 x i32> %shl, ptr %p
+ %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %shl)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+; shift amount unbounded, expected not to combine
+define i1 @add_shl_unbounded(<4 x i16> %x, <4 x i32> %y) {
+; CHECK-LABEL: define i1 @add_shl_unbounded(
+; CHECK-SAME: <4 x i16> [[X:%.*]], <4 x i32> [[Y:%.*]]) {
+; CHECK-NEXT: [[ZEXT:%.*]] = zext <4 x i16> [[X]] to <4 x i32>
+; CHECK-NEXT: [[SHL:%.*]] = shl nsw <4 x i32> [[ZEXT]], [[Y]]
+; CHECK-NEXT: [[RED:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[SHL]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[RED]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %zext = zext <4 x i16> %x to <4 x i32>
+ %shl = shl nsw <4 x i32> %zext, %y
+ %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %shl)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @add_mul_nonsplat(<4 x i16> %x) {
+; CHECK-LABEL: define i1 @add_mul_nonsplat(
+; CHECK-SAME: <4 x i16> [[X:%.*]]) {
+; CHECK-NEXT: [[ZEXT:%.*]] = zext <4 x i16> [[X]] to <4 x i32>
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[ZEXT]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %zext = zext <4 x i16> %x to <4 x i32>
+ %mul = mul nuw <4 x i32> %zext, <i32 1, i32 2, i32 3, i32 4>
+ %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %mul)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @add_mul_poison(<4 x i16> %x) {
+; CHECK-LABEL: define i1 @add_mul_poison(
+; CHECK-SAME: <4 x i16> [[X:%.*]]) {
+; CHECK-NEXT: [[ZEXT:%.*]] = zext <4 x i16> [[X]] to <4 x i32>
+; CHECK-NEXT: [[RED:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[ZEXT]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[RED]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %zext = zext <4 x i16> %x to <4 x i32>
+ %mul = mul nuw <4 x i32> %zext, <i32 1, i32 poison, i32 3, i32 4>
+ %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %mul)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+; constant has zero lane, expected not to combine
+define i1 @add_mul_zero_lane(<4 x i16> %x) {
+; CHECK-LABEL: define i1 @add_mul_zero_lane(
+; CHECK-SAME: <4 x i16> [[X:%.*]]) {
+; CHECK-NEXT: [[ZEXT:%.*]] = zext <4 x i16> [[X]] to <4 x i32>
+; CHECK-NEXT: [[MUL:%.*]] = mul nuw <4 x i32> [[ZEXT]], <i32 1, i32 0, i32 3, i32 4>
+; CHECK-NEXT: [[RED:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[MUL]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[RED]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %zext = zext <4 x i16> %x to <4 x i32>
+ %mul = mul nuw <4 x i32> %zext, <i32 1, i32 0, i32 3, i32 4>
+ %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %mul)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+; constant has negative lane, expected not to combine
+define i1 @add_mul_neg_const(<4 x i16> %x) {
+; CHECK-LABEL: define i1 @add_mul_neg_const(
+; CHECK-SAME: <4 x i16> [[X:%.*]]) {
+; CHECK-NEXT: [[ZEXT:%.*]] = zext <4 x i16> [[X]] to <4 x i32>
+; CHECK-NEXT: [[MUL:%.*]] = mul nuw <4 x i32> [[ZEXT]], <i32 3, i32 -1, i32 2, i32 5>
+; CHECK-NEXT: [[RED:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[MUL]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[RED]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %zext = zext <4 x i16> %x to <4 x i32>
+ %mul = mul nuw <4 x i32> %zext, <i32 3, i32 -1, i32 2, i32 5>
+ %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %mul)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+declare void @foo(<4 x i32>)
+
+define void @or_zext_two_blocks(<4 x i16> %x) {
+; CHECK-LABEL: define void @or_zext_two_blocks(
+; CHECK-SAME: <4 x i16> [[X:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[A:%.*]] = zext <4 x i16> [[X]] to <4 x i32>
+; CHECK-NEXT: [[TMP0:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[X]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i16 [[TMP0]], 0
+; CHECK-NEXT: br i1 [[CMP]], label %[[THEN:.*]], label %[[EXIT:.*]]
+; CHECK: [[THEN]]:
+; CHECK-NEXT: call void @foo(<4 x i32> [[A]])
+; CHECK-NEXT: br label %[[EXIT]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ %a = zext <4 x i16> %x to <4 x i32>
+ %red = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %a)
+ %cmp = icmp eq i32 %red, 0
+ br i1 %cmp, label %then, label %exit
+
+then:
+ call void @foo(<4 x i32> %a)
+ br label %exit
+
+exit:
+ ret void
+}
+
+define void @or_shl_two_blocks(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: define void @or_shl_two_blocks(
+; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[A:%.*]] = shl nuw <4 x i32> [[X]], [[Y]]
+; CHECK-NEXT: [[RED:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[X]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[RED]], 0
+; CHECK-NEXT: br i1 [[CMP]], label %[[THEN:.*]], label %[[EXIT:.*]]
+; CHECK: [[THEN]]:
+; CHECK-NEXT: call void @foo(<4 x i32> [[A]])
+; CHECK-NEXT: br label %[[EXIT]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ %a = shl nuw <4 x i32> %x, %y
+ %red = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %a)
+ %cmp = icmp eq i32 %red, 0
+ br i1 %cmp, label %then, label %exit
+
+then:
+ call void @foo(<4 x i32> %a)
+ br label %exit
+
+exit:
+ ret void
+}
diff --git a/llvm/test/Transforms/VectorCombine/X86/icmp-vector-reduce.ll b/llvm/test/Transforms/VectorCombine/X86/icmp-vector-reduce.ll
new file mode 100644
index 0000000000000..24f5498202309
--- /dev/null
+++ b/llvm/test/Transforms/VectorCombine/X86/icmp-vector-reduce.ll
@@ -0,0 +1,672 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -passes=vector-combine -S -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+
+define i1 @or_zext(<4 x i16> %x) {
+; CHECK-LABEL: define i1 @or_zext(
+; CHECK-SAME: <4 x i16> [[X:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[X]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i16 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %zext = zext <4 x i16> %x to <4 x i32>
+ %red = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %zext)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @or_zext_i3(<4 x i3> %x) {
+; CHECK-LABEL: define i1 @or_zext_i3(
+; CHECK-SAME: <4 x i3> [[X:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call i3 @llvm.vector.reduce.or.v4i3(<4 x i3> [[X]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i3 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %zext = zext <4 x i3> %x to <4 x i32>
+ %red = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %zext)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+; negative: odd-width <3 x i32> reduction is costly for X86, expected not to combine
+define i1 @or_zext_v3_costly(<3 x i8> %x) {
+; CHECK-LABEL: define i1 @or_zext_v3_costly(
+; CHECK-SAME: <3 x i8> [[X:%.*]]) {
+; CHECK-NEXT: [[ZEXT:%.*]] = zext <3 x i8> [[X]] to <3 x i32>
+; CHECK-NEXT: [[RED:%.*]] = call i32 @llvm.vector.reduce.or.v3i32(<3 x i32> [[ZEXT]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[RED]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %zext = zext <3 x i8> %x to <3 x i32>
+ %red = call i32 @llvm.vector.reduce.or.v3i32(<3 x i32> %zext)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @or_sext(<4 x i16> %x) {
+; CHECK-LABEL: define i1 @or_sext(
+; CHECK-SAME: <4 x i16> [[X:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[X]])
+; CHECK-NEXT: [[RED:%.*]] = sext i16 [[TMP1]] to i32
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[RED]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %sext = sext <4 x i16> %x to <4 x i32>
+ %red = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %sext)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @or_neg(<4 x i32> %x) {
+; CHECK-LABEL: define i1 @or_neg(
+; CHECK-SAME: <4 x i32> [[X:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[X]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %neg = sub <4 x i32> zeroinitializer, %x
+ %red = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %neg)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @or_mul(<4 x i32> %x) {
+; CHECK-LABEL: define i1 @or_mul(
+; CHECK-SAME: <4 x i32> [[X:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[X]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %mul = mul nuw <4 x i32> %x, splat (i32 7)
+ %red = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %mul)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @or_shl(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: define i1 @or_shl(
+; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[X]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %shl = shl nuw <4 x i32> %x, %y
+ %red = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %shl)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @umin_zext(<4 x i16> %x) {
+; CHECK-LABEL: define i1 @umin_zext(
+; CHECK-SAME: <4 x i16> [[X:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> [[X]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i16 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %zext = zext <4 x i16> %x to <4 x i32>
+ %red = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %zext)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @umin_sext(<4 x i16> %x) {
+; CHECK-LABEL: define i1 @umin_sext(
+; CHECK-SAME: <4 x i16> [[X:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> [[X]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i16 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %sext = sext <4 x i16> %x to <4 x i32>
+ %red = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %sext)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @umin_neg(<4 x i32> %x) {
+; CHECK-LABEL: define i1 @umin_neg(
+; CHECK-SAME: <4 x i32> [[X:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[X]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %neg = sub <4 x i32> zeroinitializer, %x
+ %red = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %neg)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @umin_mul(<4 x i32> %x) {
+; CHECK-LABEL: define i1 @umin_mul(
+; CHECK-SAME: <4 x i32> [[X:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[X]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %mul = mul nuw <4 x i32> %x, splat (i32 7)
+ %red = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %mul)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @umin_shl(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: define i1 @umin_shl(
+; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[X]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %shl = shl nuw <4 x i32> %x, %y
+ %red = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %shl)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @umax_zext(<4 x i16> %x) {
+; CHECK-LABEL: define i1 @umax_zext(
+; CHECK-SAME: <4 x i16> [[X:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> [[X]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i16 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %zext = zext <4 x i16> %x to <4 x i32>
+ %red = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %zext)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @umax_sext(<4 x i16> %x) {
+; CHECK-LABEL: define i1 @umax_sext(
+; CHECK-SAME: <4 x i16> [[X:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> [[X]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i16 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %sext = sext <4 x i16> %x to <4 x i32>
+ %red = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %sext)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @umax_neg(<4 x i32> %x) {
+; CHECK-LABEL: define i1 @umax_neg(
+; CHECK-SAME: <4 x i32> [[X:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[X]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %neg = sub <4 x i32> zeroinitializer, %x
+ %red = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %neg)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @umax_mul(<4 x i32> %x) {
+; CHECK-LABEL: define i1 @umax_mul(
+; CHECK-SAME: <4 x i32> [[X:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[X]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %mul = mul nuw <4 x i32> %x, splat (i32 7)
+ %red = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %mul)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @umax_shl(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: define i1 @umax_shl(
+; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[X]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %shl = shl nuw <4 x i32> %x, %y
+ %red = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %shl)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @smin_zext(<4 x i16> %x) {
+; CHECK-LABEL: define i1 @smin_zext(
+; CHECK-SAME: <4 x i16> [[X:%.*]]) {
+; CHECK-NEXT: [[AND:%.*]] = and <4 x i16> [[X]], splat (i16 32767)
+; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> [[AND]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i16 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %and = and <4 x i16> %x, splat (i16 32767)
+ %zext = zext <4 x i16> %and to <4 x i32>
+ %red = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %zext)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @smin_sext(<4 x i16> %x) {
+; CHECK-LABEL: define i1 @smin_sext(
+; CHECK-SAME: <4 x i16> [[X:%.*]]) {
+; CHECK-NEXT: [[AND:%.*]] = and <4 x i16> [[X]], splat (i16 32767)
+; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> [[AND]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i16 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %and = and <4 x i16> %x, splat (i16 32767)
+ %sext = sext <4 x i16> %and to <4 x i32>
+ %red = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %sext)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+; neg is incompatible with smin constraints, expected not to combine
+define i1 @smin_neg(<4 x i16> %x) {
+; CHECK-LABEL: define i1 @smin_neg(
+; CHECK-SAME: <4 x i16> [[X:%.*]]) {
+; CHECK-NEXT: [[ZEXT:%.*]] = zext <4 x i16> [[X]] to <4 x i32>
+; CHECK-NEXT: [[NEG:%.*]] = sub nsw <4 x i32> zeroinitializer, [[ZEXT]]
+; CHECK-NEXT: [[RED:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[NEG]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[RED]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %zext = zext <4 x i16> %x to <4 x i32>
+ %neg = sub nsw <4 x i32> zeroinitializer, %zext
+ %red = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %neg)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @smin_mul(<4 x i16> %x) {
+; CHECK-LABEL: define i1 @smin_mul(
+; CHECK-SAME: <4 x i16> [[X:%.*]]) {
+; CHECK-NEXT: [[ZEXT:%.*]] = zext <4 x i16> [[X]] to <4 x i32>
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[ZEXT]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %zext = zext <4 x i16> %x to <4 x i32>
+ %mul = mul nuw nsw <4 x i32> %zext, splat (i32 7)
+ %red = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %mul)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @smin_shl(<4 x i16> %x, <4 x i32> %y) {
+; CHECK-LABEL: define i1 @smin_shl(
+; CHECK-SAME: <4 x i16> [[X:%.*]], <4 x i32> [[Y:%.*]]) {
+; CHECK-NEXT: [[ZEXT:%.*]] = zext <4 x i16> [[X]] to <4 x i32>
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[ZEXT]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %zext = zext <4 x i16> %x to <4 x i32>
+ %ymasked = and <4 x i32> %y, splat (i32 7)
+ %shl = shl nuw nsw <4 x i32> %zext, %ymasked
+ %red = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %shl)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @smax_zext(<4 x i16> %x) {
+; CHECK-LABEL: define i1 @smax_zext(
+; CHECK-SAME: <4 x i16> [[X:%.*]]) {
+; CHECK-NEXT: [[AND:%.*]] = and <4 x i16> [[X]], splat (i16 32767)
+; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> [[AND]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i16 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %and = and <4 x i16> %x, splat (i16 32767)
+ %zext = zext <4 x i16> %and to <4 x i32>
+ %red = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %zext)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @smax_sext(<4 x i16> %x) {
+; CHECK-LABEL: define i1 @smax_sext(
+; CHECK-SAME: <4 x i16> [[X:%.*]]) {
+; CHECK-NEXT: [[AND:%.*]] = and <4 x i16> [[X]], splat (i16 32767)
+; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> [[AND]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i16 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %and = and <4 x i16> %x, splat (i16 32767)
+ %sext = sext <4 x i16> %and to <4 x i32>
+ %red = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %sext)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+; neg is incompatible with smax constraints, expected not to combine
+define i1 @smax_neg(<4 x i16> %x) {
+; CHECK-LABEL: define i1 @smax_neg(
+; CHECK-SAME: <4 x i16> [[X:%.*]]) {
+; CHECK-NEXT: [[ZEXT:%.*]] = zext <4 x i16> [[X]] to <4 x i32>
+; CHECK-NEXT: [[NEG:%.*]] = sub nsw <4 x i32> zeroinitializer, [[ZEXT]]
+; CHECK-NEXT: [[RED:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[NEG]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[RED]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %zext = zext <4 x i16> %x to <4 x i32>
+ %neg = sub nsw <4 x i32> zeroinitializer, %zext
+ %red = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %neg)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @smax_mul(<4 x i16> %x) {
+; CHECK-LABEL: define i1 @smax_mul(
+; CHECK-SAME: <4 x i16> [[X:%.*]]) {
+; CHECK-NEXT: [[ZEXT:%.*]] = zext <4 x i16> [[X]] to <4 x i32>
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[ZEXT]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %zext = zext <4 x i16> %x to <4 x i32>
+ %mul = mul nuw nsw <4 x i32> %zext, splat (i32 7)
+ %red = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %mul)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @smax_shl(<4 x i16> %x, <4 x i32> %y) {
+; CHECK-LABEL: define i1 @smax_shl(
+; CHECK-SAME: <4 x i16> [[X:%.*]], <4 x i32> [[Y:%.*]]) {
+; CHECK-NEXT: [[ZEXT:%.*]] = zext <4 x i16> [[X]] to <4 x i32>
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[ZEXT]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %zext = zext <4 x i16> %x to <4 x i32>
+ %ymasked = and <4 x i32> %y, splat (i32 7)
+ %shl = shl nuw nsw <4 x i32> %zext, %ymasked
+ %red = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %shl)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @add_zext(<4 x i16> %x) {
+; CHECK-LABEL: define i1 @add_zext(
+; CHECK-SAME: <4 x i16> [[X:%.*]]) {
+; CHECK-NEXT: [[AND:%.*]] = and <4 x i16> [[X]], splat (i16 8191)
+; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> [[AND]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i16 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %and = and <4 x i16> %x, splat (i16 8191)
+ %zext = zext <4 x i16> %and to <4 x i32>
+ %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %zext)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @add_sext(<4 x i16> %x) {
+; CHECK-LABEL: define i1 @add_sext(
+; CHECK-SAME: <4 x i16> [[X:%.*]]) {
+; CHECK-NEXT: [[AND:%.*]] = and <4 x i16> [[X]], splat (i16 8191)
+; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> [[AND]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i16 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %and = and <4 x i16> %x, splat (i16 8191)
+ %sext = sext <4 x i16> %and to <4 x i32>
+ %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %sext)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+; neg is incompatible with add constraints, expected not to combine
+define i1 @add_neg(<4 x i16> %x) {
+; CHECK-LABEL: define i1 @add_neg(
+; CHECK-SAME: <4 x i16> [[X:%.*]]) {
+; CHECK-NEXT: [[ZEXT:%.*]] = zext <4 x i16> [[X]] to <4 x i32>
+; CHECK-NEXT: [[NEG:%.*]] = sub nsw <4 x i32> zeroinitializer, [[ZEXT]]
+; CHECK-NEXT: [[RED:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[NEG]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[RED]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %zext = zext <4 x i16> %x to <4 x i32>
+ %neg = sub nsw <4 x i32> zeroinitializer, %zext
+ %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %neg)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @add_mul(<4 x i16> %x) {
+; CHECK-LABEL: define i1 @add_mul(
+; CHECK-SAME: <4 x i16> [[X:%.*]]) {
+; CHECK-NEXT: [[ZEXT:%.*]] = zext <4 x i16> [[X]] to <4 x i32>
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[ZEXT]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %zext = zext <4 x i16> %x to <4 x i32>
+ %mul = mul nuw <4 x i32> %zext, splat (i32 7)
+ %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %mul)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @add_shl(<4 x i16> %x, <4 x i32> %y) {
+; CHECK-LABEL: define i1 @add_shl(
+; CHECK-SAME: <4 x i16> [[X:%.*]], <4 x i32> [[Y:%.*]]) {
+; CHECK-NEXT: [[ZEXT:%.*]] = zext <4 x i16> [[X]] to <4 x i32>
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[ZEXT]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %zext = zext <4 x i16> %x to <4 x i32>
+ %ymasked = and <4 x i32> %y, splat (i32 7)
+ %shl = shl nuw <4 x i32> %zext, %ymasked
+ %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %shl)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @add_shl_ne(<4 x i16> %x, <4 x i32> %y) {
+; CHECK-LABEL: define i1 @add_shl_ne(
+; CHECK-SAME: <4 x i16> [[X:%.*]], <4 x i32> [[Y:%.*]]) {
+; CHECK-NEXT: [[ZEXT:%.*]] = zext <4 x i16> [[X]] to <4 x i32>
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[ZEXT]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %zext = zext <4 x i16> %x to <4 x i32>
+ %ymasked = and <4 x i32> %y, splat (i32 7)
+ %shl = shl nuw <4 x i32> %zext, %ymasked
+ %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %shl)
+ %cmp = icmp ne i32 %red, 0
+ ret i1 %cmp
+}
+
+; x may be negative (no zext or mask proving x >= 0, as the add reduction requires), expected not to combine
+define i1 @add_shl_negative(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: define i1 @add_shl_negative(
+; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]]) {
+; CHECK-NEXT: [[YMASKED:%.*]] = and <4 x i32> [[Y]], splat (i32 7)
+; CHECK-NEXT: [[SHL:%.*]] = shl nsw <4 x i32> [[X]], [[YMASKED]]
+; CHECK-NEXT: [[RED:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[SHL]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[RED]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %ymasked = and <4 x i32> %y, splat (i32 7)
+ %shl = shl nsw <4 x i32> %x, %ymasked
+ %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %shl)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+; comparison with a non-zero constant, expected not to combine
+define i1 @add_shl_nonzero_cmp(<4 x i16> %x, <4 x i32> %y) {
+; CHECK-LABEL: define i1 @add_shl_nonzero_cmp(
+; CHECK-SAME: <4 x i16> [[X:%.*]], <4 x i32> [[Y:%.*]]) {
+; CHECK-NEXT: [[ZEXT:%.*]] = zext <4 x i16> [[X]] to <4 x i32>
+; CHECK-NEXT: [[YMASKED:%.*]] = and <4 x i32> [[Y]], splat (i32 7)
+; CHECK-NEXT: [[SHL:%.*]] = shl nsw <4 x i32> [[ZEXT]], [[YMASKED]]
+; CHECK-NEXT: [[RED:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[SHL]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[RED]], 42
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %zext = zext <4 x i16> %x to <4 x i32>
+ %ymasked = and <4 x i32> %y, splat (i32 7)
+ %shl = shl nsw <4 x i32> %zext, %ymasked
+ %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %shl)
+ %cmp = icmp eq i32 %red, 42
+ ret i1 %cmp
+}
+
+; shl has multiple uses, expected not to combine
+define i1 @add_shl_multiuse(<4 x i16> %x, <4 x i32> %y, ptr %p) {
+; CHECK-LABEL: define i1 @add_shl_multiuse(
+; CHECK-SAME: <4 x i16> [[X:%.*]], <4 x i32> [[Y:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT: [[ZEXT:%.*]] = zext <4 x i16> [[X]] to <4 x i32>
+; CHECK-NEXT: [[YMASKED:%.*]] = and <4 x i32> [[Y]], splat (i32 7)
+; CHECK-NEXT: [[SHL:%.*]] = shl nsw <4 x i32> [[ZEXT]], [[YMASKED]]
+; CHECK-NEXT: store <4 x i32> [[SHL]], ptr [[P]], align 16
+; CHECK-NEXT: [[RED:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[SHL]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[RED]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %zext = zext <4 x i16> %x to <4 x i32>
+ %ymasked = and <4 x i32> %y, splat (i32 7)
+ %shl = shl nsw <4 x i32> %zext, %ymasked
+ store <4 x i32> %shl, ptr %p
+ %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %shl)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+; shift amount unbounded, expected not to combine
+define i1 @add_shl_unbounded(<4 x i16> %x, <4 x i32> %y) {
+; CHECK-LABEL: define i1 @add_shl_unbounded(
+; CHECK-SAME: <4 x i16> [[X:%.*]], <4 x i32> [[Y:%.*]]) {
+; CHECK-NEXT: [[ZEXT:%.*]] = zext <4 x i16> [[X]] to <4 x i32>
+; CHECK-NEXT: [[SHL:%.*]] = shl nsw <4 x i32> [[ZEXT]], [[Y]]
+; CHECK-NEXT: [[RED:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[SHL]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[RED]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %zext = zext <4 x i16> %x to <4 x i32>
+ %shl = shl nsw <4 x i32> %zext, %y
+ %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %shl)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @add_mul_nonsplat(<4 x i16> %x) {
+; CHECK-LABEL: define i1 @add_mul_nonsplat(
+; CHECK-SAME: <4 x i16> [[X:%.*]]) {
+; CHECK-NEXT: [[ZEXT:%.*]] = zext <4 x i16> [[X]] to <4 x i32>
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[ZEXT]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %zext = zext <4 x i16> %x to <4 x i32>
+ %mul = mul nuw <4 x i32> %zext, <i32 1, i32 2, i32 3, i32 4>
+ %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %mul)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @add_mul_poison(<4 x i16> %x) {
+; CHECK-LABEL: define i1 @add_mul_poison(
+; CHECK-SAME: <4 x i16> [[X:%.*]]) {
+; CHECK-NEXT: [[ZEXT:%.*]] = zext <4 x i16> [[X]] to <4 x i32>
+; CHECK-NEXT: [[RED:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[ZEXT]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[RED]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %zext = zext <4 x i16> %x to <4 x i32>
+ %mul = mul nuw <4 x i32> %zext, <i32 1, i32 poison, i32 3, i32 4>
+ %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %mul)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+; constant has zero lane, expected not to combine
+define i1 @add_mul_zero_lane(<4 x i16> %x) {
+; CHECK-LABEL: define i1 @add_mul_zero_lane(
+; CHECK-SAME: <4 x i16> [[X:%.*]]) {
+; CHECK-NEXT: [[ZEXT:%.*]] = zext <4 x i16> [[X]] to <4 x i32>
+; CHECK-NEXT: [[MUL:%.*]] = mul nuw <4 x i32> [[ZEXT]], <i32 1, i32 0, i32 3, i32 4>
+; CHECK-NEXT: [[RED:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[MUL]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[RED]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %zext = zext <4 x i16> %x to <4 x i32>
+ %mul = mul nuw <4 x i32> %zext, <i32 1, i32 0, i32 3, i32 4>
+ %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %mul)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+; constant has negative lane, expected not to combine
+define i1 @add_mul_neg_const(<4 x i16> %x) {
+; CHECK-LABEL: define i1 @add_mul_neg_const(
+; CHECK-SAME: <4 x i16> [[X:%.*]]) {
+; CHECK-NEXT: [[ZEXT:%.*]] = zext <4 x i16> [[X]] to <4 x i32>
+; CHECK-NEXT: [[MUL:%.*]] = mul nuw <4 x i32> [[ZEXT]], <i32 3, i32 -1, i32 2, i32 5>
+; CHECK-NEXT: [[RED:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[MUL]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[RED]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %zext = zext <4 x i16> %x to <4 x i32>
+ %mul = mul nuw <4 x i32> %zext, <i32 3, i32 -1, i32 2, i32 5>
+ %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %mul)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+declare void @foo(<4 x i32>)
+
+define void @or_zext_two_blocks(<4 x i16> %x) {
+; CHECK-LABEL: define void @or_zext_two_blocks(
+; CHECK-SAME: <4 x i16> [[X:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[A:%.*]] = zext <4 x i16> [[X]] to <4 x i32>
+; CHECK-NEXT: [[TMP0:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[X]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i16 [[TMP0]], 0
+; CHECK-NEXT: br i1 [[CMP]], label %[[THEN:.*]], label %[[EXIT:.*]]
+; CHECK: [[THEN]]:
+; CHECK-NEXT: call void @foo(<4 x i32> [[A]])
+; CHECK-NEXT: br label %[[EXIT]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ %a = zext <4 x i16> %x to <4 x i32>
+ %red = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %a)
+ %cmp = icmp eq i32 %red, 0
+ br i1 %cmp, label %then, label %exit
+
+then:
+ call void @foo(<4 x i32> %a)
+ br label %exit
+
+exit:
+ ret void
+}
+
+define void @or_shl_two_blocks(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: define void @or_shl_two_blocks(
+; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[A:%.*]] = shl nuw <4 x i32> [[X]], [[Y]]
+; CHECK-NEXT: [[RED:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[X]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[RED]], 0
+; CHECK-NEXT: br i1 [[CMP]], label %[[THEN:.*]], label %[[EXIT:.*]]
+; CHECK: [[THEN]]:
+; CHECK-NEXT: call void @foo(<4 x i32> [[A]])
+; CHECK-NEXT: br label %[[EXIT]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ %a = shl nuw <4 x i32> %x, %y
+ %red = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %a)
+ %cmp = icmp eq i32 %red, 0
+ br i1 %cmp, label %then, label %exit
+
+then:
+ call void @foo(<4 x i32> %a)
+ br label %exit
+
+exit:
+ ret void
+}
More information about the llvm-commits
mailing list