[llvm] 842a236 - [SLP]Alternate vectorization for cmp instructions.

Wed Feb 2 10:34:15 PST 2022

Author: Alexey Bataev
Date: 2022-02-02T10:32:52-08:00
New Revision: 842a2360a84692f2e4c37cc3e652640e6627d004

URL: https://github.com/llvm/llvm-project/commit/842a2360a84692f2e4c37cc3e652640e6627d004
DIFF: https://github.com/llvm/llvm-project/commit/842a2360a84692f2e4c37cc3e652640e6627d004.diff

LOG: [SLP]Alternate vectorization for cmp instructions.

Added support for alternate ops vectorization of the cmp instructions.
It allows to vectorize either cmp instructions with same/swapped
predicate but different (swapped) operands kinds or cmp instructions
with different predicates and compatible operands kinds.

Differential Revision: https://reviews.llvm.org/D115955

Added: 
    

Modified: 
    llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
    llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-logical.ll
    llvm/test/Transforms/SLPVectorizer/X86/cmp_commute-inseltpoison.ll
    llvm/test/Transforms/SLPVectorizer/X86/cmp_commute.ll
    llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 25bf69729c70f..4f0e172e1ce56 100644

--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -471,17 +471,36 @@ static bool isValidForAlternation(unsigned Opcode) {
   return true;
 }
 
+static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
+                                       unsigned BaseIndex = 0);
+
+/// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
+/// compatible instructions or constants, or just some other regular values.
+static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
+                                Value *Op1) {
+  return (isConstant(BaseOp0) && isConstant(Op0)) ||
+         (isConstant(BaseOp1) && isConstant(Op1)) ||
+         (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
+          !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
+         getSameOpcode({BaseOp0, Op0}).getOpcode() ||
+         getSameOpcode({BaseOp1, Op1}).getOpcode();
+}
+
 /// \returns analysis of the Instructions in \p VL described in
 /// InstructionsState, the Opcode that we suppose the whole list
 /// could be vectorized even if its structure is diverse.
 static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
-                                       unsigned BaseIndex = 0) {
+                                       unsigned BaseIndex) {
   // Make sure these are all Instructions.
   if (llvm::any_of(VL, [](Value *V) { return !isa<Instruction>(V); }))
     return InstructionsState(VL[BaseIndex], nullptr, nullptr);
 
   bool IsCastOp = isa<CastInst>(VL[BaseIndex]);
   bool IsBinOp = isa<BinaryOperator>(VL[BaseIndex]);
+  bool IsCmpOp = isa<CmpInst>(VL[BaseIndex]);
+  CmpInst::Predicate BasePred =
+      IsCmpOp ? cast<CmpInst>(VL[BaseIndex])->getPredicate()
+              : CmpInst::BAD_ICMP_PREDICATE;
   unsigned Opcode = cast<Instruction>(VL[BaseIndex])->getOpcode();
   unsigned AltOpcode = Opcode;
   unsigned AltIndex = BaseIndex;
@@ -514,6 +533,57 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
           continue;
         }
       }
+    } else if (IsCmpOp && isa<CmpInst>(VL[Cnt])) {
+      auto *BaseInst = cast<Instruction>(VL[BaseIndex]);
+      auto *Inst = cast<Instruction>(VL[Cnt]);
+      Type *Ty0 = BaseInst->getOperand(0)->getType();
+      Type *Ty1 = Inst->getOperand(0)->getType();
+      if (Ty0 == Ty1) {
+        Value *BaseOp0 = BaseInst->getOperand(0);
+        Value *BaseOp1 = BaseInst->getOperand(1);
+        Value *Op0 = Inst->getOperand(0);
+        Value *Op1 = Inst->getOperand(1);
+        CmpInst::Predicate CurrentPred =
+            cast<CmpInst>(VL[Cnt])->getPredicate();
+        CmpInst::Predicate SwappedCurrentPred =
+            CmpInst::getSwappedPredicate(CurrentPred);
+        // Check for compatible operands. If the corresponding operands are not
+        // compatible - need to perform alternate vectorization.
+        if (InstOpcode == Opcode) {
+          if (BasePred == CurrentPred &&
+              areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1))
+            continue;
+          if (BasePred == SwappedCurrentPred &&
+              areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0))
+            continue;
+          if (E == 2 &&
+              (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
+            continue;
+          auto *AltInst = cast<CmpInst>(VL[AltIndex]);
+          CmpInst::Predicate AltPred = AltInst->getPredicate();
+          Value *AltOp0 = AltInst->getOperand(0);
+          Value *AltOp1 = AltInst->getOperand(1);
+          // Check if operands are compatible with alternate operands.
+          if (AltPred == CurrentPred &&
+              areCompatibleCmpOps(AltOp0, AltOp1, Op0, Op1))
+            continue;
+          if (AltPred == SwappedCurrentPred &&
+              areCompatibleCmpOps(AltOp0, AltOp1, Op1, Op0))
+            continue;
+        }
+        if (BaseIndex == AltIndex && BasePred != CurrentPred) {
+          assert(isValidForAlternation(Opcode) &&
+                 isValidForAlternation(InstOpcode) &&
+                 "Cast isn't safe for alternation, logic needs to be updated!");
+          AltIndex = Cnt;
+          continue;
+        }
+        auto *AltInst = cast<CmpInst>(VL[AltIndex]);
+        CmpInst::Predicate AltPred = AltInst->getPredicate();
+        if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
+            AltPred == CurrentPred || AltPred == SwappedCurrentPred)
+          continue;
+      }
     } else if (InstOpcode == Opcode || InstOpcode == AltOpcode)
       continue;
     return InstructionsState(VL[BaseIndex], nullptr, nullptr);
@@ -4354,9 +4424,41 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
       LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
 
       // Reorder operands if reordering would enable vectorization.
-      if (isa<BinaryOperator>(VL0)) {
+      auto *CI = dyn_cast<CmpInst>(VL0);
+      if (isa<BinaryOperator>(VL0) || CI) {
         ValueList Left, Right;
-        reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
+        if (!CI || all_of(VL, [](Value *V) {
+              return cast<CmpInst>(V)->isCommutative();
+            })) {
+          reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
+        } else {
+          CmpInst::Predicate P0 = CI->getPredicate();
+          CmpInst::Predicate AltP0 = cast<CmpInst>(S.AltOp)->getPredicate();
+          assert(P0 != AltP0 &&
+                 "Expected 
diff erent main/alternate predicates.");
+          CmpInst::Predicate AltP0Swapped = CmpInst::getSwappedPredicate(AltP0);
+          Value *BaseOp0 = VL0->getOperand(0);
+          Value *BaseOp1 = VL0->getOperand(1);
+          // Collect operands - commute if it uses the swapped predicate or
+          // alternate operation.
+          for (Value *V : VL) {
+            auto *Cmp = cast<CmpInst>(V);
+            Value *LHS = Cmp->getOperand(0);
+            Value *RHS = Cmp->getOperand(1);
+            CmpInst::Predicate CurrentPred = CI->getPredicate();
+            if (P0 == AltP0Swapped) {
+              if ((P0 == CurrentPred &&
+                   !areCompatibleCmpOps(BaseOp0, BaseOp1, LHS, RHS)) ||
+                  (AltP0 == CurrentPred &&
+                   areCompatibleCmpOps(BaseOp0, BaseOp1, LHS, RHS)))
+                std::swap(LHS, RHS);
+            } else if (P0 != CurrentPred && AltP0 != CurrentPred) {
+              std::swap(LHS, RHS);
+            }
+            Left.push_back(LHS);
+            Right.push_back(RHS);
+          }
+        }
         TE->setOperand(0, Left);
         TE->setOperand(1, Right);
         buildTree_rec(Left, Depth + 1, {TE, 0});
@@ -5288,7 +5390,8 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
              ((Instruction::isBinaryOp(E->getOpcode()) &&
                Instruction::isBinaryOp(E->getAltOpcode())) ||
               (Instruction::isCast(E->getOpcode()) &&
-               Instruction::isCast(E->getAltOpcode()))) &&
+               Instruction::isCast(E->getAltOpcode())) ||
+              (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
              "Invalid Shuffle Vector Operand");
       InstructionCost ScalarCost = 0;
       if (NeedToShuffleReuses) {
@@ -5336,6 +5439,14 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
         VecCost = TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
         VecCost += TTI->getArithmeticInstrCost(E->getAltOpcode(), VecTy,
                                                CostKind);
+      } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
+        VecCost = TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy,
+                                          Builder.getInt1Ty(),
+                                          CI0->getPredicate(), CostKind, VL0);
+        VecCost += TTI->getCmpSelInstrCost(
+            E->getOpcode(), ScalarTy, Builder.getInt1Ty(),
+            cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind,
+            E->getAltOp());
       } else {
         Type *Src0SclTy = E->getMainOp()->getOperand(0)->getType();
         Type *Src1SclTy = E->getAltOp()->getOperand(0)->getType();
@@ -5352,6 +5463,27 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
           E->Scalars, E->ReorderIndices, E->ReuseShuffleIndices,
           [E](Instruction *I) {
             assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
+            if (auto *CI0 = dyn_cast<CmpInst>(E->getMainOp())) {
+              auto *AltCI0 = cast<CmpInst>(E->getAltOp());
+              auto *CI = cast<CmpInst>(I);
+              CmpInst::Predicate P0 = CI0->getPredicate();
+              CmpInst::Predicate AltP0 = AltCI0->getPredicate();
+              assert(P0 != AltP0 &&
+                     "Expected 
diff erent main/alternate predicates.");
+              CmpInst::Predicate AltP0Swapped =
+                  CmpInst::getSwappedPredicate(AltP0);
+              CmpInst::Predicate CurrentPred = CI->getPredicate();
+              if (P0 == AltP0Swapped)
+                return (P0 == CurrentPred &&
+                        !areCompatibleCmpOps(
+                            CI0->getOperand(0), CI0->getOperand(1),
+                            CI->getOperand(0), CI->getOperand(1))) ||
+                       (AltP0 == CurrentPred &&
+                        !areCompatibleCmpOps(
+                            CI0->getOperand(0), CI0->getOperand(1),
+                            CI->getOperand(1), CI->getOperand(0)));
+              return AltP0 == CurrentPred || AltP0Swapped == CurrentPred;
+            }
             return I->getOpcode() == E->getAltOpcode();
           },
           Mask);
@@ -6834,11 +6966,12 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
              ((Instruction::isBinaryOp(E->getOpcode()) &&
                Instruction::isBinaryOp(E->getAltOpcode())) ||
               (Instruction::isCast(E->getOpcode()) &&
-               Instruction::isCast(E->getAltOpcode()))) &&
+               Instruction::isCast(E->getAltOpcode())) ||
+              (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
              "Invalid Shuffle Vector Operand");
 
       Value *LHS = nullptr, *RHS = nullptr;
-      if (Instruction::isBinaryOp(E->getOpcode())) {
+      if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) {
         setInsertPointAfterBundle(E);
         LHS = vectorizeTree(E->getOperand(0));
         RHS = vectorizeTree(E->getOperand(1));
@@ -6858,6 +6991,15 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
             static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
         V1 = Builder.CreateBinOp(
             static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
+      } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
+        V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
+        auto *AltCI = cast<CmpInst>(E->getAltOp());
+        CmpInst::Predicate AltPred = AltCI->getPredicate();
+        unsigned AltIdx =
+            std::distance(E->Scalars.begin(), find(E->Scalars, AltCI));
+        if (AltCI->getOperand(0) != E->getOperand(0)[AltIdx])
+          AltPred = CmpInst::getSwappedPredicate(AltPred);
+        V1 = Builder.CreateCmp(AltPred, LHS, RHS);
       } else {
         V0 = Builder.CreateCast(
             static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
@@ -6882,6 +7024,27 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
           E->Scalars, E->ReorderIndices, E->ReuseShuffleIndices,
           [E](Instruction *I) {
             assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
+            if (auto *CI0 = dyn_cast<CmpInst>(E->getMainOp())) {
+              auto *AltCI0 = cast<CmpInst>(E->getAltOp());
+              auto *CI = cast<CmpInst>(I);
+              CmpInst::Predicate P0 = CI0->getPredicate();
+              CmpInst::Predicate AltP0 = AltCI0->getPredicate();
+              assert(P0 != AltP0 &&
+                     "Expected 
diff erent main/alternate predicates.");
+              CmpInst::Predicate AltP0Swapped =
+                  CmpInst::getSwappedPredicate(AltP0);
+              CmpInst::Predicate CurrentPred = CI->getPredicate();
+              if (P0 == AltP0Swapped)
+                return (P0 == CurrentPred &&
+                        !areCompatibleCmpOps(
+                            CI0->getOperand(0), CI0->getOperand(1),
+                            CI->getOperand(0), CI->getOperand(1))) ||
+                       (AltP0 == CurrentPred &&
+                        !areCompatibleCmpOps(
+                            CI0->getOperand(0), CI0->getOperand(1),
+                            CI->getOperand(1), CI->getOperand(0)));
+              return AltP0 == CurrentPred || AltP0Swapped == CurrentPred;
+            }
             return I->getOpcode() == E->getAltOpcode();
           },
           Mask, &OpScalars, &AltScalars);

diff  --git a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-logical.ll b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-logical.ll
index 486faf75fb7a7..83cab29aa883a 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-logical.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-logical.ll
@@ -90,24 +90,17 @@ return:
 define float @test_merge_anyof_v4sf(<4 x float> %t) {
 ; CHECK-LABEL: @test_merge_anyof_v4sf(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <4 x float> [[T:%.*]], i64 3
-; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[T]], i64 2
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[T]], i64 1
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[T]], i64 0
-; CHECK-NEXT:    [[T_FR:%.*]] = freeze <4 x float> [[T]]
-; CHECK-NEXT:    [[TMP4:%.*]] = fcmp olt <4 x float> [[T_FR]], zeroinitializer
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i1> [[TMP4]] to i4
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp ne i4 [[TMP5]], 0
-; CHECK-NEXT:    [[CMP19:%.*]] = fcmp ogt float [[TMP3]], 1.000000e+00
-; CHECK-NEXT:    [[OR_COND3:%.*]] = select i1 [[TMP6]], i1 true, i1 [[CMP19]]
-; CHECK-NEXT:    [[CMP24:%.*]] = fcmp ogt float [[TMP2]], 1.000000e+00
-; CHECK-NEXT:    [[OR_COND4:%.*]] = select i1 [[OR_COND3]], i1 true, i1 [[CMP24]]
-; CHECK-NEXT:    [[CMP29:%.*]] = fcmp ogt float [[TMP1]], 1.000000e+00
-; CHECK-NEXT:    [[OR_COND5:%.*]] = select i1 [[OR_COND4]], i1 true, i1 [[CMP29]]
-; CHECK-NEXT:    [[CMP34:%.*]] = fcmp ogt float [[TMP0]], 1.000000e+00
-; CHECK-NEXT:    [[OR_COND6:%.*]] = select i1 [[OR_COND5]], i1 true, i1 [[CMP34]]
-; CHECK-NEXT:    [[ADD:%.*]] = fadd float [[TMP3]], [[TMP2]]
-; CHECK-NEXT:    [[RETVAL_0:%.*]] = select i1 [[OR_COND6]], float 0.000000e+00, float [[ADD]]
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x float> [[T:%.*]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP0:%.*]] = fcmp olt <8 x float> [[SHUFFLE]], <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; CHECK-NEXT:    [[TMP1:%.*]] = fcmp ogt <8 x float> [[SHUFFLE]], <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i1> [[TMP0]], <8 x i1> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP3:%.*]] = freeze <8 x i1> [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i1> [[TMP3]] to i8
+; CHECK-NEXT:    [[DOTNOT:%.*]] = icmp eq i8 [[TMP4]], 0
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[T]], <4 x float> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP5:%.*]] = fadd <4 x float> [[SHIFT]], [[T]]
+; CHECK-NEXT:    [[ADD:%.*]] = extractelement <4 x float> [[TMP5]], i64 0
+; CHECK-NEXT:    [[RETVAL_0:%.*]] = select i1 [[DOTNOT]], float [[ADD]], float 0.000000e+00
 ; CHECK-NEXT:    ret float [[RETVAL_0]]
 ;
 entry:
@@ -420,24 +413,18 @@ return:
 define float @test_merge_anyof_v4si(<4 x i32> %t) {
 ; CHECK-LABEL: @test_merge_anyof_v4si(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[T_FR:%.*]] = freeze <4 x i32> [[T:%.*]]
-; CHECK-NEXT:    [[TMP0:%.*]] = icmp slt <4 x i32> [[T_FR]], <i32 1, i32 1, i32 1, i32 1>
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i1> [[TMP0]] to i4
-; CHECK-NEXT:    [[TMP2:%.*]] = icmp ne i4 [[TMP1]], 0
-; CHECK-NEXT:    [[TMP3:%.*]] = icmp sgt <4 x i32> [[T_FR]], <i32 255, i32 255, i32 255, i32 255>
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i1> [[TMP3]], i64 0
-; CHECK-NEXT:    [[OR_COND3:%.*]] = or i1 [[TMP2]], [[TMP4]]
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i1> [[TMP3]], i64 1
-; CHECK-NEXT:    [[OR_COND4:%.*]] = or i1 [[OR_COND3]], [[TMP5]]
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i1> [[TMP3]], i64 2
-; CHECK-NEXT:    [[OR_COND5:%.*]] = or i1 [[OR_COND4]], [[TMP6]]
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i1> [[TMP3]], i64 3
-; CHECK-NEXT:    [[OR_COND6:%.*]] = or i1 [[OR_COND5]], [[TMP7]]
-; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x i32> [[T_FR]], <4 x i32> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP8:%.*]] = add nsw <4 x i32> [[SHIFT]], [[T_FR]]
-; CHECK-NEXT:    [[ADD:%.*]] = extractelement <4 x i32> [[TMP8]], i64 0
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[T:%.*]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp slt <8 x i32> [[SHUFFLE]], <i32 1, i32 1, i32 1, i32 1, i32 255, i32 255, i32 255, i32 255>
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt <8 x i32> [[SHUFFLE]], <i32 1, i32 1, i32 1, i32 1, i32 255, i32 255, i32 255, i32 255>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i1> [[TMP0]], <8 x i1> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP3:%.*]] = freeze <8 x i1> [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i1> [[TMP3]] to i8
+; CHECK-NEXT:    [[DOTNOT:%.*]] = icmp eq i8 [[TMP4]], 0
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x i32> [[T]], <4 x i32> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP5:%.*]] = add nsw <4 x i32> [[SHIFT]], [[T]]
+; CHECK-NEXT:    [[ADD:%.*]] = extractelement <4 x i32> [[TMP5]], i64 0
 ; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[ADD]] to float
-; CHECK-NEXT:    [[RETVAL_0:%.*]] = select i1 [[OR_COND6]], float 0.000000e+00, float [[CONV]]
+; CHECK-NEXT:    [[RETVAL_0:%.*]] = select i1 [[DOTNOT]], float [[CONV]], float 0.000000e+00
 ; CHECK-NEXT:    ret float [[RETVAL_0]]
 ;
 entry:

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute-inseltpoison.ll
index 8f8931d0a6c1d..0ba0fb8f2d8b2 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute-inseltpoison.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64--- -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE
-; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64--- -mattr=+avx  | FileCheck %s --check-prefixes=CHECK,AVX
+; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64--- -mattr=+sse2 | FileCheck %s
+; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64--- -mattr=+avx  | FileCheck %s
 
 ;
 ; Check that we can commute operands based on the predicate.
@@ -235,46 +235,14 @@ define <4 x i32> @fcmp_ogt_olt_v4i32(<4 x float> %a, float* %b) {
 }
 
 define <4 x i32> @fcmp_ord_uno_v4i32(<4 x float> %a, float* %b) {
-; SSE-LABEL: @fcmp_ord_uno_v4i32(
-; SSE-NEXT:    [[P1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1
-; SSE-NEXT:    [[P3:%.*]] = getelementptr inbounds float, float* [[B]], i64 3
-; SSE-NEXT:    [[B0:%.*]] = load float, float* [[B]], align 4
-; SSE-NEXT:    [[TMP1:%.*]] = bitcast float* [[P1]] to <2 x float>*
-; SSE-NEXT:    [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4
-; SSE-NEXT:    [[B3:%.*]] = load float, float* [[P3]], align 4
-; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> undef, <2 x i32> <i32 1, i32 2>
-; SSE-NEXT:    [[TMP4:%.*]] = fcmp uno <2 x float> [[TMP2]], [[TMP3]]
-; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[A]], <4 x float> undef, <2 x i32> <i32 3, i32 0>
-; SSE-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> poison, float [[B3]], i64 0
-; SSE-NEXT:    [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[B0]], i64 1
-; SSE-NEXT:    [[TMP8:%.*]] = fcmp ord <2 x float> [[TMP5]], [[TMP7]]
-; SSE-NEXT:    [[TMP9:%.*]] = shufflevector <2 x i1> [[TMP8]], <2 x i1> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
-; SSE-NEXT:    [[D0:%.*]] = shufflevector <2 x i1> [[TMP8]], <2 x i1> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-; SSE-NEXT:    [[TMP10:%.*]] = shufflevector <2 x i1> [[TMP4]], <2 x i1> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; SSE-NEXT:    [[D21:%.*]] = shufflevector <4 x i1> [[D0]], <4 x i1> [[TMP10]], <4 x i32> <i32 0, i32 4, i32 5, i32 undef>
-; SSE-NEXT:    [[D3:%.*]] = shufflevector <4 x i1> [[D21]], <4 x i1> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 2, i32 4>
-; SSE-NEXT:    [[R:%.*]] = sext <4 x i1> [[D3]] to <4 x i32>
-; SSE-NEXT:    ret <4 x i32> [[R]]
-;
-; AVX-LABEL: @fcmp_ord_uno_v4i32(
-; AVX-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0
-; AVX-NEXT:    [[A3:%.*]] = extractelement <4 x float> [[A]], i64 3
-; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1
-; AVX-NEXT:    [[P3:%.*]] = getelementptr inbounds float, float* [[B]], i64 3
-; AVX-NEXT:    [[B0:%.*]] = load float, float* [[B]], align 4
-; AVX-NEXT:    [[TMP1:%.*]] = bitcast float* [[P1]] to <2 x float>*
-; AVX-NEXT:    [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4
-; AVX-NEXT:    [[B3:%.*]] = load float, float* [[P3]], align 4
-; AVX-NEXT:    [[C0:%.*]] = fcmp ord float [[A0]], [[B0]]
-; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> undef, <2 x i32> <i32 1, i32 2>
-; AVX-NEXT:    [[TMP4:%.*]] = fcmp uno <2 x float> [[TMP2]], [[TMP3]]
-; AVX-NEXT:    [[C3:%.*]] = fcmp ord float [[A3]], [[B3]]
-; AVX-NEXT:    [[D0:%.*]] = insertelement <4 x i1> poison, i1 [[C0]], i64 0
-; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i1> [[TMP4]], <2 x i1> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; AVX-NEXT:    [[D21:%.*]] = shufflevector <4 x i1> [[D0]], <4 x i1> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 5, i32 undef>
-; AVX-NEXT:    [[D3:%.*]] = insertelement <4 x i1> [[D21]], i1 [[C3]], i64 3
-; AVX-NEXT:    [[R:%.*]] = sext <4 x i1> [[D3]] to <4 x i32>
-; AVX-NEXT:    ret <4 x i32> [[R]]
+; CHECK-LABEL: @fcmp_ord_uno_v4i32(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[B:%.*]] to <4 x float>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = fcmp ord <4 x float> [[TMP2]], [[A:%.*]]
+; CHECK-NEXT:    [[TMP4:%.*]] = fcmp uno <4 x float> [[TMP2]], [[A]]
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i1> [[TMP3]], <4 x i1> [[TMP4]], <4 x i32> <i32 0, i32 5, i32 6, i32 3>
+; CHECK-NEXT:    [[R:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i32>
+; CHECK-NEXT:    ret <4 x i32> [[R]]
 ;
   %a0 = extractelement <4 x float> %a, i32 0
   %a1 = extractelement <4 x float> %a, i32 1

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute.ll b/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute.ll
index 3180078691e40..8cdd2c4dd2e36 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64--- -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE
-; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64--- -mattr=+avx  | FileCheck %s --check-prefixes=CHECK,AVX
+; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64--- -mattr=+sse2 | FileCheck %s
+; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64--- -mattr=+avx  | FileCheck %s
 
 ;
 ; Check that we can commute operands based on the predicate.
@@ -235,46 +235,14 @@ define <4 x i32> @fcmp_ogt_olt_v4i32(<4 x float> %a, float* %b) {
 }
 
 define <4 x i32> @fcmp_ord_uno_v4i32(<4 x float> %a, float* %b) {
-; SSE-LABEL: @fcmp_ord_uno_v4i32(
-; SSE-NEXT:    [[P1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1
-; SSE-NEXT:    [[P3:%.*]] = getelementptr inbounds float, float* [[B]], i64 3
-; SSE-NEXT:    [[B0:%.*]] = load float, float* [[B]], align 4
-; SSE-NEXT:    [[TMP1:%.*]] = bitcast float* [[P1]] to <2 x float>*
-; SSE-NEXT:    [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4
-; SSE-NEXT:    [[B3:%.*]] = load float, float* [[P3]], align 4
-; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> undef, <2 x i32> <i32 1, i32 2>
-; SSE-NEXT:    [[TMP4:%.*]] = fcmp uno <2 x float> [[TMP2]], [[TMP3]]
-; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[A]], <4 x float> undef, <2 x i32> <i32 3, i32 0>
-; SSE-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> poison, float [[B3]], i64 0
-; SSE-NEXT:    [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[B0]], i64 1
-; SSE-NEXT:    [[TMP8:%.*]] = fcmp ord <2 x float> [[TMP5]], [[TMP7]]
-; SSE-NEXT:    [[TMP9:%.*]] = shufflevector <2 x i1> [[TMP8]], <2 x i1> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
-; SSE-NEXT:    [[D0:%.*]] = shufflevector <2 x i1> [[TMP8]], <2 x i1> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-; SSE-NEXT:    [[TMP10:%.*]] = shufflevector <2 x i1> [[TMP4]], <2 x i1> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; SSE-NEXT:    [[D21:%.*]] = shufflevector <4 x i1> [[D0]], <4 x i1> [[TMP10]], <4 x i32> <i32 0, i32 4, i32 5, i32 undef>
-; SSE-NEXT:    [[D3:%.*]] = shufflevector <4 x i1> [[D21]], <4 x i1> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 2, i32 4>
-; SSE-NEXT:    [[R:%.*]] = sext <4 x i1> [[D3]] to <4 x i32>
-; SSE-NEXT:    ret <4 x i32> [[R]]
-;
-; AVX-LABEL: @fcmp_ord_uno_v4i32(
-; AVX-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0
-; AVX-NEXT:    [[A3:%.*]] = extractelement <4 x float> [[A]], i64 3
-; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1
-; AVX-NEXT:    [[P3:%.*]] = getelementptr inbounds float, float* [[B]], i64 3
-; AVX-NEXT:    [[B0:%.*]] = load float, float* [[B]], align 4
-; AVX-NEXT:    [[TMP1:%.*]] = bitcast float* [[P1]] to <2 x float>*
-; AVX-NEXT:    [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4
-; AVX-NEXT:    [[B3:%.*]] = load float, float* [[P3]], align 4
-; AVX-NEXT:    [[C0:%.*]] = fcmp ord float [[A0]], [[B0]]
-; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> undef, <2 x i32> <i32 1, i32 2>
-; AVX-NEXT:    [[TMP4:%.*]] = fcmp uno <2 x float> [[TMP2]], [[TMP3]]
-; AVX-NEXT:    [[C3:%.*]] = fcmp ord float [[A3]], [[B3]]
-; AVX-NEXT:    [[D0:%.*]] = insertelement <4 x i1> undef, i1 [[C0]], i64 0
-; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i1> [[TMP4]], <2 x i1> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; AVX-NEXT:    [[D21:%.*]] = shufflevector <4 x i1> [[D0]], <4 x i1> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 5, i32 undef>
-; AVX-NEXT:    [[D3:%.*]] = insertelement <4 x i1> [[D21]], i1 [[C3]], i64 3
-; AVX-NEXT:    [[R:%.*]] = sext <4 x i1> [[D3]] to <4 x i32>
-; AVX-NEXT:    ret <4 x i32> [[R]]
+; CHECK-LABEL: @fcmp_ord_uno_v4i32(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[B:%.*]] to <4 x float>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = fcmp ord <4 x float> [[TMP2]], [[A:%.*]]
+; CHECK-NEXT:    [[TMP4:%.*]] = fcmp uno <4 x float> [[TMP2]], [[A]]
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i1> [[TMP3]], <4 x i1> [[TMP4]], <4 x i32> <i32 0, i32 5, i32 6, i32 3>
+; CHECK-NEXT:    [[R:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i32>
+; CHECK-NEXT:    ret <4 x i32> [[R]]
 ;
   %a0 = extractelement <4 x float> %a, i32 0
   %a1 = extractelement <4 x float> %a, i32 1

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
index ee3481e242d04..9c7826442ab05 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
@@ -93,36 +93,13 @@ define i1 @logical_or_fcmp(<4 x float> %x) {
 }
 
 define i1 @logical_and_icmp_
diff _preds(<4 x i32> %x) {
-; SSE-LABEL: @logical_and_icmp_
diff _preds(
-; SSE-NEXT:    [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0
-; SSE-NEXT:    [[X1:%.*]] = extractelement <4 x i32> [[X]], i32 1
-; SSE-NEXT:    [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2
-; SSE-NEXT:    [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3
-; SSE-NEXT:    [[C0:%.*]] = icmp ult i32 [[X0]], 0
-; SSE-NEXT:    [[C2:%.*]] = icmp sgt i32 [[X2]], 0
-; SSE-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[X3]], i32 0
-; SSE-NEXT:    [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[X1]], i32 1
-; SSE-NEXT:    [[TMP3:%.*]] = icmp slt <2 x i32> [[TMP2]], zeroinitializer
-; SSE-NEXT:    [[TMP4:%.*]] = extractelement <2 x i1> [[TMP3]], i32 1
-; SSE-NEXT:    [[S1:%.*]] = select i1 [[C0]], i1 [[TMP4]], i1 false
-; SSE-NEXT:    [[S2:%.*]] = select i1 [[S1]], i1 [[C2]], i1 false
-; SSE-NEXT:    [[TMP5:%.*]] = extractelement <2 x i1> [[TMP3]], i32 0
-; SSE-NEXT:    [[S3:%.*]] = select i1 [[S2]], i1 [[TMP5]], i1 false
-; SSE-NEXT:    ret i1 [[S3]]
-;
-; AVX-LABEL: @logical_and_icmp_
diff _preds(
-; AVX-NEXT:    [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0
-; AVX-NEXT:    [[X1:%.*]] = extractelement <4 x i32> [[X]], i32 1
-; AVX-NEXT:    [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2
-; AVX-NEXT:    [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3
-; AVX-NEXT:    [[C0:%.*]] = icmp ult i32 [[X0]], 0
-; AVX-NEXT:    [[C1:%.*]] = icmp slt i32 [[X1]], 0
-; AVX-NEXT:    [[C2:%.*]] = icmp sgt i32 [[X2]], 0
-; AVX-NEXT:    [[C3:%.*]] = icmp slt i32 [[X3]], 0
-; AVX-NEXT:    [[S1:%.*]] = select i1 [[C0]], i1 [[C1]], i1 false
-; AVX-NEXT:    [[S2:%.*]] = select i1 [[S1]], i1 [[C2]], i1 false
-; AVX-NEXT:    [[S3:%.*]] = select i1 [[S2]], i1 [[C3]], i1 false
-; AVX-NEXT:    ret i1 [[S3]]
+; CHECK-LABEL: @logical_and_icmp_
diff _preds(
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult <4 x i32> [[X:%.*]], zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp slt <4 x i32> [[X]], zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i1> [[TMP1]], <4 x i1> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP4:%.*]] = freeze <4 x i1> [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP4]])
+; CHECK-NEXT:    ret i1 [[TMP5]]
 ;
   %x0 = extractelement <4 x i32> %x, i32 0
   %x1 = extractelement <4 x i32> %x, i32 1
@@ -214,19 +191,13 @@ define i1 @logical_and_icmp_subvec(<4 x i32> %x) {
 
 define i1 @logical_and_icmp_clamp(<4 x i32> %x) {
 ; CHECK-LABEL: @logical_and_icmp_clamp(
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], <i32 42, i32 42, i32 42, i32 42>
-; CHECK-NEXT:    [[TMP2:%.*]] = icmp sgt <4 x i32> [[X]], <i32 17, i32 17, i32 17, i32 17>
-; CHECK-NEXT:    [[TMP3:%.*]] = freeze <4 x i1> [[TMP1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP3]])
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i1> [[TMP2]], i32 0
-; CHECK-NEXT:    [[S4:%.*]] = select i1 [[TMP4]], i1 [[TMP5]], i1 false
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i1> [[TMP2]], i32 1
-; CHECK-NEXT:    [[S5:%.*]] = select i1 [[S4]], i1 [[TMP6]], i1 false
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i1> [[TMP2]], i32 2
-; CHECK-NEXT:    [[S6:%.*]] = select i1 [[S5]], i1 [[TMP7]], i1 false
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x i1> [[TMP2]], i32 3
-; CHECK-NEXT:    [[S7:%.*]] = select i1 [[S6]], i1 [[TMP8]], i1 false
-; CHECK-NEXT:    ret i1 [[S7]]
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp slt <8 x i32> [[SHUFFLE]], <i32 42, i32 42, i32 42, i32 42, i32 17, i32 17, i32 17, i32 17>
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp sgt <8 x i32> [[SHUFFLE]], <i32 42, i32 42, i32 42, i32 42, i32 17, i32 17, i32 17, i32 17>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP4:%.*]] = freeze <8 x i1> [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP4]])
+; CHECK-NEXT:    ret i1 [[TMP5]]
 ;
   %x0 = extractelement <4 x i32> %x, i32 0
   %x1 = extractelement <4 x i32> %x, i32 1
@@ -251,28 +222,53 @@ define i1 @logical_and_icmp_clamp(<4 x i32> %x) {
 }
 
 define i1 @logical_and_icmp_clamp_extra_use_cmp(<4 x i32> %x) {
-; CHECK-LABEL: @logical_and_icmp_clamp_extra_use_cmp(
-; CHECK-NEXT:    [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0
-; CHECK-NEXT:    [[X1:%.*]] = extractelement <4 x i32> [[X]], i32 1
-; CHECK-NEXT:    [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2
-; CHECK-NEXT:    [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3
-; CHECK-NEXT:    [[C0:%.*]] = icmp slt i32 [[X0]], 42
-; CHECK-NEXT:    [[C1:%.*]] = icmp slt i32 [[X1]], 42
-; CHECK-NEXT:    [[C2:%.*]] = icmp slt i32 [[X2]], 42
-; CHECK-NEXT:    call void @use1(i1 [[C2]])
-; CHECK-NEXT:    [[C3:%.*]] = icmp slt i32 [[X3]], 42
-; CHECK-NEXT:    [[D0:%.*]] = icmp sgt i32 [[X0]], 17
-; CHECK-NEXT:    [[D1:%.*]] = icmp sgt i32 [[X1]], 17
-; CHECK-NEXT:    [[D2:%.*]] = icmp sgt i32 [[X2]], 17
-; CHECK-NEXT:    [[D3:%.*]] = icmp sgt i32 [[X3]], 17
-; CHECK-NEXT:    [[S1:%.*]] = select i1 [[C0]], i1 [[C1]], i1 false
-; CHECK-NEXT:    [[S2:%.*]] = select i1 [[S1]], i1 [[C2]], i1 false
-; CHECK-NEXT:    [[S3:%.*]] = select i1 [[S2]], i1 [[C3]], i1 false
-; CHECK-NEXT:    [[S4:%.*]] = select i1 [[S3]], i1 [[D0]], i1 false
-; CHECK-NEXT:    [[S5:%.*]] = select i1 [[S4]], i1 [[D1]], i1 false
-; CHECK-NEXT:    [[S6:%.*]] = select i1 [[S5]], i1 [[D2]], i1 false
-; CHECK-NEXT:    [[S7:%.*]] = select i1 [[S6]], i1 [[D3]], i1 false
-; CHECK-NEXT:    ret i1 [[S7]]
+; SSE-LABEL: @logical_and_icmp_clamp_extra_use_cmp(
+; SSE-NEXT:    [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0
+; SSE-NEXT:    [[X1:%.*]] = extractelement <4 x i32> [[X]], i32 1
+; SSE-NEXT:    [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2
+; SSE-NEXT:    [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3
+; SSE-NEXT:    [[C2:%.*]] = icmp slt i32 [[X2]], 42
+; SSE-NEXT:    call void @use1(i1 [[C2]])
+; SSE-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[X0]], i32 0
+; SSE-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[X1]], i32 1
+; SSE-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[X3]], i32 2
+; SSE-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[X0]], i32 3
+; SSE-NEXT:    [[TMP5:%.*]] = icmp slt <4 x i32> [[TMP4]], <i32 42, i32 42, i32 42, i32 17>
+; SSE-NEXT:    [[TMP6:%.*]] = icmp sgt <4 x i32> [[TMP4]], <i32 42, i32 42, i32 42, i32 17>
+; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i1> [[TMP5]], <4 x i1> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; SSE-NEXT:    [[D1:%.*]] = icmp sgt i32 [[X1]], 17
+; SSE-NEXT:    [[D2:%.*]] = icmp sgt i32 [[X2]], 17
+; SSE-NEXT:    [[D3:%.*]] = icmp sgt i32 [[X3]], 17
+; SSE-NEXT:    [[TMP8:%.*]] = freeze <4 x i1> [[TMP7]]
+; SSE-NEXT:    [[TMP9:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP8]])
+; SSE-NEXT:    [[OP_EXTRA:%.*]] = select i1 [[TMP9]], i1 [[C2]], i1 false
+; SSE-NEXT:    [[S5:%.*]] = select i1 [[OP_EXTRA]], i1 [[D1]], i1 false
+; SSE-NEXT:    [[S6:%.*]] = select i1 [[S5]], i1 [[D2]], i1 false
+; SSE-NEXT:    [[S7:%.*]] = select i1 [[S6]], i1 [[D3]], i1 false
+; SSE-NEXT:    ret i1 [[S7]]
+;
+; AVX-LABEL: @logical_and_icmp_clamp_extra_use_cmp(
+; AVX-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>
+; AVX-NEXT:    [[TMP1:%.*]] = icmp slt <8 x i32> [[SHUFFLE]], <i32 42, i32 42, i32 42, i32 42, i32 17, i32 17, i32 17, i32 17>
+; AVX-NEXT:    [[TMP2:%.*]] = icmp sgt <8 x i32> [[SHUFFLE]], <i32 42, i32 42, i32 42, i32 42, i32 17, i32 17, i32 17, i32 17>
+; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX-NEXT:    [[TMP4:%.*]] = extractelement <8 x i1> [[TMP3]], i32 1
+; AVX-NEXT:    call void @use1(i1 [[TMP4]])
+; AVX-NEXT:    [[TMP5:%.*]] = extractelement <8 x i1> [[TMP3]], i32 2
+; AVX-NEXT:    [[TMP6:%.*]] = extractelement <8 x i1> [[TMP3]], i32 3
+; AVX-NEXT:    [[S1:%.*]] = select i1 [[TMP6]], i1 [[TMP5]], i1 false
+; AVX-NEXT:    [[S2:%.*]] = select i1 [[S1]], i1 [[TMP4]], i1 false
+; AVX-NEXT:    [[TMP7:%.*]] = extractelement <8 x i1> [[TMP3]], i32 0
+; AVX-NEXT:    [[S3:%.*]] = select i1 [[S2]], i1 [[TMP7]], i1 false
+; AVX-NEXT:    [[TMP8:%.*]] = extractelement <8 x i1> [[TMP3]], i32 7
+; AVX-NEXT:    [[S4:%.*]] = select i1 [[S3]], i1 [[TMP8]], i1 false
+; AVX-NEXT:    [[TMP9:%.*]] = extractelement <8 x i1> [[TMP3]], i32 6
+; AVX-NEXT:    [[S5:%.*]] = select i1 [[S4]], i1 [[TMP9]], i1 false
+; AVX-NEXT:    [[TMP10:%.*]] = extractelement <8 x i1> [[TMP3]], i32 5
+; AVX-NEXT:    [[S6:%.*]] = select i1 [[S5]], i1 [[TMP10]], i1 false
+; AVX-NEXT:    [[TMP11:%.*]] = extractelement <8 x i1> [[TMP3]], i32 4
+; AVX-NEXT:    [[S7:%.*]] = select i1 [[S6]], i1 [[TMP11]], i1 false
+; AVX-NEXT:    ret i1 [[S7]]
 ;
   %x0 = extractelement <4 x i32> %x, i32 0
   %x1 = extractelement <4 x i32> %x, i32 1
@@ -298,28 +294,49 @@ define i1 @logical_and_icmp_clamp_extra_use_cmp(<4 x i32> %x) {
 }
 
 define i1 @logical_and_icmp_clamp_extra_use_select(<4 x i32> %x) {
-; CHECK-LABEL: @logical_and_icmp_clamp_extra_use_select(
-; CHECK-NEXT:    [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0
-; CHECK-NEXT:    [[X1:%.*]] = extractelement <4 x i32> [[X]], i32 1
-; CHECK-NEXT:    [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2
-; CHECK-NEXT:    [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3
-; CHECK-NEXT:    [[C0:%.*]] = icmp slt i32 [[X0]], 42
-; CHECK-NEXT:    [[C1:%.*]] = icmp slt i32 [[X1]], 42
-; CHECK-NEXT:    [[C2:%.*]] = icmp slt i32 [[X2]], 42
-; CHECK-NEXT:    [[C3:%.*]] = icmp slt i32 [[X3]], 42
-; CHECK-NEXT:    [[D0:%.*]] = icmp sgt i32 [[X0]], 17
-; CHECK-NEXT:    [[D1:%.*]] = icmp sgt i32 [[X1]], 17
-; CHECK-NEXT:    [[D2:%.*]] = icmp sgt i32 [[X2]], 17
-; CHECK-NEXT:    [[D3:%.*]] = icmp sgt i32 [[X3]], 17
-; CHECK-NEXT:    [[S1:%.*]] = select i1 [[C0]], i1 [[C1]], i1 false
-; CHECK-NEXT:    [[S2:%.*]] = select i1 [[S1]], i1 [[C2]], i1 false
-; CHECK-NEXT:    call void @use1(i1 [[S2]])
-; CHECK-NEXT:    [[S3:%.*]] = select i1 [[S2]], i1 [[C3]], i1 false
-; CHECK-NEXT:    [[S4:%.*]] = select i1 [[S3]], i1 [[D0]], i1 false
-; CHECK-NEXT:    [[S5:%.*]] = select i1 [[S4]], i1 [[D1]], i1 false
-; CHECK-NEXT:    [[S6:%.*]] = select i1 [[S5]], i1 [[D2]], i1 false
-; CHECK-NEXT:    [[S7:%.*]] = select i1 [[S6]], i1 [[D3]], i1 false
-; CHECK-NEXT:    ret i1 [[S7]]
+; SSE-LABEL: @logical_and_icmp_clamp_extra_use_select(
+; SSE-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 3
+; SSE-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[X]], i32 2
+; SSE-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[X]], i32 1
+; SSE-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[X]], i32 0
+; SSE-NEXT:    [[C0:%.*]] = icmp slt i32 [[TMP4]], 42
+; SSE-NEXT:    [[C1:%.*]] = icmp slt i32 [[TMP3]], 42
+; SSE-NEXT:    [[C2:%.*]] = icmp slt i32 [[TMP2]], 42
+; SSE-NEXT:    [[TMP5:%.*]] = icmp slt <4 x i32> [[X]], <i32 17, i32 17, i32 17, i32 42>
+; SSE-NEXT:    [[TMP6:%.*]] = icmp sgt <4 x i32> [[X]], <i32 17, i32 17, i32 17, i32 42>
+; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i1> [[TMP5]], <4 x i1> [[TMP6]], <4 x i32> <i32 4, i32 5, i32 6, i32 3>
+; SSE-NEXT:    [[D3:%.*]] = icmp sgt i32 [[TMP1]], 17
+; SSE-NEXT:    [[S1:%.*]] = select i1 [[C0]], i1 [[C1]], i1 false
+; SSE-NEXT:    [[S2:%.*]] = select i1 [[S1]], i1 [[C2]], i1 false
+; SSE-NEXT:    call void @use1(i1 [[S2]])
+; SSE-NEXT:    [[TMP8:%.*]] = freeze <4 x i1> [[TMP7]]
+; SSE-NEXT:    [[TMP9:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP8]])
+; SSE-NEXT:    [[OP_EXTRA:%.*]] = select i1 [[TMP9]], i1 [[S2]], i1 false
+; SSE-NEXT:    [[S7:%.*]] = select i1 [[OP_EXTRA]], i1 [[D3]], i1 false
+; SSE-NEXT:    ret i1 [[S7]]
+;
+; AVX-LABEL: @logical_and_icmp_clamp_extra_use_select(
+; AVX-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>
+; AVX-NEXT:    [[TMP1:%.*]] = icmp slt <8 x i32> [[SHUFFLE]], <i32 42, i32 42, i32 42, i32 42, i32 17, i32 17, i32 17, i32 17>
+; AVX-NEXT:    [[TMP2:%.*]] = icmp sgt <8 x i32> [[SHUFFLE]], <i32 42, i32 42, i32 42, i32 42, i32 17, i32 17, i32 17, i32 17>
+; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX-NEXT:    [[TMP4:%.*]] = extractelement <8 x i1> [[TMP3]], i32 2
+; AVX-NEXT:    [[TMP5:%.*]] = extractelement <8 x i1> [[TMP3]], i32 3
+; AVX-NEXT:    [[S1:%.*]] = select i1 [[TMP5]], i1 [[TMP4]], i1 false
+; AVX-NEXT:    [[TMP6:%.*]] = extractelement <8 x i1> [[TMP3]], i32 1
+; AVX-NEXT:    [[S2:%.*]] = select i1 [[S1]], i1 [[TMP6]], i1 false
+; AVX-NEXT:    call void @use1(i1 [[S2]])
+; AVX-NEXT:    [[TMP7:%.*]] = extractelement <8 x i1> [[TMP3]], i32 0
+; AVX-NEXT:    [[S3:%.*]] = select i1 [[S2]], i1 [[TMP7]], i1 false
+; AVX-NEXT:    [[TMP8:%.*]] = extractelement <8 x i1> [[TMP3]], i32 7
+; AVX-NEXT:    [[S4:%.*]] = select i1 [[S3]], i1 [[TMP8]], i1 false
+; AVX-NEXT:    [[TMP9:%.*]] = extractelement <8 x i1> [[TMP3]], i32 6
+; AVX-NEXT:    [[S5:%.*]] = select i1 [[S4]], i1 [[TMP9]], i1 false
+; AVX-NEXT:    [[TMP10:%.*]] = extractelement <8 x i1> [[TMP3]], i32 5
+; AVX-NEXT:    [[S6:%.*]] = select i1 [[S5]], i1 [[TMP10]], i1 false
+; AVX-NEXT:    [[TMP11:%.*]] = extractelement <8 x i1> [[TMP3]], i32 4
+; AVX-NEXT:    [[S7:%.*]] = select i1 [[S6]], i1 [[TMP11]], i1 false
+; AVX-NEXT:    ret i1 [[S7]]
 ;
   %x0 = extractelement <4 x i32> %x, i32 0
   %x1 = extractelement <4 x i32> %x, i32 1
@@ -395,25 +412,47 @@ define i1 @logical_and_icmp_clamp_v8i32(<8 x i32> %x, <8 x i32> %y) {
 }
 
 define i1 @logical_and_icmp_clamp_partial(<4 x i32> %x) {
-; CHECK-LABEL: @logical_and_icmp_clamp_partial(
-; CHECK-NEXT:    [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0
-; CHECK-NEXT:    [[X1:%.*]] = extractelement <4 x i32> [[X]], i32 1
-; CHECK-NEXT:    [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2
-; CHECK-NEXT:    [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3
-; CHECK-NEXT:    [[C0:%.*]] = icmp slt i32 [[X0]], 42
-; CHECK-NEXT:    [[C1:%.*]] = icmp slt i32 [[X1]], 42
-; CHECK-NEXT:    [[C2:%.*]] = icmp slt i32 [[X2]], 42
-; CHECK-NEXT:    [[D0:%.*]] = icmp sgt i32 [[X0]], 17
-; CHECK-NEXT:    [[D1:%.*]] = icmp sgt i32 [[X1]], 17
-; CHECK-NEXT:    [[D2:%.*]] = icmp sgt i32 [[X2]], 17
-; CHECK-NEXT:    [[D3:%.*]] = icmp sgt i32 [[X3]], 17
-; CHECK-NEXT:    [[S1:%.*]] = select i1 [[C0]], i1 [[C1]], i1 false
-; CHECK-NEXT:    [[S2:%.*]] = select i1 [[S1]], i1 [[C2]], i1 false
-; CHECK-NEXT:    [[S4:%.*]] = select i1 [[S2]], i1 [[D0]], i1 false
-; CHECK-NEXT:    [[S5:%.*]] = select i1 [[S4]], i1 [[D1]], i1 false
-; CHECK-NEXT:    [[S6:%.*]] = select i1 [[S5]], i1 [[D2]], i1 false
-; CHECK-NEXT:    [[S7:%.*]] = select i1 [[S6]], i1 [[D3]], i1 false
-; CHECK-NEXT:    ret i1 [[S7]]
+; SSE-LABEL: @logical_and_icmp_clamp_partial(
+; SSE-NEXT:    [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0
+; SSE-NEXT:    [[X1:%.*]] = extractelement <4 x i32> [[X]], i32 1
+; SSE-NEXT:    [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2
+; SSE-NEXT:    [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3
+; SSE-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[X0]], i32 0
+; SSE-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[X1]], i32 1
+; SSE-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[X2]], i32 2
+; SSE-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[X0]], i32 3
+; SSE-NEXT:    [[TMP5:%.*]] = icmp slt <4 x i32> [[TMP4]], <i32 42, i32 42, i32 42, i32 17>
+; SSE-NEXT:    [[TMP6:%.*]] = icmp sgt <4 x i32> [[TMP4]], <i32 42, i32 42, i32 42, i32 17>
+; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i1> [[TMP5]], <4 x i1> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; SSE-NEXT:    [[D1:%.*]] = icmp sgt i32 [[X1]], 17
+; SSE-NEXT:    [[D2:%.*]] = icmp sgt i32 [[X2]], 17
+; SSE-NEXT:    [[D3:%.*]] = icmp sgt i32 [[X3]], 17
+; SSE-NEXT:    [[TMP8:%.*]] = freeze <4 x i1> [[TMP7]]
+; SSE-NEXT:    [[TMP9:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP8]])
+; SSE-NEXT:    [[S5:%.*]] = select i1 [[TMP9]], i1 [[D1]], i1 false
+; SSE-NEXT:    [[S6:%.*]] = select i1 [[S5]], i1 [[D2]], i1 false
+; SSE-NEXT:    [[S7:%.*]] = select i1 [[S6]], i1 [[D3]], i1 false
+; SSE-NEXT:    ret i1 [[S7]]
+;
+; AVX-LABEL: @logical_and_icmp_clamp_partial(
+; AVX-NEXT:    [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0
+; AVX-NEXT:    [[X1:%.*]] = extractelement <4 x i32> [[X]], i32 1
+; AVX-NEXT:    [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2
+; AVX-NEXT:    [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3
+; AVX-NEXT:    [[C0:%.*]] = icmp slt i32 [[X0]], 42
+; AVX-NEXT:    [[C1:%.*]] = icmp slt i32 [[X1]], 42
+; AVX-NEXT:    [[C2:%.*]] = icmp slt i32 [[X2]], 42
+; AVX-NEXT:    [[D0:%.*]] = icmp sgt i32 [[X0]], 17
+; AVX-NEXT:    [[D1:%.*]] = icmp sgt i32 [[X1]], 17
+; AVX-NEXT:    [[D2:%.*]] = icmp sgt i32 [[X2]], 17
+; AVX-NEXT:    [[D3:%.*]] = icmp sgt i32 [[X3]], 17
+; AVX-NEXT:    [[S1:%.*]] = select i1 [[C0]], i1 [[C1]], i1 false
+; AVX-NEXT:    [[S2:%.*]] = select i1 [[S1]], i1 [[C2]], i1 false
+; AVX-NEXT:    [[S4:%.*]] = select i1 [[S2]], i1 [[D0]], i1 false
+; AVX-NEXT:    [[S5:%.*]] = select i1 [[S4]], i1 [[D1]], i1 false
+; AVX-NEXT:    [[S6:%.*]] = select i1 [[S5]], i1 [[D2]], i1 false
+; AVX-NEXT:    [[S7:%.*]] = select i1 [[S6]], i1 [[D3]], i1 false
+; AVX-NEXT:    ret i1 [[S7]]
 ;
   %x0 = extractelement <4 x i32> %x, i32 0
   %x1 = extractelement <4 x i32> %x, i32 1
@@ -439,26 +478,13 @@ define i1 @logical_and_icmp_clamp_partial(<4 x i32> %x) {
 
 define i1 @logical_and_icmp_clamp_pred_
diff (<4 x i32> %x) {
 ; CHECK-LABEL: @logical_and_icmp_clamp_pred_
diff (
-; CHECK-NEXT:    [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0
-; CHECK-NEXT:    [[X1:%.*]] = extractelement <4 x i32> [[X]], i32 1
-; CHECK-NEXT:    [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2
-; CHECK-NEXT:    [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3
-; CHECK-NEXT:    [[C0:%.*]] = icmp slt i32 [[X0]], 42
-; CHECK-NEXT:    [[C1:%.*]] = icmp slt i32 [[X1]], 42
-; CHECK-NEXT:    [[C2:%.*]] = icmp slt i32 [[X2]], 42
-; CHECK-NEXT:    [[C3:%.*]] = icmp ult i32 [[X3]], 42
-; CHECK-NEXT:    [[D0:%.*]] = icmp sgt i32 [[X0]], 17
-; CHECK-NEXT:    [[D1:%.*]] = icmp sgt i32 [[X1]], 17
-; CHECK-NEXT:    [[D2:%.*]] = icmp sgt i32 [[X2]], 17
-; CHECK-NEXT:    [[D3:%.*]] = icmp sgt i32 [[X3]], 17
-; CHECK-NEXT:    [[S1:%.*]] = select i1 [[C0]], i1 [[C1]], i1 false
-; CHECK-NEXT:    [[S2:%.*]] = select i1 [[S1]], i1 [[C2]], i1 false
-; CHECK-NEXT:    [[S3:%.*]] = select i1 [[S2]], i1 [[C3]], i1 false
-; CHECK-NEXT:    [[S4:%.*]] = select i1 [[S3]], i1 [[D0]], i1 false
-; CHECK-NEXT:    [[S5:%.*]] = select i1 [[S4]], i1 [[D1]], i1 false
-; CHECK-NEXT:    [[S6:%.*]] = select i1 [[S5]], i1 [[D2]], i1 false
-; CHECK-NEXT:    [[S7:%.*]] = select i1 [[S6]], i1 [[D3]], i1 false
-; CHECK-NEXT:    ret i1 [[S7]]
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp slt <8 x i32> [[SHUFFLE]], <i32 42, i32 42, i32 42, i32 42, i32 17, i32 17, i32 17, i32 17>
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ult <8 x i32> [[SHUFFLE]], <i32 42, i32 42, i32 42, i32 42, i32 17, i32 17, i32 17, i32 17>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 11, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP4:%.*]] = freeze <8 x i1> [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP4]])
+; CHECK-NEXT:    ret i1 [[TMP5]]
 ;
   %x0 = extractelement <4 x i32> %x, i32 0
   %x1 = extractelement <4 x i32> %x, i32 1