[llvm] 7ea03f0 - [SLP]Improve reductions analysis and emission, part 1.

Alexey Bataev via llvm-commits llvm-commits at lists.llvm.org
Mon May 2 12:08:43 PDT 2022


Author: Alexey Bataev
Date: 2022-05-02T12:03:58-07:00
New Revision: 7ea03f0b4e4ec5d91d48ba2976f5adc299089ffd

URL: https://github.com/llvm/llvm-project/commit/7ea03f0b4e4ec5d91d48ba2976f5adc299089ffd
DIFF: https://github.com/llvm/llvm-project/commit/7ea03f0b4e4ec5d91d48ba2976f5adc299089ffd.diff

LOG: [SLP]Improve reductions analysis and emission, part 1.

Currently the SLP vectorizer walks through the instructions and selects
3 main classes of values: 1) reduction operations - instructions with the
same reduction opcode (add, mul, min/max, etc.), which build the
reduction, 2) reduced values - instructions with the same opcodes, but
different from the reduction opcode, 3) extra arguments - all other
values: instructions from a different basic block than the root node, or
instructions with too many or too few uses.
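
As a sketch, the three classes can be seen in a hypothetical C++ source
snippet (illustration only, not taken from the patch):

  // The adds accumulating into 'sum' are the reduction operations
  // (same opcode: add). The loads a[0]..a[3] are the reduced values
  // (same opcode, but different from the reduction opcode). The
  // function argument 'n' is a non-instruction value, so it ends up
  // as an extra argument.
  int reduce4(const int *a, int n) {
    int sum = a[0] + a[1];
    sum = sum + a[2];
    sum = sum + a[3];
    return sum + n;
  }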

This scheme is not very efficient. It excludes some instructions and all
non-instruction values from the reductions (constants, profitable
gathers), and too many possibly reduced values are marked as extra
arguments.
This patch improves the process by introducing a slightly extended
analysis stage. During this stage, we still try to select 3 classes of
values: 1) reduction operations - same as before, 2) possibly reduced
values - all instructions from the current block/non-instructions, which
may build a vectorization tree, 3) extra arguments - instructions from
other basic blocks. Additionally, an extra sorting of the possibly
reduced values occurs to build the scalar sequences which will very
likely be vectorized, e.g. loads are grouped by the distance between
them, constants are grouped together, cmp instructions are sorted by
their compare types and predicates, extractelement instructions are
sorted by the vector operand, etc. Also, these groups are reordered by
their length so that the longest group comes first in the list of the
possibly reduced values.
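
The grouping step can be pictured as bucketing the candidates by a
(key, subkey) hash pair and sorting the buckets by size. A minimal,
self-contained C++ sketch follows; KeyOf is a hypothetical stand-in for
the patch's generateKeySubkey(), which additionally consults TLI, DL and
SE for loads:

  #include <algorithm>
  #include <cstddef>
  #include <functional>
  #include <map>
  #include <utility>
  #include <vector>

  struct Val {}; // stand-in for llvm::Value

  // Group candidate reduced values by a (key, subkey) pair, then order
  // the groups by size so the longest (most promising) scalar sequence
  // is tried first.
  std::vector<std::vector<Val *>> groupCandidates(
      const std::vector<Val *> &Candidates,
      const std::function<std::pair<size_t, size_t>(Val *)> &KeyOf) {
    std::map<std::pair<size_t, size_t>, std::vector<Val *>> Groups;
    for (Val *V : Candidates)
      Groups[KeyOf(V)].push_back(V);
    std::vector<std::vector<Val *>> Sorted;
    for (auto &KV : Groups)
      Sorted.push_back(std::move(KV.second));
    // Longest group first: the most promising vectorizable sequence.
    std::stable_sort(Sorted.begin(), Sorted.end(),
                     [](const auto &A, const auto &B) {
                       return A.size() > B.size();
                     });
    return Sorted;
  }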

The vectorization process tries to emit a reduction for each of these
groups. These reductions, the remaining non-vectorized possibly reduced
values, and the extra arguments are then combined into the final
expression just as before.
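
Conceptually, the combining step is a fold with the reduction opcode
over the per-group vector reductions, the leftover scalars, and the
extra arguments. A simplified sketch for an integer add reduction
(hypothetical names, not the patch's code):

  #include <vector>

  // Chain the partial results with the reduction opcode (here: add),
  // mirroring the 'op.rdx' chain the vectorizer emits.
  int combineReduction(const std::vector<int> &GroupReductions,
                       const std::vector<int> &LeftoverScalars,
                       const std::vector<int> &ExtraArgs) {
    int Result = 0;
    for (int R : GroupReductions) // one value per vectorized group
      Result += R;
    for (int S : LeftoverScalars) // possibly reduced values left scalar
      Result += S;
    for (int E : ExtraArgs)       // values outside the reduction tree
      Result += E;
    return Result;
  }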

Differential Revision: https://reviews.llvm.org/D114171

Added: 
    

Modified: 
    llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
    llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-logical.ll
    llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll
    llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll
    llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll
    llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll
    llvm/test/Transforms/SLPVectorizer/X86/PR35628_1.ll
    llvm/test/Transforms/SLPVectorizer/X86/PR35628_2.ll
    llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll
    llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll
    llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle-inseltpoison.ll
    llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll
    llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll
    llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll
    llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
    llvm/test/Transforms/SLPVectorizer/X86/malformed_phis.ll
    llvm/test/Transforms/SLPVectorizer/X86/matched-shuffled-entries.ll
    llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
    llvm/test/Transforms/SLPVectorizer/X86/reduction-same-vals.ll
    llvm/test/Transforms/SLPVectorizer/X86/reduction_loads.ll
    llvm/test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll
    llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll
    llvm/test/Transforms/SLPVectorizer/X86/revectorized_rdx_crash.ll
    llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll
    llvm/test/Transforms/SLPVectorizer/slp-umax-rdx-matcher-crash.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index a6b1bb8539a5c..a17396e97ebb7 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2042,6 +2042,36 @@ class BoUpSLP {
     DeletedInstructions.insert(I);
   }
 
+  /// Checks if the instruction was already analyzed for being a possible
+  /// reduction root.
+  bool isAnalizedReductionRoot(Instruction *I) const {
+    return AnalizedReductionsRoots.count(I);
+  }
+  /// Registers the given instruction as already analyzed for being a
+  /// possible reduction root.
+  void analyzedReductionRoot(Instruction *I) {
+    AnalizedReductionsRoots.insert(I);
+  }
+  /// Checks if the provided list of reduced values was already checked for
+  /// vectorization.
+  bool areAnalyzedReductionVals(ArrayRef<Value *> VL) {
+    return AnalyzedReductionVals.contains(hash_value(VL));
+  }
+  /// Adds the list of reduced values to the list of values already checked
+  /// for vectorization.
+  void analyzedReductionVals(ArrayRef<Value *> VL) {
+    AnalyzedReductionVals.insert(hash_value(VL));
+  }
+  /// Clears the set of analyzed reduction root instructions and the set of
+  /// hashes of analyzed reduction values.
+  void clearReductionData() {
+    AnalizedReductionsRoots.clear();
+    AnalyzedReductionVals.clear();
+  }
+  /// Checks if the given value is gathered in one of the nodes.
+  bool isGathered(Value *V) const {
+    return MustGather.contains(V);
+  }
+
   ~BoUpSLP();
 
 private:
@@ -2603,6 +2633,12 @@ class BoUpSLP {
   /// previously deleted instruction.
   DenseSet<Instruction *> DeletedInstructions;
 
+  /// Set of instructions already analyzed as possible reduction roots.
+  SmallPtrSet<Instruction *, 16> AnalizedReductionsRoots;
+
+  /// Set of hashes for the lists of reduction values already analyzed.
+  DenseSet<size_t> AnalyzedReductionVals;
+
   /// A list of values that need to be extracted out of the tree.
   /// This list holds pairs of (Internal Scalar : External User). External User
   /// can be nullptr, it means that this Internal Scalar will be used later,
@@ -4041,6 +4077,83 @@ static bool needToScheduleSingleInstruction(ArrayRef<Value *> VL) {
 }
 #endif
 
+/// Generates a key/subkey pair for the given value to provide effective
+/// sorting of the values and better detection of vectorizable value
+/// sequences. The keys/subkeys can be used for better sorting of the values
+/// themselves (keys) and in value subgroups (subkeys).
+static std::pair<size_t, size_t> generateKeySubkey(
+    Value *V, const TargetLibraryInfo *TLI,
+    function_ref<hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator,
+    bool AllowAlternate) {
+  hash_code Key = hash_value(V->getValueID() + 2);
+  hash_code SubKey = hash_value(0);
+  // Sort the loads by the distance between the pointers.
+  if (auto *LI = dyn_cast<LoadInst>(V)) {
+    Key = hash_combine(hash_value(Instruction::Load), Key);
+    if (LI->isSimple())
+      SubKey = hash_value(LoadsSubkeyGenerator(Key, LI));
+    else
+      SubKey = hash_value(LI);
+  } else if (isVectorLikeInstWithConstOps(V)) {
+    // Sort extracts by the vector operands.
+    if (isa<ExtractElementInst, UndefValue>(V))
+      Key = hash_value(Value::UndefValueVal + 1);
+    if (auto *EI = dyn_cast<ExtractElementInst>(V)) {
+      if (!isUndefVector(EI->getVectorOperand()) &&
+          !isa<UndefValue>(EI->getIndexOperand()))
+        SubKey = hash_value(EI->getVectorOperand());
+    }
+  } else if (auto *I = dyn_cast<Instruction>(V)) {
+    // Sort other instructions just by the opcode, except for CmpInst.
+    // For CmpInst, also sort by the predicate kind.
+    if ((isa<BinaryOperator>(I) || isa<CastInst>(I)) &&
+        isValidForAlternation(I->getOpcode())) {
+      if (AllowAlternate)
+        Key = hash_value(isa<BinaryOperator>(I) ? 1 : 0);
+      else
+        Key = hash_combine(hash_value(I->getOpcode()), Key);
+      SubKey = hash_combine(
+          hash_value(I->getOpcode()), hash_value(I->getType()),
+          hash_value(isa<BinaryOperator>(I)
+                         ? I->getType()
+                         : cast<CastInst>(I)->getOperand(0)->getType()));
+    } else if (auto *CI = dyn_cast<CmpInst>(I)) {
+      CmpInst::Predicate Pred = CI->getPredicate();
+      if (CI->isCommutative())
+        Pred = std::min(Pred, CmpInst::getInversePredicate(Pred));
+      CmpInst::Predicate SwapPred = CmpInst::getSwappedPredicate(Pred);
+      SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Pred),
+                            hash_value(SwapPred),
+                            hash_value(CI->getOperand(0)->getType()));
+    } else if (auto *Call = dyn_cast<CallInst>(I)) {
+      Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, TLI);
+      if (isTriviallyVectorizable(ID))
+        SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(ID));
+      else if (!VFDatabase(*Call).getMappings(*Call).empty())
+        SubKey = hash_combine(hash_value(I->getOpcode()),
+                              hash_value(Call->getCalledFunction()));
+      else
+        SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Call));
+      for (const CallBase::BundleOpInfo &Op : Call->bundle_op_infos())
+        SubKey = hash_combine(hash_value(Op.Begin), hash_value(Op.End),
+                              hash_value(Op.Tag), SubKey);
+    } else if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
+      if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1)))
+        SubKey = hash_value(Gep->getPointerOperand());
+      else
+        SubKey = hash_value(Gep);
+    } else if (BinaryOperator::isIntDivRem(I->getOpcode()) &&
+               !isa<ConstantInt>(I->getOperand(1))) {
+      // Do not try to vectorize instructions with potentially high cost.
+      SubKey = hash_value(I);
+    } else {
+      SubKey = hash_value(I->getOpcode());
+    }
+    Key = hash_combine(hash_value(I->getParent()), Key);
+  }
+  return std::make_pair(Key, SubKey);
+}
+
 void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
                             const EdgeInfo &UserTreeIdx) {
   assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
@@ -6459,6 +6572,12 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, SmallVectorImpl<int> &Mask,
     }
   }
 
+  if (UsedTEs.empty()) {
+    assert(all_of(TE->Scalars, UndefValue::classof) &&
+           "Expected vector of undefs only.");
+    return None;
+  }
+
   unsigned VF = 0;
   if (UsedTEs.size() == 1) {
     // Try to find the perfect match in another gather node at first.
@@ -6612,11 +6731,15 @@ void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
   // should not be scheduled.
   if (E->State != TreeEntry::NeedToGather &&
       doesNotNeedToSchedule(E->Scalars)) {
-    BasicBlock::iterator InsertPt;
+    Instruction *InsertInst;
     if (all_of(E->Scalars, isUsedOutsideBlock))
-      InsertPt = FindLastInst()->getIterator();
+      InsertInst = FindLastInst();
     else
-      InsertPt = FindFirstInst()->getIterator();
+      InsertInst = FindFirstInst();
+    // If the instruction is a PHI, set the insert point after all the PHIs.
+    if (isa<PHINode>(InsertInst))
+      InsertInst = BB->getFirstNonPHI();
+    BasicBlock::iterator InsertPt = InsertInst->getIterator();
     Builder.SetInsertPoint(BB, InsertPt);
     Builder.SetCurrentDebugLocation(Front->getDebugLoc());
     return;
@@ -6658,13 +6781,17 @@ void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
   // not ideal. However, this should be exceedingly rare since it requires that
   // we both exit early from buildTree_rec and that the bundle be out-of-order
   // (causing us to iterate all the way to the end of the block).
-  if (!LastInst)
+  if (!LastInst) {
     LastInst = FindLastInst();
+    // If the instruction is a PHI, set the insert point after all the PHIs.
+    if (isa<PHINode>(LastInst))
+      LastInst = BB->getFirstNonPHI()->getPrevNode();
+  }
   assert(LastInst && "Failed to find last instruction in bundle");
 
   // Set the insertion point after the last instruction in the bundle. Set the
   // debug location to Front.
-  Builder.SetInsertPoint(BB, ++LastInst->getIterator());
+  Builder.SetInsertPoint(BB, std::next(LastInst->getIterator()));
   Builder.SetCurrentDebugLocation(Front->getDebugLoc());
 }
 
@@ -8804,6 +8931,8 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
 
   // Scan the blocks in the function in post order.
   for (auto BB : post_order(&F.getEntryBlock())) {
+    // Start new block - clear the list of reduction roots.
+    R.clearReductionData();
     collectSeedInstructions(BB);
 
     // Vectorize trees that end at stores.
@@ -9273,15 +9402,16 @@ class HorizontalReduction {
   using ReductionOpsType = SmallVector<Value *, 16>;
   using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
   ReductionOpsListType ReductionOps;
-  SmallVector<Value *, 32> ReducedVals;
+  /// List of possibly reduced values.
+  SmallVector<SmallVector<Value *>> ReducedVals;
+  /// Maps a reduced value to its corresponding reduction operations.
+  DenseMap<Value *, SmallVector<Instruction *>> ReducedValsToOps;
   // Use map vector to make stable output.
   MapVector<Instruction *, Value *> ExtraArgs;
   WeakTrackingVH ReductionRoot;
   /// The type of reduction operation.
   RecurKind RdxKind;
 
-  const unsigned INVALID_OPERAND_INDEX = std::numeric_limits<unsigned>::max();
-
   static bool isCmpSelMinMax(Instruction *I) {
     return match(I, m_Select(m_Cmp(), m_Value(), m_Value())) &&
            RecurrenceDescriptor::isMinMaxRecurrenceKind(getRdxKind(I));
@@ -9325,26 +9455,6 @@ class HorizontalReduction {
     return I->getOperand(Index);
   }
 
-  /// Checks if the ParentStackElem.first should be marked as a reduction
-  /// operation with an extra argument or as extra argument itself.
-  void markExtraArg(std::pair<Instruction *, unsigned> &ParentStackElem,
-                    Value *ExtraArg) {
-    if (ExtraArgs.count(ParentStackElem.first)) {
-      ExtraArgs[ParentStackElem.first] = nullptr;
-      // We ran into something like:
-      // ParentStackElem.first = ExtraArgs[ParentStackElem.first] + ExtraArg.
-      // The whole ParentStackElem.first should be considered as an extra value
-      // in this case.
-      // Do not perform analysis of remaining operands of ParentStackElem.first
-      // instruction, this whole instruction is an extra argument.
-      ParentStackElem.second = INVALID_OPERAND_INDEX;
-    } else {
-      // We ran into something like:
-      // ParentStackElem.first += ... + ExtraArg + ...
-      ExtraArgs[ParentStackElem.first] = ExtraArg;
-    }
-  }
-
   /// Creates reduction operation with the current opcode.
   static Value *createOp(IRBuilder<> &Builder, RecurKind Kind, Value *LHS,
                          Value *RHS, const Twine &Name, bool UseSelect) {
@@ -9429,7 +9539,7 @@ class HorizontalReduction {
   /// Creates reduction operation with the current opcode with the IR flags
   /// from \p I.
   static Value *createOp(IRBuilder<> &Builder, RecurKind RdxKind, Value *LHS,
-                         Value *RHS, const Twine &Name, Instruction *I) {
+                         Value *RHS, const Twine &Name, Value *I) {
     auto *SelI = dyn_cast<SelectInst>(I);
     Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, SelI != nullptr);
     if (SelI && RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) {
@@ -9440,8 +9550,10 @@ class HorizontalReduction {
     return Op;
   }
 
-  static RecurKind getRdxKind(Instruction *I) {
-    assert(I && "Expected instruction for reduction matching");
+  static RecurKind getRdxKind(Value *V) {
+    auto *I = dyn_cast<Instruction>(V);
+    if (!I)
+      return RecurKind::None;
     if (match(I, m_Add(m_Value(), m_Value())))
       return RecurKind::Add;
     if (match(I, m_Mul(m_Value(), m_Value())))
@@ -9603,7 +9715,9 @@ class HorizontalReduction {
   HorizontalReduction() = default;
 
   /// Try to find a reduction tree.
-  bool matchAssociativeReduction(PHINode *Phi, Instruction *Inst) {
+  bool matchAssociativeReduction(PHINode *Phi, Instruction *Inst,
+                                 ScalarEvolution &SE, const DataLayout &DL,
+                                 const TargetLibraryInfo &TLI) {
     assert((!Phi || is_contained(Phi->operands(), Inst)) &&
            "Phi needs to use the binary operator");
     assert((isa<BinaryOperator>(Inst) || isa<SelectInst>(Inst) ||
@@ -9647,124 +9761,168 @@ class HorizontalReduction {
 
     ReductionRoot = Inst;
 
-    // The opcode for leaf values that we perform a reduction on.
-    // For example: load(x) + load(y) + load(z) + fptoui(w)
-    // The leaf opcode for 'w' does not match, so we don't include it as a
-    // potential candidate for the reduction.
-    unsigned LeafOpcode = 0;
-
-    // Post-order traverse the reduction tree starting at Inst. We only handle
-    // true trees containing binary operators or selects.
-    SmallVector<std::pair<Instruction *, unsigned>, 32> Stack;
-    Stack.push_back(std::make_pair(Inst, getFirstOperandIndex(Inst)));
-    initReductionOps(Inst);
-    while (!Stack.empty()) {
-      Instruction *TreeN = Stack.back().first;
-      unsigned EdgeToVisit = Stack.back().second++;
-      const RecurKind TreeRdxKind = getRdxKind(TreeN);
-      bool IsReducedValue = TreeRdxKind != RdxKind;
-
-      // Postorder visit.
-      if (IsReducedValue || EdgeToVisit >= getNumberOfOperands(TreeN)) {
-        if (IsReducedValue)
-          ReducedVals.push_back(TreeN);
-        else {
-          auto ExtraArgsIter = ExtraArgs.find(TreeN);
-          if (ExtraArgsIter != ExtraArgs.end() && !ExtraArgsIter->second) {
-            // Check if TreeN is an extra argument of its parent operation.
-            if (Stack.size() <= 1) {
-              // TreeN can't be an extra argument as it is a root reduction
-              // operation.
-              return false;
-            }
-            // Yes, TreeN is an extra argument, do not add it to a list of
-            // reduction operations.
-            // Stack[Stack.size() - 2] always points to the parent operation.
-            markExtraArg(Stack[Stack.size() - 2], TreeN);
-            ExtraArgs.erase(TreeN);
-          } else
-            addReductionOps(TreeN);
+    // Iterate through all the operands of the possible reduction tree and
+    // gather all the reduced values, sorting them by their value id.
+    BasicBlock *BB = Inst->getParent();
+    bool IsCmpSelMinMax = isCmpSelMinMax(Inst);
+    SmallVector<Instruction *> Worklist(1, Inst);
+    // Checks if the operands of the \p TreeN instruction are also reduction
+    // operations or should be treated as reduced values or an extra argument,
+    // which is not part of the reduction.
+    auto &&CheckOperands = [this, IsCmpSelMinMax,
+                            BB](Instruction *TreeN,
+                                SmallVectorImpl<Value *> &ExtraArgs,
+                                SmallVectorImpl<Value *> &PossibleReducedVals,
+                                SmallVectorImpl<Instruction *> &ReductionOps) {
+      for (int I = getFirstOperandIndex(TreeN),
+               End = getNumberOfOperands(TreeN);
+           I < End; ++I) {
+        Value *EdgeVal = getRdxOperand(TreeN, I);
+        ReducedValsToOps[EdgeVal].push_back(TreeN);
+        auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
+        // Edge has wrong parent - mark as an extra argument.
+        if (EdgeInst && !isVectorLikeInstWithConstOps(EdgeInst) &&
+            !hasSameParent(EdgeInst, BB)) {
+          ExtraArgs.push_back(EdgeVal);
+          continue;
         }
-        // Retract.
-        Stack.pop_back();
-        continue;
-      }
-
-      // Visit operands.
-      Value *EdgeVal = getRdxOperand(TreeN, EdgeToVisit);
-      auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
-      if (!EdgeInst) {
-        // Edge value is not a reduction instruction or a leaf instruction.
-        // (It may be a constant, function argument, or something else.)
-        markExtraArg(Stack.back(), EdgeVal);
-        continue;
+        // If the edge is not an instruction, or it is different from the main
+        // reduction opcode or has too many uses - possible reduced value.
+        if (!EdgeInst || getRdxKind(EdgeInst) != RdxKind ||
+            !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
+            !isVectorizable(getRdxKind(EdgeInst), EdgeInst)) {
+          PossibleReducedVals.push_back(EdgeVal);
+          continue;
+        }
+        ReductionOps.push_back(EdgeInst);
       }
-      RecurKind EdgeRdxKind = getRdxKind(EdgeInst);
-      // Continue analysis if the next operand is a reduction operation or
-      // (possibly) a leaf value. If the leaf value opcode is not set,
-      // the first met operation != reduction operation is considered as the
-      // leaf opcode.
-      // Only handle trees in the current basic block.
-      // Each tree node needs to have minimal number of users except for the
-      // ultimate reduction.
-      const bool IsRdxInst = EdgeRdxKind == RdxKind;
-      if (EdgeInst != Phi && EdgeInst != Inst &&
-          hasSameParent(EdgeInst, Inst->getParent()) &&
-          hasRequiredNumberOfUses(isCmpSelMinMax(Inst), EdgeInst) &&
-          (!LeafOpcode || LeafOpcode == EdgeInst->getOpcode() || IsRdxInst)) {
-        if (IsRdxInst) {
-          // We need to be able to reassociate the reduction operations.
-          if (!isVectorizable(EdgeRdxKind, EdgeInst)) {
-            // I is an extra argument for TreeN (its parent operation).
-            markExtraArg(Stack.back(), EdgeInst);
-            continue;
-          }
-        } else if (!LeafOpcode) {
-          LeafOpcode = EdgeInst->getOpcode();
+    };
+    // Try to regroup the reduced values so that it becomes more profitable to
+    // try to reduce them. Values are grouped by their value ids, instructions
+    // - by the instruction op id and/or alternate op id, plus extra analysis
+    // is done for loads (grouping them by the distance between pointers) and
+    // cmp instructions (grouping them by the predicate).
+    MapVector<size_t, MapVector<size_t, MapVector<Value *, unsigned>>>
+        PossibleReducedVals;
+    initReductionOps(Inst);
+    while (!Worklist.empty()) {
+      Instruction *TreeN = Worklist.pop_back_val();
+      SmallVector<Value *> Args;
+      SmallVector<Value *> PossibleRedVals;
+      SmallVector<Instruction *> PossibleReductionOps;
+      CheckOperands(TreeN, Args, PossibleRedVals, PossibleReductionOps);
+      // If too many extra args - mark the instruction itself as a reduction
+      // value, not a reduction operation.
+      if (Args.size() < 2) {
+        addReductionOps(TreeN);
+        // Add extra args.
+        if (!Args.empty()) {
+          assert(Args.size() == 1 && "Expected only single argument.");
+          ExtraArgs[TreeN] = Args.front();
         }
-        Stack.push_back(
-            std::make_pair(EdgeInst, getFirstOperandIndex(EdgeInst)));
-        continue;
+        // Add reduction values. The values are sorted for better vectorization
+        // results.
+        for (Value *V : PossibleRedVals) {
+          size_t Key, Idx;
+          std::tie(Key, Idx) = generateKeySubkey(
+              V, &TLI,
+              [&PossibleReducedVals, &DL, &SE](size_t Key, LoadInst *LI) {
+                for (const auto &LoadData : PossibleReducedVals[Key]) {
+                  auto *RLI = cast<LoadInst>(LoadData.second.front().first);
+                  if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
+                                      LI->getType(), LI->getPointerOperand(),
+                                      DL, SE, /*StrictCheck=*/true))
+                    return hash_value(RLI->getPointerOperand());
+                }
+                return hash_value(LI->getPointerOperand());
+              },
+              /*AllowAlternate=*/false);
+          ++PossibleReducedVals[Key][Idx]
+                .insert(std::make_pair(V, 0))
+                .first->second;
+        }
+        Worklist.append(PossibleReductionOps.rbegin(),
+                        PossibleReductionOps.rend());
+      } else {
+        size_t Key, Idx;
+        std::tie(Key, Idx) = generateKeySubkey(
+            TreeN, &TLI,
+            [&PossibleReducedVals, &DL, &SE](size_t Key, LoadInst *LI) {
+              for (const auto &LoadData : PossibleReducedVals[Key]) {
+                auto *RLI = cast<LoadInst>(LoadData.second.front().first);
+                if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
+                                    LI->getType(), LI->getPointerOperand(), DL,
+                                    SE, /*StrictCheck=*/true))
+                  return hash_value(RLI->getPointerOperand());
+              }
+              return hash_value(LI->getPointerOperand());
+            },
+            /*AllowAlternate=*/false);
+        ++PossibleReducedVals[Key][Idx]
+              .insert(std::make_pair(TreeN, 0))
+              .first->second;
       }
-      // I is an extra argument for TreeN (its parent operation).
-      markExtraArg(Stack.back(), EdgeInst);
     }
+    auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
+    // Sort values by the total number of value kinds to start the reduction
+    // from the longest possible reduced value sequences.
+    for (auto &PossibleReducedVals : PossibleReducedValsVect) {
+      auto PossibleRedVals = PossibleReducedVals.second.takeVector();
+      SmallVector<SmallVector<Value *>> PossibleRedValsVect;
+      for (auto It = PossibleRedVals.begin(), E = PossibleRedVals.end();
+           It != E; ++It) {
+        PossibleRedValsVect.emplace_back();
+        auto RedValsVect = It->second.takeVector();
+        stable_sort(RedValsVect, [](const auto &P1, const auto &P2) {
+          return P1.second < P2.second;
+        });
+        for (const std::pair<Value *, unsigned> &Data : RedValsVect)
+          PossibleRedValsVect.back().append(Data.second, Data.first);
+      }
+      stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
+        return P1.size() > P2.size();
+      });
+      ReducedVals.emplace_back();
+      for (ArrayRef<Value *> Data : PossibleRedValsVect)
+        ReducedVals.back().append(Data.rbegin(), Data.rend());
+    }
+    // Sort the reduced values by number of same/alternate opcode and/or pointer
+    // operand.
+    stable_sort(ReducedVals, [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) {
+      return P1.size() > P2.size();
+    });
     return true;
   }
 
   /// Attempt to vectorize the tree found by matchAssociativeReduction.
   Value *tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) {
+    constexpr int ReductionLimit = 4;
     // If there are a sufficient number of reduction values, reduce
     // to a nearby power-of-2. We can safely generate oversized
     // vectors and rely on the backend to split them to legal sizes.
-    unsigned NumReducedVals = ReducedVals.size();
-    if (NumReducedVals < 4)
+    unsigned NumReducedVals = std::accumulate(
+        ReducedVals.begin(), ReducedVals.end(), 0,
+        [](int Num, ArrayRef<Value *> Vals) { return Num + Vals.size(); });
+    if (NumReducedVals < ReductionLimit)
       return nullptr;
 
-    // Intersect the fast-math-flags from all reduction operations.
-    FastMathFlags RdxFMF;
-    RdxFMF.set();
-    for (ReductionOpsType &RdxOp : ReductionOps) {
-      for (Value *RdxVal : RdxOp) {
-        if (auto *FPMO = dyn_cast<FPMathOperator>(RdxVal))
-          RdxFMF &= FPMO->getFastMathFlags();
-      }
-    }
-
     IRBuilder<> Builder(cast<Instruction>(ReductionRoot));
-    Builder.setFastMathFlags(RdxFMF);
 
+    // Track the reduced values in case they are replaced by extractelement
+    // instructions because of the vectorization.
+    DenseMap<Value *, WeakTrackingVH> TrackedVals;
     BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
     // The same extra argument may be used several times, so log each attempt
     // to use it.
     for (const std::pair<Instruction *, Value *> &Pair : ExtraArgs) {
       assert(Pair.first && "DebugLoc must be set.");
       ExternallyUsedValues[Pair.second].push_back(Pair.first);
+      TrackedVals.try_emplace(Pair.second, Pair.second);
     }
 
     // The compare instruction of a min/max is the insertion point for new
     // instructions and may be replaced with a new compare instruction.
-    auto getCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
+    auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
       assert(isa<SelectInst>(RdxRootInst) &&
              "Expected min/max reduction to have select root instruction");
       Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
@@ -9776,141 +9934,289 @@ class HorizontalReduction {
     // The reduction root is used as the insertion point for new instructions,
     // so set it as externally used to prevent it from being deleted.
     ExternallyUsedValues[ReductionRoot];
-    SmallVector<Value *, 16> IgnoreList;
-    for (ReductionOpsType &RdxOp : ReductionOps)
-      IgnoreList.append(RdxOp.begin(), RdxOp.end());
-
-    unsigned ReduxWidth = PowerOf2Floor(NumReducedVals);
-    if (NumReducedVals > ReduxWidth) {
-      // In the loop below, we are building a tree based on a window of
-      // 'ReduxWidth' values.
-      // If the operands of those values have common traits (compare predicate,
-      // constant operand, etc), then we want to group those together to
-      // minimize the cost of the reduction.
-
-      // TODO: This should be extended to count common operands for
-      //       compares and binops.
-
-      // Step 1: Count the number of times each compare predicate occurs.
-      SmallDenseMap<unsigned, unsigned> PredCountMap;
-      for (Value *RdxVal : ReducedVals) {
-        CmpInst::Predicate Pred;
-        if (match(RdxVal, m_Cmp(Pred, m_Value(), m_Value())))
-          ++PredCountMap[Pred];
-      }
-      // Step 2: Sort the values so the most common predicates come first.
-      stable_sort(ReducedVals, [&PredCountMap](Value *A, Value *B) {
-        CmpInst::Predicate PredA, PredB;
-        if (match(A, m_Cmp(PredA, m_Value(), m_Value())) &&
-            match(B, m_Cmp(PredB, m_Value(), m_Value()))) {
-          return PredCountMap[PredA] > PredCountMap[PredB];
-        }
-        return false;
-      });
-    }
+    SmallVector<Value *> IgnoreList;
+    for (ReductionOpsType &RdxOps : ReductionOps)
+      for (Value *RdxOp : RdxOps) {
+        if (!RdxOp)
+          continue;
+        IgnoreList.push_back(RdxOp);
+      }
+    bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot));
+
+    // Need to track the reduced values; they may be changed during
+    // vectorization of subvectors.
+    for (ArrayRef<Value *> Candidates : ReducedVals)
+      for (Value *V : Candidates)
+        TrackedVals.try_emplace(V, V);
 
+    DenseMap<Value *, unsigned> VectorizedVals;
     Value *VectorizedTree = nullptr;
-    unsigned i = 0;
-    while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) {
-      ArrayRef<Value *> VL(&ReducedVals[i], ReduxWidth);
-      V.buildTree(VL, IgnoreList);
-      if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true))
-        break;
-      if (V.isLoadCombineReductionCandidate(RdxKind))
-        break;
-      V.reorderTopToBottom();
-      V.reorderBottomToTop(/*IgnoreReorder=*/true);
-      V.buildExternalUses(ExternallyUsedValues);
-
-      // For a poison-safe boolean logic reduction, do not replace select
-      // instructions with logic ops. All reduced values will be frozen (see
-      // below) to prevent leaking poison.
-      if (isa<SelectInst>(ReductionRoot) &&
-          isBoolLogicOp(cast<Instruction>(ReductionRoot)) &&
-          NumReducedVals != ReduxWidth)
-        break;
+    bool CheckForReusedReductionOps = false;
+    // Try to vectorize elements based on their type.
+    for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
+      ArrayRef<Value *> OrigReducedVals = ReducedVals[I];
+      InstructionsState S = getSameOpcode(OrigReducedVals);
+      SmallVector<Value *> Candidates;
+      DenseMap<Value *, Value *> TrackedToOrig;
+      for (unsigned Cnt = 0, Sz = OrigReducedVals.size(); Cnt < Sz; ++Cnt) {
+        Value *RdxVal = TrackedVals.find(OrigReducedVals[Cnt])->second;
+        // Check if the reduction value was not overridden by an extractelement
+        // instruction because of the vectorization, and exclude it if it is
+        // not compatible with other values.
+        if (auto *Inst = dyn_cast<Instruction>(RdxVal))
+          if (isVectorLikeInstWithConstOps(Inst) &&
+              (!S.getOpcode() || !S.isOpcodeOrAlt(Inst)))
+            continue;
+        Candidates.push_back(RdxVal);
+        TrackedToOrig.try_emplace(RdxVal, OrigReducedVals[Cnt]);
+      }
+      bool ShuffledExtracts = false;
+      // Try to handle shuffled extractelements.
+      if (S.getOpcode() == Instruction::ExtractElement && !S.isAltShuffle() &&
+          I + 1 < E) {
+        InstructionsState NextS = getSameOpcode(ReducedVals[I + 1]);
+        if (NextS.getOpcode() == Instruction::ExtractElement &&
+            !NextS.isAltShuffle()) {
+          SmallVector<Value *> CommonCandidates(Candidates);
+          for (Value *RV : ReducedVals[I + 1]) {
+            Value *RdxVal = TrackedVals.find(RV)->second;
+            // Check if the reduction value was not overridden by an
+            // extractelement instruction because of the vectorization, and
+            // exclude it if it is not compatible with other values.
+            if (auto *Inst = dyn_cast<Instruction>(RdxVal))
+              if (!NextS.getOpcode() || !NextS.isOpcodeOrAlt(Inst))
+                continue;
+            CommonCandidates.push_back(RdxVal);
+            TrackedToOrig.try_emplace(RdxVal, RV);
+          }
+          SmallVector<int> Mask;
+          if (isFixedVectorShuffle(CommonCandidates, Mask)) {
+            ++I;
+            Candidates.swap(CommonCandidates);
+            ShuffledExtracts = true;
+          }
+        }
+      }
+      unsigned NumReducedVals = Candidates.size();
+      if (NumReducedVals < ReductionLimit)
+        continue;
 
-      V.computeMinimumValueSizes();
+      unsigned ReduxWidth = PowerOf2Floor(NumReducedVals);
+      unsigned Start = 0;
+      unsigned Pos = Start;
+      // Restarts the vectorization attempt with a lower vector factor.
+      unsigned PrevReduxWidth = ReduxWidth;
+      bool CheckForReusedReductionOpsLocal = false;
+      auto &&AdjustReducedVals = [&Pos, &Start, &ReduxWidth, NumReducedVals,
+                                  &CheckForReusedReductionOpsLocal,
+                                  &PrevReduxWidth, &V,
+                                  &IgnoreList](bool IgnoreVL = false) {
+        bool IsAnyRedOpGathered =
+            !IgnoreVL && any_of(IgnoreList, [&V](Value *RedOp) {
+              return V.isGathered(RedOp);
+            });
+        if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
+          // Check if any of the reduction ops are gathered. If so, it is worth
+          // trying again with a smaller number of reduction ops.
+          CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
+        }
+        ++Pos;
+        if (Pos < NumReducedVals - ReduxWidth + 1)
+          return IsAnyRedOpGathered;
+        Pos = Start;
+        ReduxWidth /= 2;
+        return IsAnyRedOpGathered;
+      };
+      while (Pos < NumReducedVals - ReduxWidth + 1 &&
+             ReduxWidth >= ReductionLimit) {
+        // Dependency in the tree of the reduction ops - drop this attempt and
+        // try again later.
+        if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
+            Start == 0) {
+          CheckForReusedReductionOps = true;
+          break;
+        }
+        PrevReduxWidth = ReduxWidth;
+        ArrayRef<Value *> VL(std::next(Candidates.begin(), Pos), ReduxWidth);
+        // Already analyzed - skip.
+        if (V.areAnalyzedReductionVals(VL)) {
+          (void)AdjustReducedVals(/*IgnoreVL=*/true);
+          continue;
+        }
+        // Early exit if any of the reduction values were deleted during
+        // previous vectorization attempts.
+        if (any_of(VL, [&V](Value *RedVal) {
+              auto *RedValI = dyn_cast<Instruction>(RedVal);
+              if (!RedValI)
+                return false;
+              return V.isDeleted(RedValI);
+            }))
+          break;
+        V.buildTree(VL, IgnoreList);
+        if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) {
+          if (!AdjustReducedVals())
+            V.analyzedReductionVals(VL);
+          continue;
+        }
+        if (V.isLoadCombineReductionCandidate(RdxKind)) {
+          if (!AdjustReducedVals())
+            V.analyzedReductionVals(VL);
+          continue;
+        }
+        V.reorderTopToBottom();
+        // No need to reorder the root node at all.
+        V.reorderBottomToTop(/*IgnoreReorder=*/true);
+        // Keep other extracted reduction values if they are used in the
+        // vectorization trees.
+        BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues(
+            ExternallyUsedValues);
+        for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
+          if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
+            continue;
+          for_each(ReducedVals[Cnt],
+                   [&LocalExternallyUsedValues, &TrackedVals](Value *V) {
+                     if (isa<Instruction>(V))
+                       LocalExternallyUsedValues[TrackedVals[V]];
+                   });
+        }
+        for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
+          if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
+            continue;
+          if (VectorizedVals.count(Candidates[Cnt]))
+            continue;
+          LocalExternallyUsedValues[Candidates[Cnt]];
+        }
+        V.buildExternalUses(LocalExternallyUsedValues);
+
+        V.computeMinimumValueSizes();
+
+        // Intersect the fast-math-flags from all reduction operations.
+        FastMathFlags RdxFMF;
+        RdxFMF.set();
+        for (Value *U : IgnoreList)
+          if (auto *FPMO = dyn_cast<FPMathOperator>(U))
+            RdxFMF &= FPMO->getFastMathFlags();
+        // Estimate cost.
+        InstructionCost TreeCost = V.getTreeCost(VL);
+        InstructionCost ReductionCost =
+            getReductionCost(TTI, VL[0], ReduxWidth, RdxFMF);
+        InstructionCost Cost = TreeCost + ReductionCost;
+        if (!Cost.isValid()) {
+          LLVM_DEBUG(dbgs() << "Encountered invalid baseline cost.\n");
+          return nullptr;
+        }
+        if (Cost >= -SLPCostThreshold) {
+          V.getORE()->emit([&]() {
+            return OptimizationRemarkMissed(
+                       SV_NAME, "HorSLPNotBeneficial",
+                       ReducedValsToOps.find(VL[0])->second.front())
+                   << "Vectorizing horizontal reduction is possible"
+                   << "but not beneficial with cost " << ore::NV("Cost", Cost)
+                   << " and threshold "
+                   << ore::NV("Threshold", -SLPCostThreshold);
+          });
+          if (!AdjustReducedVals())
+            V.analyzedReductionVals(VL);
+          continue;
+        }
 
-      // Estimate cost.
-      InstructionCost TreeCost =
-          V.getTreeCost(makeArrayRef(&ReducedVals[i], ReduxWidth));
-      InstructionCost ReductionCost =
-          getReductionCost(TTI, ReducedVals[i], ReduxWidth, RdxFMF);
-      InstructionCost Cost = TreeCost + ReductionCost;
-      if (!Cost.isValid()) {
-        LLVM_DEBUG(dbgs() << "Encountered invalid baseline cost.\n");
-        return nullptr;
-      }
-      if (Cost >= -SLPCostThreshold) {
+        LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
+                          << Cost << ". (HorRdx)\n");
         V.getORE()->emit([&]() {
-          return OptimizationRemarkMissed(SV_NAME, "HorSLPNotBeneficial",
-                                          cast<Instruction>(VL[0]))
-                 << "Vectorizing horizontal reduction is possible"
-                 << "but not beneficial with cost " << ore::NV("Cost", Cost)
-                 << " and threshold "
-                 << ore::NV("Threshold", -SLPCostThreshold);
+          return OptimizationRemark(
+                     SV_NAME, "VectorizedHorizontalReduction",
+                     ReducedValsToOps.find(VL[0])->second.front())
+                 << "Vectorized horizontal reduction with cost "
+                 << ore::NV("Cost", Cost) << " and with tree size "
+                 << ore::NV("TreeSize", V.getTreeSize());
         });
-        break;
-      }
 
-      LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
-                        << Cost << ". (HorRdx)\n");
-      V.getORE()->emit([&]() {
-        return OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction",
-                                  cast<Instruction>(VL[0]))
-               << "Vectorized horizontal reduction with cost "
-               << ore::NV("Cost", Cost) << " and with tree size "
-               << ore::NV("TreeSize", V.getTreeSize());
-      });
+        Builder.setFastMathFlags(RdxFMF);
 
-      // Vectorize a tree.
-      DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc();
-      Value *VectorizedRoot = V.vectorizeTree(ExternallyUsedValues);
+        // Vectorize a tree.
+        Value *VectorizedRoot = V.vectorizeTree(LocalExternallyUsedValues);
 
-      // Emit a reduction. If the root is a select (min/max idiom), the insert
-      // point is the compare condition of that select.
-      Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
-      if (isCmpSelMinMax(RdxRootInst))
-        Builder.SetInsertPoint(getCmpForMinMaxReduction(RdxRootInst));
-      else
-        Builder.SetInsertPoint(RdxRootInst);
+        // Emit a reduction. If the root is a select (min/max idiom), the insert
+        // point is the compare condition of that select.
+        Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
+        if (IsCmpSelMinMax)
+          Builder.SetInsertPoint(GetCmpForMinMaxReduction(RdxRootInst));
+        else
+          Builder.SetInsertPoint(RdxRootInst);
 
-      // To prevent poison from leaking across what used to be sequential, safe,
-      // scalar boolean logic operations, the reduction operand must be frozen.
-      if (isa<SelectInst>(RdxRootInst) && isBoolLogicOp(RdxRootInst))
-        VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);
+        // To prevent poison from leaking across what used to be sequential,
+        // safe, scalar boolean logic operations, the reduction operand must be
+        // frozen.
+        if (isa<SelectInst>(RdxRootInst) && isBoolLogicOp(RdxRootInst))
+          VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);
 
-      Value *ReducedSubTree =
-          emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI);
+        Value *ReducedSubTree =
+            emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI);
 
-      if (!VectorizedTree) {
-        // Initialize the final value in the reduction.
-        VectorizedTree = ReducedSubTree;
-      } else {
-        // Update the final value in the reduction.
-        Builder.SetCurrentDebugLocation(Loc);
-        VectorizedTree = createOp(Builder, RdxKind, VectorizedTree,
-                                  ReducedSubTree, "op.rdx", ReductionOps);
+        if (!VectorizedTree) {
+          // Initialize the final value in the reduction.
+          VectorizedTree = ReducedSubTree;
+        } else {
+          // Update the final value in the reduction.
+          Builder.SetCurrentDebugLocation(
+              cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
+          VectorizedTree = createOp(Builder, RdxKind, VectorizedTree,
+                                    ReducedSubTree, "op.rdx", ReductionOps);
+        }
+        // Count vectorized reduced values to exclude them from final reduction.
+        for (Value *V : VL)
+          ++VectorizedVals.try_emplace(TrackedToOrig.find(V)->second, 0)
+                .first->getSecond();
+        Pos += ReduxWidth;
+        Start = Pos;
+        ReduxWidth = PowerOf2Floor(NumReducedVals - Pos);
       }
-      i += ReduxWidth;
-      ReduxWidth = PowerOf2Floor(NumReducedVals - i);
     }
-
     if (VectorizedTree) {
       // Finish the reduction.
-      for (; i < NumReducedVals; ++i) {
-        auto *I = cast<Instruction>(ReducedVals[i]);
-        Builder.SetCurrentDebugLocation(I->getDebugLoc());
-        VectorizedTree =
-            createOp(Builder, RdxKind, VectorizedTree, I, "", ReductionOps);
+      // Need to add the extra arguments and the possibly reduced values that
+      // were not vectorized.
+      SmallPtrSet<Value *, 8> Visited;
+      for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
+        ArrayRef<Value *> Candidates = ReducedVals[I];
+        for (Value *RdxVal : Candidates) {
+          if (!Visited.insert(RdxVal).second)
+            continue;
+          Value *StableRdxVal = RdxVal;
+          auto TVIt = TrackedVals.find(RdxVal);
+          if (TVIt != TrackedVals.end())
+            StableRdxVal = TVIt->second;
+          unsigned NumOps = 0;
+          auto It = VectorizedVals.find(RdxVal);
+          if (It != VectorizedVals.end())
+            NumOps = It->second;
+          for (Instruction *RedOp :
+               makeArrayRef(ReducedValsToOps.find(RdxVal)->second)
+                   .drop_back(NumOps)) {
+            Builder.SetCurrentDebugLocation(RedOp->getDebugLoc());
+            ReductionOpsListType Ops;
+            if (auto *Sel = dyn_cast<SelectInst>(RedOp))
+              Ops.emplace_back().push_back(Sel->getCondition());
+            Ops.emplace_back().push_back(RedOp);
+            VectorizedTree = createOp(Builder, RdxKind, VectorizedTree,
+                                      StableRdxVal, "op.rdx", Ops);
+          }
+        }
       }
       for (auto &Pair : ExternallyUsedValues) {
         // Add each externally used value to the final reduction.
         for (auto *I : Pair.second) {
           Builder.SetCurrentDebugLocation(I->getDebugLoc());
+          ReductionOpsListType Ops;
+          if (auto *Sel = dyn_cast<SelectInst>(I))
+            Ops.emplace_back().push_back(Sel->getCondition());
+          Ops.emplace_back().push_back(I);
+          Value *StableRdxVal = Pair.first;
+          auto TVIt = TrackedVals.find(Pair.first);
+          if (TVIt != TrackedVals.end())
+            StableRdxVal = TVIt->second;
           VectorizedTree = createOp(Builder, RdxKind, VectorizedTree,
-                                    Pair.first, "op.extra", I);
+                                    StableRdxVal, "op.rdx", Ops);
         }
       }
 
@@ -9922,20 +10228,30 @@ class HorizontalReduction {
       // deletion.
 #ifndef NDEBUG
       SmallSet<Value *, 4> IgnoreSet;
-      IgnoreSet.insert(IgnoreList.begin(), IgnoreList.end());
+      for (ArrayRef<Value *> RdxOps : ReductionOps)
+        IgnoreSet.insert(RdxOps.begin(), RdxOps.end());
 #endif
-      for (auto *Ignore : IgnoreList) {
+      for (ArrayRef<Value *> RdxOps : ReductionOps) {
+        for (Value *Ignore : RdxOps) {
+          if (!Ignore)
+            continue;
 #ifndef NDEBUG
-        for (auto *U : Ignore->users()) {
-          assert(IgnoreSet.count(U));
-        }
+          for (auto *U : Ignore->users()) {
+            assert(IgnoreSet.count(U) &&
+                   "All users must be either in the reduction ops list.");
+          }
 #endif
-        if (!Ignore->use_empty()) {
-          Value *Undef = UndefValue::get(Ignore->getType());
-          Ignore->replaceAllUsesWith(Undef);
+          if (!Ignore->use_empty()) {
+            Value *Undef = UndefValue::get(Ignore->getType());
+            Ignore->replaceAllUsesWith(Undef);
+          }
+          V.eraseInstruction(cast<Instruction>(Ignore));
         }
-        V.eraseInstruction(cast<Instruction>(Ignore));
       }
+    } else if (!CheckForReusedReductionOps) {
+      for (ReductionOpsType &RdxOps : ReductionOps)
+        for (Value *RdxOp : RdxOps)
+          V.analyzedReductionRoot(cast<Instruction>(RdxOp));
     }
     return VectorizedTree;
   }
@@ -10201,7 +10517,8 @@ static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) {
 /// performed.
 static bool tryToVectorizeHorReductionOrInstOperands(
     PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
-    TargetTransformInfo *TTI,
+    TargetTransformInfo *TTI, ScalarEvolution &SE, const DataLayout &DL,
+    const TargetLibraryInfo &TLI,
     const function_ref<bool(Instruction *, BoUpSLP &)> Vectorize) {
   if (!ShouldVectorizeHor)
     return false;
@@ -10220,7 +10537,7 @@ static bool tryToVectorizeHorReductionOrInstOperands(
   // horizontal reduction.
   // Interrupt the process if the Root instruction itself was vectorized or all
   // sub-trees not higher that RecursionMaxDepth were analyzed/vectorized.
-  // Skip the analysis of CmpInsts.Compiler implements postanalysis of the
+  // Skip the analysis of CmpInsts. Compiler implements postanalysis of the
   // CmpInsts so we can skip extra attempts in
   // tryToVectorizeHorReductionOrInstOperands and save compile time.
   std::queue<std::pair<Instruction *, unsigned>> Stack;
@@ -10228,13 +10545,16 @@ static bool tryToVectorizeHorReductionOrInstOperands(
   SmallPtrSet<Value *, 8> VisitedInstrs;
   SmallVector<WeakTrackingVH> PostponedInsts;
   bool Res = false;
-  auto &&TryToReduce = [TTI, &P, &R](Instruction *Inst, Value *&B0,
-                                     Value *&B1) -> Value * {
+  auto &&TryToReduce = [TTI, &SE, &DL, &P, &R, &TLI](Instruction *Inst,
+                                                     Value *&B0,
+                                                     Value *&B1) -> Value * {
+    if (R.isAnalizedReductionRoot(Inst))
+      return nullptr;
     bool IsBinop = matchRdxBop(Inst, B0, B1);
     bool IsSelect = match(Inst, m_Select(m_Value(), m_Value(), m_Value()));
     if (IsBinop || IsSelect) {
       HorizontalReduction HorRdx;
-      if (HorRdx.matchAssociativeReduction(P, Inst))
+      if (HorRdx.matchAssociativeReduction(P, Inst, SE, DL, TLI))
         return HorRdx.tryToReduce(R, TTI);
     }
     return nullptr;
@@ -10279,7 +10599,7 @@ static bool tryToVectorizeHorReductionOrInstOperands(
       // Do not try to vectorize CmpInst operands, this is done separately.
       // Final attempt for binop args vectorization should happen after the loop
       // to try to find reductions.
-      if (!isa<CmpInst>(Inst))
+      if (!isa<CmpInst, InsertElementInst, InsertValueInst>(Inst))
         PostponedInsts.push_back(Inst);
     }
 
@@ -10292,8 +10612,8 @@ static bool tryToVectorizeHorReductionOrInstOperands(
           if (auto *I = dyn_cast<Instruction>(Op))
             // Do not try to vectorize CmpInst operands,  this is done
             // separately.
-            if (!isa<PHINode>(I) && !isa<CmpInst>(I) && !R.isDeleted(I) &&
-                I->getParent() == BB)
+            if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(I) &&
+                !R.isDeleted(I) && I->getParent() == BB)
               Stack.emplace(I, Level);
   }
   // Try to vectorized binops where reductions were not found.
@@ -10317,8 +10637,8 @@ bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V,
   auto &&ExtraVectorization = [this](Instruction *I, BoUpSLP &R) -> bool {
     return tryToVectorize(I, R);
   };
-  return tryToVectorizeHorReductionOrInstOperands(P, I, BB, R, TTI,
-                                                  ExtraVectorization);
+  return tryToVectorizeHorReductionOrInstOperands(P, I, BB, R, TTI, *SE, *DL,
+                                                  *TLI, ExtraVectorization);
 }
 
 bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
@@ -10486,12 +10806,16 @@ bool SLPVectorizerPass::vectorizeSimpleInstructions(
   for (auto *I : reverse(Instructions)) {
     if (R.isDeleted(I))
       continue;
-    if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I))
+    if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
       OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R);
-    else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I))
+    } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
       OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R);
-    else if (isa<CmpInst>(I))
+    } else if (isa<CmpInst>(I)) {
       PostponedCmps.push_back(I);
+      continue;
+    }
+    // Try to find reductions in buildvector sequences.
+    OpsChanged |= vectorizeRootInstruction(nullptr, I, BB, R, TTI);
   }
   if (AtTerminator) {
     // Try to find reductions first.

diff  --git a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-logical.ll b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-logical.ll
index 76e0f39429aad..00ac1265b0629 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-logical.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-logical.ll
@@ -90,23 +90,16 @@ return:
 define float @test_merge_anyof_v4sf(<4 x float> %t) {
 ; CHECK-LABEL: @test_merge_anyof_v4sf(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[T_FR:%.*]] = freeze <4 x float> [[T:%.*]]
-; CHECK-NEXT:    [[TMP0:%.*]] = fcmp olt <4 x float> [[T_FR]], zeroinitializer
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i1> [[TMP0]] to i4
-; CHECK-NEXT:    [[TMP2:%.*]] = icmp ne i4 [[TMP1]], 0
-; CHECK-NEXT:    [[TMP3:%.*]] = fcmp ogt <4 x float> [[T_FR]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i1> [[TMP3]], i64 0
-; CHECK-NEXT:    [[OR_COND3:%.*]] = or i1 [[TMP2]], [[TMP4]]
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i1> [[TMP3]], i64 1
-; CHECK-NEXT:    [[OR_COND4:%.*]] = or i1 [[OR_COND3]], [[TMP5]]
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i1> [[TMP3]], i64 2
-; CHECK-NEXT:    [[OR_COND5:%.*]] = or i1 [[OR_COND4]], [[TMP6]]
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i1> [[TMP3]], i64 3
-; CHECK-NEXT:    [[OR_COND6:%.*]] = or i1 [[OR_COND5]], [[TMP7]]
-; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[T_FR]], <4 x float> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP8:%.*]] = fadd <4 x float> [[SHIFT]], [[T_FR]]
-; CHECK-NEXT:    [[ADD:%.*]] = extractelement <4 x float> [[TMP8]], i64 0
-; CHECK-NEXT:    [[RETVAL_0:%.*]] = select i1 [[OR_COND6]], float 0.000000e+00, float [[ADD]]
+; CHECK-NEXT:    [[T_FR7:%.*]] = freeze <4 x float> [[T:%.*]]
+; CHECK-NEXT:    [[TMP0:%.*]] = fcmp olt <4 x float> [[T_FR7]], zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = fcmp ogt <4 x float> [[T_FR7]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; CHECK-NEXT:    [[TMP2:%.*]] = or <4 x i1> [[TMP1]], [[TMP0]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i1> [[TMP2]] to i4
+; CHECK-NEXT:    [[DOTNOT:%.*]] = icmp eq i4 [[TMP3]], 0
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[T_FR7]], <4 x float> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP4:%.*]] = fadd <4 x float> [[SHIFT]], [[T_FR7]]
+; CHECK-NEXT:    [[ADD:%.*]] = extractelement <4 x float> [[TMP4]], i64 0
+; CHECK-NEXT:    [[RETVAL_0:%.*]] = select i1 [[DOTNOT]], float [[ADD]], float 0.000000e+00
 ; CHECK-NEXT:    ret float [[RETVAL_0]]
 ;
 entry:
@@ -419,24 +412,16 @@ return:
 define float @test_merge_anyof_v4si(<4 x i32> %t) {
 ; CHECK-LABEL: @test_merge_anyof_v4si(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[T_FR:%.*]] = freeze <4 x i32> [[T:%.*]]
-; CHECK-NEXT:    [[TMP0:%.*]] = icmp slt <4 x i32> [[T_FR]], <i32 1, i32 1, i32 1, i32 1>
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i1> [[TMP0]] to i4
-; CHECK-NEXT:    [[TMP2:%.*]] = icmp ne i4 [[TMP1]], 0
-; CHECK-NEXT:    [[TMP3:%.*]] = icmp sgt <4 x i32> [[T_FR]], <i32 255, i32 255, i32 255, i32 255>
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i1> [[TMP3]], i64 0
-; CHECK-NEXT:    [[OR_COND3:%.*]] = or i1 [[TMP2]], [[TMP4]]
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i1> [[TMP3]], i64 1
-; CHECK-NEXT:    [[OR_COND4:%.*]] = or i1 [[OR_COND3]], [[TMP5]]
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i1> [[TMP3]], i64 2
-; CHECK-NEXT:    [[OR_COND5:%.*]] = or i1 [[OR_COND4]], [[TMP6]]
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i1> [[TMP3]], i64 3
-; CHECK-NEXT:    [[OR_COND6:%.*]] = or i1 [[OR_COND5]], [[TMP7]]
-; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x i32> [[T_FR]], <4 x i32> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP8:%.*]] = add nsw <4 x i32> [[SHIFT]], [[T_FR]]
-; CHECK-NEXT:    [[ADD:%.*]] = extractelement <4 x i32> [[TMP8]], i64 0
+; CHECK-NEXT:    [[T_FR7:%.*]] = freeze <4 x i32> [[T:%.*]]
+; CHECK-NEXT:    [[TMP0:%.*]] = add <4 x i32> [[T_FR7]], <i32 -256, i32 -256, i32 -256, i32 -256>
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult <4 x i32> [[TMP0]], <i32 -255, i32 -255, i32 -255, i32 -255>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i1> [[TMP1]] to i4
+; CHECK-NEXT:    [[DOTNOT:%.*]] = icmp eq i4 [[TMP2]], 0
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x i32> [[T_FR7]], <4 x i32> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <4 x i32> [[SHIFT]], [[T_FR7]]
+; CHECK-NEXT:    [[ADD:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
 ; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[ADD]] to float
-; CHECK-NEXT:    [[RETVAL_0:%.*]] = select i1 [[OR_COND6]], float 0.000000e+00, float [[CONV]]
+; CHECK-NEXT:    [[RETVAL_0:%.*]] = select i1 [[DOTNOT]], float [[CONV]], float 0.000000e+00
 ; CHECK-NEXT:    ret float [[RETVAL_0]]
 ;
 entry:

diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll
index 3dd757861d5a8..db4f06bb0dd76 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll
@@ -41,7 +41,7 @@ define i32 @ext_ext_partial_add_reduction_v4i32(<4 x i32> %x) {
 
 define i32 @ext_ext_partial_add_reduction_and_extra_add_v4i32(<4 x i32> %x, <4 x i32> %y) {
 ; CHECK-LABEL: @ext_ext_partial_add_reduction_and_extra_add_v4i32(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> <i32 4, i32 2, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[Y:%.*]], <4 x i32> [[X:%.*]], <4 x i32> <i32 0, i32 1, i32 2, i32 6>
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1]])
 ; CHECK-NEXT:    ret i32 [[TMP2]]
 ;

diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll
index 01d743fcbfe97..4142e3199f05b 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll
@@ -15,10 +15,10 @@ define void @PR28330(i32 %n) {
 ; DEFAULT-NEXT:    [[TMP1:%.*]] = icmp eq <8 x i8> [[TMP0]], zeroinitializer
 ; DEFAULT-NEXT:    br label [[FOR_BODY:%.*]]
 ; DEFAULT:       for.body:
-; DEFAULT-NEXT:    [[P17:%.*]] = phi i32 [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; DEFAULT-NEXT:    [[P17:%.*]] = phi i32 [ [[OP_RDX:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
 ; DEFAULT-NEXT:    [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i32> <i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720>, <8 x i32> <i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80>
 ; DEFAULT-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP2]])
-; DEFAULT-NEXT:    [[OP_EXTRA]] = add i32 [[TMP3]], [[P17]]
+; DEFAULT-NEXT:    [[OP_RDX]] = add i32 [[TMP3]], [[P17]]
 ; DEFAULT-NEXT:    br label [[FOR_BODY]]
 ;
 ; GATHER-LABEL: @PR28330(
@@ -27,10 +27,10 @@ define void @PR28330(i32 %n) {
 ; GATHER-NEXT:    [[TMP1:%.*]] = icmp eq <8 x i8> [[TMP0]], zeroinitializer
 ; GATHER-NEXT:    br label [[FOR_BODY:%.*]]
 ; GATHER:       for.body:
-; GATHER-NEXT:    [[P17:%.*]] = phi i32 [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; GATHER-NEXT:    [[P17:%.*]] = phi i32 [ [[OP_RDX:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
 ; GATHER-NEXT:    [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i32> <i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720>, <8 x i32> <i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80>
 ; GATHER-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP2]])
-; GATHER-NEXT:    [[OP_EXTRA]] = add i32 [[TMP3]], [[P17]]
+; GATHER-NEXT:    [[OP_RDX]] = add i32 [[TMP3]], [[P17]]
 ; GATHER-NEXT:    br label [[FOR_BODY]]
 ;
 ; MAX-COST-LABEL: @PR28330(
@@ -39,10 +39,10 @@ define void @PR28330(i32 %n) {
 ; MAX-COST-NEXT:    [[TMP1:%.*]] = icmp eq <8 x i8> [[TMP0]], zeroinitializer
 ; MAX-COST-NEXT:    br label [[FOR_BODY:%.*]]
 ; MAX-COST:       for.body:
-; MAX-COST-NEXT:    [[P17:%.*]] = phi i32 [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; MAX-COST-NEXT:    [[P17:%.*]] = phi i32 [ [[OP_RDX:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
 ; MAX-COST-NEXT:    [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i32> <i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720>, <8 x i32> <i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80>
 ; MAX-COST-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP2]])
-; MAX-COST-NEXT:    [[OP_EXTRA]] = add i32 [[TMP3]], [[P17]]
+; MAX-COST-NEXT:    [[OP_RDX]] = add i32 [[TMP3]], [[P17]]
 ; MAX-COST-NEXT:    br label [[FOR_BODY]]
 ;
 entry:
@@ -92,10 +92,10 @@ define void @PR32038(i32 %n) {
 ; DEFAULT-NEXT:    [[TMP1:%.*]] = icmp eq <8 x i8> [[TMP0]], zeroinitializer
 ; DEFAULT-NEXT:    br label [[FOR_BODY:%.*]]
 ; DEFAULT:       for.body:
-; DEFAULT-NEXT:    [[P17:%.*]] = phi i32 [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; DEFAULT-NEXT:    [[P17:%.*]] = phi i32 [ [[OP_RDX:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
 ; DEFAULT-NEXT:    [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i32> <i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720>, <8 x i32> <i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80>
 ; DEFAULT-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP2]])
-; DEFAULT-NEXT:    [[OP_EXTRA]] = add i32 [[TMP3]], -5
+; DEFAULT-NEXT:    [[OP_RDX]] = add i32 [[TMP3]], -5
 ; DEFAULT-NEXT:    br label [[FOR_BODY]]
 ;
 ; GATHER-LABEL: @PR32038(
@@ -104,10 +104,10 @@ define void @PR32038(i32 %n) {
 ; GATHER-NEXT:    [[TMP1:%.*]] = icmp eq <8 x i8> [[TMP0]], zeroinitializer
 ; GATHER-NEXT:    br label [[FOR_BODY:%.*]]
 ; GATHER:       for.body:
-; GATHER-NEXT:    [[P17:%.*]] = phi i32 [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; GATHER-NEXT:    [[P17:%.*]] = phi i32 [ [[OP_RDX:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
 ; GATHER-NEXT:    [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i32> <i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720>, <8 x i32> <i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80>
 ; GATHER-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP2]])
-; GATHER-NEXT:    [[OP_EXTRA]] = add i32 [[TMP3]], -5
+; GATHER-NEXT:    [[OP_RDX]] = add i32 [[TMP3]], -5
 ; GATHER-NEXT:    br label [[FOR_BODY]]
 ;
 ; MAX-COST-LABEL: @PR32038(
@@ -116,10 +116,10 @@ define void @PR32038(i32 %n) {
 ; MAX-COST-NEXT:    [[TMP1:%.*]] = icmp eq <8 x i8> [[TMP0]], zeroinitializer
 ; MAX-COST-NEXT:    br label [[FOR_BODY:%.*]]
 ; MAX-COST:       for.body:
-; MAX-COST-NEXT:    [[P17:%.*]] = phi i32 [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; MAX-COST-NEXT:    [[P17:%.*]] = phi i32 [ [[OP_RDX:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
 ; MAX-COST-NEXT:    [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i32> <i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720>, <8 x i32> <i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80>
 ; MAX-COST-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP2]])
-; MAX-COST-NEXT:    [[OP_EXTRA]] = add i32 [[TMP3]], -5
+; MAX-COST-NEXT:    [[OP_RDX]] = add i32 [[TMP3]], -5
 ; MAX-COST-NEXT:    br label [[FOR_BODY]]
 ;
 entry:

diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll
index 24c69fb667272..12a2cd2a6834f 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll
@@ -204,10 +204,10 @@ define <4 x i32> @build_vec_v4i32_3_binops(<2 x i32> %v0, <2 x i32> %v1) {
 
 define i32 @reduction_v4i32(<4 x i32> %v0, <4 x i32> %v1) {
 ; CHECK-LABEL: @reduction_v4i32(
-; CHECK-NEXT:    [[TMP1:%.*]] = sub <4 x i32> [[V0:%.*]], [[V1:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = add <4 x i32> [[V0]], [[V1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 7, i32 2>
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 1, i32 4, i32 6, i32 3>
+; CHECK-NEXT:    [[TMP1:%.*]] = add <4 x i32> [[V0:%.*]], [[V1:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = sub <4 x i32> [[V0]], [[V1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 3, i32 6>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 1, i32 4, i32 2, i32 7>
 ; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP4]], [[TMP3]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = lshr <4 x i32> [[TMP5]], <i32 15, i32 15, i32 15, i32 15>
 ; CHECK-NEXT:    [[TMP7:%.*]] = and <4 x i32> [[TMP6]], <i32 65537, i32 65537, i32 65537, i32 65537>

diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll
index 962f8ca42bf86..03a29f48a739e 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll
@@ -204,10 +204,10 @@ define <4 x i32> @build_vec_v4i32_3_binops(<2 x i32> %v0, <2 x i32> %v1) {
 
 define i32 @reduction_v4i32(<4 x i32> %v0, <4 x i32> %v1) {
 ; CHECK-LABEL: @reduction_v4i32(
-; CHECK-NEXT:    [[TMP1:%.*]] = sub <4 x i32> [[V0:%.*]], [[V1:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = add <4 x i32> [[V0]], [[V1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 7, i32 2>
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 1, i32 4, i32 6, i32 3>
+; CHECK-NEXT:    [[TMP1:%.*]] = add <4 x i32> [[V0:%.*]], [[V1:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = sub <4 x i32> [[V0]], [[V1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 3, i32 6>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 1, i32 4, i32 2, i32 7>
 ; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP4]], [[TMP3]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = lshr <4 x i32> [[TMP5]], <i32 15, i32 15, i32 15, i32 15>
 ; CHECK-NEXT:    [[TMP7:%.*]] = and <4 x i32> [[TMP6]], <i32 65537, i32 65537, i32 65537, i32 65537>

diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR35628_1.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR35628_1.ll
index 6817269f394dc..0d20a0e01d116 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/PR35628_1.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/PR35628_1.ll
@@ -8,7 +8,7 @@ define void @mainTest(i32* %ptr) #0  {
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32* [[PTR:%.*]], null
 ; CHECK-NEXT:    br i1 [[CMP]], label [[LOOP:%.*]], label [[BAIL_OUT:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[DUMMY_PHI:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ [[OP_EXTRA3:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[DUMMY_PHI:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ [[OP_RDX3:%.*]], [[LOOP]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[PTR]] to <4 x i32>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
@@ -17,10 +17,10 @@ define void @mainTest(i32* %ptr) #0  {
 ; CHECK-NEXT:    [[TMP5:%.*]] = mul <4 x i32> [[TMP1]], [[TMP1]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = sext i32 [[TMP3]] to i64
 ; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP5]])
-; CHECK-NEXT:    [[OP_EXTRA:%.*]] = add i32 [[TMP7]], 1
-; CHECK-NEXT:    [[OP_EXTRA1:%.*]] = add i32 [[OP_EXTRA]], [[TMP4]]
-; CHECK-NEXT:    [[OP_EXTRA2:%.*]] = add i32 [[OP_EXTRA1]], [[TMP3]]
-; CHECK-NEXT:    [[OP_EXTRA3]] = add i32 [[OP_EXTRA2]], [[TMP2]]
+; CHECK-NEXT:    [[OP_RDX:%.*]] = add i32 [[TMP7]], [[TMP4]]
+; CHECK-NEXT:    [[OP_RDX1:%.*]] = add i32 [[OP_RDX]], [[TMP3]]
+; CHECK-NEXT:    [[OP_RDX2:%.*]] = add i32 [[OP_RDX1]], [[TMP2]]
+; CHECK-NEXT:    [[OP_RDX3]] = add i32 [[OP_RDX2]], 1
 ; CHECK-NEXT:    br label [[LOOP]]
 ; CHECK:       bail_out:
 ; CHECK-NEXT:    ret void

diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR35628_2.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR35628_2.ll
index d15494e092c25..54559149c34bf 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/PR35628_2.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/PR35628_2.ll
@@ -7,20 +7,20 @@ define void @test() #0 {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[DUMMY_PHI:%.*]] = phi i64 [ 1, [[ENTRY:%.*]] ], [ [[OP_EXTRA1:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[DUMMY_PHI:%.*]] = phi i64 [ 1, [[ENTRY:%.*]] ], [ [[OP_RDX1:%.*]], [[LOOP]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = phi i64 [ 2, [[ENTRY]] ], [ [[TMP3:%.*]], [[LOOP]] ]
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i64> poison, i64 [[TMP0]], i32 0
 ; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = add <4 x i64> [[SHUFFLE]], <i64 3, i64 2, i64 1, i64 0>
+; CHECK-NEXT:    [[TMP2:%.*]] = add <4 x i64> [[SHUFFLE]], <i64 2, i64 3, i64 1, i64 0>
 ; CHECK-NEXT:    [[TMP3]] = extractelement <4 x i64> [[TMP2]], i32 3
 ; CHECK-NEXT:    [[DUMMY_ADD:%.*]] = add i16 0, 0
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP2]], i32 1
 ; CHECK-NEXT:    [[DUMMY_SHL:%.*]] = shl i64 [[TMP4]], 32
 ; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i64> <i64 1, i64 1, i64 1, i64 1>, [[TMP2]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = ashr exact <4 x i64> [[TMP5]], <i64 32, i64 32, i64 32, i64 32>
 ; CHECK-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP6]])
-; CHECK-NEXT:    [[OP_EXTRA:%.*]] = add i64 [[TMP7]], 0
-; CHECK-NEXT:    [[OP_EXTRA1]] = add i64 [[OP_EXTRA]], [[TMP3]]
+; CHECK-NEXT:    [[OP_RDX:%.*]] = add i64 [[TMP7]], [[TMP3]]
+; CHECK-NEXT:    [[OP_RDX1]] = add i64 [[OP_RDX]], 0
 ; CHECK-NEXT:    br label [[LOOP]]
 ;
 entry:

diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll
index 44f1fb1dda352..318d45203efba 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll
@@ -5,98 +5,98 @@
 define void @Test(i32) {
 ; CHECK-LABEL: @Test(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[TMP0:%.*]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[TMP0]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[TMP0]], i32 2
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[TMP0]], i32 3
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[TMP0]], i32 4
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[TMP0]], i32 5
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[TMP0]], i32 6
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[TMP0]], i32 7
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <16 x i32> poison, i32 [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <16 x i32> [[TMP9]], i32 [[TMP0]], i32 1
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <16 x i32> [[TMP10]], i32 [[TMP0]], i32 2
+; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <16 x i32> [[TMP11]], i32 [[TMP0]], i32 3
+; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <16 x i32> [[TMP12]], i32 [[TMP0]], i32 4
+; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <16 x i32> [[TMP13]], i32 [[TMP0]], i32 5
+; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <16 x i32> [[TMP14]], i32 [[TMP0]], i32 6
+; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <16 x i32> [[TMP15]], i32 [[TMP0]], i32 7
+; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <16 x i32> [[TMP16]], i32 [[TMP0]], i32 8
+; CHECK-NEXT:    [[TMP18:%.*]] = insertelement <16 x i32> [[TMP17]], i32 [[TMP0]], i32 9
+; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <16 x i32> [[TMP18]], i32 [[TMP0]], i32 10
+; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <16 x i32> [[TMP19]], i32 [[TMP0]], i32 11
+; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <16 x i32> [[TMP20]], i32 [[TMP0]], i32 12
+; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <16 x i32> [[TMP21]], i32 [[TMP0]], i32 13
+; CHECK-NEXT:    [[TMP23:%.*]] = insertelement <16 x i32> [[TMP22]], i32 [[TMP0]], i32 14
+; CHECK-NEXT:    [[TMP24:%.*]] = insertelement <16 x i32> [[TMP23]], i32 [[TMP0]], i32 15
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[TMP1:%.*]] = phi <2 x i32> [ [[TMP10:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x i32> [[SHUFFLE]], i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[SHUFFLE]], <i32 0, i32 55, i32 285, i32 1240, i32 1496, i32 8555, i32 12529, i32 13685>
-; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP3]])
-; CHECK-NEXT:    [[OP_EXTRA:%.*]] = and i32 [[TMP4]], [[TMP0:%.*]]
-; CHECK-NEXT:    [[OP_EXTRA1:%.*]] = and i32 [[OP_EXTRA]], [[TMP0]]
-; CHECK-NEXT:    [[OP_EXTRA2:%.*]] = and i32 [[OP_EXTRA1]], [[TMP0]]
-; CHECK-NEXT:    [[OP_EXTRA3:%.*]] = and i32 [[OP_EXTRA2]], [[TMP0]]
-; CHECK-NEXT:    [[OP_EXTRA4:%.*]] = and i32 [[OP_EXTRA3]], [[TMP0]]
-; CHECK-NEXT:    [[OP_EXTRA5:%.*]] = and i32 [[OP_EXTRA4]], [[TMP0]]
-; CHECK-NEXT:    [[OP_EXTRA6:%.*]] = and i32 [[OP_EXTRA5]], [[TMP0]]
-; CHECK-NEXT:    [[OP_EXTRA7:%.*]] = and i32 [[OP_EXTRA6]], [[TMP0]]
-; CHECK-NEXT:    [[OP_EXTRA8:%.*]] = and i32 [[OP_EXTRA7]], [[TMP0]]
-; CHECK-NEXT:    [[OP_EXTRA9:%.*]] = and i32 [[OP_EXTRA8]], [[TMP0]]
-; CHECK-NEXT:    [[OP_EXTRA10:%.*]] = and i32 [[OP_EXTRA9]], [[TMP0]]
-; CHECK-NEXT:    [[OP_EXTRA11:%.*]] = and i32 [[OP_EXTRA10]], [[TMP0]]
-; CHECK-NEXT:    [[OP_EXTRA12:%.*]] = and i32 [[OP_EXTRA11]], [[TMP0]]
-; CHECK-NEXT:    [[OP_EXTRA13:%.*]] = and i32 [[OP_EXTRA12]], [[TMP0]]
-; CHECK-NEXT:    [[OP_EXTRA14:%.*]] = and i32 [[OP_EXTRA13]], [[TMP0]]
-; CHECK-NEXT:    [[OP_EXTRA15:%.*]] = and i32 [[OP_EXTRA14]], [[TMP0]]
-; CHECK-NEXT:    [[OP_EXTRA16:%.*]] = and i32 [[OP_EXTRA15]], [[TMP0]]
-; CHECK-NEXT:    [[OP_EXTRA17:%.*]] = and i32 [[OP_EXTRA16]], [[TMP0]]
-; CHECK-NEXT:    [[OP_EXTRA18:%.*]] = and i32 [[OP_EXTRA17]], [[TMP0]]
-; CHECK-NEXT:    [[OP_EXTRA19:%.*]] = and i32 [[OP_EXTRA18]], [[TMP0]]
-; CHECK-NEXT:    [[OP_EXTRA20:%.*]] = and i32 [[OP_EXTRA19]], [[TMP0]]
-; CHECK-NEXT:    [[OP_EXTRA21:%.*]] = and i32 [[OP_EXTRA20]], [[TMP0]]
-; CHECK-NEXT:    [[OP_EXTRA22:%.*]] = and i32 [[OP_EXTRA21]], [[TMP0]]
-; CHECK-NEXT:    [[OP_EXTRA23:%.*]] = and i32 [[OP_EXTRA22]], [[TMP0]]
-; CHECK-NEXT:    [[OP_EXTRA24:%.*]] = and i32 [[OP_EXTRA23]], [[TMP0]]
-; CHECK-NEXT:    [[OP_EXTRA25:%.*]] = and i32 [[OP_EXTRA24]], [[TMP0]]
-; CHECK-NEXT:    [[OP_EXTRA26:%.*]] = and i32 [[OP_EXTRA25]], [[TMP0]]
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> <i32 poison, i32 14910>, i32 [[OP_EXTRA26]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[TMP2]], i32 1
-; CHECK-NEXT:    [[TMP8:%.*]] = and <2 x i32> [[TMP5]], [[TMP7]]
-; CHECK-NEXT:    [[TMP9:%.*]] = add <2 x i32> [[TMP5]], [[TMP7]]
-; CHECK-NEXT:    [[TMP10]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP25:%.*]] = phi <2 x i32> [ [[TMP36:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP25]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <8 x i32> [[SHUFFLE]], i32 1
+; CHECK-NEXT:    [[TMP27:%.*]] = add <8 x i32> [[SHUFFLE]], <i32 0, i32 55, i32 285, i32 1240, i32 1496, i32 8555, i32 12529, i32 13685>
+; CHECK-NEXT:    [[TMP28:%.*]] = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[TMP24]])
+; CHECK-NEXT:    [[TMP29:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP8]])
+; CHECK-NEXT:    [[OP_RDX:%.*]] = and i32 [[TMP28]], [[TMP29]]
+; CHECK-NEXT:    [[TMP30:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP27]])
+; CHECK-NEXT:    [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[TMP30]]
+; CHECK-NEXT:    [[OP_RDX2:%.*]] = and i32 [[OP_RDX1]], [[TMP0]]
+; CHECK-NEXT:    [[OP_RDX3:%.*]] = and i32 [[OP_RDX2]], [[TMP0]]
+; CHECK-NEXT:    [[OP_RDX4:%.*]] = and i32 [[OP_RDX3]], [[TMP0]]
+; CHECK-NEXT:    [[TMP31:%.*]] = insertelement <2 x i32> <i32 poison, i32 14910>, i32 [[OP_RDX4]], i32 0
+; CHECK-NEXT:    [[TMP32:%.*]] = insertelement <2 x i32> poison, i32 [[TMP26]], i32 0
+; CHECK-NEXT:    [[TMP33:%.*]] = insertelement <2 x i32> [[TMP32]], i32 [[TMP26]], i32 1
+; CHECK-NEXT:    [[TMP34:%.*]] = and <2 x i32> [[TMP31]], [[TMP33]]
+; CHECK-NEXT:    [[TMP35:%.*]] = add <2 x i32> [[TMP31]], [[TMP33]]
+; CHECK-NEXT:    [[TMP36]] = shufflevector <2 x i32> [[TMP34]], <2 x i32> [[TMP35]], <2 x i32> <i32 0, i32 3>
 ; CHECK-NEXT:    br label [[LOOP]]
 ;
 ; FORCE_REDUCTION-LABEL: @Test(
 ; FORCE_REDUCTION-NEXT:  entry:
+; FORCE_REDUCTION-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[TMP0:%.*]], i32 0
+; FORCE_REDUCTION-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[TMP0]], i32 1
+; FORCE_REDUCTION-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[TMP0]], i32 2
+; FORCE_REDUCTION-NEXT:    [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[TMP0]], i32 3
+; FORCE_REDUCTION-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[TMP0]], i32 4
+; FORCE_REDUCTION-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[TMP0]], i32 5
+; FORCE_REDUCTION-NEXT:    [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[TMP0]], i32 6
+; FORCE_REDUCTION-NEXT:    [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[TMP0]], i32 7
+; FORCE_REDUCTION-NEXT:    [[TMP9:%.*]] = insertelement <16 x i32> poison, i32 [[TMP0]], i32 0
+; FORCE_REDUCTION-NEXT:    [[TMP10:%.*]] = insertelement <16 x i32> [[TMP9]], i32 [[TMP0]], i32 1
+; FORCE_REDUCTION-NEXT:    [[TMP11:%.*]] = insertelement <16 x i32> [[TMP10]], i32 [[TMP0]], i32 2
+; FORCE_REDUCTION-NEXT:    [[TMP12:%.*]] = insertelement <16 x i32> [[TMP11]], i32 [[TMP0]], i32 3
+; FORCE_REDUCTION-NEXT:    [[TMP13:%.*]] = insertelement <16 x i32> [[TMP12]], i32 [[TMP0]], i32 4
+; FORCE_REDUCTION-NEXT:    [[TMP14:%.*]] = insertelement <16 x i32> [[TMP13]], i32 [[TMP0]], i32 5
+; FORCE_REDUCTION-NEXT:    [[TMP15:%.*]] = insertelement <16 x i32> [[TMP14]], i32 [[TMP0]], i32 6
+; FORCE_REDUCTION-NEXT:    [[TMP16:%.*]] = insertelement <16 x i32> [[TMP15]], i32 [[TMP0]], i32 7
+; FORCE_REDUCTION-NEXT:    [[TMP17:%.*]] = insertelement <16 x i32> [[TMP16]], i32 [[TMP0]], i32 8
+; FORCE_REDUCTION-NEXT:    [[TMP18:%.*]] = insertelement <16 x i32> [[TMP17]], i32 [[TMP0]], i32 9
+; FORCE_REDUCTION-NEXT:    [[TMP19:%.*]] = insertelement <16 x i32> [[TMP18]], i32 [[TMP0]], i32 10
+; FORCE_REDUCTION-NEXT:    [[TMP20:%.*]] = insertelement <16 x i32> [[TMP19]], i32 [[TMP0]], i32 11
+; FORCE_REDUCTION-NEXT:    [[TMP21:%.*]] = insertelement <16 x i32> [[TMP20]], i32 [[TMP0]], i32 12
+; FORCE_REDUCTION-NEXT:    [[TMP22:%.*]] = insertelement <16 x i32> [[TMP21]], i32 [[TMP0]], i32 13
+; FORCE_REDUCTION-NEXT:    [[TMP23:%.*]] = insertelement <16 x i32> [[TMP22]], i32 [[TMP0]], i32 14
+; FORCE_REDUCTION-NEXT:    [[TMP24:%.*]] = insertelement <16 x i32> [[TMP23]], i32 [[TMP0]], i32 15
 ; FORCE_REDUCTION-NEXT:    br label [[LOOP:%.*]]
 ; FORCE_REDUCTION:       loop:
-; FORCE_REDUCTION-NEXT:    [[TMP1:%.*]] = phi <2 x i32> [ [[TMP12:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ]
-; FORCE_REDUCTION-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 1>
-; FORCE_REDUCTION-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[SHUFFLE]], i32 1
-; FORCE_REDUCTION-NEXT:    [[TMP3:%.*]] = add <4 x i32> [[SHUFFLE]], <i32 0, i32 55, i32 285, i32 1240>
-; FORCE_REDUCTION-NEXT:    [[VAL_20:%.*]] = add i32 [[TMP2]], 1496
-; FORCE_REDUCTION-NEXT:    [[VAL_34:%.*]] = add i32 [[TMP2]], 8555
-; FORCE_REDUCTION-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[TMP3]])
-; FORCE_REDUCTION-NEXT:    [[TMP5:%.*]] = and i32 [[TMP4]], [[VAL_20]]
-; FORCE_REDUCTION-NEXT:    [[TMP6:%.*]] = and i32 [[TMP5]], [[VAL_34]]
-; FORCE_REDUCTION-NEXT:    [[OP_EXTRA:%.*]] = and i32 [[TMP6]], [[TMP0:%.*]]
-; FORCE_REDUCTION-NEXT:    [[OP_EXTRA1:%.*]] = and i32 [[OP_EXTRA]], [[TMP0]]
-; FORCE_REDUCTION-NEXT:    [[OP_EXTRA2:%.*]] = and i32 [[OP_EXTRA1]], [[TMP0]]
-; FORCE_REDUCTION-NEXT:    [[OP_EXTRA3:%.*]] = and i32 [[OP_EXTRA2]], [[TMP0]]
-; FORCE_REDUCTION-NEXT:    [[OP_EXTRA4:%.*]] = and i32 [[OP_EXTRA3]], [[TMP0]]
-; FORCE_REDUCTION-NEXT:    [[OP_EXTRA5:%.*]] = and i32 [[OP_EXTRA4]], [[TMP0]]
-; FORCE_REDUCTION-NEXT:    [[OP_EXTRA6:%.*]] = and i32 [[OP_EXTRA5]], [[TMP0]]
-; FORCE_REDUCTION-NEXT:    [[OP_EXTRA7:%.*]] = and i32 [[OP_EXTRA6]], [[TMP0]]
-; FORCE_REDUCTION-NEXT:    [[OP_EXTRA8:%.*]] = and i32 [[OP_EXTRA7]], [[TMP0]]
-; FORCE_REDUCTION-NEXT:    [[OP_EXTRA9:%.*]] = and i32 [[OP_EXTRA8]], [[TMP0]]
-; FORCE_REDUCTION-NEXT:    [[OP_EXTRA10:%.*]] = and i32 [[OP_EXTRA9]], [[TMP0]]
-; FORCE_REDUCTION-NEXT:    [[OP_EXTRA11:%.*]] = and i32 [[OP_EXTRA10]], [[TMP0]]
-; FORCE_REDUCTION-NEXT:    [[OP_EXTRA12:%.*]] = and i32 [[OP_EXTRA11]], [[TMP0]]
-; FORCE_REDUCTION-NEXT:    [[OP_EXTRA13:%.*]] = and i32 [[OP_EXTRA12]], [[TMP0]]
-; FORCE_REDUCTION-NEXT:    [[OP_EXTRA14:%.*]] = and i32 [[OP_EXTRA13]], [[TMP0]]
-; FORCE_REDUCTION-NEXT:    [[OP_EXTRA15:%.*]] = and i32 [[OP_EXTRA14]], [[TMP0]]
-; FORCE_REDUCTION-NEXT:    [[OP_EXTRA16:%.*]] = and i32 [[OP_EXTRA15]], [[TMP0]]
-; FORCE_REDUCTION-NEXT:    [[OP_EXTRA17:%.*]] = and i32 [[OP_EXTRA16]], [[TMP0]]
-; FORCE_REDUCTION-NEXT:    [[OP_EXTRA18:%.*]] = and i32 [[OP_EXTRA17]], [[TMP0]]
-; FORCE_REDUCTION-NEXT:    [[OP_EXTRA19:%.*]] = and i32 [[OP_EXTRA18]], [[TMP0]]
-; FORCE_REDUCTION-NEXT:    [[OP_EXTRA20:%.*]] = and i32 [[OP_EXTRA19]], [[TMP0]]
-; FORCE_REDUCTION-NEXT:    [[OP_EXTRA21:%.*]] = and i32 [[OP_EXTRA20]], [[TMP0]]
-; FORCE_REDUCTION-NEXT:    [[OP_EXTRA22:%.*]] = and i32 [[OP_EXTRA21]], [[TMP0]]
-; FORCE_REDUCTION-NEXT:    [[OP_EXTRA23:%.*]] = and i32 [[OP_EXTRA22]], [[TMP0]]
-; FORCE_REDUCTION-NEXT:    [[OP_EXTRA24:%.*]] = and i32 [[OP_EXTRA23]], [[TMP0]]
-; FORCE_REDUCTION-NEXT:    [[OP_EXTRA25:%.*]] = and i32 [[OP_EXTRA24]], [[TMP0]]
-; FORCE_REDUCTION-NEXT:    [[OP_EXTRA26:%.*]] = and i32 [[OP_EXTRA25]], [[TMP0]]
-; FORCE_REDUCTION-NEXT:    [[OP_EXTRA27:%.*]] = and i32 [[OP_EXTRA26]], [[TMP2]]
-; FORCE_REDUCTION-NEXT:    [[VAL_39:%.*]] = add i32 [[TMP2]], 12529
-; FORCE_REDUCTION-NEXT:    [[VAL_40:%.*]] = and i32 [[OP_EXTRA27]], [[VAL_39]]
-; FORCE_REDUCTION-NEXT:    [[VAL_41:%.*]] = add i32 [[TMP2]], 13685
-; FORCE_REDUCTION-NEXT:    [[TMP7:%.*]] = insertelement <2 x i32> poison, i32 [[VAL_40]], i32 0
-; FORCE_REDUCTION-NEXT:    [[TMP8:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP2]], i32 1
-; FORCE_REDUCTION-NEXT:    [[TMP9:%.*]] = insertelement <2 x i32> <i32 poison, i32 14910>, i32 [[VAL_41]], i32 0
-; FORCE_REDUCTION-NEXT:    [[TMP10:%.*]] = and <2 x i32> [[TMP8]], [[TMP9]]
-; FORCE_REDUCTION-NEXT:    [[TMP11:%.*]] = add <2 x i32> [[TMP8]], [[TMP9]]
-; FORCE_REDUCTION-NEXT:    [[TMP12]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <2 x i32> <i32 0, i32 3>
+; FORCE_REDUCTION-NEXT:    [[TMP25:%.*]] = phi <2 x i32> [ [[TMP36:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ]
+; FORCE_REDUCTION-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP25]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; FORCE_REDUCTION-NEXT:    [[TMP26:%.*]] = add <8 x i32> [[SHUFFLE]], <i32 0, i32 55, i32 285, i32 1240, i32 1496, i32 8555, i32 12529, i32 13685>
+; FORCE_REDUCTION-NEXT:    [[TMP27:%.*]] = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[TMP24]])
+; FORCE_REDUCTION-NEXT:    [[TMP28:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP8]])
+; FORCE_REDUCTION-NEXT:    [[TMP29:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP26]])
+; FORCE_REDUCTION-NEXT:    [[OP_RDX13:%.*]] = and i32 [[TMP29]], [[TMP0]]
+; FORCE_REDUCTION-NEXT:    [[OP_RDX14:%.*]] = and i32 [[OP_RDX13]], [[TMP0]]
+; FORCE_REDUCTION-NEXT:    [[OP_RDX15:%.*]] = and i32 [[OP_RDX14]], [[TMP0]]
+; FORCE_REDUCTION-NEXT:    [[OP_RDX16:%.*]] = and i32 [[OP_RDX15]], [[TMP27]]
+; FORCE_REDUCTION-NEXT:    [[OP_RDX17:%.*]] = and i32 [[OP_RDX16]], [[TMP28]]
+; FORCE_REDUCTION-NEXT:    [[TMP30:%.*]] = insertelement <2 x i32> <i32 poison, i32 14910>, i32 [[OP_RDX17]], i32 0
+; FORCE_REDUCTION-NEXT:    [[TMP31:%.*]] = extractelement <8 x i32> [[SHUFFLE]], i32 1
+; FORCE_REDUCTION-NEXT:    [[TMP32:%.*]] = insertelement <2 x i32> poison, i32 [[TMP31]], i32 0
+; FORCE_REDUCTION-NEXT:    [[TMP33:%.*]] = insertelement <2 x i32> [[TMP32]], i32 [[TMP31]], i32 1
+; FORCE_REDUCTION-NEXT:    [[TMP34:%.*]] = and <2 x i32> [[TMP30]], [[TMP33]]
+; FORCE_REDUCTION-NEXT:    [[TMP35:%.*]] = add <2 x i32> [[TMP30]], [[TMP33]]
+; FORCE_REDUCTION-NEXT:    [[TMP36]] = shufflevector <2 x i32> [[TMP34]], <2 x i32> [[TMP35]], <2 x i32> <i32 0, i32 3>
 ; FORCE_REDUCTION-NEXT:    br label [[LOOP]]
 ;
 entry:

diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll
index f878bda14ad84..ba0585f11bade 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll
@@ -4,20 +4,20 @@
 define void @mainTest(i32 %param, i32 * %vals, i32 %len) {
 ; CHECK-LABEL: @mainTest(
 ; CHECK-NEXT:  bci_15.preheader:
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i32> <i32 31, i32 poison>, i32 [[PARAM:%.*]], i32 1
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i32> <i32 poison, i32 31>, i32 [[PARAM:%.*]], i32 0
 ; CHECK-NEXT:    br label [[BCI_15:%.*]]
 ; CHECK:       bci_15:
 ; CHECK-NEXT:    [[TMP1:%.*]] = phi <2 x i32> [ [[TMP7:%.*]], [[BCI_15]] ], [ [[TMP0]], [[BCI_15_PREHEADER:%.*]] ]
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1>
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <16 x i32> [[SHUFFLE]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = add <16 x i32> [[SHUFFLE]], <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 -1>
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <16 x i32> [[SHUFFLE]], i32 15
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <16 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <16 x i32> [[SHUFFLE]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = add <16 x i32> [[SHUFFLE]], <i32 -1, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <16 x i32> [[SHUFFLE]], i32 0
 ; CHECK-NEXT:    store atomic i32 [[TMP4]], i32* [[VALS:%.*]] unordered, align 4
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[TMP3]])
-; CHECK-NEXT:    [[OP_EXTRA:%.*]] = and i32 [[TMP5]], [[TMP2]]
+; CHECK-NEXT:    [[OP_RDX:%.*]] = and i32 [[TMP5]], [[TMP2]]
 ; CHECK-NEXT:    [[V44:%.*]] = add i32 [[TMP2]], 16
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[V44]], i32 0
-; CHECK-NEXT:    [[TMP7]] = insertelement <2 x i32> [[TMP6]], i32 [[OP_EXTRA]], i32 1
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[OP_RDX]], i32 0
+; CHECK-NEXT:    [[TMP7]] = insertelement <2 x i32> [[TMP6]], i32 [[V44]], i32 1
 ; CHECK-NEXT:    br i1 true, label [[BCI_15]], label [[LOOPEXIT:%.*]]
 ; CHECK:       loopexit:
 ; CHECK-NEXT:    ret void

diff --git a/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle-inseltpoison.ll
index e911d9ca6d100..9a9e6b42a92bb 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle-inseltpoison.ll
@@ -60,7 +60,7 @@ define <4 x i8> @h_undef(<4 x i8> %x, <4 x i8> %y) {
 
 define i8 @i(<4 x i8> %x, <4 x i8> %y) {
 ; CHECK-LABEL: @i(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i8> [[X:%.*]], <4 x i8> [[Y:%.*]], <4 x i32> <i32 0, i32 3, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i8> [[Y:%.*]], <4 x i8> [[X:%.*]], <4 x i32> <i32 2, i32 1, i32 7, i32 4>
 ; CHECK-NEXT:    [[TMP2:%.*]] = mul <4 x i8> [[TMP1]], [[TMP1]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> [[TMP2]])
 ; CHECK-NEXT:    ret i8 [[TMP3]]

diff --git a/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll
index e3ee84f84f7f1..16da1ee369e40 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll
@@ -60,7 +60,7 @@ define <4 x i8> @h_undef(<4 x i8> %x, <4 x i8> %y) {
 
 define i8 @i(<4 x i8> %x, <4 x i8> %y) {
 ; CHECK-LABEL: @i(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i8> [[X:%.*]], <4 x i8> [[Y:%.*]], <4 x i32> <i32 0, i32 3, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i8> [[Y:%.*]], <4 x i8> [[X:%.*]], <4 x i32> <i32 2, i32 1, i32 7, i32 4>
 ; CHECK-NEXT:    [[TMP2:%.*]] = mul <4 x i8> [[TMP1]], [[TMP1]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> [[TMP2]])
 ; CHECK-NEXT:    ret i8 [[TMP3]]

diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll
index d9ca7d565baac..048c5676ae6d4 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll
@@ -13,58 +13,30 @@ define float @baz() {
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* @n, align 4
 ; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP0]], 3
 ; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[MUL]] to float
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr to <2 x float>*), align 16
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr1 to <2 x float>*), align 16
-; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast <2 x float> [[TMP2]], [[TMP1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2) to <2 x float>*), align 8
-; CHECK-NEXT:    [[TMP7:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2) to <2 x float>*), align 8
-; CHECK-NEXT:    [[TMP8:%.*]] = fmul fast <2 x float> [[TMP7]], [[TMP6]]
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1
-; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <8 x float> poison, float [[TMP10]], i32 0
-; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <8 x float> [[TMP11]], float [[TMP9]], i32 1
-; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <8 x float> [[TMP12]], float [[TMP5]], i32 2
-; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <8 x float> [[TMP13]], float [[TMP4]], i32 3
-; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <8 x float> [[TMP14]], float [[TMP10]], i32 4
-; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <8 x float> [[TMP15]], float [[TMP9]], i32 5
-; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <8 x float> [[TMP16]], float [[TMP5]], i32 6
-; CHECK-NEXT:    [[TMP18:%.*]] = insertelement <8 x float> [[TMP17]], float [[TMP4]], i32 7
-; CHECK-NEXT:    [[TMP19:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP18]])
-; CHECK-NEXT:    [[OP_EXTRA:%.*]] = fadd fast float [[TMP19]], [[CONV]]
-; CHECK-NEXT:    [[OP_EXTRA1:%.*]] = fadd fast float [[OP_EXTRA]], [[CONV]]
-; CHECK-NEXT:    store float [[OP_EXTRA1]], float* @res, align 4
-; CHECK-NEXT:    ret float [[OP_EXTRA1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr to <4 x float>*), align 16
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr1 to <4 x float>*), align 16
+; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]]
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
+; CHECK-NEXT:    [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[SHUFFLE]])
+; CHECK-NEXT:    [[OP_RDX:%.*]] = fadd fast float [[TMP4]], [[CONV]]
+; CHECK-NEXT:    [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[CONV]]
+; CHECK-NEXT:    store float [[OP_RDX1]], float* @res, align 4
+; CHECK-NEXT:    ret float [[OP_RDX1]]
 ;
 ; THRESHOLD-LABEL: @baz(
 ; THRESHOLD-NEXT:  entry:
 ; THRESHOLD-NEXT:    [[TMP0:%.*]] = load i32, i32* @n, align 4
 ; THRESHOLD-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP0]], 3
 ; THRESHOLD-NEXT:    [[CONV:%.*]] = sitofp i32 [[MUL]] to float
-; THRESHOLD-NEXT:    [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr to <2 x float>*), align 16
-; THRESHOLD-NEXT:    [[TMP2:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr1 to <2 x float>*), align 16
-; THRESHOLD-NEXT:    [[TMP3:%.*]] = fmul fast <2 x float> [[TMP2]], [[TMP1]]
-; THRESHOLD-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
-; THRESHOLD-NEXT:    [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
-; THRESHOLD-NEXT:    [[TMP6:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2) to <2 x float>*), align 8
-; THRESHOLD-NEXT:    [[TMP7:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2) to <2 x float>*), align 8
-; THRESHOLD-NEXT:    [[TMP8:%.*]] = fmul fast <2 x float> [[TMP7]], [[TMP6]]
-; THRESHOLD-NEXT:    [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0
-; THRESHOLD-NEXT:    [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1
-; THRESHOLD-NEXT:    [[TMP11:%.*]] = insertelement <8 x float> poison, float [[TMP10]], i32 0
-; THRESHOLD-NEXT:    [[TMP12:%.*]] = insertelement <8 x float> [[TMP11]], float [[TMP9]], i32 1
-; THRESHOLD-NEXT:    [[TMP13:%.*]] = insertelement <8 x float> [[TMP12]], float [[TMP5]], i32 2
-; THRESHOLD-NEXT:    [[TMP14:%.*]] = insertelement <8 x float> [[TMP13]], float [[TMP4]], i32 3
-; THRESHOLD-NEXT:    [[TMP15:%.*]] = insertelement <8 x float> [[TMP14]], float [[TMP10]], i32 4
-; THRESHOLD-NEXT:    [[TMP16:%.*]] = insertelement <8 x float> [[TMP15]], float [[TMP9]], i32 5
-; THRESHOLD-NEXT:    [[TMP17:%.*]] = insertelement <8 x float> [[TMP16]], float [[TMP5]], i32 6
-; THRESHOLD-NEXT:    [[TMP18:%.*]] = insertelement <8 x float> [[TMP17]], float [[TMP4]], i32 7
-; THRESHOLD-NEXT:    [[TMP19:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP18]])
-; THRESHOLD-NEXT:    [[OP_EXTRA:%.*]] = fadd fast float [[TMP19]], [[CONV]]
-; THRESHOLD-NEXT:    [[OP_EXTRA1:%.*]] = fadd fast float [[OP_EXTRA]], [[CONV]]
-; THRESHOLD-NEXT:    store float [[OP_EXTRA1]], float* @res, align 4
-; THRESHOLD-NEXT:    ret float [[OP_EXTRA1]]
+; THRESHOLD-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr to <4 x float>*), align 16
+; THRESHOLD-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr1 to <4 x float>*), align 16
+; THRESHOLD-NEXT:    [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]]
+; THRESHOLD-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
+; THRESHOLD-NEXT:    [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[SHUFFLE]])
+; THRESHOLD-NEXT:    [[OP_RDX:%.*]] = fadd fast float [[TMP4]], [[CONV]]
+; THRESHOLD-NEXT:    [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[CONV]]
+; THRESHOLD-NEXT:    store float [[OP_RDX1]], float* @res, align 4
+; THRESHOLD-NEXT:    ret float [[OP_RDX1]]
 ;
 entry:
   %0 = load i32, i32* @n, align 4
@@ -107,10 +79,10 @@ define float @bazz() {
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast ([20 x float]* @arr1 to <8 x float>*), align 16
 ; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast <8 x float> [[TMP2]], [[TMP1]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP3]])
-; CHECK-NEXT:    [[OP_EXTRA:%.*]] = fadd fast float [[TMP4]], [[CONV]]
-; CHECK-NEXT:    [[OP_EXTRA1:%.*]] = fadd fast float [[OP_EXTRA]], [[CONV6]]
-; CHECK-NEXT:    store float [[OP_EXTRA1]], float* @res, align 4
-; CHECK-NEXT:    ret float [[OP_EXTRA1]]
+; CHECK-NEXT:    [[OP_RDX:%.*]] = fadd fast float [[TMP4]], [[CONV]]
+; CHECK-NEXT:    [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[CONV6]]
+; CHECK-NEXT:    store float [[OP_RDX1]], float* @res, align 4
+; CHECK-NEXT:    ret float [[OP_RDX1]]
 ;
 ; THRESHOLD-LABEL: @bazz(
 ; THRESHOLD-NEXT:  entry:
@@ -123,10 +95,10 @@ define float @bazz() {
 ; THRESHOLD-NEXT:    [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast ([20 x float]* @arr1 to <8 x float>*), align 16
 ; THRESHOLD-NEXT:    [[TMP3:%.*]] = fmul fast <8 x float> [[TMP2]], [[TMP1]]
 ; THRESHOLD-NEXT:    [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP3]])
-; THRESHOLD-NEXT:    [[OP_EXTRA:%.*]] = fadd fast float [[TMP4]], [[CONV]]
-; THRESHOLD-NEXT:    [[OP_EXTRA1:%.*]] = fadd fast float [[OP_EXTRA]], [[CONV6]]
-; THRESHOLD-NEXT:    store float [[OP_EXTRA1]], float* @res, align 4
-; THRESHOLD-NEXT:    ret float [[OP_EXTRA1]]
+; THRESHOLD-NEXT:    [[OP_RDX:%.*]] = fadd fast float [[TMP4]], [[CONV]]
+; THRESHOLD-NEXT:    [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[CONV6]]
+; THRESHOLD-NEXT:    store float [[OP_RDX1]], float* @res, align 4
+; THRESHOLD-NEXT:    ret float [[OP_RDX1]]
 ;
 entry:
   %0 = load i32, i32* @n, align 4
@@ -343,25 +315,25 @@ entry:
 define float @f(float* nocapture readonly %x) {
 ; CHECK-LABEL: @f(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[X:%.*]] to <16 x float>*
-; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x float>, <16 x float>* [[TMP0]], align 4
-; CHECK-NEXT:    [[ARRAYIDX_16:%.*]] = getelementptr inbounds float, float* [[X]], i64 16
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast float* [[ARRAYIDX_16]] to <32 x float>*
-; CHECK-NEXT:    [[TMP3:%.*]] = load <32 x float>, <32 x float>* [[TMP2]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v32f32(float -0.000000e+00, <32 x float> [[TMP3]])
-; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float -0.000000e+00, <16 x float> [[TMP1]])
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[X:%.*]] to <32 x float>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <32 x float>, <32 x float>* [[TMP0]], align 4
+; CHECK-NEXT:    [[ARRAYIDX_32:%.*]] = getelementptr inbounds float, float* [[X]], i64 32
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast float* [[ARRAYIDX_32]] to <16 x float>*
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x float>, <16 x float>* [[TMP2]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v32f32(float -0.000000e+00, <32 x float> [[TMP1]])
+; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float -0.000000e+00, <16 x float> [[TMP3]])
 ; CHECK-NEXT:    [[OP_RDX:%.*]] = fadd fast float [[TMP4]], [[TMP5]]
 ; CHECK-NEXT:    ret float [[OP_RDX]]
 ;
 ; THRESHOLD-LABEL: @f(
 ; THRESHOLD-NEXT:  entry:
-; THRESHOLD-NEXT:    [[TMP0:%.*]] = bitcast float* [[X:%.*]] to <16 x float>*
-; THRESHOLD-NEXT:    [[TMP1:%.*]] = load <16 x float>, <16 x float>* [[TMP0]], align 4
-; THRESHOLD-NEXT:    [[ARRAYIDX_16:%.*]] = getelementptr inbounds float, float* [[X]], i64 16
-; THRESHOLD-NEXT:    [[TMP2:%.*]] = bitcast float* [[ARRAYIDX_16]] to <32 x float>*
-; THRESHOLD-NEXT:    [[TMP3:%.*]] = load <32 x float>, <32 x float>* [[TMP2]], align 4
-; THRESHOLD-NEXT:    [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v32f32(float -0.000000e+00, <32 x float> [[TMP3]])
-; THRESHOLD-NEXT:    [[TMP5:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float -0.000000e+00, <16 x float> [[TMP1]])
+; THRESHOLD-NEXT:    [[TMP0:%.*]] = bitcast float* [[X:%.*]] to <32 x float>*
+; THRESHOLD-NEXT:    [[TMP1:%.*]] = load <32 x float>, <32 x float>* [[TMP0]], align 4
+; THRESHOLD-NEXT:    [[ARRAYIDX_32:%.*]] = getelementptr inbounds float, float* [[X]], i64 32
+; THRESHOLD-NEXT:    [[TMP2:%.*]] = bitcast float* [[ARRAYIDX_32]] to <16 x float>*
+; THRESHOLD-NEXT:    [[TMP3:%.*]] = load <16 x float>, <16 x float>* [[TMP2]], align 4
+; THRESHOLD-NEXT:    [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v32f32(float -0.000000e+00, <32 x float> [[TMP1]])
+; THRESHOLD-NEXT:    [[TMP5:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float -0.000000e+00, <16 x float> [[TMP3]])
 ; THRESHOLD-NEXT:    [[OP_RDX:%.*]] = fadd fast float [[TMP4]], [[TMP5]]
 ; THRESHOLD-NEXT:    ret float [[OP_RDX]]
 ;
@@ -519,8 +491,8 @@ define float @f1(float* nocapture readonly %x, i32 %a, i32 %b) {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[X:%.*]] to <32 x float>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <32 x float>, <32 x float>* [[TMP0]], align 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v32f32(float -0.000000e+00, <32 x float> [[TMP1]])
-; CHECK-NEXT:    [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[CONV]]
-; CHECK-NEXT:    ret float [[OP_EXTRA]]
+; CHECK-NEXT:    [[OP_RDX:%.*]] = fadd fast float [[TMP2]], [[CONV]]
+; CHECK-NEXT:    ret float [[OP_RDX]]
 ;
 ; THRESHOLD-LABEL: @f1(
 ; THRESHOLD-NEXT:  entry:
@@ -529,8 +501,8 @@ define float @f1(float* nocapture readonly %x, i32 %a, i32 %b) {
 ; THRESHOLD-NEXT:    [[TMP0:%.*]] = bitcast float* [[X:%.*]] to <32 x float>*
 ; THRESHOLD-NEXT:    [[TMP1:%.*]] = load <32 x float>, <32 x float>* [[TMP0]], align 4
 ; THRESHOLD-NEXT:    [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v32f32(float -0.000000e+00, <32 x float> [[TMP1]])
-; THRESHOLD-NEXT:    [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[CONV]]
-; THRESHOLD-NEXT:    ret float [[OP_EXTRA]]
+; THRESHOLD-NEXT:    [[OP_RDX:%.*]] = fadd fast float [[TMP2]], [[CONV]]
+; THRESHOLD-NEXT:    ret float [[OP_RDX]]
 ;
   entry:
   %rem = srem i32 %a, %b
@@ -637,50 +609,50 @@ define float @loadadd31(float* nocapture readonly %x) {
 ; CHECK-LABEL: @loadadd31(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 1
-; CHECK-NEXT:    [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds float, float* [[X]], i64 2
-; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* [[ARRAYIDX_1]], align 4
-; CHECK-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds float, float* [[X]], i64 3
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast float* [[ARRAYIDX_2]] to <4 x float>*
-; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[TMP2]], align 4
-; CHECK-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 7
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast float* [[ARRAYIDX_6]] to <8 x float>*
-; CHECK-NEXT:    [[TMP5:%.*]] = load <8 x float>, <8 x float>* [[TMP4]], align 4
-; CHECK-NEXT:    [[ARRAYIDX_14:%.*]] = getelementptr inbounds float, float* [[X]], i64 15
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast float* [[ARRAYIDX_14]] to <16 x float>*
-; CHECK-NEXT:    [[TMP7:%.*]] = load <16 x float>, <16 x float>* [[TMP6]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float -0.000000e+00, <16 x float> [[TMP7]])
-; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP5]])
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[ARRAYIDX]] to <16 x float>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x float>, <16 x float>* [[TMP0]], align 4
+; CHECK-NEXT:    [[ARRAYIDX_16:%.*]] = getelementptr inbounds float, float* [[X]], i64 17
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast float* [[ARRAYIDX_16]] to <8 x float>*
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x float>, <8 x float>* [[TMP2]], align 4
+; CHECK-NEXT:    [[ARRAYIDX_24:%.*]] = getelementptr inbounds float, float* [[X]], i64 25
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast float* [[ARRAYIDX_24]] to <4 x float>*
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[TMP4]], align 4
+; CHECK-NEXT:    [[ARRAYIDX_28:%.*]] = getelementptr inbounds float, float* [[X]], i64 29
+; CHECK-NEXT:    [[TMP6:%.*]] = load float, float* [[ARRAYIDX_28]], align 4
+; CHECK-NEXT:    [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, float* [[X]], i64 30
+; CHECK-NEXT:    [[TMP7:%.*]] = load float, float* [[ARRAYIDX_29]], align 4
+; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float -0.000000e+00, <16 x float> [[TMP1]])
+; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP3]])
 ; CHECK-NEXT:    [[OP_RDX:%.*]] = fadd fast float [[TMP8]], [[TMP9]]
-; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP3]])
+; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP5]])
 ; CHECK-NEXT:    [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[TMP10]]
-; CHECK-NEXT:    [[TMP11:%.*]] = fadd fast float [[OP_RDX1]], [[TMP1]]
-; CHECK-NEXT:    [[TMP12:%.*]] = fadd fast float [[TMP11]], [[TMP0]]
-; CHECK-NEXT:    ret float [[TMP12]]
+; CHECK-NEXT:    [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], [[TMP6]]
+; CHECK-NEXT:    [[OP_RDX3:%.*]] = fadd fast float [[OP_RDX2]], [[TMP7]]
+; CHECK-NEXT:    ret float [[OP_RDX3]]
 ;
 ; THRESHOLD-LABEL: @loadadd31(
 ; THRESHOLD-NEXT:  entry:
 ; THRESHOLD-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 1
-; THRESHOLD-NEXT:    [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; THRESHOLD-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds float, float* [[X]], i64 2
-; THRESHOLD-NEXT:    [[TMP1:%.*]] = load float, float* [[ARRAYIDX_1]], align 4
-; THRESHOLD-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds float, float* [[X]], i64 3
-; THRESHOLD-NEXT:    [[TMP2:%.*]] = bitcast float* [[ARRAYIDX_2]] to <4 x float>*
-; THRESHOLD-NEXT:    [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[TMP2]], align 4
-; THRESHOLD-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 7
-; THRESHOLD-NEXT:    [[TMP4:%.*]] = bitcast float* [[ARRAYIDX_6]] to <8 x float>*
-; THRESHOLD-NEXT:    [[TMP5:%.*]] = load <8 x float>, <8 x float>* [[TMP4]], align 4
-; THRESHOLD-NEXT:    [[ARRAYIDX_14:%.*]] = getelementptr inbounds float, float* [[X]], i64 15
-; THRESHOLD-NEXT:    [[TMP6:%.*]] = bitcast float* [[ARRAYIDX_14]] to <16 x float>*
-; THRESHOLD-NEXT:    [[TMP7:%.*]] = load <16 x float>, <16 x float>* [[TMP6]], align 4
-; THRESHOLD-NEXT:    [[TMP8:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float -0.000000e+00, <16 x float> [[TMP7]])
-; THRESHOLD-NEXT:    [[TMP9:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP5]])
+; THRESHOLD-NEXT:    [[TMP0:%.*]] = bitcast float* [[ARRAYIDX]] to <16 x float>*
+; THRESHOLD-NEXT:    [[TMP1:%.*]] = load <16 x float>, <16 x float>* [[TMP0]], align 4
+; THRESHOLD-NEXT:    [[ARRAYIDX_16:%.*]] = getelementptr inbounds float, float* [[X]], i64 17
+; THRESHOLD-NEXT:    [[TMP2:%.*]] = bitcast float* [[ARRAYIDX_16]] to <8 x float>*
+; THRESHOLD-NEXT:    [[TMP3:%.*]] = load <8 x float>, <8 x float>* [[TMP2]], align 4
+; THRESHOLD-NEXT:    [[ARRAYIDX_24:%.*]] = getelementptr inbounds float, float* [[X]], i64 25
+; THRESHOLD-NEXT:    [[TMP4:%.*]] = bitcast float* [[ARRAYIDX_24]] to <4 x float>*
+; THRESHOLD-NEXT:    [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[TMP4]], align 4
+; THRESHOLD-NEXT:    [[ARRAYIDX_28:%.*]] = getelementptr inbounds float, float* [[X]], i64 29
+; THRESHOLD-NEXT:    [[TMP6:%.*]] = load float, float* [[ARRAYIDX_28]], align 4
+; THRESHOLD-NEXT:    [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, float* [[X]], i64 30
+; THRESHOLD-NEXT:    [[TMP7:%.*]] = load float, float* [[ARRAYIDX_29]], align 4
+; THRESHOLD-NEXT:    [[TMP8:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float -0.000000e+00, <16 x float> [[TMP1]])
+; THRESHOLD-NEXT:    [[TMP9:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP3]])
 ; THRESHOLD-NEXT:    [[OP_RDX:%.*]] = fadd fast float [[TMP8]], [[TMP9]]
-; THRESHOLD-NEXT:    [[TMP10:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP3]])
+; THRESHOLD-NEXT:    [[TMP10:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP5]])
 ; THRESHOLD-NEXT:    [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[TMP10]]
-; THRESHOLD-NEXT:    [[TMP11:%.*]] = fadd fast float [[OP_RDX1]], [[TMP1]]
-; THRESHOLD-NEXT:    [[TMP12:%.*]] = fadd fast float [[TMP11]], [[TMP0]]
-; THRESHOLD-NEXT:    ret float [[TMP12]]
+; THRESHOLD-NEXT:    [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], [[TMP6]]
+; THRESHOLD-NEXT:    [[OP_RDX3:%.*]] = fadd fast float [[OP_RDX2]], [[TMP7]]
+; THRESHOLD-NEXT:    ret float [[OP_RDX3]]
 ;
   entry:
   %arrayidx = getelementptr inbounds float, float* %x, i64 1
@@ -780,25 +752,25 @@ define float @extra_args(float* nocapture readonly %x, i32 %a, i32 %b) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[B:%.*]], [[A:%.*]]
 ; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[MUL]] to float
-; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[CONV]], 3.000000e+00
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[X:%.*]] to <8 x float>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP1]])
-; CHECK-NEXT:    [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]]
-; CHECK-NEXT:    [[OP_EXTRA1:%.*]] = fadd fast float [[OP_EXTRA]], [[CONV]]
-; CHECK-NEXT:    ret float [[OP_EXTRA1]]
+; CHECK-NEXT:    [[OP_RDX:%.*]] = fadd fast float [[TMP2]], [[CONV]]
+; CHECK-NEXT:    [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[CONV]]
+; CHECK-NEXT:    [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], 3.000000e+00
+; CHECK-NEXT:    ret float [[OP_RDX2]]
 ;
 ; THRESHOLD-LABEL: @extra_args(
 ; THRESHOLD-NEXT:  entry:
 ; THRESHOLD-NEXT:    [[MUL:%.*]] = mul nsw i32 [[B:%.*]], [[A:%.*]]
 ; THRESHOLD-NEXT:    [[CONV:%.*]] = sitofp i32 [[MUL]] to float
-; THRESHOLD-NEXT:    [[ADD:%.*]] = fadd fast float [[CONV]], 3.000000e+00
 ; THRESHOLD-NEXT:    [[TMP0:%.*]] = bitcast float* [[X:%.*]] to <8 x float>*
 ; THRESHOLD-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4
 ; THRESHOLD-NEXT:    [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP1]])
-; THRESHOLD-NEXT:    [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]]
-; THRESHOLD-NEXT:    [[OP_EXTRA1:%.*]] = fadd fast float [[OP_EXTRA]], [[CONV]]
-; THRESHOLD-NEXT:    ret float [[OP_EXTRA1]]
+; THRESHOLD-NEXT:    [[OP_RDX:%.*]] = fadd fast float [[TMP2]], [[CONV]]
+; THRESHOLD-NEXT:    [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[CONV]]
+; THRESHOLD-NEXT:    [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], 3.000000e+00
+; THRESHOLD-NEXT:    ret float [[OP_RDX2]]
 ;
   entry:
   %mul = mul nsw i32 %b, %a
@@ -836,29 +808,29 @@ define float @extra_args_same_several_times(float* nocapture readonly %x, i32 %a
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[B:%.*]], [[A:%.*]]
 ; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[MUL]] to float
-; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[CONV]], 3.000000e+00
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[X:%.*]] to <8 x float>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP1]])
-; CHECK-NEXT:    [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]]
-; CHECK-NEXT:    [[OP_EXTRA1:%.*]] = fadd fast float [[OP_EXTRA]], 5.000000e+00
-; CHECK-NEXT:    [[OP_EXTRA2:%.*]] = fadd fast float [[OP_EXTRA1]], 5.000000e+00
-; CHECK-NEXT:    [[OP_EXTRA3:%.*]] = fadd fast float [[OP_EXTRA2]], [[CONV]]
-; CHECK-NEXT:    ret float [[OP_EXTRA3]]
+; CHECK-NEXT:    [[OP_RDX:%.*]] = fadd fast float [[TMP2]], 5.000000e+00
+; CHECK-NEXT:    [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], 5.000000e+00
+; CHECK-NEXT:    [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], 3.000000e+00
+; CHECK-NEXT:    [[OP_RDX3:%.*]] = fadd fast float [[OP_RDX2]], [[CONV]]
+; CHECK-NEXT:    [[OP_RDX4:%.*]] = fadd fast float [[OP_RDX3]], [[CONV]]
+; CHECK-NEXT:    ret float [[OP_RDX4]]
 ;
 ; THRESHOLD-LABEL: @extra_args_same_several_times(
 ; THRESHOLD-NEXT:  entry:
 ; THRESHOLD-NEXT:    [[MUL:%.*]] = mul nsw i32 [[B:%.*]], [[A:%.*]]
 ; THRESHOLD-NEXT:    [[CONV:%.*]] = sitofp i32 [[MUL]] to float
-; THRESHOLD-NEXT:    [[ADD:%.*]] = fadd fast float [[CONV]], 3.000000e+00
 ; THRESHOLD-NEXT:    [[TMP0:%.*]] = bitcast float* [[X:%.*]] to <8 x float>*
 ; THRESHOLD-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4
 ; THRESHOLD-NEXT:    [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP1]])
-; THRESHOLD-NEXT:    [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]]
-; THRESHOLD-NEXT:    [[OP_EXTRA1:%.*]] = fadd fast float [[OP_EXTRA]], 5.000000e+00
-; THRESHOLD-NEXT:    [[OP_EXTRA2:%.*]] = fadd fast float [[OP_EXTRA1]], 5.000000e+00
-; THRESHOLD-NEXT:    [[OP_EXTRA3:%.*]] = fadd fast float [[OP_EXTRA2]], [[CONV]]
-; THRESHOLD-NEXT:    ret float [[OP_EXTRA3]]
+; THRESHOLD-NEXT:    [[OP_RDX:%.*]] = fadd fast float [[TMP2]], 5.000000e+00
+; THRESHOLD-NEXT:    [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], 5.000000e+00
+; THRESHOLD-NEXT:    [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], 3.000000e+00
+; THRESHOLD-NEXT:    [[OP_RDX3:%.*]] = fadd fast float [[OP_RDX2]], [[CONV]]
+; THRESHOLD-NEXT:    [[OP_RDX4:%.*]] = fadd fast float [[OP_RDX3]], [[CONV]]
+; THRESHOLD-NEXT:    ret float [[OP_RDX4]]
 ;
   entry:
   %mul = mul nsw i32 %b, %a
@@ -899,28 +871,28 @@ define float @extra_args_no_replace(float* nocapture readonly %x, i32 %a, i32 %b
 ; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[B:%.*]], [[A:%.*]]
 ; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[MUL]] to float
 ; CHECK-NEXT:    [[CONVC:%.*]] = sitofp i32 [[C:%.*]] to float
-; CHECK-NEXT:    [[ADDC:%.*]] = fadd fast float [[CONVC]], 3.000000e+00
-; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[CONV]], [[ADDC]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[X:%.*]] to <8 x float>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP1]])
-; CHECK-NEXT:    [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]]
-; CHECK-NEXT:    [[OP_EXTRA1:%.*]] = fadd fast float [[OP_EXTRA]], [[CONV]]
-; CHECK-NEXT:    ret float [[OP_EXTRA1]]
+; CHECK-NEXT:    [[OP_RDX:%.*]] = fadd fast float [[TMP2]], [[CONV]]
+; CHECK-NEXT:    [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[CONV]]
+; CHECK-NEXT:    [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], [[CONVC]]
+; CHECK-NEXT:    [[OP_RDX3:%.*]] = fadd fast float [[OP_RDX2]], 3.000000e+00
+; CHECK-NEXT:    ret float [[OP_RDX3]]
 ;
 ; THRESHOLD-LABEL: @extra_args_no_replace(
 ; THRESHOLD-NEXT:  entry:
 ; THRESHOLD-NEXT:    [[MUL:%.*]] = mul nsw i32 [[B:%.*]], [[A:%.*]]
 ; THRESHOLD-NEXT:    [[CONV:%.*]] = sitofp i32 [[MUL]] to float
 ; THRESHOLD-NEXT:    [[CONVC:%.*]] = sitofp i32 [[C:%.*]] to float
-; THRESHOLD-NEXT:    [[ADDC:%.*]] = fadd fast float [[CONVC]], 3.000000e+00
-; THRESHOLD-NEXT:    [[ADD:%.*]] = fadd fast float [[CONV]], [[ADDC]]
 ; THRESHOLD-NEXT:    [[TMP0:%.*]] = bitcast float* [[X:%.*]] to <8 x float>*
 ; THRESHOLD-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4
 ; THRESHOLD-NEXT:    [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP1]])
-; THRESHOLD-NEXT:    [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]]
-; THRESHOLD-NEXT:    [[OP_EXTRA1:%.*]] = fadd fast float [[OP_EXTRA]], [[CONV]]
-; THRESHOLD-NEXT:    ret float [[OP_EXTRA1]]
+; THRESHOLD-NEXT:    [[OP_RDX:%.*]] = fadd fast float [[TMP2]], [[CONV]]
+; THRESHOLD-NEXT:    [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[CONV]]
+; THRESHOLD-NEXT:    [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], [[CONVC]]
+; THRESHOLD-NEXT:    [[OP_RDX3:%.*]] = fadd fast float [[OP_RDX2]], 3.000000e+00
+; THRESHOLD-NEXT:    ret float [[OP_RDX3]]
 ;
   entry:
   %mul = mul nsw i32 %b, %a
@@ -1019,9 +991,9 @@ define i32 @wobble(i32 %arg, i32 %bar) {
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq <4 x i32> [[TMP2]], zeroinitializer
 ; CHECK-NEXT:    [[TMP5:%.*]] = sext <4 x i1> [[TMP4]] to <4 x i32>
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP5]])
-; CHECK-NEXT:    [[OP_EXTRA:%.*]] = add nuw i32 [[TMP6]], [[ARG]]
-; CHECK-NEXT:    [[OP_EXTRA2:%.*]] = add nsw i32 [[OP_EXTRA]], [[TMP3]]
-; CHECK-NEXT:    ret i32 [[OP_EXTRA2]]
+; CHECK-NEXT:    [[OP_RDX:%.*]] = add nsw i32 [[TMP6]], [[TMP3]]
+; CHECK-NEXT:    [[OP_RDX2:%.*]] = add nuw i32 [[OP_RDX]], [[ARG]]
+; CHECK-NEXT:    ret i32 [[OP_RDX2]]
 ;
 ; THRESHOLD-LABEL: @wobble(
 ; THRESHOLD-NEXT:  bb:
@@ -1034,9 +1006,9 @@ define i32 @wobble(i32 %arg, i32 %bar) {
 ; THRESHOLD-NEXT:    [[TMP4:%.*]] = icmp eq <4 x i32> [[TMP2]], zeroinitializer
 ; THRESHOLD-NEXT:    [[TMP5:%.*]] = sext <4 x i1> [[TMP4]] to <4 x i32>
 ; THRESHOLD-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP5]])
-; THRESHOLD-NEXT:    [[OP_EXTRA:%.*]] = add nuw i32 [[TMP6]], [[ARG]]
-; THRESHOLD-NEXT:    [[OP_EXTRA2:%.*]] = add nsw i32 [[OP_EXTRA]], [[TMP3]]
-; THRESHOLD-NEXT:    ret i32 [[OP_EXTRA2]]
+; THRESHOLD-NEXT:    [[OP_RDX:%.*]] = add nsw i32 [[TMP6]], [[TMP3]]
+; THRESHOLD-NEXT:    [[OP_RDX2:%.*]] = add nuw i32 [[OP_RDX]], [[ARG]]
+; THRESHOLD-NEXT:    ret i32 [[OP_RDX2]]
 ;
   bb:
   %x1 = xor i32 %arg, %bar

diff --git a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll
index ef45de71314b1..470fa0caa1cd1 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll
@@ -440,23 +440,11 @@ define <4 x float> @reschedule_extract(<4 x float> %a, <4 x float> %b) {
 ;
 ; MINTREESIZE-LABEL: @reschedule_extract(
 ; MINTREESIZE-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[B:%.*]], i32 3
-; MINTREESIZE-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[B]], i32 2
-; MINTREESIZE-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[B]], i32 1
-; MINTREESIZE-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[B]], i32 0
-; MINTREESIZE-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[A:%.*]], i32 3
-; MINTREESIZE-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[A]], i32 2
-; MINTREESIZE-NEXT:    [[TMP7:%.*]] = extractelement <4 x float> [[A]], i32 1
-; MINTREESIZE-NEXT:    [[TMP8:%.*]] = extractelement <4 x float> [[A]], i32 0
-; MINTREESIZE-NEXT:    [[TMP9:%.*]] = insertelement <2 x float> poison, float [[TMP8]], i32 0
-; MINTREESIZE-NEXT:    [[TMP10:%.*]] = insertelement <2 x float> [[TMP9]], float [[TMP4]], i32 1
-; MINTREESIZE-NEXT:    [[TMP11:%.*]] = insertelement <2 x float> poison, float [[TMP7]], i32 0
-; MINTREESIZE-NEXT:    [[TMP12:%.*]] = insertelement <2 x float> [[TMP11]], float [[TMP3]], i32 1
-; MINTREESIZE-NEXT:    [[TMP13:%.*]] = insertelement <2 x float> poison, float [[TMP6]], i32 0
-; MINTREESIZE-NEXT:    [[TMP14:%.*]] = insertelement <2 x float> [[TMP13]], float [[TMP2]], i32 1
-; MINTREESIZE-NEXT:    [[TMP15:%.*]] = insertelement <2 x float> poison, float [[TMP5]], i32 0
-; MINTREESIZE-NEXT:    [[TMP16:%.*]] = insertelement <2 x float> [[TMP15]], float [[TMP1]], i32 1
-; MINTREESIZE-NEXT:    [[TMP17:%.*]] = fadd <4 x float> [[A]], [[B]]
-; MINTREESIZE-NEXT:    ret <4 x float> [[TMP17]]
+; MINTREESIZE-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[A:%.*]], i32 3
+; MINTREESIZE-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> poison, float [[TMP2]], i32 0
+; MINTREESIZE-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[TMP1]], i32 1
+; MINTREESIZE-NEXT:    [[TMP5:%.*]] = fadd <4 x float> [[A]], [[B]]
+; MINTREESIZE-NEXT:    ret <4 x float> [[TMP5]]
 ;
   %a0 = extractelement <4 x float> %a, i32 0
   %b0 = extractelement <4 x float> %b, i32 0
@@ -490,23 +478,11 @@ define <4 x float> @take_credit(<4 x float> %a, <4 x float> %b) {
 ;
 ; MINTREESIZE-LABEL: @take_credit(
 ; MINTREESIZE-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[B:%.*]], i32 3
-; MINTREESIZE-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[B]], i32 2
-; MINTREESIZE-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[B]], i32 1
-; MINTREESIZE-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[B]], i32 0
-; MINTREESIZE-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[A:%.*]], i32 3
-; MINTREESIZE-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[A]], i32 2
-; MINTREESIZE-NEXT:    [[TMP7:%.*]] = extractelement <4 x float> [[A]], i32 1
-; MINTREESIZE-NEXT:    [[TMP8:%.*]] = extractelement <4 x float> [[A]], i32 0
-; MINTREESIZE-NEXT:    [[TMP9:%.*]] = insertelement <2 x float> poison, float [[TMP8]], i32 0
-; MINTREESIZE-NEXT:    [[TMP10:%.*]] = insertelement <2 x float> [[TMP9]], float [[TMP4]], i32 1
-; MINTREESIZE-NEXT:    [[TMP11:%.*]] = insertelement <2 x float> poison, float [[TMP7]], i32 0
-; MINTREESIZE-NEXT:    [[TMP12:%.*]] = insertelement <2 x float> [[TMP11]], float [[TMP3]], i32 1
-; MINTREESIZE-NEXT:    [[TMP13:%.*]] = insertelement <2 x float> poison, float [[TMP6]], i32 0
-; MINTREESIZE-NEXT:    [[TMP14:%.*]] = insertelement <2 x float> [[TMP13]], float [[TMP2]], i32 1
-; MINTREESIZE-NEXT:    [[TMP15:%.*]] = insertelement <2 x float> poison, float [[TMP5]], i32 0
-; MINTREESIZE-NEXT:    [[TMP16:%.*]] = insertelement <2 x float> [[TMP15]], float [[TMP1]], i32 1
-; MINTREESIZE-NEXT:    [[TMP17:%.*]] = fadd <4 x float> [[A]], [[B]]
-; MINTREESIZE-NEXT:    ret <4 x float> [[TMP17]]
+; MINTREESIZE-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[A:%.*]], i32 3
+; MINTREESIZE-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> poison, float [[TMP2]], i32 0
+; MINTREESIZE-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[TMP1]], i32 1
+; MINTREESIZE-NEXT:    [[TMP5:%.*]] = fadd <4 x float> [[A]], [[B]]
+; MINTREESIZE-NEXT:    ret <4 x float> [[TMP5]]
 ;
   %a0 = extractelement <4 x float> %a, i32 0
   %b0 = extractelement <4 x float> %b, i32 0
@@ -564,39 +540,11 @@ define <8 x float> @_vadd256(<8 x float> %a, <8 x float> %b) local_unnamed_addr
 ;
 ; MINTREESIZE-LABEL: @_vadd256(
 ; MINTREESIZE-NEXT:    [[TMP1:%.*]] = extractelement <8 x float> [[B:%.*]], i32 7
-; MINTREESIZE-NEXT:    [[TMP2:%.*]] = extractelement <8 x float> [[B]], i32 6
-; MINTREESIZE-NEXT:    [[TMP3:%.*]] = extractelement <8 x float> [[B]], i32 5
-; MINTREESIZE-NEXT:    [[TMP4:%.*]] = extractelement <8 x float> [[B]], i32 4
-; MINTREESIZE-NEXT:    [[TMP5:%.*]] = extractelement <8 x float> [[B]], i32 3
-; MINTREESIZE-NEXT:    [[TMP6:%.*]] = extractelement <8 x float> [[B]], i32 2
-; MINTREESIZE-NEXT:    [[TMP7:%.*]] = extractelement <8 x float> [[B]], i32 1
-; MINTREESIZE-NEXT:    [[TMP8:%.*]] = extractelement <8 x float> [[B]], i32 0
-; MINTREESIZE-NEXT:    [[TMP9:%.*]] = extractelement <8 x float> [[A:%.*]], i32 7
-; MINTREESIZE-NEXT:    [[TMP10:%.*]] = extractelement <8 x float> [[A]], i32 6
-; MINTREESIZE-NEXT:    [[TMP11:%.*]] = extractelement <8 x float> [[A]], i32 5
-; MINTREESIZE-NEXT:    [[TMP12:%.*]] = extractelement <8 x float> [[A]], i32 4
-; MINTREESIZE-NEXT:    [[TMP13:%.*]] = extractelement <8 x float> [[A]], i32 3
-; MINTREESIZE-NEXT:    [[TMP14:%.*]] = extractelement <8 x float> [[A]], i32 2
-; MINTREESIZE-NEXT:    [[TMP15:%.*]] = extractelement <8 x float> [[A]], i32 1
-; MINTREESIZE-NEXT:    [[TMP16:%.*]] = extractelement <8 x float> [[A]], i32 0
-; MINTREESIZE-NEXT:    [[TMP17:%.*]] = insertelement <2 x float> poison, float [[TMP16]], i32 0
-; MINTREESIZE-NEXT:    [[TMP18:%.*]] = insertelement <2 x float> [[TMP17]], float [[TMP8]], i32 1
-; MINTREESIZE-NEXT:    [[TMP19:%.*]] = insertelement <2 x float> poison, float [[TMP15]], i32 0
-; MINTREESIZE-NEXT:    [[TMP20:%.*]] = insertelement <2 x float> [[TMP19]], float [[TMP7]], i32 1
-; MINTREESIZE-NEXT:    [[TMP21:%.*]] = insertelement <2 x float> poison, float [[TMP14]], i32 0
-; MINTREESIZE-NEXT:    [[TMP22:%.*]] = insertelement <2 x float> [[TMP21]], float [[TMP6]], i32 1
-; MINTREESIZE-NEXT:    [[TMP23:%.*]] = insertelement <2 x float> poison, float [[TMP13]], i32 0
-; MINTREESIZE-NEXT:    [[TMP24:%.*]] = insertelement <2 x float> [[TMP23]], float [[TMP5]], i32 1
-; MINTREESIZE-NEXT:    [[TMP25:%.*]] = insertelement <2 x float> poison, float [[TMP12]], i32 0
-; MINTREESIZE-NEXT:    [[TMP26:%.*]] = insertelement <2 x float> [[TMP25]], float [[TMP4]], i32 1
-; MINTREESIZE-NEXT:    [[TMP27:%.*]] = insertelement <2 x float> poison, float [[TMP11]], i32 0
-; MINTREESIZE-NEXT:    [[TMP28:%.*]] = insertelement <2 x float> [[TMP27]], float [[TMP3]], i32 1
-; MINTREESIZE-NEXT:    [[TMP29:%.*]] = insertelement <2 x float> poison, float [[TMP10]], i32 0
-; MINTREESIZE-NEXT:    [[TMP30:%.*]] = insertelement <2 x float> [[TMP29]], float [[TMP2]], i32 1
-; MINTREESIZE-NEXT:    [[TMP31:%.*]] = insertelement <2 x float> poison, float [[TMP9]], i32 0
-; MINTREESIZE-NEXT:    [[TMP32:%.*]] = insertelement <2 x float> [[TMP31]], float [[TMP1]], i32 1
-; MINTREESIZE-NEXT:    [[TMP33:%.*]] = fadd <8 x float> [[A]], [[B]]
-; MINTREESIZE-NEXT:    ret <8 x float> [[TMP33]]
+; MINTREESIZE-NEXT:    [[TMP2:%.*]] = extractelement <8 x float> [[A:%.*]], i32 7
+; MINTREESIZE-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> poison, float [[TMP2]], i32 0
+; MINTREESIZE-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[TMP1]], i32 1
+; MINTREESIZE-NEXT:    [[TMP5:%.*]] = fadd <8 x float> [[A]], [[B]]
+; MINTREESIZE-NEXT:    ret <8 x float> [[TMP5]]
 ;
   %vecext = extractelement <8 x float> %a, i32 0
   %vecext1 = extractelement <8 x float> %b, i32 0

diff --git a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
index e42bc3beca3da..d7adf196f7c2a 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
@@ -475,23 +475,11 @@ define <4 x float> @reschedule_extract(<4 x float> %a, <4 x float> %b) {
 ;
 ; MINTREESIZE-LABEL: @reschedule_extract(
 ; MINTREESIZE-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[B:%.*]], i32 3
-; MINTREESIZE-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[B]], i32 2
-; MINTREESIZE-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[B]], i32 1
-; MINTREESIZE-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[B]], i32 0
-; MINTREESIZE-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[A:%.*]], i32 3
-; MINTREESIZE-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[A]], i32 2
-; MINTREESIZE-NEXT:    [[TMP7:%.*]] = extractelement <4 x float> [[A]], i32 1
-; MINTREESIZE-NEXT:    [[TMP8:%.*]] = extractelement <4 x float> [[A]], i32 0
-; MINTREESIZE-NEXT:    [[TMP9:%.*]] = insertelement <2 x float> poison, float [[TMP8]], i32 0
-; MINTREESIZE-NEXT:    [[TMP10:%.*]] = insertelement <2 x float> [[TMP9]], float [[TMP4]], i32 1
-; MINTREESIZE-NEXT:    [[TMP11:%.*]] = insertelement <2 x float> poison, float [[TMP7]], i32 0
-; MINTREESIZE-NEXT:    [[TMP12:%.*]] = insertelement <2 x float> [[TMP11]], float [[TMP3]], i32 1
-; MINTREESIZE-NEXT:    [[TMP13:%.*]] = insertelement <2 x float> poison, float [[TMP6]], i32 0
-; MINTREESIZE-NEXT:    [[TMP14:%.*]] = insertelement <2 x float> [[TMP13]], float [[TMP2]], i32 1
-; MINTREESIZE-NEXT:    [[TMP15:%.*]] = insertelement <2 x float> poison, float [[TMP5]], i32 0
-; MINTREESIZE-NEXT:    [[TMP16:%.*]] = insertelement <2 x float> [[TMP15]], float [[TMP1]], i32 1
-; MINTREESIZE-NEXT:    [[TMP17:%.*]] = fadd <4 x float> [[A]], [[B]]
-; MINTREESIZE-NEXT:    ret <4 x float> [[TMP17]]
+; MINTREESIZE-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[A:%.*]], i32 3
+; MINTREESIZE-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> poison, float [[TMP2]], i32 0
+; MINTREESIZE-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[TMP1]], i32 1
+; MINTREESIZE-NEXT:    [[TMP5:%.*]] = fadd <4 x float> [[A]], [[B]]
+; MINTREESIZE-NEXT:    ret <4 x float> [[TMP5]]
 ;
   %a0 = extractelement <4 x float> %a, i32 0
   %b0 = extractelement <4 x float> %b, i32 0
@@ -525,23 +513,11 @@ define <4 x float> @take_credit(<4 x float> %a, <4 x float> %b) {
 ;
 ; MINTREESIZE-LABEL: @take_credit(
 ; MINTREESIZE-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[B:%.*]], i32 3
-; MINTREESIZE-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[B]], i32 2
-; MINTREESIZE-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[B]], i32 1
-; MINTREESIZE-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[B]], i32 0
-; MINTREESIZE-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[A:%.*]], i32 3
-; MINTREESIZE-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[A]], i32 2
-; MINTREESIZE-NEXT:    [[TMP7:%.*]] = extractelement <4 x float> [[A]], i32 1
-; MINTREESIZE-NEXT:    [[TMP8:%.*]] = extractelement <4 x float> [[A]], i32 0
-; MINTREESIZE-NEXT:    [[TMP9:%.*]] = insertelement <2 x float> poison, float [[TMP8]], i32 0
-; MINTREESIZE-NEXT:    [[TMP10:%.*]] = insertelement <2 x float> [[TMP9]], float [[TMP4]], i32 1
-; MINTREESIZE-NEXT:    [[TMP11:%.*]] = insertelement <2 x float> poison, float [[TMP7]], i32 0
-; MINTREESIZE-NEXT:    [[TMP12:%.*]] = insertelement <2 x float> [[TMP11]], float [[TMP3]], i32 1
-; MINTREESIZE-NEXT:    [[TMP13:%.*]] = insertelement <2 x float> poison, float [[TMP6]], i32 0
-; MINTREESIZE-NEXT:    [[TMP14:%.*]] = insertelement <2 x float> [[TMP13]], float [[TMP2]], i32 1
-; MINTREESIZE-NEXT:    [[TMP15:%.*]] = insertelement <2 x float> poison, float [[TMP5]], i32 0
-; MINTREESIZE-NEXT:    [[TMP16:%.*]] = insertelement <2 x float> [[TMP15]], float [[TMP1]], i32 1
-; MINTREESIZE-NEXT:    [[TMP17:%.*]] = fadd <4 x float> [[A]], [[B]]
-; MINTREESIZE-NEXT:    ret <4 x float> [[TMP17]]
+; MINTREESIZE-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[A:%.*]], i32 3
+; MINTREESIZE-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> poison, float [[TMP2]], i32 0
+; MINTREESIZE-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[TMP1]], i32 1
+; MINTREESIZE-NEXT:    [[TMP5:%.*]] = fadd <4 x float> [[A]], [[B]]
+; MINTREESIZE-NEXT:    ret <4 x float> [[TMP5]]
 ;
   %a0 = extractelement <4 x float> %a, i32 0
   %b0 = extractelement <4 x float> %b, i32 0
@@ -599,39 +575,11 @@ define <8 x float> @_vadd256(<8 x float> %a, <8 x float> %b) local_unnamed_addr
 ;
 ; MINTREESIZE-LABEL: @_vadd256(
 ; MINTREESIZE-NEXT:    [[TMP1:%.*]] = extractelement <8 x float> [[B:%.*]], i32 7
-; MINTREESIZE-NEXT:    [[TMP2:%.*]] = extractelement <8 x float> [[B]], i32 6
-; MINTREESIZE-NEXT:    [[TMP3:%.*]] = extractelement <8 x float> [[B]], i32 5
-; MINTREESIZE-NEXT:    [[TMP4:%.*]] = extractelement <8 x float> [[B]], i32 4
-; MINTREESIZE-NEXT:    [[TMP5:%.*]] = extractelement <8 x float> [[B]], i32 3
-; MINTREESIZE-NEXT:    [[TMP6:%.*]] = extractelement <8 x float> [[B]], i32 2
-; MINTREESIZE-NEXT:    [[TMP7:%.*]] = extractelement <8 x float> [[B]], i32 1
-; MINTREESIZE-NEXT:    [[TMP8:%.*]] = extractelement <8 x float> [[B]], i32 0
-; MINTREESIZE-NEXT:    [[TMP9:%.*]] = extractelement <8 x float> [[A:%.*]], i32 7
-; MINTREESIZE-NEXT:    [[TMP10:%.*]] = extractelement <8 x float> [[A]], i32 6
-; MINTREESIZE-NEXT:    [[TMP11:%.*]] = extractelement <8 x float> [[A]], i32 5
-; MINTREESIZE-NEXT:    [[TMP12:%.*]] = extractelement <8 x float> [[A]], i32 4
-; MINTREESIZE-NEXT:    [[TMP13:%.*]] = extractelement <8 x float> [[A]], i32 3
-; MINTREESIZE-NEXT:    [[TMP14:%.*]] = extractelement <8 x float> [[A]], i32 2
-; MINTREESIZE-NEXT:    [[TMP15:%.*]] = extractelement <8 x float> [[A]], i32 1
-; MINTREESIZE-NEXT:    [[TMP16:%.*]] = extractelement <8 x float> [[A]], i32 0
-; MINTREESIZE-NEXT:    [[TMP17:%.*]] = insertelement <2 x float> poison, float [[TMP16]], i32 0
-; MINTREESIZE-NEXT:    [[TMP18:%.*]] = insertelement <2 x float> [[TMP17]], float [[TMP8]], i32 1
-; MINTREESIZE-NEXT:    [[TMP19:%.*]] = insertelement <2 x float> poison, float [[TMP15]], i32 0
-; MINTREESIZE-NEXT:    [[TMP20:%.*]] = insertelement <2 x float> [[TMP19]], float [[TMP7]], i32 1
-; MINTREESIZE-NEXT:    [[TMP21:%.*]] = insertelement <2 x float> poison, float [[TMP14]], i32 0
-; MINTREESIZE-NEXT:    [[TMP22:%.*]] = insertelement <2 x float> [[TMP21]], float [[TMP6]], i32 1
-; MINTREESIZE-NEXT:    [[TMP23:%.*]] = insertelement <2 x float> poison, float [[TMP13]], i32 0
-; MINTREESIZE-NEXT:    [[TMP24:%.*]] = insertelement <2 x float> [[TMP23]], float [[TMP5]], i32 1
-; MINTREESIZE-NEXT:    [[TMP25:%.*]] = insertelement <2 x float> poison, float [[TMP12]], i32 0
-; MINTREESIZE-NEXT:    [[TMP26:%.*]] = insertelement <2 x float> [[TMP25]], float [[TMP4]], i32 1
-; MINTREESIZE-NEXT:    [[TMP27:%.*]] = insertelement <2 x float> poison, float [[TMP11]], i32 0
-; MINTREESIZE-NEXT:    [[TMP28:%.*]] = insertelement <2 x float> [[TMP27]], float [[TMP3]], i32 1
-; MINTREESIZE-NEXT:    [[TMP29:%.*]] = insertelement <2 x float> poison, float [[TMP10]], i32 0
-; MINTREESIZE-NEXT:    [[TMP30:%.*]] = insertelement <2 x float> [[TMP29]], float [[TMP2]], i32 1
-; MINTREESIZE-NEXT:    [[TMP31:%.*]] = insertelement <2 x float> poison, float [[TMP9]], i32 0
-; MINTREESIZE-NEXT:    [[TMP32:%.*]] = insertelement <2 x float> [[TMP31]], float [[TMP1]], i32 1
-; MINTREESIZE-NEXT:    [[TMP33:%.*]] = fadd <8 x float> [[A]], [[B]]
-; MINTREESIZE-NEXT:    ret <8 x float> [[TMP33]]
+; MINTREESIZE-NEXT:    [[TMP2:%.*]] = extractelement <8 x float> [[A:%.*]], i32 7
+; MINTREESIZE-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> poison, float [[TMP2]], i32 0
+; MINTREESIZE-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[TMP1]], i32 1
+; MINTREESIZE-NEXT:    [[TMP5:%.*]] = fadd <8 x float> [[A]], [[B]]
+; MINTREESIZE-NEXT:    ret <8 x float> [[TMP5]]
 ;
   %vecext = extractelement <8 x float> %a, i32 0
   %vecext1 = extractelement <8 x float> %b, i32 0

diff --git a/llvm/test/Transforms/SLPVectorizer/X86/malformed_phis.ll b/llvm/test/Transforms/SLPVectorizer/X86/malformed_phis.ll
index 0cfd620729524..4aa4ce16cf7be 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/malformed_phis.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/malformed_phis.ll
@@ -12,23 +12,25 @@ define void @test() #0 {
 ; CHECK-NEXT:    br label [[BB1:%.*]]
 ; CHECK:       bb1:
 ; CHECK-NEXT:    [[TMP:%.*]] = phi i32 [ undef, [[BB1]] ], [ undef, [[BB:%.*]] ]
-; CHECK-NEXT:    [[TMP2:%.*]] = phi i32 [ [[TMP18:%.*]], [[BB1]] ], [ undef, [[BB]] ]
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i32 undef, [[TMP]]
-; CHECK-NEXT:    [[TMP4:%.*]] = mul i32 [[TMP3]], [[TMP]]
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i32 [[TMP4]], [[TMP]]
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i32 [[TMP5]], [[TMP]]
-; CHECK-NEXT:    [[TMP7:%.*]] = mul i32 [[TMP6]], [[TMP]]
-; CHECK-NEXT:    [[TMP8:%.*]] = mul i32 [[TMP7]], [[TMP]]
-; CHECK-NEXT:    [[TMP9:%.*]] = mul i32 [[TMP8]], [[TMP]]
-; CHECK-NEXT:    [[TMP10:%.*]] = mul i32 [[TMP9]], [[TMP]]
-; CHECK-NEXT:    [[TMP11:%.*]] = mul i32 [[TMP10]], [[TMP]]
-; CHECK-NEXT:    [[TMP12:%.*]] = mul i32 [[TMP11]], [[TMP]]
-; CHECK-NEXT:    [[TMP13:%.*]] = mul i32 [[TMP12]], [[TMP]]
-; CHECK-NEXT:    [[TMP14:%.*]] = mul i32 [[TMP13]], [[TMP]]
-; CHECK-NEXT:    [[TMP15:%.*]] = mul i32 [[TMP14]], [[TMP]]
-; CHECK-NEXT:    [[TMP16:%.*]] = mul i32 [[TMP15]], [[TMP]]
-; CHECK-NEXT:    [[TMP17:%.*]] = mul i32 [[TMP16]], [[TMP]]
-; CHECK-NEXT:    [[TMP18]] = mul i32 [[TMP17]], [[TMP]]
+; CHECK-NEXT:    [[TMP2:%.*]] = phi i32 [ [[OP_RDX:%.*]], [[BB1]] ], [ undef, [[BB]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <16 x i32> poison, i32 [[TMP]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <16 x i32> [[TMP0]], i32 [[TMP]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <16 x i32> [[TMP1]], i32 [[TMP]], i32 2
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <16 x i32> [[TMP2]], i32 [[TMP]], i32 3
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <16 x i32> [[TMP3]], i32 [[TMP]], i32 4
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <16 x i32> [[TMP4]], i32 [[TMP]], i32 5
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <16 x i32> [[TMP5]], i32 [[TMP]], i32 6
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <16 x i32> [[TMP6]], i32 [[TMP]], i32 7
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <16 x i32> [[TMP7]], i32 [[TMP]], i32 8
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <16 x i32> [[TMP8]], i32 [[TMP]], i32 9
+; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <16 x i32> [[TMP9]], i32 [[TMP]], i32 10
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <16 x i32> [[TMP10]], i32 [[TMP]], i32 11
+; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <16 x i32> [[TMP11]], i32 [[TMP]], i32 12
+; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <16 x i32> [[TMP12]], i32 [[TMP]], i32 13
+; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <16 x i32> [[TMP13]], i32 [[TMP]], i32 14
+; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <16 x i32> [[TMP14]], i32 [[TMP]], i32 15
+; CHECK-NEXT:    [[TMP16:%.*]] = call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> [[TMP15]])
+; CHECK-NEXT:    [[OP_RDX]] = mul i32 [[TMP16]], undef
 ; CHECK-NEXT:    br label [[BB1]]
 ;
 bb:
@@ -63,23 +65,19 @@ define void @test_2(i8 addrspace(1)* %arg, i32 %arg1) #0 {
 ; CHECK:       bb2:
 ; CHECK-NEXT:    [[TMP:%.*]] = phi i32 [ undef, [[BB:%.*]] ], [ undef, [[BB2]] ]
 ; CHECK-NEXT:    [[TMP3:%.*]] = phi i32 [ 0, [[BB]] ], [ undef, [[BB2]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[TMP]], undef
-; CHECK-NEXT:    [[TMP5:%.*]] = add i32 undef, [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = add i32 [[TMP]], [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = add i32 undef, [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = add i32 [[TMP]], [[TMP7]]
-; CHECK-NEXT:    [[TMP9:%.*]] = add i32 undef, [[TMP8]]
-; CHECK-NEXT:    [[TMP10:%.*]] = add i32 [[TMP]], [[TMP9]]
-; CHECK-NEXT:    [[TMP11:%.*]] = add i32 undef, [[TMP10]]
-; CHECK-NEXT:    [[TMP12:%.*]] = add i32 [[TMP]], [[TMP11]]
-; CHECK-NEXT:    [[TMP13:%.*]] = add i32 undef, [[TMP12]]
-; CHECK-NEXT:    [[TMP14:%.*]] = add i32 [[TMP]], [[TMP13]]
-; CHECK-NEXT:    [[TMP15:%.*]] = add i32 undef, [[TMP14]]
-; CHECK-NEXT:    [[TMP16:%.*]] = add i32 [[TMP]], [[TMP15]]
-; CHECK-NEXT:    [[TMP17:%.*]] = add i32 undef, [[TMP16]]
-; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP]], [[TMP17]]
-; CHECK-NEXT:    [[TMP19:%.*]] = add i32 undef, [[TMP18]]
-; CHECK-NEXT:    call void @use(i32 [[TMP19]])
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <8 x i32> poison, i32 [[TMP]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> [[TMP0]], i32 [[TMP]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[TMP]], i32 2
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[TMP]], i32 3
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[TMP]], i32 4
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[TMP]], i32 5
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[TMP]], i32 6
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[TMP]], i32 7
+; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef)
+; CHECK-NEXT:    [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP7]])
+; CHECK-NEXT:    [[OP_RDX:%.*]] = add i32 [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[OP_RDX1:%.*]] = add i32 [[OP_RDX]], undef
+; CHECK-NEXT:    call void @use(i32 [[OP_RDX1]])
 ; CHECK-NEXT:    br label [[BB2]]
 ;
 bb:

diff --git a/llvm/test/Transforms/SLPVectorizer/X86/matched-shuffled-entries.ll b/llvm/test/Transforms/SLPVectorizer/X86/matched-shuffled-entries.ll
index 90a259f5efbc9..472ec767f540f 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/matched-shuffled-entries.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/matched-shuffled-entries.ll
@@ -10,29 +10,28 @@ define i32 @bar() local_unnamed_addr {
 ; CHECK-NEXT:    [[SUB102_1:%.*]] = sub nsw i32 undef, undef
 ; CHECK-NEXT:    [[ADD78_2:%.*]] = add nsw i32 undef, undef
 ; CHECK-NEXT:    [[SUB102_3:%.*]] = sub nsw i32 undef, undef
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <16 x i32> poison, i32 [[SUB102_3]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <16 x i32> [[TMP0]], i32 [[SUB102_1]], i32 1
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <16 x i32> [[TMP1]], i32 [[ADD94_1]], i32 2
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <16 x i32> [[TMP2]], i32 [[ADD78_1]], i32 3
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <16 x i32> [[TMP3]], i32 [[SUB86_1]], i32 4
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <16 x i32> [[TMP4]], i32 [[ADD78_2]], i32 5
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> poison, <16 x i32> <i32 0, i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 2, i32 3, i32 4, i32 5, i32 5, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <16 x i32> poison, i32 [[SUB86_1]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <16 x i32> [[TMP6]], i32 [[ADD78_1]], i32 1
-; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <16 x i32> [[TMP7]], i32 [[ADD94_1]], i32 2
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <16 x i32> [[TMP8]], i32 [[SUB102_1]], i32 3
-; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <16 x i32> [[TMP9]], i32 [[SUB102_3]], i32 4
-; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <16 x i32> [[TMP10]], <16 x i32> poison, <16 x i32> <i32 undef, i32 undef, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 4>
-; CHECK-NEXT:    [[TMP11:%.*]] = add nsw <16 x i32> [[SHUFFLE]], [[SHUFFLE1]]
-; CHECK-NEXT:    [[TMP12:%.*]] = sub nsw <16 x i32> [[SHUFFLE]], [[SHUFFLE1]]
-; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <16 x i32> [[TMP11]], <16 x i32> [[TMP12]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 21, i32 22, i32 7, i32 24, i32 25, i32 10, i32 27, i32 28, i32 13, i32 30, i32 31>
-; CHECK-NEXT:    [[TMP14:%.*]] = lshr <16 x i32> [[TMP13]], <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; CHECK-NEXT:    [[TMP15:%.*]] = and <16 x i32> [[TMP14]], <i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537>
-; CHECK-NEXT:    [[TMP16:%.*]] = mul nuw <16 x i32> [[TMP15]], <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
-; CHECK-NEXT:    [[TMP17:%.*]] = add <16 x i32> [[TMP16]], [[TMP13]]
-; CHECK-NEXT:    [[TMP18:%.*]] = xor <16 x i32> [[TMP17]], [[TMP16]]
-; CHECK-NEXT:    [[TMP19:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP18]])
-; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[TMP19]], 16
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <16 x i32> poison, i32 [[SUB102_1]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <16 x i32> [[TMP0]], i32 [[ADD94_1]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <16 x i32> [[TMP1]], i32 [[ADD78_1]], i32 2
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <16 x i32> [[TMP2]], i32 [[SUB86_1]], i32 3
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <16 x i32> [[TMP3]], i32 [[ADD78_2]], i32 4
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <16 x i32> [[TMP4]], <16 x i32> poison, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 1, i32 2, i32 3, i32 undef, i32 4, i32 4, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <16 x i32> poison, i32 [[SUB86_1]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <16 x i32> [[TMP5]], i32 [[ADD78_1]], i32 1
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <16 x i32> [[TMP6]], i32 [[ADD94_1]], i32 2
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <16 x i32> [[TMP7]], i32 [[SUB102_1]], i32 3
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <16 x i32> [[TMP8]], i32 [[SUB102_3]], i32 4
+; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <16 x i32> [[TMP9]], <16 x i32> poison, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 4, i32 undef, i32 undef, i32 4>
+; CHECK-NEXT:    [[TMP10:%.*]] = add nsw <16 x i32> [[SHUFFLE]], [[SHUFFLE1]]
+; CHECK-NEXT:    [[TMP11:%.*]] = sub nsw <16 x i32> [[SHUFFLE]], [[SHUFFLE1]]
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <16 x i32> [[TMP10]], <16 x i32> [[TMP11]], <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 4, i32 5, i32 22, i32 23, i32 8, i32 9, i32 26, i32 27, i32 12, i32 13, i32 30, i32 31>
+; CHECK-NEXT:    [[TMP13:%.*]] = lshr <16 x i32> [[TMP12]], <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; CHECK-NEXT:    [[TMP14:%.*]] = and <16 x i32> [[TMP13]], <i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537>
+; CHECK-NEXT:    [[TMP15:%.*]] = mul nuw <16 x i32> [[TMP14]], <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
+; CHECK-NEXT:    [[TMP16:%.*]] = add <16 x i32> [[TMP15]], [[TMP12]]
+; CHECK-NEXT:    [[TMP17:%.*]] = xor <16 x i32> [[TMP16]], [[TMP15]]
+; CHECK-NEXT:    [[TMP18:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP17]])
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[TMP18]], 16
 ; CHECK-NEXT:    [[ADD119:%.*]] = add nuw nsw i32 undef, [[SHR]]
 ; CHECK-NEXT:    [[SHR120:%.*]] = lshr i32 [[ADD119]], 1
 ; CHECK-NEXT:    ret i32 [[SHR120]]

diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
index 460f222763b8c..5769a293d0357 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
@@ -230,17 +230,12 @@ define i1 @logical_and_icmp_clamp(<4 x i32> %x) {
 ; CHECK-LABEL: @logical_and_icmp_clamp(
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], <i32 42, i32 42, i32 42, i32 42>
 ; CHECK-NEXT:    [[TMP2:%.*]] = icmp sgt <4 x i32> [[X]], <i32 17, i32 17, i32 17, i32 17>
-; CHECK-NEXT:    [[TMP3:%.*]] = freeze <4 x i1> [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = freeze <4 x i1> [[TMP2]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP3]])
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i1> [[TMP2]], i32 0
-; CHECK-NEXT:    [[S4:%.*]] = select i1 [[TMP4]], i1 [[TMP5]], i1 false
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i1> [[TMP2]], i32 1
-; CHECK-NEXT:    [[S5:%.*]] = select i1 [[S4]], i1 [[TMP6]], i1 false
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i1> [[TMP2]], i32 2
-; CHECK-NEXT:    [[S6:%.*]] = select i1 [[S5]], i1 [[TMP7]], i1 false
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x i1> [[TMP2]], i32 3
-; CHECK-NEXT:    [[S7:%.*]] = select i1 [[S6]], i1 [[TMP8]], i1 false
-; CHECK-NEXT:    ret i1 [[S7]]
+; CHECK-NEXT:    [[TMP5:%.*]] = freeze <4 x i1> [[TMP1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP5]])
+; CHECK-NEXT:    [[OP_RDX:%.*]] = select i1 [[TMP4]], i1 [[TMP6]], i1 false
+; CHECK-NEXT:    ret i1 [[OP_RDX]]
 ;
   %x0 = extractelement <4 x i32> %x, i32 0
   %x1 = extractelement <4 x i32> %x, i32 1
@@ -265,53 +260,17 @@ define i1 @logical_and_icmp_clamp(<4 x i32> %x) {
 }
 
 define i1 @logical_and_icmp_clamp_extra_use_cmp(<4 x i32> %x) {
-; SSE-LABEL: @logical_and_icmp_clamp_extra_use_cmp(
-; SSE-NEXT:    [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0
-; SSE-NEXT:    [[X1:%.*]] = extractelement <4 x i32> [[X]], i32 1
-; SSE-NEXT:    [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2
-; SSE-NEXT:    [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3
-; SSE-NEXT:    [[C2:%.*]] = icmp slt i32 [[X2]], 42
-; SSE-NEXT:    call void @use1(i1 [[C2]])
-; SSE-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[X0]], i32 0
-; SSE-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[X1]], i32 1
-; SSE-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[X3]], i32 2
-; SSE-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[X0]], i32 3
-; SSE-NEXT:    [[TMP5:%.*]] = icmp slt <4 x i32> [[TMP4]], <i32 42, i32 42, i32 42, i32 17>
-; SSE-NEXT:    [[TMP6:%.*]] = icmp sgt <4 x i32> [[TMP4]], <i32 42, i32 42, i32 42, i32 17>
-; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i1> [[TMP5]], <4 x i1> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
-; SSE-NEXT:    [[D1:%.*]] = icmp sgt i32 [[X1]], 17
-; SSE-NEXT:    [[D2:%.*]] = icmp sgt i32 [[X2]], 17
-; SSE-NEXT:    [[D3:%.*]] = icmp sgt i32 [[X3]], 17
-; SSE-NEXT:    [[TMP8:%.*]] = freeze <4 x i1> [[TMP7]]
-; SSE-NEXT:    [[TMP9:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP8]])
-; SSE-NEXT:    [[OP_EXTRA:%.*]] = select i1 [[TMP9]], i1 [[C2]], i1 false
-; SSE-NEXT:    [[S5:%.*]] = select i1 [[OP_EXTRA]], i1 [[D1]], i1 false
-; SSE-NEXT:    [[S6:%.*]] = select i1 [[S5]], i1 [[D2]], i1 false
-; SSE-NEXT:    [[S7:%.*]] = select i1 [[S6]], i1 [[D3]], i1 false
-; SSE-NEXT:    ret i1 [[S7]]
-;
-; AVX-LABEL: @logical_and_icmp_clamp_extra_use_cmp(
-; AVX-NEXT:    [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0
-; AVX-NEXT:    [[X1:%.*]] = extractelement <4 x i32> [[X]], i32 1
-; AVX-NEXT:    [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2
-; AVX-NEXT:    [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3
-; AVX-NEXT:    [[C0:%.*]] = icmp slt i32 [[X0]], 42
-; AVX-NEXT:    [[C1:%.*]] = icmp slt i32 [[X1]], 42
-; AVX-NEXT:    [[C2:%.*]] = icmp slt i32 [[X2]], 42
-; AVX-NEXT:    call void @use1(i1 [[C2]])
-; AVX-NEXT:    [[C3:%.*]] = icmp slt i32 [[X3]], 42
-; AVX-NEXT:    [[D0:%.*]] = icmp sgt i32 [[X0]], 17
-; AVX-NEXT:    [[D1:%.*]] = icmp sgt i32 [[X1]], 17
-; AVX-NEXT:    [[D2:%.*]] = icmp sgt i32 [[X2]], 17
-; AVX-NEXT:    [[D3:%.*]] = icmp sgt i32 [[X3]], 17
-; AVX-NEXT:    [[S1:%.*]] = select i1 [[C0]], i1 [[C1]], i1 false
-; AVX-NEXT:    [[S2:%.*]] = select i1 [[S1]], i1 [[C2]], i1 false
-; AVX-NEXT:    [[S3:%.*]] = select i1 [[S2]], i1 [[C3]], i1 false
-; AVX-NEXT:    [[S4:%.*]] = select i1 [[S3]], i1 [[D0]], i1 false
-; AVX-NEXT:    [[S5:%.*]] = select i1 [[S4]], i1 [[D1]], i1 false
-; AVX-NEXT:    [[S6:%.*]] = select i1 [[S5]], i1 [[D2]], i1 false
-; AVX-NEXT:    [[S7:%.*]] = select i1 [[S6]], i1 [[D3]], i1 false
-; AVX-NEXT:    ret i1 [[S7]]
+; CHECK-LABEL: @logical_and_icmp_clamp_extra_use_cmp(
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], <i32 42, i32 42, i32 42, i32 42>
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2
+; CHECK-NEXT:    call void @use1(i1 [[TMP2]])
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp sgt <4 x i32> [[X]], <i32 17, i32 17, i32 17, i32 17>
+; CHECK-NEXT:    [[TMP4:%.*]] = freeze <4 x i1> [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP4]])
+; CHECK-NEXT:    [[TMP6:%.*]] = freeze <4 x i1> [[TMP1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP6]])
+; CHECK-NEXT:    [[OP_RDX:%.*]] = select i1 [[TMP5]], i1 [[TMP7]], i1 false
+; CHECK-NEXT:    ret i1 [[OP_RDX]]
 ;
   %x0 = extractelement <4 x i32> %x, i32 0
   %x1 = extractelement <4 x i32> %x, i32 1
@@ -338,27 +297,20 @@ define i1 @logical_and_icmp_clamp_extra_use_cmp(<4 x i32> %x) {
 
 define i1 @logical_and_icmp_clamp_extra_use_select(<4 x i32> %x) {
 ; CHECK-LABEL: @logical_and_icmp_clamp_extra_use_select(
-; CHECK-NEXT:    [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0
-; CHECK-NEXT:    [[X1:%.*]] = extractelement <4 x i32> [[X]], i32 1
-; CHECK-NEXT:    [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2
-; CHECK-NEXT:    [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3
-; CHECK-NEXT:    [[C0:%.*]] = icmp slt i32 [[X0]], 42
-; CHECK-NEXT:    [[C1:%.*]] = icmp slt i32 [[X1]], 42
-; CHECK-NEXT:    [[C2:%.*]] = icmp slt i32 [[X2]], 42
-; CHECK-NEXT:    [[C3:%.*]] = icmp slt i32 [[X3]], 42
-; CHECK-NEXT:    [[D0:%.*]] = icmp sgt i32 [[X0]], 17
-; CHECK-NEXT:    [[D1:%.*]] = icmp sgt i32 [[X1]], 17
-; CHECK-NEXT:    [[D2:%.*]] = icmp sgt i32 [[X2]], 17
-; CHECK-NEXT:    [[D3:%.*]] = icmp sgt i32 [[X3]], 17
-; CHECK-NEXT:    [[S1:%.*]] = select i1 [[C0]], i1 [[C1]], i1 false
-; CHECK-NEXT:    [[S2:%.*]] = select i1 [[S1]], i1 [[C2]], i1 false
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], <i32 42, i32 42, i32 42, i32 42>
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp sgt <4 x i32> [[X]], <i32 17, i32 17, i32 17, i32 17>
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i1> [[TMP1]], i32 1
+; CHECK-NEXT:    [[S1:%.*]] = select i1 [[TMP3]], i1 [[TMP4]], i1 false
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2
+; CHECK-NEXT:    [[S2:%.*]] = select i1 [[S1]], i1 [[TMP5]], i1 false
 ; CHECK-NEXT:    call void @use1(i1 [[S2]])
-; CHECK-NEXT:    [[S3:%.*]] = select i1 [[S2]], i1 [[C3]], i1 false
-; CHECK-NEXT:    [[S4:%.*]] = select i1 [[S3]], i1 [[D0]], i1 false
-; CHECK-NEXT:    [[S5:%.*]] = select i1 [[S4]], i1 [[D1]], i1 false
-; CHECK-NEXT:    [[S6:%.*]] = select i1 [[S5]], i1 [[D2]], i1 false
-; CHECK-NEXT:    [[S7:%.*]] = select i1 [[S6]], i1 [[D3]], i1 false
-; CHECK-NEXT:    ret i1 [[S7]]
+; CHECK-NEXT:    [[TMP6:%.*]] = freeze <4 x i1> [[TMP2]]
+; CHECK-NEXT:    [[TMP7:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP6]])
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x i1> [[TMP1]], i32 3
+; CHECK-NEXT:    [[OP_RDX:%.*]] = select i1 [[TMP7]], i1 [[TMP8]], i1 false
+; CHECK-NEXT:    [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i1 [[S2]], i1 false
+; CHECK-NEXT:    ret i1 [[OP_RDX1]]
 ;
   %x0 = extractelement <4 x i32> %x, i32 0
   %x1 = extractelement <4 x i32> %x, i32 1
@@ -393,11 +345,11 @@ define i1 @logical_and_icmp_clamp_v8i32(<8 x i32> %x, <8 x i32> %y) {
 ; CHECK-NEXT:    [[Y1:%.*]] = extractelement <8 x i32> [[Y]], i32 1
 ; CHECK-NEXT:    [[Y2:%.*]] = extractelement <8 x i32> [[Y]], i32 2
 ; CHECK-NEXT:    [[Y3:%.*]] = extractelement <8 x i32> [[Y]], i32 3
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[X0]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[X1]], i32 1
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[X1]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[X0]], i32 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[X2]], i32 2
 ; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[X3]], i32 3
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 1, i32 0, i32 2, i32 3>
 ; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> <i32 42, i32 42, i32 42, i32 42, i32 poison, i32 poison, i32 poison, i32 poison>, i32 [[Y0]], i32 4
 ; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[Y1]], i32 5
 ; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[Y2]], i32 6
@@ -435,46 +387,37 @@ define i1 @logical_and_icmp_clamp_v8i32(<8 x i32> %x, <8 x i32> %y) {
 
 define i1 @logical_and_icmp_clamp_partial(<4 x i32> %x) {
 ; SSE-LABEL: @logical_and_icmp_clamp_partial(
-; SSE-NEXT:    [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0
-; SSE-NEXT:    [[X1:%.*]] = extractelement <4 x i32> [[X]], i32 1
-; SSE-NEXT:    [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2
-; SSE-NEXT:    [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3
-; SSE-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[X0]], i32 0
-; SSE-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[X1]], i32 1
-; SSE-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[X2]], i32 2
-; SSE-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[X0]], i32 3
-; SSE-NEXT:    [[TMP5:%.*]] = icmp slt <4 x i32> [[TMP4]], <i32 42, i32 42, i32 42, i32 17>
-; SSE-NEXT:    [[TMP6:%.*]] = icmp sgt <4 x i32> [[TMP4]], <i32 42, i32 42, i32 42, i32 17>
-; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i1> [[TMP5]], <4 x i1> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
-; SSE-NEXT:    [[D1:%.*]] = icmp sgt i32 [[X1]], 17
-; SSE-NEXT:    [[D2:%.*]] = icmp sgt i32 [[X2]], 17
-; SSE-NEXT:    [[D3:%.*]] = icmp sgt i32 [[X3]], 17
+; SSE-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 2
+; SSE-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[X]], i32 1
+; SSE-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[X]], i32 0
+; SSE-NEXT:    [[TMP4:%.*]] = insertelement <2 x i32> poison, i32 [[TMP2]], i32 0
+; SSE-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> [[TMP4]], i32 [[TMP3]], i32 1
+; SSE-NEXT:    [[TMP6:%.*]] = icmp slt <2 x i32> [[TMP5]], <i32 42, i32 42>
+; SSE-NEXT:    [[C2:%.*]] = icmp slt i32 [[TMP1]], 42
+; SSE-NEXT:    [[TMP7:%.*]] = icmp sgt <4 x i32> [[X]], <i32 17, i32 17, i32 17, i32 17>
 ; SSE-NEXT:    [[TMP8:%.*]] = freeze <4 x i1> [[TMP7]]
 ; SSE-NEXT:    [[TMP9:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP8]])
-; SSE-NEXT:    [[S5:%.*]] = select i1 [[TMP9]], i1 [[D1]], i1 false
-; SSE-NEXT:    [[S6:%.*]] = select i1 [[S5]], i1 [[D2]], i1 false
-; SSE-NEXT:    [[S7:%.*]] = select i1 [[S6]], i1 [[D3]], i1 false
-; SSE-NEXT:    ret i1 [[S7]]
+; SSE-NEXT:    [[TMP10:%.*]] = extractelement <2 x i1> [[TMP6]], i32 0
+; SSE-NEXT:    [[OP_RDX:%.*]] = select i1 [[TMP9]], i1 [[TMP10]], i1 false
+; SSE-NEXT:    [[TMP11:%.*]] = extractelement <2 x i1> [[TMP6]], i32 1
+; SSE-NEXT:    [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i1 [[TMP11]], i1 false
+; SSE-NEXT:    [[OP_RDX2:%.*]] = select i1 [[OP_RDX1]], i1 [[C2]], i1 false
+; SSE-NEXT:    ret i1 [[OP_RDX2]]
 ;
 ; AVX-LABEL: @logical_and_icmp_clamp_partial(
-; AVX-NEXT:    [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0
-; AVX-NEXT:    [[X1:%.*]] = extractelement <4 x i32> [[X]], i32 1
-; AVX-NEXT:    [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2
-; AVX-NEXT:    [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3
-; AVX-NEXT:    [[C0:%.*]] = icmp slt i32 [[X0]], 42
-; AVX-NEXT:    [[C1:%.*]] = icmp slt i32 [[X1]], 42
-; AVX-NEXT:    [[C2:%.*]] = icmp slt i32 [[X2]], 42
-; AVX-NEXT:    [[D0:%.*]] = icmp sgt i32 [[X0]], 17
-; AVX-NEXT:    [[D1:%.*]] = icmp sgt i32 [[X1]], 17
-; AVX-NEXT:    [[D2:%.*]] = icmp sgt i32 [[X2]], 17
-; AVX-NEXT:    [[D3:%.*]] = icmp sgt i32 [[X3]], 17
-; AVX-NEXT:    [[S1:%.*]] = select i1 [[C0]], i1 [[C1]], i1 false
-; AVX-NEXT:    [[S2:%.*]] = select i1 [[S1]], i1 [[C2]], i1 false
-; AVX-NEXT:    [[S4:%.*]] = select i1 [[S2]], i1 [[D0]], i1 false
-; AVX-NEXT:    [[S5:%.*]] = select i1 [[S4]], i1 [[D1]], i1 false
-; AVX-NEXT:    [[S6:%.*]] = select i1 [[S5]], i1 [[D2]], i1 false
-; AVX-NEXT:    [[S7:%.*]] = select i1 [[S6]], i1 [[D3]], i1 false
-; AVX-NEXT:    ret i1 [[S7]]
+; AVX-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 2
+; AVX-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[X]], i32 1
+; AVX-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[X]], i32 0
+; AVX-NEXT:    [[C0:%.*]] = icmp slt i32 [[TMP3]], 42
+; AVX-NEXT:    [[C1:%.*]] = icmp slt i32 [[TMP2]], 42
+; AVX-NEXT:    [[C2:%.*]] = icmp slt i32 [[TMP1]], 42
+; AVX-NEXT:    [[TMP4:%.*]] = icmp sgt <4 x i32> [[X]], <i32 17, i32 17, i32 17, i32 17>
+; AVX-NEXT:    [[TMP5:%.*]] = freeze <4 x i1> [[TMP4]]
+; AVX-NEXT:    [[TMP6:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP5]])
+; AVX-NEXT:    [[OP_RDX:%.*]] = select i1 [[TMP6]], i1 [[C1]], i1 false
+; AVX-NEXT:    [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i1 [[C0]], i1 false
+; AVX-NEXT:    [[OP_RDX2:%.*]] = select i1 [[OP_RDX1]], i1 [[C2]], i1 false
+; AVX-NEXT:    ret i1 [[OP_RDX2]]
 ;
   %x0 = extractelement <4 x i32> %x, i32 0
   %x1 = extractelement <4 x i32> %x, i32 1
@@ -499,44 +442,17 @@ define i1 @logical_and_icmp_clamp_partial(<4 x i32> %x) {
 }
 
 define i1 @logical_and_icmp_clamp_pred_diff(<4 x i32> %x) {
-; SSE-LABEL: @logical_and_icmp_clamp_pred_diff(
-; SSE-NEXT:    [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], <i32 42, i32 42, i32 42, i32 42>
-; SSE-NEXT:    [[TMP2:%.*]] = icmp ult <4 x i32> [[X]], <i32 42, i32 42, i32 42, i32 42>
-; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i1> [[TMP1]], <4 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
-; SSE-NEXT:    [[TMP4:%.*]] = icmp sgt <4 x i32> [[X]], <i32 17, i32 17, i32 17, i32 17>
-; SSE-NEXT:    [[TMP5:%.*]] = freeze <4 x i1> [[TMP3]]
-; SSE-NEXT:    [[TMP6:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP5]])
-; SSE-NEXT:    [[TMP7:%.*]] = extractelement <4 x i1> [[TMP4]], i32 0
-; SSE-NEXT:    [[S4:%.*]] = select i1 [[TMP6]], i1 [[TMP7]], i1 false
-; SSE-NEXT:    [[TMP8:%.*]] = extractelement <4 x i1> [[TMP4]], i32 1
-; SSE-NEXT:    [[S5:%.*]] = select i1 [[S4]], i1 [[TMP8]], i1 false
-; SSE-NEXT:    [[TMP9:%.*]] = extractelement <4 x i1> [[TMP4]], i32 2
-; SSE-NEXT:    [[S6:%.*]] = select i1 [[S5]], i1 [[TMP9]], i1 false
-; SSE-NEXT:    [[TMP10:%.*]] = extractelement <4 x i1> [[TMP4]], i32 3
-; SSE-NEXT:    [[S7:%.*]] = select i1 [[S6]], i1 [[TMP10]], i1 false
-; SSE-NEXT:    ret i1 [[S7]]
-;
-; AVX-LABEL: @logical_and_icmp_clamp_pred_diff(
-; AVX-NEXT:    [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0
-; AVX-NEXT:    [[X1:%.*]] = extractelement <4 x i32> [[X]], i32 1
-; AVX-NEXT:    [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2
-; AVX-NEXT:    [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3
-; AVX-NEXT:    [[C0:%.*]] = icmp slt i32 [[X0]], 42
-; AVX-NEXT:    [[C1:%.*]] = icmp slt i32 [[X1]], 42
-; AVX-NEXT:    [[C2:%.*]] = icmp slt i32 [[X2]], 42
-; AVX-NEXT:    [[C3:%.*]] = icmp ult i32 [[X3]], 42
-; AVX-NEXT:    [[D0:%.*]] = icmp sgt i32 [[X0]], 17
-; AVX-NEXT:    [[D1:%.*]] = icmp sgt i32 [[X1]], 17
-; AVX-NEXT:    [[D2:%.*]] = icmp sgt i32 [[X2]], 17
-; AVX-NEXT:    [[D3:%.*]] = icmp sgt i32 [[X3]], 17
-; AVX-NEXT:    [[S1:%.*]] = select i1 [[C0]], i1 [[C1]], i1 false
-; AVX-NEXT:    [[S2:%.*]] = select i1 [[S1]], i1 [[C2]], i1 false
-; AVX-NEXT:    [[S3:%.*]] = select i1 [[S2]], i1 [[C3]], i1 false
-; AVX-NEXT:    [[S4:%.*]] = select i1 [[S3]], i1 [[D0]], i1 false
-; AVX-NEXT:    [[S5:%.*]] = select i1 [[S4]], i1 [[D1]], i1 false
-; AVX-NEXT:    [[S6:%.*]] = select i1 [[S5]], i1 [[D2]], i1 false
-; AVX-NEXT:    [[S7:%.*]] = select i1 [[S6]], i1 [[D3]], i1 false
-; AVX-NEXT:    ret i1 [[S7]]
+; CHECK-LABEL: @logical_and_icmp_clamp_pred_diff(
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], <i32 42, i32 42, i32 42, i32 42>
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ult <4 x i32> [[X]], <i32 42, i32 42, i32 42, i32 42>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i1> [[TMP1]], <4 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp sgt <4 x i32> [[X]], <i32 17, i32 17, i32 17, i32 17>
+; CHECK-NEXT:    [[TMP5:%.*]] = freeze <4 x i1> [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP5]])
+; CHECK-NEXT:    [[TMP7:%.*]] = freeze <4 x i1> [[TMP3]]
+; CHECK-NEXT:    [[TMP8:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP7]])
+; CHECK-NEXT:    [[OP_RDX:%.*]] = select i1 [[TMP6]], i1 [[TMP8]], i1 false
+; CHECK-NEXT:    ret i1 [[OP_RDX]]
 ;
   %x0 = extractelement <4 x i32> %x, i32 0
   %x1 = extractelement <4 x i32> %x, i32 1
@@ -566,8 +482,8 @@ define i1 @logical_and_icmp_extra_op(<4 x i32> %x, <4 x i32> %y, i1 %c) {
 ; CHECK-NEXT:    [[S3:%.*]] = select i1 [[C:%.*]], i1 [[C]], i1 false
 ; CHECK-NEXT:    [[TMP2:%.*]] = freeze <4 x i1> [[TMP1]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP2]])
-; CHECK-NEXT:    [[OP_EXTRA:%.*]] = select i1 [[TMP3]], i1 [[S3]], i1 false
-; CHECK-NEXT:    ret i1 [[OP_EXTRA]]
+; CHECK-NEXT:    [[OP_RDX:%.*]] = select i1 [[TMP3]], i1 [[S3]], i1 false
+; CHECK-NEXT:    ret i1 [[OP_RDX]]
 ;
   %x0 = extractelement <4 x i32> %x, i32 0
   %x1 = extractelement <4 x i32> %x, i32 1
@@ -595,8 +511,8 @@ define i1 @logical_or_icmp_extra_op(<4 x i32> %x, <4 x i32> %y, i1 %c) {
 ; CHECK-NEXT:    [[S3:%.*]] = select i1 [[C:%.*]], i1 true, i1 [[C]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = freeze <4 x i1> [[TMP1]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]])
-; CHECK-NEXT:    [[OP_EXTRA:%.*]] = select i1 [[TMP3]], i1 true, i1 [[S3]]
-; CHECK-NEXT:    ret i1 [[OP_EXTRA]]
+; CHECK-NEXT:    [[OP_RDX:%.*]] = select i1 [[TMP3]], i1 true, i1 [[S3]]
+; CHECK-NEXT:    ret i1 [[OP_RDX]]
 ;
   %x0 = extractelement <4 x i32> %x, i32 0
   %x1 = extractelement <4 x i32> %x, i32 1

diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-same-vals.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-same-vals.ll
index 6bee995030855..1fd2f30bf93cc 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-same-vals.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-same-vals.ll
@@ -8,20 +8,21 @@ define i64 @test() {
 ; CHECK:       bb2:
 ; CHECK-NEXT:    br label [[BB3]]
 ; CHECK:       bb3:
-; CHECK-NEXT:    [[TMP:%.*]] = phi i32 [ 0, [[BB2:%.*]] ], [ 0, [[BB1:%.*]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = phi i32 [ 0, [[BB2]] ], [ 0, [[BB1]] ]
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i32 [[TMP]], [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i32 [[TMP5]], [[TMP4]]
-; CHECK-NEXT:    [[TMP7:%.*]] = mul i32 [[TMP6]], [[TMP4]]
-; CHECK-NEXT:    [[TMP8:%.*]] = mul i32 [[TMP7]], [[TMP4]]
-; CHECK-NEXT:    [[TMP9:%.*]] = mul i32 [[TMP8]], [[TMP4]]
-; CHECK-NEXT:    [[TMP10:%.*]] = mul i32 [[TMP9]], [[TMP4]]
-; CHECK-NEXT:    [[TMP11:%.*]] = mul i32 [[TMP10]], [[TMP4]]
-; CHECK-NEXT:    [[TMP12:%.*]] = mul i32 [[TMP11]], [[TMP4]]
-; CHECK-NEXT:    [[TMP13:%.*]] = mul i32 [[TMP12]], [[TMP4]]
-; CHECK-NEXT:    [[TMP14:%.*]] = mul i32 [[TMP13]], [[TMP4]]
-; CHECK-NEXT:    [[TMP15:%.*]] = mul i32 [[TMP14]], [[TMP4]]
-; CHECK-NEXT:    [[TMP65:%.*]] = sext i32 [[TMP15]] to i64
+; CHECK-NEXT:    [[TMP0:%.*]] = phi <2 x i32> [ poison, [[BB2:%.*]] ], [ zeroinitializer, [[BB1:%.*]] ]
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[SHUFFLE]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> poison, i32 [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[TMP1]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[TMP1]], i32 2
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[TMP1]], i32 3
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[TMP1]], i32 4
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[TMP1]], i32 5
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[TMP1]], i32 6
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[TMP1]], i32 7
+; CHECK-NEXT:    [[TMP10:%.*]] = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> [[TMP9]])
+; CHECK-NEXT:    [[TMP11:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[SHUFFLE]])
+; CHECK-NEXT:    [[OP_RDX:%.*]] = mul i32 [[TMP10]], [[TMP11]]
+; CHECK-NEXT:    [[TMP65:%.*]] = sext i32 [[OP_RDX]] to i64
 ; CHECK-NEXT:    ret i64 [[TMP65]]
 ;
 bb1:

diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction_loads.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction_loads.ll
index 362967fcc3c11..0fac440081208 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduction_loads.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction_loads.ll
@@ -24,15 +24,15 @@ define i32 @test(i32* nocapture readonly %p) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
-; CHECK-NEXT:    [[SUM:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[SUM:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[OP_RDX:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[P:%.*]] to <8 x i32>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = mul <8 x i32> [[TMP1]], <i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42>
 ; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP2]])
-; CHECK-NEXT:    [[OP_EXTRA]] = add i32 [[TMP3]], [[SUM]]
+; CHECK-NEXT:    [[OP_RDX]] = add i32 [[TMP3]], [[SUM]]
 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[FOR_BODY]]
 ; CHECK:       for.end:
-; CHECK-NEXT:    ret i32 [[OP_EXTRA]]
+; CHECK-NEXT:    ret i32 [[OP_RDX]]
 ;
 entry:
   %arrayidx.1 = getelementptr inbounds i32, i32* %p, i64 1
@@ -97,17 +97,17 @@ define i32 @test2(i32* nocapture readonly %p, i32* nocapture readonly %q) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
-; CHECK-NEXT:    [[SUM:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[SUM:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[OP_RDX:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[P:%.*]] to <8 x i32>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32* [[Q:%.*]] to <8 x i32>*
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* [[TMP2]], align 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = mul <8 x i32> [[TMP1]], [[TMP3]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP4]])
-; CHECK-NEXT:    [[OP_EXTRA]] = add i32 [[TMP5]], [[SUM]]
+; CHECK-NEXT:    [[OP_RDX]] = add i32 [[TMP5]], [[SUM]]
 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[FOR_BODY]]
 ; CHECK:       for.end:
-; CHECK-NEXT:    ret i32 [[OP_EXTRA]]
+; CHECK-NEXT:    ret i32 [[OP_RDX]]
 ;
 entry:
   %arrayidx.p.1 = getelementptr inbounds i32, i32* %p, i64 1
@@ -188,18 +188,18 @@ define i32 @test3(i32* nocapture readonly %p, i32* nocapture readonly %q) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
-; CHECK-NEXT:    [[SUM:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[SUM:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[OP_RDX:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[P:%.*]] to <8 x i32>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32* [[Q:%.*]] to <8 x i32>*
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* [[TMP2]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = mul <8 x i32> [[SHUFFLE]], [[TMP3]]
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP4:%.*]] = mul <8 x i32> [[TMP1]], [[SHUFFLE]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP4]])
-; CHECK-NEXT:    [[OP_EXTRA]] = add i32 [[TMP5]], [[SUM]]
+; CHECK-NEXT:    [[OP_RDX]] = add i32 [[TMP5]], [[SUM]]
 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[FOR_BODY]]
 ; CHECK:       for.end:
-; CHECK-NEXT:    ret i32 [[OP_EXTRA]]
+; CHECK-NEXT:    ret i32 [[OP_RDX]]
 ;
 entry:
   %arrayidx.p.1 = getelementptr inbounds i32, i32* %p, i64 1

diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll
index 28417c0ac0201..7e0dc28681e47 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll
@@ -11,9 +11,9 @@
 ; }
 
 ; Vector cost is 5, Scalar cost is 7
-; AVX: Adding cost -2 for reduction that starts with   %7 = load i32, i32* %arrayidx.7, align 4 (It is a splitting reduction)
+; AVX: Adding cost -2 for reduction that starts with   %0 = load i32, i32* %p, align 4 (It is a splitting reduction)
 ; Vector cost is 6, Scalar cost is 7
-; SSE: Adding cost -1 for reduction that starts with   %7 = load i32, i32* %arrayidx.7, align 4 (It is a splitting reduction)
+; SSE: Adding cost -1 for reduction that starts with   %0 = load i32, i32* %p, align 4 (It is a splitting reduction)
 define i32 @test_add(i32* nocapture readonly %p) {
 ; CHECK-LABEL: @test_add(
 ; CHECK-NEXT:  entry:

diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll
index 0e5d3bca07549..915fe01abdefe 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll
@@ -11,31 +11,26 @@ define void @hoge() {
 ; CHECK-NEXT:    ret void
 ; CHECK:       bb2:
 ; CHECK-NEXT:    [[T:%.*]] = select i1 undef, i16 undef, i16 15
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i16> <i16 poison, i16 undef>, i16 [[T]], i32 0
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i16> <i16 undef, i16 poison>, i16 [[T]], i32 1
 ; CHECK-NEXT:    [[TMP1:%.*]] = sext <2 x i16> [[TMP0]] to <2 x i32>
-; CHECK-NEXT:    [[TMP2:%.*]] = sub nsw <2 x i32> <i32 undef, i32 63>, [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = sub nsw <2 x i32> <i32 63, i32 undef>, [[TMP1]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = sub <2 x i32> [[TMP2]], undef
-; CHECK-NEXT:    [[SHUFFLE10:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
-; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[SHUFFLE10]], <i32 undef, i32 15, i32 31, i32 47>
+; CHECK-NEXT:    [[SHUFFLE4:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[SHUFFLE4]], <i32 15, i32 undef, i32 31, i32 47>
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP4]])
 ; CHECK-NEXT:    [[T19:%.*]] = select i1 undef, i32 [[TMP5]], i32 undef
 ; CHECK-NEXT:    [[T20:%.*]] = icmp sgt i32 [[T19]], 63
 ; CHECK-NEXT:    [[TMP6:%.*]] = sub nsw <2 x i32> undef, [[TMP1]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = sub <2 x i32> [[TMP6]], undef
 ; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
-; CHECK-NEXT:    [[TMP8:%.*]] = add nsw <4 x i32> [[SHUFFLE]], <i32 -49, i32 -33, i32 -33, i32 -17>
-; CHECK-NEXT:    [[TMP9:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[TMP8]])
-; CHECK-NEXT:    [[OP_EXTRA:%.*]] = icmp slt i32 [[TMP9]], undef
-; CHECK-NEXT:    [[OP_EXTRA1:%.*]] = select i1 [[OP_EXTRA]], i32 [[TMP9]], i32 undef
-; CHECK-NEXT:    [[OP_EXTRA2:%.*]] = icmp slt i32 [[OP_EXTRA1]], undef
-; CHECK-NEXT:    [[OP_EXTRA3:%.*]] = select i1 [[OP_EXTRA2]], i32 [[OP_EXTRA1]], i32 undef
-; CHECK-NEXT:    [[OP_EXTRA4:%.*]] = icmp slt i32 [[OP_EXTRA3]], undef
-; CHECK-NEXT:    [[OP_EXTRA5:%.*]] = select i1 [[OP_EXTRA4]], i32 [[OP_EXTRA3]], i32 undef
-; CHECK-NEXT:    [[OP_EXTRA6:%.*]] = icmp slt i32 [[OP_EXTRA5]], undef
-; CHECK-NEXT:    [[OP_EXTRA7:%.*]] = select i1 [[OP_EXTRA6]], i32 [[OP_EXTRA5]], i32 undef
-; CHECK-NEXT:    [[OP_EXTRA8:%.*]] = icmp slt i32 [[OP_EXTRA7]], undef
-; CHECK-NEXT:    [[OP_EXTRA9:%.*]] = select i1 [[OP_EXTRA8]], i32 [[OP_EXTRA7]], i32 undef
-; CHECK-NEXT:    [[T45:%.*]] = icmp sgt i32 undef, [[OP_EXTRA9]]
+; CHECK-NEXT:    [[TMP8:%.*]] = add nsw <4 x i32> [[SHUFFLE]], <i32 -17, i32 -33, i32 -33, i32 -49>
+; CHECK-NEXT:    [[TMP9:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> undef)
+; CHECK-NEXT:    [[TMP10:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[TMP8]])
+; CHECK-NEXT:    [[OP_RDX:%.*]] = icmp slt i32 [[TMP9]], [[TMP10]]
+; CHECK-NEXT:    [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i32 [[TMP9]], i32 [[TMP10]]
+; CHECK-NEXT:    [[OP_RDX2:%.*]] = icmp slt i32 [[OP_RDX1]], undef
+; CHECK-NEXT:    [[OP_RDX3:%.*]] = select i1 [[OP_RDX2]], i32 [[OP_RDX1]], i32 undef
+; CHECK-NEXT:    [[T45:%.*]] = icmp sgt i32 undef, [[OP_RDX3]]
 ; CHECK-NEXT:    unreachable
 ;
 bb:

diff --git a/llvm/test/Transforms/SLPVectorizer/X86/revectorized_rdx_crash.ll b/llvm/test/Transforms/SLPVectorizer/X86/revectorized_rdx_crash.ll
index a9a6aa937171b..e46169a83b97a 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/revectorized_rdx_crash.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/revectorized_rdx_crash.ll
@@ -21,19 +21,19 @@ define void @test() {
 ; CHECK-NEXT:    [[I1:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* undef, i64 0, i64 3
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[I]] to <4 x i32>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 8
-; CHECK-NEXT:    [[I5:%.*]] = add i32 undef, undef
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1]])
-; CHECK-NEXT:    [[OP_EXTRA2:%.*]] = add i32 [[TMP2]], [[I5]]
-; CHECK-NEXT:    [[I10:%.*]] = add i32 [[OP_EXTRA2]], undef
-; CHECK-NEXT:    [[I11:%.*]] = add i32 [[OP_EXTRA2]], [[I10]]
+; CHECK-NEXT:    [[OP_RDX3:%.*]] = add i32 [[TMP2]], undef
+; CHECK-NEXT:    [[OP_RDX4:%.*]] = add i32 [[OP_RDX3]], undef
+; CHECK-NEXT:    [[I10:%.*]] = add i32 [[OP_RDX4]], undef
+; CHECK-NEXT:    [[I11:%.*]] = add i32 [[OP_RDX4]], [[I10]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[I1]] to <4 x i32>*
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
-; CHECK-NEXT:    [[I12:%.*]] = add i32 undef, undef
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]])
-; CHECK-NEXT:    [[OP_EXTRA:%.*]] = add i32 [[TMP5]], [[I12]]
-; CHECK-NEXT:    [[OP_EXTRA1:%.*]] = add i32 [[OP_EXTRA]], undef
-; CHECK-NEXT:    [[I18:%.*]] = add i32 [[OP_EXTRA1]], [[I11]]
-; CHECK-NEXT:    [[I19:%.*]] = add i32 [[OP_EXTRA1]], [[I18]]
+; CHECK-NEXT:    [[OP_RDX:%.*]] = add i32 [[TMP5]], undef
+; CHECK-NEXT:    [[OP_RDX1:%.*]] = add i32 [[OP_RDX]], undef
+; CHECK-NEXT:    [[OP_RDX2:%.*]] = add i32 [[OP_RDX1]], undef
+; CHECK-NEXT:    [[I18:%.*]] = add i32 [[OP_RDX2]], [[I11]]
+; CHECK-NEXT:    [[I19:%.*]] = add i32 [[OP_RDX2]], [[I18]]
 ; CHECK-NEXT:    [[I20:%.*]] = add i32 undef, [[I19]]
 ; CHECK-NEXT:    [[I21:%.*]] = add i32 undef, [[I20]]
 ; CHECK-NEXT:    [[I22:%.*]] = add i32 undef, [[I21]]

diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll
index 9502620a6b494..654d6881d03e4 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll
@@ -7,8 +7,8 @@ define i32 @foo(i32* nocapture readonly %arr, i32 %a1, i32 %a2, i32 %a3, i32 %a4
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[ARR:%.*]] to <2 x i32>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]], align 4
 ; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 0, i32 0>
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> poison, i32 [[A1:%.*]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A2:%.*]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> poison, i32 [[A2:%.*]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A1:%.*]], i32 1
 ; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A3:%.*]], i32 2
 ; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A4:%.*]], i32 3
 ; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[A5:%.*]], i32 4
@@ -53,9 +53,9 @@ define i32 @foo1(i32* nocapture readonly %arr, i32 %a1, i32 %a2, i32 %a3, i32 %a
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[ARR:%.*]] to <4 x i32>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <8 x i32> <i32 1, i32 2, i32 3, i32 1, i32 1, i32 0, i32 2, i32 1>
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> poison, i32 [[A1:%.*]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A2:%.*]], i32 1
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <8 x i32> <i32 2, i32 1, i32 3, i32 1, i32 1, i32 0, i32 2, i32 1>
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> poison, i32 [[A2:%.*]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A1:%.*]], i32 1
 ; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A3:%.*]], i32 2
 ; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A4:%.*]], i32 3
 ; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[A5:%.*]], i32 4
@@ -104,9 +104,9 @@ define i32 @foo2(i32* nocapture readonly %arr, i32 %a1, i32 %a2, i32 %a3, i32 %a
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[ARR:%.*]] to <4 x i32>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <8 x i32> <i32 3, i32 2, i32 3, i32 0, i32 1, i32 0, i32 2, i32 1>
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> poison, i32 [[A1:%.*]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A2:%.*]], i32 1
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <8 x i32> <i32 2, i32 3, i32 3, i32 0, i32 1, i32 0, i32 2, i32 1>
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> poison, i32 [[A2:%.*]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A1:%.*]], i32 1
 ; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A3:%.*]], i32 2
 ; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A4:%.*]], i32 3
 ; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[A5:%.*]], i32 4

diff --git a/llvm/test/Transforms/SLPVectorizer/slp-umax-rdx-matcher-crash.ll b/llvm/test/Transforms/SLPVectorizer/slp-umax-rdx-matcher-crash.ll
index f48d5e27e8aa6..b2ae15cf6166f 100644
--- a/llvm/test/Transforms/SLPVectorizer/slp-umax-rdx-matcher-crash.ll
+++ b/llvm/test/Transforms/SLPVectorizer/slp-umax-rdx-matcher-crash.ll
@@ -43,7 +43,7 @@ declare i32 @llvm.umin.i32(i32, i32)
 define void @test2() {
 ; CHECK-LABEL: @test2(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>)
+; CHECK-NEXT:    [[TMP0:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 1, i32 0>)
 ; CHECK-NEXT:    [[TMP1:%.*]] = sub nsw <4 x i32> poison, [[TMP0]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[TMP1]])
 ; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP2]], i32 77)


        

