[llvm] [SLP]Represent SLP graph as a tree (PR #126771)

Alexey Bataev via llvm-commits llvm-commits at lists.llvm.org
Wed Feb 19 10:03:07 PST 2025


https://github.com/alexey-bataev updated https://github.com/llvm/llvm-project/pull/126771

>From f2e3a1f7c4d481ed935eaec1e05d08ce8b140bc8 Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev at outlook.com>
Date: Tue, 11 Feb 2025 17:58:20 +0000
Subject: [PATCH] [𝘀𝗽𝗿] initial version
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Created using spr 1.3.5
---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 633 +++++++++++-------
 .../SLPVectorizer/AArch64/horizontal.ll       |   4 +-
 .../SLPVectorizer/AArch64/remarks.ll          |   2 +-
 .../AArch64/reorder-fmuladd-crash.ll          |   9 +-
 .../RISCV/remarks_cmp_sel_min_max.ll          |   8 +-
 .../SystemZ/SLP-cmp-cost-query.ll             |   2 +-
 .../cmp-after-intrinsic-call-minbitwidth.ll   |  12 +-
 .../X86/full-matched-bv-with-subvectors.ll    |  18 +-
 .../X86/gather-node-same-as-vect-but-order.ll |  15 +-
 .../SLPVectorizer/X86/geps-non-pow-2.ll       |   6 +-
 .../X86/matching-gather-nodes-phi-users.ll    |   2 +-
 .../X86/minbitwidth-icmp-to-trunc.ll          |  17 +-
 .../X86/minbw-node-used-twice.ll              |   6 +-
 .../X86/reused-mask-with-poison-index.ll      |  18 +-
 .../SLPVectorizer/X86/shrink_after_reorder.ll |   5 +-
 .../X86/slp-schedule-use-order.ll             |   6 +-
 .../subvector-minbitwidth-unsigned-value.ll   |   8 +-
 17 files changed, 463 insertions(+), 308 deletions(-)
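
The central change: TreeEntry previously recorded its users in a small vector
(UserTreeIndices), so the structure was really a DAG; it now records a single
user edge (UserTreeIndex), making it a true tree. A minimal sketch of the shape
of that change, using toy stand-in types rather than LLVM's actual classes:

  #include <climits>

  struct TreeEntry; // toy stand-in, not LLVM's TreeEntry

  struct EdgeInfo {
    TreeEntry *UserTE = nullptr; // user node of this entry
    unsigned EdgeIdx = UINT_MAX; // operand index within the user
    // Added by the patch: allows `if (TE->UserTreeIndex)` checks.
    operator bool() const { return UserTE != nullptr; }
  };

  struct TreeEntry {
    // Before: several user edges were possible (a DAG):
    //   SmallVector<EdgeInfo, 1> UserTreeIndices;
    // After: at most one user edge (a tree):
    EdgeInfo UserTreeIndex;
  };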

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index e1c08077126db..6f9c62c6218b2 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1409,12 +1409,6 @@ class BoUpSLP {
   /// Construct a vectorizable tree that starts at \p Roots.
   void buildTree(ArrayRef<Value *> Roots);
 
-  /// Returns whether the root node has in-tree uses.
-  bool doesRootHaveInTreeUses() const {
-    return !VectorizableTree.empty() &&
-           !VectorizableTree.front()->UserTreeIndices.empty();
-  }
-
   /// Return the scalars of the root node.
   ArrayRef<Value *> getRootNodeScalars() const {
     assert(!VectorizableTree.empty() && "No graph to get the first node from");
@@ -1524,7 +1518,12 @@ class BoUpSLP {
   /// shuffled vector entry + (possibly) permutation with other gathers. It
   /// implements the checks only for possibly ordered scalars (Loads,
   /// ExtractElement, ExtractValue), which can be part of the graph.
-  std::optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE);
+  /// \param TopToBottom If true, used for the whole-tree rotation; otherwise,
+  /// used for sub-tree rotations.
+  /// \param IgnoreReorder If true, the order of the root node might be
+  /// ignored.
+  std::optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE,
+                                                     bool TopToBottom,
+                                                     bool IgnoreReorder);
 
   /// Sort loads into increasing pointers offsets to allow greater clustering.
   std::optional<OrdersType> findPartiallyOrderedLoads(const TreeEntry &TE);
@@ -1536,8 +1535,14 @@ class BoUpSLP {
   /// identity order is important, or the actual order.
   /// \param TopToBottom If true, include the order of vectorized stores and
   /// insertelement nodes, otherwise skip them.
-  std::optional<OrdersType> getReorderingData(const TreeEntry &TE,
-                                              bool TopToBottom);
+  /// \param IgnoreReorder If true, the root node order can be ignored.
+  std::optional<OrdersType>
+  getReorderingData(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder);
+
+  /// Checks if it is profitable to reorder the current tree.
+  /// If the tree does not contain many profitable, reorderable nodes, it is
+  /// better to skip reordering to save compile time.
+  bool isProfitableToReorder() const;
 
   /// Reorders the current graph to the most profitable order starting from the
   /// root node to the leaf nodes. The best order is chosen only from the nodes
@@ -1680,6 +1685,8 @@ class BoUpSLP {
     bool operator == (const EdgeInfo &Other) const {
       return UserTE == Other.UserTE && EdgeIdx == Other.EdgeIdx;
     }
+
+    operator bool() const { return UserTE != nullptr; }
   };
 
   /// A helper class used for scoring candidates for two consecutive lanes.
@@ -2999,8 +3006,10 @@ class BoUpSLP {
     ArrayRef<Value *> VL = UserTE->getOperand(OpIdx);
     TreeEntry *TE = nullptr;
     const auto *It = find_if(VL, [&](Value *V) {
+      if (!isa<Instruction>(V))
+        return false;
       for (TreeEntry *E : getTreeEntries(V)) {
-        if (is_contained(E->UserTreeIndices, EdgeInfo(UserTE, OpIdx))) {
+        if (E->UserTreeIndex == EdgeInfo(UserTE, OpIdx)) {
           TE = E;
           return true;
         }
@@ -3031,7 +3040,7 @@ class BoUpSLP {
   /// of a vector of (the same) instruction.
   TargetTransformInfo::OperandValueInfo getOperandInfo(ArrayRef<Value *> Ops);
 
-  /// \ returns the graph entry for the \p Idx operand of the \p E entry.
+  /// \returns the graph entry for the \p Idx operand of the \p E entry.
   const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
 
   /// Gets the root instruction for the given node. If the node is a strided
@@ -3070,10 +3079,15 @@ class BoUpSLP {
 
   /// Returns the vectorized operand node that matches the order of the
   /// scalars of operand number \p NodeIdx in entry \p E.
-  TreeEntry *getMatchedVectorizedOperand(const TreeEntry *E, unsigned NodeIdx);
-  const TreeEntry *getMatchedVectorizedOperand(const TreeEntry *E,
-                                               unsigned NodeIdx) const {
-    return const_cast<BoUpSLP *>(this)->getMatchedVectorizedOperand(E, NodeIdx);
+  TreeEntry *getMatchedVectorizedOperand(const TreeEntry *E, unsigned NodeIdx,
+                                         ArrayRef<Value *> VL,
+                                         const InstructionsState &S);
+  const TreeEntry *
+  getMatchedVectorizedOperand(const TreeEntry *E, unsigned NodeIdx,
+                              ArrayRef<Value *> VL,
+                              const InstructionsState &S) const {
+    return const_cast<BoUpSLP *>(this)->getMatchedVectorizedOperand(E, NodeIdx,
+                                                                    VL, S);
   }
 
   /// Vectorize a single entry in the tree, the \p Idx-th operand of the entry
@@ -3249,9 +3263,8 @@ class BoUpSLP {
     }
 
     bool isOperandGatherNode(const EdgeInfo &UserEI) const {
-      return isGather() && !UserTreeIndices.empty() &&
-             UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx &&
-             UserTreeIndices.front().UserTE == UserEI.UserTE;
+      return isGather() && UserTreeIndex.EdgeIdx == UserEI.EdgeIdx &&
+             UserTreeIndex.UserTE == UserEI.UserTE;
     }
 
     /// \returns true if current entry has same operands as \p TE.
@@ -3335,7 +3348,7 @@ class BoUpSLP {
 
-    /// The TreeEntry index containing the user of this entry.  We can actually
-    /// have multiple users so the data structure is not truly a tree.
-    SmallVector<EdgeInfo, 1> UserTreeIndices;
+    /// The TreeEntry index containing the user of this entry. Each entry has
+    /// at most one user, so the data structure forms a tree.
+    EdgeInfo UserTreeIndex;
 
     /// The index of this treeEntry in VectorizableTree.
     unsigned Idx = 0;
@@ -3559,9 +3572,9 @@ class BoUpSLP {
       for (unsigned ReorderIdx : ReorderIndices)
         dbgs() << ReorderIdx << ", ";
       dbgs() << "\n";
-      dbgs() << "UserTreeIndices: ";
-      for (const auto &EInfo : UserTreeIndices)
-        dbgs() << EInfo << ", ";
+      dbgs() << "UserTreeIndex: ";
+      if (UserTreeIndex)
+        dbgs() << UserTreeIndex;
       dbgs() << "\n";
       if (!CombinedEntriesWithIndices.empty()) {
         dbgs() << "Combined entries: ";
@@ -3707,7 +3720,7 @@ class BoUpSLP {
     }
 
     if (UserTreeIdx.UserTE)
-      Last->UserTreeIndices.push_back(UserTreeIdx);
+      Last->UserTreeIndex = UserTreeIdx;
     return Last;
   }
 
@@ -4463,11 +4476,11 @@ template <> struct GraphTraits<BoUpSLP *> {
   }
 
   static ChildIteratorType child_begin(NodeRef N) {
-    return {N->UserTreeIndices.begin(), N->Container};
+    return {&N->UserTreeIndex, N->Container};
   }
 
   static ChildIteratorType child_end(NodeRef N) {
-    return {N->UserTreeIndices.end(), N->Container};
+    return {&N->UserTreeIndex + 1, N->Container};
   }
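
With a single embedded EdgeInfo, the child range for GraphTraits degenerates to
a one-element pointer range: a pointer to the member and that pointer plus one
form a valid half-open range in C++. A self-contained sketch of the idiom (toy
types, assuming a non-null edge):

  #include <cstdio>

  struct Edge { int Id; };
  struct Node { Edge UserTreeIndex{7}; };

  int main() {
    Node N;
    // One-element range over the embedded edge, mirroring the patched
    // child_begin()/child_end() pair.
    for (Edge *It = &N.UserTreeIndex, *End = &N.UserTreeIndex + 1; It != End;
         ++It)
      std::printf("edge %d\n", It->Id);
  }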
 
   /// For the node iterator we just need to turn the TreeEntry iterator into a
@@ -4632,7 +4645,8 @@ static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask,
 }
 
 std::optional<BoUpSLP::OrdersType>
-BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
+BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE,
+                                  bool TopToBottom, bool IgnoreReorder) {
   assert(TE.isGather() && "Expected gather node only.");
   // Try to find subvector extract/insert patterns and reorder only such
   // patterns.
@@ -4658,6 +4672,26 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
   if (GatherShuffles.size() == 1 &&
       *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
       Entries.front().front()->isSame(TE.Scalars)) {
+    // If the node is fully matched during the whole-tree rotation, there is no
+    // need to consider the matching order; the whole tree is rotated instead.
+    if (TopToBottom)
+      return std::nullopt;
+    // No need to keep the order for the same user node.
+    if (Entries.front().front()->UserTreeIndex.UserTE ==
+        TE.UserTreeIndex.UserTE)
+      return std::nullopt;
+    // No need to keep the order for the matched root node, if it can be freely
+    // reordered.
+    if (!IgnoreReorder && Entries.front().front()->Idx == 0)
+      return std::nullopt;
+    // If shuffling only 2 elements and the matching node has reverse reuses,
+    // there is no need to count the order; both orders work equally well.
+    if (!Entries.front().front()->ReuseShuffleIndices.empty() &&
+        TE.getVectorFactor() == 2 && Mask.size() == 2 &&
+        any_of(enumerate(Entries.front().front()->ReuseShuffleIndices),
+               [](const auto &P) { return P.value() % 2 != P.index() % 2; }))
+      return std::nullopt;
+
     // Perfect match in the graph, will reuse the previously vectorized
     // node. Cost is 0.
     std::iota(CurrentOrder.begin(), CurrentOrder.end(), 0);
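
The 2-element reverse-reuse test above checks whether any reuse index disagrees
in parity with its position, which for a two-lane vector means the mask swaps
lanes. A standalone sketch of that predicate (plain C++ in place of
llvm::enumerate/any_of):

  #include <cstddef>
  #include <vector>

  // True if some reuse index sits in a lane of opposite parity, i.e. the
  // two-lane mask reverses the elements.
  static bool hasReverseReuse(const std::vector<int> &ReuseShuffleIndices) {
    for (std::size_t I = 0; I < ReuseShuffleIndices.size(); ++I)
      if (ReuseShuffleIndices[I] % 2 != static_cast<int>(I % 2))
        return true;
    return false;
  }
  // hasReverseReuse({1, 0}) == true  -> either order shuffles equally well
  // hasReverseReuse({0, 1}) == false -> identity reuse, order matters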
@@ -5557,7 +5591,8 @@ static bool areTwoInsertFromSameBuildVector(
 }
 
 std::optional<BoUpSLP::OrdersType>
-BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
+BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom,
+                           bool IgnoreReorder) {
   // No need to reorder if we need to shuffle reuses; the node still needs to
   // be shuffled.
   if (!TE.ReuseShuffleIndices.empty()) {
@@ -5577,7 +5612,7 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
     unsigned Sz = TE.Scalars.size();
     if (TE.isGather()) {
       if (std::optional<OrdersType> CurrentOrder =
-              findReusedOrderedScalars(TE)) {
+              findReusedOrderedScalars(TE, TopToBottom, IgnoreReorder)) {
         SmallVector<int> Mask;
         fixupOrderingIndices(*CurrentOrder);
         inversePermutation(*CurrentOrder, Mask);
@@ -5680,10 +5715,8 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
     return std::move(ResOrder);
   }
   if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
-      any_of(TE.UserTreeIndices,
-             [](const EdgeInfo &EI) {
-               return !Instruction::isBinaryOp(EI.UserTE->getOpcode());
-             }) &&
+      (!TE.UserTreeIndex ||
+       !Instruction::isBinaryOp(TE.UserTreeIndex.UserTE->getOpcode())) &&
       (TE.ReorderIndices.empty() || isReverseOrder(TE.ReorderIndices)))
     return std::nullopt;
   if ((TE.State == TreeEntry::Vectorize ||
@@ -5873,7 +5906,8 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
     // FIXME: Remove the non-power-of-two check once findReusedOrderedScalars
     // has been audited for correctness with non-power-of-two vectors.
     if (!VectorizeNonPowerOf2 || !TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
-      if (std::optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE))
+      if (std::optional<OrdersType> CurrentOrder =
+              findReusedOrderedScalars(TE, TopToBottom, IgnoreReorder))
         return CurrentOrder;
   }
   return std::nullopt;
@@ -5943,6 +5977,67 @@ static void combineOrders(MutableArrayRef<unsigned> Order,
   }
 }
 
+bool BoUpSLP::isProfitableToReorder() const {
+  constexpr unsigned TinyVF = 2;
+  constexpr unsigned TinyTree = 10;
+  if (VectorizableTree.size() <= TinyTree)
+    return true;
+  if (VectorizableTree.front()->hasState() &&
+      !VectorizableTree.front()->isGather() &&
+      (VectorizableTree.front()->getOpcode() == Instruction::Store ||
+       VectorizableTree.front()->getOpcode() == Instruction::PHI ||
+       (VectorizableTree.front()->getVectorFactor() == TinyVF &&
+        (VectorizableTree.front()->getOpcode() == Instruction::PtrToInt ||
+         VectorizableTree.front()->getOpcode() == Instruction::ICmp))) &&
+      VectorizableTree.front()->ReorderIndices.empty()) {
+    constexpr unsigned PhiOpsLimit = 12;
+    // Check if the tree has only a single store and a single (unordered) load
+    // node; the other nodes are phis or geps/binops combined with phis, and/or
+    // a single gather load node.
+    bool HasPhis = false;
+    if (VectorizableTree.front()->getOpcode() == Instruction::PHI &&
+        VectorizableTree.front()->Scalars.size() == TinyVF &&
+        VectorizableTree.front()->getNumOperands() > PhiOpsLimit)
+      return false;
+    bool HasLoad = true;
+    unsigned GatherLoads = 0;
+    for (const std::unique_ptr<TreeEntry> &TE :
+         ArrayRef(VectorizableTree).drop_front()) {
+      if (!TE->hasState()) {
+        if (all_of(TE->Scalars, IsaPred<Constant, PHINode>) ||
+            all_of(TE->Scalars, IsaPred<BinaryOperator, PHINode>))
+          continue;
+        if (VectorizableTree.front()->Scalars.size() == TinyVF &&
+            any_of(TE->Scalars, IsaPred<PHINode, GEPOperator>))
+          continue;
+        return true;
+      }
+      if (TE->getOpcode() == Instruction::Load && TE->ReorderIndices.empty()) {
+        if (!TE->isGather()) {
+          HasLoad = false;
+          continue;
+        }
+        if (HasLoad)
+          return true;
+        ++GatherLoads;
+        if (GatherLoads >= 2)
+          return true;
+      }
+      if (TE->getOpcode() == Instruction::GetElementPtr ||
+          Instruction::isBinaryOp(TE->getOpcode()))
+        continue;
+      if (TE->getOpcode() != Instruction::PHI)
+        return true;
+      if (VectorizableTree.front()->Scalars.size() == TinyVF &&
+          TE->getNumOperands() > PhiOpsLimit)
+        return false;
+      HasPhis = true;
+    }
+    return !HasPhis;
+  }
+  return true;
+}
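
The new gate trades a little missed reordering for compile time: tiny trees are
always reordered, while large store/PHI-rooted trees consisting mostly of phis,
geps, and binops are skipped. A heavily simplified sketch of the decision shape
(constants copied from the patch, predicates reduced to booleans):

  struct ToyTree {
    unsigned NumNodes = 0;
    bool RootIsStoreOrPhi = false;   // root opcode check, simplified
    bool OnlyPhisGepsBinops = false; // body-node scan, simplified
  };

  static bool isProfitableToReorderSketch(const ToyTree &T) {
    constexpr unsigned TinyTree = 10;
    if (T.NumNodes <= TinyTree)
      return true; // small graphs: reordering is cheap, always attempt it
    // Large store/PHI-rooted trees of phis/geps/binops rarely profit from
    // reordering; skipping them saves compile time.
    if (T.RootIsStoreOrPhi && T.OnlyPhisGepsBinops)
      return false;
    return true;
  }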
+
 void BoUpSLP::reorderTopToBottom() {
   // Maps VF to the graph nodes.
   DenseMap<unsigned, SetVector<TreeEntry *>> VFToOrderedEntries;
@@ -5991,8 +6086,12 @@ void BoUpSLP::reorderTopToBottom() {
       // TODO: Check the reverse order too.
     }
 
+    bool IgnoreReorder =
+        !UserIgnoreList && VectorizableTree.front()->hasState() &&
+        (VectorizableTree.front()->getOpcode() == Instruction::InsertElement ||
+         VectorizableTree.front()->getOpcode() == Instruction::Store);
     if (std::optional<OrdersType> CurrentOrder =
-            getReorderingData(*TE, /*TopToBottom=*/true)) {
+            getReorderingData(*TE, /*TopToBottom=*/true, IgnoreReorder)) {
       // Do not include ordering for nodes used in the alt opcode vectorization,
       // better to reorder them during the bottom-to-top stage. If we follow
       // the order here, it causes reordering of the whole graph, though actually it is
@@ -6003,14 +6102,13 @@ void BoUpSLP::reorderTopToBottom() {
       unsigned Cnt = 0;
       const TreeEntry *UserTE = TE.get();
       while (UserTE && Cnt < RecursionMaxDepth) {
-        if (UserTE->UserTreeIndices.size() != 1)
+        if (!UserTE->UserTreeIndex)
           break;
-        if (all_of(UserTE->UserTreeIndices, [](const EdgeInfo &EI) {
-              return EI.UserTE->State == TreeEntry::Vectorize &&
-                     EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0;
-            }))
+        if (UserTE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
+            UserTE->UserTreeIndex.UserTE->isAltShuffle() &&
+            UserTE->UserTreeIndex.UserTE->Idx != 0)
           return;
-        UserTE = UserTE->UserTreeIndices.back().UserTE;
+        UserTE = UserTE->UserTreeIndex.UserTE;
         ++Cnt;
       }
       VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
@@ -6156,12 +6254,10 @@ void BoUpSLP::reorderTopToBottom() {
           // Need to reorder the reuses masks of the operands with smaller VF to
           // be able to find the match between the graph nodes and scalar
           // operands of the given node during vectorization/cost estimation.
-          assert(all_of(TE->UserTreeIndices,
-                        [VF, &TE](const EdgeInfo &EI) {
-                          return EI.UserTE->Scalars.size() == VF ||
-                                 EI.UserTE->Scalars.size() ==
-                                     TE->Scalars.size();
-                        }) &&
+          assert((!TE->UserTreeIndex ||
+                  TE->UserTreeIndex.UserTE->Scalars.size() == VF ||
+                  TE->UserTreeIndex.UserTE->Scalars.size() ==
+                      TE->Scalars.size()) &&
                  "All users must be of VF size.");
           if (SLPReVec) {
             assert(SLPReVec && "Only supported by REVEC.");
@@ -6169,15 +6265,11 @@ void BoUpSLP::reorderTopToBottom() {
             // because ShuffleVectorInst supports only a limited set of
             // patterns). Only do reorderNodeWithReuses if all of the users are
             // not ShuffleVectorInst.
-            if (all_of(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
-                  return isa<ShuffleVectorInst>(EI.UserTE->getMainOp());
-                }))
+            if (isa<ShuffleVectorInst>(TE->UserTreeIndex.UserTE->getMainOp()))
               continue;
-            assert(none_of(TE->UserTreeIndices,
-                           [&](const EdgeInfo &EI) {
-                             return isa<ShuffleVectorInst>(
-                                 EI.UserTE->getMainOp());
-                           }) &&
+            assert((!TE->UserTreeIndex ||
+                    !isa<ShuffleVectorInst>(
+                        TE->UserTreeIndex.UserTE->getMainOp())) &&
                    "Does not know how to reorder.");
           }
           // Update ordering of the operands with the smaller VF than the given
@@ -6232,10 +6324,6 @@ bool BoUpSLP::canReorderOperands(
         }))
       continue;
     if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) {
-      // Do not reorder if operand node is used by many user nodes.
-      if (any_of(TE->UserTreeIndices,
-                 [UserTE](const EdgeInfo &EI) { return EI.UserTE != UserTE; }))
-        return false;
       // Add the node to the list of the ordered nodes with the identity
       // order.
       Edges.emplace_back(I, TE);
@@ -6256,10 +6344,8 @@ bool BoUpSLP::canReorderOperands(
                    assert(TE->State != TreeEntry::Vectorize &&
                           TE->State != TreeEntry::StridedVectorize &&
                           "Only non-vectorized nodes are expected.");
-                   if (any_of(TE->UserTreeIndices,
-                              [UserTE, I](const EdgeInfo &EI) {
-                                return EI.UserTE == UserTE && EI.EdgeIdx == I;
-                              })) {
+                   if (TE->UserTreeIndex.UserTE == UserTE &&
+                       TE->UserTreeIndex.EdgeIdx == I) {
                      assert(TE->isSame(UserTE->getOperand(I)) &&
                             "Operand entry does not match operands.");
                      Gather = TE;
@@ -6286,8 +6372,8 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
     if (TE->State != TreeEntry::Vectorize &&
         TE->State != TreeEntry::StridedVectorize)
       NonVectorized.push_back(TE.get());
-    if (std::optional<OrdersType> CurrentOrder =
-            getReorderingData(*TE, /*TopToBottom=*/false)) {
+    if (std::optional<OrdersType> CurrentOrder = getReorderingData(
+            *TE, /*TopToBottom=*/false, IgnoreReorder)) {
       OrderedEntries.insert(TE.get());
       if (!(TE->State == TreeEntry::Vectorize ||
             TE->State == TreeEntry::StridedVectorize) ||
@@ -6300,29 +6386,24 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
   // I.e., if the node has operands that are reordered, try to make at least
   // one operand use the natural order, reorder the others, and reorder the
   // user node itself.
-  SmallPtrSet<const TreeEntry *, 4> Visited;
+  SmallPtrSet<const TreeEntry *, 4> Visited, RevisitedOps;
   while (!OrderedEntries.empty()) {
     // 1. Filter out only reordered nodes.
-    // 2. If the entry has multiple uses - skip it and jump to the next node.
     DenseMap<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users;
     SmallVector<TreeEntry *> Filtered;
     for (TreeEntry *TE : OrderedEntries) {
       if (!(TE->State == TreeEntry::Vectorize ||
             TE->State == TreeEntry::StridedVectorize ||
             (TE->isGather() && GathersToOrders.contains(TE))) ||
-          TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
-          !all_of(drop_begin(TE->UserTreeIndices),
-                  [TE](const EdgeInfo &EI) {
-                    return EI.UserTE == TE->UserTreeIndices.front().UserTE;
-                  }) ||
+          !TE->UserTreeIndex || !TE->ReuseShuffleIndices.empty() ||
           !Visited.insert(TE).second) {
         Filtered.push_back(TE);
         continue;
       }
       // Build a map between user nodes and their operand order to speed up the
       // search. The graph currently does not provide this dependency directly.
-      for (EdgeInfo &EI : TE->UserTreeIndices)
-        Users[EI.UserTE].emplace_back(EI.EdgeIdx, TE);
+      Users[TE->UserTreeIndex.UserTE].emplace_back(TE->UserTreeIndex.EdgeIdx,
+                                                   TE);
     }
     // Erase filtered entries.
     for (TreeEntry *TE : Filtered)
@@ -6360,7 +6441,8 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
           continue;
         const auto Order = [&]() -> const OrdersType {
           if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty())
-            return getReorderingData(*OpTE, /*TopToBottom=*/false)
+            return getReorderingData(*OpTE, /*TopToBottom=*/false,
+                                     IgnoreReorder)
                 .value_or(OrdersType(1));
           return OpTE->ReorderIndices;
         }();
@@ -6368,6 +6450,86 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
         // orders.
         if (Order.size() == 1)
           continue;
+
+        // Check that the reordering does not increase the number of shuffles,
+        // i.e. same-values nodes have the same parents or their parents have
+        // the same parents.
+        if (!Order.empty() && !isIdentityOrder(Order)) {
+          Value *Root = OpTE->hasState()
+                            ? OpTE->getMainOp()
+                            : *find_if_not(OpTE->Scalars, isConstant);
+          auto GetSameNodesUsers = [&](Value *Root) {
+            SmallSetVector<TreeEntry *, 4> Res;
+            for (const TreeEntry *TE : ValueToGatherNodes.lookup(Root)) {
+              if (TE != OpTE && TE->UserTreeIndex &&
+                  TE->getVectorFactor() == OpTE->getVectorFactor() &&
+                  TE->Scalars.size() == OpTE->Scalars.size() &&
+                  ((TE->ReorderIndices.empty() && OpTE->isSame(TE->Scalars)) ||
+                   (OpTE->ReorderIndices.empty() && TE->isSame(OpTE->Scalars))))
+                Res.insert(TE->UserTreeIndex.UserTE);
+            }
+            for (const TreeEntry *TE : getTreeEntries(Root)) {
+              if (TE != OpTE && TE->UserTreeIndex &&
+                  TE->getVectorFactor() == OpTE->getVectorFactor() &&
+                  TE->Scalars.size() == OpTE->Scalars.size() &&
+                  ((TE->ReorderIndices.empty() && OpTE->isSame(TE->Scalars)) ||
+                   (OpTE->ReorderIndices.empty() && TE->isSame(OpTE->Scalars))))
+                Res.insert(TE->UserTreeIndex.UserTE);
+            }
+            return Res.takeVector();
+          };
+          auto GetNumOperands = [](const TreeEntry *TE) {
+            if (auto *CI = dyn_cast<CallInst>(TE->getMainOp()); CI)
+              return CI->arg_size();
+            return TE->getNumOperands();
+          };
+          auto NodeShouldBeReorderedWithOperands = [&, TTI = TTI](
+                                                       const TreeEntry *TE) {
+            Intrinsic::ID ID = Intrinsic::not_intrinsic;
+            if (auto *CI = dyn_cast<CallInst>(TE->getMainOp()); CI)
+              ID = getVectorIntrinsicIDForCall(CI, TLI);
+            for (unsigned Idx : seq<unsigned>(GetNumOperands(TE))) {
+              if (ID != Intrinsic::not_intrinsic &&
+                  isVectorIntrinsicWithScalarOpAtArg(ID, Idx, TTI))
+                continue;
+              const TreeEntry *Op = getOperandEntry(TE, Idx);
+              if (Op->isGather() && Op->hasState()) {
+                const TreeEntry *VecOp =
+                    getSameValuesTreeEntry(Op->getMainOp(), Op->Scalars);
+                if (VecOp)
+                  Op = VecOp;
+              }
+              if (Op->ReorderIndices.empty() && Op->ReuseShuffleIndices.empty())
+                return false;
+            }
+            return true;
+          };
+          SmallVector<TreeEntry *> Users = GetSameNodesUsers(Root);
+          if (!Users.empty() && !all_of(Users, [&](TreeEntry *UTE) {
+                if (!RevisitedOps.insert(UTE).second)
+                  return false;
+                return UTE == Data.first || !UTE->ReorderIndices.empty() ||
+                       !UTE->ReuseShuffleIndices.empty() ||
+                       (UTE->UserTreeIndex &&
+                        UTE->UserTreeIndex.UserTE == Data.first) ||
+                       (Data.first->UserTreeIndex &&
+                        Data.first->UserTreeIndex.UserTE == UTE) ||
+                       NodeShouldBeReorderedWithOperands(UTE);
+              }))
+            continue;
+          for (TreeEntry *UTE : Users) {
+            Intrinsic::ID ID = Intrinsic::not_intrinsic;
+            if (auto *CI = dyn_cast<CallInst>(UTE->getMainOp()); CI)
+              ID = getVectorIntrinsicIDForCall(CI, TLI);
+            for (unsigned Idx : seq<unsigned>(GetNumOperands(UTE))) {
+              if (ID != Intrinsic::not_intrinsic &&
+                  isVectorIntrinsicWithScalarOpAtArg(ID, Idx, TTI))
+                continue;
+              const TreeEntry *Op = getOperandEntry(UTE, Idx);
+              Visited.erase(Op);
+              OrderedEntries.insert(const_cast<TreeEntry *>(Op));
+            }
+          }
+        }
         unsigned NumOps = count_if(
             Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
               return P.second == OpTE;
@@ -6399,15 +6561,16 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
             return true;
           if (TE->isGather()) {
             if (GathersToOrders.contains(TE))
-              return !getReorderingData(*TE, /*TopToBottom=*/false)
+              return !getReorderingData(*TE, /*TopToBottom=*/false,
+                                        IgnoreReorder)
                           .value_or(OrdersType(1))
                           .empty();
             return true;
           }
           return false;
         };
-        for (const EdgeInfo &EI : OpTE->UserTreeIndices) {
-          TreeEntry *UserTE = EI.UserTE;
+        if (OpTE->UserTreeIndex) {
+          TreeEntry *UserTE = OpTE->UserTreeIndex.UserTE;
           if (!VisitedUsers.insert(UserTE).second)
             continue;
           // May reorder user node if it requires reordering, has reused
@@ -6425,10 +6588,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
                   Ops, [UserTE, &AllowsReordering](
                            const std::pair<unsigned, TreeEntry *> &Op) {
                     return AllowsReordering(Op.second) &&
-                           all_of(Op.second->UserTreeIndices,
-                                  [UserTE](const EdgeInfo &EI) {
-                                    return EI.UserTE == UserTE;
-                                  });
+                           Op.second->UserTreeIndex.UserTE == UserTE;
                   })) <= Ops.size() / 2)
             ++Res.first->second;
         }
@@ -8253,10 +8413,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
     LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.getMainOp() << ".\n");
     for (TreeEntry *E : getTreeEntries(S.getMainOp())) {
       if (E->isSame(VL)) {
-        // Record the reuse of the tree node.
-        E->UserTreeIndices.push_back(UserTreeIdx);
         LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.getMainOp()
                           << ".\n");
+        if (TryToFindDuplicates(S))
+          newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
+                       ReuseShuffleIndices);
         return;
       }
       SmallPtrSet<Value *, 8> Values(E->Scalars.begin(), E->Scalars.end());
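
Under the tree representation, a perfect diamond match can no longer be
recorded by appending a second user edge to the existing node. Instead, the
patch registers a fresh (non-vectorized) entry for the duplicate use so that
every node keeps exactly one user. A toy sketch of that policy (simplified
signatures, not LLVM's newTreeEntry):

  #include <memory>
  #include <vector>

  struct Node {
    std::vector<int> Scalars;
    Node *UserTE = nullptr; // the single user edge
    bool IsGather = false;  // duplicates become gathers of vectorized values
  };

  // On a repeated bundle, create a separate gather entry for the new user
  // instead of linking the existing node to a second user.
  static Node *recordDuplicateUse(std::vector<std::unique_ptr<Node>> &Tree,
                                  const std::vector<int> &VL, Node *User) {
    auto N = std::make_unique<Node>();
    N->Scalars = VL;
    N->UserTE = User; // exactly one user, keeping the structure a tree
    N->IsGather = true;
    Tree.push_back(std::move(N));
    return Tree.back().get();
  }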
@@ -9948,18 +10109,15 @@ void BoUpSLP::transformNodes() {
         };
         for (auto [Cnt, Sz] : Slices) {
           ArrayRef<Value *> Slice = VL.slice(Cnt, Sz);
-          // If any instruction is vectorized already - do not try again.
-          if (TreeEntry *SE = getSameValuesTreeEntry(Slice.front(), Slice,
-                                                     /*SameVF=*/true)) {
-            SE->UserTreeIndices.emplace_back(&E, UINT_MAX);
-            AddCombinedNode(SE->Idx, Cnt, Sz);
-            continue;
-          }
+          InstructionsState S = getSameOpcode(Slice, *TLI);
+          const TreeEntry *SameTE =
+              S ? getSameValuesTreeEntry(S.getMainOp(), Slice, /*SameVF=*/true)
+                : nullptr;
           unsigned PrevSize = VectorizableTree.size();
           [[maybe_unused]] unsigned PrevEntriesSize =
               LoadEntriesToVectorize.size();
           buildTree_rec(Slice, 0, EdgeInfo(&E, UINT_MAX));
-          if (PrevSize + 1 == VectorizableTree.size() &&
+          if (PrevSize + 1 == VectorizableTree.size() && !SameTE &&
               VectorizableTree[PrevSize]->isGather() &&
               VectorizableTree[PrevSize]->hasState() &&
               VectorizableTree[PrevSize]->getOpcode() !=
@@ -10073,7 +10231,7 @@ void BoUpSLP::transformNodes() {
       // This node is a minmax node.
       E.CombinedOp = TreeEntry::MinMax;
       TreeEntry *CondEntry = const_cast<TreeEntry *>(getOperandEntry(&E, 0));
-      if (SelectOnly && CondEntry->UserTreeIndices.size() == 1 &&
+      if (SelectOnly && CondEntry->UserTreeIndex &&
           CondEntry->State == TreeEntry::Vectorize) {
         // The condition node is part of the combined minmax node.
         CondEntry->State = TreeEntry::CombinedVectorize;
@@ -11063,15 +11221,30 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
 
 const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
                                                    unsigned Idx) const {
-  if (const TreeEntry *VE = getMatchedVectorizedOperand(E, Idx))
+  ArrayRef<Value *> VL = E->getOperand(Idx);
+  InstructionsState S = getSameOpcode(VL, *TLI);
+  // Special processing for GEPs bundle, which may include non-gep values.
+  if (!S && VL.front()->getType()->isPointerTy()) {
+    const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
+    if (It != VL.end())
+      S = getSameOpcode(*It, *TLI);
+  }
+  if (const TreeEntry *VE = getMatchedVectorizedOperand(E, Idx, VL, S))
     return VE;
-  const auto *It =
-      find_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
-        return TE->isGather() &&
-               find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
-                 return EI.EdgeIdx == Idx && EI.UserTE == E;
-               }) != TE->UserTreeIndices.end();
-      });
+  if (S || !isConstant(VL.front())) {
+    for (const TreeEntry *VE :
+         ValueToGatherNodes.lookup(S ? S.getMainOp() : VL.front()))
+      if (VE->UserTreeIndex.EdgeIdx == Idx && VE->UserTreeIndex.UserTE == E) {
+        assert(VE->isSame(VL) && "Expected gather node with same values.");
+        return VE;
+      }
+  }
+  const auto *It = find_if(ArrayRef(VectorizableTree).drop_front(E->Idx + 1),
+                           [&](const std::unique_ptr<TreeEntry> &TE) {
+                             return TE->isGather() &&
+                                    TE->UserTreeIndex.EdgeIdx == Idx &&
+                                    TE->UserTreeIndex.UserTE == E;
+                           });
   assert(It != VectorizableTree.end() && "Expected vectorizable entry.");
   return It->get();
 }
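
A tree invariant the rewritten lookup exploits: an operand entry is always
created after its user, so the fallback linear scan can start at E->Idx + 1
rather than at the front of VectorizableTree. A toy sketch of that scan
(stand-in types, not LLVM's containers):

  #include <cstddef>
  #include <memory>
  #include <vector>

  struct Entry {
    unsigned Idx = 0; // position in the flat node array
    const Entry *UserTE = nullptr;
    unsigned EdgeIdx = 0;
    bool IsGather = false;
  };

  static const Entry *
  findOperandGather(const std::vector<std::unique_ptr<Entry>> &Tree,
                    const Entry *E, unsigned OpIdx) {
    // Operands live strictly after their user in the array, so skip the
    // prefix up to and including E itself.
    for (std::size_t I = E->Idx + 1; I < Tree.size(); ++I)
      if (Tree[I]->IsGather && Tree[I]->UserTE == E &&
          Tree[I]->EdgeIdx == OpIdx)
        return Tree[I].get();
    return nullptr;
  }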
@@ -11200,12 +11373,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
         // resized.
         if (It != MinBWs.end() && !UnaryInstruction::isCast(E->getOpcode()) &&
             E->Idx != 0 &&
-            (E->getOpcode() != Instruction::Load ||
-             !E->UserTreeIndices.empty())) {
-          const EdgeInfo &EI =
-              *find_if(E->UserTreeIndices, [](const EdgeInfo &EI) {
-                return !EI.UserTE->isGather() || EI.EdgeIdx != UINT_MAX;
-              });
+            (E->getOpcode() != Instruction::Load || E->UserTreeIndex)) {
+          const EdgeInfo &EI = E->UserTreeIndex;
           if (EI.UserTE->getOpcode() != Instruction::Select ||
               EI.EdgeIdx != 0) {
             auto UserBWIt = MinBWs.find(EI.UserTE);
@@ -12506,7 +12675,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
 
     // Exclude cost of gather loads nodes which are not used. These nodes were
     // built as part of the final attempt to vectorize gathered loads.
-    assert((!TE.isGather() || TE.Idx == 0 || !TE.UserTreeIndices.empty()) &&
+    assert((!TE.isGather() || TE.Idx == 0 || TE.UserTreeIndex) &&
            "Expected gather nodes with users only.");
 
     InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts);
@@ -13109,7 +13278,7 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
   // reused elements too for better cost estimation.
   const EdgeInfo &TEUseEI = TE == VectorizableTree.front().get()
                                 ? EdgeInfo(const_cast<TreeEntry *>(TE), 0)
-                                : TE->UserTreeIndices.front();
+                                : TE->UserTreeIndex;
   const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
   const BasicBlock *TEInsertBlock = nullptr;
   // Main node of PHI entries keeps the correct order of operands/incoming
@@ -13163,20 +13332,36 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
   // have a permutation of 2 input vectors.
   SmallVector<SmallPtrSet<const TreeEntry *, 4>> UsedTEs;
   DenseMap<Value *, int> UsedValuesEntry;
+  SmallPtrSet<const Value *, 16> VisitedValue;
+  auto CheckAndUseSameNode = [&](const TreeEntry *TEPtr) {
+    // Exit early unless the node is an exact reuse of the gathered scalars.
+    if ((TEPtr->getVectorFactor() != VL.size() &&
+         TEPtr->Scalars.size() != VL.size()) ||
+        (!TEPtr->isSame(VL) && !TEPtr->isSame(TE->Scalars)))
+      return false;
+    UsedTEs.clear();
+    UsedTEs.emplace_back().insert(TEPtr);
+    for (Value *V : VL) {
+      if (isConstant(V))
+        continue;
+      UsedValuesEntry.try_emplace(V, 0);
+    }
+    return true;
+  };
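
The helper above short-circuits the shuffle-source search: once a candidate
entry covers exactly the gathered scalars, the set of used entries collapses to
that one node. A standalone sketch of the early-exit shape (toy containers in
place of LLVM's):

  #include <vector>

  struct ToyEntry {
    std::vector<int> Scalars;
  };

  // Returns true (and resets the search state) when Cand is an exact reuse
  // of the gathered scalars VL; the caller stops scanning then.
  static bool checkAndUseSameNode(const ToyEntry &Cand,
                                  const std::vector<int> &VL,
                                  std::vector<const ToyEntry *> &UsedTEs) {
    if (Cand.Scalars != VL)
      return false;           // not an exact match, keep looking
    UsedTEs.assign(1, &Cand); // single source entry; shuffle cost ~ 0
    return true;
  }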
   for (Value *V : VL) {
-    if (isConstant(V))
+    if (isConstant(V) || !VisitedValue.insert(V).second)
       continue;
     // Build a list of tree entries where V is used.
     SmallPtrSet<const TreeEntry *, 4> VToTEs;
-    for (const TreeEntry *TEPtr : ValueToGatherNodes.find(V)->second) {
+    for (const TreeEntry *TEPtr : ValueToGatherNodes.lookup(V)) {
       if (TEPtr == TE || TEPtr->Idx == 0)
         continue;
       assert(any_of(TEPtr->Scalars,
                     [&](Value *V) { return GatheredScalars.contains(V); }) &&
              "Must contain at least single gathered value.");
-      assert(TEPtr->UserTreeIndices.size() == 1 &&
+      assert(TEPtr->UserTreeIndex &&
              "Expected only single user of a gather node.");
-      const EdgeInfo &UseEI = TEPtr->UserTreeIndices.front();
+      const EdgeInfo &UseEI = TEPtr->UserTreeIndex;
 
       PHINode *UserPHI = dyn_cast<PHINode>(UseEI.UserTE->getMainOp());
       const Instruction *InsertPt =
@@ -13201,6 +13386,9 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
            TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
           !CheckOrdering(InsertPt))
         continue;
+      // The node is reused - exit.
+      if (CheckAndUseSameNode(TEPtr))
+        break;
       VToTEs.insert(TEPtr);
     }
     if (ArrayRef<TreeEntry *> VTEs = getTreeEntries(V); !VTEs.empty()) {
@@ -13222,6 +13410,9 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
         if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
           continue;
       }
+      // The node is reused - exit.
+      if (CheckAndUseSameNode(VTE))
+        break;
       VToTEs.insert(VTE);
     }
     if (VToTEs.empty())
@@ -13634,18 +13825,18 @@ BoUpSLP::isGatherShuffledEntry(
   if (TE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
     return {};
   Mask.assign(VL.size(), PoisonMaskElem);
-  assert((TE->UserTreeIndices.size() == 1 ||
+  assert((TE->UserTreeIndex ||
           TE == VectorizableTree.front().get()) &&
          "Expected only single user of the gather node.");
   assert(VL.size() % NumParts == 0 &&
          "Number of scalars must be divisible by NumParts.");
-  if (!TE->UserTreeIndices.empty() &&
-      TE->UserTreeIndices.front().UserTE->isGather() &&
-      TE->UserTreeIndices.front().EdgeIdx == UINT_MAX) {
+  if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->isGather() &&
+      TE->UserTreeIndex.EdgeIdx == UINT_MAX) {
     assert(
         (TE->Idx == 0 ||
          (TE->hasState() && TE->getOpcode() == Instruction::ExtractElement) ||
-         isSplat(TE->Scalars)) &&
+         isSplat(TE->Scalars) ||
+         getSameValuesTreeEntry(TE->getMainOp(), TE->Scalars)) &&
         "Expected splat or extractelements only node.");
     return {};
   }
@@ -14272,11 +14463,8 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
                    (!UTEs.empty() &&
                     count_if(R.VectorizableTree,
                              [&](const std::unique_ptr<TreeEntry> &TE) {
-                               return any_of(TE->UserTreeIndices,
-                                             [&](const EdgeInfo &Edge) {
-                                               return Edge.UserTE ==
-                                                      UTEs.front();
-                                             }) &&
+                               return TE->UserTreeIndex.UserTE ==
+                                          UTEs.front() &&
                                       is_contained(VL, EI);
                              }) != 1);
           }))
@@ -14639,32 +14827,15 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
   }
 };
 
-BoUpSLP::TreeEntry *BoUpSLP::getMatchedVectorizedOperand(const TreeEntry *E,
-                                                         unsigned NodeIdx) {
-  ArrayRef<Value *> VL = E->getOperand(NodeIdx);
-  InstructionsState S = getSameOpcode(VL, *TLI);
-  // Special processing for GEPs bundle, which may include non-gep values.
-  if (!S && VL.front()->getType()->isPointerTy()) {
-    const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
-    if (It != VL.end())
-      S = getSameOpcode(*It, *TLI);
-  }
+BoUpSLP::TreeEntry *
+BoUpSLP::getMatchedVectorizedOperand(const TreeEntry *E, unsigned NodeIdx,
+                                     ArrayRef<Value *> VL,
+                                     const InstructionsState &S) {
   if (!S)
     return nullptr;
-  auto CheckSameVE = [&](const TreeEntry *VE) {
-    return any_of(VE->UserTreeIndices,
-                  [E, NodeIdx](const EdgeInfo &EI) {
-                    return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
-                  }) ||
-           any_of(VectorizableTree,
-                  [E, NodeIdx, VE](const std::unique_ptr<TreeEntry> &TE) {
-                    return TE->isOperandGatherNode(
-                               {const_cast<TreeEntry *>(E), NodeIdx}) &&
-                           VE->isSame(TE->Scalars);
-                  });
-  };
-  TreeEntry *VE = getSameValuesTreeEntry(S.getMainOp(), VL);
-  if (VE && CheckSameVE(VE))
+  if (TreeEntry *VE = getSameValuesTreeEntry(S.getMainOp(), VL);
+      VE && VE->UserTreeIndex.UserTE == E &&
+      VE->UserTreeIndex.EdgeIdx == NodeIdx)
     return VE;
   return nullptr;
 }
@@ -14672,8 +14843,15 @@ BoUpSLP::TreeEntry *BoUpSLP::getMatchedVectorizedOperand(const TreeEntry *E,
 Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
                                  bool PostponedPHIs) {
   ValueList &VL = E->getOperand(NodeIdx);
+  InstructionsState S = getSameOpcode(VL, *TLI);
+  // Special processing for GEPs bundle, which may include non-gep values.
+  if (!S && VL.front()->getType()->isPointerTy()) {
+    const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
+    if (It != VL.end())
+      S = getSameOpcode(*It, *TLI);
+  }
   const unsigned VF = VL.size();
-  if (TreeEntry *VE = getMatchedVectorizedOperand(E, NodeIdx)) {
+  if (TreeEntry *VE = getMatchedVectorizedOperand(E, NodeIdx, VL, S)) {
     auto FinalShuffle = [&](Value *V, ArrayRef<int> Mask) {
       // V may be affected by MinBWs.
       // We want ShuffleInstructionBuilder to correctly support REVEC. The key
@@ -14738,14 +14916,13 @@ Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
     // Need to update the operand gather node if the operand is actually not a
     // vectorized node but a buildvector/gather node that matches one of the
     // vectorized nodes.
-    if (find_if(VE->UserTreeIndices, [&](const EdgeInfo &EI) {
-          return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
-        }) == VE->UserTreeIndices.end()) {
-      auto *It =
-          find_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
-            return TE->isGather() && TE->UserTreeIndices.front().UserTE == E &&
-                   TE->UserTreeIndices.front().EdgeIdx == NodeIdx;
-          });
+    if (VE->UserTreeIndex.UserTE != E || VE->UserTreeIndex.EdgeIdx != NodeIdx) {
+      auto *It = find_if(ArrayRef(VectorizableTree).drop_front(E->Idx + 1),
+                         [&](const std::unique_ptr<TreeEntry> &TE) {
+                           return TE->isGather() &&
+                                  TE->UserTreeIndex.UserTE == E &&
+                                  TE->UserTreeIndex.EdgeIdx == NodeIdx;
+                         });
       assert(It != VectorizableTree.end() && "Expected gather node operand.");
       (*It)->VectorizedValue = V;
     }
@@ -14755,12 +14932,12 @@ Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
   // Find the corresponding gather entry and vectorize it.
   // This allows the tree/graph transformations to be more accurate and checks
   // the correctness of the transformations in many cases.
-  auto *I = find_if(VectorizableTree,
+  auto *I = find_if(ArrayRef(VectorizableTree).drop_front(E->Idx + 1),
                     [E, NodeIdx](const std::unique_ptr<TreeEntry> &TE) {
                       return TE->isOperandGatherNode({E, NodeIdx});
                     });
   assert(I != VectorizableTree.end() && "Gather node is not in the graph.");
-  assert(I->get()->UserTreeIndices.size() == 1 &&
+  assert(I->get()->UserTreeIndex &&
          "Expected only single user for the gather node.");
   assert(I->get()->isSame(VL) && "Expected same list of scalars.");
   return vectorizeTree(I->get(), PostponedPHIs);
@@ -14812,17 +14989,16 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
           return isa<UndefValue>(V) && !isa<PoisonValue>(V);
         }))
       return false;
-    TreeEntry *UserTE = E->UserTreeIndices.back().UserTE;
-    unsigned EdgeIdx = E->UserTreeIndices.back().EdgeIdx;
+    TreeEntry *UserTE = E->UserTreeIndex.UserTE;
+    unsigned EdgeIdx = E->UserTreeIndex.EdgeIdx;
     if (UserTE->getNumOperands() != 2)
       return false;
     if (!IsNotPoisonous) {
-      auto *It =
-          find_if(VectorizableTree, [=](const std::unique_ptr<TreeEntry> &TE) {
-            return find_if(TE->UserTreeIndices, [=](const EdgeInfo &EI) {
-                     return EI.UserTE == UserTE && EI.EdgeIdx != EdgeIdx;
-                   }) != TE->UserTreeIndices.end();
-          });
+      auto *It = find_if(ArrayRef(VectorizableTree).drop_front(UserTE->Idx + 1),
+                         [=](const std::unique_ptr<TreeEntry> &TE) {
+                           return TE->UserTreeIndex.UserTE == UserTE &&
+                                  TE->UserTreeIndex.EdgeIdx != EdgeIdx;
+                         });
       if (It == VectorizableTree.end())
         return false;
       SmallVector<Value *> GS((*It)->Scalars.begin(), (*It)->Scalars.end());
@@ -15036,15 +15212,12 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
       auto *It = find_if(Scalars, [this, E](Value *V) {
         return !isa<UndefValue>(V) &&
                (isVectorized(V) || isGuaranteedNotToBePoison(V, AC) ||
-                (E->UserTreeIndices.size() == 1 &&
-                 any_of(V->uses(), [E](const Use &U) {
+                (E->UserTreeIndex && any_of(V->uses(), [E](const Use &U) {
                    // Check if the value already used in the same operation in
                    // one of the nodes already.
-                   return E->UserTreeIndices.front().EdgeIdx !=
-                              U.getOperandNo() &&
-                          is_contained(
-                              E->UserTreeIndices.front().UserTE->Scalars,
-                              U.getUser());
+                   return E->UserTreeIndex.EdgeIdx != U.getOperandNo() &&
+                          is_contained(E->UserTreeIndex.UserTE->Scalars,
+                                       U.getUser());
                  })));
       });
       if (It != Scalars.end()) {
@@ -15356,8 +15529,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
   switch (ShuffleOrOp) {
     case Instruction::PHI: {
       assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
-              E != VectorizableTree.front().get() ||
-              !E->UserTreeIndices.empty()) &&
+              E != VectorizableTree.front().get() || E->UserTreeIndex) &&
              "PHI reordering is free.");
       if (PostponedPHIs && E->VectorizedValue)
         return E->VectorizedValue;
@@ -16336,8 +16508,8 @@ BoUpSLP::vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
   for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
     if (GatheredLoadsEntriesFirst.has_value() &&
         TE->Idx >= *GatheredLoadsEntriesFirst &&
-        (!TE->isGather() || !TE->UserTreeIndices.empty())) {
-      assert((!TE->UserTreeIndices.empty() ||
+        (!TE->isGather() || TE->UserTreeIndex)) {
+      assert((TE->UserTreeIndex ||
               (TE->getOpcode() == Instruction::Load && !TE->isGather())) &&
              "Expected gathered load node.");
       (void)vectorizeTree(TE.get(), /*PostponedPHIs=*/false);
@@ -16356,17 +16528,9 @@ BoUpSLP::vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
   DenseMap<Value *, SmallVector<TreeEntry *>> PostponedValues;
   for (const TreeEntry *E : PostponedNodes) {
     auto *TE = const_cast<TreeEntry *>(E);
-    if (auto *VecTE = getSameValuesTreeEntry(
-            TE->Scalars.front(), TE->UserTreeIndices.front().UserTE->getOperand(
-                                     TE->UserTreeIndices.front().EdgeIdx));
-        VecTE && VecTE->isSame(TE->Scalars))
-      // Found gather node which is absolutely the same as one of the
-      // vectorized nodes. It may happen after reordering.
-      continue;
     auto *PrevVec = cast<Instruction>(TE->VectorizedValue);
     TE->VectorizedValue = nullptr;
-    auto *UserI =
-        cast<Instruction>(TE->UserTreeIndices.front().UserTE->VectorizedValue);
+    auto *UserI = cast<Instruction>(TE->UserTreeIndex.UserTE->VectorizedValue);
     // If the user is a PHI node, its vector code has to be inserted right
     // before the block terminator. Since the node was delayed, there were some
     // unresolved dependencies when the stub instruction was emitted. In a case
@@ -16438,7 +16602,7 @@ BoUpSLP::vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
       }
       if (IsSigned.value_or(false)) {
         // Final attempt - check user node.
-        auto It = MinBWs.find(TE->UserTreeIndices.front().UserTE);
+        auto It = MinBWs.find(TE->UserTreeIndex.UserTE);
         if (It != MinBWs.end())
           IsSigned = It->second.second;
       }
@@ -16904,15 +17068,11 @@ BoUpSLP::vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
     for (Instruction *I : RemovedInsts) {
       const TreeEntry *IE = getTreeEntries(I).front();
       if (IE->Idx != 0 &&
-          !(VectorizableTree.front()->isGather() &&
-            !IE->UserTreeIndices.empty() &&
+          !(VectorizableTree.front()->isGather() && IE->UserTreeIndex &&
             (ValueToGatherNodes.lookup(I).contains(
                  VectorizableTree.front().get()) ||
-             any_of(IE->UserTreeIndices,
-                    [&](const EdgeInfo &EI) {
-                      return EI.UserTE == VectorizableTree.front().get() &&
-                             EI.EdgeIdx == UINT_MAX;
-                    }))) &&
+             (IE->UserTreeIndex.UserTE == VectorizableTree.front().get() &&
+              IE->UserTreeIndex.EdgeIdx == UINT_MAX))) &&
           !(GatheredLoadsEntriesFirst.has_value() &&
             IE->Idx >= *GatheredLoadsEntriesFirst &&
             VectorizableTree.front()->isGather() &&
@@ -17820,6 +17980,18 @@ bool BoUpSLP::collectValuesToDemote(
         E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(BitWidth)));
     // Demote gathers.
     if (Res && E.isGather()) {
+      if (E.hasState()) {
+        if (const TreeEntry *SameTE =
+                getSameValuesTreeEntry(E.getMainOp(), E.Scalars);
+            SameTE)
+          if (collectValuesToDemote(*SameTE, IsProfitableToDemoteRoot, BitWidth,
+                                    ToDemote, Visited, NodesToKeepBWs,
+                                    MaxDepthLevel, IsProfitableToDemote,
+                                    IsTruncRoot)) {
+            ToDemote.push_back(E.Idx);
+            return true;
+          }
+      }
       // Check possible extractelement instructions bases and final vector
       // length.
       SmallPtrSet<Value *, 4> UniqueBases;
@@ -17832,13 +18004,15 @@ bool BoUpSLP::collectValuesToDemote(
       const unsigned VF = E.Scalars.size();
       Type *OrigScalarTy = E.Scalars.front()->getType();
       if (UniqueBases.size() <= 2 ||
-          ::getNumberOfParts(*TTI, getWidenedType(OrigScalarTy, VF)) ==
+          ::getNumberOfParts(*TTI, getWidenedType(OrigScalarTy, VF)) >=
               ::getNumberOfParts(
                   *TTI,
                   getWidenedType(
                       IntegerType::get(OrigScalarTy->getContext(), BitWidth),
-                      VF)))
+                      VF))) {
         ToDemote.push_back(E.Idx);
+        return true;
+      }
     }
     return Res;
   };
@@ -17914,12 +18088,6 @@ bool BoUpSLP::collectValuesToDemote(
           (void)for_each(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
                                               std::ref(BitWidth)));
         } else {
-          // Several vectorized uses? Check if we can truncate it, otherwise -
-          // exit.
-          if (E.UserTreeIndices.size() > 1 &&
-              !all_of(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
-                                           std::ref(BitWidth))))
-            return false;
           bool NeedToExit = false;
           if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
             return false;
@@ -18162,11 +18330,9 @@ void BoUpSLP::computeMinimumValueSizes() {
 
   // Ensure the roots of the vectorizable tree don't form a cycle.
   if (VectorizableTree[NodeIdx]->isGather() ||
-      (NodeIdx == 0 && !VectorizableTree[NodeIdx]->UserTreeIndices.empty()) ||
-      (NodeIdx != 0 && any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
-                              [NodeIdx](const EdgeInfo &EI) {
-                                return EI.UserTE->Idx > NodeIdx;
-                              })))
+      (NodeIdx == 0 && VectorizableTree[NodeIdx]->UserTreeIndex) ||
+      (NodeIdx != 0 &&
+       VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->Idx > NodeIdx))
     return;
 
   // The first value node for store/insertelement is sext/zext/trunc? Skip it,
@@ -18196,7 +18362,7 @@ void BoUpSLP::computeMinimumValueSizes() {
     ToDemote.clear();
     // Check if the root is trunc and the next node is gather/buildvector, then
     // keep trunc in scalars, which is free in most cases.
-    if (E.isGather() && IsTruncRoot && E.UserTreeIndices.size() == 1 &&
+    if (E.isGather() && IsTruncRoot && E.UserTreeIndex &&
         !NodesToKeepBWs.contains(E.Idx) &&
         E.Idx > (IsStoreOrInsertElt ? 2u : 1u) &&
         all_of(E.Scalars, [&](Value *V) {
@@ -18204,7 +18370,7 @@ void BoUpSLP::computeMinimumValueSizes() {
                  (!V->hasNUsesOrMore(UsesLimit) &&
                   none_of(V->users(), [&](User *U) {
                     ArrayRef<TreeEntry *> TEs = getTreeEntries(U);
-                    const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE;
+                    const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
                     if (TEs.empty() || is_contained(TEs, UserTE))
                       return false;
                     if (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
@@ -18224,7 +18390,7 @@ void BoUpSLP::computeMinimumValueSizes() {
                   }));
         })) {
       ToDemote.push_back(E.Idx);
-      const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE;
+      const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
       auto It = MinBWs.find(UserTE);
       if (It != MinBWs.end())
         return It->second.first;
@@ -18440,29 +18606,26 @@ void BoUpSLP::computeMinimumValueSizes() {
       NodeIdx = NewIdx;
       IsTruncRoot =
           NodeIdx < VectorizableTree.size() &&
-          any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
-                 [](const EdgeInfo &EI) {
-                   return EI.EdgeIdx == 0 &&
-                          EI.UserTE->getOpcode() == Instruction::Trunc &&
-                          !EI.UserTE->isAltShuffle();
-                 });
+          VectorizableTree[NodeIdx]->UserTreeIndex &&
+          VectorizableTree[NodeIdx]->UserTreeIndex.EdgeIdx == 0 &&
+          VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
+              Instruction::Trunc &&
+          !VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->isAltShuffle();
       IsSignedCmp =
           NodeIdx < VectorizableTree.size() &&
-          any_of(
-              VectorizableTree[NodeIdx]->UserTreeIndices,
-              [&](const EdgeInfo &EI) {
-                return (EI.UserTE->hasState() &&
-                        EI.UserTE->getOpcode() == Instruction::ICmp) &&
-                       any_of(EI.UserTE->Scalars, [&](Value *V) {
-                         auto *IC = dyn_cast<ICmpInst>(V);
-                         return IC &&
-                                (IC->isSigned() ||
+          VectorizableTree[NodeIdx]->UserTreeIndex &&
+          VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
+          VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
+              Instruction::ICmp &&
+          any_of(VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->Scalars,
+                 [&](Value *V) {
+                   auto *IC = dyn_cast<ICmpInst>(V);
+                   return IC && (IC->isSigned() ||
                                  !isKnownNonNegative(IC->getOperand(0),
                                                      SimplifyQuery(*DL)) ||
                                  !isKnownNonNegative(IC->getOperand(1),
                                                      SimplifyQuery(*DL)));
-                       });
-              });
+                 });
     }
 
     // If the maximum bit width we compute is less than the width of the roots'
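
Both predicates above now spell out `VectorizableTree[NodeIdx]->UserTreeIndex` several times each. Since a node has at most one user edge in the tree form, the lookup could be hoisted once per loop iteration; a hypothetical local rewrite (the variable names are ours, not from the patch):

    // Hypothetical hoisting of the single user edge; not part of the patch.
    const TreeEntry *UserTE = nullptr;
    unsigned UserEdgeIdx = ~0u;
    if (NodeIdx < VectorizableTree.size() &&
        VectorizableTree[NodeIdx]->UserTreeIndex) {
      UserTE = VectorizableTree[NodeIdx]->UserTreeIndex.UserTE;
      UserEdgeIdx = VectorizableTree[NodeIdx]->UserTreeIndex.EdgeIdx;
    }
    IsTruncRoot = UserTE && UserEdgeIdx == 0 &&
                  UserTE->getOpcode() == Instruction::Trunc &&
                  !UserTE->isAltShuffle();
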
@@ -18657,8 +18820,10 @@ SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
     Size = R.getCanonicalGraphSize();
     return false;
   }
-  R.reorderTopToBottom();
-  R.reorderBottomToTop();
+  if (R.isProfitableToReorder()) {
+    R.reorderTopToBottom();
+    R.reorderBottomToTop();
+  }
   R.transformNodes();
   R.buildExternalUses();
 
@@ -18715,7 +18880,7 @@ static bool checkTreeSizes(ArrayRef<std::pair<unsigned, unsigned>> Sizes,
                        return V + (P - Mean) * (P - Mean);
                      }) /
                  Num;
-  return Dev * 81 / (Mean * Mean) == 0;
+  return Dev * 96 / (Mean * Mean) == 0;
 }
 
 bool SLPVectorizerPass::vectorizeStores(
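
On the `checkTreeSizes` change a few lines up: with non-negative integers, `Dev * 96 / (Mean * Mean) == 0` holds exactly when `Dev * 96 < Mean * Mean`, i.e. the variance of the candidate tree sizes must stay below Mean^2/96 — a relative standard deviation under 1/sqrt(96), about 10.2%, where the old constant 81 allowed 1/9, about 11.1%. An equivalent spelling of the predicate (a sketch of ours, for illustration only):

    // Sketch (ours): the sizes count as uniform when their relative standard
    // deviation is below 1/sqrt(96) ~= 10.2%; integer division makes this the
    // same test as `Dev * 96 / (Mean * Mean) == 0`.
    static bool treeSizesUniform(uint64_t Dev, uint64_t Mean) {
      return Dev * 96 < Mean * Mean;
    }
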
@@ -19251,10 +19416,10 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
       R.buildTree(Ops);
       if (R.isTreeTinyAndNotFullyVectorizable())
         continue;
-      R.reorderTopToBottom();
-      R.reorderBottomToTop(
-          /*IgnoreReorder=*/!isa<InsertElementInst>(Ops.front()) &&
-          !R.doesRootHaveInTreeUses());
+      if (R.isProfitableToReorder()) {
+        R.reorderTopToBottom();
+        R.reorderBottomToTop(!isa<InsertElementInst>(Ops.front()));
+      }
       R.transformNodes();
       R.buildExternalUses();
 
@@ -20184,7 +20349,7 @@ class HorizontalReduction {
         }
         V.reorderTopToBottom();
         // No need to reorder the root node at all.
-        V.reorderBottomToTop(!V.doesRootHaveInTreeUses());
+        V.reorderBottomToTop(/*IgnoreReorder=*/true);
         // Keep extracted other reduction values, if they are used in the
         // vectorization trees.
         BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues(
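
Hard-coding `IgnoreReorder=true` here leans on the reduction semantics rather than on the removed `doesRootHaveInTreeUses()` query: for an associative, commutative reduction, the lane order of the root vector cannot change the final scalar, so the root's preferred order is never worth restoring. A tiny standalone illustration (ours, not code from the patch):

    // Sketch (ours): an add-reduction yields the same scalar for any
    // permutation of the lanes, which is why the root node's order may be
    // ignored when vectorizing horizontal reductions.
    static int reduceAdd(ArrayRef<int> Lanes) {
      int Acc = 0;
      for (int L : Lanes) // any visitation order produces the same sum
        Acc += L;
      return Acc;
    }
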
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll
index 7c5f9847db1f4..8341895ba8268 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll
@@ -17,7 +17,7 @@ target triple = "aarch64--linux"
 ; YAML-NEXT:   - String:          'Vectorized horizontal reduction with cost '
 ; YAML-NEXT:   - Cost:            '-19'
 ; YAML-NEXT:   - String:          ' and with tree size '
-; YAML-NEXT:   - TreeSize:        '8'
+; YAML-NEXT:   - TreeSize:        '10'
 
 define i32 @test_select(ptr noalias nocapture readonly %blk1, ptr noalias nocapture readonly %blk2, i32 %lx, i32 %h) {
 ; CHECK-LABEL: @test_select(
@@ -230,7 +230,7 @@ for.end:                                          ; preds = %for.end.loopexit, %
 ; YAML-NEXT:   - String:          'Vectorized horizontal reduction with cost '
 ; YAML-NEXT:   - Cost:            '-41'
 ; YAML-NEXT:   - String:          ' and with tree size '
-; YAML-NEXT:   - TreeSize:        '10'
+; YAML-NEXT:   - TreeSize:        '12'
 
 define i32 @test_unrolled_select(ptr noalias nocapture readonly %blk1, ptr noalias nocapture readonly %blk2, i32 %lx, i32 %h, i32 %lim) #0 {
 ; CHECK-LABEL: @test_unrolled_select(
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/remarks.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/remarks.ll
index 219496fc1ac9b..dcdfc6efbfb92 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/remarks.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/remarks.ll
@@ -7,7 +7,7 @@ define void @f(ptr %r, ptr %w) {
   %add0 = fadd double %f0, %f0
   %add1 = fadd double %f1, %f1
   %w1 = getelementptr inbounds double, ptr %w, i64 1
-; CHECK: remark: /tmp/s.c:5:10: Stores SLP vectorized with cost -3 and with tree size 3
+; CHECK: remark: /tmp/s.c:5:10: Stores SLP vectorized with cost -3 and with tree size 4
   store double %add0, ptr %w, !dbg !9
   store double %add1, ptr %w1
   ret void
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/reorder-fmuladd-crash.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/reorder-fmuladd-crash.ll
index f5e904467baa7..b1bd2546c26f4 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/reorder-fmuladd-crash.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/reorder-fmuladd-crash.ll
@@ -6,7 +6,7 @@ define i32 @foo(i32 %v1, double %v2, i1 %arg, i32 %arg2) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i32> <i32 poison, i32 undef>, i32 [[V1:%.*]], i32 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = sitofp <2 x i32> [[TMP0]] to <2 x double>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
 ; CHECK-NEXT:    br label [[FOR_COND15_PREHEADER:%.*]]
 ; CHECK:       for.cond15.preheader:
 ; CHECK-NEXT:    br label [[IF_END:%.*]]
@@ -15,7 +15,7 @@ define i32 @foo(i32 %v1, double %v2, i1 %arg, i32 %arg2) {
 ; CHECK:       if.end:
 ; CHECK-NEXT:    br label [[FOR_COND15:%.*]]
 ; CHECK:       for.end39:
-; CHECK-NEXT:    switch i32 %arg2, label [[DO_BODY:%.*]] [
+; CHECK-NEXT:    switch i32 [[ARG2:%.*]], label [[DO_BODY:%.*]] [
 ; CHECK-NEXT:      i32 0, label [[SW_BB:%.*]]
 ; CHECK-NEXT:      i32 1, label [[SW_BB195:%.*]]
 ; CHECK-NEXT:    ]
@@ -26,8 +26,7 @@ define i32 @foo(i32 %v1, double %v2, i1 %arg, i32 %arg2) {
 ; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
 ; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x double>, ptr [[ARRAYIDX43]], align 8
 ; CHECK-NEXT:    [[TMP7:%.*]] = fmul <4 x double> [[TMP6]], [[TMP5]]
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> poison, <4 x i32> <i32 3, i32 1, i32 2, i32 0>
-; CHECK-NEXT:    [[TMP9:%.*]] = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> undef, <4 x double> [[TMP2]], <4 x double> [[TMP8]])
+; CHECK-NEXT:    [[TMP9:%.*]] = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> undef, <4 x double> [[TMP2]], <4 x double> [[TMP7]])
 ; CHECK-NEXT:    br label [[SW_EPILOG:%.*]]
 ; CHECK:       sw.bb195:
 ; CHECK-NEXT:    br label [[SW_EPILOG]]
@@ -39,7 +38,7 @@ define i32 @foo(i32 %v1, double %v2, i1 %arg, i32 %arg2) {
 ; CHECK:       if.end.1:
 ; CHECK-NEXT:    br label [[FOR_COND15_1:%.*]]
 ; CHECK:       for.cond15.1:
-; CHECK-NEXT:    br i1 %arg, label [[FOR_END39:%.*]], label [[FOR_COND15_PREHEADER]]
+; CHECK-NEXT:    br i1 [[ARG:%.*]], label [[FOR_END39:%.*]], label [[FOR_COND15_PREHEADER]]
 ;
 entry:
   %conv = sitofp i32 undef to double
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/remarks_cmp_sel_min_max.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/remarks_cmp_sel_min_max.ll
index 644d645b9dc88..ff182ae3f56de 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/remarks_cmp_sel_min_max.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/remarks_cmp_sel_min_max.ll
@@ -10,7 +10,7 @@
 ; YAML-NEXT:    - String:          'Stores SLP vectorized with cost '
 ; YAML-NEXT:    - Cost:            '-1'
 ; YAML-NEXT:    - String:          ' and with tree size '
-; YAML-NEXT:    - TreeSize:        '6'
+; YAML-NEXT:    - TreeSize:        '7'
 define i32 @min_double(ptr noalias nocapture %A, ptr noalias nocapture %B) {
 ; CHECK-LABEL: @min_double(
 ; CHECK-NEXT:  entry:
@@ -44,7 +44,7 @@ entry:
 ; YAML-NEXT:    - String:          'Stores SLP vectorized with cost '
 ; YAML-NEXT:    - Cost:            '-2'
 ; YAML-NEXT:    - String:          ' and with tree size '
-; YAML-NEXT:    - TreeSize:        '6'
+; YAML-NEXT:    - TreeSize:        '7'
 define i32 @min_float(ptr noalias nocapture %A, ptr noalias nocapture %B) {
 ; CHECK-LABEL: @min_float(
 ; CHECK-NEXT:  entry:
@@ -78,7 +78,7 @@ entry:
 ; YAML-NEXT:    - String:          'Stores SLP vectorized with cost '
 ; YAML-NEXT:    - Cost:            '-1'
 ; YAML-NEXT:    - String:          ' and with tree size '
-; YAML-NEXT:    - TreeSize:        '6'
+; YAML-NEXT:    - TreeSize:        '7'
 define i32 @max_double(ptr noalias nocapture %A, ptr noalias nocapture %B) {
 ; CHECK-LABEL: @max_double(
 ; CHECK-NEXT:  entry:
@@ -112,7 +112,7 @@ entry:
 ; YAML-NEXT:    - String:          'Stores SLP vectorized with cost '
 ; YAML-NEXT:    - Cost:            '-2'
 ; YAML-NEXT:    - String:          ' and with tree size '
-; YAML-NEXT:    - TreeSize:        '6'
+; YAML-NEXT:    - TreeSize:        '7'
 define i32 @max_float(ptr noalias nocapture %A, ptr noalias nocapture %B) {
 ; CHECK-LABEL: @max_float(
 ; CHECK-NEXT:  entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/SLP-cmp-cost-query.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/SLP-cmp-cost-query.ll
index b80be40d9fc86..fc62f4d511041 100644
--- a/llvm/test/Transforms/SLPVectorizer/SystemZ/SLP-cmp-cost-query.ll
+++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/SLP-cmp-cost-query.ll
@@ -31,6 +31,6 @@ define void @fun(ptr nocapture, i32 zeroext) local_unnamed_addr #0 {
 ._crit_edge:                                      ; preds = %.lr.ph
   ret void
 
-; CHECK: SLP: Adding cost -1 for bundle Idx: 3, n=2 [  %4 = icmp ult i32 %2, %1, ..]
+; CHECK: SLP: Adding cost -1 for bundle Idx: 4, n=2 [  %4 = icmp ult i32 %2, %1, ..]
 }
 
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/cmp-after-intrinsic-call-minbitwidth.ll b/llvm/test/Transforms/SLPVectorizer/X86/cmp-after-intrinsic-call-minbitwidth.ll
index 9fa88084aaa0a..a05d4fdd6315b 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/cmp-after-intrinsic-call-minbitwidth.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/cmp-after-intrinsic-call-minbitwidth.ll
@@ -5,14 +5,12 @@ define void @test() {
 ; CHECK-LABEL: define void @test(
 ; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = call <2 x i2> @llvm.smin.v2i2(<2 x i2> zeroinitializer, <2 x i2> zeroinitializer)
-; CHECK-NEXT:    [[TMP1:%.*]] = select <2 x i1> zeroinitializer, <2 x i2> zeroinitializer, <2 x i2> [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = or <2 x i2> [[TMP1]], zeroinitializer
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i2> [[TMP2]], i32 1
-; CHECK-NEXT:    [[ADD:%.*]] = zext i2 [[TMP3]] to i32
+; CHECK-NEXT:    [[TMP0:%.*]] = call <2 x i32> @llvm.smin.v2i32(<2 x i32> zeroinitializer, <2 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP1:%.*]] = select <2 x i1> zeroinitializer, <2 x i32> zeroinitializer, <2 x i32> [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = or <2 x i32> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[ADD:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
 ; CHECK-NEXT:    [[SHR:%.*]] = ashr i32 [[ADD]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i2> [[TMP2]], i32 0
-; CHECK-NEXT:    [[ADD45:%.*]] = zext i2 [[TMP5]] to i32
+; CHECK-NEXT:    [[ADD45:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0
 ; CHECK-NEXT:    [[ADD152:%.*]] = or i32 [[ADD45]], [[ADD]]
 ; CHECK-NEXT:    [[IDXPROM153:%.*]] = sext i32 [[ADD152]] to i64
 ; CHECK-NEXT:    [[ARRAYIDX154:%.*]] = getelementptr i8, ptr null, i64 [[IDXPROM153]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/full-matched-bv-with-subvectors.ll b/llvm/test/Transforms/SLPVectorizer/X86/full-matched-bv-with-subvectors.ll
index 7576eb7a8f55e..878b2370bfd2a 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/full-matched-bv-with-subvectors.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/full-matched-bv-with-subvectors.ll
@@ -7,29 +7,29 @@ define i32 @test(i64 %l.549) {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    [[CONV3:%.*]] = sext i32 0 to i64
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i64> poison, i64 [[CONV3]], i32 3
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 0, i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i64> [[TMP3]], i64 0, i32 1
 ; CHECK-NEXT:    br label %[[IF_THEN19:.*]]
 ; CHECK:       [[P:.*]]:
-; CHECK-NEXT:    [[TMP1:%.*]] = phi <2 x i64> [ zeroinitializer, %[[IF_END29:.*]] ], [ [[TMP13:%.*]], %[[IF_END25:.*]] ]
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP5:%.*]] = phi <2 x i64> [ zeroinitializer, %[[IF_END29:.*]] ], [ [[TMP13:%.*]], %[[IF_END25:.*]] ]
+; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> poison, <4 x i32> <i32 1, i32 0, i32 0, i32 1>
 ; CHECK-NEXT:    br i1 false, label %[[S:.*]], label %[[Q:.*]]
 ; CHECK:       [[Q]]:
 ; CHECK-NEXT:    [[XOR39:%.*]] = phi i64 [ 0, %[[P]] ], [ 0, %[[LAND_LHS_TRUE:.*]] ]
-; CHECK-NEXT:    [[TMP3:%.*]] = phi <2 x i64> [ zeroinitializer, %[[P]] ], [ zeroinitializer, %[[LAND_LHS_TRUE]] ]
+; CHECK-NEXT:    [[TMP6:%.*]] = phi <2 x i64> [ zeroinitializer, %[[P]] ], [ zeroinitializer, %[[LAND_LHS_TRUE]] ]
 ; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i64> [[TMP0]], i64 [[XOR39]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> [[TMP4]], <2 x i64> [[TMP3]], i64 0)
+; CHECK-NEXT:    [[TMP7:%.*]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> [[TMP4]], <2 x i64> [[TMP6]], i64 0)
 ; CHECK-NEXT:    br i1 false, label %[[LOR_LHS_FALSE:.*]], label %[[R:.*]]
 ; CHECK:       [[LOR_LHS_FALSE]]:
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> poison, <4 x i32> <i32 0, i32 1, i32 3, i32 2>
 ; CHECK-NEXT:    br i1 false, label %[[LAND_LHS_TRUE]], label %[[S]]
 ; CHECK:       [[R]]:
-; CHECK-NEXT:    [[TMP7:%.*]] = phi <4 x i64> [ [[TMP5]], %[[Q]] ], [ [[TMP16:%.*]], %[[IF_THEN19]] ]
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> poison, <4 x i32> <i32 0, i32 1, i32 3, i32 2>
+; CHECK-NEXT:    [[TMP8:%.*]] = phi <4 x i64> [ [[TMP7]], %[[Q]] ], [ [[TMP16:%.*]], %[[IF_THEN19]] ]
 ; CHECK-NEXT:    br i1 false, label %[[S]], label %[[LAND_LHS_TRUE]]
 ; CHECK:       [[LAND_LHS_TRUE]]:
 ; CHECK-NEXT:    [[TMP9:%.*]] = phi <4 x i64> [ [[TMP8]], %[[R]] ], [ zeroinitializer, %[[LOR_LHS_FALSE]] ]
 ; CHECK-NEXT:    br i1 false, label %[[Q]], label %[[S]]
 ; CHECK:       [[S]]:
-; CHECK-NEXT:    [[TMP10:%.*]] = phi <4 x i64> [ [[TMP9]], %[[LAND_LHS_TRUE]] ], [ [[TMP8]], %[[R]] ], [ [[TMP6]], %[[LOR_LHS_FALSE]] ], [ [[TMP2]], %[[P]] ]
+; CHECK-NEXT:    [[TMP10:%.*]] = phi <4 x i64> [ [[TMP9]], %[[LAND_LHS_TRUE]] ], [ [[TMP8]], %[[R]] ], [ [[TMP7]], %[[LOR_LHS_FALSE]] ], [ [[TMP17]], %[[P]] ]
 ; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP10]], <4 x i64> poison, <2 x i32> <i32 0, i32 1>
 ; CHECK-NEXT:    br label %[[IF_THEN19]]
 ; CHECK:       [[IF_THEN19]]:
@@ -37,7 +37,7 @@ define i32 @test(i64 %l.549) {
 ; CHECK-NEXT:    [[TMP13]] = shufflevector <2 x i64> [[TMP12]], <2 x i64> poison, <2 x i32> <i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <2 x i64> [[TMP12]], <2 x i64> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <4 x i64> [[TMP14]], i64 [[L_549]], i32 1
-; CHECK-NEXT:    [[TMP16]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> [[TMP15]], <2 x i64> zeroinitializer, i64 2)
+; CHECK-NEXT:    [[TMP16]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> [[TMP15]], <2 x i64> [[TMP2]], i64 2)
 ; CHECK-NEXT:    br i1 false, label %[[R]], label %[[IF_END25]]
 ; CHECK:       [[IF_END25]]:
 ; CHECK-NEXT:    br i1 false, label %[[IF_END29]], label %[[P]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/gather-node-same-as-vect-but-order.ll b/llvm/test/Transforms/SLPVectorizer/X86/gather-node-same-as-vect-but-order.ll
index a7a92bad5e5c1..19c29be1ef384 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/gather-node-same-as-vect-but-order.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/gather-node-same-as-vect-but-order.ll
@@ -11,21 +11,19 @@ define void @foo(ptr %i7, i32 %0, i1 %tobool62.not) {
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i32 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP4:%.*]] = sitofp <2 x i32> [[TMP3]] to <2 x float>
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
 ; CHECK-NEXT:    [[Y0:%.*]] = getelementptr i8, ptr [[RC21]], i64 8
 ; CHECK-NEXT:    [[TMP6:%.*]] = load float, ptr [[Y0]], align 4
 ; CHECK-NEXT:    [[TMP7:%.*]] = load float, ptr [[I7]], align 4
 ; CHECK-NEXT:    [[TMP8:%.*]] = load <2 x float>, ptr [[RC21]], align 4
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x float> poison, float [[TMP7]], i32 2
-; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x float> [[TMP9]], float [[TMP6]], i32 3
-; CHECK-NEXT:    [[TMP13:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP10]], <2 x float> [[TMP11]], i64 0)
+; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x float> poison, float [[TMP6]], i32 2
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x float> [[TMP10]], float [[TMP7]], i32 3
+; CHECK-NEXT:    [[TMP13:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP9]], <2 x float> [[TMP8]], i64 0)
 ; CHECK-NEXT:    [[TMP12:%.*]] = fcmp olt <4 x float> [[TMP13]], zeroinitializer
 ; CHECK-NEXT:    [[TMP14:%.*]] = fcmp olt <4 x float> [[TMP5]], zeroinitializer
 ; CHECK-NEXT:    [[TMP15:%.*]] = select <4 x i1> [[TMP14]], <4 x float> [[TMP5]], <4 x float> zeroinitializer
 ; CHECK-NEXT:    [[TMP16:%.*]] = select <4 x i1> [[TMP12]], <4 x float> zeroinitializer, <4 x float> [[TMP15]]
-; CHECK-NEXT:    [[TMP27:%.*]] = shufflevector <4 x float> [[TMP16]], <4 x float> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-NEXT:    store <4 x float> [[TMP27]], ptr [[RC21]], align 4
+; CHECK-NEXT:    store <4 x float> [[TMP16]], ptr [[RC21]], align 4
 ; CHECK-NEXT:    br label [[IF_END:%.*]]
 ; CHECK:       entry.if.end72_crit_edge:
 ; CHECK-NEXT:    br label [[IF_END72:%.*]]
@@ -48,7 +46,8 @@ define void @foo(ptr %i7, i32 %0, i1 %tobool62.not) {
 ; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP25:%.*]] = mul <4 x i32> [[TMP23]], [[TMP24]]
 ; CHECK-NEXT:    [[TMP26:%.*]] = sitofp <4 x i32> [[TMP25]] to <4 x float>
-; CHECK-NEXT:    store <4 x float> [[TMP26]], ptr [[RC21]], align 4
+; CHECK-NEXT:    [[TMP27:%.*]] = shufflevector <4 x float> [[TMP26]], <4 x float> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-NEXT:    store <4 x float> [[TMP27]], ptr [[RC21]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll b/llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll
index 63b41627106e5..53bffe502f3da 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll
@@ -13,7 +13,7 @@ define dso_local i32 @g() local_unnamed_addr {
 ; CHECK:       while.body:
 ; CHECK-NEXT:    [[A_020:%.*]] = phi ptr [ [[A_020_BE:%.*]], [[WHILE_BODY_BACKEDGE:%.*]] ], [ undef, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    [[TMP1:%.*]] = phi <2 x ptr> [ [[TMP14:%.*]], [[WHILE_BODY_BACKEDGE]] ], [ undef, [[ENTRY]] ]
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 0
 ; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i64 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[TMP2]] to i64
 ; CHECK-NEXT:    [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
@@ -24,7 +24,7 @@ define dso_local i32 @g() local_unnamed_addr {
 ; CHECK-NEXT:      i32 4, label [[SW_BB6:%.*]]
 ; CHECK-NEXT:    ]
 ; CHECK:       sw.bb:
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x ptr> [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x ptr> [[TMP5]], i32 1
 ; CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[TMP6]] to i64
 ; CHECK-NEXT:    [[TMP8:%.*]] = trunc i64 [[TMP7]] to i32
 ; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, ptr [[A_020]], i64 2
@@ -36,7 +36,7 @@ define dso_local i32 @g() local_unnamed_addr {
 ; CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[INCDEC_PTR]] to i64
 ; CHECK-NEXT:    [[TMP11:%.*]] = trunc i64 [[TMP10]] to i32
 ; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr i32, <2 x ptr> [[TMP1]], <2 x i64> splat (i64 2)
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x ptr> [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x ptr> [[TMP5]], i32 1
 ; CHECK-NEXT:    store i32 [[TMP11]], ptr [[TMP13]], align 4
 ; CHECK-NEXT:    br label [[WHILE_BODY_BACKEDGE]]
 ; CHECK:       while.body.backedge:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/matching-gather-nodes-phi-users.ll b/llvm/test/Transforms/SLPVectorizer/X86/matching-gather-nodes-phi-users.ll
index 166c819098c8c..7960278d2b21d 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/matching-gather-nodes-phi-users.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/matching-gather-nodes-phi-users.ll
@@ -10,7 +10,7 @@
 ; YAML:   - String:          'Stores SLP vectorized with cost '
 ; YAML:   - Cost:            '-6'
 ; YAML:   - String:          ' and with tree size '
-; YAML:   - TreeSize:        '14'
+; YAML:   - TreeSize:        '16'
 ; YAML: ...
 
 ; Test that SLP cost modeling is able to match gathering tree
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-icmp-to-trunc.ll b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-icmp-to-trunc.ll
index 58ea4f8da01a4..60c067e6555ef 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-icmp-to-trunc.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-icmp-to-trunc.ll
@@ -18,22 +18,17 @@ define i1 @test(ptr noalias %0, i64 %1, ptr noalias %p, ptr %p1) {
 ; CHECK-NEXT:    [[TMP8:%.*]] = and <2 x i24> [[TMP7]], splat (i24 255)
 ; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq <2 x i24> [[TMP8]], splat (i24 24)
 ; CHECK-NEXT:    [[TMP10:%.*]] = select <2 x i1> [[TMP9]], <2 x i24> splat (i24 23), <2 x i24> [[TMP8]]
-; CHECK-NEXT:    [[TMP23:%.*]] = trunc <2 x i24> [[TMP10]] to <2 x i8>
-; CHECK-NEXT:    [[TMP26:%.*]] = zext <2 x i8> [[TMP23]] to <2 x i32>
+; CHECK-NEXT:    [[TMP26:%.*]] = zext <2 x i24> [[TMP10]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP13:%.*]] = and <2 x i32> [[TMP26]], splat (i32 254)
 ; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq <2 x i32> [[TMP13]], splat (i32 4)
-; CHECK-NEXT:    [[TMP25:%.*]] = select <2 x i1> [[TMP15]], <2 x i8> splat (i8 2), <2 x i8> [[TMP23]]
-; CHECK-NEXT:    [[TMP14:%.*]] = zext <2 x i8> [[TMP25]] to <2 x i32>
+; CHECK-NEXT:    [[TMP14:%.*]] = select <2 x i1> [[TMP15]], <2 x i32> splat (i32 2), <2 x i32> [[TMP26]]
 ; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq <2 x i32> [[TMP14]], splat (i32 32)
-; CHECK-NEXT:    [[TMP18:%.*]] = select <2 x i1> [[TMP17]], <2 x i8> splat (i8 31), <2 x i8> [[TMP25]]
-; CHECK-NEXT:    [[TMP16:%.*]] = zext <2 x i8> [[TMP18]] to <2 x i32>
+; CHECK-NEXT:    [[TMP16:%.*]] = select <2 x i1> [[TMP17]], <2 x i32> splat (i32 31), <2 x i32> [[TMP14]]
 ; CHECK-NEXT:    [[TMP27:%.*]] = icmp eq <2 x i32> [[TMP16]], splat (i32 54)
-; CHECK-NEXT:    [[TMP21:%.*]] = select <2 x i1> [[TMP27]], <2 x i8> splat (i8 53), <2 x i8> [[TMP18]]
-; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <2 x i8> [[TMP21]], i32 0
-; CHECK-NEXT:    [[TMP19:%.*]] = zext i8 [[TMP22]] to i32
+; CHECK-NEXT:    [[TMP18:%.*]] = select <2 x i1> [[TMP27]], <2 x i32> splat (i32 53), <2 x i32> [[TMP16]]
+; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <2 x i32> [[TMP18]], i32 0
 ; CHECK-NEXT:    store i32 [[TMP19]], ptr [[P1]], align 4
-; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <2 x i8> [[TMP21]], i32 1
-; CHECK-NEXT:    [[TMP20:%.*]] = zext i8 [[TMP24]] to i32
+; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <2 x i32> [[TMP18]], i32 1
 ; CHECK-NEXT:    [[CMP210_NOT:%.*]] = icmp eq i32 [[TMP19]], [[TMP20]]
 ; CHECK-NEXT:    ret i1 [[CMP210_NOT]]
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/minbw-node-used-twice.ll b/llvm/test/Transforms/SLPVectorizer/X86/minbw-node-used-twice.ll
index d3d7f21ee1003..55f2b238c07df 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/minbw-node-used-twice.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/minbw-node-used-twice.ll
@@ -8,9 +8,9 @@ define i8 @test() {
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i16> <i16 poison, i16 0>, i16 [[SUB_I_I79_PEEL_I]], i32 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = zext <2 x i16> [[TMP0]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp slt <2 x i32> zeroinitializer, [[TMP2]]
-; CHECK-NEXT:    [[TMP3:%.*]] = zext <2 x i1> [[TMP1]] to <2 x i32>
-; CHECK-NEXT:    [[TMP4:%.*]] = or <2 x i32> [[TMP3]], [[TMP2]]
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq <2 x i32> [[TMP4]], [[TMP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = zext <2 x i1> [[TMP1]] to <2 x i16>
+; CHECK-NEXT:    [[TMP4:%.*]] = or <2 x i16> [[TMP3]], [[TMP0]]
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq <2 x i16> [[TMP4]], [[TMP0]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0
 ; CHECK-NEXT:    [[CONV13_I89_PEEL_I:%.*]] = zext i1 [[TMP5]] to i8
 ; CHECK-NEXT:    ret i8 [[CONV13_I89_PEEL_I]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reused-mask-with-poison-index.ll b/llvm/test/Transforms/SLPVectorizer/X86/reused-mask-with-poison-index.ll
index ea6d96147c951..efd44f5a85664 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reused-mask-with-poison-index.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reused-mask-with-poison-index.ll
@@ -14,9 +14,9 @@ define fastcc i32 @test(ptr %0, <2 x float> %1, i1 %2, float %3, float %4) {
 ; CHECK-NEXT:    [[TMP12:%.*]] = phi float [ [[TMP7]], [[TMP5]] ], [ [[TMP61:%.*]], %[[TMP56]] ]
 ; CHECK-NEXT:    [[TMP13:%.*]] = phi float [ [[TMP6]], [[TMP5]] ], [ [[TMP62:%.*]], %[[TMP56]] ]
 ; CHECK-NEXT:    [[TMP14:%.*]] = phi float [ 0.000000e+00, [[TMP5]] ], [ [[TMP63:%.*]], %[[TMP56]] ]
-; CHECK-NEXT:    [[TMP15:%.*]] = phi float [ 0.000000e+00, [[TMP5]] ], [ [[TMP64:%.*]], %[[TMP56]] ]
+; CHECK-NEXT:    [[TMP17:%.*]] = phi float [ 0.000000e+00, [[TMP5]] ], [ [[TMP64:%.*]], %[[TMP56]] ]
 ; CHECK-NEXT:    [[TMP16:%.*]] = phi float [ 0.000000e+00, [[TMP5]] ], [ [[TMP65:%.*]], %[[TMP56]] ]
-; CHECK-NEXT:    [[TMP17:%.*]] = phi float [ undef, [[TMP5]] ], [ [[TMP66:%.*]], %[[TMP56]] ]
+; CHECK-NEXT:    [[TMP15:%.*]] = phi float [ undef, [[TMP5]] ], [ [[TMP66:%.*]], %[[TMP56]] ]
 ; CHECK-NEXT:    [[TMP18:%.*]] = phi float [ 0.000000e+00, [[TMP5]] ], [ [[TMP67:%.*]], %[[TMP56]] ]
 ; CHECK-NEXT:    [[TMP19:%.*]] = phi float [ [[TMP4]], [[TMP5]] ], [ [[TMP68:%.*]], %[[TMP56]] ]
 ; CHECK-NEXT:    [[TMP20:%.*]] = phi float [ [[TMP4]], [[TMP5]] ], [ [[TMP69:%.*]], %[[TMP56]] ]
@@ -37,15 +37,15 @@ define fastcc i32 @test(ptr %0, <2 x float> %1, i1 %2, float %3, float %4) {
 ; CHECK-NEXT:    [[TMP32:%.*]] = insertelement <4 x float> poison, float [[TMP3]], i32 0
 ; CHECK-NEXT:    [[TMP33:%.*]] = shufflevector <4 x float> [[TMP32]], <4 x float> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP34:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[TMP33]], <4 x float> zeroinitializer, <4 x float> zeroinitializer)
-; CHECK-NEXT:    [[TMP35:%.*]] = extractelement <4 x float> [[TMP25]], i32 0
-; CHECK-NEXT:    [[TMP36:%.*]] = fsub float [[TMP17]], [[TMP35]]
-; CHECK-NEXT:    [[TMP37:%.*]] = extractelement <4 x float> [[TMP25]], i32 1
+; CHECK-NEXT:    [[TMP37:%.*]] = extractelement <4 x float> [[TMP25]], i32 0
 ; CHECK-NEXT:    [[TMP38:%.*]] = fsub float [[TMP15]], [[TMP37]]
+; CHECK-NEXT:    [[TMP48:%.*]] = extractelement <4 x float> [[TMP25]], i32 1
+; CHECK-NEXT:    [[TMP49:%.*]] = fsub float [[TMP17]], [[TMP48]]
 ; CHECK-NEXT:    [[TMP39:%.*]] = insertelement <2 x float> zeroinitializer, float [[TMP14]], i64 0
 ; CHECK-NEXT:    [[TMP40:%.*]] = shufflevector <4 x float> [[TMP25]], <4 x float> poison, <2 x i32> <i32 poison, i32 2>
 ; CHECK-NEXT:    [[TMP41:%.*]] = shufflevector <2 x float> [[TMP39]], <2 x float> [[TMP40]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP42:%.*]] = insertelement <2 x float> zeroinitializer, float [[TMP38]], i64 0
-; CHECK-NEXT:    [[TMP43:%.*]] = insertelement <2 x float> zeroinitializer, float [[TMP36]], i64 0
+; CHECK-NEXT:    [[TMP42:%.*]] = insertelement <2 x float> zeroinitializer, float [[TMP49]], i64 0
+; CHECK-NEXT:    [[TMP43:%.*]] = insertelement <2 x float> zeroinitializer, float [[TMP38]], i64 0
 ; CHECK-NEXT:    [[TMP44:%.*]] = fmul <2 x float> [[TMP42]], [[TMP43]]
 ; CHECK-NEXT:    [[TMP45:%.*]] = tail call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP41]], <2 x float> [[TMP26]], <2 x float> [[TMP44]])
 ; CHECK-NEXT:    [[TMP46:%.*]] = extractelement <2 x float> [[TMP45]], i64 0
@@ -72,9 +72,9 @@ define fastcc i32 @test(ptr %0, <2 x float> %1, i1 %2, float %3, float %4) {
 ; CHECK-NEXT:    [[TMP61]] = phi float [ [[TMP12]], %[[BB29]] ], [ 0.000000e+00, %[[BB27]] ], [ 0.000000e+00, %[[BB8]] ], [ 0.000000e+00, %[[BB53]] ], [ 0.000000e+00, %[[BB50]] ]
 ; CHECK-NEXT:    [[TMP62]] = phi float [ [[TMP13]], %[[BB29]] ], [ 0.000000e+00, %[[BB27]] ], [ 0.000000e+00, %[[BB8]] ], [ [[TMP54]], %[[BB53]] ], [ 0.000000e+00, %[[BB50]] ]
 ; CHECK-NEXT:    [[TMP63]] = phi float [ [[TMP14]], %[[BB29]] ], [ 0.000000e+00, %[[BB27]] ], [ 0.000000e+00, %[[BB8]] ], [ 0.000000e+00, %[[BB53]] ], [ [[TMP9]], %[[BB50]] ]
-; CHECK-NEXT:    [[TMP64]] = phi float [ [[TMP15]], %[[BB29]] ], [ 0.000000e+00, %[[BB27]] ], [ 0.000000e+00, %[[BB8]] ], [ [[TMP55]], %[[BB53]] ], [ [[TMP10]], %[[BB50]] ]
+; CHECK-NEXT:    [[TMP64]] = phi float [ [[TMP17]], %[[BB29]] ], [ 0.000000e+00, %[[BB27]] ], [ 0.000000e+00, %[[BB8]] ], [ [[TMP55]], %[[BB53]] ], [ [[TMP10]], %[[BB50]] ]
 ; CHECK-NEXT:    [[TMP65]] = phi float [ [[TMP16]], %[[BB29]] ], [ 0.000000e+00, %[[BB27]] ], [ 0.000000e+00, %[[BB8]] ], [ 0.000000e+00, %[[BB53]] ], [ [[TMP11]], %[[BB50]] ]
-; CHECK-NEXT:    [[TMP66]] = phi float [ [[TMP17]], %[[BB29]] ], [ 0.000000e+00, %[[BB27]] ], [ 0.000000e+00, %[[BB8]] ], [ 0.000000e+00, %[[BB53]] ], [ 0.000000e+00, %[[BB50]] ]
+; CHECK-NEXT:    [[TMP66]] = phi float [ [[TMP15]], %[[BB29]] ], [ 0.000000e+00, %[[BB27]] ], [ 0.000000e+00, %[[BB8]] ], [ 0.000000e+00, %[[BB53]] ], [ 0.000000e+00, %[[BB50]] ]
 ; CHECK-NEXT:    [[TMP67]] = phi float [ [[TMP18]], %[[BB29]] ], [ 0.000000e+00, %[[BB27]] ], [ 0.000000e+00, %[[BB8]] ], [ 0.000000e+00, %[[BB53]] ], [ 0.000000e+00, %[[BB50]] ]
 ; CHECK-NEXT:    [[TMP68]] = phi float [ [[TMP19]], %[[BB29]] ], [ 0.000000e+00, %[[BB27]] ], [ 0.000000e+00, %[[BB8]] ], [ 0.000000e+00, %[[BB53]] ], [ [[TMP3]], %[[BB50]] ]
 ; CHECK-NEXT:    [[TMP69]] = phi float [ [[TMP20]], %[[BB29]] ], [ 0.000000e+00, %[[BB27]] ], [ 0.000000e+00, %[[BB8]] ], [ [[TMP54]], %[[BB53]] ], [ 0.000000e+00, %[[BB50]] ]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder.ll b/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder.ll
index 242d66fda569a..648f051db4a52 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder.ll
@@ -7,9 +7,8 @@ define void @wombat(ptr %ptr, ptr %ptr1) {
 ; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[PTR1:%.*]], i32 3
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr [[PTR:%.*]], align 8
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], splat (i32 -1)
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <2 x i32> [[TMP0]], splat (i32 -1)
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp sgt <4 x i32> [[TMP1]], undef
 ; CHECK-NEXT:    [[TMP6:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> undef, <4 x i32> [[TMP4]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = select <4 x i1> zeroinitializer, <4 x i32> zeroinitializer, <4 x i32> [[TMP6]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/slp-schedule-use-order.ll b/llvm/test/Transforms/SLPVectorizer/X86/slp-schedule-use-order.ll
index de4358c47cfd0..eae0ed466b0c7 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/slp-schedule-use-order.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/slp-schedule-use-order.ll
@@ -9,9 +9,9 @@ define void @test() {
 ; CHECK-NEXT:    [[TMP0:%.*]] = phi <2 x i1> [ [[TMP6:%.*]], [[TMP1:%.*]] ], [ zeroinitializer, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    br label [[TMP1]]
 ; CHECK:       1:
-; CHECK-NEXT:    [[TMP2:%.*]] = zext <2 x i1> [[TMP0]] to <2 x i8>
-; CHECK-NEXT:    [[TMP3:%.*]] = and <2 x i8> zeroinitializer, [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = or <2 x i8> [[TMP3]], zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = and <2 x i1> zeroinitializer, [[TMP0]]
+; CHECK-NEXT:    [[TMP3:%.*]] = or <2 x i1> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = zext <2 x i1> [[TMP3]] to <2 x i8>
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <2 x i8> [[TMP4]], zeroinitializer
 ; CHECK-NEXT:    [[TMP6]] = and <2 x i1> [[TMP5]], zeroinitializer
 ; CHECK-NEXT:    br label [[FOR_BODY]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/subvector-minbitwidth-unsigned-value.ll b/llvm/test/Transforms/SLPVectorizer/X86/subvector-minbitwidth-unsigned-value.ll
index 83e1bef8fa066..d07353798edc9 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/subvector-minbitwidth-unsigned-value.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/subvector-minbitwidth-unsigned-value.ll
@@ -11,11 +11,11 @@ define i1 @test(i64 %v1, ptr %v2, i32 %v3, i1 %v4) {
 ; CHECK-NEXT:    [[TMP3:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i8>
 ; CHECK-NEXT:    [[TMP4:%.*]] = and <2 x i8> [[TMP3]], <i8 1, i8 -1>
 ; CHECK-NEXT:    [[TMP5:%.*]] = zext <2 x i8> [[TMP4]] to <2 x i32>
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq <2 x i32> [[TMP5]], zeroinitializer
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> poison, i32 [[V3]], i32 0
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 0>
 ; CHECK-NEXT:    [[TMP9:%.*]] = zext <2 x i8> [[TMP4]] to <2 x i32>
-; CHECK-NEXT:    [[TMP10:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP8]], <2 x i32> [[TMP9]], i64 0)
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq <2 x i32> [[TMP9]], zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x i32> poison, i32 [[V3]], i32 0
+; CHECK-NEXT:    [[TMP30:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 0>
+; CHECK-NEXT:    [[TMP10:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP30]], <2 x i32> [[TMP5]], i64 0)
 ; CHECK-NEXT:    [[TMP11:%.*]] = uitofp <4 x i32> [[TMP10]] to <4 x float>
 ; CHECK-NEXT:    [[TMP12:%.*]] = fdiv <4 x float> zeroinitializer, [[TMP11]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <4 x i1> poison, i1 [[V4]], i32 0


