[llvm] [WIP][SLP] Forest Vectorization for Wide Chains (PR #171917)

Ryan Buchner via llvm-commits llvm-commits at lists.llvm.org
Tue Dec 30 11:16:21 PST 2025


https://github.com/bababuck updated https://github.com/llvm/llvm-project/pull/171917

From 3540e7840bf0b7c92933c49d1afbe5d9ef80702b Mon Sep 17 00:00:00 2001
From: bababuck <buchner.ryan at gmail.com>
Date: Tue, 30 Dec 2025 10:26:45 -0800
Subject: [PATCH 01/19] [SLP] Precommit test

---
 .../SLPVectorizer/RISCV/wide-stores.ll        | 116 ++++++++++++++++++
 1 file changed, 116 insertions(+)
 create mode 100644 llvm/test/Transforms/SLPVectorizer/RISCV/wide-stores.ll

diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/wide-stores.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/wide-stores.ll
new file mode 100644
index 0000000000000..ab5befb17cb1c
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/wide-stores.ll
@@ -0,0 +1,116 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -passes=slp-vectorizer -mcpu=spacemit-x60 -mtriple=riscv64 -slp-threshold=-24 < %s | FileCheck %s
+
+define dso_local void @wide_gather(ptr noalias noundef writeonly captures(none) initializes((0, 64)) %x, ptr noalias noundef readonly captures(none) %y) local_unnamed_addr #0 {
+; CHECK-LABEL: define dso_local void @wide_gather(
+; CHECK-SAME: ptr noalias noundef writeonly captures(none) initializes((0, 64)) [[X:%.*]], ptr noalias noundef readonly captures(none) [[Y:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <8 x ptr> poison, ptr [[Y]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <8 x ptr> [[TMP0]], <8 x ptr> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, <8 x ptr> [[TMP5]], <8 x i64> <i64 0, i64 48, i64 8, i64 16, i64 112, i64 24, i64 56, i64 64>
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i8, <8 x ptr> [[TMP5]], <8 x i64> <i64 40, i64 72, i64 80, i64 88, i64 120, i64 104, i64 32, i64 96>
+; CHECK-NEXT:    [[ARRAYIDX2_8:%.*]] = getelementptr inbounds nuw i8, ptr [[X]], i64 64
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> align 8 [[TMP6]], <8 x i1> splat (i1 true), <8 x i64> poison), !tbaa [[LONG_TBAA0:![0-9]+]]
+; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <8 x i64> [[TMP1]], splat (i64 1)
+; CHECK-NEXT:    store <8 x i64> [[TMP2]], ptr [[X]], align 8, !tbaa [[LONG_TBAA0]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> align 8 [[TMP7]], <8 x i1> splat (i1 true), <8 x i64> poison), !tbaa [[LONG_TBAA0]]
+; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <8 x i64> [[TMP3]], splat (i64 1)
+; CHECK-NEXT:    store <8 x i64> [[TMP4]], ptr [[ARRAYIDX2_8]], align 8, !tbaa [[LONG_TBAA0]]
+; CHECK-NEXT:    ret void
+;
+entry:
+  %arrayidx.1 = getelementptr inbounds nuw i8, ptr %y, i64 48
+  %arrayidx.2 = getelementptr inbounds nuw i8, ptr %y, i64 8
+  %arrayidx.3 = getelementptr inbounds nuw i8, ptr %y, i64 16
+  %arrayidx.4 = getelementptr inbounds nuw i8, ptr %y, i64 112
+  %arrayidx.5 = getelementptr inbounds nuw i8, ptr %y, i64 24
+  %arrayidx.6 = getelementptr inbounds nuw i8, ptr %y, i64 56
+  %arrayidx.7 = getelementptr inbounds nuw i8, ptr %y, i64 64
+  %arrayidx.8 = getelementptr inbounds nuw i8, ptr %y, i64 40
+  %arrayidx.9 = getelementptr inbounds nuw i8, ptr %y, i64 72
+  %arrayidx.10 = getelementptr inbounds nuw i8, ptr %y, i64 80
+  %arrayidx.11 = getelementptr inbounds nuw i8, ptr %y, i64 88
+  %arrayidx.12 = getelementptr inbounds nuw i8, ptr %y, i64 120
+  %arrayidx.13 = getelementptr inbounds nuw i8, ptr %y, i64 104
+  %arrayidx.14 = getelementptr inbounds nuw i8, ptr %y, i64 32
+  %arrayidx.15 = getelementptr inbounds nuw i8, ptr %y, i64 96
+
+  %arrayidx2.1 = getelementptr inbounds nuw i8, ptr %x, i64 8
+  %arrayidx2.2 = getelementptr inbounds nuw i8, ptr %x, i64 16
+  %arrayidx2.3 = getelementptr inbounds nuw i8, ptr %x, i64 24
+  %arrayidx2.4 = getelementptr inbounds nuw i8, ptr %x, i64 32
+  %arrayidx2.5 = getelementptr inbounds nuw i8, ptr %x, i64 40
+  %arrayidx2.6 = getelementptr inbounds nuw i8, ptr %x, i64 48
+  %arrayidx2.7 = getelementptr inbounds nuw i8, ptr %x, i64 56
+  %arrayidx2.8 = getelementptr inbounds nuw i8, ptr %x, i64 64
+  %arrayidx2.9 = getelementptr inbounds nuw i8, ptr %x, i64 72
+  %arrayidx2.10 = getelementptr inbounds nuw i8, ptr %x, i64 80
+  %arrayidx2.11 = getelementptr inbounds nuw i8, ptr %x, i64 88
+  %arrayidx2.12 = getelementptr inbounds nuw i8, ptr %x, i64 96
+  %arrayidx2.13 = getelementptr inbounds nuw i8, ptr %x, i64 104
+  %arrayidx2.14 = getelementptr inbounds nuw i8, ptr %x, i64 112
+  %arrayidx2.15 = getelementptr inbounds nuw i8, ptr %x, i64 120
+
+  %0 = load i64, ptr %y, align 8, !tbaa !10
+  %1 = load i64, ptr %arrayidx.1 , align 8, !tbaa !10
+  %2 = load i64, ptr %arrayidx.2 , align 8, !tbaa !10
+  %3 = load i64, ptr %arrayidx.3 , align 8, !tbaa !10
+  %4 = load i64, ptr %arrayidx.4 , align 8, !tbaa !10
+  %5 = load i64, ptr %arrayidx.5 , align 8, !tbaa !10
+  %6 = load i64, ptr %arrayidx.6 , align 8, !tbaa !10
+  %7 = load i64, ptr %arrayidx.7 , align 8, !tbaa !10
+  %8 = load i64, ptr %arrayidx.8 , align 8, !tbaa !10
+  %9 = load i64, ptr %arrayidx.9 , align 8, !tbaa !10
+  %10 = load i64, ptr %arrayidx.10 , align 8, !tbaa !10
+  %11 = load i64, ptr %arrayidx.11 , align 8, !tbaa !10
+  %12 = load i64, ptr %arrayidx.12 , align 8, !tbaa !10
+  %13 = load i64, ptr %arrayidx.13 , align 8, !tbaa !10
+  %14 = load i64, ptr %arrayidx.14 , align 8, !tbaa !10
+  %15 = load i64, ptr %arrayidx.15 , align 8, !tbaa !10
+
+  %add = add nsw i64 %0, 1
+  %add.1 = add nsw i64 %1 , 1
+  %add.2 = add nsw i64 %2 , 1
+  %add.3 = add nsw i64 %3 , 1
+  %add.4 = add nsw i64 %4 , 1
+  %add.5 = add nsw i64 %5 , 1
+  %add.6 = add nsw i64 %6 , 1
+  %add.7 = add nsw i64 %7 , 1
+  %add.8 = add nsw i64 %8 , 1
+  %add.9 = add nsw i64 %9 , 1
+  %add.10 = add nsw i64 %10 , 1
+  %add.11 = add nsw i64 %11 , 1
+  %add.12 = add nsw i64 %12 , 1
+  %add.13 = add nsw i64 %13 , 1
+  %add.14 = add nsw i64 %14 , 1
+  %add.15 = add nsw i64 %15 , 1
+
+  store i64 %add, ptr %x, align 8, !tbaa !10
+  store i64 %add.1 , ptr %arrayidx2.1 , align 8, !tbaa !10
+  store i64 %add.2 , ptr %arrayidx2.2 , align 8, !tbaa !10
+  store i64 %add.3 , ptr %arrayidx2.3 , align 8, !tbaa !10
+  store i64 %add.4 , ptr %arrayidx2.4 , align 8, !tbaa !10
+  store i64 %add.5 , ptr %arrayidx2.5 , align 8, !tbaa !10
+  store i64 %add.6 , ptr %arrayidx2.6 , align 8, !tbaa !10
+  store i64 %add.7 , ptr %arrayidx2.7 , align 8, !tbaa !10
+  store i64 %add.8 , ptr %arrayidx2.8 , align 8, !tbaa !10
+  store i64 %add.9 , ptr %arrayidx2.9 , align 8, !tbaa !10
+  store i64 %add.10 , ptr %arrayidx2.10 , align 8, !tbaa !10
+  store i64 %add.11 , ptr %arrayidx2.11 , align 8, !tbaa !10
+  store i64 %add.12 , ptr %arrayidx2.12 , align 8, !tbaa !10
+  store i64 %add.13 , ptr %arrayidx2.13 , align 8, !tbaa !10
+  store i64 %add.14 , ptr %arrayidx2.14 , align 8, !tbaa !10
+  store i64 %add.15 , ptr %arrayidx2.15 , align 8, !tbaa !10
+  ret void
+}
+
+!8 = !{!"omnipotent char", !9, i64 0}
+!9 = !{!"Simple C/C++ TBAA"}
+!10 = !{!11, !11, i64 0}
+!11 = !{!"long", !8, i64 0}
+;.
+; CHECK: [[LONG_TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0}
+; CHECK: [[META1]] = !{!"long", [[META2:![0-9]+]], i64 0}
+; CHECK: [[META2]] = !{!"omnipotent char", [[META3:![0-9]+]], i64 0}
+; CHECK: [[META3]] = !{!"Simple C/C++ TBAA"}
+;.

From cab1359ef6100e878de4dfbe9bab42d0753e2de9 Mon Sep 17 00:00:00 2001
From: bababuck <buchner.ryan at gmail.com>
Date: Fri, 12 Dec 2025 11:06:11 -0800
Subject: [PATCH 02/19] [SLP][NFC] Move VecTreeTy from TreeEntry to BoUpSLP

---
 llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 83faa89218bcd..0b011a9ef673b 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1959,6 +1959,7 @@ class slpvectorizer::BoUpSLP {
   using StoreList = SmallVector<StoreInst *, 8>;
   using ExtraValueToDebugLocsMap = SmallDenseSet<Value *, 4>;
   using OrdersType = SmallVector<unsigned, 4>;
+  using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
 
   BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
           TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
@@ -3882,8 +3883,7 @@ class slpvectorizer::BoUpSLP {
 
   class TreeEntry {
   public:
-    using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
-    TreeEntry(VecTreeTy &Container) : Container(Container) {}
+    TreeEntry(BoUpSLP::VecTreeTy &Container) : Container(Container) {}
 
     /// \returns Common mask for reorder indices and reused scalars.
     SmallVector<int> getCommonMask() const {
@@ -4482,7 +4482,7 @@ class slpvectorizer::BoUpSLP {
 
   /// -- Vectorization State --
   /// Holds all of the tree entries.
-  TreeEntry::VecTreeTy VectorizableTree;
+  VecTreeTy VectorizableTree;
 
 #ifndef NDEBUG
   /// Debug printer.
@@ -6114,7 +6114,7 @@ template <> struct llvm::GraphTraits<BoUpSLP *> {
   /// NodeRef has to be a pointer per the GraphWriter.
   using NodeRef = TreeEntry *;
 
-  using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy;
+  using ContainerTy = BoUpSLP::VecTreeTy;
 
   /// Add the VectorizableTree to the index iterator to be able to return
   /// TreeEntry pointers.

From a5b38915fc3ef00c623707c107c0b5ecbaa9fce1 Mon Sep 17 00:00:00 2001
From: bababuck <buchner.ryan at gmail.com>
Date: Fri, 12 Dec 2025 13:51:16 -0800
Subject: [PATCH 03/19] [SLP][NFC] Move cost analysis/vectorization outside of
 vectorizeStoreChain()

vectorizeStoreChain() will be called for each chain, but the cost analysis
(and the actual vectorization) should only happen once, so hoist both into
the caller.
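
A simplified sketch of the resulting control flow in the caller
(illustrative only; see the hunk below for the exact code):

  // vectorizeStoreChain() now only builds the tree and reports whether
  // one was formed; the caller runs the cost model and commits.
  std::optional<bool> Res =
      vectorizeStoreChain(Slice, R, SliceStartIdx, MinVF, TreeSize);
  if (Res && *Res) {
    if (TreeSize && R.getTreeCost() < -SLPCostThreshold)
      R.vectorizeTree(); // Profitable: emit the vector code.
    else
      *Res = false;      // Empty tree or unprofitable: treat as skipped.
  }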
---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 46 +++++++++++--------
 1 file changed, 27 insertions(+), 19 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 0b011a9ef673b..852f72d8d276e 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -23184,25 +23184,7 @@ SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
   Size = R.getCanonicalGraphSize();
   if (S && S.getOpcode() == Instruction::Load)
     Size = 2; // cut off masked gather small trees
-  InstructionCost Cost = R.getTreeCost();
-
-  LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF << "\n");
-  if (Cost < -SLPCostThreshold) {
-    LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n");
-
-    using namespace ore;
-
-    R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
-                                        cast<StoreInst>(Chain[0]))
-                     << "Stores SLP vectorized with cost " << NV("Cost", Cost)
-                     << " and with tree size "
-                     << NV("TreeSize", R.getTreeSize()));
-
-    R.vectorizeTree();
-    return true;
-  }
-
-  return false;
+  return true;
 }
 
 /// Checks if the quadratic mean deviation is less than 90% of the mean size.
@@ -23493,6 +23475,32 @@ bool SLPVectorizerPass::vectorizeStores(
               unsigned TreeSize;
               std::optional<bool> Res =
                   vectorizeStoreChain(Slice, R, SliceStartIdx, MinVF, TreeSize);
+              if (Res && *Res) {
+                if (TreeSize) {
+                  InstructionCost Cost = R.getTreeCost();
+
+                  LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
+                                    << " for VF=" << VF << "\n");
+                  if (Cost < -SLPCostThreshold) {
+                    LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = "
+                                      << Cost << "\n");
+
+                    using namespace ore;
+
+                    R.getORE()->emit(
+                        OptimizationRemark(SV_NAME, "StoresVectorized",
+                                           cast<StoreInst>(Slice[0]))
+                        << "Stores SLP vectorized with cost "
+                        << NV("Cost", Cost) << " and with tree size "
+                        << NV("TreeSize", R.getTreeSize()));
+
+                    R.vectorizeTree();
+                  } else
+                    *Res = false;
+                } else
+                  *Res = false;
+              }
+
               if (!Res) {
                 // Update the range of non schedulable VFs for slices starting
                 // at SliceStartIdx.

From 6743f0d8385621a194e80b984e6d7cf58a62afd9 Mon Sep 17 00:00:00 2001
From: bababuck <buchner.ryan at gmail.com>
Date: Fri, 12 Dec 2025 11:30:04 -0800
Subject: [PATCH 04/19] [SLP][NFC] Make BoUpSLP::VectorizableTree hold multiple
 trees at once

Currently we only build one tree at a time, so all existing code simply
operates on the last tree in the vector.
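
An illustrative before/after of the container change (simplified; the
per-tree push happens in buildTree()):

  // Before: BoUpSLP owned a single tree.
  VecTreeTy VectorizableTree;
  const TreeEntry &Root = *VectorizableTree.front();

  // After: BoUpSLP owns a vector of trees; buildTree() starts a fresh
  // one with VectorizableTree.emplace_back(), and all existing code
  // refers to the most recent tree via back().
  SmallVector<VecTreeTy> VectorizableTree;
  const TreeEntry &Root = *VectorizableTree.back().front();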
---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 610 ++++++++++--------
 1 file changed, 328 insertions(+), 282 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 852f72d8d276e..ee4aa787f4e67 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2019,14 +2019,15 @@ class slpvectorizer::BoUpSLP {
 
   /// Return the scalars of the root node.
   ArrayRef<Value *> getRootNodeScalars() const {
-    assert(!VectorizableTree.empty() && "No graph to get the first node from");
-    return VectorizableTree.front()->Scalars;
+    assert(!VectorizableTree.back().empty() &&
+           "No graph to get the first node from");
+    return VectorizableTree.back().front()->Scalars;
   }
 
   /// Returns the type/is-signed info for the root node in the graph without
   /// casting.
   std::optional<std::pair<Type *, bool>> getRootNodeTypeWithNoCast() const {
-    const TreeEntry &Root = *VectorizableTree.front();
+    const TreeEntry &Root = *VectorizableTree.back().front();
     if (Root.State != TreeEntry::Vectorize || Root.isAltShuffle() ||
         !Root.Scalars.front()->getType()->isIntegerTy())
       return std::nullopt;
@@ -2045,24 +2046,28 @@ class slpvectorizer::BoUpSLP {
   /// Checks if the root graph node can be emitted with narrower bitwidth at
   /// codegen and returns it signedness, if so.
   bool isSignedMinBitwidthRootNode() const {
-    return MinBWs.at(VectorizableTree.front().get()).second;
+    return MinBWs.at(VectorizableTree.back().front().get()).second;
   }
 
   /// Returns reduction type after minbitdth analysis.
   FixedVectorType *getReductionType() const {
     if (ReductionBitWidth == 0 ||
-        !VectorizableTree.front()->Scalars.front()->getType()->isIntegerTy() ||
+        !VectorizableTree.back()
+             .front()
+             ->Scalars.front()
+             ->getType()
+             ->isIntegerTy() ||
         ReductionBitWidth >=
             DL->getTypeSizeInBits(
-                VectorizableTree.front()->Scalars.front()->getType()))
+                VectorizableTree.back().front()->Scalars.front()->getType()))
       return getWidenedType(
-          VectorizableTree.front()->Scalars.front()->getType(),
-          VectorizableTree.front()->getVectorFactor());
+          VectorizableTree.back().front()->Scalars.front()->getType(),
+          VectorizableTree.back().front()->getVectorFactor());
     return getWidenedType(
         IntegerType::get(
-            VectorizableTree.front()->Scalars.front()->getContext(),
+            VectorizableTree.back().front()->Scalars.front()->getContext(),
             ReductionBitWidth),
-        VectorizableTree.front()->getVectorFactor());
+        VectorizableTree.back().front()->getVectorFactor());
   }
 
   /// Builds external uses of the vectorized scalars, i.e. the list of
@@ -2108,7 +2113,7 @@ class slpvectorizer::BoUpSLP {
     TreeEntryToStridedPtrInfoMap.clear();
   }
 
-  unsigned getTreeSize() const { return VectorizableTree.size(); }
+  unsigned getTreeSize() const { return VectorizableTree.back().size(); }
 
   /// Returns the base graph size, before any transformations.
   unsigned getCanonicalGraphSize() const { return BaseGraphSize; }
@@ -4361,9 +4366,10 @@ class slpvectorizer::BoUpSLP {
         S.getOpcode() == Instruction::Load && UserTreeIdx.EdgeIdx == UINT_MAX &&
         !UserTreeIdx.UserTE)
       return nullptr;
-    VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
-    TreeEntry *Last = VectorizableTree.back().get();
-    Last->Idx = VectorizableTree.size() - 1;
+    VectorizableTree.back().push_back(
+        std::make_unique<TreeEntry>(VectorizableTree.back()));
+    TreeEntry *Last = VectorizableTree.back().back().get();
+    Last->Idx = VectorizableTree.back().size() - 1;
     Last->State = EntryState;
     if (UserTreeIdx.UserTE)
       OperandsToTreeEntry.try_emplace(
@@ -4482,13 +4488,13 @@ class slpvectorizer::BoUpSLP {
 
   /// -- Vectorization State --
   /// Holds all of the tree entries.
-  VecTreeTy VectorizableTree;
+  SmallVector<VecTreeTy> VectorizableTree;
 
 #ifndef NDEBUG
   /// Debug printer.
   LLVM_DUMP_METHOD void dumpVectorizableTree() const {
     for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
-      VectorizableTree[Id]->dump();
+      VectorizableTree.back()[Id]->dump();
       dbgs() << "\n";
     }
   }
@@ -6131,7 +6137,7 @@ template <> struct llvm::GraphTraits<BoUpSLP *> {
   };
 
   static NodeRef getEntryNode(BoUpSLP &R) {
-    return R.VectorizableTree[0].get();
+    return R.VectorizableTree.back()[0].get();
   }
 
   static ChildIteratorType child_begin(NodeRef N) {
@@ -6159,14 +6165,14 @@ template <> struct llvm::GraphTraits<BoUpSLP *> {
   };
 
   static nodes_iterator nodes_begin(BoUpSLP *R) {
-    return nodes_iterator(R->VectorizableTree.begin());
+    return nodes_iterator(R->VectorizableTree.back().begin());
   }
 
   static nodes_iterator nodes_end(BoUpSLP *R) {
-    return nodes_iterator(R->VectorizableTree.end());
+    return nodes_iterator(R->VectorizableTree.back().end());
   }
 
-  static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
+  static unsigned size(BoUpSLP *R) { return R->VectorizableTree.back().size(); }
 };
 
 template <>
@@ -8054,30 +8060,31 @@ bool BoUpSLP::isProfitableToReorder() const {
   constexpr unsigned TinyTree = 10;
   constexpr unsigned PhiOpsLimit = 12;
   constexpr unsigned GatherLoadsLimit = 2;
-  if (VectorizableTree.size() <= TinyTree)
+  if (VectorizableTree.back().size() <= TinyTree)
     return true;
-  if (VectorizableTree.front()->hasState() &&
-      !VectorizableTree.front()->isGather() &&
-      (VectorizableTree.front()->getOpcode() == Instruction::Store ||
-       VectorizableTree.front()->getOpcode() == Instruction::PHI ||
-       (VectorizableTree.front()->getVectorFactor() <= TinyVF &&
-        (VectorizableTree.front()->getOpcode() == Instruction::PtrToInt ||
-         VectorizableTree.front()->getOpcode() == Instruction::ICmp))) &&
-      VectorizableTree.front()->ReorderIndices.empty()) {
+  if (VectorizableTree.back().front()->hasState() &&
+      !VectorizableTree.back().front()->isGather() &&
+      (VectorizableTree.back().front()->getOpcode() == Instruction::Store ||
+       VectorizableTree.back().front()->getOpcode() == Instruction::PHI ||
+       (VectorizableTree.back().front()->getVectorFactor() <= TinyVF &&
+        (VectorizableTree.back().front()->getOpcode() ==
+             Instruction::PtrToInt ||
+         VectorizableTree.back().front()->getOpcode() == Instruction::ICmp))) &&
+      VectorizableTree.back().front()->ReorderIndices.empty()) {
     // Check if the tree has only single store and single (unordered) load node,
     // other nodes are phis or geps/binops, combined with phis, and/or single
     // gather load node
-    if (VectorizableTree.front()->hasState() &&
-        VectorizableTree.front()->getOpcode() == Instruction::PHI &&
-        VectorizableTree.front()->Scalars.size() == TinyVF &&
-        VectorizableTree.front()->getNumOperands() > PhiOpsLimit)
+    if (VectorizableTree.back().front()->hasState() &&
+        VectorizableTree.back().front()->getOpcode() == Instruction::PHI &&
+        VectorizableTree.back().front()->Scalars.size() == TinyVF &&
+        VectorizableTree.back().front()->getNumOperands() > PhiOpsLimit)
       return false;
     // Single node, which require reorder - skip.
-    if (VectorizableTree.front()->hasState() &&
-        VectorizableTree.front()->getOpcode() == Instruction::Store &&
-        VectorizableTree.front()->ReorderIndices.empty()) {
-      const unsigned ReorderedSplitsCnt =
-          count_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
+    if (VectorizableTree.back().front()->hasState() &&
+        VectorizableTree.back().front()->getOpcode() == Instruction::Store &&
+        VectorizableTree.back().front()->ReorderIndices.empty()) {
+      const unsigned ReorderedSplitsCnt = count_if(
+          VectorizableTree.back(), [&](const std::unique_ptr<TreeEntry> &TE) {
             return TE->State == TreeEntry::SplitVectorize &&
                    !TE->ReorderIndices.empty() && TE->UserTreeIndex.UserTE &&
                    TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
@@ -8085,7 +8092,8 @@ bool BoUpSLP::isProfitableToReorder() const {
           });
       if (ReorderedSplitsCnt <= 1 &&
           static_cast<unsigned>(count_if(
-              VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
+              VectorizableTree.back(),
+              [&](const std::unique_ptr<TreeEntry> &TE) {
                 return ((!TE->isGather() &&
                          (TE->ReorderIndices.empty() ||
                           (TE->UserTreeIndex.UserTE &&
@@ -8098,25 +8106,26 @@ bool BoUpSLP::isProfitableToReorder() const {
                           TE->getOpcode() == Instruction::Load ||
                           TE->getOpcode() == Instruction::ZExt ||
                           TE->getOpcode() == Instruction::SExt))) &&
-                       (VectorizableTree.front()->getVectorFactor() > TinyVF ||
+                       (VectorizableTree.back().front()->getVectorFactor() >
+                            TinyVF ||
                         !TE->isGather() || none_of(TE->Scalars, [&](Value *V) {
                           return !isConstant(V) && isVectorized(V);
                         }));
-              })) >= VectorizableTree.size() - ReorderedSplitsCnt)
+              })) >= VectorizableTree.back().size() - ReorderedSplitsCnt)
         return false;
     }
     bool HasPhis = false;
     bool HasLoad = true;
     unsigned GatherLoads = 0;
     for (const std::unique_ptr<TreeEntry> &TE :
-         ArrayRef(VectorizableTree).drop_front()) {
+         ArrayRef(VectorizableTree.back()).drop_front()) {
       if (TE->State == TreeEntry::SplitVectorize)
         continue;
       if (!TE->hasState()) {
         if (all_of(TE->Scalars, IsaPred<Constant, PHINode>) ||
             all_of(TE->Scalars, IsaPred<BinaryOperator, PHINode>))
           continue;
-        if (VectorizableTree.front()->Scalars.size() == TinyVF &&
+        if (VectorizableTree.back().front()->Scalars.size() == TinyVF &&
             any_of(TE->Scalars, IsaPred<PHINode, GEPOperator>))
           continue;
         return true;
@@ -8140,7 +8149,7 @@ bool BoUpSLP::isProfitableToReorder() const {
            static_cast<unsigned>(count_if(TE->Scalars, IsaPred<PHINode>)) <
                TE->Scalars.size() / 2))
         return true;
-      if (VectorizableTree.front()->Scalars.size() == TinyVF &&
+      if (VectorizableTree.back().front()->Scalars.size() == TinyVF &&
           TE->getNumOperands() > PhiOpsLimit)
         return false;
       HasPhis = true;
@@ -8194,8 +8203,8 @@ void BoUpSLP::reorderTopToBottom() {
   // Find all reorderable nodes with the given VF.
   // Currently the are vectorized stores,loads,extracts + some gathering of
   // extracts.
-  for_each(VectorizableTree, [&, &TTIRef = *TTI](
-                                 const std::unique_ptr<TreeEntry> &TE) {
+  for_each(VectorizableTree.back(), [&, &TTIRef = *TTI](
+                                        const std::unique_ptr<TreeEntry> &TE) {
     // Look for external users that will probably be vectorized.
     SmallVector<OrdersType, 1> ExternalUserReorderIndices =
         findExternalStoreUsersReorderIndices(TE.get());
@@ -8225,9 +8234,10 @@ void BoUpSLP::reorderTopToBottom() {
     }
 
     bool IgnoreReorder =
-        !UserIgnoreList && VectorizableTree.front()->hasState() &&
-        (VectorizableTree.front()->getOpcode() == Instruction::InsertElement ||
-         VectorizableTree.front()->getOpcode() == Instruction::Store);
+        !UserIgnoreList && VectorizableTree.back().front()->hasState() &&
+        (VectorizableTree.back().front()->getOpcode() ==
+             Instruction::InsertElement ||
+         VectorizableTree.back().front()->getOpcode() == Instruction::Store);
     if (std::optional<OrdersType> CurrentOrder =
             getReorderingData(*TE, /*TopToBottom=*/true, IgnoreReorder)) {
       // Do not include ordering for nodes used in the alt opcode vectorization,
@@ -8263,7 +8273,7 @@ void BoUpSLP::reorderTopToBottom() {
   });
 
   // Reorder the graph nodes according to their vectorization factor.
-  for (unsigned VF = VectorizableTree.front()->getVectorFactor();
+  for (unsigned VF = VectorizableTree.back().front()->getVectorFactor();
        !VFToOrderedEntries.empty() && VF > 1; VF -= 2 - (VF & 1U)) {
     auto It = VFToOrderedEntries.find(VF);
     if (It == VFToOrderedEntries.end())
@@ -8387,7 +8397,7 @@ void BoUpSLP::reorderTopToBottom() {
       return I < E ? static_cast<int>(I) : PoisonMaskElem;
     });
     // Do an actual reordering, if profitable.
-    for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
+    for (std::unique_ptr<TreeEntry> &TE : VectorizableTree.back()) {
       // Just do the reordering for the nodes with the given VF.
       if (TE->Scalars.size() != VF) {
         if (TE->ReuseShuffleIndices.size() == VF) {
@@ -8530,7 +8540,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
   // Currently the are vectorized loads,extracts without alternate operands +
   // some gathering of extracts.
   SmallPtrSet<const TreeEntry *, 4> NonVectorized;
-  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
+  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree.back()) {
     if (TE->State != TreeEntry::Vectorize &&
         TE->State != TreeEntry::StridedVectorize &&
         TE->State != TreeEntry::CompressVectorize &&
@@ -8594,7 +8604,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
         assert(Data.first->CombinedEntriesWithIndices.size() == 2 &&
                "Expected exactly 2 entries.");
         for (const auto &P : Data.first->CombinedEntriesWithIndices) {
-          TreeEntry &OpTE = *VectorizableTree[P.first];
+          TreeEntry &OpTE = *VectorizableTree.back()[P.first];
           OrdersType Order = OpTE.ReorderIndices;
           if (Order.empty() || !OpTE.ReuseShuffleIndices.empty()) {
             if (!OpTE.isGather() && OpTE.ReuseShuffleIndices.empty())
@@ -8872,7 +8882,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
                 TE->ReorderIndices.empty()) &&
                "Non-matching sizes of user/operand entries.");
         reorderOrder(TE->ReorderIndices, Mask);
-        if (IgnoreReorder && TE == VectorizableTree.front().get())
+        if (IgnoreReorder && TE == VectorizableTree.back().front().get())
           IgnoreReorder = false;
       }
       // For gathers just need to reorder its scalars.
@@ -8919,9 +8929,10 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
     }
   }
   // If the reordering is unnecessary, just remove the reorder.
-  if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
-      VectorizableTree.front()->ReuseShuffleIndices.empty())
-    VectorizableTree.front()->ReorderIndices.clear();
+  if (IgnoreReorder &&
+      !VectorizableTree.back().front()->ReorderIndices.empty() &&
+      VectorizableTree.back().front()->ReuseShuffleIndices.empty())
+    VectorizableTree.back().front()->ReorderIndices.clear();
 }
 
 Instruction *BoUpSLP::getRootEntryInstruction(const TreeEntry &Entry) const {
@@ -8939,7 +8950,7 @@ void BoUpSLP::buildExternalUses(
   const size_t NumVectScalars = ScalarToTreeEntries.size() + 1;
   DenseMap<Value *, unsigned> ScalarToExtUses;
   // Collect the values that we need to extract from the tree.
-  for (auto &TEPtr : VectorizableTree) {
+  for (auto &TEPtr : VectorizableTree.back()) {
     TreeEntry *Entry = TEPtr.get();
 
     // No need to handle users of gathered values.
@@ -9185,6 +9196,7 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
   deleteTree();
   assert(TreeEntryToStridedPtrInfoMap.empty() &&
          "TreeEntryToStridedPtrInfoMap is not cleared");
+  VectorizableTree.emplace_back();
   UserIgnoreList = &UserIgnoreLst;
   if (!allSameType(Roots))
     return;
@@ -9195,6 +9207,7 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
   deleteTree();
   assert(TreeEntryToStridedPtrInfoMap.empty() &&
          "TreeEntryToStridedPtrInfoMap is not cleared");
+  VectorizableTree.emplace_back();
   if (!allSameType(Roots))
     return;
   buildTreeRec(Roots, 0, EdgeInfo());
@@ -9359,12 +9372,12 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
         std::tuple<BasicBlock *, Value *, Type *>,
         SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>, 8>
         &GatheredLoads) {
-  GatheredLoadsEntriesFirst = VectorizableTree.size();
+  GatheredLoadsEntriesFirst = VectorizableTree.back().size();
 
   SmallVector<SmallPtrSet<const Value *, 4>> LoadSetsToVectorize(
       LoadEntriesToVectorize.size());
   for (auto [Idx, Set] : zip(LoadEntriesToVectorize, LoadSetsToVectorize))
-    Set.insert_range(VectorizableTree[Idx]->Scalars);
+    Set.insert_range(VectorizableTree.back()[Idx]->Scalars);
 
   // Sort loads by distance.
   auto LoadSorter = [](const std::pair<LoadInst *, int64_t> &L1,
@@ -9700,7 +9713,7 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
                            if (It == Slice.end())
                              return false;
                            const TreeEntry &TE =
-                               *VectorizableTree[std::get<0>(P)];
+                               *VectorizableTree.back()[std::get<0>(P)];
                            ArrayRef<Value *> VL = TE.Scalars;
                            OrdersType Order;
                            SmallVector<Value *> PointerOps;
@@ -9746,14 +9759,14 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
                 if (any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
                            [&](const auto &P) {
                              return !SubSlice.equals(
-                                        VectorizableTree[std::get<0>(P)]
+                                        VectorizableTree.back()[std::get<0>(P)]
                                             ->Scalars) &&
                                     set_is_subset(SubSlice, std::get<1>(P));
                            }))
                   continue;
-                unsigned Sz = VectorizableTree.size();
+                unsigned Sz = VectorizableTree.back().size();
                 buildTreeRec(SubSlice, 0, EdgeInfo(), InterleaveFactor);
-                if (Sz == VectorizableTree.size()) {
+                if (Sz == VectorizableTree.back().size()) {
                   IsVectorized = false;
                   // Try non-interleaved vectorization with smaller vector
                   // factor.
@@ -9797,7 +9810,7 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
   }
   // Try to vectorize postponed load entries, previously marked as gathered.
   for (unsigned Idx : LoadEntriesToVectorize) {
-    const TreeEntry &E = *VectorizableTree[Idx];
+    const TreeEntry &E = *VectorizableTree.back()[Idx];
     SmallVector<Value *> GatheredScalars(E.Scalars.begin(), E.Scalars.end());
     // Avoid reordering, if possible.
     if (!E.ReorderIndices.empty()) {
@@ -9812,7 +9825,7 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
   // If no new entries created, consider it as no gathered loads entries must be
   // handled.
   if (static_cast<unsigned>(*GatheredLoadsEntriesFirst) ==
-      VectorizableTree.size())
+      VectorizableTree.back().size())
     GatheredLoadsEntriesFirst.reset();
 }
 
@@ -10191,25 +10204,25 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
     case LoadsState::Vectorize:
       return TreeEntry::Vectorize;
     case LoadsState::CompressVectorize:
-      if (!IsGraphTransformMode && !VectorizableTree.empty()) {
+      if (!IsGraphTransformMode && !VectorizableTree.back().empty()) {
         // Delay slow vectorized nodes for better vectorization attempts.
-        LoadEntriesToVectorize.insert(VectorizableTree.size());
+        LoadEntriesToVectorize.insert(VectorizableTree.back().size());
         return TreeEntry::NeedToGather;
       }
       return IsGatheredNode() ? TreeEntry::NeedToGather
                               : TreeEntry::CompressVectorize;
     case LoadsState::ScatterVectorize:
-      if (!IsGraphTransformMode && !VectorizableTree.empty()) {
+      if (!IsGraphTransformMode && !VectorizableTree.back().empty()) {
         // Delay slow vectorized nodes for better vectorization attempts.
-        LoadEntriesToVectorize.insert(VectorizableTree.size());
+        LoadEntriesToVectorize.insert(VectorizableTree.back().size());
         return TreeEntry::NeedToGather;
       }
       return IsGatheredNode() ? TreeEntry::NeedToGather
                               : TreeEntry::ScatterVectorize;
     case LoadsState::StridedVectorize:
-      if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
+      if (!IsGraphTransformMode && VectorizableTree.back().size() > 1) {
         // Delay slow vectorized nodes for better vectorization attempts.
-        LoadEntriesToVectorize.insert(VectorizableTree.size());
+        LoadEntriesToVectorize.insert(VectorizableTree.back().size());
         return TreeEntry::NeedToGather;
       }
       return IsGatheredNode() ? TreeEntry::NeedToGather
@@ -10843,8 +10856,10 @@ bool BoUpSLP::canBuildSplitNode(ArrayRef<Value *> VL,
         TTI->getArithmeticInstrCost(Opcode1, Op2VecTy, Kind);
     InstructionCost NewCost =
         NewVecOpsCost + InsertCost +
-        (!VectorizableTree.empty() && VectorizableTree.front()->hasState() &&
-                 VectorizableTree.front()->getOpcode() == Instruction::Store
+        (!VectorizableTree.back().empty() &&
+                 VectorizableTree.back().front()->hasState() &&
+                 VectorizableTree.back().front()->getOpcode() ==
+                     Instruction::Store
              ? NewShuffleCost
              : 0);
     // If not profitable to split - exit.
@@ -11446,7 +11461,7 @@ BoUpSLP::ScalarsVectorizationLegality BoUpSLP::getScalarsVectorizationLegality(
   auto NotProfitableForVectorization = [&S, this, Depth](ArrayRef<Value *> VL) {
     if (!S || !S.isAltShuffle() || VL.size() > 2)
       return false;
-    if (VectorizableTree.size() < MinTreeSize)
+    if (VectorizableTree.back().size() < MinTreeSize)
       return false;
     if (Depth >= RecursionMaxDepth - 1)
       return true;
@@ -11606,12 +11621,12 @@ void BoUpSLP::buildTreeRec(ArrayRef<Value *> VLRef, unsigned Depth,
       if (S && (isa<LoadInst>(S.getMainOp()) ||
                 getSameValuesTreeEntry(S.getMainOp(), Op, /*SameVF=*/true))) {
         // Build gather node for loads, they will be gathered later.
-        TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
-                                                    Idx == 0 ? 0 : Op1.size());
+        TE->CombinedEntriesWithIndices.emplace_back(
+            VectorizableTree.back().size(), Idx == 0 ? 0 : Op1.size());
         (void)newTreeEntry(Op, TreeEntry::NeedToGather, Invalid, S, {TE, Idx});
       } else {
-        TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
-                                                    Idx == 0 ? 0 : Op1.size());
+        TE->CombinedEntriesWithIndices.emplace_back(
+            VectorizableTree.back().size(), Idx == 0 ? 0 : Op1.size());
         buildTreeRec(Op, Depth, {TE, Idx});
       }
     };
@@ -12811,7 +12826,7 @@ void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
     return;
 
   if (any_of(seq<unsigned>(TE.Idx), [&](unsigned Idx) {
-        return VectorizableTree[Idx]->isSame(TE.Scalars);
+        return VectorizableTree.back()[Idx]->isSame(TE.Scalars);
       }))
     return;
 
@@ -13045,7 +13060,7 @@ static InstructionCost canConvertToFMA(ArrayRef<Value *> VL,
 
 void BoUpSLP::transformNodes() {
   constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
-  BaseGraphSize = VectorizableTree.size();
+  BaseGraphSize = VectorizableTree.back().size();
   // Turn graph transforming mode on and off, when done.
   class GraphTransformModeRAAI {
     bool &SavedIsGraphTransformMode;
@@ -13083,7 +13098,7 @@ void BoUpSLP::transformNodes() {
 
   // Try to reorder gather nodes for better vectorization opportunities.
   for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
-    TreeEntry &E = *VectorizableTree[Idx];
+    TreeEntry &E = *VectorizableTree.back()[Idx];
     if (E.isGather())
       reorderGatherNode(E);
   }
@@ -13092,11 +13107,12 @@ void BoUpSLP::transformNodes() {
   // gathered nodes each having less than 16 elements.
   constexpr unsigned VFLimit = 16;
   bool ForceLoadGather =
-      count_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
-        return TE->isGather() && TE->hasState() &&
-               TE->getOpcode() == Instruction::Load &&
-               TE->getVectorFactor() < VFLimit;
-      }) == 2;
+      count_if(VectorizableTree.back(),
+               [&](const std::unique_ptr<TreeEntry> &TE) {
+                 return TE->isGather() && TE->hasState() &&
+                        TE->getOpcode() == Instruction::Load &&
+                        TE->getVectorFactor() < VFLimit;
+               }) == 2;
 
   // Checks if the scalars are used in other node.
   auto AreReusedScalars = [&](const TreeEntry *TE, ArrayRef<Value *> VL,
@@ -13153,7 +13169,7 @@ void BoUpSLP::transformNodes() {
   };
   // The tree may grow here, so iterate over nodes, built before.
   for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
-    TreeEntry &E = *VectorizableTree[Idx];
+    TreeEntry &E = *VectorizableTree.back()[Idx];
     if (E.isGather()) {
       ArrayRef<Value *> VL = E.Scalars;
       const unsigned Sz = getVectorElementSize(VL.front());
@@ -13287,19 +13303,19 @@ void BoUpSLP::transformNodes() {
             // If any instruction is vectorized already - do not try again.
             SameTE = getSameValuesTreeEntry(*It, Slice);
           }
-          unsigned PrevSize = VectorizableTree.size();
+          unsigned PrevSize = VectorizableTree.back().size();
           [[maybe_unused]] unsigned PrevEntriesSize =
               LoadEntriesToVectorize.size();
           buildTreeRec(Slice, 0, EdgeInfo(&E, UINT_MAX));
-          if (PrevSize + 1 == VectorizableTree.size() && !SameTE &&
-              VectorizableTree[PrevSize]->isGather() &&
-              VectorizableTree[PrevSize]->hasState() &&
-              VectorizableTree[PrevSize]->getOpcode() !=
+          if (PrevSize + 1 == VectorizableTree.back().size() && !SameTE &&
+              VectorizableTree.back()[PrevSize]->isGather() &&
+              VectorizableTree.back()[PrevSize]->hasState() &&
+              VectorizableTree.back()[PrevSize]->getOpcode() !=
                   Instruction::ExtractElement &&
               !isSplat(Slice)) {
             if (UserIgnoreList && E.Idx == 0 && VF == 2)
               analyzedReductionVals(Slice);
-            VectorizableTree.pop_back();
+            VectorizableTree.back().pop_back();
             assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
                    "LoadEntriesToVectorize expected to remain the same");
             continue;
@@ -13452,21 +13468,23 @@ void BoUpSLP::transformNodes() {
 
   if (LoadEntriesToVectorize.empty()) {
     // Single load node - exit.
-    if (VectorizableTree.size() <= 1 && VectorizableTree.front()->hasState() &&
-        VectorizableTree.front()->getOpcode() == Instruction::Load)
+    if (VectorizableTree.back().size() <= 1 &&
+        VectorizableTree.back().front()->hasState() &&
+        VectorizableTree.back().front()->getOpcode() == Instruction::Load)
       return;
     // Small graph with small VF - exit.
     constexpr unsigned SmallTree = 3;
     constexpr unsigned SmallVF = 2;
-    if ((VectorizableTree.size() <= SmallTree &&
-         VectorizableTree.front()->Scalars.size() == SmallVF) ||
-        (VectorizableTree.size() <= 2 && UserIgnoreList))
+    if ((VectorizableTree.back().size() <= SmallTree &&
+         VectorizableTree.back().front()->Scalars.size() == SmallVF) ||
+        (VectorizableTree.back().size() <= 2 && UserIgnoreList))
       return;
 
-    if (VectorizableTree.front()->isNonPowOf2Vec() &&
+    if (VectorizableTree.back().front()->isNonPowOf2Vec() &&
         getCanonicalGraphSize() != getTreeSize() && UserIgnoreList &&
         getCanonicalGraphSize() <= SmallTree &&
-        count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
+        count_if(ArrayRef(VectorizableTree.back())
+                     .drop_front(getCanonicalGraphSize()),
                  [](const std::unique_ptr<TreeEntry> &TE) {
                    return TE->isGather() && TE->hasState() &&
                           TE->getOpcode() == Instruction::Load &&
@@ -13481,7 +13499,7 @@ void BoUpSLP::transformNodes() {
                  SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>, 8>
       GatheredLoads;
 
-  for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
+  for (std::unique_ptr<TreeEntry> &TE : VectorizableTree.back()) {
     TreeEntry &E = *TE;
     if (E.isGather() &&
         ((E.hasState() && E.getOpcode() == Instruction::Load) ||
@@ -14072,7 +14090,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
     // Check if it can be considered reused if same extractelements were
     // vectorized already.
     bool PrevNodeFound = any_of(
-        ArrayRef(R.VectorizableTree).take_front(E->Idx),
+        ArrayRef(R.VectorizableTree.back()).take_front(E->Idx),
         [&](const std::unique_ptr<TreeEntry> &TE) {
           return ((TE->hasState() && !TE->isAltShuffle() &&
                    TE->getOpcode() == Instruction::ExtractElement) ||
@@ -14508,16 +14526,16 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
       VectorCost = ::getShuffleCost(
           *TTI, TTI::SK_InsertSubvector, FinalVecTy, {}, CostKind,
           E->CombinedEntriesWithIndices.back().second,
-          getWidenedType(
-              ScalarTy,
-              VectorizableTree[E->CombinedEntriesWithIndices.back().first]
-                  ->getVectorFactor()));
+          getWidenedType(ScalarTy,
+                         VectorizableTree
+                             .back()[E->CombinedEntriesWithIndices.back().first]
+                             ->getVectorFactor()));
     } else {
-      unsigned CommonVF =
-          std::max(VectorizableTree[E->CombinedEntriesWithIndices.front().first]
-                       ->getVectorFactor(),
-                   VectorizableTree[E->CombinedEntriesWithIndices.back().first]
-                       ->getVectorFactor());
+      unsigned CommonVF = std::max(
+          VectorizableTree.back()[E->CombinedEntriesWithIndices.front().first]
+              ->getVectorFactor(),
+          VectorizableTree.back()[E->CombinedEntriesWithIndices.back().first]
+              ->getVectorFactor());
       VectorCost = ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc,
                                     getWidenedType(ScalarTy, CommonVF),
                                     E->getSplitMask(), CostKind);
@@ -15323,7 +15341,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
     // Try to find the previous shuffle node with the same operands and same
     // main/alternate ops.
     auto TryFindNodeWithEqualOperands = [=]() {
-      for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
+      for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree.back()) {
         if (TE.get() == E)
           break;
         if (TE->hasState() && TE->isAltShuffle() &&
@@ -15482,7 +15500,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
 
 bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
   LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
-                    << VectorizableTree.size() << " is fully vectorizable .\n");
+                    << VectorizableTree.back().size()
+                    << " is fully vectorizable .\n");
 
   auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {
     SmallVector<int> Mask;
@@ -15501,34 +15520,34 @@ bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
   };
 
   // We only handle trees of heights 1 and 2.
-  if (VectorizableTree.size() == 1 &&
-      (VectorizableTree[0]->State == TreeEntry::Vectorize ||
-       VectorizableTree[0]->State == TreeEntry::StridedVectorize ||
-       VectorizableTree[0]->State == TreeEntry::CompressVectorize ||
+  if (VectorizableTree.back().size() == 1 &&
+      (VectorizableTree.back()[0]->State == TreeEntry::Vectorize ||
+       VectorizableTree.back()[0]->State == TreeEntry::StridedVectorize ||
+       VectorizableTree.back()[0]->State == TreeEntry::CompressVectorize ||
        (ForReduction &&
-        AreVectorizableGathers(VectorizableTree[0].get(),
-                               VectorizableTree[0]->Scalars.size()) &&
-        VectorizableTree[0]->getVectorFactor() > 2)))
+        AreVectorizableGathers(VectorizableTree.back()[0].get(),
+                               VectorizableTree.back()[0]->Scalars.size()) &&
+        VectorizableTree.back()[0]->getVectorFactor() > 2)))
     return true;
 
-  if (VectorizableTree.size() != 2)
+  if (VectorizableTree.back().size() != 2)
     return false;
 
   // Handle splat and all-constants stores. Also try to vectorize tiny trees
   // with the second gather nodes if they have less scalar operands rather than
   // the initial tree element (may be profitable to shuffle the second gather)
   // or they are extractelements, which form shuffle.
-  if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
-      AreVectorizableGathers(VectorizableTree[1].get(),
-                             VectorizableTree[0]->Scalars.size()))
+  if (VectorizableTree.back()[0]->State == TreeEntry::Vectorize &&
+      AreVectorizableGathers(VectorizableTree.back()[1].get(),
+                             VectorizableTree.back()[0]->Scalars.size()))
     return true;
 
   // Gathering cost would be too much for tiny trees.
-  if (VectorizableTree[0]->isGather() ||
-      (VectorizableTree[1]->isGather() &&
-       VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
-       VectorizableTree[0]->State != TreeEntry::StridedVectorize &&
-       VectorizableTree[0]->State != TreeEntry::CompressVectorize))
+  if (VectorizableTree.back()[0]->isGather() ||
+      (VectorizableTree.back()[1]->isGather() &&
+       VectorizableTree.back()[0]->State != TreeEntry::ScatterVectorize &&
+       VectorizableTree.back()[0]->State != TreeEntry::StridedVectorize &&
+       VectorizableTree.back()[0]->State != TreeEntry::CompressVectorize))
     return false;
 
   return true;
@@ -15578,8 +15597,8 @@ bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const {
   if (RdxKind != RecurKind::Or)
     return false;
 
-  unsigned NumElts = VectorizableTree[0]->Scalars.size();
-  Value *FirstReduced = VectorizableTree[0]->Scalars[0];
+  unsigned NumElts = VectorizableTree.back()[0]->Scalars.size();
+  Value *FirstReduced = VectorizableTree.back()[0]->Scalars[0];
   return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI,
                                     /* MatchOr */ false);
 }
@@ -15602,19 +15621,19 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
     return true;
 
   // Graph is empty - do nothing.
-  if (VectorizableTree.empty()) {
+  if (VectorizableTree.back().empty()) {
     assert(ExternalUses.empty() && "We shouldn't have any external users");
 
     return true;
   }
 
   // No need to vectorize inserts of gathered values.
-  if (VectorizableTree.size() == 2 &&
-      isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
-      VectorizableTree[1]->isGather() &&
-      (VectorizableTree[1]->getVectorFactor() <= 2 ||
-       !(isSplat(VectorizableTree[1]->Scalars) ||
-         allConstant(VectorizableTree[1]->Scalars))))
+  if (VectorizableTree.back().size() == 2 &&
+      isa<InsertElementInst>(VectorizableTree.back()[0]->Scalars[0]) &&
+      VectorizableTree.back()[1]->isGather() &&
+      (VectorizableTree.back()[1]->getVectorFactor() <= 2 ||
+       !(isSplat(VectorizableTree.back()[1]->Scalars) ||
+         allConstant(VectorizableTree.back()[1]->Scalars))))
     return true;
 
   // If the graph includes only PHI nodes and gathers, it is defnitely not
@@ -15623,8 +15642,9 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
   // gathers/buildvectors.
   constexpr int Limit = 4;
   if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
-      !VectorizableTree.empty() &&
-      all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
+      !VectorizableTree.back().empty() &&
+      all_of(VectorizableTree.back(), [&](const std::unique_ptr<TreeEntry>
+                                              &TE) {
         return (TE->isGather() &&
                 (!TE->hasState() ||
                  TE->getOpcode() != Instruction::ExtractElement) &&
@@ -15636,8 +15656,8 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
   // Do not vectorize small tree of phis only, if all vector phis are also
   // gathered.
   if (!ForReduction && SLPCostThreshold.getNumOccurrences() &&
-      VectorizableTree.size() <= Limit &&
-      all_of(VectorizableTree,
+      VectorizableTree.back().size() <= Limit &&
+      all_of(VectorizableTree.back(),
              [&](const std::unique_ptr<TreeEntry> &TE) {
                return (TE->isGather() &&
                        (!TE->hasState() ||
@@ -15651,10 +15671,11 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
                            return isa<PoisonValue>(V) || MustGather.contains(V);
                          }))));
              }) &&
-      any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
-        return TE->State == TreeEntry::Vectorize &&
-               TE->getOpcode() == Instruction::PHI;
-      }))
+      any_of(VectorizableTree.back(),
+             [&](const std::unique_ptr<TreeEntry> &TE) {
+               return TE->State == TreeEntry::Vectorize &&
+                      TE->getOpcode() == Instruction::PHI;
+             }))
     return true;
 
   // If the tree contains only phis, buildvectors, split nodes and
@@ -15663,7 +15684,7 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
   unsigned NumGathers = 0;
   constexpr int LimitTreeSize = 36;
   if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
-      all_of(VectorizableTree,
+      all_of(VectorizableTree.back(),
              [&](const std::unique_ptr<TreeEntry> &TE) {
                if (!TE->isGather() && TE->hasState() &&
                    (TE->getOpcode() == Instruction::Load ||
@@ -15676,7 +15697,7 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
                return TE->State == TreeEntry::SplitVectorize ||
                       (TE->Idx == 0 && TE->Scalars.size() == 2 &&
                        TE->hasState() && TE->getOpcode() == Instruction::ICmp &&
-                       VectorizableTree.size() > LimitTreeSize) ||
+                       VectorizableTree.back().size() > LimitTreeSize) ||
                       (TE->isGather() &&
                        none_of(TE->Scalars, IsaPred<ExtractElementInst>)) ||
                       (TE->hasState() &&
@@ -15690,7 +15711,8 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
                          TE->Scalars.size() == 2)));
              }) &&
       (StoreLoadNodes.empty() ||
-       (VectorizableTree.size() > LimitTreeSize * StoreLoadNodes.size() &&
+       (VectorizableTree.back().size() >
+            LimitTreeSize * StoreLoadNodes.size() &&
         (NumGathers > 0 || none_of(StoreLoadNodes, [&](const TreeEntry *TE) {
            return TE->getOpcode() == Instruction::Store ||
                   all_of(TE->Scalars, [&](Value *V) {
@@ -15703,9 +15725,9 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
   // If the tree contains only buildvector, 2 non-buildvectors (with root user
   // tree node) and other buildvectors, we can skip it.
   if (!ForReduction && SLPCostThreshold.getNumOccurrences() &&
-      VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
-      VectorizableTree.size() >= Limit &&
-      count_if(ArrayRef(VectorizableTree).drop_front(),
+      VectorizableTree.back().front()->State == TreeEntry::SplitVectorize &&
+      VectorizableTree.back().size() >= Limit &&
+      count_if(ArrayRef(VectorizableTree.back()).drop_front(),
                [&](const std::unique_ptr<TreeEntry> &TE) {
                  return !TE->isGather() && TE->UserTreeIndex.UserTE &&
                         TE->UserTreeIndex.UserTE->Idx == 0;
@@ -15715,19 +15737,20 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
   // If the tree contains only vectorization of the phi node from the
   // buildvector - skip it.
   if (!ForReduction && SLPCostThreshold.getNumOccurrences() &&
-      VectorizableTree.size() > 2 &&
-      VectorizableTree.front()->State == TreeEntry::Vectorize &&
-      VectorizableTree.front()->getOpcode() == Instruction::InsertElement &&
-      VectorizableTree[1]->State == TreeEntry::Vectorize &&
-      VectorizableTree[1]->getOpcode() == Instruction::PHI &&
+      VectorizableTree.back().size() > 2 &&
+      VectorizableTree.back().front()->State == TreeEntry::Vectorize &&
+      VectorizableTree.back().front()->getOpcode() ==
+          Instruction::InsertElement &&
+      VectorizableTree.back()[1]->State == TreeEntry::Vectorize &&
+      VectorizableTree.back()[1]->getOpcode() == Instruction::PHI &&
       all_of(
-          ArrayRef(VectorizableTree).drop_front(2),
+          ArrayRef(VectorizableTree.back()).drop_front(2),
           [&](const std::unique_ptr<TreeEntry> &TE) { return TE->isGather(); }))
     return true;
 
   // We can vectorize the tree if its size is greater than or equal to the
   // minimum size specified by the MinTreeSize command line option.
-  if (VectorizableTree.size() >= MinTreeSize)
+  if (VectorizableTree.back().size() >= MinTreeSize)
     return false;
 
   // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
@@ -15738,13 +15761,16 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
   // Check if any of the gather node forms an insertelement buildvector
   // somewhere.
   bool IsAllowedSingleBVNode =
-      VectorizableTree.size() > 1 ||
-      (VectorizableTree.size() == 1 && VectorizableTree.front()->hasState() &&
-       !VectorizableTree.front()->isAltShuffle() &&
-       VectorizableTree.front()->getOpcode() != Instruction::PHI &&
-       VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
-       allSameBlock(VectorizableTree.front()->Scalars));
-  if (any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
+      VectorizableTree.back().size() > 1 ||
+      (VectorizableTree.back().size() == 1 &&
+       VectorizableTree.back().front()->hasState() &&
+       !VectorizableTree.back().front()->isAltShuffle() &&
+       VectorizableTree.back().front()->getOpcode() != Instruction::PHI &&
+       VectorizableTree.back().front()->getOpcode() !=
+           Instruction::GetElementPtr &&
+       allSameBlock(VectorizableTree.back().front()->Scalars));
+  if (any_of(VectorizableTree.back(), [&](const std::unique_ptr<TreeEntry>
+                                              &TE) {
         return TE->isGather() && all_of(TE->Scalars, [&](Value *V) {
                  return isa<ExtractElementInst, Constant>(V) ||
                         (IsAllowedSingleBVNode &&
@@ -15754,16 +15780,21 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
       }))
     return false;
 
-  if (VectorizableTree.back()->isGather() &&
-      VectorizableTree.back()->hasState() &&
-      VectorizableTree.back()->isAltShuffle() &&
-      VectorizableTree.back()->getVectorFactor() > 2 &&
-      allSameBlock(VectorizableTree.back()->Scalars) &&
-      !VectorizableTree.back()->Scalars.front()->getType()->isVectorTy() &&
+  if (VectorizableTree.back().back()->isGather() &&
+      VectorizableTree.back().back()->hasState() &&
+      VectorizableTree.back().back()->isAltShuffle() &&
+      VectorizableTree.back().back()->getVectorFactor() > 2 &&
+      allSameBlock(VectorizableTree.back().back()->Scalars) &&
+      !VectorizableTree.back()
+           .back()
+           ->Scalars.front()
+           ->getType()
+           ->isVectorTy() &&
       TTI->getScalarizationOverhead(
-          getWidenedType(VectorizableTree.back()->Scalars.front()->getType(),
-                         VectorizableTree.back()->getVectorFactor()),
-          APInt::getAllOnes(VectorizableTree.back()->getVectorFactor()),
+          getWidenedType(
+              VectorizableTree.back().back()->Scalars.front()->getType(),
+              VectorizableTree.back().back()->getVectorFactor()),
+          APInt::getAllOnes(VectorizableTree.back().back()->getVectorFactor()),
           /*Insert=*/true, /*Extract=*/false,
           TTI::TCK_RecipThroughput) > -SLPCostThreshold)
     return false;
@@ -15776,9 +15807,10 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
 bool BoUpSLP::isTreeNotExtendable() const {
   if (getCanonicalGraphSize() != getTreeSize()) {
     constexpr unsigned SmallTree = 3;
-    if (VectorizableTree.front()->isNonPowOf2Vec() &&
+    if (VectorizableTree.back().front()->isNonPowOf2Vec() &&
         getCanonicalGraphSize() <= SmallTree &&
-        count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
+        count_if(ArrayRef(VectorizableTree.back())
+                     .drop_front(getCanonicalGraphSize()),
                  [](const std::unique_ptr<TreeEntry> &TE) {
                    return TE->isGather() && TE->hasState() &&
                           TE->getOpcode() == Instruction::Load &&
@@ -15789,7 +15821,7 @@ bool BoUpSLP::isTreeNotExtendable() const {
   }
   bool Res = false;
   for (unsigned Idx : seq<unsigned>(getTreeSize())) {
-    TreeEntry &E = *VectorizableTree[Idx];
+    TreeEntry &E = *VectorizableTree.back()[Idx];
     if (E.State == TreeEntry::SplitVectorize)
       return false;
     if (!E.isGather())
@@ -15813,7 +15845,7 @@ InstructionCost BoUpSLP::getSpillCost() {
   // query TTI to see if there is a cost to keeping values live over it
   // (for example, if spills and fills are required).
 
-  const TreeEntry *Root = VectorizableTree.front().get();
+  const TreeEntry *Root = VectorizableTree.back().front().get();
   if (Root->isGather())
     return 0;
 
@@ -15822,7 +15854,7 @@ InstructionCost BoUpSLP::getSpillCost() {
       EntriesToOperands;
   SmallDenseMap<const TreeEntry *, Instruction *> EntriesToLastInstruction;
   SmallPtrSet<const Instruction *, 8> LastInstructions;
-  for (const auto &TEPtr : VectorizableTree) {
+  for (const auto &TEPtr : VectorizableTree.back()) {
     if (!TEPtr->isGather()) {
       Instruction *LastInst = &getLastInstructionInBundle(TEPtr.get());
       EntriesToLastInstruction.try_emplace(TEPtr.get(), LastInst);
@@ -15853,7 +15885,7 @@ InstructionCost BoUpSLP::getSpillCost() {
       CheckedInstructions;
   unsigned Budget = 0;
   const unsigned BudgetLimit =
-      ScheduleRegionSizeBudget / VectorizableTree.size();
+      ScheduleRegionSizeBudget / VectorizableTree.back().size();
   auto CheckForNonVecCallsInSameBlock = [&](Instruction *First,
                                             const Instruction *Last) {
     assert(First->getParent() == Last->getParent() &&
@@ -16204,11 +16236,11 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
                                      InstructionCost ReductionCost) {
   InstructionCost Cost = ReductionCost;
   LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
-                    << VectorizableTree.size() << ".\n");
+                    << VectorizableTree.back().size() << ".\n");
 
   SmallPtrSet<Value *, 4> CheckedExtracts;
-  for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
-    TreeEntry &TE = *VectorizableTree[I];
+  for (unsigned I = 0, E = VectorizableTree.back().size(); I < E; ++I) {
+    TreeEntry &TE = *VectorizableTree.back()[I];
     // No need to count the cost for combined entries, they are combined and
     // just skip their cost.
     if (TE.State == TreeEntry::CombinedVectorize) {
@@ -16464,21 +16496,23 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
         // block as the root phis, currently vectorized. It allows to keep
         // better ordering info of PHIs, being vectorized currently.
         bool IsProfitablePHIUser =
-            (KeepScalar || (ScalarCost - ExtraCost <= TTI::TCC_Basic &&
-                            VectorizableTree.front()->Scalars.size() > 2)) &&
-            VectorizableTree.front()->hasState() &&
-            VectorizableTree.front()->getOpcode() == Instruction::PHI &&
+            (KeepScalar ||
+             (ScalarCost - ExtraCost <= TTI::TCC_Basic &&
+              VectorizableTree.back().front()->Scalars.size() > 2)) &&
+            VectorizableTree.back().front()->hasState() &&
+            VectorizableTree.back().front()->getOpcode() == Instruction::PHI &&
             !Inst->hasNUsesOrMore(UsesLimit) &&
-            none_of(Inst->users(),
-                    [&](User *U) {
-                      auto *PHIUser = dyn_cast<PHINode>(U);
-                      return (!PHIUser ||
-                              PHIUser->getParent() !=
-                                  cast<Instruction>(
-                                      VectorizableTree.front()->getMainOp())
-                                      ->getParent()) &&
-                             !isVectorized(U);
-                    }) &&
+            none_of(
+                Inst->users(),
+                [&](User *U) {
+                  auto *PHIUser = dyn_cast<PHINode>(U);
+                  return (!PHIUser ||
+                          PHIUser->getParent() !=
+                              cast<Instruction>(
+                                  VectorizableTree.back().front()->getMainOp())
+                                  ->getParent()) &&
+                         !isVectorized(U);
+                }) &&
             count_if(Entry->Scalars, [&](Value *V) {
               return ValueToExtUses->contains(V);
             }) <= 2;
@@ -16546,7 +16580,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
   }
   // Add reduced value cost, if resized.
   if (!VectorizedVals.empty()) {
-    const TreeEntry &Root = *VectorizableTree.front();
+    const TreeEntry &Root = *VectorizableTree.back().front();
     auto BWIt = MinBWs.find(&Root);
     if (BWIt != MinBWs.end()) {
       Type *DstTy = Root.Scalars.front()->getType();
@@ -16679,7 +16713,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
   // Add the cost for reduced value resize (if required).
   if (ReductionBitWidth != 0) {
     assert(UserIgnoreList && "Expected reduction tree.");
-    const TreeEntry &E = *VectorizableTree.front();
+    const TreeEntry &E = *VectorizableTree.back().front();
     auto It = MinBWs.find(&E);
     if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
       unsigned SrcSize = It->second.first;
@@ -16898,7 +16932,7 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
   auto GetUserEntry = [&](const TreeEntry *TE) {
     while (TE->UserTreeIndex && TE->UserTreeIndex.EdgeIdx == UINT_MAX)
       TE = TE->UserTreeIndex.UserTE;
-    if (TE == VectorizableTree.front().get())
+    if (TE == VectorizableTree.back().front().get())
       return EdgeInfo(const_cast<TreeEntry *>(TE), 0);
     return TE->UserTreeIndex;
   };
@@ -17556,9 +17590,9 @@ BoUpSLP::isGatherShuffledEntry(
          "Expected positive number of registers.");
   Entries.clear();
   // No need to check for the topmost gather node.
-  if (TE == VectorizableTree.front().get() &&
+  if (TE == VectorizableTree.back().front().get() &&
       (!GatheredLoadsEntriesFirst.has_value() ||
-       none_of(ArrayRef(VectorizableTree).drop_front(),
+       none_of(ArrayRef(VectorizableTree.back()).drop_front(),
                [](const std::unique_ptr<TreeEntry> &TE) {
                  return !TE->isGather();
                })))
@@ -17568,7 +17602,7 @@ BoUpSLP::isGatherShuffledEntry(
   if (TE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
     return {};
   Mask.assign(VL.size(), PoisonMaskElem);
-  assert((TE->UserTreeIndex || TE == VectorizableTree.front().get()) &&
+  assert((TE->UserTreeIndex || TE == VectorizableTree.back().front().get()) &&
          "Expected only single user of the gather node.");
   assert(VL.size() % NumParts == 0 &&
          "Number of scalars must be divisible by NumParts.");
@@ -18073,9 +18107,10 @@ Value *BoUpSLP::gather(
       Vec = CreateShuffle(Root, Vec, Mask);
       if (auto *OI = dyn_cast<Instruction>(OriginalRoot);
           OI && OI->use_empty() &&
-          none_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
-            return TE->VectorizedValue == OI;
-          }))
+          none_of(VectorizableTree.back(),
+                  [&](const std::unique_ptr<TreeEntry> &TE) {
+                    return TE->VectorizedValue == OI;
+                  }))
         eraseInstruction(OI);
     }
   }
@@ -18300,7 +18335,7 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
                    (isa<GetElementPtrInst>(U) &&
                     !R.areAllUsersVectorized(cast<Instruction>(U))) ||
                    (!UTEs.empty() &&
-                    count_if(R.VectorizableTree,
+                    count_if(R.VectorizableTree.back(),
                              [&](const std::unique_ptr<TreeEntry> &TE) {
                                return TE->UserTreeIndex.UserTE ==
                                           UTEs.front() &&
@@ -18669,14 +18704,14 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
   // Clear values, to be replaced by insertvector instructions.
   for (auto [EIdx, Idx] : E->CombinedEntriesWithIndices)
     for_each(MutableArrayRef(GatheredScalars)
-                 .slice(Idx, VectorizableTree[EIdx]->getVectorFactor()),
+                 .slice(Idx, VectorizableTree.back()[EIdx]->getVectorFactor()),
              [&](Value *&V) { V = PoisonValue::get(V->getType()); });
   SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
       E->CombinedEntriesWithIndices.size());
-  transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
-            [&](const auto &P) {
-              return std::make_pair(VectorizableTree[P.first].get(), P.second);
-            });
+  transform(
+      E->CombinedEntriesWithIndices, SubVectors.begin(), [&](const auto &P) {
+        return std::make_pair(VectorizableTree.back()[P.first].get(), P.second);
+      });
   // Build a mask out of the reorder indices and reorder scalars per this
   // mask.
   SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
@@ -18707,12 +18742,13 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
     if (UserTE->getNumOperands() != 2)
       return false;
     if (!IsNotPoisonous) {
-      auto *It = find_if(ArrayRef(VectorizableTree).drop_front(UserTE->Idx + 1),
-                         [=](const std::unique_ptr<TreeEntry> &TE) {
-                           return TE->UserTreeIndex.UserTE == UserTE &&
-                                  TE->UserTreeIndex.EdgeIdx != EdgeIdx;
-                         });
-      if (It == VectorizableTree.end())
+      auto *It =
+          find_if(ArrayRef(VectorizableTree.back()).drop_front(UserTE->Idx + 1),
+                  [=](const std::unique_ptr<TreeEntry> &TE) {
+                    return TE->UserTreeIndex.UserTE == UserTE &&
+                           TE->UserTreeIndex.EdgeIdx != EdgeIdx;
+                  });
+      if (It == VectorizableTree.back().end())
         return false;
       SmallVector<Value *> GS((*It)->Scalars.begin(), (*It)->Scalars.end());
       if (!(*It)->ReorderIndices.empty()) {
@@ -19208,7 +19244,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
 
 Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy) {
   for (auto [EIdx, _] : E->CombinedEntriesWithIndices)
-    (void)vectorizeTree(VectorizableTree[EIdx].get());
+    (void)vectorizeTree(VectorizableTree.back()[EIdx].get());
   return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
                                                                 Builder, *this);
 }
@@ -19259,13 +19295,13 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
            "Expected exactly 2 combined entries.");
     setInsertPointAfterBundle(E);
     TreeEntry &OpTE1 =
-        *VectorizableTree[E->CombinedEntriesWithIndices.front().first];
+        *VectorizableTree.back()[E->CombinedEntriesWithIndices.front().first];
     assert(OpTE1.isSame(
                ArrayRef(E->Scalars).take_front(OpTE1.getVectorFactor())) &&
            "Expected same first part of scalars.");
     Value *Op1 = vectorizeTree(&OpTE1);
     TreeEntry &OpTE2 =
-        *VectorizableTree[E->CombinedEntriesWithIndices.back().first];
+        *VectorizableTree.back()[E->CombinedEntriesWithIndices.back().first];
     assert(
         OpTE2.isSame(ArrayRef(E->Scalars).take_back(OpTE2.getVectorFactor())) &&
         "Expected same second part of scalars.");
@@ -19358,10 +19394,11 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
     }
     SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
         E->CombinedEntriesWithIndices.size());
-    transform(
-        E->CombinedEntriesWithIndices, SubVectors.begin(), [&](const auto &P) {
-          return std::make_pair(VectorizableTree[P.first].get(), P.second);
-        });
+    transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
+              [&](const auto &P) {
+                return std::make_pair(VectorizableTree.back()[P.first].get(),
+                                      P.second);
+              });
     assert(
         (E->CombinedEntriesWithIndices.empty() || E->ReorderIndices.empty()) &&
         "Expected either combined subnodes or reordering");
@@ -19389,7 +19426,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
   switch (ShuffleOrOp) {
     case Instruction::PHI: {
       assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
-              E != VectorizableTree.front().get() || E->UserTreeIndex) &&
+              E != VectorizableTree.back().front().get() || E->UserTreeIndex) &&
              "PHI reordering is free.");
       auto *PH = cast<PHINode>(VL0);
       Builder.SetInsertPoint(PH->getParent(),
@@ -20321,7 +20358,7 @@ Value *BoUpSLP::vectorizeTree(
     scheduleBlock(*this, BSIter.second.get());
   // Cache last instructions for the nodes to avoid side effects, which may
   // appear during vectorization, like extra uses, etc.
-  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
+  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree.back()) {
     if (TE->isGather())
       continue;
     (void)getLastInstructionInBundle(TE.get());
@@ -20335,7 +20372,7 @@ Value *BoUpSLP::vectorizeTree(
 
   // Vectorize gather operands of the nodes with the external uses only.
   SmallVector<std::pair<TreeEntry *, Instruction *>> GatherEntries;
-  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
+  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree.back()) {
     if (TE->isGather() && !TE->VectorizedValue && TE->UserTreeIndex.UserTE &&
         TE->UserTreeIndex.UserTE->hasState() &&
         TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
@@ -20357,7 +20394,7 @@ Value *BoUpSLP::vectorizeTree(
   }
   // Emit gathered loads first to emit better code for the users of those
   // gathered loads.
-  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
+  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree.back()) {
     if (GatheredLoadsEntriesFirst.has_value() &&
         TE->Idx >= *GatheredLoadsEntriesFirst && !TE->VectorizedValue &&
         (!TE->isGather() || TE->UserTreeIndex)) {
@@ -20367,7 +20404,7 @@ Value *BoUpSLP::vectorizeTree(
       (void)vectorizeTree(TE.get());
     }
   }
-  (void)vectorizeTree(VectorizableTree[0].get());
+  (void)vectorizeTree(VectorizableTree.back()[0].get());
   // Run through the list of postponed gathers and emit them, replacing the temp
   // emitted allocas with actual vector instructions.
   ArrayRef<const TreeEntry *> PostponedNodes = PostponedGathers.getArrayRef();
@@ -20867,7 +20904,7 @@ Value *BoUpSLP::vectorizeTree(
 
   SmallVector<Instruction *> RemovedInsts;
   // For each vectorized value:
-  for (auto &TEPtr : VectorizableTree) {
+  for (auto &TEPtr : VectorizableTree.back()) {
     TreeEntry *Entry = TEPtr.get();
 
     // No need to handle users of gathered values.
@@ -20911,7 +20948,8 @@ Value *BoUpSLP::vectorizeTree(
 
   // Merge the DIAssignIDs from the about-to-be-deleted instructions into the
   // new vector instruction.
-  if (auto *V = dyn_cast<Instruction>(VectorizableTree[0]->VectorizedValue))
+  if (auto *V =
+          dyn_cast<Instruction>(VectorizableTree.back()[0]->VectorizedValue))
     V->mergeDIAssignID(RemovedInsts);
 
   // Clear up reduction references, if any.
@@ -20919,20 +20957,22 @@ Value *BoUpSLP::vectorizeTree(
     for (Instruction *I : RemovedInsts) {
       const TreeEntry *IE = getTreeEntries(I).front();
       if (IE->Idx != 0 &&
-          !(VectorizableTree.front()->isGather() && IE->UserTreeIndex &&
+          !(VectorizableTree.back().front()->isGather() && IE->UserTreeIndex &&
             (ValueToGatherNodes.lookup(I).contains(
-                 VectorizableTree.front().get()) ||
-             (IE->UserTreeIndex.UserTE == VectorizableTree.front().get() &&
+                 VectorizableTree.back().front().get()) ||
+             (IE->UserTreeIndex.UserTE ==
+                  VectorizableTree.back().front().get() &&
               IE->UserTreeIndex.EdgeIdx == UINT_MAX))) &&
-          !(VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
+          !(VectorizableTree.back().front()->State ==
+                TreeEntry::SplitVectorize &&
             IE->UserTreeIndex &&
-            is_contained(VectorizableTree.front()->Scalars, I)) &&
+            is_contained(VectorizableTree.back().front()->Scalars, I)) &&
           !(GatheredLoadsEntriesFirst.has_value() &&
             IE->Idx >= *GatheredLoadsEntriesFirst &&
-            VectorizableTree.front()->isGather() &&
-            is_contained(VectorizableTree.front()->Scalars, I)) &&
-          !(!VectorizableTree.front()->isGather() &&
-            VectorizableTree.front()->isCopyableElement(I)))
+            VectorizableTree.back().front()->isGather() &&
+            is_contained(VectorizableTree.back().front()->Scalars, I)) &&
+          !(!VectorizableTree.back().front()->isGather() &&
+            VectorizableTree.back().front()->isCopyableElement(I)))
         continue;
       SmallVector<SelectInst *> LogicalOpSelects;
       I->replaceUsesWithIf(PoisonValue::get(I->getType()), [&](Use &U) {
@@ -20962,7 +21002,7 @@ Value *BoUpSLP::vectorizeTree(
   Builder.ClearInsertionPoint();
   InstrElementSize.clear();
 
-  const TreeEntry &RootTE = *VectorizableTree.front();
+  const TreeEntry &RootTE = *VectorizableTree.back().front();
   Value *Vec = RootTE.VectorizedValue;
   if (auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 &&
                                       It != MinBWs.end() &&
@@ -22400,8 +22440,10 @@ bool BoUpSLP::collectValuesToDemote(
   if (E.State == TreeEntry::SplitVectorize)
     return TryProcessInstruction(
         BitWidth,
-        {VectorizableTree[E.CombinedEntriesWithIndices.front().first].get(),
-         VectorizableTree[E.CombinedEntriesWithIndices.back().first].get()});
+        {VectorizableTree.back()[E.CombinedEntriesWithIndices.front().first]
+             .get(),
+         VectorizableTree.back()[E.CombinedEntriesWithIndices.back().first]
+             .get()});
 
   if (E.isAltShuffle()) {
     // Combining these opcodes may lead to incorrect analysis, skip for now.
@@ -22644,9 +22686,10 @@ static RecurKind getRdxKind(Value *V);
 void BoUpSLP::computeMinimumValueSizes() {
   // We only attempt to truncate integer expressions.
   bool IsStoreOrInsertElt =
-      VectorizableTree.front()->hasState() &&
-      (VectorizableTree.front()->getOpcode() == Instruction::Store ||
-       VectorizableTree.front()->getOpcode() == Instruction::InsertElement);
+      VectorizableTree.back().front()->hasState() &&
+      (VectorizableTree.back().front()->getOpcode() == Instruction::Store ||
+       VectorizableTree.back().front()->getOpcode() ==
+           Instruction::InsertElement);
   if ((IsStoreOrInsertElt || UserIgnoreList) &&
       ExtraBitWidthNodes.size() <= 1 &&
       (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
@@ -22654,12 +22697,12 @@ void BoUpSLP::computeMinimumValueSizes() {
     return;
 
   unsigned NodeIdx = 0;
-  if (IsStoreOrInsertElt && !VectorizableTree.front()->isGather())
+  if (IsStoreOrInsertElt && !VectorizableTree.back().front()->isGather())
     NodeIdx = 1;
 
   // Ensure the roots of the vectorizable tree don't form a cycle.
-  assert((VectorizableTree[NodeIdx]->isGather() || NodeIdx != 0 ||
-          !VectorizableTree[NodeIdx]->UserTreeIndex) &&
+  assert((VectorizableTree.back()[NodeIdx]->isGather() || NodeIdx != 0 ||
+          !VectorizableTree.back()[NodeIdx]->UserTreeIndex) &&
          "Unexpected tree is graph.");
 
   // The first value node for store/insertelement is sext/zext/trunc? Skip it,
@@ -22669,8 +22712,8 @@ void BoUpSLP::computeMinimumValueSizes() {
   SmallVector<unsigned> RootDemotes;
   SmallDenseSet<unsigned, 8> NodesToKeepBWs;
   if (NodeIdx != 0 &&
-      VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
-      VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
+      VectorizableTree.back()[NodeIdx]->State == TreeEntry::Vectorize &&
+      VectorizableTree.back()[NodeIdx]->getOpcode() == Instruction::Trunc) {
     assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph.");
     IsTruncRoot = true;
     RootDemotes.push_back(NodeIdx);
@@ -22679,7 +22722,8 @@ void BoUpSLP::computeMinimumValueSizes() {
   }
 
   // Analyzed the reduction already and not profitable - exit.
-  if (AnalyzedMinBWVals.contains(VectorizableTree[NodeIdx]->Scalars.front()))
+  if (AnalyzedMinBWVals.contains(
+          VectorizableTree.back()[NodeIdx]->Scalars.front()))
     return;
 
   SmallVector<unsigned> ToDemote;
@@ -22859,7 +22903,8 @@ void BoUpSLP::computeMinimumValueSizes() {
   // modify.
   // Add reduction ops sizes, if any.
   if (UserIgnoreList &&
-      isa<IntegerType>(VectorizableTree.front()->Scalars.front()->getType())) {
+      isa<IntegerType>(
+          VectorizableTree.back().front()->Scalars.front()->getType())) {
     // Convert vector_reduce_add(ZExt(<n x i1>)) to ZExtOrTrunc(ctpop(bitcast <n
     // x i1> to in)).
     if (all_of(*UserIgnoreList,
@@ -22867,10 +22912,10 @@ void BoUpSLP::computeMinimumValueSizes() {
                  return isa<PoisonValue>(V) ||
                         cast<Instruction>(V)->getOpcode() == Instruction::Add;
                }) &&
-        VectorizableTree.front()->State == TreeEntry::Vectorize &&
-        VectorizableTree.front()->getOpcode() == Instruction::ZExt &&
-        cast<CastInst>(VectorizableTree.front()->getMainOp())->getSrcTy() ==
-            Builder.getInt1Ty()) {
+        VectorizableTree.back().front()->State == TreeEntry::Vectorize &&
+        VectorizableTree.back().front()->getOpcode() == Instruction::ZExt &&
+        cast<CastInst>(VectorizableTree.back().front()->getMainOp())
+                ->getSrcTy() == Builder.getInt1Ty()) {
       ReductionBitWidth = 1;
     } else {
       for (Value *V : *UserIgnoreList) {
@@ -22896,9 +22941,9 @@ void BoUpSLP::computeMinimumValueSizes() {
     }
   }
   bool IsTopRoot = NodeIdx == 0;
-  while (NodeIdx < VectorizableTree.size() &&
-         VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
-         VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
+  while (NodeIdx < VectorizableTree.back().size() &&
+         VectorizableTree.back()[NodeIdx]->State == TreeEntry::Vectorize &&
+         VectorizableTree.back()[NodeIdx]->getOpcode() == Instruction::Trunc) {
     RootDemotes.push_back(NodeIdx);
     ++NodeIdx;
     IsTruncRoot = true;
@@ -22909,17 +22954,17 @@ void BoUpSLP::computeMinimumValueSizes() {
              match_fn(m_CombineOr(m_SMin(m_Value(), m_Value()),
                                   m_SMax(m_Value(), m_Value())))))
     IsSignedCmp = true;
-  while (NodeIdx < VectorizableTree.size()) {
-    ArrayRef<Value *> TreeRoot = VectorizableTree[NodeIdx]->Scalars;
+  while (NodeIdx < VectorizableTree.back().size()) {
+    ArrayRef<Value *> TreeRoot = VectorizableTree.back()[NodeIdx]->Scalars;
     unsigned Limit = 2;
     if (IsTopRoot &&
         ReductionBitWidth ==
             DL->getTypeSizeInBits(
-                VectorizableTree.front()->Scalars.front()->getType()))
+                VectorizableTree.back().front()->Scalars.front()->getType()))
       Limit = 3;
     unsigned MaxBitWidth = ComputeMaxBitWidth(
-        *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Limit,
-        IsTruncRoot, IsSignedCmp);
+        *VectorizableTree.back()[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot,
+        Limit, IsTruncRoot, IsSignedCmp);
     if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
       if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
         ReductionBitWidth = bit_ceil(MaxBitWidth);
@@ -22928,7 +22973,7 @@ void BoUpSLP::computeMinimumValueSizes() {
     }
 
     for (unsigned Idx : RootDemotes) {
-      if (all_of(VectorizableTree[Idx]->Scalars, [&](Value *V) {
+      if (all_of(VectorizableTree.back()[Idx]->Scalars, [&](Value *V) {
             uint32_t OrigBitWidth =
                 DL->getTypeSizeInBits(V->getType()->getScalarType());
             if (OrigBitWidth > MaxBitWidth) {
@@ -22944,7 +22989,7 @@ void BoUpSLP::computeMinimumValueSizes() {
     IsProfitableToDemoteRoot = true;
 
     if (ExtraBitWidthNodes.empty()) {
-      NodeIdx = VectorizableTree.size();
+      NodeIdx = VectorizableTree.back().size();
     } else {
       unsigned NewIdx = 0;
       do {
@@ -22953,21 +22998,22 @@ void BoUpSLP::computeMinimumValueSizes() {
       } while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
       NodeIdx = NewIdx;
       IsTruncRoot =
-          NodeIdx < VectorizableTree.size() &&
-          VectorizableTree[NodeIdx]->UserTreeIndex &&
-          VectorizableTree[NodeIdx]->UserTreeIndex.EdgeIdx == 0 &&
-          VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
-          VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
+          NodeIdx < VectorizableTree.back().size() &&
+          VectorizableTree.back()[NodeIdx]->UserTreeIndex &&
+          VectorizableTree.back()[NodeIdx]->UserTreeIndex.EdgeIdx == 0 &&
+          VectorizableTree.back()[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
+          VectorizableTree.back()[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
               Instruction::Trunc &&
-          !VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->isAltShuffle();
+          !VectorizableTree.back()[NodeIdx]
+               ->UserTreeIndex.UserTE->isAltShuffle();
       IsSignedCmp =
-          NodeIdx < VectorizableTree.size() &&
-          VectorizableTree[NodeIdx]->UserTreeIndex &&
-          VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
-          VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
+          NodeIdx < VectorizableTree.back().size() &&
+          VectorizableTree.back()[NodeIdx]->UserTreeIndex &&
+          VectorizableTree.back()[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
+          VectorizableTree.back()[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
               Instruction::ICmp &&
           any_of(
-              VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->Scalars,
+              VectorizableTree.back()[NodeIdx]->UserTreeIndex.UserTE->Scalars,
               [&](Value *V) {
                 auto *IC = dyn_cast<ICmpInst>(V);
                 return IC && (IC->isSigned() ||
@@ -22993,7 +23039,7 @@ void BoUpSLP::computeMinimumValueSizes() {
     // Finally, map the values we can demote to the maximum bit with we
     // computed.
     for (unsigned Idx : ToDemote) {
-      TreeEntry *TE = VectorizableTree[Idx].get();
+      TreeEntry *TE = VectorizableTree.back()[Idx].get();
       if (MinBWs.contains(TE))
         continue;
       bool IsSigned = any_of(TE->Scalars, [&](Value *R) {

>From aeff03e228375cd95b0b6490bb5cfa076ce091e9 Mon Sep 17 00:00:00 2001
From: bababuck <buchner.ryan at gmail.com>
Date: Mon, 15 Dec 2025 13:09:43 -0800
Subject: [PATCH 05/19] [SLP][NFC] Iterate over all VectorizableTrees in
 BoUpSLP::vectorizeTree()

No functional change for now, since vectorizeTree() is called after every
tree is created, but this will have an effect once we start storing
multiple trees.
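
For illustration, the change reshapes each walk over the single current
tree into a walk over every stored tree; a minimal sketch, mirroring the
hunks below:

  // Before: only the most recently built tree is visited.
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree.back()) {
    if (TE->isGather())
      continue;
    (void)getLastInstructionInBundle(TE.get());
  }

  // After: every tree in the forest is visited.
  for (auto &VT : VectorizableTree)
    for (const std::unique_ptr<TreeEntry> &TE : VT) {
      if (TE->isGather())
        continue;
      (void)getLastInstructionInBundle(TE.get());
    }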
---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 231 ++++++++++--------
 1 file changed, 124 insertions(+), 107 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index ee4aa787f4e67..e1275ce4d434b 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -20358,11 +20358,12 @@ Value *BoUpSLP::vectorizeTree(
     scheduleBlock(*this, BSIter.second.get());
   // Cache last instructions for the nodes to avoid side effects, which may
   // appear during vectorization, like extra uses, etc.
-  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree.back()) {
-    if (TE->isGather())
-      continue;
-    (void)getLastInstructionInBundle(TE.get());
-  }
+  for (auto &VT : VectorizableTree)
+    for (const std::unique_ptr<TreeEntry> &TE : VT) {
+      if (TE->isGather())
+        continue;
+      (void)getLastInstructionInBundle(TE.get());
+    }
 
   if (ReductionRoot)
     Builder.SetInsertPoint(ReductionRoot->getParent(),
@@ -20372,20 +20373,21 @@ Value *BoUpSLP::vectorizeTree(
 
   // Vectorize gather operands of the nodes with the external uses only.
   SmallVector<std::pair<TreeEntry *, Instruction *>> GatherEntries;
-  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree.back()) {
-    if (TE->isGather() && !TE->VectorizedValue && TE->UserTreeIndex.UserTE &&
-        TE->UserTreeIndex.UserTE->hasState() &&
-        TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
-        (TE->UserTreeIndex.UserTE->getOpcode() != Instruction::PHI ||
-         TE->UserTreeIndex.UserTE->isAltShuffle()) &&
-        !TE->UserTreeIndex.UserTE->hasCopyableElements() &&
-        all_of(TE->UserTreeIndex.UserTE->Scalars,
-               [](Value *V) { return isUsedOutsideBlock(V); })) {
-      Instruction &LastInst =
-          getLastInstructionInBundle(TE->UserTreeIndex.UserTE);
-      GatherEntries.emplace_back(TE.get(), &LastInst);
+  for (auto &VT : VectorizableTree)
+    for (const std::unique_ptr<TreeEntry> &TE : VT) {
+      if (TE->isGather() && !TE->VectorizedValue && TE->UserTreeIndex.UserTE &&
+          TE->UserTreeIndex.UserTE->hasState() &&
+          TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
+          (TE->UserTreeIndex.UserTE->getOpcode() != Instruction::PHI ||
+           TE->UserTreeIndex.UserTE->isAltShuffle()) &&
+          !TE->UserTreeIndex.UserTE->hasCopyableElements() &&
+          all_of(TE->UserTreeIndex.UserTE->Scalars,
+                 [](Value *V) { return isUsedOutsideBlock(V); })) {
+        Instruction &LastInst =
+            getLastInstructionInBundle(TE->UserTreeIndex.UserTE);
+        GatherEntries.emplace_back(TE.get(), &LastInst);
+      }
     }
-  }
   for (auto &Entry : GatherEntries) {
     IRBuilderBase::InsertPointGuard Guard(Builder);
     Builder.SetInsertPoint(Entry.second);
@@ -20404,7 +20406,8 @@ Value *BoUpSLP::vectorizeTree(
       (void)vectorizeTree(TE.get());
     }
   }
-  (void)vectorizeTree(VectorizableTree.back()[0].get());
+  for (auto &VT : VectorizableTree)
+    (void)vectorizeTree(VT[0].get());
   // Run through the list of postponed gathers and emit them, replacing the temp
   // emitted allocas with actual vector instructions.
   ArrayRef<const TreeEntry *> PostponedNodes = PostponedGathers.getArrayRef();
@@ -20902,119 +20905,133 @@ Value *BoUpSLP::vectorizeTree(
     CSEBlocks.insert(LastInsert->getParent());
   }
 
-  SmallVector<Instruction *> RemovedInsts;
+  SmallVector<SmallVector<Instruction *>> RemovedInsts;
   // For each vectorized value:
-  for (auto &TEPtr : VectorizableTree.back()) {
-    TreeEntry *Entry = TEPtr.get();
+  for (auto &VT : VectorizableTree) {
+    RemovedInsts.emplace_back();
+    for (auto &TEPtr : VT) {
+      TreeEntry *Entry = TEPtr.get();
 
-    // No need to handle users of gathered values.
-    if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize)
-      continue;
+      // No need to handle users of gathered values.
+      if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize)
+        continue;
 
-    assert(Entry->VectorizedValue && "Can't find vectorizable value");
+      assert(Entry->VectorizedValue && "Can't find vectorizable value");
 
-    // For each lane:
-    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
-      Value *Scalar = Entry->Scalars[Lane];
+      // For each lane:
+      for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
+        Value *Scalar = Entry->Scalars[Lane];
 
-      if (Entry->getOpcode() == Instruction::GetElementPtr &&
-          !isa<GetElementPtrInst>(Scalar))
-        continue;
-      if (auto *EE = dyn_cast<ExtractElementInst>(Scalar);
-          EE && IgnoredExtracts.contains(EE))
-        continue;
-      if (!isa<Instruction>(Scalar) || Entry->isCopyableElement(Scalar))
-        continue;
+        if (Entry->getOpcode() == Instruction::GetElementPtr &&
+            !isa<GetElementPtrInst>(Scalar))
+          continue;
+        if (auto *EE = dyn_cast<ExtractElementInst>(Scalar);
+            EE && IgnoredExtracts.contains(EE))
+          continue;
+        if (!isa<Instruction>(Scalar) || Entry->isCopyableElement(Scalar))
+          continue;
 #ifndef NDEBUG
-      Type *Ty = Scalar->getType();
-      if (!Ty->isVoidTy()) {
-        for (User *U : Scalar->users()) {
-          LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
-
-          // It is legal to delete users in the ignorelist.
-          assert((isVectorized(U) ||
-                  (UserIgnoreList && UserIgnoreList->contains(U)) ||
-                  (isa_and_nonnull<Instruction>(U) &&
-                   isDeleted(cast<Instruction>(U)))) &&
-                 "Deleting out-of-tree value");
+        Type *Ty = Scalar->getType();
+        if (!Ty->isVoidTy()) {
+          for (User *U : Scalar->users()) {
+            LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
+
+            // It is legal to delete users in the ignorelist.
+            assert((isVectorized(U) ||
+                    (UserIgnoreList && UserIgnoreList->contains(U)) ||
+                    (isa_and_nonnull<Instruction>(U) &&
+                     isDeleted(cast<Instruction>(U)))) &&
+                   "Deleting out-of-tree value");
+          }
         }
-      }
 #endif
-      LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
-      auto *I = cast<Instruction>(Scalar);
-      RemovedInsts.push_back(I);
+        LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
+        auto *I = cast<Instruction>(Scalar);
+        RemovedInsts.back().push_back(I);
+      }
     }
   }
 
   // Merge the DIAssignIDs from the about-to-be-deleted instructions into the
   // new vector instruction.
-  if (auto *V =
-          dyn_cast<Instruction>(VectorizableTree.back()[0]->VectorizedValue))
-    V->mergeDIAssignID(RemovedInsts);
+  for (unsigned Idx = 0; Idx < VectorizableTree.size(); ++Idx)
+    if (auto *V =
+            dyn_cast<Instruction>(VectorizableTree[Idx][0]->VectorizedValue))
+      V->mergeDIAssignID(RemovedInsts[Idx]);
 
   // Clear up reduction references, if any.
   if (UserIgnoreList) {
-    for (Instruction *I : RemovedInsts) {
-      const TreeEntry *IE = getTreeEntries(I).front();
-      if (IE->Idx != 0 &&
-          !(VectorizableTree.back().front()->isGather() && IE->UserTreeIndex &&
-            (ValueToGatherNodes.lookup(I).contains(
-                 VectorizableTree.back().front().get()) ||
-             (IE->UserTreeIndex.UserTE ==
-                  VectorizableTree.back().front().get() &&
-              IE->UserTreeIndex.EdgeIdx == UINT_MAX))) &&
-          !(VectorizableTree.back().front()->State ==
-                TreeEntry::SplitVectorize &&
-            IE->UserTreeIndex &&
-            is_contained(VectorizableTree.back().front()->Scalars, I)) &&
-          !(GatheredLoadsEntriesFirst.has_value() &&
-            IE->Idx >= *GatheredLoadsEntriesFirst &&
-            VectorizableTree.back().front()->isGather() &&
-            is_contained(VectorizableTree.back().front()->Scalars, I)) &&
-          !(!VectorizableTree.back().front()->isGather() &&
-            VectorizableTree.back().front()->isCopyableElement(I)))
-        continue;
-      SmallVector<SelectInst *> LogicalOpSelects;
-      I->replaceUsesWithIf(PoisonValue::get(I->getType()), [&](Use &U) {
-        // Do not replace condition of the logical op in form select <cond>.
-        bool IsPoisoningLogicalOp = isa<SelectInst>(U.getUser()) &&
-                                    (match(U.getUser(), m_LogicalAnd()) ||
-                                     match(U.getUser(), m_LogicalOr())) &&
-                                    U.getOperandNo() == 0;
-        if (IsPoisoningLogicalOp) {
-          LogicalOpSelects.push_back(cast<SelectInst>(U.getUser()));
-          return false;
-        }
-        return UserIgnoreList->contains(U.getUser());
-      });
-      // Replace conditions of the poisoning logical ops with the non-poison
-      // constant value.
-      for (SelectInst *SI : LogicalOpSelects)
-        SI->setCondition(Constant::getNullValue(SI->getCondition()->getType()));
-    }
+    for (unsigned Idx = 0; Idx < VectorizableTree.size(); ++Idx)
+      for (Instruction *I : RemovedInsts[Idx]) {
+        const TreeEntry *IE = getTreeEntries(I).front();
+        if (IE->Idx != 0 &&
+            !(VectorizableTree[Idx].front()->isGather() && IE->UserTreeIndex &&
+              (ValueToGatherNodes.lookup(I).contains(
+                   VectorizableTree[Idx].front().get()) ||
+               (IE->UserTreeIndex.UserTE ==
+                    VectorizableTree[Idx].front().get() &&
+                IE->UserTreeIndex.EdgeIdx == UINT_MAX))) &&
+            !(VectorizableTree[Idx].front()->State ==
+                  TreeEntry::SplitVectorize &&
+              IE->UserTreeIndex &&
+              is_contained(VectorizableTree[Idx].front()->Scalars, I)) &&
+            !(GatheredLoadsEntriesFirst.has_value() &&
+              IE->Idx >= *GatheredLoadsEntriesFirst &&
+              VectorizableTree[Idx].front()->isGather() &&
+              is_contained(VectorizableTree[Idx].front()->Scalars, I)) &&
+            !(!VectorizableTree[Idx].front()->isGather() &&
+              VectorizableTree[Idx].front()->isCopyableElement(I)))
+          continue;
+        SmallVector<SelectInst *> LogicalOpSelects;
+        I->replaceUsesWithIf(PoisonValue::get(I->getType()), [&](Use &U) {
+          // Do not replace condition of the logical op in form select <cond>.
+          bool IsPoisoningLogicalOp = isa<SelectInst>(U.getUser()) &&
+                                      (match(U.getUser(), m_LogicalAnd()) ||
+                                       match(U.getUser(), m_LogicalOr())) &&
+                                      U.getOperandNo() == 0;
+          if (IsPoisoningLogicalOp) {
+            LogicalOpSelects.push_back(cast<SelectInst>(U.getUser()));
+            return false;
+          }
+          return UserIgnoreList->contains(U.getUser());
+        });
+        // Replace conditions of the poisoning logical ops with the non-poison
+        // constant value.
+        for (SelectInst *SI : LogicalOpSelects)
+          SI->setCondition(
+              Constant::getNullValue(SI->getCondition()->getType()));
+      }
   }
   // Retain to-be-deleted instructions for some debug-info bookkeeping and alias
   // cache correctness.
   // NOTE: removeInstructionAndOperands only marks the instruction for deletion
   // - instructions are not deleted until later.
-  removeInstructionsAndOperands(ArrayRef(RemovedInsts), VectorValuesAndScales);
+  SmallVector<Instruction *> AllRemovedInsts;
+  for (unsigned Idx = 0; Idx < VectorizableTree.size(); ++Idx)
+    AllRemovedInsts.insert(AllRemovedInsts.begin(), RemovedInsts[Idx].begin(),
+                           RemovedInsts[Idx].end());
+  removeInstructionsAndOperands(ArrayRef(AllRemovedInsts),
+                                VectorValuesAndScales);
 
   Builder.ClearInsertionPoint();
   InstrElementSize.clear();
 
-  const TreeEntry &RootTE = *VectorizableTree.back().front();
-  Value *Vec = RootTE.VectorizedValue;
-  if (auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 &&
-                                      It != MinBWs.end() &&
-                                      ReductionBitWidth != It->second.first) {
-    IRBuilder<>::InsertPointGuard Guard(Builder);
-    Builder.SetInsertPoint(ReductionRoot->getParent(),
-                           ReductionRoot->getIterator());
-    Vec = Builder.CreateIntCast(
-        Vec,
-        VectorType::get(Builder.getIntNTy(ReductionBitWidth),
-                        cast<VectorType>(Vec->getType())->getElementCount()),
-        It->second.second);
+  Value *Vec = nullptr;
+  for (auto &VT : VectorizableTree) {
+    const TreeEntry &RootTE = *VT.front();
+    Vec = RootTE.VectorizedValue;
+    if (auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 &&
+                                        It != MinBWs.end() &&
+                                        ReductionBitWidth != It->second.first) {
+      IRBuilder<>::InsertPointGuard Guard(Builder);
+      Builder.SetInsertPoint(ReductionRoot->getParent(),
+                             ReductionRoot->getIterator());
+      Vec = Builder.CreateIntCast(
+          Vec,
+          VectorType::get(Builder.getIntNTy(ReductionBitWidth),
+                          cast<VectorType>(Vec->getType())->getElementCount()),
+          It->second.second);
+    }
   }
   return Vec;
 }

>From 18b0c1589673b8c6c5adceb0389ac5c143551fda Mon Sep 17 00:00:00 2001
From: bababuck <buchner.ryan at gmail.com>
Date: Mon, 15 Dec 2025 15:57:46 -0800
Subject: [PATCH 06/19] [SLP][NFC] Update CombinedEntriesWithIndices to hold a
 new CombineIndex type

Without this struct, the code would become more confusing once the tree
index is included alongside this data.
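
A minimal sketch of the new type, matching the hunk below; named fields
replace the old std::pair<unsigned, unsigned>, so uses read P.Idx and
P.Cnt instead of P.first and P.second:

  struct CombineIndex {
    unsigned Idx; // Index of the combined entry in VectorizableTree.
    unsigned Cnt; // Offset of the combined entry within the scalars.
    CombineIndex(unsigned Idx, unsigned Cnt) : Idx(Idx), Cnt(Cnt) {}
  };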
---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 69 ++++++++++---------
 1 file changed, 36 insertions(+), 33 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index e1275ce4d434b..edb3188ad7917 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3906,13 +3906,12 @@ class slpvectorizer::BoUpSLP {
              "Expected only split vectorize node.");
       SmallVector<int> Mask(getVectorFactor(), PoisonMaskElem);
       unsigned CommonVF = std::max<unsigned>(
-          CombinedEntriesWithIndices.back().second,
-          Scalars.size() - CombinedEntriesWithIndices.back().second);
+          CombinedEntriesWithIndices.back().Cnt,
+          Scalars.size() - CombinedEntriesWithIndices.back().Cnt);
       for (auto [Idx, I] : enumerate(ReorderIndices))
-        Mask[I] =
-            Idx + (Idx >= CombinedEntriesWithIndices.back().second
-                       ? CommonVF - CombinedEntriesWithIndices.back().second
-                       : 0);
+        Mask[I] = Idx + (Idx >= CombinedEntriesWithIndices.back().Cnt
+                             ? CommonVF - CombinedEntriesWithIndices.back().Cnt
+                             : 0);
       return Mask;
     }
 
@@ -4037,9 +4036,15 @@ class slpvectorizer::BoUpSLP {
     /// The index of this treeEntry in VectorizableTree.
     unsigned Idx = 0;
 
+    struct CombineIndex {
+      unsigned Idx;
+      unsigned Cnt;
+      CombineIndex(unsigned Idx, unsigned Cnt) : Idx(Idx), Cnt(Cnt) {}
+    };
+
     /// For gather/buildvector/alt opcode nodes, which are combined from
     /// other nodes as a series of insertvector instructions.
-    SmallVector<std::pair<unsigned, unsigned>, 2> CombinedEntriesWithIndices;
+    SmallVector<CombineIndex, 2> CombinedEntriesWithIndices;
 
   private:
     /// The operands of each instruction in each lane Operands[op_index][lane].
@@ -4301,7 +4306,7 @@ class slpvectorizer::BoUpSLP {
       if (!CombinedEntriesWithIndices.empty()) {
         dbgs() << "Combined entries: ";
         interleaveComma(CombinedEntriesWithIndices, dbgs(), [&](const auto &P) {
-          dbgs() << "Entry index " << P.first << " with offset " << P.second;
+          dbgs() << "Entry index " << P.Idx << " with offset " << P.Cnt;
         });
         dbgs() << "\n";
       }
@@ -8171,7 +8176,7 @@ void BoUpSLP::TreeEntry::reorderSplitNode(unsigned Idx, ArrayRef<int> Mask,
     copy(MaskOrder, NewMaskOrder.begin());
   } else {
     assert(Idx == 1 && "Expected either 0 or 1 index.");
-    unsigned Offset = CombinedEntriesWithIndices.back().second;
+    unsigned Offset = CombinedEntriesWithIndices.back().Cnt;
     for (unsigned I : seq<unsigned>(Mask.size())) {
       NewMask[I + Offset] = Mask[I] + Offset;
       NewMaskOrder[I + Offset] = MaskOrder[I] + Offset;
@@ -8604,7 +8609,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
         assert(Data.first->CombinedEntriesWithIndices.size() == 2 &&
                "Expected exactly 2 entries.");
         for (const auto &P : Data.first->CombinedEntriesWithIndices) {
-          TreeEntry &OpTE = *VectorizableTree.back()[P.first];
+          TreeEntry &OpTE = *VectorizableTree.back()[P.Idx];
           OrdersType Order = OpTE.ReorderIndices;
           if (Order.empty() || !OpTE.ReuseShuffleIndices.empty()) {
             if (!OpTE.isGather() && OpTE.ReuseShuffleIndices.empty())
@@ -8623,7 +8628,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
           transform(Order, MaskOrder.begin(), [E](unsigned I) {
             return I < E ? static_cast<int>(I) : PoisonMaskElem;
           });
-          Data.first->reorderSplitNode(P.second ? 1 : 0, Mask, MaskOrder);
+          Data.first->reorderSplitNode(P.Cnt ? 1 : 0, Mask, MaskOrder);
           // Clear ordering of the operand.
           if (!OpTE.ReorderIndices.empty()) {
             OpTE.ReorderIndices.clear();
@@ -14525,16 +14530,16 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
     if (E->ReorderIndices.empty()) {
       VectorCost = ::getShuffleCost(
           *TTI, TTI::SK_InsertSubvector, FinalVecTy, {}, CostKind,
-          E->CombinedEntriesWithIndices.back().second,
-          getWidenedType(ScalarTy,
-                         VectorizableTree
-                             .back()[E->CombinedEntriesWithIndices.back().first]
-                             ->getVectorFactor()));
+          E->CombinedEntriesWithIndices.back().Cnt,
+          getWidenedType(
+              ScalarTy,
+              VectorizableTree.back()[E->CombinedEntriesWithIndices.back().Idx]
+                  ->getVectorFactor()));
     } else {
       unsigned CommonVF = std::max(
-          VectorizableTree.back()[E->CombinedEntriesWithIndices.front().first]
+          VectorizableTree.back()[E->CombinedEntriesWithIndices.front().Idx]
               ->getVectorFactor(),
-          VectorizableTree.back()[E->CombinedEntriesWithIndices.back().first]
+          VectorizableTree.back()[E->CombinedEntriesWithIndices.back().Idx]
               ->getVectorFactor());
       VectorCost = ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc,
                                     getWidenedType(ScalarTy, CommonVF),
@@ -17141,7 +17146,7 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
       if (It != VTEs.end()) {
         const TreeEntry *VTE = *It;
         if (none_of(TE->CombinedEntriesWithIndices,
-                    [&](const auto &P) { return P.first == VTE->Idx; })) {
+                    [&](const auto &P) { return P.Idx == VTE->Idx; })) {
           Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
           if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
             continue;
@@ -17166,7 +17171,7 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
         VTE = *MIt;
       }
       if (none_of(TE->CombinedEntriesWithIndices,
-                  [&](const auto &P) { return P.first == VTE->Idx; })) {
+                  [&](const auto &P) { return P.Idx == VTE->Idx; })) {
         Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
         if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst) ||
             CheckNonSchedulableOrdering(VTE, &LastBundleInst))
@@ -18710,7 +18715,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
       E->CombinedEntriesWithIndices.size());
   transform(
       E->CombinedEntriesWithIndices, SubVectors.begin(), [&](const auto &P) {
-        return std::make_pair(VectorizableTree.back()[P.first].get(), P.second);
+        return std::make_pair(VectorizableTree.back()[P.Idx].get(), P.Cnt);
       });
   // Build a mask out of the reorder indices and reorder scalars per this
   // mask.
@@ -19295,13 +19300,13 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
            "Expected exactly 2 combined entries.");
     setInsertPointAfterBundle(E);
     TreeEntry &OpTE1 =
-        *VectorizableTree.back()[E->CombinedEntriesWithIndices.front().first];
+        *VectorizableTree.back()[E->CombinedEntriesWithIndices.front().Idx];
     assert(OpTE1.isSame(
                ArrayRef(E->Scalars).take_front(OpTE1.getVectorFactor())) &&
            "Expected same first part of scalars.");
     Value *Op1 = vectorizeTree(&OpTE1);
     TreeEntry &OpTE2 =
-        *VectorizableTree.back()[E->CombinedEntriesWithIndices.back().first];
+        *VectorizableTree.back()[E->CombinedEntriesWithIndices.back().Idx];
     assert(
         OpTE2.isSame(ArrayRef(E->Scalars).take_back(OpTE2.getVectorFactor())) &&
         "Expected same second part of scalars.");
@@ -19343,8 +19348,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
       SmallVector<int> Mask(E->getVectorFactor(), PoisonMaskElem);
       std::iota(
           Mask.begin(),
-          std::next(Mask.begin(), E->CombinedEntriesWithIndices.back().second),
-          0);
+          std::next(Mask.begin(), E->CombinedEntriesWithIndices.back().Cnt), 0);
       unsigned ScalarTyNumElements = getNumElements(ScalarTy);
       if (ScalarTyNumElements != 1) {
         assert(SLPReVec && "Only supported by REVEC.");
@@ -19352,7 +19356,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
       }
       Value *Vec = Builder.CreateShuffleVector(Op1, Mask);
       Vec = createInsertVector(Builder, Vec, Op2,
-                               E->CombinedEntriesWithIndices.back().second *
+                               E->CombinedEntriesWithIndices.back().Cnt *
                                    ScalarTyNumElements);
       E->VectorizedValue = Vec;
       return Vec;
@@ -19394,11 +19398,10 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
     }
     SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
         E->CombinedEntriesWithIndices.size());
-    transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
-              [&](const auto &P) {
-                return std::make_pair(VectorizableTree.back()[P.first].get(),
-                                      P.second);
-              });
+    transform(
+        E->CombinedEntriesWithIndices, SubVectors.begin(), [&](const auto &P) {
+          return std::make_pair(VectorizableTree.back()[P.Idx].get(), P.Cnt);
+        });
     assert(
         (E->CombinedEntriesWithIndices.empty() || E->ReorderIndices.empty()) &&
         "Expected either combined subnodes or reordering");
@@ -22457,9 +22460,9 @@ bool BoUpSLP::collectValuesToDemote(
   if (E.State == TreeEntry::SplitVectorize)
     return TryProcessInstruction(
         BitWidth,
-        {VectorizableTree.back()[E.CombinedEntriesWithIndices.front().first]
+        {VectorizableTree.back()[E.CombinedEntriesWithIndices.front().Idx]
              .get(),
-         VectorizableTree.back()[E.CombinedEntriesWithIndices.back().first]
+         VectorizableTree.back()[E.CombinedEntriesWithIndices.back().Idx]
              .get()});
 
   if (E.isAltShuffle()) {

>From d50a2f63384c558d9ce186cc875c9ba188e6917d Mon Sep 17 00:00:00 2001
From: bababuck <buchner.ryan at gmail.com>
Date: Tue, 16 Dec 2025 06:44:18 -0800
Subject: [PATCH 07/19] [SLP][NFC] Add TNum field to CombineIndex to track the
 tree number

Needed once multiple trees are alive at the same time.
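
A minimal sketch of the extended type, matching the hunk below:

  struct CombineIndex {
    unsigned Idx;  // Index of the combined entry within its tree.
    unsigned Cnt;  // Offset of the combined entry within the scalars.
    unsigned TNum; // Which tree in VectorizableTree owns the entry.
    CombineIndex(unsigned Idx, unsigned Cnt, unsigned TNum)
        : Idx(Idx), Cnt(Cnt), TNum(TNum) {}
  };

Lookups then become VectorizableTree[P.TNum][P.Idx] rather than
VectorizableTree.back()[P.Idx].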
---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 62 +++++++++++--------
 1 file changed, 37 insertions(+), 25 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index edb3188ad7917..6fd6fb24ebb52 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -4039,7 +4039,9 @@ class slpvectorizer::BoUpSLP {
     struct CombineIndex {
       unsigned Idx;
       unsigned Cnt;
-      CombineIndex(unsigned Idx, unsigned Cnt) : Idx(Idx), Cnt(Cnt) {}
+      unsigned TNum;
+      CombineIndex(unsigned Idx, unsigned Cnt, unsigned TNum)
+          : Idx(Idx), Cnt(Cnt), TNum(TNum) {}
     };
 
     /// For gather/buildvector/alt opcode nodes, which are combined from
@@ -4306,7 +4308,8 @@ class slpvectorizer::BoUpSLP {
       if (!CombinedEntriesWithIndices.empty()) {
         dbgs() << "Combined entries: ";
         interleaveComma(CombinedEntriesWithIndices, dbgs(), [&](const auto &P) {
-          dbgs() << "Entry index " << P.Idx << " with offset " << P.Cnt;
+          dbgs() << "Entry index " << P.Idx << " with offset " << P.Cnt
+                 << " for tree " << P.TNum;
         });
         dbgs() << "\n";
       }
@@ -11627,11 +11630,13 @@ void BoUpSLP::buildTreeRec(ArrayRef<Value *> VLRef, unsigned Depth,
                 getSameValuesTreeEntry(S.getMainOp(), Op, /*SameVF=*/true))) {
         // Build gather node for loads, they will be gathered later.
         TE->CombinedEntriesWithIndices.emplace_back(
-            VectorizableTree.back().size(), Idx == 0 ? 0 : Op1.size());
+            VectorizableTree.back().size(), Idx == 0 ? 0 : Op1.size(),
+            VectorizableTree.size() - 1);
         (void)newTreeEntry(Op, TreeEntry::NeedToGather, Invalid, S, {TE, Idx});
       } else {
         TE->CombinedEntriesWithIndices.emplace_back(
-            VectorizableTree.back().size(), Idx == 0 ? 0 : Op1.size());
+            VectorizableTree.back().size(), Idx == 0 ? 0 : Op1.size(),
+            VectorizableTree.size() - 1);
         buildTreeRec(Op, Depth, {TE, Idx});
       }
     };
@@ -13294,7 +13299,8 @@ void BoUpSLP::transformNodes() {
         if (VF == 2 && AllStrided && Slices.size() > 2)
           continue;
         auto AddCombinedNode = [&](unsigned Idx, unsigned Cnt, unsigned Sz) {
-          E.CombinedEntriesWithIndices.emplace_back(Idx, Cnt);
+          E.CombinedEntriesWithIndices.emplace_back(
+              Idx, Cnt, VectorizableTree.size() - 1);
           if (StartIdx == Cnt)
             StartIdx = Cnt + Sz;
           if (End == Cnt + Sz)
@@ -14533,14 +14539,17 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
           E->CombinedEntriesWithIndices.back().Cnt,
           getWidenedType(
               ScalarTy,
-              VectorizableTree.back()[E->CombinedEntriesWithIndices.back().Idx]
-                  ->getVectorFactor()));
+              VectorizableTree[E->CombinedEntriesWithIndices.back().TNum]
+                              [E->CombinedEntriesWithIndices.back().Idx]
+                                  ->getVectorFactor()));
     } else {
-      unsigned CommonVF = std::max(
-          VectorizableTree.back()[E->CombinedEntriesWithIndices.front().Idx]
-              ->getVectorFactor(),
-          VectorizableTree.back()[E->CombinedEntriesWithIndices.back().Idx]
-              ->getVectorFactor());
+      unsigned CommonVF =
+          std::max(VectorizableTree[E->CombinedEntriesWithIndices.back().TNum]
+                                   [E->CombinedEntriesWithIndices.front().Idx]
+                                       ->getVectorFactor(),
+                   VectorizableTree[E->CombinedEntriesWithIndices.back().TNum]
+                                   [E->CombinedEntriesWithIndices.back().Idx]
+                                       ->getVectorFactor());
       VectorCost = ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc,
                                     getWidenedType(ScalarTy, CommonVF),
                                     E->getSplitMask(), CostKind);
@@ -18707,15 +18716,15 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
   bool NeedFreeze = false;
   SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
   // Clear values, to be replaced by insertvector instructions.
-  for (auto [EIdx, Idx] : E->CombinedEntriesWithIndices)
+  for (auto [EIdx, Idx, TNum] : E->CombinedEntriesWithIndices)
     for_each(MutableArrayRef(GatheredScalars)
-                 .slice(Idx, VectorizableTree.back()[EIdx]->getVectorFactor()),
+                 .slice(Idx, VectorizableTree[TNum][EIdx]->getVectorFactor()),
              [&](Value *&V) { V = PoisonValue::get(V->getType()); });
   SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
       E->CombinedEntriesWithIndices.size());
   transform(
       E->CombinedEntriesWithIndices, SubVectors.begin(), [&](const auto &P) {
-        return std::make_pair(VectorizableTree.back()[P.Idx].get(), P.Cnt);
+        return std::make_pair(VectorizableTree[P.TNum][P.Idx].get(), P.Cnt);
       });
   // Build a mask out of the reorder indices and reorder scalars per this
   // mask.
@@ -19248,8 +19257,8 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
 }
 
 Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy) {
-  for (auto [EIdx, _] : E->CombinedEntriesWithIndices)
-    (void)vectorizeTree(VectorizableTree.back()[EIdx].get());
+  for (auto [EIdx, _, TNum] : E->CombinedEntriesWithIndices)
+    (void)vectorizeTree(VectorizableTree[TNum][EIdx].get());
   return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
                                                                 Builder, *this);
 }
@@ -19300,13 +19309,15 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
            "Expected exactly 2 combined entries.");
     setInsertPointAfterBundle(E);
     TreeEntry &OpTE1 =
-        *VectorizableTree.back()[E->CombinedEntriesWithIndices.front().Idx];
+        *VectorizableTree[E->CombinedEntriesWithIndices.front().TNum]
+                         [E->CombinedEntriesWithIndices.front().Idx];
     assert(OpTE1.isSame(
                ArrayRef(E->Scalars).take_front(OpTE1.getVectorFactor())) &&
            "Expected same first part of scalars.");
     Value *Op1 = vectorizeTree(&OpTE1);
     TreeEntry &OpTE2 =
-        *VectorizableTree.back()[E->CombinedEntriesWithIndices.back().Idx];
+        *VectorizableTree[E->CombinedEntriesWithIndices.back().TNum]
+                         [E->CombinedEntriesWithIndices.back().Idx];
     assert(
         OpTE2.isSame(ArrayRef(E->Scalars).take_back(OpTE2.getVectorFactor())) &&
         "Expected same second part of scalars.");
@@ -19400,7 +19411,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
         E->CombinedEntriesWithIndices.size());
     transform(
         E->CombinedEntriesWithIndices, SubVectors.begin(), [&](const auto &P) {
-          return std::make_pair(VectorizableTree.back()[P.Idx].get(), P.Cnt);
+          return std::make_pair(VectorizableTree[P.TNum][P.Idx].get(), P.Cnt);
         });
     assert(
         (E->CombinedEntriesWithIndices.empty() || E->ReorderIndices.empty()) &&
@@ -22459,11 +22470,12 @@ bool BoUpSLP::collectValuesToDemote(
 
   if (E.State == TreeEntry::SplitVectorize)
     return TryProcessInstruction(
-        BitWidth,
-        {VectorizableTree.back()[E.CombinedEntriesWithIndices.front().Idx]
-             .get(),
-         VectorizableTree.back()[E.CombinedEntriesWithIndices.back().Idx]
-             .get()});
+        BitWidth, {VectorizableTree[E.CombinedEntriesWithIndices.front().TNum]
+                                   [E.CombinedEntriesWithIndices.front().Idx]
+                                       .get(),
+                   VectorizableTree[E.CombinedEntriesWithIndices.back().TNum]
+                                   [E.CombinedEntriesWithIndices.back().Idx]
+                                       .get()});
 
   if (E.isAltShuffle()) {
     // Combining these opcodes may lead to incorrect analysis, skip for now.

>From 25c53c309948c00416af65947ce0c60018afc2d3 Mon Sep 17 00:00:00 2001
From: bababuck <buchner.ryan at gmail.com>
Date: Tue, 16 Dec 2025 08:03:04 -0800
Subject: [PATCH 08/19] [SLP][NFC] Cost all trees together

Costing multiple trees at once is not supported for reduction trees.
---
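Schematically, the costing loop gains an outer pass over every tree in the
forest; the combined-node and same-values special cases in the real loop are
elided in this sketch:

    InstructionCost Cost = 0;
    for (auto &VT : VectorizableTree) {      // new: walk every tree
      SmallPtrSet<Value *, 4> CheckedExtracts;
      for (unsigned I = 0, E = VT.size(); I < E; ++I)
        Cost += getEntryCost(VT[I].get(), VectorizedVals, CheckedExtracts);
    }
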
 llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 6fd6fb24ebb52..54089d7199e48 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -16252,9 +16252,10 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
   LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
                     << VectorizableTree.back().size() << ".\n");
 
+  for (auto &VT : VectorizableTree) {
   SmallPtrSet<Value *, 4> CheckedExtracts;
-  for (unsigned I = 0, E = VectorizableTree.back().size(); I < E; ++I) {
-    TreeEntry &TE = *VectorizableTree.back()[I];
+  for (unsigned I = 0, E = VT.size(); I < E; ++I) {
+    TreeEntry &TE = *VT[I];
     // No need to count the cost for combined entries, they are combined and
     // just skip their cost.
     if (TE.State == TreeEntry::CombinedVectorize) {
@@ -16288,7 +16289,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
     LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle "
                       << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
                       << "SLP: Current total cost = " << Cost << "\n");
-  }
+  }}
 
   if (Cost >= -SLPCostThreshold &&
       none_of(ExternalUses, [](const ExternalUser &EU) {
@@ -16510,10 +16511,9 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
         // block as the root phis, currently vectorized. It allows to keep
         // better ordering info of PHIs, being vectorized currently.
         bool IsProfitablePHIUser =
-            (KeepScalar ||
-             (ScalarCost - ExtraCost <= TTI::TCC_Basic &&
-              VectorizableTree.back().front()->Scalars.size() > 2)) &&
-            VectorizableTree.back().front()->hasState() &&
+            (KeepScalar || (ScalarCost - ExtraCost <= TTI::TCC_Basic &&
+                            Entry->Container.front()->Scalars.size() > 2)) &&
+          Entry->Container.front()->hasState() &&
             VectorizableTree.back().front()->getOpcode() == Instruction::PHI &&
             !Inst->hasNUsesOrMore(UsesLimit) &&
             none_of(
@@ -16727,6 +16727,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
   // Add the cost for reduced value resize (if required).
   if (ReductionBitWidth != 0) {
     assert(UserIgnoreList && "Expected reduction tree.");
+    assert(VectorizableTree.size() == 1 && "Don't support wide reduction tree");
     const TreeEntry &E = *VectorizableTree.back().front();
     auto It = MinBWs.find(&E);
     if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {

>From 5be6ef69edea6979f2ea63d029f231d6d9ea263e Mon Sep 17 00:00:00 2001
From: bababuck <buchner.ryan at gmail.com>
Date: Fri, 26 Dec 2025 15:54:32 -0800
Subject: [PATCH 09/19] [SLP][NFC] Adjust indentation

---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 67 ++++++++++---------
 1 file changed, 34 insertions(+), 33 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 54089d7199e48..7248bcb90f036 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -16253,43 +16253,44 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
                     << VectorizableTree.back().size() << ".\n");
 
   for (auto &VT : VectorizableTree) {
-  SmallPtrSet<Value *, 4> CheckedExtracts;
-  for (unsigned I = 0, E = VT.size(); I < E; ++I) {
-    TreeEntry &TE = *VT[I];
-    // No need to count the cost for combined entries, they are combined and
-    // just skip their cost.
-    if (TE.State == TreeEntry::CombinedVectorize) {
-      LLVM_DEBUG(
-          dbgs() << "SLP: Skipping cost for combined node that starts with "
-                 << *TE.Scalars[0] << ".\n";
-          TE.dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
-      continue;
-    }
-    if (TE.hasState() &&
-        (TE.isGather() || TE.State == TreeEntry::SplitVectorize)) {
-      if (const TreeEntry *E =
-              getSameValuesTreeEntry(TE.getMainOp(), TE.Scalars);
-          E && E->getVectorFactor() == TE.getVectorFactor()) {
-        // Some gather nodes might be absolutely the same as some vectorizable
-        // nodes after reordering, need to handle it.
-        LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle "
-                          << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
-                          << "SLP: Current total cost = " << Cost << "\n");
+    SmallPtrSet<Value *, 4> CheckedExtracts;
+    for (unsigned I = 0, E = VT.size(); I < E; ++I) {
+      TreeEntry &TE = *VT[I];
+      // No need to count the cost for combined entries, they are combined and
+      // just skip their cost.
+      if (TE.State == TreeEntry::CombinedVectorize) {
+        LLVM_DEBUG(
+            dbgs() << "SLP: Skipping cost for combined node that starts with "
+                   << *TE.Scalars[0] << ".\n";
+            TE.dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
         continue;
       }
-    }
+      if (TE.hasState() &&
+          (TE.isGather() || TE.State == TreeEntry::SplitVectorize)) {
+        if (const TreeEntry *E =
+                getSameValuesTreeEntry(TE.getMainOp(), TE.Scalars);
+            E && E->getVectorFactor() == TE.getVectorFactor()) {
+          // Some gather nodes might be absolutely the same as some vectorizable
+          // nodes after reordering, need to handle it.
+          LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle "
+                            << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
+                            << "SLP: Current total cost = " << Cost << "\n");
+          continue;
+        }
+      }
 
-    // Exclude cost of gather loads nodes which are not used. These nodes were
-    // built as part of the final attempt to vectorize gathered loads.
-    assert((!TE.isGather() || TE.Idx == 0 || TE.UserTreeIndex) &&
-           "Expected gather nodes with users only.");
+      // Exclude cost of gather loads nodes which are not used. These nodes were
+      // built as part of the final attempt to vectorize gathered loads.
+      assert((!TE.isGather() || TE.Idx == 0 || TE.UserTreeIndex) &&
+             "Expected gather nodes with users only.");
 
-    InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts);
-    Cost += C;
-    LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle "
-                      << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
-                      << "SLP: Current total cost = " << Cost << "\n");
-  }}
+      InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts);
+      Cost += C;
+      LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle "
+                        << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
+                        << "SLP: Current total cost = " << Cost << "\n");
+    }
+  }
 
   if (Cost >= -SLPCostThreshold &&
       none_of(ExternalUses, [](const ExternalUser &EU) {

>From 2c73617b92637a0a7793fc0510a8357ddf5c2c75 Mon Sep 17 00:00:00 2001
From: bababuck <buchner.ryan at gmail.com>
Date: Fri, 26 Dec 2025 19:27:29 -0800
Subject: [PATCH 10/19] [SLP][NFC] Optionally clear data on calls to
 buildTree()

We don't want to delete existing tree data when building multiple trees
together.
---
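The intended call pattern looks roughly like this (the chunk variables are
illustrative; only the DeleteTree flag is introduced by this patch):

    // First chunk starts a fresh forest; later chunks append to it.
    R.buildTree(Chunk0, /*DeleteTree=*/true);
    R.buildTree(Chunk1, /*DeleteTree=*/false);
    R.buildTree(Chunk2, /*DeleteTree=*/false);
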
 .../llvm/Transforms/Vectorize/SLPVectorizer.h     |  2 +-
 llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp   | 15 ++++++++-------
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
index 877c83291170b..fed187de30384 100644
--- a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
+++ b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
@@ -155,7 +155,7 @@ struct SLPVectorizerPass : public PassInfoMixin<SLPVectorizerPass> {
   std::optional<bool> vectorizeStoreChain(ArrayRef<Value *> Chain,
                                           slpvectorizer::BoUpSLP &R,
                                           unsigned Idx, unsigned MinVF,
-                                          unsigned &Size);
+                                          unsigned &Size, bool DeleteTree);
 
   bool vectorizeStores(
       ArrayRef<StoreInst *> Stores, slpvectorizer::BoUpSLP &R,
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 7248bcb90f036..8aa82781159a5 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2015,7 +2015,7 @@ class slpvectorizer::BoUpSLP {
                  const SmallDenseSet<Value *> &UserIgnoreLst);
 
   /// Construct a vectorizable tree that starts at \p Roots.
-  void buildTree(ArrayRef<Value *> Roots);
+  void buildTree(ArrayRef<Value *> Roots, bool DeleteTree = true);
 
   /// Return the scalars of the root node.
   ArrayRef<Value *> getRootNodeScalars() const {
@@ -9211,8 +9211,9 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
   buildTreeRec(Roots, 0, EdgeInfo());
 }
 
-void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
-  deleteTree();
+void BoUpSLP::buildTree(ArrayRef<Value *> Roots, bool DeleteTree) {
+  if (DeleteTree)
+    deleteTree();
   assert(TreeEntryToStridedPtrInfoMap.empty() &&
          "TreeEntryToStridedPtrInfoMap is not cleared");
   VectorizableTree.emplace_back();
@@ -23191,7 +23192,7 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
 std::optional<bool>
 SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
                                        unsigned Idx, unsigned MinVF,
-                                       unsigned &Size) {
+                                       unsigned &Size, bool DeleteTree) {
   Size = 0;
   LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size()
                     << "\n");
@@ -23243,7 +23244,7 @@ SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
   }
   if (R.isLoadCombineCandidate(Chain))
     return true;
-  R.buildTree(Chain);
+  R.buildTree(Chain, DeleteTree);
   // Check if tree tiny and store itself or its value is not vectorized.
   if (R.isTreeTinyAndNotFullyVectorizable()) {
     if (R.isGathered(Chain.front()) ||
@@ -23553,8 +23554,8 @@ bool SLPVectorizerPass::vectorizeStores(
                 }
               }
               unsigned TreeSize;
-              std::optional<bool> Res =
-                  vectorizeStoreChain(Slice, R, SliceStartIdx, MinVF, TreeSize);
+              std::optional<bool> Res = vectorizeStoreChain(
+                  Slice, R, SliceStartIdx, MinVF, TreeSize, true);
               if (Res && *Res) {
                 if (TreeSize) {
                   InstructionCost Cost = R.getTreeCost();

>From 1d6741acafe0cc0b4e609d6b5a318eab40ad654f Mon Sep 17 00:00:00 2001
From: bababuck <buchner.ryan at gmail.com>
Date: Tue, 16 Dec 2025 12:09:00 -0800
Subject: [PATCH 11/19] [SLP][NFC] Store reference to all VectorizableTrees in
 TreeEntry

Also store the entry's corresponding tree index.
---
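With a forest of trees, a TreeEntry can no longer assume its Container is the
single live tree, so it now carries a reference to the whole forest plus its
own tree index (CntIdx). Lookups then go through both, as in the GraphTraits
child iterators below:

    // An entry resolves its own tree via the stored index:
    VecTreeTy &MyTree = Entry->Container[Entry->CntIdx];
    TreeEntry *Root = MyTree.front().get();
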
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 21 +++++++++++--------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 8aa82781159a5..ece64004a3714 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3888,7 +3888,8 @@ class slpvectorizer::BoUpSLP {
 
   class TreeEntry {
   public:
-    TreeEntry(BoUpSLP::VecTreeTy &Container) : Container(Container) {}
+    TreeEntry(SmallVector<BoUpSLP::VecTreeTy> &Container, unsigned CntIdx)
+        : Container(Container), CntIdx(CntIdx) {}
 
     /// \returns Common mask for reorder indices and reused scalars.
     SmallVector<int> getCommonMask() const {
@@ -4028,7 +4029,8 @@ class slpvectorizer::BoUpSLP {
     /// to be a pointer and needs to be able to initialize the child iterator.
     /// Thus we need a reference back to the container to translate the indices
     /// to entries.
-    VecTreeTy &Container;
+    SmallVector<VecTreeTy> &Container;
+    unsigned CntIdx;
 
     /// The TreeEntry index containing the user of this entry.
     EdgeInfo UserTreeIndex;
@@ -4374,8 +4376,8 @@ class slpvectorizer::BoUpSLP {
         S.getOpcode() == Instruction::Load && UserTreeIdx.EdgeIdx == UINT_MAX &&
         !UserTreeIdx.UserTE)
       return nullptr;
-    VectorizableTree.back().push_back(
-        std::make_unique<TreeEntry>(VectorizableTree.back()));
+    VectorizableTree.back().push_back(std::make_unique<TreeEntry>(
+        VectorizableTree, VectorizableTree.size() - 1));
     TreeEntry *Last = VectorizableTree.back().back().get();
     Last->Idx = VectorizableTree.back().size() - 1;
     Last->State = EntryState;
@@ -6149,11 +6151,11 @@ template <> struct llvm::GraphTraits<BoUpSLP *> {
   }
 
   static ChildIteratorType child_begin(NodeRef N) {
-    return {&N->UserTreeIndex, N->Container};
+    return {&N->UserTreeIndex, N->Container[N->CntIdx]};
   }
 
   static ChildIteratorType child_end(NodeRef N) {
-    return {&N->UserTreeIndex + 1, N->Container};
+    return {&N->UserTreeIndex + 1, N->Container[N->CntIdx]};
   }
 
   /// For the node iterator we just need to turn the TreeEntry iterator into a
@@ -16513,9 +16515,10 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
         // block as the root phis, currently vectorized. It allows to keep
         // better ordering info of PHIs, being vectorized currently.
         bool IsProfitablePHIUser =
-            (KeepScalar || (ScalarCost - ExtraCost <= TTI::TCC_Basic &&
-                            Entry->Container.front()->Scalars.size() > 2)) &&
-          Entry->Container.front()->hasState() &&
+            (KeepScalar ||
+             (ScalarCost - ExtraCost <= TTI::TCC_Basic &&
+              Entry->Container[Entry->CntIdx].front()->Scalars.size() > 2)) &&
+            Entry->Container[Entry->CntIdx].front()->hasState() &&
             VectorizableTree.back().front()->getOpcode() == Instruction::PHI &&
             !Inst->hasNUsesOrMore(UsesLimit) &&
             none_of(

>From b3eca0ef6df810a90f46e683c7820166d7630e62 Mon Sep 17 00:00:00 2001
From: bababuck <buchner.ryan at gmail.com>
Date: Tue, 16 Dec 2025 09:04:57 -0800
Subject: [PATCH 12/19] [SLP] Allow store chains with width > VFMax

Break the chain up into VFMax-sized chunks but cost them together.
---
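Simplified shape of the new chunking loop in vectorizeStores (the
tree-size consistency checks are elided here; see the hunk for the full
error handling):

    unsigned EltCnt = Slice.size();
    auto StartIt = Slice.begin();
    unsigned SubTreeSize = 0;
    bool DeleteTree = true;   // only the first chunk resets the forest
    while (EltCnt) {
      unsigned SubLen = std::min(MaxVF, EltCnt);
      SmallVector<Value *> SubSlice(StartIt, StartIt + SubLen);
      std::optional<bool> SubRes = vectorizeStoreChain(
          SubSlice, R, SliceStartIdx, MinVF, SubTreeSize, DeleteTree);
      DeleteTree = false;     // later chunks append trees to the same forest
      StartIt += SubLen;
      EltCnt -= SubLen;
      if (!SubRes || !*SubRes)
        break;                // bail out on failure
    }
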
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 43 +++++++++++++++++--
 1 file changed, 39 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index ece64004a3714..97ea7a50f5abd 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -23481,7 +23481,8 @@ bool SLPVectorizerPass::vectorizeStores(
       }
 
       SmallVector<unsigned> CandidateVFs;
-      for (unsigned VF = std::max(MaxVF, NonPowerOf2VF); VF >= MinVF;
+      unsigned PowerOf2Elts = bit_floor(Operands.size());
+      for (unsigned VF = std::max(PowerOf2Elts, NonPowerOf2VF); VF >= MinVF;
            VF = divideCeil(VF, 2))
         CandidateVFs.push_back(VF);
 
@@ -23556,9 +23557,43 @@ bool SLPVectorizerPass::vectorizeStores(
                   continue;
                 }
               }
-              unsigned TreeSize;
-              std::optional<bool> Res = vectorizeStoreChain(
-                  Slice, R, SliceStartIdx, MinVF, TreeSize, true);
+              unsigned TreeSize = UINT_MAX;
+              std::optional<bool> Res;
+              if (Slice.size() > std::max(MaxVF, NonPowerOf2VF)) {
+                unsigned EltCnt = Slice.size();
+                auto StartIt = Slice.begin();
+                Res = true;
+                bool DeleteTree = true;
+                while (EltCnt) {
+                  unsigned SubLen = std::min(MaxVF, EltCnt);
+                  EltCnt -= SubLen;
+                  SmallVector<Value *> SubSlice(StartIt, StartIt + SubLen);
+                  unsigned SubTreeSize;
+                  std::optional<bool> SubRes =
+                      vectorizeStoreChain(SubSlice, R, SliceStartIdx, MinVF,
+                                          SubTreeSize, DeleteTree);
+                  DeleteTree = false;
+                  if (TreeSize == UINT_MAX)
+                    TreeSize = SubTreeSize;
+                  else if (TreeSize != SubTreeSize) {
+                    Res = std::nullopt;
+                    break;
+                  }
+                  TreeSize = std::min(TreeSize, SubTreeSize);
+                  StartIt += SubLen;
+                  if (!SubRes) {
+                    Res = std::nullopt;
+                    break;
+                  }
+                  if (!*SubRes) {
+                    Res = false;
+                    break;
+                  }
+                }
+              } else {
+                Res = vectorizeStoreChain(Slice, R, SliceStartIdx, MinVF,
+                                          TreeSize, true);
+              }
               if (Res && *Res) {
                 if (TreeSize) {
                   InstructionCost Cost = R.getTreeCost();

>From eba55c16708a5cc2ec1e2827ccb155ccc47b222d Mon Sep 17 00:00:00 2001
From: bababuck <buchner.ryan at gmail.com>
Date: Fri, 26 Dec 2025 12:22:09 -0800
Subject: [PATCH 13/19] [SLP][NFC] Expand LoadEntriesToVectorize to contain two
 indices for multi-level VectorizableTree

---
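Delayed load entries are now keyed by a (tree number, entry index) pair
instead of a bare index into the last tree:

    SetVector<std::pair<unsigned, unsigned>> LoadEntriesToVectorize;

    // Record: the tree currently being built, and the entry's slot in it.
    LoadEntriesToVectorize.insert(
        {VectorizableTree.size() - 1, VectorizableTree.back().size()});

    // Resolve later:
    const TreeEntry &E = *VectorizableTree[P.first][P.second];
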
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 32 +++++++++++--------
 1 file changed, 19 insertions(+), 13 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 97ea7a50f5abd..8b051cb59b011 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -4627,7 +4627,7 @@ class slpvectorizer::BoUpSLP {
   /// A list of the load entries (node indices), which can be vectorized using
   /// strided or masked gather approach, but attempted to be represented as
   /// contiguous loads.
-  SetVector<unsigned> LoadEntriesToVectorize;
+  SetVector<std::pair<unsigned, unsigned>> LoadEntriesToVectorize;
 
   /// true if graph nodes transforming mode is on.
   bool IsGraphTransformMode = false;
@@ -7575,7 +7575,7 @@ BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
   }
 
   BoUpSLP::OrdersType Order;
-  if (!LoadEntriesToVectorize.contains(TE.Idx) &&
+  if (!LoadEntriesToVectorize.contains({TE.CntIdx, TE.Idx}) &&
       clusterSortPtrAccesses(Ptrs, BBs, ScalarTy, *DL, *SE, Order))
     return std::move(Order);
   return std::nullopt;
@@ -9387,8 +9387,8 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
 
   SmallVector<SmallPtrSet<const Value *, 4>> LoadSetsToVectorize(
       LoadEntriesToVectorize.size());
-  for (auto [Idx, Set] : zip(LoadEntriesToVectorize, LoadSetsToVectorize))
-    Set.insert_range(VectorizableTree.back()[Idx]->Scalars);
+  for (auto [P, Set] : zip(LoadEntriesToVectorize, LoadSetsToVectorize))
+    Set.insert_range(VectorizableTree[P.first][P.second]->Scalars);
 
   // Sort loads by distance.
   auto LoadSorter = [](const std::pair<LoadInst *, int64_t> &L1,
@@ -9724,7 +9724,8 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
                            if (It == Slice.end())
                              return false;
                            const TreeEntry &TE =
-                               *VectorizableTree.back()[std::get<0>(P)];
+                               *VectorizableTree[std::get<0>(P).first]
+                                                [std::get<0>(P).second];
                            ArrayRef<Value *> VL = TE.Scalars;
                            OrdersType Order;
                            SmallVector<Value *> PointerOps;
@@ -9770,8 +9771,9 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
                 if (any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
                            [&](const auto &P) {
                              return !SubSlice.equals(
-                                        VectorizableTree.back()[std::get<0>(P)]
-                                            ->Scalars) &&
+                                        VectorizableTree[std::get<0>(P).first]
+                                                        [std::get<0>(P).second]
+                                                            ->Scalars) &&
                                     set_is_subset(SubSlice, std::get<1>(P));
                            }))
                   continue;
@@ -9820,8 +9822,8 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
     }
   }
   // Try to vectorize postponed load entries, previously marked as gathered.
-  for (unsigned Idx : LoadEntriesToVectorize) {
-    const TreeEntry &E = *VectorizableTree.back()[Idx];
+  for (auto [CntIdx, Idx] : LoadEntriesToVectorize) {
+    const TreeEntry &E = *VectorizableTree[CntIdx][Idx];
     SmallVector<Value *> GatheredScalars(E.Scalars.begin(), E.Scalars.end());
     // Avoid reordering, if possible.
     if (!E.ReorderIndices.empty()) {
@@ -10217,7 +10219,8 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
     case LoadsState::CompressVectorize:
       if (!IsGraphTransformMode && !VectorizableTree.back().empty()) {
         // Delay slow vectorized nodes for better vectorization attempts.
-        LoadEntriesToVectorize.insert(VectorizableTree.back().size());
+        LoadEntriesToVectorize.insert(
+            {VectorizableTree.size() - 1, VectorizableTree.back().size()});
         return TreeEntry::NeedToGather;
       }
       return IsGatheredNode() ? TreeEntry::NeedToGather
@@ -10225,7 +10228,8 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
     case LoadsState::ScatterVectorize:
       if (!IsGraphTransformMode && !VectorizableTree.back().empty()) {
         // Delay slow vectorized nodes for better vectorization attempts.
-        LoadEntriesToVectorize.insert(VectorizableTree.back().size());
+        LoadEntriesToVectorize.insert(
+            {VectorizableTree.size() - 1, VectorizableTree.back().size()});
         return TreeEntry::NeedToGather;
       }
       return IsGatheredNode() ? TreeEntry::NeedToGather
@@ -10233,7 +10237,8 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
     case LoadsState::StridedVectorize:
       if (!IsGraphTransformMode && VectorizableTree.back().size() > 1) {
         // Delay slow vectorized nodes for better vectorization attempts.
-        LoadEntriesToVectorize.insert(VectorizableTree.back().size());
+        LoadEntriesToVectorize.insert(
+            {VectorizableTree.size() - 1, VectorizableTree.back().size()});
         return TreeEntry::NeedToGather;
       }
       return IsGatheredNode() ? TreeEntry::NeedToGather
@@ -13189,7 +13194,8 @@ void BoUpSLP::transformNodes() {
       unsigned MinVF = getMinVF(2 * Sz);
       // Do not try partial vectorization for small nodes (<= 2), nodes with the
       // same opcode and same parent block or all constants.
-      if (VL.size() <= 2 || LoadEntriesToVectorize.contains(Idx) ||
+      if (VL.size() <= 2 ||
+          LoadEntriesToVectorize.contains({VectorizableTree.size() - 1, Idx}) ||
           !(!E.hasState() || E.getOpcode() == Instruction::Load ||
             // We use allSameOpcode instead of isAltShuffle because we don't
             // want to use interchangeable instruction here.

>From 27350f66760d8b900c10cf7363f55f78d2f582dd Mon Sep 17 00:00:00 2001
From: bababuck <buchner.ryan at gmail.com>
Date: Fri, 26 Dec 2025 13:00:56 -0800
Subject: [PATCH 14/19] [SLP][NFC] Move BoUpSLP::buildExternalUses() to iterate
 over all VectorizableTree's

It runs only after all trees have been generated.
---
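Structurally this is just an outer loop over the forest; the per-entry body
is unchanged (collectUsesOfEntry below is a stand-in name for that body, not
a real function):

    for (auto &VT : VectorizableTree)   // new: walk every tree
      for (auto &TEPtr : VT)            // existing per-entry scan
        collectUsesOfEntry(TEPtr.get());

The test churn below looks like pure instruction reordering caused by the
later buildExternalUses() call site, not behavioral changes.
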
 llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp  |  7 ++++---
 .../PhaseOrdering/AArch64/interleave_vec.ll      |  8 ++++----
 .../SLPVectorizer/AArch64/loadorder.ll           | 16 ++++++++--------
 .../Transforms/SLPVectorizer/AArch64/matmul.ll   | 12 ++++++------
 .../Transforms/SLPVectorizer/AArch64/widen.ll    | 10 +++++-----
 5 files changed, 27 insertions(+), 26 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 8b051cb59b011..2d1e9140c116f 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -8960,7 +8960,8 @@ void BoUpSLP::buildExternalUses(
   const size_t NumVectScalars = ScalarToTreeEntries.size() + 1;
   DenseMap<Value *, unsigned> ScalarToExtUses;
   // Collect the values that we need to extract from the tree.
-  for (auto &TEPtr : VectorizableTree.back()) {
+  for (auto &VT : VectorizableTree) {
+  for (auto &TEPtr : VT) {
     TreeEntry *Entry = TEPtr.get();
 
     // No need to handle users of gathered values.
@@ -9053,7 +9054,7 @@ void BoUpSLP::buildExternalUses(
           break;
       }
     }
-  }
+  }}
 }
 
 SmallVector<SmallVector<StoreInst *>>
@@ -23267,7 +23268,6 @@ SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
     R.reorderBottomToTop();
   }
   R.transformNodes();
-  R.buildExternalUses();
 
   R.computeMinimumValueSizes();
 
@@ -23602,6 +23602,7 @@ bool SLPVectorizerPass::vectorizeStores(
               }
               if (Res && *Res) {
                 if (TreeSize) {
+                  R.buildExternalUses();
                   InstructionCost Cost = R.getTreeCost();
 
                   LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/interleave_vec.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/interleave_vec.ll
index 2dceb27165c4d..358e3b830ee69 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/interleave_vec.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/interleave_vec.ll
@@ -823,16 +823,16 @@ define void @same_op8(ptr noalias noundef %a, ptr noundef %b, ptr noundef %c) {
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[C]], i64 [[INDVARS_IV]]
 ; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw float, ptr [[B]], i64 [[INDVARS_IV]]
 ; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds nuw float, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP5:%.*]] = or disjoint i64 [[INDVARS_IV]], 4
+; CHECK-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds nuw float, ptr [[C]], i64 [[TMP5]]
+; CHECK-NEXT:    [[ARRAYIDX6_4:%.*]] = getelementptr inbounds nuw float, ptr [[B]], i64 [[TMP5]]
+; CHECK-NEXT:    [[ARRAYIDX9_4:%.*]] = getelementptr inbounds nuw float, ptr [[A]], i64 [[TMP5]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 4
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[ARRAYIDX6]], align 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = fmul fast <4 x float> [[TMP1]], [[TMP0]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x float>, ptr [[ARRAYIDX9]], align 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = fadd fast <4 x float> [[TMP3]], [[TMP2]]
 ; CHECK-NEXT:    store <4 x float> [[TMP4]], ptr [[ARRAYIDX9]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = or disjoint i64 [[INDVARS_IV]], 4
-; CHECK-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds nuw float, ptr [[C]], i64 [[TMP5]]
-; CHECK-NEXT:    [[ARRAYIDX6_4:%.*]] = getelementptr inbounds nuw float, ptr [[B]], i64 [[TMP5]]
-; CHECK-NEXT:    [[ARRAYIDX9_4:%.*]] = getelementptr inbounds nuw float, ptr [[A]], i64 [[TMP5]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x float>, ptr [[ARRAYIDX_4]], align 4
 ; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x float>, ptr [[ARRAYIDX6_4]], align 4
 ; CHECK-NEXT:    [[TMP8:%.*]] = fmul fast <4 x float> [[TMP7]], [[TMP6]]
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll
index bb05440910130..fb5109deb08e9 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll
@@ -1125,24 +1125,24 @@ define void @store_blockstrided4x4(ptr nocapture noundef readonly %p1, i32 nound
 ; CHECK-NEXT:    [[DST12:%.*]] = getelementptr inbounds i32, ptr [[DST0]], i64 12
 ; CHECK-NEXT:    [[TMP32:%.*]] = load <4 x i8>, ptr [[P1]], align 1
 ; CHECK-NEXT:    [[TMP33:%.*]] = zext <4 x i8> [[TMP32]] to <4 x i32>
-; CHECK-NEXT:    [[TMP34:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3]], align 1
+; CHECK-NEXT:    [[TMP34:%.*]] = load <4 x i8>, ptr [[P2]], align 1
 ; CHECK-NEXT:    [[TMP35:%.*]] = zext <4 x i8> [[TMP34]] to <4 x i32>
-; CHECK-NEXT:    [[TMP36:%.*]] = mul <4 x i32> [[TMP33]], [[TMP35]]
-; CHECK-NEXT:    [[TMP37:%.*]] = load <4 x i8>, ptr [[P2]], align 1
+; CHECK-NEXT:    [[TMP37:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3]], align 1
 ; CHECK-NEXT:    [[TMP38:%.*]] = zext <4 x i8> [[TMP37]] to <4 x i32>
 ; CHECK-NEXT:    [[TMP39:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1
 ; CHECK-NEXT:    [[TMP40:%.*]] = zext <4 x i8> [[TMP39]] to <4 x i32>
-; CHECK-NEXT:    [[TMP41:%.*]] = mul <4 x i32> [[TMP38]], [[TMP40]]
 ; CHECK-NEXT:    [[TMP42:%.*]] = load <4 x i8>, ptr [[ADD_PTR]], align 1
 ; CHECK-NEXT:    [[TMP43:%.*]] = zext <4 x i8> [[TMP42]] to <4 x i32>
-; CHECK-NEXT:    [[TMP44:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1
+; CHECK-NEXT:    [[TMP44:%.*]] = load <4 x i8>, ptr [[ADD_PTR64]], align 1
 ; CHECK-NEXT:    [[TMP45:%.*]] = zext <4 x i8> [[TMP44]] to <4 x i32>
-; CHECK-NEXT:    [[TMP46:%.*]] = mul <4 x i32> [[TMP43]], [[TMP45]]
-; CHECK-NEXT:    [[TMP47:%.*]] = load <4 x i8>, ptr [[ADD_PTR64]], align 1
+; CHECK-NEXT:    [[TMP47:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1
 ; CHECK-NEXT:    [[TMP48:%.*]] = zext <4 x i8> [[TMP47]] to <4 x i32>
 ; CHECK-NEXT:    [[TMP49:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1
 ; CHECK-NEXT:    [[TMP50:%.*]] = zext <4 x i8> [[TMP49]] to <4 x i32>
-; CHECK-NEXT:    [[TMP51:%.*]] = mul <4 x i32> [[TMP48]], [[TMP50]]
+; CHECK-NEXT:    [[TMP36:%.*]] = mul <4 x i32> [[TMP33]], [[TMP38]]
+; CHECK-NEXT:    [[TMP41:%.*]] = mul <4 x i32> [[TMP35]], [[TMP40]]
+; CHECK-NEXT:    [[TMP46:%.*]] = mul <4 x i32> [[TMP43]], [[TMP48]]
+; CHECK-NEXT:    [[TMP51:%.*]] = mul <4 x i32> [[TMP45]], [[TMP50]]
 ; CHECK-NEXT:    store <4 x i32> [[TMP36]], ptr [[DST0]], align 4
 ; CHECK-NEXT:    store <4 x i32> [[TMP41]], ptr [[DST4]], align 4
 ; CHECK-NEXT:    store <4 x i32> [[TMP46]], ptr [[DST8]], align 4
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/matmul.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/matmul.ll
index 10f07f158175d..69c5812e57122 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/matmul.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/matmul.ll
@@ -20,6 +20,9 @@ define void @wrap_mul4(ptr nocapture %Out, ptr nocapture readonly %A, ptr nocapt
 ; CHECK-NEXT:    [[TEMP10:%.*]] = load double, ptr [[ARRAYIDX47_I]], align 8
 ; CHECK-NEXT:    [[ARRAYIDX52_I:%.*]] = getelementptr inbounds [2 x double], ptr [[A]], i64 1, i64 1
 ; CHECK-NEXT:    [[TEMP11:%.*]] = load double, ptr [[ARRAYIDX52_I]], align 8
+; CHECK-NEXT:    [[RES_I_SROA_5_0_OUT2_I_SROA_IDX4:%.*]] = getelementptr inbounds double, ptr [[OUT:%.*]], i64 2
+; CHECK-NEXT:    [[RES_I_SROA_7_0_OUT2_I_SROA_IDX8:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 4
+; CHECK-NEXT:    [[RES_I_SROA_9_0_OUT2_I_SROA_IDX12:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 6
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B]], align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[TEMP]], i32 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <2 x i32> zeroinitializer
@@ -29,15 +32,11 @@ define void @wrap_mul4(ptr nocapture %Out, ptr nocapture readonly %A, ptr nocapt
 ; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP8:%.*]] = fmul <2 x double> [[TMP7]], [[TMP5]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = fadd <2 x double> [[TMP4]], [[TMP8]]
-; CHECK-NEXT:    [[RES_I_SROA_5_0_OUT2_I_SROA_IDX4:%.*]] = getelementptr inbounds double, ptr [[OUT:%.*]], i64 2
 ; CHECK-NEXT:    [[TMP10:%.*]] = load <2 x double>, ptr [[ARRAYIDX25_I]], align 8
 ; CHECK-NEXT:    [[TMP11:%.*]] = fmul <2 x double> [[TMP3]], [[TMP10]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = load <2 x double>, ptr [[ARRAYIDX30_I]], align 8
 ; CHECK-NEXT:    [[TMP13:%.*]] = fmul <2 x double> [[TMP7]], [[TMP12]]
 ; CHECK-NEXT:    [[TMP14:%.*]] = fadd <2 x double> [[TMP11]], [[TMP13]]
-; CHECK-NEXT:    store <2 x double> [[TMP9]], ptr [[OUT]], align 8
-; CHECK-NEXT:    store <2 x double> [[TMP14]], ptr [[RES_I_SROA_5_0_OUT2_I_SROA_IDX4]], align 8
-; CHECK-NEXT:    [[RES_I_SROA_7_0_OUT2_I_SROA_IDX8:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 4
 ; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <2 x double> poison, double [[TEMP10]], i32 0
 ; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <2 x double> [[TMP15]], <2 x double> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP17:%.*]] = fmul <2 x double> [[TMP1]], [[TMP16]]
@@ -45,11 +44,12 @@ define void @wrap_mul4(ptr nocapture %Out, ptr nocapture readonly %A, ptr nocapt
 ; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <2 x double> [[TMP18]], <2 x double> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP20:%.*]] = fmul <2 x double> [[TMP5]], [[TMP19]]
 ; CHECK-NEXT:    [[TMP21:%.*]] = fadd <2 x double> [[TMP17]], [[TMP20]]
-; CHECK-NEXT:    store <2 x double> [[TMP21]], ptr [[RES_I_SROA_7_0_OUT2_I_SROA_IDX8]], align 8
-; CHECK-NEXT:    [[RES_I_SROA_9_0_OUT2_I_SROA_IDX12:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 6
 ; CHECK-NEXT:    [[TMP22:%.*]] = fmul <2 x double> [[TMP10]], [[TMP16]]
 ; CHECK-NEXT:    [[TMP23:%.*]] = fmul <2 x double> [[TMP12]], [[TMP19]]
 ; CHECK-NEXT:    [[TMP24:%.*]] = fadd <2 x double> [[TMP22]], [[TMP23]]
+; CHECK-NEXT:    store <2 x double> [[TMP9]], ptr [[OUT]], align 8
+; CHECK-NEXT:    store <2 x double> [[TMP14]], ptr [[RES_I_SROA_5_0_OUT2_I_SROA_IDX4]], align 8
+; CHECK-NEXT:    store <2 x double> [[TMP21]], ptr [[RES_I_SROA_7_0_OUT2_I_SROA_IDX8]], align 8
 ; CHECK-NEXT:    store <2 x double> [[TMP24]], ptr [[RES_I_SROA_9_0_OUT2_I_SROA_IDX12]], align 8
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/widen.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/widen.ll
index b8bf38af3668d..5bea948ef2382 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/widen.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/widen.ll
@@ -13,13 +13,13 @@ define void @PR50256(ptr %a, ptr %b, i32 %n) {
 ; CHECK-NEXT:    [[ARRAYIDX_8:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 8
 ; CHECK-NEXT:    [[ARRAYIDX3_8:%.*]] = getelementptr inbounds i16, ptr [[B:%.*]], i64 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr [[A]], align 1
-; CHECK-NEXT:    [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i16>
-; CHECK-NEXT:    [[TMP4:%.*]] = shl nuw <8 x i16> [[TMP3]], splat (i16 8)
 ; CHECK-NEXT:    [[TMP7:%.*]] = load <8 x i8>, ptr [[ARRAYIDX_8]], align 1
-; CHECK-NEXT:    [[TMP8:%.*]] = zext <8 x i8> [[TMP7]] to <8 x i16>
+; CHECK-NEXT:    [[TMP8:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i16>
+; CHECK-NEXT:    [[TMP4:%.*]] = zext <8 x i8> [[TMP7]] to <8 x i16>
 ; CHECK-NEXT:    [[TMP9:%.*]] = shl nuw <8 x i16> [[TMP8]], splat (i16 8)
-; CHECK-NEXT:    store <8 x i16> [[TMP4]], ptr [[B]], align 2
-; CHECK-NEXT:    store <8 x i16> [[TMP9]], ptr [[ARRAYIDX3_8]], align 2
+; CHECK-NEXT:    [[TMP6:%.*]] = shl nuw <8 x i16> [[TMP4]], splat (i16 8)
+; CHECK-NEXT:    store <8 x i16> [[TMP9]], ptr [[B]], align 2
+; CHECK-NEXT:    store <8 x i16> [[TMP6]], ptr [[ARRAYIDX3_8]], align 2
 ; CHECK-NEXT:    ret void
 ;
   %arrayidx.1 = getelementptr inbounds i8, ptr %a, i64 1

>From 52d5e9559e54c6d1aa640e7549294737f0e5603b Mon Sep 17 00:00:00 2001
From: bababuck <buchner.ryan at gmail.com>
Date: Sat, 27 Dec 2025 22:54:38 -0800
Subject: [PATCH 15/19] [SLP][NFC] Adjust indentation

---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 159 +++++++++---------
 1 file changed, 80 insertions(+), 79 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 2d1e9140c116f..106012fb42b9c 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -8961,100 +8961,101 @@ void BoUpSLP::buildExternalUses(
   DenseMap<Value *, unsigned> ScalarToExtUses;
   // Collect the values that we need to extract from the tree.
   for (auto &VT : VectorizableTree) {
-  for (auto &TEPtr : VT) {
-    TreeEntry *Entry = TEPtr.get();
-
-    // No need to handle users of gathered values.
-    if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize)
-      continue;
-
-    // For each lane:
-    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
-      Value *Scalar = Entry->Scalars[Lane];
-      if (!isa<Instruction>(Scalar) || Entry->isCopyableElement(Scalar))
-        continue;
+    for (auto &TEPtr : VT) {
+      TreeEntry *Entry = TEPtr.get();
 
-      // All uses must be replaced already? No need to do it again.
-      auto It = ScalarToExtUses.find(Scalar);
-      if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
+      // No need to handle users of gathered values.
+      if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize)
         continue;
 
-      if (Scalar->hasNUsesOrMore(NumVectScalars)) {
-        unsigned FoundLane = Entry->findLaneForValue(Scalar);
-        LLVM_DEBUG(dbgs() << "SLP: Need to extract from lane " << FoundLane
-                          << " from " << *Scalar << "for many users.\n");
-        It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
-        ExternalUses.emplace_back(Scalar, nullptr, *Entry, FoundLane);
-        ExternalUsesWithNonUsers.insert(Scalar);
-        continue;
-      }
+      // For each lane:
+      for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
+        Value *Scalar = Entry->Scalars[Lane];
+        if (!isa<Instruction>(Scalar) || Entry->isCopyableElement(Scalar))
+          continue;
 
-      // Check if the scalar is externally used as an extra arg.
-      const auto ExtI = ExternallyUsedValues.find(Scalar);
-      if (ExtI != ExternallyUsedValues.end()) {
-        unsigned FoundLane = Entry->findLaneForValue(Scalar);
-        LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
-                          << FoundLane << " from " << *Scalar << ".\n");
-        ScalarToExtUses.try_emplace(Scalar, ExternalUses.size());
-        ExternalUses.emplace_back(Scalar, nullptr, *Entry, FoundLane);
-        continue;
-      }
-      for (User *U : Scalar->users()) {
-        LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
+        // All uses must be replaced already? No need to do it again.
+        auto It = ScalarToExtUses.find(Scalar);
+        if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
+          continue;
 
-        Instruction *UserInst = dyn_cast<Instruction>(U);
-        if (!UserInst || isDeleted(UserInst))
+        if (Scalar->hasNUsesOrMore(NumVectScalars)) {
+          unsigned FoundLane = Entry->findLaneForValue(Scalar);
+          LLVM_DEBUG(dbgs() << "SLP: Need to extract from lane " << FoundLane
+                            << " from " << *Scalar << "for many users.\n");
+          It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
+          ExternalUses.emplace_back(Scalar, nullptr, *Entry, FoundLane);
+          ExternalUsesWithNonUsers.insert(Scalar);
           continue;
+        }
 
-        // Ignore users in the user ignore list.
-        if (UserIgnoreList && UserIgnoreList->contains(UserInst))
+        // Check if the scalar is externally used as an extra arg.
+        const auto ExtI = ExternallyUsedValues.find(Scalar);
+        if (ExtI != ExternallyUsedValues.end()) {
+          unsigned FoundLane = Entry->findLaneForValue(Scalar);
+          LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
+                            << FoundLane << " from " << *Scalar << ".\n");
+          ScalarToExtUses.try_emplace(Scalar, ExternalUses.size());
+          ExternalUses.emplace_back(Scalar, nullptr, *Entry, FoundLane);
           continue;
+        }
+        for (User *U : Scalar->users()) {
+          LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
 
-        // Skip in-tree scalars that become vectors
-        if (ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U);
-            !UseEntries.empty()) {
-          // Some in-tree scalars will remain as scalar in vectorized
-          // instructions. If that is the case, the one in FoundLane will
-          // be used.
-          if (!((Scalar->getType()->getScalarType()->isPointerTy() &&
-                 isa<LoadInst, StoreInst>(UserInst)) ||
-                isa<CallInst>(UserInst)) ||
-              all_of(UseEntries, [&](TreeEntry *UseEntry) {
-                return UseEntry->State == TreeEntry::ScatterVectorize ||
-                       !doesInTreeUserNeedToExtract(
-                           Scalar, getRootEntryInstruction(*UseEntry), TLI,
-                           TTI);
-              })) {
-            LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
-                              << ".\n");
-            assert(none_of(UseEntries,
-                           [](TreeEntry *UseEntry) {
-                             return UseEntry->isGather();
-                           }) &&
-                   "Bad state");
+          Instruction *UserInst = dyn_cast<Instruction>(U);
+          if (!UserInst || isDeleted(UserInst))
+            continue;
+
+          // Ignore users in the user ignore list.
+          if (UserIgnoreList && UserIgnoreList->contains(UserInst))
             continue;
+
+          // Skip in-tree scalars that become vectors
+          if (ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U);
+              !UseEntries.empty()) {
+            // Some in-tree scalars will remain as scalar in vectorized
+            // instructions. If that is the case, the one in FoundLane will
+            // be used.
+            if (!((Scalar->getType()->getScalarType()->isPointerTy() &&
+                   isa<LoadInst, StoreInst>(UserInst)) ||
+                  isa<CallInst>(UserInst)) ||
+                all_of(UseEntries, [&](TreeEntry *UseEntry) {
+                  return UseEntry->State == TreeEntry::ScatterVectorize ||
+                         !doesInTreeUserNeedToExtract(
+                             Scalar, getRootEntryInstruction(*UseEntry), TLI,
+                             TTI);
+                })) {
+              LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
+                                << ".\n");
+              assert(none_of(UseEntries,
+                             [](TreeEntry *UseEntry) {
+                               return UseEntry->isGather();
+                             }) &&
+                     "Bad state");
+              continue;
+            }
+            U = nullptr;
+            if (It != ScalarToExtUses.end()) {
+              ExternalUses[It->second].User = nullptr;
+              break;
+            }
           }
-          U = nullptr;
-          if (It != ScalarToExtUses.end()) {
-            ExternalUses[It->second].User = nullptr;
+
+          if (U && Scalar->hasNUsesOrMore(UsesLimit))
+            U = nullptr;
+          unsigned FoundLane = Entry->findLaneForValue(Scalar);
+          LLVM_DEBUG(dbgs()
+                     << "SLP: Need to extract:" << *UserInst << " from lane "
+                     << FoundLane << " from " << *Scalar << ".\n");
+          It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
+          ExternalUses.emplace_back(Scalar, U, *Entry, FoundLane);
+          ExternalUsesWithNonUsers.insert(Scalar);
+          if (!U)
             break;
-          }
         }
-
-        if (U && Scalar->hasNUsesOrMore(UsesLimit))
-          U = nullptr;
-        unsigned FoundLane = Entry->findLaneForValue(Scalar);
-        LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *UserInst
-                          << " from lane " << FoundLane << " from " << *Scalar
-                          << ".\n");
-        It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
-        ExternalUses.emplace_back(Scalar, U, *Entry, FoundLane);
-        ExternalUsesWithNonUsers.insert(Scalar);
-        if (!U)
-          break;
       }
     }
-  }}
+  }
 }
 
 SmallVector<SmallVector<StoreInst *>>

>From 1a4a6bfea4cd0b639447ed54ec9b56e245959a9d Mon Sep 17 00:00:00 2001
From: bababuck <buchner.ryan at gmail.com>
Date: Mon, 29 Dec 2025 16:07:47 -0800
Subject: [PATCH 16/19] [SLP] Update transformNodes() to operate on all
 VectorizableTrees at once

Supports gathering across trees.
---
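The per-tree logic moves into a withinNodeTransform lambda, and a small
driver runs it over the forest; simplified:

    auto withinNodeTransform = [&](VecTreeTy &VT) -> bool {
      // ... per-tree gather reordering and node transforms ...
      return true; // false means this tree hit an early-exit heuristic
    };
    bool Cont = false;
    for (auto &VT : VectorizableTree)
      Cont |= withinNodeTransform(VT); // proceed if any tree wants more work
    if (!Cont)
      return;
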
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 92 ++++++++++---------
 1 file changed, 50 insertions(+), 42 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 106012fb42b9c..d510ac119e1f5 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -13079,19 +13079,11 @@ static InstructionCost canConvertToFMA(ArrayRef<Value *> VL,
 }
 
 void BoUpSLP::transformNodes() {
+  auto withinNodeTransform = [&](VecTreeTy &VT) -> bool {
   constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
-  BaseGraphSize = VectorizableTree.back().size();
-  // Turn graph transforming mode on and off, when done.
-  class GraphTransformModeRAAI {
-    bool &SavedIsGraphTransformMode;
+  BaseGraphSize = VT.size();
 
-  public:
-    GraphTransformModeRAAI(bool &IsGraphTransformMode)
-        : SavedIsGraphTransformMode(IsGraphTransformMode) {
-      IsGraphTransformMode = true;
-    }
-    ~GraphTransformModeRAAI() { SavedIsGraphTransformMode = false; }
-  } TransformContext(IsGraphTransformMode);
+  // Turn graph transforming mode on and off, when done.
   // Operands are profitable if they are:
   // 1. At least one constant
   // or
@@ -13118,7 +13110,7 @@ void BoUpSLP::transformNodes() {
 
   // Try to reorder gather nodes for better vectorization opportunities.
   for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
-    TreeEntry &E = *VectorizableTree.back()[Idx];
+    TreeEntry &E = *VT[Idx];
     if (E.isGather())
       reorderGatherNode(E);
   }
@@ -13127,12 +13119,11 @@ void BoUpSLP::transformNodes() {
   // gathered nodes each having less than 16 elements.
   constexpr unsigned VFLimit = 16;
   bool ForceLoadGather =
-      count_if(VectorizableTree.back(),
-               [&](const std::unique_ptr<TreeEntry> &TE) {
-                 return TE->isGather() && TE->hasState() &&
-                        TE->getOpcode() == Instruction::Load &&
-                        TE->getVectorFactor() < VFLimit;
-               }) == 2;
+      count_if(VT, [&](const std::unique_ptr<TreeEntry> &TE) {
+        return TE->isGather() && TE->hasState() &&
+               TE->getOpcode() == Instruction::Load &&
+               TE->getVectorFactor() < VFLimit;
+      }) == 2;
 
   // Checks if the scalars are used in other node.
   auto AreReusedScalars = [&](const TreeEntry *TE, ArrayRef<Value *> VL,
@@ -13189,15 +13180,14 @@ void BoUpSLP::transformNodes() {
   };
   // The tree may grow here, so iterate over nodes, built before.
   for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
-    TreeEntry &E = *VectorizableTree.back()[Idx];
+    TreeEntry &E = *VT[Idx];
     if (E.isGather()) {
       ArrayRef<Value *> VL = E.Scalars;
       const unsigned Sz = getVectorElementSize(VL.front());
       unsigned MinVF = getMinVF(2 * Sz);
       // Do not try partial vectorization for small nodes (<= 2), nodes with the
       // same opcode and same parent block or all constants.
-      if (VL.size() <= 2 ||
-          LoadEntriesToVectorize.contains({VectorizableTree.size() - 1, Idx}) ||
+      if (VL.size() <= 2 || LoadEntriesToVectorize.contains({E.CntIdx, Idx}) ||
           !(!E.hasState() || E.getOpcode() == Instruction::Load ||
             // We use allSameOpcode instead of isAltShuffle because we don't
             // want to use interchangeable instruction here.
@@ -13325,19 +13315,19 @@ void BoUpSLP::transformNodes() {
             // If any instruction is vectorized already - do not try again.
             SameTE = getSameValuesTreeEntry(*It, Slice);
           }
-          unsigned PrevSize = VectorizableTree.back().size();
+          unsigned PrevSize = VT.size();
           [[maybe_unused]] unsigned PrevEntriesSize =
               LoadEntriesToVectorize.size();
           buildTreeRec(Slice, 0, EdgeInfo(&E, UINT_MAX));
-          if (PrevSize + 1 == VectorizableTree.back().size() && !SameTE &&
-              VectorizableTree.back()[PrevSize]->isGather() &&
-              VectorizableTree.back()[PrevSize]->hasState() &&
-              VectorizableTree.back()[PrevSize]->getOpcode() !=
+          if (PrevSize + 1 == VT.size() && !SameTE &&
+              VT[PrevSize]->isGather() &&
+              VT[PrevSize]->hasState() &&
+              VT[PrevSize]->getOpcode() !=
                   Instruction::ExtractElement &&
               !isSplat(Slice)) {
             if (UserIgnoreList && E.Idx == 0 && VF == 2)
               analyzedReductionVals(Slice);
-            VectorizableTree.back().pop_back();
+            VT.pop_back();
             assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
                    "LoadEntriesToVectorize expected to remain the same");
             continue;
@@ -13490,30 +13480,47 @@ void BoUpSLP::transformNodes() {
 
   if (LoadEntriesToVectorize.empty()) {
     // Single load node - exit.
-    if (VectorizableTree.back().size() <= 1 &&
-        VectorizableTree.back().front()->hasState() &&
-        VectorizableTree.back().front()->getOpcode() == Instruction::Load)
-      return;
+    if (VT.size() <= 1 && VT.front()->hasState() &&
+        VT.front()->getOpcode() == Instruction::Load)
+      return false;
     // Small graph with small VF - exit.
     constexpr unsigned SmallTree = 3;
     constexpr unsigned SmallVF = 2;
-    if ((VectorizableTree.back().size() <= SmallTree &&
-         VectorizableTree.back().front()->Scalars.size() == SmallVF) ||
-        (VectorizableTree.back().size() <= 2 && UserIgnoreList))
-      return;
+    if ((VT.size() <= SmallTree &&
+         VT.front()->Scalars.size() == SmallVF) ||
+        (VT.size() <= 2 && UserIgnoreList))
+      return false;
 
-    if (VectorizableTree.back().front()->isNonPowOf2Vec() &&
+    if (VT.front()->isNonPowOf2Vec() &&
         getCanonicalGraphSize() != getTreeSize() && UserIgnoreList &&
         getCanonicalGraphSize() <= SmallTree &&
-        count_if(ArrayRef(VectorizableTree.back())
-                     .drop_front(getCanonicalGraphSize()),
+        count_if(ArrayRef(VT).drop_front(getCanonicalGraphSize()),
                  [](const std::unique_ptr<TreeEntry> &TE) {
                    return TE->isGather() && TE->hasState() &&
                           TE->getOpcode() == Instruction::Load &&
                           !allSameBlock(TE->Scalars);
                  }) == 1)
-      return;
+      return false;
   }
+  return true;
+  };
+
+  class GraphTransformModeRAII {
+    bool &SavedIsGraphTransformMode;
+
+  public:
+    GraphTransformModeRAII(bool &IsGraphTransformMode)
+        : SavedIsGraphTransformMode(IsGraphTransformMode) {
+      IsGraphTransformMode = true;
+    }
+    ~GraphTransformModeRAII() { SavedIsGraphTransformMode = false; }
+  } TransformContext(IsGraphTransformMode);
+
+  bool Cont = false;
+  for (auto &VT : VectorizableTree)
+    Cont |= withinNodeTransform(VT);
+  if (!Cont)
+    return;
 
   // A list of loads to be gathered during the vectorization process. We can
   // try to vectorize them at the end, if profitable.
@@ -13521,7 +13528,8 @@ void BoUpSLP::transformNodes() {
                  SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>, 8>
       GatheredLoads;
 
-  for (std::unique_ptr<TreeEntry> &TE : VectorizableTree.back()) {
+  for (auto &VT : VectorizableTree) {
+    for (std::unique_ptr<TreeEntry> &TE : VT) {
     TreeEntry &E = *TE;
     if (E.isGather() &&
         ((E.hasState() && E.getOpcode() == Instruction::Load) ||
@@ -13546,7 +13554,7 @@ void BoUpSLP::transformNodes() {
                 LI->getType())]);
       }
     }
-  }
+  }}
   // Try to vectorize gathered loads if this is not just a gather of loads.
   if (!GatheredLoads.empty())
     tryToVectorizeGatheredLoads(GatheredLoads);
@@ -23268,7 +23276,6 @@ SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
     R.reorderTopToBottom();
     R.reorderBottomToTop();
   }
-  R.transformNodes();
 
   R.computeMinimumValueSizes();
 
@@ -23603,6 +23610,7 @@ bool SLPVectorizerPass::vectorizeStores(
               }
               if (Res && *Res) {
                 if (TreeSize) {
+                  R.transformNodes();
                   R.buildExternalUses();
                   InstructionCost Cost = R.getTreeCost();
 

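Note for reviewers: the structural change in this patch is that transformNodes() now applies the per-tree logic (the withinNodeTransform lambda) to every tree of the VectorizableTree forest, with an RAII guard keeping IsGraphTransformMode set for the duration, and the per-tree results OR-ed together so the gathered-loads analysis runs only when at least one tree is worth pursuing. A minimal sketch of that shape, with stand-in types (TreeEntry, VecTreeTy, and the helper body below are placeholders, not the upstream definitions):

    #include <vector>

    struct TreeEntry {};                      // stand-in for SLP's TreeEntry
    using VecTreeTy = std::vector<TreeEntry>; // stand-in for one tree

    // Per-tree transform; returns false for the early-exit cases (single
    // load node, small graph with small VF, ...).
    static bool withinNodeTransform(VecTreeTy &VT) { return VT.size() > 1; }

    // RAII guard: the transform-mode flag is set only while transforming.
    class GraphTransformModeRAII {
      bool &Flag;

    public:
      explicit GraphTransformModeRAII(bool &F) : Flag(F) { Flag = true; }
      ~GraphTransformModeRAII() { Flag = false; }
    };

    static void transformForest(std::vector<VecTreeTy> &Forest,
                                bool &IsGraphTransformMode) {
      GraphTransformModeRAII Guard(IsGraphTransformMode);
      bool Cont = false; // must start false: |= means "any tree continues"
      for (VecTreeTy &VT : Forest)
        Cont |= withinNodeTransform(VT);
      if (!Cont)
        return;
      // ... gathered-loads analysis over the whole forest ...
    }

Note that the OR-accumulation only gives an early exit if Cont starts out false; with a true initializer the check is dead.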
>From fa98d77535b2976cc422e0e49d03540a23f706c1 Mon Sep 17 00:00:00 2001
From: bababuck <buchner.ryan at gmail.com>
Date: Mon, 29 Dec 2025 16:13:35 -0800
Subject: [PATCH 17/19] [SLP][NFC] Adjust indentation

---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 823 +++++++++---------
 1 file changed, 416 insertions(+), 407 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index d510ac119e1f5..0239ecd857ff7 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -13080,91 +13080,78 @@ static InstructionCost canConvertToFMA(ArrayRef<Value *> VL,
 
 void BoUpSLP::transformNodes() {
   auto withinNodeTransform = [&](VecTreeTy &VT) -> bool {
-  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
-  BaseGraphSize = VT.size();
-
-  // Turn graph transforming mode on and off, when done.
-  // Operands are profitable if they are:
-  // 1. At least one constant
-  // or
-  // 2. Splats
-  // or
-  // 3. Results in good vectorization opportunity, i.e. may generate vector
-  // nodes and reduce cost of the graph.
-  auto CheckOperandsProfitability = [this](Instruction *I1, Instruction *I2,
-                                           const InstructionsState &S) {
-    SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
-    for (unsigned Op : seq<unsigned>(S.getMainOp()->getNumOperands()))
-      Candidates.emplace_back().emplace_back(I1->getOperand(Op),
-                                             I2->getOperand(Op));
-    return all_of(
-        Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
-          return all_of(Cand,
-                        [](const std::pair<Value *, Value *> &P) {
-                          return isa<Constant>(P.first) ||
-                                 isa<Constant>(P.second) || P.first == P.second;
-                        }) ||
-                 findBestRootPair(Cand, LookAheadHeuristics::ScoreSplatLoads);
-        });
-  };
+    constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+    BaseGraphSize = VT.size();
+
+    // Turn graph transforming mode on, and off when done.
+    // Operands are profitable if they are:
+    // 1. At least one constant
+    // or
+    // 2. Splats
+    // or
+    // 3. Results in good vectorization opportunity, i.e. may generate vector
+    // nodes and reduce cost of the graph.
+    auto CheckOperandsProfitability = [this](Instruction *I1, Instruction *I2,
+                                             const InstructionsState &S) {
+      SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
+      for (unsigned Op : seq<unsigned>(S.getMainOp()->getNumOperands()))
+        Candidates.emplace_back().emplace_back(I1->getOperand(Op),
+                                               I2->getOperand(Op));
+      return all_of(
+          Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
+            return all_of(Cand,
+                          [](const std::pair<Value *, Value *> &P) {
+                            return isa<Constant>(P.first) ||
+                                   isa<Constant>(P.second) ||
+                                   P.first == P.second;
+                          }) ||
+                   findBestRootPair(Cand, LookAheadHeuristics::ScoreSplatLoads);
+          });
+    };
 
-  // Try to reorder gather nodes for better vectorization opportunities.
-  for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
-    TreeEntry &E = *VT[Idx];
-    if (E.isGather())
-      reorderGatherNode(E);
-  }
-
-  // Better to use full gathered loads analysis, if there are only 2 loads
-  // gathered nodes each having less than 16 elements.
-  constexpr unsigned VFLimit = 16;
-  bool ForceLoadGather =
-      count_if(VT, [&](const std::unique_ptr<TreeEntry> &TE) {
-        return TE->isGather() && TE->hasState() &&
-               TE->getOpcode() == Instruction::Load &&
-               TE->getVectorFactor() < VFLimit;
-      }) == 2;
-
-  // Checks if the scalars are used in other node.
-  auto AreReusedScalars = [&](const TreeEntry *TE, ArrayRef<Value *> VL,
-                              function_ref<bool(Value *)> CheckContainer) {
-    return TE->isSame(VL) || all_of(VL, [&](Value *V) {
-             if (isa<PoisonValue>(V))
-               return true;
-             auto *I = dyn_cast<Instruction>(V);
-             if (!I)
-               return false;
-             return is_contained(TE->Scalars, I) || CheckContainer(I);
-           });
-  };
-  auto CheckForSameVectorNodes = [&](const TreeEntry &E) {
-    if (E.hasState()) {
-      if (ArrayRef<TreeEntry *> TEs = getTreeEntries(E.getMainOp());
-          !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
-            return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
-              ArrayRef<TreeEntry *> VTEs = getTreeEntries(V);
-              return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
-                return is_contained(TEs, TE);
-              });
-            });
-          }))
-        return true;
-      ;
-      if (ArrayRef<TreeEntry *> TEs = getSplitTreeEntries(E.getMainOp());
-          !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
-            return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
-              ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
-              return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
-                return is_contained(TEs, TE);
+    // Try to reorder gather nodes for better vectorization opportunities.
+    for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
+      TreeEntry &E = *VT[Idx];
+      if (E.isGather())
+        reorderGatherNode(E);
+    }
+
+    // Better to use the full gathered-loads analysis if there are only 2
+    // gathered load nodes, each having fewer than 16 elements.
+    constexpr unsigned VFLimit = 16;
+    bool ForceLoadGather =
+        count_if(VT, [&](const std::unique_ptr<TreeEntry> &TE) {
+          return TE->isGather() && TE->hasState() &&
+                 TE->getOpcode() == Instruction::Load &&
+                 TE->getVectorFactor() < VFLimit;
+        }) == 2;
+
+    // Checks if the scalars are used in another node.
+    auto AreReusedScalars = [&](const TreeEntry *TE, ArrayRef<Value *> VL,
+                                function_ref<bool(Value *)> CheckContainer) {
+      return TE->isSame(VL) || all_of(VL, [&](Value *V) {
+               if (isa<PoisonValue>(V))
+                 return true;
+               auto *I = dyn_cast<Instruction>(V);
+               if (!I)
+                 return false;
+               return is_contained(TE->Scalars, I) || CheckContainer(I);
+             });
+    };
+    auto CheckForSameVectorNodes = [&](const TreeEntry &E) {
+      if (E.hasState()) {
+        if (ArrayRef<TreeEntry *> TEs = getTreeEntries(E.getMainOp());
+            !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
+              return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
+                ArrayRef<TreeEntry *> VTEs = getTreeEntries(V);
+                return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
+                  return is_contained(TEs, TE);
+                });
               });
-            });
-          }))
-        return true;
-    } else {
-      // Check if the gather node full copy of split node.
-      auto *It = find_if(E.Scalars, IsaPred<Instruction>);
-      if (It != E.Scalars.end()) {
-        if (ArrayRef<TreeEntry *> TEs = getSplitTreeEntries(*It);
+            }))
+          return true;
+        if (ArrayRef<TreeEntry *> TEs = getSplitTreeEntries(E.getMainOp());
             !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
               return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
                 ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
@@ -13174,335 +13161,355 @@ void BoUpSLP::transformNodes() {
               });
             }))
           return true;
+      } else {
+        // Check if the gather node is a full copy of a split node.
+        auto *It = find_if(E.Scalars, IsaPred<Instruction>);
+        if (It != E.Scalars.end()) {
+          if (ArrayRef<TreeEntry *> TEs = getSplitTreeEntries(*It);
+              !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
+                return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
+                  ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
+                  return !VTEs.empty() &&
+                         any_of(VTEs, [&](const TreeEntry *TE) {
+                           return is_contained(TEs, TE);
+                         });
+                });
+              }))
+            return true;
+        }
       }
-    }
-    return false;
-  };
-  // The tree may grow here, so iterate over nodes, built before.
-  for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
-    TreeEntry &E = *VT[Idx];
-    if (E.isGather()) {
-      ArrayRef<Value *> VL = E.Scalars;
-      const unsigned Sz = getVectorElementSize(VL.front());
-      unsigned MinVF = getMinVF(2 * Sz);
-      // Do not try partial vectorization for small nodes (<= 2), nodes with the
-      // same opcode and same parent block or all constants.
-      if (VL.size() <= 2 || LoadEntriesToVectorize.contains({E.CntIdx, Idx}) ||
-          !(!E.hasState() || E.getOpcode() == Instruction::Load ||
-            // We use allSameOpcode instead of isAltShuffle because we don't
-            // want to use interchangeable instruction here.
-            !allSameOpcode(VL) || !allSameBlock(VL)) ||
-          allConstant(VL) || isSplat(VL))
-        continue;
-      if (ForceLoadGather && E.hasState() && E.getOpcode() == Instruction::Load)
-        continue;
-      // Check if the node is a copy of other vector nodes.
-      if (CheckForSameVectorNodes(E))
-        continue;
-      // Try to find vectorizable sequences and transform them into a series of
-      // insertvector instructions.
-      unsigned StartIdx = 0;
-      unsigned End = VL.size();
-      for (unsigned VF = getFloorFullVectorNumberOfElements(
-               *TTI, VL.front()->getType(), VL.size() - 1);
-           VF >= MinVF; VF = getFloorFullVectorNumberOfElements(
-                            *TTI, VL.front()->getType(), VF - 1)) {
-        if (StartIdx + VF > End)
+      return false;
+    };
+    // The tree may grow here, so iterate over nodes, built before.
+    for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
+      TreeEntry &E = *VT[Idx];
+      if (E.isGather()) {
+        ArrayRef<Value *> VL = E.Scalars;
+        const unsigned Sz = getVectorElementSize(VL.front());
+        unsigned MinVF = getMinVF(2 * Sz);
+        // Do not try partial vectorization for small nodes (<= 2), nodes with
+        // the same opcode and same parent block or all constants.
+        if (VL.size() <= 2 ||
+            LoadEntriesToVectorize.contains({E.CntIdx, Idx}) ||
+            !(!E.hasState() || E.getOpcode() == Instruction::Load ||
+              // We use allSameOpcode instead of isAltShuffle because we don't
+              // want to use interchangeable instruction here.
+              !allSameOpcode(VL) || !allSameBlock(VL)) ||
+            allConstant(VL) || isSplat(VL))
           continue;
-        SmallVector<std::pair<unsigned, unsigned>> Slices;
-        bool AllStrided = true;
-        for (unsigned Cnt = StartIdx; Cnt + VF <= End; Cnt += VF) {
-          ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
-          // If any instruction is vectorized already - do not try again.
-          // Reuse the existing node, if it fully matches the slice.
-          if (isVectorized(Slice.front()) &&
-              !getSameValuesTreeEntry(Slice.front(), Slice, /*SameVF=*/true))
-            continue;
-          // Constant already handled effectively - skip.
-          if (allConstant(Slice))
+        if (ForceLoadGather && E.hasState() &&
+            E.getOpcode() == Instruction::Load)
+          continue;
+        // Check if the node is a copy of other vector nodes.
+        if (CheckForSameVectorNodes(E))
+          continue;
+        // Try to find vectorizable sequences and transform them into a series
+        // of insertvector instructions.
+        unsigned StartIdx = 0;
+        unsigned End = VL.size();
+        for (unsigned VF = getFloorFullVectorNumberOfElements(
+                 *TTI, VL.front()->getType(), VL.size() - 1);
+             VF >= MinVF; VF = getFloorFullVectorNumberOfElements(
+                              *TTI, VL.front()->getType(), VF - 1)) {
+          if (StartIdx + VF > End)
             continue;
-          // Do not try to vectorize small splats (less than vector register and
-          // only with the single non-undef element).
-          bool IsSplat = isSplat(Slice);
-          bool IsTwoRegisterSplat = true;
-          if (IsSplat && VF == 2) {
-            unsigned NumRegs2VF = ::getNumberOfParts(
-                *TTI, getWidenedType(Slice.front()->getType(), 2 * VF));
-            IsTwoRegisterSplat = NumRegs2VF == 2;
-          }
-          if (Slices.empty() || !IsSplat || !IsTwoRegisterSplat ||
-              count(Slice, Slice.front()) ==
-                  static_cast<long>(isa<UndefValue>(Slice.front()) ? VF - 1
-                                                                   : 1)) {
-            if (IsSplat)
+          SmallVector<std::pair<unsigned, unsigned>> Slices;
+          bool AllStrided = true;
+          for (unsigned Cnt = StartIdx; Cnt + VF <= End; Cnt += VF) {
+            ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
+            // If any instruction is vectorized already - do not try again.
+            // Reuse the existing node, if it fully matches the slice.
+            if (isVectorized(Slice.front()) &&
+                !getSameValuesTreeEntry(Slice.front(), Slice, /*SameVF=*/true))
               continue;
-            InstructionsState S = getSameOpcode(Slice, *TLI);
-            if (!S || !allSameOpcode(Slice) || !allSameBlock(Slice) ||
-                (S.getOpcode() == Instruction::Load &&
-                 areKnownNonVectorizableLoads(Slice)) ||
-                (S.getOpcode() != Instruction::Load &&
-                 !hasFullVectorsOrPowerOf2(*TTI, Slice.front()->getType(), VF)))
+            // Constant already handled effectively - skip.
+            if (allConstant(Slice))
               continue;
-            if (VF == 2) {
-              // Try to vectorize reduced values or if all users are vectorized.
-              // For expensive instructions extra extracts might be profitable.
-              if ((!UserIgnoreList || E.Idx != 0) &&
-                  TTI->getInstructionCost(S.getMainOp(), CostKind) <
-                      TTI::TCC_Expensive &&
-                  !all_of(Slice, [&](Value *V) {
-                    if (isa<PoisonValue>(V))
-                      return true;
-                    return areAllUsersVectorized(cast<Instruction>(V),
-                                                 UserIgnoreList);
-                  }))
+            // Do not try to vectorize small splats (less than vector register
+            // and only with the single non-undef element).
+            bool IsSplat = isSplat(Slice);
+            bool IsTwoRegisterSplat = true;
+            if (IsSplat && VF == 2) {
+              unsigned NumRegs2VF = ::getNumberOfParts(
+                  *TTI, getWidenedType(Slice.front()->getType(), 2 * VF));
+              IsTwoRegisterSplat = NumRegs2VF == 2;
+            }
+            if (Slices.empty() || !IsSplat || !IsTwoRegisterSplat ||
+                count(Slice, Slice.front()) ==
+                    static_cast<long>(isa<UndefValue>(Slice.front()) ? VF - 1
+                                                                     : 1)) {
+              if (IsSplat)
                 continue;
-              if (S.getOpcode() == Instruction::Load) {
-                OrdersType Order;
-                SmallVector<Value *> PointerOps;
-                StridedPtrInfo SPtrInfo;
-                LoadsState Res = canVectorizeLoads(Slice, Slice.front(), Order,
-                                                   PointerOps, SPtrInfo);
-                AllStrided &= Res == LoadsState::StridedVectorize ||
-                              Res == LoadsState::ScatterVectorize ||
-                              Res == LoadsState::Gather;
-                // Do not vectorize gathers.
-                if (Res == LoadsState::ScatterVectorize ||
-                    Res == LoadsState::Gather) {
-                  if (Res == LoadsState::Gather) {
-                    registerNonVectorizableLoads(Slice);
-                    // If reductions and the scalars from the root node are
-                    // analyzed - mark as non-vectorizable reduction.
-                    if (UserIgnoreList && E.Idx == 0)
-                      analyzedReductionVals(Slice);
+              InstructionsState S = getSameOpcode(Slice, *TLI);
+              if (!S || !allSameOpcode(Slice) || !allSameBlock(Slice) ||
+                  (S.getOpcode() == Instruction::Load &&
+                   areKnownNonVectorizableLoads(Slice)) ||
+                  (S.getOpcode() != Instruction::Load &&
+                   !hasFullVectorsOrPowerOf2(*TTI, Slice.front()->getType(),
+                                             VF)))
+                continue;
+              if (VF == 2) {
+                // Try to vectorize reduced values or if all users are
+                // vectorized. For expensive instructions extra extracts might
+                // be profitable.
+                if ((!UserIgnoreList || E.Idx != 0) &&
+                    TTI->getInstructionCost(S.getMainOp(), CostKind) <
+                        TTI::TCC_Expensive &&
+                    !all_of(Slice, [&](Value *V) {
+                      if (isa<PoisonValue>(V))
+                        return true;
+                      return areAllUsersVectorized(cast<Instruction>(V),
+                                                   UserIgnoreList);
+                    }))
+                  continue;
+                if (S.getOpcode() == Instruction::Load) {
+                  OrdersType Order;
+                  SmallVector<Value *> PointerOps;
+                  StridedPtrInfo SPtrInfo;
+                  LoadsState Res = canVectorizeLoads(
+                      Slice, Slice.front(), Order, PointerOps, SPtrInfo);
+                  AllStrided &= Res == LoadsState::StridedVectorize ||
+                                Res == LoadsState::ScatterVectorize ||
+                                Res == LoadsState::Gather;
+                  // Do not vectorize gathers.
+                  if (Res == LoadsState::ScatterVectorize ||
+                      Res == LoadsState::Gather) {
+                    if (Res == LoadsState::Gather) {
+                      registerNonVectorizableLoads(Slice);
+                      // If reductions and the scalars from the root node are
+                      // analyzed - mark as non-vectorizable reduction.
+                      if (UserIgnoreList && E.Idx == 0)
+                        analyzedReductionVals(Slice);
+                    }
+                    continue;
                   }
+                } else if (S.getOpcode() == Instruction::ExtractElement ||
+                           (TTI->getInstructionCost(S.getMainOp(), CostKind) <
+                                TTI::TCC_Expensive &&
+                            !CheckOperandsProfitability(
+                                S.getMainOp(),
+                                cast<Instruction>(*find_if(
+                                    reverse(Slice), IsaPred<Instruction>)),
+                                S))) {
+                  // Do not vectorize extractelements (handled effectively
+                  // already). Do not vectorize non-profitable instructions
+                  // (with low cost and non-vectorizable operands).
                   continue;
                 }
-              } else if (S.getOpcode() == Instruction::ExtractElement ||
-                         (TTI->getInstructionCost(S.getMainOp(), CostKind) <
-                              TTI::TCC_Expensive &&
-                          !CheckOperandsProfitability(
-                              S.getMainOp(),
-                              cast<Instruction>(*find_if(reverse(Slice),
-                                                         IsaPred<Instruction>)),
-                              S))) {
-                // Do not vectorize extractelements (handled effectively
-                // alread). Do not vectorize non-profitable instructions (with
-                // low cost and non-vectorizable operands.)
-                continue;
               }
             }
+            Slices.emplace_back(Cnt, Slice.size());
           }
-          Slices.emplace_back(Cnt, Slice.size());
-        }
-        // Do not try to vectorize if all slides are strided or gathered with
-        // vector factor 2 and there are more than 2 slices. Better to handle
-        // them in gathered loads analysis, may result in better vectorization.
-        if (VF == 2 && AllStrided && Slices.size() > 2)
-          continue;
-        auto AddCombinedNode = [&](unsigned Idx, unsigned Cnt, unsigned Sz) {
-          E.CombinedEntriesWithIndices.emplace_back(
-              Idx, Cnt, VectorizableTree.size() - 1);
-          if (StartIdx == Cnt)
-            StartIdx = Cnt + Sz;
-          if (End == Cnt + Sz)
-            End = Cnt;
-        };
-        for (auto [Cnt, Sz] : Slices) {
-          ArrayRef<Value *> Slice = VL.slice(Cnt, Sz);
-          const TreeEntry *SameTE = nullptr;
-          if (const auto *It = find_if(Slice, IsaPred<Instruction>);
-              It != Slice.end()) {
-            // If any instruction is vectorized already - do not try again.
-            SameTE = getSameValuesTreeEntry(*It, Slice);
-          }
-          unsigned PrevSize = VT.size();
-          [[maybe_unused]] unsigned PrevEntriesSize =
-              LoadEntriesToVectorize.size();
-          buildTreeRec(Slice, 0, EdgeInfo(&E, UINT_MAX));
-          if (PrevSize + 1 == VT.size() && !SameTE &&
-              VT[PrevSize]->isGather() &&
-              VT[PrevSize]->hasState() &&
-              VT[PrevSize]->getOpcode() !=
-                  Instruction::ExtractElement &&
-              !isSplat(Slice)) {
-            if (UserIgnoreList && E.Idx == 0 && VF == 2)
-              analyzedReductionVals(Slice);
-            VT.pop_back();
-            assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
-                   "LoadEntriesToVectorize expected to remain the same");
+          // Do not try to vectorize if all slices are strided or gathered with
+          // vector factor 2 and there are more than 2 slices. Better to handle
+          // them in gathered loads analysis, may result in better
+          // vectorization.
+          if (VF == 2 && AllStrided && Slices.size() > 2)
             continue;
+          auto AddCombinedNode = [&](unsigned Idx, unsigned Cnt, unsigned Sz) {
+            E.CombinedEntriesWithIndices.emplace_back(
+                Idx, Cnt, VectorizableTree.size() - 1);
+            if (StartIdx == Cnt)
+              StartIdx = Cnt + Sz;
+            if (End == Cnt + Sz)
+              End = Cnt;
+          };
+          for (auto [Cnt, Sz] : Slices) {
+            ArrayRef<Value *> Slice = VL.slice(Cnt, Sz);
+            const TreeEntry *SameTE = nullptr;
+            if (const auto *It = find_if(Slice, IsaPred<Instruction>);
+                It != Slice.end()) {
+              // If any instruction is vectorized already - do not try again.
+              SameTE = getSameValuesTreeEntry(*It, Slice);
+            }
+            unsigned PrevSize = VT.size();
+            [[maybe_unused]] unsigned PrevEntriesSize =
+                LoadEntriesToVectorize.size();
+            buildTreeRec(Slice, 0, EdgeInfo(&E, UINT_MAX));
+            if (PrevSize + 1 == VT.size() && !SameTE &&
+                VT[PrevSize]->isGather() && VT[PrevSize]->hasState() &&
+                VT[PrevSize]->getOpcode() != Instruction::ExtractElement &&
+                !isSplat(Slice)) {
+              if (UserIgnoreList && E.Idx == 0 && VF == 2)
+                analyzedReductionVals(Slice);
+              VT.pop_back();
+              assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
+                     "LoadEntriesToVectorize expected to remain the same");
+              continue;
+            }
+            AddCombinedNode(PrevSize, Cnt, Sz);
           }
-          AddCombinedNode(PrevSize, Cnt, Sz);
         }
-      }
-      // Restore ordering, if no extra vectorization happened.
-      if (E.CombinedEntriesWithIndices.empty() && !E.ReorderIndices.empty()) {
-        SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
-        reorderScalars(E.Scalars, Mask);
-        E.ReorderIndices.clear();
-      }
-    }
-    if (!E.hasState())
-      continue;
-    switch (E.getOpcode()) {
-    case Instruction::Load: {
-      // No need to reorder masked gather loads, just reorder the scalar
-      // operands.
-      if (E.State != TreeEntry::Vectorize)
-        break;
-      Type *ScalarTy = E.getMainOp()->getType();
-      auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
-      Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
-      // Check if profitable to represent consecutive load + reverse as strided
-      // load with stride -1.
-      if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
-          TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
-        SmallVector<int> Mask;
-        inversePermutation(E.ReorderIndices, Mask);
-        auto *BaseLI = cast<LoadInst>(E.Scalars.back());
-        InstructionCost OriginalVecCost =
-            TTI->getMemoryOpCost(Instruction::Load, VecTy, BaseLI->getAlign(),
-                                 BaseLI->getPointerAddressSpace(), CostKind,
-                                 TTI::OperandValueInfo()) +
-            ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
-        InstructionCost StridedCost = TTI->getMemIntrinsicInstrCost(
-            MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_load,
-                                       VecTy, BaseLI->getPointerOperand(),
-                                       /*VariableMask=*/false, CommonAlignment,
-                                       BaseLI),
-            CostKind);
-        if (StridedCost < OriginalVecCost || ForceStridedLoads) {
-          // Strided load is more profitable than consecutive load + reverse -
-          // transform the node to strided load.
-          Type *StrideTy = DL->getIndexType(cast<LoadInst>(E.Scalars.front())
-                                                ->getPointerOperand()
-                                                ->getType());
-          StridedPtrInfo SPtrInfo;
-          SPtrInfo.StrideVal = ConstantInt::get(StrideTy, 1);
-          SPtrInfo.Ty = VecTy;
-          TreeEntryToStridedPtrInfoMap[&E] = SPtrInfo;
-          E.State = TreeEntry::StridedVectorize;
+        // Restore ordering, if no extra vectorization happened.
+        if (E.CombinedEntriesWithIndices.empty() && !E.ReorderIndices.empty()) {
+          SmallVector<int> Mask(E.ReorderIndices.begin(),
+                                E.ReorderIndices.end());
+          reorderScalars(E.Scalars, Mask);
+          E.ReorderIndices.clear();
         }
       }
-      break;
-    }
-    case Instruction::Store: {
-      Type *ScalarTy =
-          cast<StoreInst>(E.getMainOp())->getValueOperand()->getType();
-      auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
-      Align CommonAlignment = computeCommonAlignment<StoreInst>(E.Scalars);
-      // Check if profitable to represent consecutive load + reverse as strided
-      // load with stride -1.
-      if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
-          TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
-        SmallVector<int> Mask;
-        inversePermutation(E.ReorderIndices, Mask);
-        auto *BaseSI = cast<StoreInst>(E.Scalars.back());
-        InstructionCost OriginalVecCost =
-            TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
-                                 BaseSI->getPointerAddressSpace(), CostKind,
-                                 TTI::OperandValueInfo()) +
-            ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
-        InstructionCost StridedCost = TTI->getMemIntrinsicInstrCost(
-            MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_store,
-                                       VecTy, BaseSI->getPointerOperand(),
-                                       /*VariableMask=*/false, CommonAlignment,
-                                       BaseSI),
-            CostKind);
-        if (StridedCost < OriginalVecCost)
-          // Strided store is more profitable than reverse + consecutive store -
-          // transform the node to strided store.
-          E.State = TreeEntry::StridedVectorize;
-      } else if (!E.ReorderIndices.empty()) {
-        // Check for interleaved stores.
-        auto IsInterleaveMask = [&, &TTI = *TTI](ArrayRef<int> Mask) {
-          auto *BaseSI = cast<StoreInst>(E.Scalars.front());
-          assert(Mask.size() > 1 && "Expected mask greater than 1 element.");
-          if (Mask.size() < 4)
-            return 0u;
-          for (unsigned Factor : seq<unsigned>(2, Mask.size() / 2 + 1)) {
-            if (ShuffleVectorInst::isInterleaveMask(
-                    Mask, Factor, VecTy->getElementCount().getFixedValue()) &&
-                TTI.isLegalInterleavedAccessType(
-                    VecTy, Factor, BaseSI->getAlign(),
-                    BaseSI->getPointerAddressSpace()))
-              return Factor;
+      if (!E.hasState())
+        continue;
+      switch (E.getOpcode()) {
+      case Instruction::Load: {
+        // No need to reorder masked gather loads, just reorder the scalar
+        // operands.
+        if (E.State != TreeEntry::Vectorize)
+          break;
+        Type *ScalarTy = E.getMainOp()->getType();
+        auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
+        Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
+        // Check if profitable to represent consecutive load + reverse as
+        // strided load with stride -1.
+        if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
+            TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
+          SmallVector<int> Mask;
+          inversePermutation(E.ReorderIndices, Mask);
+          auto *BaseLI = cast<LoadInst>(E.Scalars.back());
+          InstructionCost OriginalVecCost =
+              TTI->getMemoryOpCost(Instruction::Load, VecTy, BaseLI->getAlign(),
+                                   BaseLI->getPointerAddressSpace(), CostKind,
+                                   TTI::OperandValueInfo()) +
+              ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
+          InstructionCost StridedCost = TTI->getMemIntrinsicInstrCost(
+              MemIntrinsicCostAttributes(
+                  Intrinsic::experimental_vp_strided_load, VecTy,
+                  BaseLI->getPointerOperand(),
+                  /*VariableMask=*/false, CommonAlignment, BaseLI),
+              CostKind);
+          if (StridedCost < OriginalVecCost || ForceStridedLoads) {
+            // Strided load is more profitable than consecutive load + reverse -
+            // transform the node to strided load.
+            Type *StrideTy = DL->getIndexType(cast<LoadInst>(E.Scalars.front())
+                                                  ->getPointerOperand()
+                                                  ->getType());
+            StridedPtrInfo SPtrInfo;
+            SPtrInfo.StrideVal = ConstantInt::get(StrideTy, 1);
+            SPtrInfo.Ty = VecTy;
+            TreeEntryToStridedPtrInfoMap[&E] = SPtrInfo;
+            E.State = TreeEntry::StridedVectorize;
           }
-
-          return 0u;
-        };
-        SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
-        unsigned InterleaveFactor = IsInterleaveMask(Mask);
-        if (InterleaveFactor != 0)
-          E.setInterleave(InterleaveFactor);
+        }
+        break;
       }
-      break;
-    }
-    case Instruction::Select: {
-      if (E.State != TreeEntry::Vectorize)
+      case Instruction::Store: {
+        Type *ScalarTy =
+            cast<StoreInst>(E.getMainOp())->getValueOperand()->getType();
+        auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
+        Align CommonAlignment = computeCommonAlignment<StoreInst>(E.Scalars);
+        // Check if profitable to represent consecutive store + reverse as
+        // strided store with stride -1.
+        if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
+            TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
+          SmallVector<int> Mask;
+          inversePermutation(E.ReorderIndices, Mask);
+          auto *BaseSI = cast<StoreInst>(E.Scalars.back());
+          InstructionCost OriginalVecCost =
+              TTI->getMemoryOpCost(Instruction::Store, VecTy,
+                                   BaseSI->getAlign(),
+                                   BaseSI->getPointerAddressSpace(), CostKind,
+                                   TTI::OperandValueInfo()) +
+              ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
+          InstructionCost StridedCost = TTI->getMemIntrinsicInstrCost(
+              MemIntrinsicCostAttributes(
+                  Intrinsic::experimental_vp_strided_store, VecTy,
+                  BaseSI->getPointerOperand(),
+                  /*VariableMask=*/false, CommonAlignment, BaseSI),
+              CostKind);
+          if (StridedCost < OriginalVecCost)
+            // Strided store is more profitable than reverse + consecutive store
+            // - transform the node to strided store.
+            E.State = TreeEntry::StridedVectorize;
+        } else if (!E.ReorderIndices.empty()) {
+          // Check for interleaved stores.
+          auto IsInterleaveMask = [&, &TTI = *TTI](ArrayRef<int> Mask) {
+            auto *BaseSI = cast<StoreInst>(E.Scalars.front());
+            assert(Mask.size() > 1 && "Expected mask greater than 1 element.");
+            if (Mask.size() < 4)
+              return 0u;
+            for (unsigned Factor : seq<unsigned>(2, Mask.size() / 2 + 1)) {
+              if (ShuffleVectorInst::isInterleaveMask(
+                      Mask, Factor, VecTy->getElementCount().getFixedValue()) &&
+                  TTI.isLegalInterleavedAccessType(
+                      VecTy, Factor, BaseSI->getAlign(),
+                      BaseSI->getPointerAddressSpace()))
+                return Factor;
+            }
+
+            return 0u;
+          };
+          SmallVector<int> Mask(E.ReorderIndices.begin(),
+                                E.ReorderIndices.end());
+          unsigned InterleaveFactor = IsInterleaveMask(Mask);
+          if (InterleaveFactor != 0)
+            E.setInterleave(InterleaveFactor);
+        }
         break;
-      auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(E.Scalars);
-      if (MinMaxID == Intrinsic::not_intrinsic)
+      }
+      case Instruction::Select: {
+        if (E.State != TreeEntry::Vectorize)
+          break;
+        auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(E.Scalars);
+        if (MinMaxID == Intrinsic::not_intrinsic)
+          break;
+        // This node is a minmax node.
+        E.CombinedOp = TreeEntry::MinMax;
+        TreeEntry *CondEntry = getOperandEntry(&E, 0);
+        if (SelectOnly && CondEntry->UserTreeIndex &&
+            CondEntry->State == TreeEntry::Vectorize) {
+          // The condition node is part of the combined minmax node.
+          CondEntry->State = TreeEntry::CombinedVectorize;
+        }
         break;
-      // This node is a minmax node.
-      E.CombinedOp = TreeEntry::MinMax;
-      TreeEntry *CondEntry = getOperandEntry(&E, 0);
-      if (SelectOnly && CondEntry->UserTreeIndex &&
-          CondEntry->State == TreeEntry::Vectorize) {
-        // The condition node is part of the combined minmax node.
-        CondEntry->State = TreeEntry::CombinedVectorize;
       }
-      break;
-    }
-    case Instruction::FSub:
-    case Instruction::FAdd: {
-      // Check if possible to convert (a*b)+c to fma.
-      if (E.State != TreeEntry::Vectorize ||
-          !E.getOperations().isAddSubLikeOp())
+      case Instruction::FSub:
+      case Instruction::FAdd: {
+        // Check if possible to convert (a*b)+c to fma.
+        if (E.State != TreeEntry::Vectorize ||
+            !E.getOperations().isAddSubLikeOp())
+          break;
+        if (!canConvertToFMA(E.Scalars, E.getOperations(), *DT, *DL, *TTI, *TLI)
+                 .isValid())
+          break;
+        // This node is a fmuladd node.
+        E.CombinedOp = TreeEntry::FMulAdd;
+        TreeEntry *FMulEntry = getOperandEntry(&E, 0);
+        if (FMulEntry->UserTreeIndex &&
+            FMulEntry->State == TreeEntry::Vectorize) {
+          // The FMul node is part of the combined fmuladd node.
+          FMulEntry->State = TreeEntry::CombinedVectorize;
+        }
         break;
-      if (!canConvertToFMA(E.Scalars, E.getOperations(), *DT, *DL, *TTI, *TLI)
-               .isValid())
+      }
+      default:
         break;
-      // This node is a fmuladd node.
-      E.CombinedOp = TreeEntry::FMulAdd;
-      TreeEntry *FMulEntry = getOperandEntry(&E, 0);
-      if (FMulEntry->UserTreeIndex &&
-          FMulEntry->State == TreeEntry::Vectorize) {
-        // The FMul node is part of the combined fmuladd node.
-        FMulEntry->State = TreeEntry::CombinedVectorize;
       }
-      break;
-    }
-    default:
-      break;
     }
-  }
 
-  if (LoadEntriesToVectorize.empty()) {
-    // Single load node - exit.
-    if (VT.size() <= 1 && VT.front()->hasState() &&
-        VT.front()->getOpcode() == Instruction::Load)
-      return false;
-    // Small graph with small VF - exit.
-    constexpr unsigned SmallTree = 3;
-    constexpr unsigned SmallVF = 2;
-    if ((VT.size() <= SmallTree &&
-         VT.front()->Scalars.size() == SmallVF) ||
-        (VT.size() <= 2 && UserIgnoreList))
-      return false;
+    if (LoadEntriesToVectorize.empty()) {
+      // Single load node - exit.
+      if (VT.size() <= 1 && VT.front()->hasState() &&
+          VT.front()->getOpcode() == Instruction::Load)
+        return false;
+      // Small graph with small VF - exit.
+      constexpr unsigned SmallTree = 3;
+      constexpr unsigned SmallVF = 2;
+      if ((VT.size() <= SmallTree && VT.front()->Scalars.size() == SmallVF) ||
+          (VT.size() <= 2 && UserIgnoreList))
+        return false;
 
-    if (VT.front()->isNonPowOf2Vec() &&
-        getCanonicalGraphSize() != getTreeSize() && UserIgnoreList &&
-        getCanonicalGraphSize() <= SmallTree &&
-        count_if(ArrayRef(VT).drop_front(getCanonicalGraphSize()),
-                 [](const std::unique_ptr<TreeEntry> &TE) {
-                   return TE->isGather() && TE->hasState() &&
-                          TE->getOpcode() == Instruction::Load &&
-                          !allSameBlock(TE->Scalars);
-                 }) == 1)
-      return false;
-  }
-  return true;
+      if (VT.front()->isNonPowOf2Vec() &&
+          getCanonicalGraphSize() != getTreeSize() && UserIgnoreList &&
+          getCanonicalGraphSize() <= SmallTree &&
+          count_if(ArrayRef(VT).drop_front(getCanonicalGraphSize()),
+                   [](const std::unique_ptr<TreeEntry> &TE) {
+                     return TE->isGather() && TE->hasState() &&
+                            TE->getOpcode() == Instruction::Load &&
+                            !allSameBlock(TE->Scalars);
+                   }) == 1)
+        return false;
+    }
+    return true;
   };
 
   class GraphTransformModeRAII {
@@ -13530,31 +13537,33 @@ void BoUpSLP::transformNodes() {
 
   for (auto &VT : VectorizableTree) {
     for (std::unique_ptr<TreeEntry> &TE : VT) {
-    TreeEntry &E = *TE;
-    if (E.isGather() &&
-        ((E.hasState() && E.getOpcode() == Instruction::Load) ||
-         (!E.hasState() && any_of(E.Scalars,
-                                  [&](Value *V) {
-                                    return isa<LoadInst>(V) &&
-                                           !isVectorized(V) &&
-                                           !isDeleted(cast<Instruction>(V));
-                                  }))) &&
-        !isSplat(E.Scalars)) {
-      for (Value *V : E.Scalars) {
-        auto *LI = dyn_cast<LoadInst>(V);
-        if (!LI)
-          continue;
-        if (isDeleted(LI) || isVectorized(LI) || !LI->isSimple())
-          continue;
-        gatherPossiblyVectorizableLoads(
-            *this, V, *DL, *SE, *TTI,
-            GatheredLoads[std::make_tuple(
-                LI->getParent(),
-                getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth),
-                LI->getType())]);
+      TreeEntry &E = *TE;
+      if (E.isGather() &&
+          ((E.hasState() && E.getOpcode() == Instruction::Load) ||
+           (!E.hasState() && any_of(E.Scalars,
+                                    [&](Value *V) {
+                                      return isa<LoadInst>(V) &&
+                                             !isVectorized(V) &&
+                                             !isDeleted(cast<Instruction>(V));
+                                    }))) &&
+          !isSplat(E.Scalars)) {
+        for (Value *V : E.Scalars) {
+          auto *LI = dyn_cast<LoadInst>(V);
+          if (!LI)
+            continue;
+          if (isDeleted(LI) || isVectorized(LI) || !LI->isSimple())
+            continue;
+          gatherPossiblyVectorizableLoads(
+              *this, V, *DL, *SE, *TTI,
+              GatheredLoads[std::make_tuple(
+                  LI->getParent(),
+                  getUnderlyingObject(LI->getPointerOperand(),
+                                      RecursionMaxDepth),
+                  LI->getType())]);
+        }
       }
     }
-  }}
+  }
   // Try to vectorize gathered loads if this is not just a gather of loads.
   if (!GatheredLoads.empty())
     tryToVectorizeGatheredLoads(GatheredLoads);

>From f6a41be3a6d5fab6a8c0619a09566532213c8852 Mon Sep 17 00:00:00 2001
From: bababuck <buchner.ryan at gmail.com>
Date: Tue, 30 Dec 2025 10:32:08 -0800
Subject: [PATCH 18/19] [SLP] Update test.

---
 .../Transforms/SLPVectorizer/RISCV/wide-stores.ll     | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/wide-stores.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/wide-stores.ll
index ab5befb17cb1c..80a1c8644f086 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/wide-stores.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/wide-stores.ll
@@ -5,16 +5,13 @@ define dso_local void @wide_gather(ptr noalias noundef writeonly captures(none)
 ; CHECK-LABEL: define dso_local void @wide_gather(
 ; CHECK-SAME: ptr noalias noundef writeonly captures(none) initializes((0, 64)) [[X:%.*]], ptr noalias noundef readonly captures(none) [[Y:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <8 x ptr> poison, ptr [[Y]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <8 x ptr> [[TMP0]], <8 x ptr> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, <8 x ptr> [[TMP5]], <8 x i64> <i64 0, i64 48, i64 8, i64 16, i64 112, i64 24, i64 56, i64 64>
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i8, <8 x ptr> [[TMP5]], <8 x i64> <i64 40, i64 72, i64 80, i64 88, i64 120, i64 104, i64 32, i64 96>
 ; CHECK-NEXT:    [[ARRAYIDX2_8:%.*]] = getelementptr inbounds nuw i8, ptr [[X]], i64 64
-; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> align 8 [[TMP6]], <8 x i1> splat (i1 true), <8 x i64> poison), !tbaa [[LONG_TBAA0:![0-9]+]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i64>, ptr [[Y]], align 8, !tbaa [[LONG_TBAA0:![0-9]+]]
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i64> [[TMP0]], <16 x i64> poison, <8 x i32> <i32 0, i32 6, i32 1, i32 2, i32 14, i32 3, i32 7, i32 8>
 ; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <8 x i64> [[TMP1]], splat (i64 1)
-; CHECK-NEXT:    store <8 x i64> [[TMP2]], ptr [[X]], align 8, !tbaa [[LONG_TBAA0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> align 8 [[TMP7]], <8 x i1> splat (i1 true), <8 x i64> poison), !tbaa [[LONG_TBAA0]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i64> [[TMP0]], <16 x i64> poison, <8 x i32> <i32 5, i32 9, i32 10, i32 11, i32 15, i32 13, i32 4, i32 12>
 ; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <8 x i64> [[TMP3]], splat (i64 1)
+; CHECK-NEXT:    store <8 x i64> [[TMP2]], ptr [[X]], align 8, !tbaa [[LONG_TBAA0]]
 ; CHECK-NEXT:    store <8 x i64> [[TMP4]], ptr [[ARRAYIDX2_8]], align 8, !tbaa [[LONG_TBAA0]]
 ; CHECK-NEXT:    ret void
 ;
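The updated checks show the payoff of forest vectorization on this test: once both 8-wide store chains are built into one forest, the vectorizer can see that their sixteen loads of y together cover a contiguous 128-byte region, so the two 8-wide masked gathers collapse into a single <16 x i64> load feeding two shuffles. A hypothetical C++ source for the pattern (the actual test is the already-unrolled IR above; the indices are read off the GEP byte offsets divided by 8):

    // Either 8-store chain alone reads y through a permutation and looks
    // like a gather; together the two chains read y[0..15] exactly once.
    void wide_gather(long *x, const long *y) {
      // first chain: y indices 0,6,1,2,14,3,7,8
      x[0] = y[0] + 1;  x[1] = y[6] + 1;  x[2] = y[1] + 1;  x[3] = y[2] + 1;
      x[4] = y[14] + 1; x[5] = y[3] + 1;  x[6] = y[7] + 1;  x[7] = y[8] + 1;
      // second chain: y indices 5,9,10,11,15,13,4,12
      x[8] = y[5] + 1;   x[9] = y[9] + 1;   x[10] = y[10] + 1;
      x[11] = y[11] + 1; x[12] = y[15] + 1; x[13] = y[13] + 1;
      x[14] = y[4] + 1;  x[15] = y[12] + 1;
    }

The shuffle masks in the new checks are exactly these two index lists.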

>From 1892832a03dfe5e07bc1e926a259ce81c446f4af Mon Sep 17 00:00:00 2001
From: bababuck <buchner.ryan at gmail.com>
Date: Tue, 30 Dec 2025 10:42:47 -0800
Subject: [PATCH 19/19] [SLP] Add TODO

---
 llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 0239ecd857ff7..0230887e8b74e 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -22746,6 +22746,7 @@ bool BoUpSLP::collectValuesToDemote(
 
 static RecurKind getRdxKind(Value *V);
 
+// TODO: Handle the forest of trees.
 void BoUpSLP::computeMinimumValueSizes() {
   // We only attempt to truncate integer expressions.
   bool IsStoreOrInsertElt =
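The TODO flags the one analysis here that still assumes a single tree: computeMinimumValueSizes() narrows integer widths by walking one graph from its root. The presumable direction is to run that walk once per tree of the forest; a sketch under that assumption (stand-in types, and computeMinimumValueSizesForTree is a hypothetical helper, not an existing function):

    #include <vector>

    struct TreeEntry {};                      // stand-in for SLP's TreeEntry
    using VecTreeTy = std::vector<TreeEntry>; // stand-in for one tree

    // Hypothetical per-tree factoring of the existing single-tree analysis.
    static void computeMinimumValueSizesForTree(VecTreeTy &VT) { (void)VT; }

    // Run the min-bitwidth analysis once per tree of the forest.
    static void computeMinimumValueSizes(std::vector<VecTreeTy> &Forest) {
      for (VecTreeTy &VT : Forest)
        computeMinimumValueSizesForTree(VT);
    }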



More information about the llvm-commits mailing list