[llvm] [SLP][NFC] Redesign schedule bundle, separate from schedule data, NFC (PR #131625)

Alexey Bataev via llvm-commits llvm-commits at lists.llvm.org
Thu Mar 20 05:28:48 PDT 2025


https://github.com/alexey-bataev updated https://github.com/llvm/llvm-project/pull/131625

From c13a04fcda0e59a9f62ce2043c390017e13d4f7e Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev at outlook.com>
Date: Mon, 17 Mar 2025 15:12:56 +0000
Subject: [PATCH 1/2] [𝘀𝗽𝗿] initial version
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Created using spr 1.3.5
---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 1145 ++++++++++-------
 1 file changed, 676 insertions(+), 469 deletions(-)
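
Before the file diff, a condensed sketch of the class split this change
introduces may help reviewers orient themselves. It is a simplified,
self-contained approximation rather than code lifted from the patch:
std::vector and std::all_of stand in for llvm::SmallVector and llvm::all_of,
Instruction and TreeEntry are reduced to empty structs, and the Kind tag is
read directly instead of going through llvm::dyn_cast/cast as the real
classes do.

  #include <algorithm>
  #include <vector>

  struct Instruction {};
  struct TreeEntry {};

  // Common base for anything the scheduler can put on its ready list.
  class ScheduleEntity {
  protected:
    enum class Kind { ScheduleData, ScheduleBundle };
    explicit ScheduleEntity(Kind K) : K(K) {}
    Kind getKind() const { return K; }

  private:
    Kind K;
    int SchedulingPriority = 0; // "good" final ordering of instructions

  public:
    void setSchedulingPriority(int P) { SchedulingPriority = P; }
    int getSchedulingPriority() const { return SchedulingPriority; }
  };

  // Per-instruction state only; the intrusive FirstInBundle/NextInBundle
  // links and the TreeEntry pointer of the old design are gone.
  class ScheduleData final : public ScheduleEntity {
  public:
    enum { InvalidDeps = -1 };
    ScheduleData() : ScheduleEntity(Kind::ScheduleData) {}
    Instruction *getInst() const { return Inst; }
    bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
    bool isScheduled() const { return IsScheduled; }
    void setScheduled(bool S) { IsScheduled = S; }
    int getUnscheduledDeps() const { return UnscheduledDeps; }
    bool isReady() const { return UnscheduledDeps == 0 && !IsScheduled; }

  private:
    Instruction *Inst = nullptr;
    int Dependencies = InvalidDeps;
    int UnscheduledDeps = InvalidDeps;
    bool IsScheduled = false;
  };

  // A bundle is now a separate object that references its members and owns
  // the back-link to the TreeEntry; readiness is derived from the members.
  class ScheduleBundle final : public ScheduleEntity {
    std::vector<ScheduleData *> Bundle;
    TreeEntry *TE = nullptr;

  public:
    ScheduleBundle() : ScheduleEntity(Kind::ScheduleBundle) {}
    void add(ScheduleData *SD) { Bundle.push_back(SD); }
    void setTreeEntry(TreeEntry *E) { TE = E; }
    TreeEntry *getTreeEntry() const { return TE; }
    bool isScheduled() const {
      return std::all_of(Bundle.begin(), Bundle.end(),
                         [](const ScheduleData *SD) {
                           return SD->isScheduled();
                         });
    }
    int unscheduledDepsInBundle() const {
      int Sum = 0;
      for (const ScheduleData *SD : Bundle) {
        if (SD->getUnscheduledDeps() == ScheduleData::InvalidDeps)
          return ScheduleData::InvalidDeps;
        Sum += SD->getUnscheduledDeps();
      }
      return Sum;
    }
    bool isReady() const {
      return unscheduledDepsInBundle() == 0 && !isScheduled();
    }
  };

One consequence visible throughout the diff: because a ScheduleData no longer
carries bundle links or a TreeEntry pointer, the block scheduler now maps each
Instruction to the list of ScheduleBundles it participates in (the
ScheduledBundles map) and keeps ownership of the bundles in
ScheduledBundlesList.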

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index d450336cbc3ce..6766b68841a9a 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1368,7 +1368,9 @@ namespace slpvectorizer {
 /// Bottom Up SLP Vectorizer.
 class BoUpSLP {
   struct TreeEntry;
-  struct ScheduleData;
+  class ScheduleEntity;
+  class ScheduleData;
+  class ScheduleBundle;
   class ShuffleCostEstimator;
   class ShuffleInstructionBuilder;
 
@@ -1433,7 +1435,8 @@ class BoUpSLP {
 
   /// \returns the vectorization cost of the subtree that starts at \p VL.
   /// A negative number means that this is profitable.
-  InstructionCost getTreeCost(ArrayRef<Value *> VectorizedVals = {});
+  InstructionCost getTreeCost(ArrayRef<Value *> VectorizedVals = {},
+                              InstructionCost ReductionCost = TTI::TCC_Free);
 
   /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
   /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
@@ -3670,8 +3673,7 @@ class BoUpSLP {
 #endif
 
   /// Create a new VectorizableTree entry.
-  TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
-                          std::optional<ScheduleData *> Bundle,
+  TreeEntry *newTreeEntry(ArrayRef<Value *> VL, ScheduleBundle &Bundle,
                           const InstructionsState &S,
                           const EdgeInfo &UserTreeIdx,
                           ArrayRef<int> ReuseShuffleIndices = {},
@@ -3688,8 +3690,7 @@ class BoUpSLP {
 
   TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
                           TreeEntry::EntryState EntryState,
-                          std::optional<ScheduleData *> Bundle,
-                          const InstructionsState &S,
+                          ScheduleBundle &Bundle, const InstructionsState &S,
                           const EdgeInfo &UserTreeIdx,
                           ArrayRef<int> ReuseShuffleIndices = {},
                           ArrayRef<unsigned> ReorderIndices = {}) {
@@ -3781,22 +3782,23 @@ class BoUpSLP {
         }
       }
       // Update the scheduler bundle to point to this TreeEntry.
-      ScheduleData *BundleMember = *Bundle;
-      assert((BundleMember || isa<PHINode>(S.getMainOp()) ||
+      assert((!Bundle.getBundle().empty() || isa<PHINode>(S.getMainOp()) ||
               isVectorLikeInstWithConstOps(S.getMainOp()) ||
               doesNotNeedToSchedule(VL)) &&
              "Bundle and VL out of sync");
-      if (BundleMember) {
+      if (!Bundle.getBundle().empty()) {
+        auto *BundleMember = Bundle.getBundle().begin();
         for (Value *V : VL) {
           if (doesNotNeedToBeScheduled(V))
             continue;
-          if (!BundleMember)
+          if (BundleMember == Bundle.getBundle().end())
             continue;
-          BundleMember->TE = Last;
-          BundleMember = BundleMember->NextInBundle;
+          ++BundleMember;
         }
+        assert(BundleMember == Bundle.getBundle().end() &&
+               "Bundle and VL out of sync");
+        Bundle.setTreeEntry(Last);
       }
-      assert(!BundleMember && "Bundle and VL out of sync");
     } else {
       // Build a map for gathered scalars to the nodes where they are used.
       bool AllConstsOrCasts = true;
@@ -3946,16 +3948,17 @@ class BoUpSLP {
   /// is invariant in the calling loop.
   bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
                  Instruction *Inst2) {
-    if (!Loc1.Ptr || !isSimple(Inst1) || !isSimple(Inst2))
+    assert(Loc1.Ptr && isSimple(Inst1) && "Expected simple first instruction.");
+    if (!isSimple(Inst2))
       return true;
     // First check if the result is already in the cache.
     AliasCacheKey Key = std::make_pair(Inst1, Inst2);
-    auto It = AliasCache.find(Key);
-    if (It != AliasCache.end())
-      return It->second;
+    auto Res = AliasCache.try_emplace(Key);
+    if (!Res.second)
+      return Res.first->second;
     bool Aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
     // Store the result in the cache.
-    AliasCache.try_emplace(Key, Aliased);
+    Res.first->getSecond() = Aliased;
     AliasCache.try_emplace(std::make_pair(Inst2, Inst1), Aliased);
     return Aliased;
   }
@@ -3964,7 +3967,7 @@ class BoUpSLP {
 
   /// Cache for alias results.
   /// TODO: consider moving this to the AliasAnalysis itself.
-  DenseMap<AliasCacheKey, bool> AliasCache;
+  SmallDenseMap<AliasCacheKey, bool> AliasCache;
 
   // Cache for pointerMayBeCaptured calls inside AA.  This is preserved
   // globally through SLP because we don't perform any action which
@@ -4011,26 +4014,55 @@ class BoUpSLP {
   /// List of hashes of vector of loads, which are known to be non vectorizable.
   DenseSet<size_t> ListOfKnonwnNonVectorizableLoads;
 
+  class ScheduleEntity {
+    friend class ScheduleBundle;
+    friend class ScheduleData;
+
+  protected:
+    enum class Kind { ScheduleData, ScheduleBundle };
+    Kind getKind() const { return K; }
+    ScheduleEntity(Kind K) : K(K) {}
+
+  private:
+    /// Used for getting a "good" final ordering of instructions.
+    int SchedulingPriority = 0;
+    /// The kind of the ScheduleEntity.
+    Kind K = Kind::ScheduleData;
+
+  public:
+    ScheduleEntity() = delete;
+    /// Gets/sets the scheduling priority.
+    void setSchedulingPriority(int Priority) { SchedulingPriority = Priority; }
+    int getSchedulingPriority() const { return SchedulingPriority; }
+    bool isReady() const {
+      if (auto *SD = dyn_cast<ScheduleData>(this))
+        return SD->isReady();
+      return cast<ScheduleBundle>(this)->isReady();
+    }
+    static bool classof(const ScheduleEntity *) { return true; }
+  };
+
   /// Contains all scheduling relevant data for an instruction.
   /// A ScheduleData either represents a single instruction or a member of an
   /// instruction bundle (= a group of instructions which is combined into a
   /// vector instruction).
-  struct ScheduleData {
+  class ScheduleData final : public ScheduleEntity {
+  public:
     // The initial value for the dependency counters. It means that the
     // dependencies are not calculated yet.
     enum { InvalidDeps = -1 };
 
-    ScheduleData() = default;
+    ScheduleData() : ScheduleEntity(Kind::ScheduleData) {}
+    static bool classof(const ScheduleEntity *Entity) {
+      return Entity->getKind() == Kind::ScheduleData;
+    }
 
     void init(int BlockSchedulingRegionID, Instruction *I) {
-      FirstInBundle = this;
-      NextInBundle = nullptr;
       NextLoadStore = nullptr;
       IsScheduled = false;
       SchedulingRegionID = BlockSchedulingRegionID;
       clearDependencies();
       Inst = I;
-      TE = nullptr;
     }
 
     /// Verify basic self consistency properties
@@ -4042,20 +4074,9 @@ class BoUpSLP {
       }
 
       if (IsScheduled) {
-        assert(isSchedulingEntity() &&
-                "unexpected scheduled state");
-        for (const ScheduleData *BundleMember = this; BundleMember;
-             BundleMember = BundleMember->NextInBundle) {
-          assert(BundleMember->hasValidDependencies() &&
-                 BundleMember->UnscheduledDeps == 0 &&
-                 "unexpected scheduled state");
-          assert((BundleMember == this || !BundleMember->IsScheduled) &&
-                 "only bundle is marked scheduled");
-        }
+        assert(hasValidDependencies() && UnscheduledDeps == 0 &&
+               "unexpected scheduled state");
       }
-
-      assert(Inst->getParent() == FirstInBundle->Inst->getParent() &&
-             "all bundle members must be in same basic block");
     }
 
     /// Returns true if the dependency information has been calculated.
@@ -4063,23 +4084,9 @@ class BoUpSLP {
     /// a single bundle.
     bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
 
-    /// Returns true for single instructions and for bundle representatives
-    /// (= the head of a bundle).
-    bool isSchedulingEntity() const { return FirstInBundle == this; }
-
-    /// Returns true if it represents an instruction bundle and not only a
-    /// single instruction.
-    bool isPartOfBundle() const {
-      return NextInBundle != nullptr || FirstInBundle != this || TE;
-    }
-
     /// Returns true if it is ready for scheduling, i.e. it has no more
     /// unscheduled depending instructions/bundles.
-    bool isReady() const {
-      assert(isSchedulingEntity() &&
-             "can't consider non-scheduling entity for ready list");
-      return unscheduledDepsInBundle() == 0 && !IsScheduled;
-    }
+    bool isReady() const { return UnscheduledDeps == 0 && !IsScheduled; }
 
     /// Modifies the number of unscheduled dependencies for this instruction,
     /// and returns the number of remaining dependencies for the containing
@@ -4088,14 +4095,12 @@ class BoUpSLP {
       assert(hasValidDependencies() &&
              "increment of unscheduled deps would be meaningless");
       UnscheduledDeps += Incr;
-      return FirstInBundle->unscheduledDepsInBundle();
+      return UnscheduledDeps;
     }
 
     /// Sets the number of unscheduled dependencies to the number of
     /// dependencies.
-    void resetUnscheduledDeps() {
-      UnscheduledDeps = Dependencies;
-    }
+    void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }
 
     /// Clears all dependency information.
     void clearDependencies() {
@@ -4103,78 +4108,76 @@ class BoUpSLP {
       resetUnscheduledDeps();
       MemoryDependencies.clear();
       ControlDependencies.clear();
+      IsScheduled = false;
     }
 
-    int unscheduledDepsInBundle() const {
-      assert(isSchedulingEntity() && "only meaningful on the bundle");
-      int Sum = 0;
-      for (const ScheduleData *BundleMember = this; BundleMember;
-           BundleMember = BundleMember->NextInBundle) {
-        if (BundleMember->UnscheduledDeps == InvalidDeps)
-          return InvalidDeps;
-        Sum += BundleMember->UnscheduledDeps;
-      }
-      return Sum;
-    }
+    /// Gets/sets if the bundle is scheduled.
+    bool isScheduled() const { return IsScheduled; }
+    void setScheduled(bool Scheduled) { IsScheduled = Scheduled; }
 
-    void dump(raw_ostream &OS) const {
-      if (isPartOfBundle()) {
-        if (!isSchedulingEntity()) {
-          OS << "/ " << *Inst << ", part of ";
-          FirstInBundle->dump(OS);
-          return;
-        }
-        OS << '[' << *Inst;
-        ScheduleData *SD = NextInBundle;
-        while (SD) {
-          OS << ';' << *SD->Inst;
-          SD = SD->NextInBundle;
-        }
-        OS << ']';
-      } else {
-        OS << *Inst;
-      }
+    /// Gets the number of unscheduled dependencies.
+    int getUnscheduledDeps() const { return UnscheduledDeps; }
+    /// Gets the number of dependencies.
+    int getDependencies() const { return Dependencies; }
+    /// Initializes the number of dependencies.
+    void initDependencies() { Dependencies = 0; }
+    /// Increments the number of dependencies.
+    void incDependencies() { Dependencies++; }
+
+    /// Gets scheduling region ID.
+    int getSchedulingRegionID() const { return SchedulingRegionID; }
+
+    /// Gets the instruction.
+    Instruction *getInst() const { return Inst; }
+
+    /// Gets the list of memory dependencies.
+    ArrayRef<ScheduleData *> getMemoryDependencies() const {
+      return MemoryDependencies;
+    }
+    /// Adds a memory dependency.
+    void addMemoryDependency(ScheduleData *Dep) {
+      MemoryDependencies.push_back(Dep);
+    }
+    /// Gets the list of control dependencies.
+    ArrayRef<ScheduleData *> getControlDependencies() const {
+      return ControlDependencies;
     }
+    /// Adds a control dependency.
+    void addControlDependency(ScheduleData *Dep) {
+      ControlDependencies.push_back(Dep);
+    }
+    /// Gets/sets the next load/store instruction in the block.
+    ScheduleData *getNextLoadStore() const { return NextLoadStore; }
+    void setNextLoadStore(ScheduleData *Next) { NextLoadStore = Next; }
+
+    void dump(raw_ostream &OS) const { OS << *Inst; }
 
     LLVM_DUMP_METHOD void dump() const {
       dump(dbgs());
       dbgs() << '\n';
     }
 
+  private:
     Instruction *Inst = nullptr;
 
-    /// The TreeEntry that this instruction corresponds to.
-    TreeEntry *TE = nullptr;
-
-    /// Points to the head in an instruction bundle (and always to this for
-    /// single instructions).
-    ScheduleData *FirstInBundle = nullptr;
-
-    /// Single linked list of all instructions in a bundle. Null if it is a
-    /// single instruction.
-    ScheduleData *NextInBundle = nullptr;
-
     /// Single linked list of all memory instructions (e.g. load, store, call)
     /// in the block - until the end of the scheduling region.
     ScheduleData *NextLoadStore = nullptr;
 
     /// The dependent memory instructions.
     /// This list is derived on demand in calculateDependencies().
-    SmallVector<ScheduleData *, 4> MemoryDependencies;
+    SmallVector<ScheduleData *> MemoryDependencies;
 
     /// List of instructions which this instruction could be control dependent
     /// on.  Allowing such nodes to be scheduled below this one could introduce
     /// a runtime fault which didn't exist in the original program.
     /// ex: this is a load or udiv following a readonly call which inf loops
-    SmallVector<ScheduleData *, 4> ControlDependencies;
+    SmallVector<ScheduleData *> ControlDependencies;
 
     /// This ScheduleData is in the current scheduling region if this matches
     /// the current SchedulingRegionID of BlockScheduling.
     int SchedulingRegionID = 0;
 
-    /// Used for getting a "good" final ordering of instructions.
-    int SchedulingPriority = 0;
-
     /// The number of dependencies. Constitutes of the number of users of the
     /// instruction plus the number of dependent memory instructions (if any).
     /// This value is calculated on demand.
@@ -4200,6 +4203,112 @@ class BoUpSLP {
   }
 #endif
 
+  class ScheduleBundle final : public ScheduleEntity {
+    /// The schedule data for the instructions in the bundle.
+    SmallVector<ScheduleData *> Bundle;
+    /// True if this bundle is valid.
+    bool IsValid = true;
+    /// The TreeEntry that this instruction corresponds to.
+    TreeEntry *TE = nullptr;
+    ScheduleBundle(bool IsValid)
+        : ScheduleEntity(Kind::ScheduleBundle), IsValid(IsValid) {}
+
+  public:
+    ScheduleBundle() : ScheduleEntity(Kind::ScheduleBundle) {}
+    static bool classof(const ScheduleEntity *Entity) {
+      return Entity->getKind() == Kind::ScheduleBundle;
+    }
+
+    /// Verify basic self consistency properties
+    void verify() const {
+      for (const ScheduleData *SD : Bundle) {
+        if (SD->hasValidDependencies()) {
+          assert(SD->getUnscheduledDeps() <= SD->getDependencies() &&
+                 "invariant");
+        } else {
+          assert(SD->getUnscheduledDeps() == SD->getDependencies() &&
+                 "invariant");
+        }
+
+        if (isScheduled()) {
+          assert(SD->hasValidDependencies() && SD->getUnscheduledDeps() == 0 &&
+                 "unexpected scheduled state");
+        }
+      }
+    }
+
+    bool isScheduled() const {
+      return all_of(Bundle,
+                    [](const ScheduleData *SD) { return SD->isScheduled(); });
+    }
+
+    int unscheduledDepsInBundle() const {
+      assert(*this && "bundle must not be empty");
+      int Sum = 0;
+      for (const ScheduleData *BundleMember : Bundle) {
+        if (BundleMember->getUnscheduledDeps() == ScheduleData::InvalidDeps)
+          return ScheduleData::InvalidDeps;
+        Sum += BundleMember->getUnscheduledDeps();
+      }
+      return Sum;
+    }
+
+    /// Returns true if the dependency information has been calculated.
+    /// Note that dependency validity can vary between instructions within
+    /// a single bundle.
+    bool hasValidDependencies() const {
+      return all_of(Bundle, [](const ScheduleData *SD) {
+        return SD->hasValidDependencies();
+      });
+    }
+
+    /// Returns true if it is ready for scheduling, i.e. it has no more
+    /// unscheduled depending instructions/bundles.
+    bool isReady() const {
+      assert(*this && "bundle must not be empty");
+      return unscheduledDepsInBundle() == 0 && !isScheduled();
+    }
+
+    /// Returns the bundle of scheduling data, associated with the current
+    /// instruction.
+    ArrayRef<ScheduleData *> getBundle() { return Bundle; }
+    ArrayRef<const ScheduleData *> getBundle() const { return Bundle; }
+    /// Adds an instruction to the bundle.
+    void add(ScheduleData *SD) { Bundle.push_back(SD); }
+
+    /// Gets/sets the associated tree entry.
+    void setTreeEntry(TreeEntry *TE) { this->TE = TE; }
+    TreeEntry *getTreeEntry() const { return TE; }
+
+    static ScheduleBundle invalid() { return {false}; }
+
+    operator bool() const { return IsValid; }
+
+    void dump(raw_ostream &OS) const {
+      if (!*this) {
+        OS << "[]";
+        return;
+      }
+      OS << '[';
+      interleaveComma(Bundle, OS,
+                      [&](const ScheduleData *SD) { OS << *SD->getInst(); });
+      OS << ']';
+    }
+
+    LLVM_DUMP_METHOD void dump() const {
+      dump(dbgs());
+      dbgs() << '\n';
+    }
+  };
+
+#ifndef NDEBUG
+  friend inline raw_ostream &operator<<(raw_ostream &os,
+                                        const BoUpSLP::ScheduleBundle &Bundle) {
+    Bundle.dump(os);
+    return os;
+  }
+#endif
+
   friend struct GraphTraits<BoUpSLP *>;
   friend struct DOTGraphTraits<BoUpSLP *>;
 
@@ -4224,6 +4333,8 @@ class BoUpSLP {
         : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
 
     void clear() {
+      ScheduledBundles.clear();
+      ScheduledBundlesList.clear();
       ReadyInsts.clear();
       ScheduleStart = nullptr;
       ScheduleEnd = nullptr;
@@ -4244,6 +4355,8 @@ class BoUpSLP {
     }
 
     ScheduleData *getScheduleData(Instruction *I) {
+      if (!I)
+        return nullptr;
       if (BB != I->getParent())
         // Avoid lookup if can't possibly be in map.
         return nullptr;
@@ -4254,52 +4367,78 @@ class BoUpSLP {
     }
 
     ScheduleData *getScheduleData(Value *V) {
-      if (auto *I = dyn_cast<Instruction>(V))
-        return getScheduleData(I);
-      return nullptr;
+      return getScheduleData(dyn_cast<Instruction>(V));
+    }
+
+    ArrayRef<ScheduleBundle *> getScheduleBundles(Value *V) const {
+      auto *I = dyn_cast<Instruction>(V);
+      if (!I)
+        return {};
+      auto It = ScheduledBundles.find(I);
+      if (It == ScheduledBundles.end())
+        return {};
+      return It->getSecond();
     }
 
     bool isInSchedulingRegion(ScheduleData *SD) const {
-      return SD->SchedulingRegionID == SchedulingRegionID;
+      return SD->getSchedulingRegionID() == SchedulingRegionID;
+    }
+
+    bool isInSchedulingRegion(const ScheduleBundle &Bundle) const {
+      return all_of(Bundle.getBundle(), [&](const ScheduleData *BundleMember) {
+        return BundleMember->getSchedulingRegionID() == SchedulingRegionID;
+      });
     }
 
     /// Marks an instruction as scheduled and puts all dependent ready
     /// instructions into the ready-list.
     template <typename ReadyListType>
-    void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
-      SD->IsScheduled = true;
-      LLVM_DEBUG(dbgs() << "SLP:   schedule " << *SD << "\n");
-
-      for (ScheduleData *BundleMember = SD; BundleMember;
-           BundleMember = BundleMember->NextInBundle) {
-
+    void schedule(ScheduleEntity *Data, ReadyListType &ReadyList) {
+      auto ProcessBundleMember = [&](ScheduleData *BundleMember,
+                                     ScheduleBundle *Bundle) {
         // Handle the def-use chain dependencies.
 
         // Decrement the unscheduled counter and insert to ready list if ready.
-        auto &&DecrUnsched = [this, &ReadyList](Instruction *I) {
-          ScheduleData *OpDef = getScheduleData(I);
-          if (OpDef && OpDef->hasValidDependencies() &&
-              OpDef->incrementUnscheduledDeps(-1) == 0) {
+        auto DecrUnsched = [&](ScheduleData *Data, bool IsControl = false) {
+          if ((IsControl || Data->hasValidDependencies()) &&
+              Data->incrementUnscheduledDeps(-1) == 0) {
             // There are no more unscheduled dependencies after
             // decrementing, so we can put the dependent instruction
             // into the ready list.
-            ScheduleData *DepBundle = OpDef->FirstInBundle;
-            assert(!DepBundle->IsScheduled &&
+            if (ArrayRef<ScheduleBundle *> Bundles =
+                    getScheduleBundles(Data->getInst());
+                !Bundles.empty()) {
+              for (ScheduleBundle *Bundle : Bundles) {
+                if (Bundle->unscheduledDepsInBundle() == 0) {
+                  assert(!Bundle->isScheduled() &&
+                         "already scheduled bundle gets ready");
+                  ReadyList.insert(Bundle);
+                  LLVM_DEBUG(dbgs()
+                             << "SLP:    gets ready: " << *Bundle << "\n");
+                }
+              }
+              return;
+            }
+            assert(!Data->isScheduled() &&
                    "already scheduled bundle gets ready");
-            ReadyList.insert(DepBundle);
-            LLVM_DEBUG(dbgs()
-                       << "SLP:    gets ready (def): " << *DepBundle << "\n");
+            ReadyList.insert(Data);
+            LLVM_DEBUG(dbgs() << "SLP:    gets ready: " << *Data << "\n");
           }
         };
 
+        auto DecrUnschedForInst = [&](Instruction *I) {
+          if (ScheduleData *OpSD = getScheduleData(I))
+            DecrUnsched(OpSD, /*IsControl=*/false);
+        };
+
         // If BundleMember is a vector bundle, its operands may have been
         // reordered during buildTree(). We therefore need to get its operands
         // through the TreeEntry.
-        if (TreeEntry *TE = BundleMember->TE) {
+        if (Bundle) {
           // Need to search for the lane since the tree entry can be reordered.
-          auto *In = BundleMember->Inst;
-          int Lane = std::distance(TE->Scalars.begin(),
-                                   find(TE->Scalars, In));
+          auto *In = BundleMember->getInst();
+          int Lane = std::distance(Bundle->getTreeEntry()->Scalars.begin(),
+                                   find(Bundle->getTreeEntry()->Scalars, In));
           assert(Lane >= 0 && "Lane not set");
 
           // Since vectorization tree is being built recursively this assertion
@@ -4311,46 +4450,58 @@ class BoUpSLP {
           assert(
               In &&
               (isa<ExtractValueInst, ExtractElementInst, IntrinsicInst>(In) ||
-               In->getNumOperands() == TE->getNumOperands()) &&
+               In->getNumOperands() ==
+                   Bundle->getTreeEntry()->getNumOperands()) &&
               "Missed TreeEntry operands?");
 
-          for (unsigned OpIdx : seq<unsigned>(TE->getNumOperands()))
-            if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane]))
-              DecrUnsched(I);
+          for (unsigned OpIdx :
+               seq<unsigned>(Bundle->getTreeEntry()->getNumOperands()))
+            if (auto *I = dyn_cast<Instruction>(
+                    Bundle->getTreeEntry()->getOperand(OpIdx)[Lane])) {
+              LLVM_DEBUG(dbgs()
+                         << "SLP:   check for readiness (def): " << *I << "\n");
+              DecrUnschedForInst(I);
+            }
         } else {
           // If BundleMember is a stand-alone instruction, no operand reordering
           // has taken place, so we directly access its operands.
-          for (Use &U : BundleMember->Inst->operands())
-            if (auto *I = dyn_cast<Instruction>(U.get()))
-              DecrUnsched(I);
+          for (Use &U : BundleMember->getInst()->operands())
+            if (auto *I = dyn_cast<Instruction>(U.get())) {
+              LLVM_DEBUG(dbgs()
+                         << "SLP:   check for readiness (def): " << *I << "\n");
+              DecrUnschedForInst(I);
+            }
         }
         // Handle the memory dependencies.
-        for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
-          if (MemoryDepSD->hasValidDependencies() &&
-              MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
-            // There are no more unscheduled dependencies after decrementing,
-            // so we can put the dependent instruction into the ready list.
-            ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
-            assert(!DepBundle->IsScheduled &&
-                   "already scheduled bundle gets ready");
-            ReadyList.insert(DepBundle);
-            LLVM_DEBUG(dbgs()
-                       << "SLP:    gets ready (mem): " << *DepBundle << "\n");
-          }
+        for (ScheduleData *MemoryDep : BundleMember->getMemoryDependencies()) {
+          // There are no more unscheduled dependencies after decrementing,
+          // so we can put the dependent instruction into the ready list.
+          LLVM_DEBUG(dbgs()
+                     << "SLP:   check for readiness (mem): " << *MemoryDep
+                     << "\n");
+          DecrUnsched(MemoryDep);
         }
         // Handle the control dependencies.
-        for (ScheduleData *DepSD : BundleMember->ControlDependencies) {
-          if (DepSD->incrementUnscheduledDeps(-1) == 0) {
-            // There are no more unscheduled dependencies after decrementing,
-            // so we can put the dependent instruction into the ready list.
-            ScheduleData *DepBundle = DepSD->FirstInBundle;
-            assert(!DepBundle->IsScheduled &&
-                   "already scheduled bundle gets ready");
-            ReadyList.insert(DepBundle);
-            LLVM_DEBUG(dbgs()
-                       << "SLP:    gets ready (ctl): " << *DepBundle << "\n");
-          }
+        for (ScheduleData *Dep : BundleMember->getControlDependencies()) {
+          // There are no more unscheduled dependencies after decrementing,
+          // so we can put the dependent instruction into the ready list.
+          LLVM_DEBUG(dbgs()
+                     << "SLP:   check for readiness (ctrl): " << *Dep << "\n");
+          DecrUnsched(Dep, /*IsControl=*/true);
         }
+      };
+      if (auto *SD = dyn_cast<ScheduleData>(Data)) {
+        SD->setScheduled(/*Scheduled=*/true);
+        LLVM_DEBUG(dbgs() << "SLP:   schedule " << *SD << "\n");
+        ProcessBundleMember(SD, nullptr);
+      } else {
+        ScheduleBundle &Bundle = *cast<ScheduleBundle>(Data);
+        for_each(Bundle.getBundle(), [](ScheduleData *SD) {
+          SD->setScheduled(/*Scheduled=*/true);
+        });
+        LLVM_DEBUG(dbgs() << "SLP:   schedule " << Bundle << "\n");
+        for (ScheduleData *SD : Bundle.getBundle())
+          ProcessBundleMember(SD, &Bundle);
       }
     }
 
@@ -4364,30 +4515,49 @@ class BoUpSLP {
              "Not a valid scheduling region?");
 
       for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
+        ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(I);
+        if (!Bundles.empty()) {
+          for (ScheduleBundle *Bundle : Bundles) {
+            assert(isInSchedulingRegion(*Bundle) &&
+                   "primary schedule data not in window?");
+            Bundle->verify();
+          }
+          continue;
+        }
         auto *SD = getScheduleData(I);
         if (!SD)
           continue;
         assert(isInSchedulingRegion(SD) &&
                "primary schedule data not in window?");
-        assert(isInSchedulingRegion(SD->FirstInBundle) &&
-               "entire bundle in window!");
         SD->verify();
       }
 
-      for (auto *SD : ReadyInsts) {
-        assert(SD->isSchedulingEntity() && SD->isReady() &&
-               "item in ready list not ready?");
-        (void)SD;
+      for (const ScheduleEntity *Bundle : ReadyInsts) {
+        assert(Bundle->isReady() && "item in ready list not ready?");
+        (void)Bundle;
       }
     }
 
     /// Put all instructions into the ReadyList which are ready for scheduling.
     template <typename ReadyListType>
     void initialFillReadyList(ReadyListType &ReadyList) {
+      SmallPtrSet<ScheduleBundle *, 16> Visited;
       for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
         ScheduleData *SD = getScheduleData(I);
-        if (SD && SD->isSchedulingEntity() && SD->hasValidDependencies() &&
-            SD->isReady()) {
+        if (SD && SD->hasValidDependencies() && SD->isReady()) {
+          if (ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(I);
+              !Bundles.empty()) {
+            for (ScheduleBundle *Bundle : Bundles) {
+              if (!Visited.insert(Bundle).second)
+                continue;
+              if (Bundle->hasValidDependencies() && Bundle->isReady()) {
+                ReadyList.insert(Bundle);
+                LLVM_DEBUG(dbgs() << "SLP:    initially in ready list: "
+                                  << *Bundle << "\n");
+              }
+            }
+            continue;
+          }
           ReadyList.insert(SD);
           LLVM_DEBUG(dbgs()
                      << "SLP:    initially in ready list: " << *SD << "\n");
@@ -4397,20 +4567,17 @@ class BoUpSLP {
 
     /// Build a bundle from the ScheduleData nodes corresponding to the
     /// scalar instruction for each lane.
-    ScheduleData *buildBundle(ArrayRef<Value *> VL);
+    ScheduleBundle &buildBundle(ArrayRef<Value *> VL);
 
     /// Checks if a bundle of instructions can be scheduled, i.e. has no
     /// cyclic dependencies. This is only a dry-run, no instructions are
     /// actually moved at this stage.
     /// \returns the scheduling bundle. The returned Optional value is not
     /// std::nullopt if \p VL is allowed to be scheduled.
-    std::optional<ScheduleData *>
+    std::optional<ScheduleBundle *>
     tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
                       const InstructionsState &S);
 
-    /// Un-bundles a group of instructions.
-    void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue);
-
     /// Allocates schedule data chunk.
     ScheduleData *allocateScheduleDataChunks();
 
@@ -4426,7 +4593,7 @@ class BoUpSLP {
 
     /// Updates the dependency information of a bundle and of all instructions/
     /// bundles which depend on the original bundle.
-    void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
+    void calculateDependencies(ScheduleBundle &Bundle, bool InsertInReadyList,
                                BoUpSLP *SLP);
 
     /// Sets all instruction in the scheduling region to un-scheduled.
@@ -4447,10 +4614,16 @@ class BoUpSLP {
     /// Attaches ScheduleData to Instruction.
     /// Note that the mapping survives during all vectorization iterations, i.e.
     /// ScheduleData structures are recycled.
-    DenseMap<Instruction *, ScheduleData *> ScheduleDataMap;
+    SmallDenseMap<Instruction *, ScheduleData *> ScheduleDataMap;
+
+    /// Attaches ScheduleBundle to Instruction.
+    SmallDenseMap<Instruction *, SmallVector<ScheduleBundle *>>
+        ScheduledBundles;
+    /// The list of ScheduleBundles.
+    SmallVector<std::unique_ptr<ScheduleBundle>> ScheduledBundlesList;
 
     /// The ready-list for scheduling (only used for the dry-run).
-    SetVector<ScheduleData *> ReadyInsts;
+    SetVector<ScheduleEntity *> ReadyInsts;
 
     /// The first instruction of the scheduling region.
     Instruction *ScheduleStart = nullptr;
@@ -7836,24 +8009,6 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
     GatheredLoadsEntriesFirst.reset();
 }
 
-/// \return true if the specified list of values has only one instruction that
-/// requires scheduling, false otherwise.
-#ifndef NDEBUG
-static bool needToScheduleSingleInstruction(ArrayRef<Value *> VL) {
-  Value *NeedsScheduling = nullptr;
-  for (Value *V : VL) {
-    if (doesNotNeedToBeScheduled(V))
-      continue;
-    if (!NeedsScheduling) {
-      NeedsScheduling = V;
-      continue;
-    }
-    return false;
-  }
-  return NeedsScheduling;
-}
-#endif
-
 /// Generates key/subkey pair for the given value to provide effective sorting
 /// of the values and better detection of the vectorizable values sequences. The
 /// keys/subkeys can be used for better sorting of the values themselves (keys)
@@ -8675,7 +8830,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
           !hasFullVectorsOrPowerOf2(*TTI, VL.front()->getType(), VL.size())) {
         LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
                              "for nodes with padding.\n");
-        newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
+        auto Invalid = ScheduleBundle::invalid();
+        newTreeEntry(VL, Invalid /*not vectorized*/, S, UserTreeIdx);
         return false;
       }
       LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
@@ -8700,7 +8856,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
             // vectorization (div/rem are not allowed).
             if (!getSameOpcode(NonUniqueValueVL, *TLI).valid()) {
               LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
-              newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
+              auto Invalid = ScheduleBundle::invalid();
+              newTreeEntry(VL, Invalid /*not vectorized*/, S, UserTreeIdx);
               return false;
             }
             VL = NonUniqueValueVL;
@@ -8708,7 +8865,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
           return true;
         }
         LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
-        newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
+        auto Invalid = ScheduleBundle::invalid();
+        newTreeEntry(VL, Invalid /*not vectorized*/, S, UserTreeIdx);
         return false;
       }
       VL = UniqueValues;
@@ -8723,7 +8881,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
   // place to insert a shuffle if we need to, so just avoid that issue.
   if (S && isa<CatchSwitchInst>(S.getMainOp()->getParent()->getTerminator())) {
     LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n");
-    newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
+    auto Invalid = ScheduleBundle::invalid();
+    newTreeEntry(VL, Invalid /*not vectorized*/, S, UserTreeIdx);
     return;
   }
 
@@ -8734,9 +8893,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
       if (E->isSame(VL)) {
         LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.getMainOp()
                           << ".\n");
-        if (TryToFindDuplicates(S))
-          newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
+        if (TryToFindDuplicates(S)) {
+          auto Invalid = ScheduleBundle::invalid();
+          newTreeEntry(VL, Invalid /*not vectorized*/, S, UserTreeIdx,
                        ReuseShuffleIndices);
+        }
         return;
       }
       SmallPtrSet<Value *, 8> Values(E->Scalars.begin(), E->Scalars.end());
@@ -8744,9 +8905,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
             return isa<PoisonValue>(V) || Values.contains(V);
           })) {
         LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n");
-        if (TryToFindDuplicates(S))
-          newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
+        if (TryToFindDuplicates(S)) {
+          auto Invalid = ScheduleBundle::invalid();
+          newTreeEntry(VL, Invalid /*not vectorized*/, S, UserTreeIdx,
                        ReuseShuffleIndices);
+        }
         return;
       }
     }
@@ -8764,9 +8927,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
                   cast<Instruction>(I)->getOpcode() == S.getOpcode();
          })))) {
     LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
-    if (TryToFindDuplicates(S))
-      newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
+    if (TryToFindDuplicates(S)) {
+      auto Invalid = ScheduleBundle::invalid();
+      newTreeEntry(VL, Invalid /*not vectorized*/, S, UserTreeIdx,
                    ReuseShuffleIndices);
+    }
     return;
   }
 
@@ -8775,16 +8940,19 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
       isa<ScalableVectorType>(
           cast<ExtractElementInst>(S.getMainOp())->getVectorOperandType())) {
     LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
-    if (TryToFindDuplicates(S))
-      newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
+    if (TryToFindDuplicates(S)) {
+      auto Invalid = ScheduleBundle::invalid();
+      newTreeEntry(VL, Invalid /*not vectorized*/, S, UserTreeIdx,
                    ReuseShuffleIndices);
+    }
     return;
   }
 
   // Don't handle vectors.
   if (!SLPReVec && getValueType(VL.front())->isVectorTy()) {
     LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
-    newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
+    auto Invalid = ScheduleBundle::invalid();
+    newTreeEntry(VL, Invalid /*not vectorized*/, S, UserTreeIdx);
     return;
   }
 
@@ -8800,9 +8968,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
     if (any_of(VL, [&](Value *V) {
           return ScalarsInSplitNodes.contains(V) || isVectorized(V);
         })) {
-      if (TryToFindDuplicates(S))
-        newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
+      if (TryToFindDuplicates(S)) {
+        auto Invalid = ScheduleBundle::invalid();
+        newTreeEntry(VL, Invalid /*not vectorized*/, S, UserTreeIdx,
                      ReuseShuffleIndices);
+      }
       return true;
     }
     SmallVector<Value *> Op1, Op2;
@@ -8905,7 +9075,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
     SmallVector<Value *> NewVL(VL.size());
     copy(Op1, NewVL.begin());
     copy(Op2, std::next(NewVL.begin(), Op1.size()));
-    auto *TE = newTreeEntry(VL, TreeEntry::SplitVectorize, std::nullopt,
+    auto Invalid = ScheduleBundle::invalid();
+    auto *TE = newTreeEntry(VL, TreeEntry::SplitVectorize, Invalid,
                             LocalState, UserTreeIdx, {}, ReorderIndices);
     LLVM_DEBUG(dbgs() << "SLP: split alternate node.\n"; TE->dump());
     auto AddNode = [&](ArrayRef<Value *> Op, unsigned Idx) {
@@ -8915,7 +9086,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
         // Build gather node for loads, they will be gathered later.
         TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
                                                     Idx == 0 ? 0 : Op1.size());
-        (void)newTreeEntry(Op, TreeEntry::NeedToGather, std::nullopt, S,
+        (void)newTreeEntry(Op, TreeEntry::NeedToGather, Invalid, S,
                            {TE, Idx});
       } else {
         TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
@@ -9021,9 +9192,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
         return;
     }
     LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n");
-    if (TryToFindDuplicates(S))
-      newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
+    if (TryToFindDuplicates(S)) {
+      auto Invalid = ScheduleBundle::invalid();
+      newTreeEntry(VL, Invalid /*not vectorized*/, S, UserTreeIdx,
                    ReuseShuffleIndices);
+    }
     return;
   }
 
@@ -9033,7 +9206,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
       if (EphValues.count(V)) {
         LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
                           << ") is ephemeral.\n");
-        newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
+        auto Invalid = ScheduleBundle::invalid();
+        newTreeEntry(VL, Invalid /*not vectorized*/, S, UserTreeIdx);
         return;
       }
     }
@@ -9050,9 +9224,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
     if (isVectorized(V)) {
       LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
                         << ") is already in tree.\n");
-      if (TryToFindDuplicates(S))
-        newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
+      if (TryToFindDuplicates(S)) {
+        auto Invalid = ScheduleBundle::invalid();
+        newTreeEntry(VL, Invalid /*not vectorized*/, S, UserTreeIdx,
                      ReuseShuffleIndices);
+      }
       return;
     }
   }
@@ -9062,9 +9238,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
     for (Value *V : VL) {
       if (UserIgnoreList->contains(V)) {
         LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
-        if (TryToFindDuplicates(S))
-          newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
+        if (TryToFindDuplicates(S)) {
+          auto Invalid = ScheduleBundle::invalid();
+          newTreeEntry(VL, Invalid /*not vectorized*/, S, UserTreeIdx,
                        ReuseShuffleIndices);
+        }
         return;
       }
     }
@@ -9095,7 +9273,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
     // Do not vectorize EH and non-returning blocks, not profitable in most
     // cases.
     LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
-    newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
+    auto Invalid = ScheduleBundle::invalid();
+    newTreeEntry(VL, Invalid /*not vectorized*/, S, UserTreeIdx);
     return;
   }
 
@@ -9113,7 +9292,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
   TreeEntry::EntryState State = getScalarsVectorizationState(
       S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
   if (State == TreeEntry::NeedToGather) {
-    newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
+    auto Invalid = ScheduleBundle::invalid();
+    newTreeEntry(VL, Invalid /*not vectorized*/, S, UserTreeIdx,
                  ReuseShuffleIndices);
     return;
   }
@@ -9124,22 +9304,22 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
 
   BlockScheduling &BS = *BSRef;
 
-  std::optional<ScheduleData *> Bundle =
+  std::optional<ScheduleBundle *> BundlePtr =
       BS.tryScheduleBundle(UniqueValues, this, S);
 #ifdef EXPENSIVE_CHECKS
   // Make sure we didn't break any internal invariants
   BS.verify();
 #endif
-  if (!Bundle) {
+  if (!BundlePtr || (*BundlePtr && !*BundlePtr.value())) {
     LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
-    assert((!BS.getScheduleData(VL0) ||
-            !BS.getScheduleData(VL0)->isPartOfBundle()) &&
-           "tryScheduleBundle should cancelScheduling on failure");
+    assert((!BS.getScheduleData(VL0) || BS.getScheduleBundles(VL0).empty()) &&
+           "tryScheduleBundle should not create bundle on failure");
     // Last chance to try to vectorize alternate node.
     if (S.isAltShuffle() && ReuseShuffleIndices.empty() &&
         TrySplitNode(SmallNodeSize, S))
       return;
-    newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
+    auto Invalid = ScheduleBundle::invalid();
+    newTreeEntry(VL, Invalid /*not vectorized*/, S, UserTreeIdx,
                  ReuseShuffleIndices);
     NonScheduledFirst.insert(VL.front());
     if (S.getOpcode() == Instruction::Load &&
@@ -9147,6 +9327,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
       registerNonVectorizableLoads(VL);
     return;
   }
+  ScheduleBundle Empty;
+  ScheduleBundle &Bundle = BundlePtr.value() ? *BundlePtr.value() : Empty;
   LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
 
   unsigned ShuffleOrOp =
@@ -13351,8 +13533,9 @@ template <typename T> struct ShuffledInsertData {
 };
 } // namespace
 
-InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
-  InstructionCost Cost = 0;
+InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
+                                     InstructionCost ReductionCost) {
+  InstructionCost Cost = ReductionCost;
   LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
                     << VectorizableTree.size() << ".\n");
 
@@ -13396,6 +13579,12 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
                       << "SLP: Current total cost = " << Cost << "\n");
   }
 
+  if (Cost >= -SLPCostThreshold &&
+      none_of(ExternalUses, [](const ExternalUser &EU) {
+        return isa_and_nonnull<InsertElementInst>(EU.User);
+      }))
+    return Cost;
+
   SmallPtrSet<Value *, 16> ExtractCostCalculated;
   InstructionCost ExtractCost = 0;
   SmallVector<ShuffledInsertData<const TreeEntry *>> ShuffledInserts;
@@ -14855,10 +15044,16 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
     Value *V = E->isOneOf(E->Scalars.back());
     if (doesNotNeedToBeScheduled(V))
       V = *find_if_not(E->Scalars, doesNotNeedToBeScheduled);
-    auto *Bundle = BlocksSchedules[BB]->getScheduleData(V);
-    if (Bundle && Bundle->isPartOfBundle())
-      for (; Bundle; Bundle = Bundle->NextInBundle)
-        Res = Bundle->Inst;
+    if (ArrayRef<ScheduleBundle *> Bundles =
+            BlocksSchedules[BB]->getScheduleBundles(V);
+        !Bundles.empty()) {
+      const auto *It = find_if(
+          Bundles, [&](ScheduleBundle *B) { return B->getTreeEntry() == E; });
+      assert(It != Bundles.end() && "Failed to find bundle");
+      Res = (*It)->getBundle().back()->getInst();
+      return *Res;
+    }
+    assert(E->getOpcode() == Instruction::PHI && "Expected PHI");
   }
 
   // LastInst can still be null at this point if there's either not an entry
@@ -18036,36 +18231,30 @@ void BoUpSLP::optimizeGatherSequence() {
   GatherShuffleExtractSeq.clear();
 }
 
-BoUpSLP::ScheduleData *
+BoUpSLP::ScheduleBundle &
 BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) {
-  ScheduleData *Bundle = nullptr;
-  ScheduleData *PrevInBundle = nullptr;
+  auto &BundlePtr =
+      ScheduledBundlesList.emplace_back(std::make_unique<ScheduleBundle>());
   for (Value *V : VL) {
     if (doesNotNeedToBeScheduled(V))
       continue;
     ScheduleData *BundleMember = getScheduleData(V);
-    assert(BundleMember &&
-           "no ScheduleData for bundle member "
-           "(maybe not in same basic block)");
-    assert(BundleMember->isSchedulingEntity() &&
-           "bundle member already part of other bundle");
-    if (PrevInBundle) {
-      PrevInBundle->NextInBundle = BundleMember;
-    } else {
-      Bundle = BundleMember;
-    }
-
+    assert(BundleMember && "no ScheduleData for bundle member "
+                           "(maybe not in same basic block)");
     // Group the instructions to a bundle.
-    BundleMember->FirstInBundle = Bundle;
-    PrevInBundle = BundleMember;
-  }
-  assert(Bundle && "Failed to find schedule bundle");
-  return Bundle;
+    BundlePtr->add(BundleMember);
+    ScheduledBundles.try_emplace(cast<Instruction>(V))
+        .first->getSecond()
+        .push_back(BundlePtr.get());
+  }
+  assert(BundlePtr.get() && *BundlePtr.get() &&
+         "Failed to find schedule bundle");
+  return *BundlePtr.get();
 }
 
 // Groups the instructions to a bundle (which is then a single scheduling entity)
 // and schedules instructions until the bundle gets ready.
-std::optional<BoUpSLP::ScheduleData *>
+std::optional<BoUpSLP::ScheduleBundle *>
 BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
                                             const InstructionsState &S) {
   // No need to schedule PHIs, insertelement, extractelement and extractvalue
@@ -18078,23 +18267,23 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
   Instruction *OldScheduleEnd = ScheduleEnd;
   LLVM_DEBUG(dbgs() << "SLP:  bundle: " << *S.getMainOp() << "\n");
 
-  auto TryScheduleBundleImpl = [this, OldScheduleEnd, SLP](bool ReSchedule,
-                                                         ScheduleData *Bundle) {
+  auto TryScheduleBundleImpl = [=](bool ReSchedule, ScheduleBundle &Bundle) {
     // The scheduling region got new instructions at the lower end (or it is a
     // new region for the first bundle). This makes it necessary to
     // recalculate all dependencies.
     // It is seldom that this needs to be done a second time after adding the
     // initial bundle to the region.
-    if (ScheduleEnd != OldScheduleEnd) {
-      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode())
+    if (OldScheduleEnd && ScheduleEnd != OldScheduleEnd) {
+      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
         if (ScheduleData *SD = getScheduleData(I))
           SD->clearDependencies();
+      }
       ReSchedule = true;
     }
-    if (Bundle) {
-      LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle
+    if (Bundle && !Bundle.getBundle().empty()) {
+      LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << Bundle
                         << " in block " << BB->getName() << "\n");
-      calculateDependencies(Bundle, /*InsertInReadyList=*/true, SLP);
+      calculateDependencies(Bundle, /*InsertInReadyList=*/!ReSchedule, SLP);
     }
 
     if (ReSchedule) {
@@ -18105,13 +18294,22 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
     // Now try to schedule the new bundle or (if no bundle) just calculate
     // dependencies. As soon as the bundle is "ready" it means that there are no
     // cyclic dependencies and we can schedule it. Note that's important that we
-    // don't "schedule" the bundle yet (see cancelScheduling).
-    while (((!Bundle && ReSchedule) || (Bundle && !Bundle->isReady())) &&
+    // don't "schedule" the bundle yet.
+    SmallPtrSet<const ScheduleBundle *, 16> Visited;
+    while (((!Bundle && ReSchedule) || (Bundle && !Bundle.isReady())) &&
            !ReadyInsts.empty()) {
-      ScheduleData *Picked = ReadyInsts.pop_back_val();
-      assert(Picked->isSchedulingEntity() && Picked->isReady() &&
+      ScheduleEntity *Picked = ReadyInsts.pop_back_val();
+      const auto *PickedBundle = dyn_cast<ScheduleBundle>(Picked);
+      if (PickedBundle && !Visited.insert(PickedBundle).second) {
+        assert(PickedBundle->isScheduled() && "bundle must be scheduled");
+        continue;
+      }
+      assert((PickedBundle ? PickedBundle->isReady()
+                           : cast<ScheduleData>(Picked)->isReady()) &&
              "must be ready to schedule");
       schedule(Picked, ReadyInsts);
+      if (Picked == &Bundle)
+        break;
     }
   };
 
@@ -18127,7 +18325,8 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
       // Otherwise the compiler may crash trying to incorrectly calculate
       // dependencies and emit instruction in the wrong order at the actual
       // scheduling.
-      TryScheduleBundleImpl(/*ReSchedule=*/false, nullptr);
+      ScheduleBundle Invalid = ScheduleBundle::invalid();
+      TryScheduleBundleImpl(/*ReSchedule=*/false, Invalid);
       return std::nullopt;
     }
   }
@@ -18143,8 +18342,11 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
     // Make sure we don't leave the pieces of the bundle in the ready list when
     // whole bundle might not be ready.
     ReadyInsts.remove(BundleMember);
+    if (ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(V);
+        !Bundles.empty())
+      for_each(Bundles, [&](ScheduleBundle *B) { ReadyInsts.remove(B); });
 
-    if (!BundleMember->IsScheduled)
+    if (!BundleMember->isScheduled())
       continue;
     // A bundle member was scheduled as single instruction before and now
     // needs to be scheduled as part of the bundle. We just get rid of the
@@ -18154,48 +18356,22 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
     ReSchedule = true;
   }
 
-  auto *Bundle = buildBundle(VL);
+  ScheduleBundle &Bundle = buildBundle(VL);
   TryScheduleBundleImpl(ReSchedule, Bundle);
-  if (!Bundle->isReady()) {
-    cancelScheduling(VL, S.getMainOp());
+  if (!Bundle.isReady()) {
+    for (ScheduleData *BD : Bundle.getBundle()) {
+      if (BD->isReady())
+        ReadyInsts.insert(BD);
+    }
+    ScheduledBundlesList.pop_back();
+    for (Value *V : VL) {
+      if (doesNotNeedToBeScheduled(V))
+        continue;
+      ScheduledBundles.find(cast<Instruction>(V))->getSecond().pop_back();
+    }
     return std::nullopt;
   }
-  return Bundle;
-}
-
-void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
-                                                Value *OpValue) {
-  if (isa<PHINode>(OpValue) || isVectorLikeInstWithConstOps(OpValue) ||
-      doesNotNeedToSchedule(VL))
-    return;
-
-  if (doesNotNeedToBeScheduled(OpValue))
-    OpValue = *find_if_not(VL, doesNotNeedToBeScheduled);
-  ScheduleData *Bundle = getScheduleData(OpValue);
-  LLVM_DEBUG(dbgs() << "SLP:  cancel scheduling of " << *Bundle << "\n");
-  assert(!Bundle->IsScheduled &&
-         "Can't cancel bundle which is already scheduled");
-  assert(Bundle->isSchedulingEntity() &&
-         (Bundle->isPartOfBundle() || needToScheduleSingleInstruction(VL)) &&
-         "tried to unbundle something which is not a bundle");
-
-  // Remove the bundle from the ready list.
-  if (Bundle->isReady())
-    ReadyInsts.remove(Bundle);
-
-  // Un-bundle: make single instructions out of the bundle.
-  ScheduleData *BundleMember = Bundle;
-  while (BundleMember) {
-    assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links");
-    BundleMember->FirstInBundle = BundleMember;
-    ScheduleData *Next = BundleMember->NextInBundle;
-    BundleMember->NextInBundle = nullptr;
-    BundleMember->TE = nullptr;
-    if (BundleMember->unscheduledDepsInBundle() == 0) {
-      ReadyInsts.insert(BundleMember);
-    }
-    BundleMember = Next;
-  }
+  return &Bundle;
 }
 
 BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
@@ -18302,7 +18478,7 @@ void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
               Intrinsic::pseudoprobe))) {
       // Update the linked list of memory accessing instructions.
       if (CurrentLoadStore) {
-        CurrentLoadStore->NextLoadStore = SD;
+        CurrentLoadStore->setNextLoadStore(SD);
       } else {
         FirstLoadStoreInRegion = SD;
       }
@@ -18315,182 +18491,199 @@ void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
   }
   if (NextLoadStore) {
     if (CurrentLoadStore)
-      CurrentLoadStore->NextLoadStore = NextLoadStore;
+      CurrentLoadStore->setNextLoadStore(NextLoadStore);
   } else {
     LastLoadStoreInRegion = CurrentLoadStore;
   }
 }
 
-void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
+void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleBundle &Bundle,
                                                      bool InsertInReadyList,
                                                      BoUpSLP *SLP) {
-  assert(SD->isSchedulingEntity());
+  SmallVector<ScheduleData *> WorkList;
+  auto ProcessNode = [&](ScheduleData *BundleMember) {
+    assert(!BundleMember->hasValidDependencies() && "invalid deps expected.");
+    BundleMember->initDependencies();
+    BundleMember->resetUnscheduledDeps();
+    // Handle def-use chain dependencies.
+    for (User *U : BundleMember->getInst()->users()) {
+      if (ScheduleData *UseSD = getScheduleData(U)) {
+        BundleMember->incDependencies();
+        if (!UseSD->isScheduled())
+          BundleMember->incrementUnscheduledDeps(1);
+        WorkList.push_back(UseSD);
+      }
+    }
 
-  SmallVector<ScheduleData *, 10> WorkList;
-  WorkList.push_back(SD);
+    auto MakeControlDependent = [&](Instruction *I) {
+      auto *DepDest = getScheduleData(I);
+      assert(DepDest && "must be in schedule window");
+      DepDest->addControlDependency(BundleMember);
+      BundleMember->incDependencies();
+      if (!DepDest->isScheduled())
+        BundleMember->incrementUnscheduledDeps(1);
+      WorkList.push_back(DepDest);
+    };
 
-  while (!WorkList.empty()) {
-    ScheduleData *SD = WorkList.pop_back_val();
-    for (ScheduleData *BundleMember = SD; BundleMember;
-         BundleMember = BundleMember->NextInBundle) {
-      assert(isInSchedulingRegion(BundleMember));
-      if (BundleMember->hasValidDependencies())
-        continue;
+    // Any instruction which isn't safe to speculate at the beginning of the
+    // block is control dependent on any early exit or non-willreturn call
+    // which precedes it.
+    if (!isGuaranteedToTransferExecutionToSuccessor(BundleMember->getInst())) {
+      for (Instruction *I = BundleMember->getInst()->getNextNode();
+           I != ScheduleEnd; I = I->getNextNode()) {
+        if (isSafeToSpeculativelyExecute(I, &*BB->begin(), SLP->AC))
+          continue;
 
-      LLVM_DEBUG(dbgs() << "SLP:       update deps of " << *BundleMember
-                 << "\n");
-      BundleMember->Dependencies = 0;
-      BundleMember->resetUnscheduledDeps();
-
-      // Handle def-use chain dependencies.
-      for (User *U : BundleMember->Inst->users()) {
-        if (ScheduleData *UseSD = getScheduleData(cast<Instruction>(U))) {
-          BundleMember->Dependencies++;
-          ScheduleData *DestBundle = UseSD->FirstInBundle;
-          if (!DestBundle->IsScheduled)
-            BundleMember->incrementUnscheduledDeps(1);
-          if (!DestBundle->hasValidDependencies())
-            WorkList.push_back(DestBundle);
-        }
-      }
+        // Add the dependency
+        MakeControlDependent(I);
 
-      auto MakeControlDependent = [&](Instruction *I) {
-        auto *DepDest = getScheduleData(I);
-        assert(DepDest && "must be in schedule window");
-        DepDest->ControlDependencies.push_back(BundleMember);
-        BundleMember->Dependencies++;
-        ScheduleData *DestBundle = DepDest->FirstInBundle;
-        if (!DestBundle->IsScheduled)
-          BundleMember->incrementUnscheduledDeps(1);
-        if (!DestBundle->hasValidDependencies())
-          WorkList.push_back(DestBundle);
-      };
+        if (!isGuaranteedToTransferExecutionToSuccessor(I))
+          // Everything past here must be control dependent on I.
+          break;
+      }
+    }
 
-      // Any instruction which isn't safe to speculate at the beginning of the
-      // block is control dependend on any early exit or non-willreturn call
-      // which proceeds it.
-      if (!isGuaranteedToTransferExecutionToSuccessor(BundleMember->Inst)) {
-        for (Instruction *I = BundleMember->Inst->getNextNode();
+    if (RegionHasStackSave) {
+      // If we have an inalloca alloca instruction, it needs to be scheduled
+      // after any preceding stacksave.  We also need to prevent any alloca
+      // from reordering above a preceding stackrestore.
+      if (match(BundleMember->getInst(), m_Intrinsic<Intrinsic::stacksave>()) ||
+          match(BundleMember->getInst(),
+                m_Intrinsic<Intrinsic::stackrestore>())) {
+        for (Instruction *I = BundleMember->getInst()->getNextNode();
              I != ScheduleEnd; I = I->getNextNode()) {
-          if (isSafeToSpeculativelyExecute(I, &*BB->begin(), SLP->AC))
+          if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
+              match(I, m_Intrinsic<Intrinsic::stackrestore>()))
+            // Any allocas past here must be control dependent on I, and I
+            // must be memory dependent on BundleMember->Inst.
+            break;
+
+          if (!isa<AllocaInst>(I))
             continue;
 
           // Add the dependency
           MakeControlDependent(I);
-
-          if (!isGuaranteedToTransferExecutionToSuccessor(I))
-            // Everything past here must be control dependent on I.
-            break;
         }
       }
 
-      if (RegionHasStackSave) {
-        // If we have an inalloc alloca instruction, it needs to be scheduled
-        // after any preceeding stacksave.  We also need to prevent any alloca
-        // from reordering above a preceeding stackrestore.
-        if (match(BundleMember->Inst, m_Intrinsic<Intrinsic::stacksave>()) ||
-            match(BundleMember->Inst, m_Intrinsic<Intrinsic::stackrestore>())) {
-          for (Instruction *I = BundleMember->Inst->getNextNode();
-               I != ScheduleEnd; I = I->getNextNode()) {
-            if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
-                match(I, m_Intrinsic<Intrinsic::stackrestore>()))
-              // Any allocas past here must be control dependent on I, and I
-              // must be memory dependend on BundleMember->Inst.
-              break;
-
-            if (!isa<AllocaInst>(I))
-              continue;
+      // In addition to the cases handled just above, we need to prevent
+      // allocas and loads/stores from moving below a stacksave or a
+      // stackrestore. Avoiding moving allocas below stackrestore is currently
+      // thought to be conservatism. Moving loads/stores below a stackrestore
+      // can lead to incorrect code.
+      if (isa<AllocaInst>(BundleMember->getInst()) ||
+          BundleMember->getInst()->mayReadOrWriteMemory()) {
+        for (Instruction *I = BundleMember->getInst()->getNextNode();
+             I != ScheduleEnd; I = I->getNextNode()) {
+          if (!match(I, m_Intrinsic<Intrinsic::stacksave>()) &&
+              !match(I, m_Intrinsic<Intrinsic::stackrestore>()))
+            continue;
 
-            // Add the dependency
-            MakeControlDependent(I);
-          }
+          // Add the dependency
+          MakeControlDependent(I);
+          break;
         }
+      }
+    }
 
-        // In addition to the cases handle just above, we need to prevent
-        // allocas and loads/stores from moving below a stacksave or a
-        // stackrestore. Avoiding moving allocas below stackrestore is currently
-        // thought to be conservatism. Moving loads/stores below a stackrestore
-        // can lead to incorrect code.
-        if (isa<AllocaInst>(BundleMember->Inst) ||
-            BundleMember->Inst->mayReadOrWriteMemory()) {
-          for (Instruction *I = BundleMember->Inst->getNextNode();
-               I != ScheduleEnd; I = I->getNextNode()) {
-            if (!match(I, m_Intrinsic<Intrinsic::stacksave>()) &&
-                !match(I, m_Intrinsic<Intrinsic::stackrestore>()))
-              continue;
-
-            // Add the dependency
-            MakeControlDependent(I);
-            break;
-          }
-        }
+    // Handle the memory dependencies (if any).
+    ScheduleData *NextLoadStore = BundleMember->getNextLoadStore();
+    if (!NextLoadStore)
+      return;
+    Instruction *SrcInst = BundleMember->getInst();
+    assert(SrcInst->mayReadOrWriteMemory() &&
+           "NextLoadStore list for non memory effecting bundle?");
+    MemoryLocation SrcLoc = getLocation(SrcInst);
+    bool SrcMayWrite = SrcInst->mayWriteToMemory();
+    unsigned NumAliased = 0;
+    unsigned DistToSrc = 1;
+    bool IsNonSimpleSrc = !SrcLoc.Ptr || !isSimple(SrcInst);
+
+    for (ScheduleData *DepDest = NextLoadStore; DepDest;
+         DepDest = DepDest->getNextLoadStore()) {
+      assert(isInSchedulingRegion(DepDest) && "Expected to be in region");
+
+      // We have two limits to reduce the complexity:
+      // 1) AliasedCheckLimit: It's a small limit to reduce calls to
+      //    SLP->isAliased (which is the expensive part in this loop).
+      // 2) MaxMemDepDistance: It's for very large blocks and it aborts
+      //    the whole loop (even if the loop is fast, it's quadratic).
+      //    It's important for the loop break condition (see below) to
+      //    check this limit even between two read-only instructions.
+      if (DistToSrc >= MaxMemDepDistance ||
+          ((SrcMayWrite || DepDest->getInst()->mayWriteToMemory()) &&
+           (IsNonSimpleSrc || NumAliased >= AliasedCheckLimit ||
+            SLP->isAliased(SrcLoc, SrcInst, DepDest->getInst())))) {
+
+        // We increment the counter only if the locations are aliased
+        // (instead of counting all alias checks). This gives a better
+        // balance between reduced runtime and accurate dependencies.
+        NumAliased++;
+
+        DepDest->addMemoryDependency(BundleMember);
+        BundleMember->incDependencies();
+        if (!DepDest->isScheduled())
+          BundleMember->incrementUnscheduledDeps(1);
+        WorkList.push_back(DepDest);
       }
 
-      // Handle the memory dependencies (if any).
-      ScheduleData *DepDest = BundleMember->NextLoadStore;
-      if (!DepDest)
-        continue;
-      Instruction *SrcInst = BundleMember->Inst;
-      assert(SrcInst->mayReadOrWriteMemory() &&
-             "NextLoadStore list for non memory effecting bundle?");
-      MemoryLocation SrcLoc = getLocation(SrcInst);
-      bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
-      unsigned NumAliased = 0;
-      unsigned DistToSrc = 1;
-
-      for (; DepDest; DepDest = DepDest->NextLoadStore) {
-        assert(isInSchedulingRegion(DepDest));
-
-        // We have two limits to reduce the complexity:
-        // 1) AliasedCheckLimit: It's a small limit to reduce calls to
-        //    SLP->isAliased (which is the expensive part in this loop).
-        // 2) MaxMemDepDistance: It's for very large blocks and it aborts
-        //    the whole loop (even if the loop is fast, it's quadratic).
-        //    It's important for the loop break condition (see below) to
-        //    check this limit even between two read-only instructions.
-        if (DistToSrc >= MaxMemDepDistance ||
-            ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
-             (NumAliased >= AliasedCheckLimit ||
-              SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
-
-          // We increment the counter only if the locations are aliased
-          // (instead of counting all alias checks). This gives a better
-          // balance between reduced runtime and accurate dependencies.
-          NumAliased++;
-
-          DepDest->MemoryDependencies.push_back(BundleMember);
-          BundleMember->Dependencies++;
-          ScheduleData *DestBundle = DepDest->FirstInBundle;
-          if (!DestBundle->IsScheduled) {
-            BundleMember->incrementUnscheduledDeps(1);
-          }
-          if (!DestBundle->hasValidDependencies()) {
-            WorkList.push_back(DestBundle);
-          }
-        }
+      // Example, explaining the loop break condition: Let's assume our
+      // starting instruction is i0 and MaxMemDepDistance = 3.
+      //
+      //                      +--------v--v--v
+      //             i0,i1,i2,i3,i4,i5,i6,i7,i8
+      //             +--------^--^--^
+      //
+      // MaxMemDepDistance lets us stop alias-checking at i3 and we add
+      // dependencies from i0 to i3,i4,.. (even if they are not aliased).
+      // Previously we already added dependencies from i3 to i6,i7,i8
+      // (because of MaxMemDepDistance). As we added a dependency from
+      // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
+      // and we can abort this loop at i6.
+      if (DistToSrc >= 2 * MaxMemDepDistance)
+        break;
+      DistToSrc++;
+    }
+  };
 
-        // Example, explaining the loop break condition: Let's assume our
-        // starting instruction is i0 and MaxMemDepDistance = 3.
-        //
-        //                      +--------v--v--v
-        //             i0,i1,i2,i3,i4,i5,i6,i7,i8
-        //             +--------^--^--^
-        //
-        // MaxMemDepDistance let us stop alias-checking at i3 and we add
-        // dependencies from i0 to i3,i4,.. (even if they are not aliased).
-        // Previously we already added dependencies from i3 to i6,i7,i8
-        // (because of MaxMemDepDistance). As we added a dependency from
-        // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
-        // and we can abort this loop at i6.
-        if (DistToSrc >= 2 * MaxMemDepDistance)
-          break;
-        DistToSrc++;
+  WorkList.push_back(Bundle.getBundle().front());
+  SmallPtrSet<ScheduleBundle *, 16> Visited;
+  while (!WorkList.empty()) {
+    ScheduleData *SD = WorkList.pop_back_val();
+    ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(SD->getInst());
+    if (!Bundles.empty()) {
+      for (ScheduleBundle *Bundle : Bundles) {
+        if (!Visited.insert(Bundle).second || Bundle->hasValidDependencies())
+          continue;
+        assert(isInSchedulingRegion(*Bundle) &&
+               "ScheduleData not in scheduling region");
+        for (ScheduleData *BundleMember : Bundle->getBundle()) {
+          if (BundleMember->hasValidDependencies())
+            continue;
+          LLVM_DEBUG(dbgs()
+                     << "SLP:       update deps of " << *BundleMember << "\n");
+          ProcessNode(BundleMember);
+        }
       }
+    } else if (!SD->hasValidDependencies()) {
+      LLVM_DEBUG(dbgs() << "SLP:       update deps of " << *SD << "\n");
+      ProcessNode(SD);
     }
     if (InsertInReadyList && SD->isReady()) {
+      if (!Bundles.empty()) {
+        for (ScheduleBundle *Bundle : Bundles) {
+          assert(isInSchedulingRegion(*Bundle) &&
+                 "ScheduleData not in scheduling region");
+          if (Bundle->isReady()) {
+            ReadyInsts.insert(Bundle);
+            LLVM_DEBUG(dbgs() << "SLP:     gets ready on update: " << *Bundle
+                            << "\n");
+          }
+        }
+        continue;
+      }
       ReadyInsts.insert(SD);
-      LLVM_DEBUG(dbgs() << "SLP:     gets ready on update: " << *SD->Inst
-                        << "\n");
+      LLVM_DEBUG(dbgs() << "SLP:     gets ready on update: " << *SD << "\n");
     }
   }
 }
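
The reworked calculateDependencies drives a single ProcessNode lambda from a worklist of ScheduleData nodes; when a popped node participates in one or more ScheduleBundles, every member of each not-yet-visited bundle gets its dependencies (re)computed. A condensed stand-alone sketch of that traversal with simplified stand-in types, modelling only the def-use dependencies (control and memory dependencies elided):

// Condensed sketch of the worklist traversal in the reworked
// calculateDependencies. Stand-in types; only def-use deps are modelled.
#include <functional>
#include <set>
#include <vector>

struct Data {
  std::vector<Data *> Users;   // def-use successors inside the region
  int Dependencies = -1;       // -1 == not computed yet
  int UnscheduledDeps = 0;
  bool Scheduled = false;
  bool hasValidDependencies() const { return Dependencies >= 0; }
};
struct BundleT {
  std::vector<Data *> Members;
  bool hasValidDependencies() const {
    for (Data *D : Members)
      if (!D->hasValidDependencies())
        return false;
    return true;
  }
};

// bundlesOf maps an instruction's Data node to the bundles it belongs to.
void calcDeps(BundleT &Root,
              const std::function<std::vector<BundleT *>(Data *)> &bundlesOf) {
  std::vector<Data *> WorkList;
  auto ProcessNode = [&](Data *D) {
    D->Dependencies = 0;
    D->UnscheduledDeps = 0;
    for (Data *Use : D->Users) {   // def-use chain dependencies
      ++D->Dependencies;
      if (!Use->Scheduled)
        ++D->UnscheduledDeps;
      WorkList.push_back(Use);
    }
  };
  WorkList.push_back(Root.Members.front());
  std::set<BundleT *> Visited;
  while (!WorkList.empty()) {
    Data *D = WorkList.back();
    WorkList.pop_back();
    std::vector<BundleT *> Bundles = bundlesOf(D);
    if (!Bundles.empty()) {
      for (BundleT *B : Bundles) {
        if (!Visited.insert(B).second || B->hasValidDependencies())
          continue;
        for (Data *M : B->Members)
          if (!M->hasValidDependencies())
            ProcessNode(M);        // compute deps for each bundle member
      }
    } else if (!D->hasValidDependencies()) {
      ProcessNode(D);              // plain instruction outside any bundle
    }
  }
}
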
@@ -18502,7 +18695,7 @@ void BoUpSLP::BlockScheduling::resetSchedule() {
     if (ScheduleData *SD = getScheduleData(I)) {
       assert(isInSchedulingRegion(SD) &&
              "ScheduleData not in scheduling region");
-      SD->IsScheduled = false;
+      SD->setScheduled(/*Scheduled=*/false);
       SD->resetUnscheduledDeps();
     }
   }
@@ -18528,28 +18721,34 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
   // WARNING: If changing this order causes a correctness issue, that means
   // there is some missing dependence edge in the schedule data graph.
   struct ScheduleDataCompare {
-    bool operator()(ScheduleData *SD1, ScheduleData *SD2) const {
-      return SD2->SchedulingPriority < SD1->SchedulingPriority;
+    bool operator()(const ScheduleEntity *SD1,
+                    const ScheduleEntity *SD2) const {
+      return SD2->getSchedulingPriority() < SD1->getSchedulingPriority();
     }
   };
-  std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
+  std::set<ScheduleEntity *, ScheduleDataCompare> ReadyInsts;
 
   // Ensure that all dependency data is updated (for nodes in the sub-graph)
   // and fill the ready-list with initial instructions.
   int Idx = 0;
   for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
        I = I->getNextNode()) {
+    ArrayRef<ScheduleBundle *> Bundles = BS->getScheduleBundles(I);
+    if (!Bundles.empty()) {
+      for (ScheduleBundle *Bundle : Bundles) {
+        Bundle->setSchedulingPriority(Idx++);
+        if (!Bundle->hasValidDependencies())
+          BS->calculateDependencies(*Bundle, /*InsertInReadyList=*/false, this);
+      }
+      continue;
+    }
     if (ScheduleData *SD = BS->getScheduleData(I)) {
-      [[maybe_unused]] ArrayRef<TreeEntry *> SDTEs = getTreeEntries(SD->Inst);
-      assert((isVectorLikeInstWithConstOps(SD->Inst) ||
-              SD->isPartOfBundle() ==
-                  (!SDTEs.empty() &&
-                   !doesNotNeedToSchedule(SDTEs.front()->Scalars))) &&
+      [[maybe_unused]] ArrayRef<TreeEntry *> SDTEs = getTreeEntries(I);
+      assert((isVectorLikeInstWithConstOps(SD->getInst()) || SDTEs.empty() ||
+              doesNotNeedToSchedule(SDTEs.front()->Scalars)) &&
              "scheduler and vectorizer bundle mismatch");
-      SD->FirstInBundle->SchedulingPriority = Idx++;
-
-      if (SD->isSchedulingEntity() && SD->isPartOfBundle())
-        BS->calculateDependencies(SD, false, this);
+      SD->setSchedulingPriority(Idx++);
+      continue;
     }
   }
   BS->initialFillReadyList(ReadyInsts);
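
In the initialization loop above each instruction in the window receives a scheduling priority exactly once, handed either to every ScheduleBundle it belongs to (with a lazy dependency computation) or, for non-bundled instructions, to its ScheduleData. A compact sketch of that assignment with placeholder types; the bundlesOf/dataOf/computeDeps callbacks are hypothetical:

// Sketch of the priority assignment: bundles of an instruction take
// precedence over its plain per-instruction entry.
#include <functional>
#include <vector>

struct Entity { int Priority = 0; bool ValidDeps = false; };

void assignPriorities(const std::vector<int> &Insts,
                      const std::function<std::vector<Entity *>(int)> &bundlesOf,
                      const std::function<Entity *(int)> &dataOf,
                      const std::function<void(Entity &)> &computeDeps) {
  int Idx = 0;
  for (int I : Insts) {
    std::vector<Entity *> Bundles = bundlesOf(I);
    if (!Bundles.empty()) {
      for (Entity *B : Bundles) {
        B->Priority = Idx++;
        if (!B->ValidDeps)
          computeDeps(*B);      // lazily compute bundle dependencies
      }
      continue;
    }
    if (Entity *D = dataOf(I))
      D->Priority = Idx++;      // non-bundled instructions still get one
  }
}
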
@@ -18558,19 +18757,25 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
 
   // Do the "real" scheduling.
   while (!ReadyInsts.empty()) {
-    ScheduleData *Picked = *ReadyInsts.begin();
+    auto *Picked = *ReadyInsts.begin();
     ReadyInsts.erase(ReadyInsts.begin());
 
     // Move the scheduled instruction(s) to their dedicated places, if not
     // there yet.
-    for (ScheduleData *BundleMember = Picked; BundleMember;
-         BundleMember = BundleMember->NextInBundle) {
-      Instruction *PickedInst = BundleMember->Inst;
+    if (auto *Bundle = dyn_cast<ScheduleBundle>(Picked)) {
+      for (const ScheduleData *BundleMember : Bundle->getBundle()) {
+        Instruction *PickedInst = BundleMember->getInst();
+        if (PickedInst->getNextNonDebugInstruction() != LastScheduledInst)
+          PickedInst->moveAfter(LastScheduledInst->getPrevNode());
+        LastScheduledInst = PickedInst;
+      }
+    } else {
+      auto *SD = cast<ScheduleData>(Picked);
+      Instruction *PickedInst = SD->getInst();
       if (PickedInst->getNextNonDebugInstruction() != LastScheduledInst)
         PickedInst->moveAfter(LastScheduledInst->getPrevNode());
       LastScheduledInst = PickedInst;
     }
-
     BS->schedule(Picked, ReadyInsts);
   }
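
The main loop now pops a generic ScheduleEntity from the priority-ordered ready set and only afterwards decides whether it is a whole bundle, moving every member in bundle order, or a single ScheduleData. A small stand-alone sketch of that loop, with a plain Kind tag instead of dyn_cast and hypothetical moveInPlace/markScheduled callbacks:

// Sketch of the "real" scheduling loop: take the first entity in the
// priority-ordered ready set and place either all bundle members or the
// single instruction. Placeholder types.
#include <functional>
#include <set>
#include <vector>

struct Entity {
  enum Kind { Single, Bundle } K = Single;  // plain tag instead of dyn_cast
  int Priority = 0;
  std::vector<int> Insts;                   // one id, or all bundle members
};
struct ByPriority {
  bool operator()(const Entity *A, const Entity *B) const {
    return B->Priority < A->Priority;       // mirrors ScheduleDataCompare
  }
};
using ReadySet = std::set<Entity *, ByPriority>;

void scheduleAll(ReadySet &Ready,
                 const std::function<void(int)> &moveInPlace,
                 const std::function<void(Entity *, ReadySet &)> &markScheduled) {
  while (!Ready.empty()) {
    Entity *Picked = *Ready.begin();
    Ready.erase(Ready.begin());
    if (Picked->K == Entity::Bundle) {
      for (int I : Picked->Insts)           // place every member in order
        moveInPlace(I);
    } else {
      moveInPlace(Picked->Insts.front());   // single instruction
    }
    markScheduled(Picked, Ready);           // may make new entities ready
  }
}
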
 
@@ -18581,10 +18786,13 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
 
 #if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
   // Check that all schedulable entities got scheduled
-  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; I = I->getNextNode()) {
-    ScheduleData *SD = BS->getScheduleData(I);
-    if (SD && SD->isSchedulingEntity() && SD->hasValidDependencies())
-      assert(SD->IsScheduled && "must be scheduled at this point");
+  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
+       I = I->getNextNode()) {
+    ArrayRef<ScheduleBundle *> Bundles = BS->getScheduleBundles(I);
+    if (!Bundles.empty()) {
+      for (ScheduleBundle *Bundle : Bundles)
+        assert(Bundle->isScheduled() && "must be scheduled at this point");
+    }
   }
 #endif
 
@@ -21201,10 +21409,9 @@ class HorizontalReduction {
         V.computeMinimumValueSizes();
 
         // Estimate cost.
-        InstructionCost TreeCost = V.getTreeCost(VL);
         InstructionCost ReductionCost =
             getReductionCost(TTI, VL, IsCmpSelMinMax, RdxFMF, V);
-        InstructionCost Cost = TreeCost + ReductionCost;
+        InstructionCost Cost = V.getTreeCost(VL, ReductionCost);
         LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
                           << " for reduction\n");
         if (!Cost.isValid())

>From ce81ce10ff9ac374cda97af055564dc6a1e5b374 Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev at outlook.com>
Date: Mon, 17 Mar 2025 15:22:58 +0000
Subject: [PATCH 2/2] Fix formatting

Created using spr 1.3.5
---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 24 +++++++++----------
 1 file changed, 11 insertions(+), 13 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 6766b68841a9a..cbae0917648dd 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -4428,7 +4428,7 @@ class BoUpSLP {
 
         auto DecrUnschedForInst = [&](Instruction *I) {
           if (ScheduleData *OpSD = getScheduleData(I))
-            DecrUnsched(OpSD, /*IsControl*/false);
+            DecrUnsched(OpSD, /*IsControl=*/false);
         };
 
         // If BundleMember is a vector bundle, its operands may have been
@@ -4476,9 +4476,8 @@ class BoUpSLP {
         for (ScheduleData *MemoryDep : BundleMember->getMemoryDependencies()) {
           // There are no more unscheduled dependencies after decrementing,
           // so we can put the dependent instruction into the ready list.
-          LLVM_DEBUG(dbgs()
-                     << "SLP:   check for readiness (mem): " << *MemoryDep
-                     << "\n");
+          LLVM_DEBUG(dbgs() << "SLP:   check for readiness (mem): "
+                            << *MemoryDep << "\n");
           DecrUnsched(MemoryDep);
         }
         // Handle the control dependencies.
@@ -9076,8 +9075,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
     copy(Op1, NewVL.begin());
     copy(Op2, std::next(NewVL.begin(), Op1.size()));
     auto Invalid = ScheduleBundle::invalid();
-    auto *TE = newTreeEntry(VL, TreeEntry::SplitVectorize, Invalid,
-                            LocalState, UserTreeIdx, {}, ReorderIndices);
+    auto *TE = newTreeEntry(VL, TreeEntry::SplitVectorize, Invalid, LocalState,
+                            UserTreeIdx, {}, ReorderIndices);
     LLVM_DEBUG(dbgs() << "SLP: split alternate node.\n"; TE->dump());
     auto AddNode = [&](ArrayRef<Value *> Op, unsigned Idx) {
       InstructionsState S = getSameOpcode(Op, *TLI);
@@ -9086,8 +9085,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
         // Build gather node for loads, they will be gathered later.
         TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
                                                     Idx == 0 ? 0 : Op1.size());
-        (void)newTreeEntry(Op, TreeEntry::NeedToGather, Invalid, S,
-                           {TE, Idx});
+        (void)newTreeEntry(Op, TreeEntry::NeedToGather, Invalid, S, {TE, Idx});
       } else {
         TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
                                                     Idx == 0 ? 0 : Op1.size());
@@ -9224,7 +9222,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
     if (isVectorized(V)) {
       LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
                         << ") is already in tree.\n");
-      if (TryToFindDuplicates(S)){
+      if (TryToFindDuplicates(S)) {
         auto Invalid = ScheduleBundle::invalid();
         newTreeEntry(VL, Invalid /*not vectorized*/, S, UserTreeIdx,
                      ReuseShuffleIndices);
@@ -18281,8 +18279,8 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
       ReSchedule = true;
     }
     if (Bundle && !Bundle.getBundle().empty()) {
-      LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << Bundle
-                        << " in block " << BB->getName() << "\n");
+      LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << Bundle << " in block "
+                        << BB->getName() << "\n");
       calculateDependencies(Bundle, /*InsertInReadyList=*/!ReSchedule, SLP);
     }
 
@@ -18676,8 +18674,8 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleBundle &Bundle,
                  "ScheduleData not in scheduling region");
           if (Bundle->isReady()) {
             ReadyInsts.insert(Bundle);
-            LLVM_DEBUG(dbgs() << "SLP:     gets ready on update: " << *Bundle
-                            << "\n");
+            LLVM_DEBUG(dbgs()
+                       << "SLP:     gets ready on update: " << *Bundle << "\n");
           }
         }
         continue;


