[llvm] [SLP] Improve block traversal in getSpillCost() (PR #128620)

Tue Feb 25 08:26:25 PST 2025

================
@@ -12415,68 +12415,48 @@ InstructionCost BoUpSLP::getSpillCost() {
   InstructionCost Cost = 0;
 
   SmallPtrSet<const TreeEntry *, 4> LiveEntries;
-  const TreeEntry *Prev = nullptr;
-
-  // The entries in VectorizableTree are not necessarily ordered by their
-  // position in basic blocks. Collect them and order them by dominance so later
-  // instructions are guaranteed to be visited first. For instructions in
-  // different basic blocks, we only scan to the beginning of the block, so
-  // their order does not matter, as long as all instructions in a basic block
-  // are grouped together. Using dominance ensures a deterministic order.
-  SmallVector<TreeEntry *, 16> OrderedEntries;
-  for (const auto &TEPtr : VectorizableTree) {
-    if (TEPtr->isGather())
-      continue;
-    OrderedEntries.push_back(TEPtr.get());
-  }
-  llvm::stable_sort(OrderedEntries, [&](const TreeEntry *TA,
-                                        const TreeEntry *TB) {
-    Instruction &A = getLastInstructionInBundle(TA);
-    Instruction &B = getLastInstructionInBundle(TB);
-    auto *NodeA = DT->getNode(A.getParent());
-    auto *NodeB = DT->getNode(B.getParent());
-    assert(NodeA && "Should only process reachable instructions");
-    assert(NodeB && "Should only process reachable instructions");
-    assert((NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
-           "Different nodes should have different DFS numbers");
-    if (NodeA != NodeB)
-      return NodeA->getDFSNumIn() > NodeB->getDFSNumIn();
-    return B.comesBefore(&A);
-  });
 
-  for (const TreeEntry *TE : OrderedEntries) {
-    if (!Prev) {
-      Prev = TE;
+  const TreeEntry *Root = VectorizableTree.front().get();
+  BasicBlock *RootBB = cast<Instruction>(Root->Scalars[0])->getParent();
+
+  // Compute what nodes are reachable from the leaves to the roots
+  df_iterator_default_set<const BasicBlock *> ReachableFromLeaves;
+  for (auto &TE : VectorizableTree) {
+    if (TE->isGather())
       continue;
-    }
+    auto *BB = getLastInstructionInBundle(TE.get()).getParent();
+    for (const BasicBlock *X : depth_first_ext(BB, ReachableFromLeaves))
+      ReachableFromLeaves.insert(X);
+  }
 
-    LiveEntries.erase(Prev);
-    for (unsigned I : seq<unsigned>(Prev->getNumOperands())) {
-      const TreeEntry *Op = getVectorizedOperand(Prev, I);
-      if (!Op)
-        continue;
-      assert(!Op->isGather() && "Expected vectorized operand.");
-      LiveEntries.insert(Op);
-    }
+  DenseSet<const BasicBlock *> Reachable;
+  for (const BasicBlock *X : inverse_depth_first(RootBB))
+    Reachable.insert(X);
+  set_intersect(Reachable, ReachableFromLeaves);
 
-    LLVM_DEBUG({
-      dbgs() << "SLP: #LV: " << LiveEntries.size();
-      for (auto *X : LiveEntries)
-        X->dump();
-      dbgs() << ", Looking at ";
-      TE->dump();
-    });
+  DenseSet<const TreeEntry *> Defined;
 
-    // Now find the sequence of instructions between PrevInst and Inst.
-    unsigned NumCalls = 0;
-    const Instruction *PrevInst = &getLastInstructionInBundle(Prev);
-    BasicBlock::const_reverse_iterator
-        InstIt = ++getLastInstructionInBundle(TE).getIterator().getReverse(),
-        PrevInstIt = PrevInst->getIterator().getReverse();
-    while (InstIt != PrevInstIt) {
-      if (PrevInstIt == PrevInst->getParent()->rend()) {
----------------
preames wrote:

If I'm reading the old code right, we only scanned the prefix of the block containing the using bundle, and then ignored any calls between the definition and the end of its block, plus any in intervening blocks?

If so, a couple of questions/suggestions.

1) What happens if you add a huge cost for any case not contained in the same basic block?  (i.e. strongly disincentivize long cross block cases)
2) What happens if you handle only "easy" control flow?  Things like a series of if-else or if-clause, with some small finite depth?
3) Is there some invariant which ensures none of these nodes can be in loops?  If not, do we need to scale by trip count in some way?

https://github.com/llvm/llvm-project/pull/128620