[llvm] [SLP] Improve block traversal in getSpillCost() (PR #128620)

Mikhail R. Gadelha via llvm-commits llvm-commits at lists.llvm.org
Mon Feb 24 19:00:56 PST 2025


https://github.com/mikhailramalho created https://github.com/llvm/llvm-project/pull/128620

This is a WIP patch due to compile-time regressions of up to 7% on gcc_r/gcc_s.

Previously, getSpillCost() skipped the blocks that lie between two bundles when traversing instructions backward. If one of those skipped blocks contains a function call, that call is never counted and the computed spill cost comes out too low.
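
For illustration, the affected shape is roughly the following (a hand-written C++ analogue of the @f0 test updated by this patch; g() is just an arbitrary external call):

    void g();
    void f0(bool c, long *p, long *q) {
      long x0 = p[0]; // the candidate bundle is defined here, in the entry block
      long x1 = p[1];
      if (c)
        g();          // call in an in-between block; x0/x1 are live across it
      q[0] = x0;      // the bundle is used here, in the join block
      q[1] = x1;
    }

The old backward walk jumped straight from the start of the join block to the end of the entry block, so the call to g() was never visited and no cost was charged for keeping the would-be <2 x i64> value live across it.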

The new implementation (a condensed sketch follows the list):
- Uses a post_order traversal to visit blocks
- Tracks live tree entries across basic blocks
- Computes the relevant blocks once up front using depth_first_ext and inverse_depth_first
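
Condensed, the block-selection part of the new walk looks like this (a sketch distilled from the patch below; BundleBlocks, RootBB, and the innermost loop body are placeholders, not names from the patch):

    #include "llvm/ADT/ArrayRef.h"
    #include "llvm/ADT/DenseSet.h"
    #include "llvm/ADT/DepthFirstIterator.h"
    #include "llvm/ADT/PostOrderIterator.h"
    #include "llvm/ADT/STLExtras.h"
    #include "llvm/ADT/SetOperations.h"
    #include "llvm/IR/BasicBlock.h"
    #include "llvm/IR/CFG.h" // GraphTraits for BasicBlock* and Inverse<BasicBlock*>
    using namespace llvm;

    // BundleBlocks: the home block of each vectorized bundle;
    // RootBB: the root bundle's block.
    static void forEachRelevantBlock(ArrayRef<BasicBlock *> BundleBlocks,
                                     BasicBlock *RootBB) {
      // Blocks forward-reachable from any bundle's block. depth_first_ext
      // records every block it visits in the set passed to it, so the set
      // itself is the result.
      df_iterator_default_set<const BasicBlock *> ReachableFromLeaves;
      for (BasicBlock *BB : BundleBlocks)
        for (const BasicBlock *X : depth_first_ext(BB, ReachableFromLeaves))
          (void)X;

      // Blocks that can reach the root, found by walking the reversed CFG.
      // The intersection is exactly the blocks that can lie on a path from
      // some bundle to the root, i.e. where bundles can be live.
      DenseSet<const BasicBlock *> Reachable;
      for (const BasicBlock *X : inverse_depth_first(RootBB))
        Reachable.insert(X);
      set_intersect(Reachable, ReachableFromLeaves);

      // Post order guarantees all uses of a bundle are seen before its
      // definition, so one backward scan per block keeps the live set exact.
      for (BasicBlock *BB : post_order(RootBB->getParent()))
        if (Reachable.contains(BB))
          for (Instruction &I : reverse(*BB))
            (void)I; // per-instruction live-set/call bookkeeping goes here
    }

Unlike the dominance sort this replaces, nothing here depends on the relative order of bundles within a block; the backward scan of each block recovers that order for free.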

Performance improvements:
- Reduces execution time of SPEC CPU benchmark 544.nab_r by 9.92%
- Reduces code size of 508.namd by 1.73%

This optimization improves vectorization decisions by making spill cost estimation more accurate, particularly for code with complex control flow.

From a6300f77974b9a706eec966af1f1835173251623 Mon Sep 17 00:00:00 2001
From: "Mikhail R. Gadelha" <mikhail at igalia.com>
Date: Wed, 19 Feb 2025 12:38:54 -0300
Subject: [PATCH] [SLP] Improve block traversal in getSpillCost()

This is a WIP patch due to compile-time regressions of up to 7% on
gcc_r/gcc_s.

Previously, getSpillCost skipped the blocks that lie between two bundles
when traversing instructions backward. If one of those skipped blocks
contains a function call, that call is never counted and the computed
spill cost comes out too low.

The new implementation:
- Uses post_order traversal to visit blocks
- Tracks live entries across basic blocks
- Computes reachable blocks once upfront using depth_first_ext
- Maintains correct cost calculation for diamond-shaped control flow

Performance improvements:
- Reduces execution time of SPEC CPU benchmark 544.nab_r by 9.92%
- Reduces code size of 508.namd by 1.73%

This optimization improves vectorization decisions by making spill cost
estimation more accurate, particularly for code with complex control flow.
---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 126 ++++++++----------
 .../SLPVectorizer/RISCV/math-function.ll      |  16 ++-
 .../SLPVectorizer/RISCV/spillcost.ll          |  22 ++-
 3 files changed, 80 insertions(+), 84 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 3d660b63309d4..2d23307c18839 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -12415,68 +12415,48 @@ InstructionCost BoUpSLP::getSpillCost() {
   InstructionCost Cost = 0;
 
   SmallPtrSet<const TreeEntry *, 4> LiveEntries;
-  const TreeEntry *Prev = nullptr;
-
-  // The entries in VectorizableTree are not necessarily ordered by their
-  // position in basic blocks. Collect them and order them by dominance so later
-  // instructions are guaranteed to be visited first. For instructions in
-  // different basic blocks, we only scan to the beginning of the block, so
-  // their order does not matter, as long as all instructions in a basic block
-  // are grouped together. Using dominance ensures a deterministic order.
-  SmallVector<TreeEntry *, 16> OrderedEntries;
-  for (const auto &TEPtr : VectorizableTree) {
-    if (TEPtr->isGather())
-      continue;
-    OrderedEntries.push_back(TEPtr.get());
-  }
-  llvm::stable_sort(OrderedEntries, [&](const TreeEntry *TA,
-                                        const TreeEntry *TB) {
-    Instruction &A = getLastInstructionInBundle(TA);
-    Instruction &B = getLastInstructionInBundle(TB);
-    auto *NodeA = DT->getNode(A.getParent());
-    auto *NodeB = DT->getNode(B.getParent());
-    assert(NodeA && "Should only process reachable instructions");
-    assert(NodeB && "Should only process reachable instructions");
-    assert((NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
-           "Different nodes should have different DFS numbers");
-    if (NodeA != NodeB)
-      return NodeA->getDFSNumIn() > NodeB->getDFSNumIn();
-    return B.comesBefore(&A);
-  });
 
-  for (const TreeEntry *TE : OrderedEntries) {
-    if (!Prev) {
-      Prev = TE;
+  const TreeEntry *Root = VectorizableTree.front().get();
+  BasicBlock *RootBB = cast<Instruction>(Root->Scalars[0])->getParent();
+
+  // Compute what nodes are reachable from the leaves to the roots
+  df_iterator_default_set<const BasicBlock *> ReachableFromLeaves;
+  for (auto &TE : VectorizableTree) {
+    if (TE->isGather())
       continue;
-    }
+    auto *BB = getLastInstructionInBundle(TE.get()).getParent();
+    for (const BasicBlock *X : depth_first_ext(BB, ReachableFromLeaves))
+      ReachableFromLeaves.insert(X);
+  }
 
-    LiveEntries.erase(Prev);
-    for (unsigned I : seq<unsigned>(Prev->getNumOperands())) {
-      const TreeEntry *Op = getVectorizedOperand(Prev, I);
-      if (!Op)
-        continue;
-      assert(!Op->isGather() && "Expected vectorized operand.");
-      LiveEntries.insert(Op);
-    }
+  DenseSet<const BasicBlock *> Reachable;
+  for (const BasicBlock *X : inverse_depth_first(RootBB))
+    Reachable.insert(X);
+  set_intersect(Reachable, ReachableFromLeaves);
 
-    LLVM_DEBUG({
-      dbgs() << "SLP: #LV: " << LiveEntries.size();
-      for (auto *X : LiveEntries)
-        X->dump();
-      dbgs() << ", Looking at ";
-      TE->dump();
-    });
+  DenseSet<const TreeEntry *> Defined;
 
-    // Now find the sequence of instructions between PrevInst and Inst.
-    unsigned NumCalls = 0;
-    const Instruction *PrevInst = &getLastInstructionInBundle(Prev);
-    BasicBlock::const_reverse_iterator
-        InstIt = ++getLastInstructionInBundle(TE).getIterator().getReverse(),
-        PrevInstIt = PrevInst->getIterator().getReverse();
-    while (InstIt != PrevInstIt) {
-      if (PrevInstIt == PrevInst->getParent()->rend()) {
-        PrevInstIt = getLastInstructionInBundle(TE).getParent()->rbegin();
-        continue;
+  // Iterate the tree from the root, post order so that all uses appear before
+  // definitions.
+  // TODO: LiveEntries are shared across all paths, so this may overestimate.
+  for (BasicBlock *BB : post_order(RootBB->getParent())) {
+    if (!Reachable.contains(BB))
+      continue;
+
+    for (Instruction &I : reverse(*BB)) {
+      for (const auto *TE : getTreeEntries(&I)) {
+        if (TE->isGather())
+          continue;
+        LiveEntries.erase(TE);
+        Defined.insert(TE);
+        for (unsigned Idx : seq<unsigned>(TE->getNumOperands())) {
+          const TreeEntry *Op = getVectorizedOperand(TE, Idx);
+          if (!Op)
+            continue;
+          assert(!Op->isGather() && "Expected vectorized operand.");
+          if (!Defined.contains(Op))
+            LiveEntries.insert(Op);
+        }
       }
 
       auto NoCallIntrinsic = [this](const Instruction *I) {
@@ -12497,26 +12477,24 @@ InstructionCost BoUpSLP::getSpillCost() {
       // Debug information does not impact spill cost.
       // Vectorized calls, represented as vector intrinsics, do not impact spill
       // cost.
-      if (const auto *CB = dyn_cast<CallBase>(&*PrevInstIt);
-          CB && !NoCallIntrinsic(CB) && !isVectorized(CB))
-        NumCalls++;
-
-      ++PrevInstIt;
-    }
+      if (const auto *CB = dyn_cast<CallBase>(&I);
+          CB && !NoCallIntrinsic(CB) && !isVectorized(CB)) {
+        SmallVector<Type *, 4> EntriesTypes;
+        for (const TreeEntry *TE : LiveEntries) {
+          auto *ScalarTy = TE->getMainOp()->getType();
+          auto It = MinBWs.find(TE);
+          if (It != MinBWs.end())
+            ScalarTy =
+                IntegerType::get(ScalarTy->getContext(), It->second.first);
+          EntriesTypes.push_back(
+              getWidenedType(ScalarTy, TE->getVectorFactor()));
+        }
 
-    if (NumCalls) {
-      SmallVector<Type *, 4> EntriesTypes;
-      for (const TreeEntry *TE : LiveEntries) {
-        auto *ScalarTy = TE->getMainOp()->getType();
-        auto It = MinBWs.find(TE);
-        if (It != MinBWs.end())
-          ScalarTy = IntegerType::get(ScalarTy->getContext(), It->second.first);
-        EntriesTypes.push_back(getWidenedType(ScalarTy, TE->getVectorFactor()));
+        LLVM_DEBUG(dbgs() << "SLP: " << LiveEntries.size()
+                          << " entries alive over call:" << I << "\n");
+        Cost += TTI->getCostOfKeepingLiveOverCall(EntriesTypes);
       }
-      Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(EntriesTypes);
     }
-
-    Prev = TE;
   }
 
   return Cost;
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll
index 8cb620f870331..fc71643f6a51d 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll
@@ -1740,7 +1740,9 @@ entry:
 define void @f(i1 %c, ptr %p, ptr %q, ptr %r) {
 ; CHECK-LABEL: define void @f
 ; CHECK-SAME: (i1 [[C:%.*]], ptr [[P:%.*]], ptr [[Q:%.*]], ptr [[R:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[P]], align 8
+; CHECK-NEXT:    [[X0:%.*]] = load i64, ptr [[P]], align 8
+; CHECK-NEXT:    [[P1:%.*]] = getelementptr i64, ptr [[P]], i64 1
+; CHECK-NEXT:    [[X1:%.*]] = load i64, ptr [[P1]], align 8
 ; CHECK-NEXT:    br i1 [[C]], label [[FOO:%.*]], label [[BAR:%.*]]
 ; CHECK:       foo:
 ; CHECK-NEXT:    [[Y0:%.*]] = load float, ptr [[R]], align 4
@@ -1751,12 +1753,16 @@ define void @f(i1 %c, ptr %p, ptr %q, ptr %r) {
 ; CHECK-NEXT:    [[Z1:%.*]] = call float @fabsf(float [[Z0]])
 ; CHECK-NEXT:    br label [[BAZ]]
 ; CHECK:       baz:
-; CHECK-NEXT:    store <2 x i64> [[TMP1]], ptr [[Q]], align 8
+; CHECK-NEXT:    store i64 [[X0]], ptr [[Q]], align 8
+; CHECK-NEXT:    [[Q1:%.*]] = getelementptr i64, ptr [[Q]], i64 1
+; CHECK-NEXT:    store i64 [[X1]], ptr [[Q1]], align 8
 ; CHECK-NEXT:    ret void
 ;
 ; DEFAULT-LABEL: define void @f
 ; DEFAULT-SAME: (i1 [[C:%.*]], ptr [[P:%.*]], ptr [[Q:%.*]], ptr [[R:%.*]]) #[[ATTR1]] {
-; DEFAULT-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[P]], align 8
+; DEFAULT-NEXT:    [[X0:%.*]] = load i64, ptr [[P]], align 8
+; DEFAULT-NEXT:    [[P1:%.*]] = getelementptr i64, ptr [[P]], i64 1
+; DEFAULT-NEXT:    [[X1:%.*]] = load i64, ptr [[P1]], align 8
 ; DEFAULT-NEXT:    br i1 [[C]], label [[FOO:%.*]], label [[BAR:%.*]]
 ; DEFAULT:       foo:
 ; DEFAULT-NEXT:    [[Y0:%.*]] = load float, ptr [[R]], align 4
@@ -1767,7 +1773,9 @@ define void @f(i1 %c, ptr %p, ptr %q, ptr %r) {
 ; DEFAULT-NEXT:    [[Z1:%.*]] = call float @fabsf(float [[Z0]])
 ; DEFAULT-NEXT:    br label [[BAZ]]
 ; DEFAULT:       baz:
-; DEFAULT-NEXT:    store <2 x i64> [[TMP1]], ptr [[Q]], align 8
+; DEFAULT-NEXT:    store i64 [[X0]], ptr [[Q]], align 8
+; DEFAULT-NEXT:    [[Q1:%.*]] = getelementptr i64, ptr [[Q]], i64 1
+; DEFAULT-NEXT:    store i64 [[X1]], ptr [[Q1]], align 8
 ; DEFAULT-NEXT:    ret void
 ;
   %x0 = load i64, ptr %p
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/spillcost.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/spillcost.ll
index b0c25bc4cc1f2..5e91616c2beb9 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/spillcost.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/spillcost.ll
@@ -7,7 +7,9 @@ declare void @g()
 define void @f0(i1 %c, ptr %p, ptr %q) {
 ; CHECK-LABEL: define void @f0(
 ; CHECK-SAME: i1 [[C:%.*]], ptr [[P:%.*]], ptr [[Q:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[P]], align 8
+; CHECK-NEXT:    [[X0:%.*]] = load i64, ptr [[P]], align 8
+; CHECK-NEXT:    [[P1:%.*]] = getelementptr i64, ptr [[P]], i64 1
+; CHECK-NEXT:    [[X1:%.*]] = load i64, ptr [[P1]], align 8
 ; CHECK-NEXT:    br i1 [[C]], label %[[FOO:.*]], label %[[BAR:.*]]
 ; CHECK:       [[FOO]]:
 ; CHECK-NEXT:    call void @g()
@@ -20,7 +22,9 @@ define void @f0(i1 %c, ptr %p, ptr %q) {
 ; CHECK-NEXT:    call void @g()
 ; CHECK-NEXT:    br label %[[BAZ]]
 ; CHECK:       [[BAZ]]:
-; CHECK-NEXT:    store <2 x i64> [[TMP1]], ptr [[Q]], align 8
+; CHECK-NEXT:    store i64 [[X0]], ptr [[Q]], align 8
+; CHECK-NEXT:    [[Q1:%.*]] = getelementptr i64, ptr [[Q]], i64 1
+; CHECK-NEXT:    store i64 [[X1]], ptr [[Q1]], align 8
 ; CHECK-NEXT:    ret void
 ;
   %x0 = load i64, ptr %p
@@ -50,10 +54,13 @@ define void @f1(i1 %c, ptr %p, ptr %q, ptr %r) {
 ; CHECK-LABEL: define void @f1(
 ; CHECK-SAME: i1 [[C:%.*]], ptr [[P:%.*]], ptr [[Q:%.*]], ptr [[R:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[P]], align 8
+; CHECK-NEXT:    [[X0:%.*]] = load i64, ptr [[P]], align 8
+; CHECK-NEXT:    [[P1:%.*]] = getelementptr i64, ptr [[P]], i64 1
+; CHECK-NEXT:    [[X1:%.*]] = load i64, ptr [[P1]], align 8
 ; CHECK-NEXT:    br i1 [[C]], label %[[FOO:.*]], label %[[BAR:.*]]
 ; CHECK:       [[FOO]]:
-; CHECK-NEXT:    [[TMP1:%.*]] = add <2 x i64> [[TMP0]], splat (i64 1)
+; CHECK-NEXT:    [[Y0:%.*]] = add i64 [[X0]], 1
+; CHECK-NEXT:    [[Y1:%.*]] = add i64 [[X1]], 1
 ; CHECK-NEXT:    br label %[[BAZ:.*]]
 ; CHECK:       [[BAR]]:
 ; CHECK-NEXT:    call void @g()
@@ -61,8 +68,11 @@ define void @f1(i1 %c, ptr %p, ptr %q, ptr %r) {
 ; CHECK-NEXT:    call void @g()
 ; CHECK-NEXT:    br label %[[BAZ]]
 ; CHECK:       [[BAZ]]:
-; CHECK-NEXT:    [[TMP2:%.*]] = phi <2 x i64> [ [[TMP1]], %[[FOO]] ], [ [[TMP0]], %[[BAR]] ]
-; CHECK-NEXT:    store <2 x i64> [[TMP2]], ptr [[Q]], align 8
+; CHECK-NEXT:    [[PHI0:%.*]] = phi i64 [ [[Y0]], %[[FOO]] ], [ [[X0]], %[[BAR]] ]
+; CHECK-NEXT:    [[PHI1:%.*]] = phi i64 [ [[Y1]], %[[FOO]] ], [ [[X1]], %[[BAR]] ]
+; CHECK-NEXT:    store i64 [[PHI0]], ptr [[Q]], align 8
+; CHECK-NEXT:    [[Q1:%.*]] = getelementptr i64, ptr [[Q]], i64 1
+; CHECK-NEXT:    store i64 [[PHI1]], ptr [[Q1]], align 8
 ; CHECK-NEXT:    ret void
 ;
 entry:
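
The test updates above show the intended behavioral change: in @f0 and @f1 (spillcost.ll) and @f (math-function.ll), the <2 x i64> bundle would have to stay live across calls on the arms of the diamond, and now that those calls are actually visited, the spill cost tips the decision back to scalar code. To re-run just the affected tests locally, something like this should work (the llvm-lit path below assumes an in-tree build directory named build):

    build/bin/llvm-lit -v \
      llvm/test/Transforms/SLPVectorizer/RISCV/spillcost.ll \
      llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll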


