[llvm] [SLP] Improve block traversal in getSpillCost() (PR #128620)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Feb 24 19:01:35 PST 2025
llvmbot wrote:
@llvm/pr-subscribers-llvm-transforms
Author: Mikhail R. Gadelha (mikhailramalho)
This is a WIP patch due to compile-time regressions of up to 7% on gcc_r/gcc_s.
Previously, getSpillCost skipped the blocks that lie between two bundles when traversing instructions backward. If one of the skipped blocks contained a function call, the existing logic produced an incorrect spill cost.
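To make the failure mode concrete, here is a rough C-level back-translation of the `f0` regression test from the diff below. The actual test is LLVM IR; this source-level shape is an assumption for illustration only:

```cpp
extern "C" void g(); // opaque call, as in the test

// x0 and x1 are live across a call to g() on either path, so a spill cost
// must be charged no matter which in-between block the backward walk visits.
void f0(bool c, long *p, long *q) {
  long x0 = p[0]; // candidate load bundle
  long x1 = p[1];
  if (c)
    g(); // call in one successor block
  else
    g(); // call in the other; the old pairwise walk could skip this block
  q[0] = x0; // candidate store bundle
  q[1] = x1;
}
```

With the corrected cost, vectorizing the load/store pairs no longer looks free, which is why the CHECK lines in spillcost.ll below change from `<2 x i64>` operations back to scalar ones.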
The new implementation:
- Uses a post_order traversal to visit blocks
- Tracks live tree entries across basic blocks
- Computes the set of relevant reachable blocks once up front using depth_first_ext (see the sketch below)
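Pulled out of the diff for readability, the control-flow skeleton of the new loop looks like this. `VectorizableTree`, `getLastInstructionInBundle`, and `RootBB` are as in the patch; `scanBlockBackwards` is a hypothetical stand-in for the per-block backward scan:

```cpp
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/DepthFirstIterator.h" // depth_first_ext, inverse_depth_first
#include "llvm/ADT/PostOrderIterator.h"  // post_order
#include "llvm/ADT/SetOperations.h"      // set_intersect
#include "llvm/IR/CFG.h"                 // CFG GraphTraits for BasicBlock

// Blocks forward-reachable from any vectorized bundle. The shared visited
// set lets each block be walked at most once across all bundles.
df_iterator_default_set<const BasicBlock *> ReachableFromLeaves;
for (auto &TE : VectorizableTree) {
  if (TE->isGather())
    continue;
  BasicBlock *BB = getLastInstructionInBundle(TE.get()).getParent();
  for (const BasicBlock *X : depth_first_ext(BB, ReachableFromLeaves))
    ReachableFromLeaves.insert(X);
}

// Blocks that can reach the root (a reverse-CFG walk), intersected with the
// forward-reachable set: only these can hold instructions that matter.
DenseSet<const BasicBlock *> Reachable;
for (const BasicBlock *X : inverse_depth_first(RootBB))
  Reachable.insert(X);
set_intersect(Reachable, ReachableFromLeaves);

// Post order over the CFG visits uses before their definitions, so the live
// set can be maintained by erasing an entry once its definition is seen.
for (BasicBlock *BB : post_order(RootBB->getParent())) {
  if (!Reachable.contains(BB))
    continue;
  scanBlockBackwards(BB); // walks instructions in reverse, counting calls
}
```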
Performance improvements:
- Reduces execution time of SPEC CPU benchmark 544.nab_r by 9.92%
- Reduces code size of 508.namd by 1.73%
This optimization improves vectorization decisions by making spill cost estimation more accurate, particularly for code with complex control flow.
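Concretely, the added accuracy comes from charging each call site once with the set of entries live at that exact point, rather than once per bundle pair. The relevant piece of the new inner loop, reassembled from the diff (`getCostOfKeepingLiveOverCall` is the existing TargetTransformInfo hook; `MinBWs`, `getWidenedType`, and `NoCallIntrinsic` are from SLPVectorizer.cpp):

```cpp
// Inside the backward scan over a block: when a real call is found (not a
// debug intrinsic or a vectorized call), every tree entry still live here
// may need to be spilled and reloaded around the call.
if (const auto *CB = dyn_cast<CallBase>(&I);
    CB && !NoCallIntrinsic(CB) && !isVectorized(CB)) {
  SmallVector<Type *, 4> EntriesTypes;
  for (const TreeEntry *TE : LiveEntries) {
    Type *ScalarTy = TE->getMainOp()->getType();
    // Entries narrowed by min-bitwidth analysis are costed at the narrowed
    // integer type, matching how they will actually be materialized.
    if (auto It = MinBWs.find(TE); It != MinBWs.end())
      ScalarTy = IntegerType::get(ScalarTy->getContext(), It->second.first);
    EntriesTypes.push_back(getWidenedType(ScalarTy, TE->getVectorFactor()));
  }
  Cost += TTI->getCostOfKeepingLiveOverCall(EntriesTypes);
}
```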
---
Full diff: https://github.com/llvm/llvm-project/pull/128620.diff
3 Files Affected:
- (modified) llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp (+52-74)
- (modified) llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll (+12-4)
- (modified) llvm/test/Transforms/SLPVectorizer/RISCV/spillcost.ll (+16-6)
``````````diff
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 3d660b63309d4..2d23307c18839 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -12415,68 +12415,48 @@ InstructionCost BoUpSLP::getSpillCost() {
InstructionCost Cost = 0;
SmallPtrSet<const TreeEntry *, 4> LiveEntries;
- const TreeEntry *Prev = nullptr;
-
- // The entries in VectorizableTree are not necessarily ordered by their
- // position in basic blocks. Collect them and order them by dominance so later
- // instructions are guaranteed to be visited first. For instructions in
- // different basic blocks, we only scan to the beginning of the block, so
- // their order does not matter, as long as all instructions in a basic block
- // are grouped together. Using dominance ensures a deterministic order.
- SmallVector<TreeEntry *, 16> OrderedEntries;
- for (const auto &TEPtr : VectorizableTree) {
- if (TEPtr->isGather())
- continue;
- OrderedEntries.push_back(TEPtr.get());
- }
- llvm::stable_sort(OrderedEntries, [&](const TreeEntry *TA,
- const TreeEntry *TB) {
- Instruction &A = getLastInstructionInBundle(TA);
- Instruction &B = getLastInstructionInBundle(TB);
- auto *NodeA = DT->getNode(A.getParent());
- auto *NodeB = DT->getNode(B.getParent());
- assert(NodeA && "Should only process reachable instructions");
- assert(NodeB && "Should only process reachable instructions");
- assert((NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
- "Different nodes should have different DFS numbers");
- if (NodeA != NodeB)
- return NodeA->getDFSNumIn() > NodeB->getDFSNumIn();
- return B.comesBefore(&A);
- });
- for (const TreeEntry *TE : OrderedEntries) {
- if (!Prev) {
- Prev = TE;
+ const TreeEntry *Root = VectorizableTree.front().get();
+ BasicBlock *RootBB = cast<Instruction>(Root->Scalars[0])->getParent();
+
+ // Compute what nodes are reachable from the leaves to the roots
+ df_iterator_default_set<const BasicBlock *> ReachableFromLeaves;
+ for (auto &TE : VectorizableTree) {
+ if (TE->isGather())
continue;
- }
+ auto *BB = getLastInstructionInBundle(TE.get()).getParent();
+ for (const BasicBlock *X : depth_first_ext(BB, ReachableFromLeaves))
+ ReachableFromLeaves.insert(X);
+ }
- LiveEntries.erase(Prev);
- for (unsigned I : seq<unsigned>(Prev->getNumOperands())) {
- const TreeEntry *Op = getVectorizedOperand(Prev, I);
- if (!Op)
- continue;
- assert(!Op->isGather() && "Expected vectorized operand.");
- LiveEntries.insert(Op);
- }
+ DenseSet<const BasicBlock *> Reachable;
+ for (const BasicBlock *X : inverse_depth_first(RootBB))
+ Reachable.insert(X);
+ set_intersect(Reachable, ReachableFromLeaves);
- LLVM_DEBUG({
- dbgs() << "SLP: #LV: " << LiveEntries.size();
- for (auto *X : LiveEntries)
- X->dump();
- dbgs() << ", Looking at ";
- TE->dump();
- });
+ DenseSet<const TreeEntry *> Defined;
- // Now find the sequence of instructions between PrevInst and Inst.
- unsigned NumCalls = 0;
- const Instruction *PrevInst = &getLastInstructionInBundle(Prev);
- BasicBlock::const_reverse_iterator
- InstIt = ++getLastInstructionInBundle(TE).getIterator().getReverse(),
- PrevInstIt = PrevInst->getIterator().getReverse();
- while (InstIt != PrevInstIt) {
- if (PrevInstIt == PrevInst->getParent()->rend()) {
- PrevInstIt = getLastInstructionInBundle(TE).getParent()->rbegin();
- continue;
+ // Iterate the tree from the root, post order so that all uses appear before
+ // definitions.
+ // TODO: LiveEntries are shared across all paths, so this may overestimate.
+ for (BasicBlock *BB : post_order(RootBB->getParent())) {
+ if (!Reachable.contains(BB))
+ continue;
+
+ for (Instruction &I : reverse(*BB)) {
+ for (const auto *TE : getTreeEntries(&I)) {
+ if (TE->isGather())
+ continue;
+ LiveEntries.erase(TE);
+ Defined.insert(TE);
+ for (unsigned Idx : seq<unsigned>(TE->getNumOperands())) {
+ const TreeEntry *Op = getVectorizedOperand(TE, Idx);
+ if (!Op)
+ continue;
+ assert(!Op->isGather() && "Expected vectorized operand.");
+ if (!Defined.contains(Op))
+ LiveEntries.insert(Op);
+ }
}
auto NoCallIntrinsic = [this](const Instruction *I) {
@@ -12497,26 +12477,24 @@ InstructionCost BoUpSLP::getSpillCost() {
// Debug information does not impact spill cost.
// Vectorized calls, represented as vector intrinsics, do not impact spill
// cost.
- if (const auto *CB = dyn_cast<CallBase>(&*PrevInstIt);
- CB && !NoCallIntrinsic(CB) && !isVectorized(CB))
- NumCalls++;
-
- ++PrevInstIt;
- }
+ if (const auto *CB = dyn_cast<CallBase>(&I);
+ CB && !NoCallIntrinsic(CB) && !isVectorized(CB)) {
+ SmallVector<Type *, 4> EntriesTypes;
+ for (const TreeEntry *TE : LiveEntries) {
+ auto *ScalarTy = TE->getMainOp()->getType();
+ auto It = MinBWs.find(TE);
+ if (It != MinBWs.end())
+ ScalarTy =
+ IntegerType::get(ScalarTy->getContext(), It->second.first);
+ EntriesTypes.push_back(
+ getWidenedType(ScalarTy, TE->getVectorFactor()));
+ }
- if (NumCalls) {
- SmallVector<Type *, 4> EntriesTypes;
- for (const TreeEntry *TE : LiveEntries) {
- auto *ScalarTy = TE->getMainOp()->getType();
- auto It = MinBWs.find(TE);
- if (It != MinBWs.end())
- ScalarTy = IntegerType::get(ScalarTy->getContext(), It->second.first);
- EntriesTypes.push_back(getWidenedType(ScalarTy, TE->getVectorFactor()));
+ LLVM_DEBUG(dbgs() << "SLP: " << LiveEntries.size()
+ << " entries alive over call:" << I << "\n");
+ Cost += TTI->getCostOfKeepingLiveOverCall(EntriesTypes);
}
- Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(EntriesTypes);
}
-
- Prev = TE;
}
return Cost;
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll
index 8cb620f870331..fc71643f6a51d 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll
@@ -1740,7 +1740,9 @@ entry:
define void @f(i1 %c, ptr %p, ptr %q, ptr %r) {
; CHECK-LABEL: define void @f
; CHECK-SAME: (i1 [[C:%.*]], ptr [[P:%.*]], ptr [[Q:%.*]], ptr [[R:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[P]], align 8
+; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[P]], align 8
+; CHECK-NEXT: [[P1:%.*]] = getelementptr i64, ptr [[P]], i64 1
+; CHECK-NEXT: [[X1:%.*]] = load i64, ptr [[P1]], align 8
; CHECK-NEXT: br i1 [[C]], label [[FOO:%.*]], label [[BAR:%.*]]
; CHECK: foo:
; CHECK-NEXT: [[Y0:%.*]] = load float, ptr [[R]], align 4
@@ -1751,12 +1753,16 @@ define void @f(i1 %c, ptr %p, ptr %q, ptr %r) {
; CHECK-NEXT: [[Z1:%.*]] = call float @fabsf(float [[Z0]])
; CHECK-NEXT: br label [[BAZ]]
; CHECK: baz:
-; CHECK-NEXT: store <2 x i64> [[TMP1]], ptr [[Q]], align 8
+; CHECK-NEXT: store i64 [[X0]], ptr [[Q]], align 8
+; CHECK-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[Q]], i64 1
+; CHECK-NEXT: store i64 [[X1]], ptr [[Q1]], align 8
; CHECK-NEXT: ret void
;
; DEFAULT-LABEL: define void @f
; DEFAULT-SAME: (i1 [[C:%.*]], ptr [[P:%.*]], ptr [[Q:%.*]], ptr [[R:%.*]]) #[[ATTR1]] {
-; DEFAULT-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[P]], align 8
+; DEFAULT-NEXT: [[X0:%.*]] = load i64, ptr [[P]], align 8
+; DEFAULT-NEXT: [[P1:%.*]] = getelementptr i64, ptr [[P]], i64 1
+; DEFAULT-NEXT: [[X1:%.*]] = load i64, ptr [[P1]], align 8
; DEFAULT-NEXT: br i1 [[C]], label [[FOO:%.*]], label [[BAR:%.*]]
; DEFAULT: foo:
; DEFAULT-NEXT: [[Y0:%.*]] = load float, ptr [[R]], align 4
@@ -1767,7 +1773,9 @@ define void @f(i1 %c, ptr %p, ptr %q, ptr %r) {
; DEFAULT-NEXT: [[Z1:%.*]] = call float @fabsf(float [[Z0]])
; DEFAULT-NEXT: br label [[BAZ]]
; DEFAULT: baz:
-; DEFAULT-NEXT: store <2 x i64> [[TMP1]], ptr [[Q]], align 8
+; DEFAULT-NEXT: store i64 [[X0]], ptr [[Q]], align 8
+; DEFAULT-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[Q]], i64 1
+; DEFAULT-NEXT: store i64 [[X1]], ptr [[Q1]], align 8
; DEFAULT-NEXT: ret void
;
%x0 = load i64, ptr %p
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/spillcost.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/spillcost.ll
index b0c25bc4cc1f2..5e91616c2beb9 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/spillcost.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/spillcost.ll
@@ -7,7 +7,9 @@ declare void @g()
define void @f0(i1 %c, ptr %p, ptr %q) {
; CHECK-LABEL: define void @f0(
; CHECK-SAME: i1 [[C:%.*]], ptr [[P:%.*]], ptr [[Q:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[P]], align 8
+; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[P]], align 8
+; CHECK-NEXT: [[P1:%.*]] = getelementptr i64, ptr [[P]], i64 1
+; CHECK-NEXT: [[X1:%.*]] = load i64, ptr [[P1]], align 8
; CHECK-NEXT: br i1 [[C]], label %[[FOO:.*]], label %[[BAR:.*]]
; CHECK: [[FOO]]:
; CHECK-NEXT: call void @g()
@@ -20,7 +22,9 @@ define void @f0(i1 %c, ptr %p, ptr %q) {
; CHECK-NEXT: call void @g()
; CHECK-NEXT: br label %[[BAZ]]
; CHECK: [[BAZ]]:
-; CHECK-NEXT: store <2 x i64> [[TMP1]], ptr [[Q]], align 8
+; CHECK-NEXT: store i64 [[X0]], ptr [[Q]], align 8
+; CHECK-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[Q]], i64 1
+; CHECK-NEXT: store i64 [[X1]], ptr [[Q1]], align 8
; CHECK-NEXT: ret void
;
%x0 = load i64, ptr %p
@@ -50,10 +54,13 @@ define void @f1(i1 %c, ptr %p, ptr %q, ptr %r) {
; CHECK-LABEL: define void @f1(
; CHECK-SAME: i1 [[C:%.*]], ptr [[P:%.*]], ptr [[Q:%.*]], ptr [[R:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[P]], align 8
+; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[P]], align 8
+; CHECK-NEXT: [[P1:%.*]] = getelementptr i64, ptr [[P]], i64 1
+; CHECK-NEXT: [[X1:%.*]] = load i64, ptr [[P1]], align 8
; CHECK-NEXT: br i1 [[C]], label %[[FOO:.*]], label %[[BAR:.*]]
; CHECK: [[FOO]]:
-; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i64> [[TMP0]], splat (i64 1)
+; CHECK-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1
+; CHECK-NEXT: [[Y1:%.*]] = add i64 [[X1]], 1
; CHECK-NEXT: br label %[[BAZ:.*]]
; CHECK: [[BAR]]:
; CHECK-NEXT: call void @g()
@@ -61,8 +68,11 @@ define void @f1(i1 %c, ptr %p, ptr %q, ptr %r) {
; CHECK-NEXT: call void @g()
; CHECK-NEXT: br label %[[BAZ]]
; CHECK: [[BAZ]]:
-; CHECK-NEXT: [[TMP2:%.*]] = phi <2 x i64> [ [[TMP1]], %[[FOO]] ], [ [[TMP0]], %[[BAR]] ]
-; CHECK-NEXT: store <2 x i64> [[TMP2]], ptr [[Q]], align 8
+; CHECK-NEXT: [[PHI0:%.*]] = phi i64 [ [[Y0]], %[[FOO]] ], [ [[X0]], %[[BAR]] ]
+; CHECK-NEXT: [[PHI1:%.*]] = phi i64 [ [[Y1]], %[[FOO]] ], [ [[X1]], %[[BAR]] ]
+; CHECK-NEXT: store i64 [[PHI0]], ptr [[Q]], align 8
+; CHECK-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[Q]], i64 1
+; CHECK-NEXT: store i64 [[PHI1]], ptr [[Q1]], align 8
; CHECK-NEXT: ret void
;
entry:
``````````
https://github.com/llvm/llvm-project/pull/128620