[llvm] [SLP]Fix/improve getSpillCost analysis (PR #129258)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Feb 28 07:12:08 PST 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-transforms
Author: Alexey Bataev (alexey-bataev)
<details>
<summary>Changes</summary>
The previous implementation could take extra time because it walked over the
same instructions several times. It also lacked proper analysis for
cross-basic-block uses of the vectorized values. This version fixes both
issues.
It walks over the tree and checks the dependencies between entries and their
operands. If there are non-vectorized calls in between, it adds
a single(!) spill cost, because the vector value has to be
spilled/reloaded only once.
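
As a rough standalone model of this traversal (all types and fields here — `Entry`, `Operands`, `CallBetween` — are illustrative stand-ins, not the actual LLVM API):

```cpp
#include <cstddef>
#include <vector>

// Illustrative stand-in for the real TreeEntry graph (not LLVM API).
struct Entry {
  bool IsGather = false;
  std::vector<Entry *> Operands;  // operand entries of this node
  std::vector<bool> CallBetween;  // per operand: non-vectorized call in between?
  int VecSpillCost = 1;           // stand-in for TTI::getCostOfKeepingLiveOverCall
};

// Walk the tree once from the root with an explicit worklist and charge
// exactly one spill/reload per operand edge that crosses a call.
int getSpillCost(Entry *Root) {
  int Cost = 0;
  std::vector<Entry *> Worklist{Root};
  while (!Worklist.empty()) {
    Entry *E = Worklist.back();
    Worklist.pop_back();
    for (std::size_t I = 0; I < E->Operands.size(); ++I) {
      Entry *Op = E->Operands[I];
      if (!Op->IsGather)
        Worklist.push_back(Op); // keep descending through vectorized nodes
      if (E->CallBetween[I])
        Cost += Op->VecSpillCost; // a single spill cost per vector value
    }
  }
  return Cost;
}
```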
This version also caches the analysis for each entry once it has been
computed, and reuses the data found for previously analyzed nodes instead of
repeating the work.
Finally, there is an internal limit: if the number of instructions between a node and its operands is too large (greater than ScheduleRegionSizeBudget / VectorizableTree.size()), the analysis conservatively assumes that a spill is required. This keeps compile time under control.
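
For a sense of the numbers (hypothetical values; the default of the `-slp-schedule-budget` option is assumed here, not taken from this patch):

```cpp
// Hypothetical numbers; 100000 is the assumed -slp-schedule-budget default.
unsigned ScheduleRegionSizeBudget = 100000;
unsigned TreeSize = 50;                    // VectorizableTree.size()
unsigned BudgetLimit = ScheduleRegionSizeBudget / TreeSize; // = 2000
// Scanning more than 2000 instructions between a node and its operands
// gives up and conservatively assumes a spill is required.
```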
---
Full diff: https://github.com/llvm/llvm-project/pull/129258.diff
3 Files Affected:
- (modified) llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp (+182-90)
- (modified) llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll (+12-4)
- (modified) llvm/test/Transforms/SLPVectorizer/RISCV/spillcost.ll (+7-3)
``````````diff
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 2d343ca31f07b..6c48ae7d645ab 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -12448,109 +12448,201 @@ InstructionCost BoUpSLP::getSpillCost() {
// (for example, if spills and fills are required).
InstructionCost Cost = 0;
- SmallPtrSet<const TreeEntry *, 4> LiveEntries;
- const TreeEntry *Prev = nullptr;
-
- // The entries in VectorizableTree are not necessarily ordered by their
- // position in basic blocks. Collect them and order them by dominance so later
- // instructions are guaranteed to be visited first. For instructions in
- // different basic blocks, we only scan to the beginning of the block, so
- // their order does not matter, as long as all instructions in a basic block
- // are grouped together. Using dominance ensures a deterministic order.
- SmallVector<TreeEntry *, 16> OrderedEntries;
- for (const auto &TEPtr : VectorizableTree) {
- if (TEPtr->isGather())
- continue;
- OrderedEntries.push_back(TEPtr.get());
- }
- llvm::stable_sort(OrderedEntries, [&](const TreeEntry *TA,
- const TreeEntry *TB) {
- Instruction &A = getLastInstructionInBundle(TA);
- Instruction &B = getLastInstructionInBundle(TB);
- auto *NodeA = DT->getNode(A.getParent());
- auto *NodeB = DT->getNode(B.getParent());
- assert(NodeA && "Should only process reachable instructions");
- assert(NodeB && "Should only process reachable instructions");
- assert((NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
- "Different nodes should have different DFS numbers");
- if (NodeA != NodeB)
- return NodeA->getDFSNumIn() > NodeB->getDFSNumIn();
- return B.comesBefore(&A);
- });
-
- for (const TreeEntry *TE : OrderedEntries) {
- if (!Prev) {
- Prev = TE;
- continue;
- }
+ const TreeEntry *Root = VectorizableTree.front().get();
+ if (Root->isGather())
+ return Cost;
- LiveEntries.erase(Prev);
- for (unsigned I : seq<unsigned>(Prev->getNumOperands())) {
- const TreeEntry *Op = getVectorizedOperand(Prev, I);
- if (!Op)
- continue;
- assert(!Op->isGather() && "Expected vectorized operand.");
- LiveEntries.insert(Op);
+ SmallDenseMap<const TreeEntry *, SmallVector<const TreeEntry *>>
+ EntriesToOperands;
+ SmallDenseMap<const TreeEntry *, Instruction *> EntriesToLastInstruction;
+ SmallPtrSet<const Instruction *, 8> LastInstructions;
+ for (const auto &TEPtr : VectorizableTree) {
+ if (!TEPtr->isGather()) {
+ Instruction *LastInst = &getLastInstructionInBundle(TEPtr.get());
+ EntriesToLastInstruction.try_emplace(TEPtr.get(), LastInst);
+ LastInstructions.insert(LastInst);
}
+ if (TEPtr->UserTreeIndex)
+ EntriesToOperands[TEPtr->UserTreeIndex.UserTE].push_back(TEPtr.get());
+ }
- LLVM_DEBUG({
- dbgs() << "SLP: #LV: " << LiveEntries.size();
- for (auto *X : LiveEntries)
- X->dump();
- dbgs() << ", Looking at ";
- TE->dump();
- });
-
- // Now find the sequence of instructions between PrevInst and Inst.
- unsigned NumCalls = 0;
- const Instruction *PrevInst = &getLastInstructionInBundle(Prev);
- BasicBlock::const_reverse_iterator
- InstIt = ++getLastInstructionInBundle(TE).getIterator().getReverse(),
- PrevInstIt = PrevInst->getIterator().getReverse();
- while (InstIt != PrevInstIt) {
- if (PrevInstIt == PrevInst->getParent()->rend()) {
- PrevInstIt = getLastInstructionInBundle(TE).getParent()->rbegin();
- continue;
- }
-
- auto NoCallIntrinsic = [this](const Instruction *I) {
- const auto *II = dyn_cast<IntrinsicInst>(I);
- if (!II)
- return false;
- if (II->isAssumeLikeIntrinsic())
- return true;
- IntrinsicCostAttributes ICA(II->getIntrinsicID(), *II);
- InstructionCost IntrCost =
- TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput);
- InstructionCost CallCost =
- TTI->getCallInstrCost(nullptr, II->getType(), ICA.getArgTypes(),
- TTI::TCK_RecipThroughput);
- return IntrCost < CallCost;
- };
+ auto NoCallIntrinsic = [this](const Instruction *I) {
+ const auto *II = dyn_cast<IntrinsicInst>(I);
+ if (!II)
+ return false;
+ if (II->isAssumeLikeIntrinsic())
+ return true;
+ IntrinsicCostAttributes ICA(II->getIntrinsicID(), *II);
+ InstructionCost IntrCost =
+ TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput);
+ InstructionCost CallCost = TTI->getCallInstrCost(
+ nullptr, II->getType(), ICA.getArgTypes(), TTI::TCK_RecipThroughput);
+ return IntrCost < CallCost;
+ };
+ SmallDenseMap<const Instruction *, PointerIntPair<const Instruction *, 1>>
+ CheckedInstructions;
+ unsigned Budget = 0;
+ const unsigned BudgetLimit =
+ ScheduleRegionSizeBudget / VectorizableTree.size();
+ auto CheckForNonVecCallsInSameBlock = [&](Instruction *First,
+ Instruction *Last) {
+ assert(First->getParent() == Last->getParent() &&
+ "Expected instructions in same block.");
+ if (Last == First || Last->comesBefore(First))
+ return true;
+ BasicBlock::const_reverse_iterator InstIt =
+ ++First->getIterator().getReverse(),
+ PrevInstIt =
+ Last->getIterator().getReverse();
+ auto It = CheckedInstructions.find(Last);
+ if (It != CheckedInstructions.end()) {
+ const Instruction *Checked = It->second.getPointer();
+ if (Checked == First || Checked->comesBefore(First))
+ return It->second.getInt() != 0;
+ PrevInstIt = Checked->getIterator().getReverse();
+ }
+ SmallVector<const Instruction *> LastInstsInRange(1, Last);
+ while (InstIt != PrevInstIt && Budget <= BudgetLimit) {
// Debug information does not impact spill cost.
// Vectorized calls, represented as vector intrinsics, do not impact spill
// cost.
if (const auto *CB = dyn_cast<CallBase>(&*PrevInstIt);
- CB && !NoCallIntrinsic(CB) && !isVectorized(CB))
- NumCalls++;
+ CB && !NoCallIntrinsic(CB) && !isVectorized(CB)) {
+ for (const Instruction *LastInst : LastInstsInRange)
+ CheckedInstructions.try_emplace(LastInst, &*PrevInstIt, 0);
+ return false;
+ }
+ if (LastInstructions.contains(&*PrevInstIt))
+ LastInstsInRange.push_back(&*PrevInstIt);
++PrevInstIt;
+ ++Budget;
}
-
- if (NumCalls) {
- SmallVector<Type *, 4> EntriesTypes;
- for (const TreeEntry *TE : LiveEntries) {
- auto *ScalarTy = TE->getMainOp()->getType();
- auto It = MinBWs.find(TE);
- if (It != MinBWs.end())
- ScalarTy = IntegerType::get(ScalarTy->getContext(), It->second.first);
- EntriesTypes.push_back(getWidenedType(ScalarTy, TE->getVectorFactor()));
+ for (const Instruction *LastInst : LastInstsInRange)
+ CheckedInstructions.try_emplace(
+ LastInst, PrevInstIt == InstIt ? First : &*PrevInstIt,
+ Budget <= BudgetLimit ? 1 : 0);
+ return Budget <= BudgetLimit;
+ };
+ auto AddCosts = [&](const TreeEntry *Op) {
+ Type *ScalarTy = Op->Scalars.front()->getType();
+ auto It = MinBWs.find(Op);
+ if (It != MinBWs.end())
+ ScalarTy = IntegerType::get(ScalarTy->getContext(), It->second.first);
+ auto *VecTy = getWidenedType(ScalarTy, Op->getVectorFactor());
+ Cost += TTI->getCostOfKeepingLiveOverCall(VecTy);
+ if (ScalarTy->isVectorTy()) {
+ // Handle revec dead vector instructions.
+ Cost -= Op->Scalars.size() * TTI->getCostOfKeepingLiveOverCall(ScalarTy);
+ }
+ };
+ SmallDenseMap<const BasicBlock *, bool> BlocksToCalls;
+ auto CheckPredecessors = [&](BasicBlock *Root, BasicBlock *Pred,
+ BasicBlock *OpParent) {
+ SmallVector<BasicBlock *> Worklist;
+ if (Pred)
+ Worklist.push_back(Pred);
+ else
+ Worklist.append(pred_begin(Root), pred_end(Root));
+ SmallPtrSet<const BasicBlock *, 16> Visited;
+ while (!Worklist.empty()) {
+ BasicBlock *BB = Worklist.pop_back_val();
+ if (BB == OpParent || !Visited.insert(BB).second)
+ continue;
+ if (auto It = BlocksToCalls.find(BB); It != BlocksToCalls.end()) {
+ Worklist.append(pred_begin(BB), pred_end(BB));
+ if (!It->second)
+ return false;
+ continue;
+ }
+ BlocksToCalls[BB] = false;
+ if (BB->sizeWithoutDebug() > ScheduleRegionSizeBudget)
+ return false;
+ Budget += BB->sizeWithoutDebug();
+ if (Budget > BudgetLimit)
+ return false;
+ if (!CheckForNonVecCallsInSameBlock(&*BB->getFirstNonPHIOrDbgOrAlloca(),
+ BB->getTerminator()))
+ return false;
+ BlocksToCalls[BB] = true;
+ Worklist.append(pred_begin(BB), pred_end(BB));
+ }
+ return true;
+ };
+ SmallVector<const TreeEntry *> LiveEntries(1, Root);
+ while (!LiveEntries.empty()) {
+ const TreeEntry *Entry = LiveEntries.pop_back_val();
+ SmallVector<const TreeEntry *> Operands = EntriesToOperands.lookup(Entry);
+ if (Operands.empty())
+ continue;
+ Instruction *LastInst = EntriesToLastInstruction.at(Entry);
+ for (const TreeEntry *Op : Operands) {
+ if (!Op->isGather())
+ LiveEntries.push_back(Op);
+ BasicBlock *Parent = Entry->getMainOp()->getParent();
+ if ((Entry->getOpcode() != Instruction::PHI && Op->isGather()) ||
+ (Op->isGather() && allConstant(Op->Scalars)))
+ continue;
+ Budget = 0;
+ BasicBlock *Pred = Entry->getOpcode() == Instruction::PHI
+ ? cast<PHINode>(Entry->getMainOp())
+ ->getIncomingBlock(Op->UserTreeIndex.EdgeIdx)
+ : nullptr;
+ BasicBlock *OpParent;
+ Instruction *OpLastInst;
+ if (Op->isGather()) {
+ assert(Entry->getOpcode() == Instruction::PHI &&
+ "Expected phi node only.");
+ OpParent = cast<PHINode>(Entry->getMainOp())
+ ->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
+ OpLastInst = OpParent->getTerminator();
+ for (Value *V : Op->Scalars) {
+ auto *Inst = dyn_cast<Instruction>(V);
+ if (!Inst)
+ continue;
+ if (isVectorized(V)) {
+ OpParent = Inst->getParent();
+ OpLastInst = Inst;
+ break;
+ }
+ }
+ } else {
+ OpLastInst = EntriesToLastInstruction.at(Op);
+ OpParent = Op->getMainOp()->getParent();
+ }
+ // Check the call instructions within the same basic blocks.
+ if (OpParent == Parent) {
+ if (Entry->getOpcode() == Instruction::PHI) {
+ if (!CheckForNonVecCallsInSameBlock(LastInst, OpLastInst))
+ AddCosts(Op);
+ continue;
+ }
+ if (!CheckForNonVecCallsInSameBlock(OpLastInst, LastInst))
+ AddCosts(Op);
+ continue;
+ }
+ // Check for call instruction in between blocks.
+ // 1. Check entry's block to the head.
+ if (Entry->getOpcode() != Instruction::PHI &&
+ !CheckForNonVecCallsInSameBlock(
+ &*LastInst->getParent()->getFirstNonPHIOrDbgOrAlloca(),
+ LastInst)) {
+ AddCosts(Op);
+ continue;
+ }
+ // 2. Check op's block from the end.
+ if (!CheckForNonVecCallsInSameBlock(OpLastInst,
+ OpParent->getTerminator())) {
+ AddCosts(Op);
+ continue;
+ }
+ // 3. Check the predecessors of entry's block till op's block.
+ if (!CheckPredecessors(Parent, Pred, OpParent)) {
+ AddCosts(Op);
+ continue;
}
- Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(EntriesTypes);
}
-
- Prev = TE;
}
return Cost;
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll
index 8cb620f870331..fc71643f6a51d 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll
@@ -1740,7 +1740,9 @@ entry:
define void @f(i1 %c, ptr %p, ptr %q, ptr %r) {
; CHECK-LABEL: define void @f
; CHECK-SAME: (i1 [[C:%.*]], ptr [[P:%.*]], ptr [[Q:%.*]], ptr [[R:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[P]], align 8
+; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[P]], align 8
+; CHECK-NEXT: [[P1:%.*]] = getelementptr i64, ptr [[P]], i64 1
+; CHECK-NEXT: [[X1:%.*]] = load i64, ptr [[P1]], align 8
; CHECK-NEXT: br i1 [[C]], label [[FOO:%.*]], label [[BAR:%.*]]
; CHECK: foo:
; CHECK-NEXT: [[Y0:%.*]] = load float, ptr [[R]], align 4
@@ -1751,12 +1753,16 @@ define void @f(i1 %c, ptr %p, ptr %q, ptr %r) {
; CHECK-NEXT: [[Z1:%.*]] = call float @fabsf(float [[Z0]])
; CHECK-NEXT: br label [[BAZ]]
; CHECK: baz:
-; CHECK-NEXT: store <2 x i64> [[TMP1]], ptr [[Q]], align 8
+; CHECK-NEXT: store i64 [[X0]], ptr [[Q]], align 8
+; CHECK-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[Q]], i64 1
+; CHECK-NEXT: store i64 [[X1]], ptr [[Q1]], align 8
; CHECK-NEXT: ret void
;
; DEFAULT-LABEL: define void @f
; DEFAULT-SAME: (i1 [[C:%.*]], ptr [[P:%.*]], ptr [[Q:%.*]], ptr [[R:%.*]]) #[[ATTR1]] {
-; DEFAULT-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[P]], align 8
+; DEFAULT-NEXT: [[X0:%.*]] = load i64, ptr [[P]], align 8
+; DEFAULT-NEXT: [[P1:%.*]] = getelementptr i64, ptr [[P]], i64 1
+; DEFAULT-NEXT: [[X1:%.*]] = load i64, ptr [[P1]], align 8
; DEFAULT-NEXT: br i1 [[C]], label [[FOO:%.*]], label [[BAR:%.*]]
; DEFAULT: foo:
; DEFAULT-NEXT: [[Y0:%.*]] = load float, ptr [[R]], align 4
@@ -1767,7 +1773,9 @@ define void @f(i1 %c, ptr %p, ptr %q, ptr %r) {
; DEFAULT-NEXT: [[Z1:%.*]] = call float @fabsf(float [[Z0]])
; DEFAULT-NEXT: br label [[BAZ]]
; DEFAULT: baz:
-; DEFAULT-NEXT: store <2 x i64> [[TMP1]], ptr [[Q]], align 8
+; DEFAULT-NEXT: store i64 [[X0]], ptr [[Q]], align 8
+; DEFAULT-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[Q]], i64 1
+; DEFAULT-NEXT: store i64 [[X1]], ptr [[Q1]], align 8
; DEFAULT-NEXT: ret void
;
%x0 = load i64, ptr %p
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/spillcost.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/spillcost.ll
index b0c25bc4cc1f2..55978b61d54e8 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/spillcost.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/spillcost.ll
@@ -7,7 +7,9 @@ declare void @g()
define void @f0(i1 %c, ptr %p, ptr %q) {
; CHECK-LABEL: define void @f0(
; CHECK-SAME: i1 [[C:%.*]], ptr [[P:%.*]], ptr [[Q:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[P]], align 8
+; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[P]], align 8
+; CHECK-NEXT: [[P1:%.*]] = getelementptr i64, ptr [[P]], i64 1
+; CHECK-NEXT: [[X1:%.*]] = load i64, ptr [[P1]], align 8
; CHECK-NEXT: br i1 [[C]], label %[[FOO:.*]], label %[[BAR:.*]]
; CHECK: [[FOO]]:
; CHECK-NEXT: call void @g()
@@ -20,7 +22,9 @@ define void @f0(i1 %c, ptr %p, ptr %q) {
; CHECK-NEXT: call void @g()
; CHECK-NEXT: br label %[[BAZ]]
; CHECK: [[BAZ]]:
-; CHECK-NEXT: store <2 x i64> [[TMP1]], ptr [[Q]], align 8
+; CHECK-NEXT: store i64 [[X0]], ptr [[Q]], align 8
+; CHECK-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[Q]], i64 1
+; CHECK-NEXT: store i64 [[X1]], ptr [[Q1]], align 8
; CHECK-NEXT: ret void
;
%x0 = load i64, ptr %p
@@ -45,7 +49,7 @@ baz:
ret void
}
-; Shouldn't be vectorized
+; Should be vectorized - just one spill of TMP0
define void @f1(i1 %c, ptr %p, ptr %q, ptr %r) {
; CHECK-LABEL: define void @f1(
; CHECK-SAME: i1 [[C:%.*]], ptr [[P:%.*]], ptr [[Q:%.*]], ptr [[R:%.*]]) #[[ATTR0]] {
``````````
</details>
https://github.com/llvm/llvm-project/pull/129258