[llvm] [SLP]Exclude non-profitable subtrees. (PR #162018)
via llvm-commits
llvm-commits at lists.llvm.org
Sun Oct 5 07:51:42 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-risc-v
Author: Alexey Bataev (alexey-bataev)
<details>
<summary>Changes</summary>
Initial support for SLP tree throttling: trims non-profitable subtrees in
order to maximize performance gains.
Trees with gathered loads are not supported yet, because they are graphs
rather than true trees. Analysis for them should be added later.
---
Patch is 179.55 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/162018.diff
32 Files Affected:
- (modified) llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp (+315-64)
- (modified) llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll (+4-5)
- (modified) llvm/test/Transforms/SLPVectorizer/AArch64/div.ll (+11-27)
- (modified) llvm/test/Transforms/SLPVectorizer/AArch64/gather-buildvector-with-minbitwidth-user.ll (+29-63)
- (modified) llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll (+10-8)
- (modified) llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll (+8-6)
- (modified) llvm/test/Transforms/SLPVectorizer/RISCV/partial-vec-invalid-cost.ll (+7-9)
- (modified) llvm/test/Transforms/SLPVectorizer/RISCV/reordered-buildvector-scalars.ll (+62-61)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll (+4-6)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll (+10-7)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll (+10-7)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/arith-fp-inseltpoison.ll (+32-19)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/arith-fp.ll (+32-19)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/buildvector-shuffle.ll (+9-3)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/buildvectors-parent-phi-nodes.ll (+9-9)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/c-ray.ll (+243-83)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/delayed-gather-emission.ll (+9-9)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/entry-no-bundle-but-extra-use-on-vec.ll (+10-9)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/gather-with-cmp-user.ll (+8-5)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/original-inst-scheduled-after-copyable.ll (+14-12)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/phi-node-with-cycle.ll (+12-11)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll (+137-26)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/reduction2.ll (+8-10)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/reschedule-only-scheduled.ll (+14-18)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/same-last-instruction-different-parents.ll (+10-9)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/scalarize-ctlz.ll (+29-48)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/split-node-reorder-node-with-ops.ll (+18-12)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/subvector-minbitwidth-unsigned-value.ll (+12-6)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/trunced-buildvector-scalar-extended.ll (+8-1)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll (+8-10)
- (modified) llvm/test/Transforms/SLPVectorizer/gather_extract_from_vectorbuild.ll (+24-11)
- (modified) llvm/test/Transforms/SLPVectorizer/vectorize-reorder-alt-shuffle.ll (+44-17)
``````````diff
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index fedca65d241e8..b633dd4d9fdb0 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2005,9 +2005,15 @@ class BoUpSLP {
/// holding live values over call sites.
InstructionCost getSpillCost();
+ /// Calculates the cost of the subtrees, trims non-profitable ones and returns
+ /// final cost.
+ InstructionCost
+ calculateTreeCostAndTrimNonProfitable(ArrayRef<Value *> VectorizedVals = {});
+
/// \returns the vectorization cost of the subtree that starts at \p VL.
/// A negative number means that this is profitable.
- InstructionCost getTreeCost(ArrayRef<Value *> VectorizedVals = {},
+ InstructionCost getTreeCost(InstructionCost TreeCost,
+ ArrayRef<Value *> VectorizedVals = {},
InstructionCost ReductionCost = TTI::TCC_Free);
/// Construct a vectorizable tree that starts at \p Roots, ignoring users for
@@ -2080,6 +2086,8 @@ class BoUpSLP {
void deleteTree() {
VectorizableTree.clear();
ScalarToTreeEntries.clear();
+ DeletedNodes.clear();
+ TransformedToGatherNodes.clear();
OperandsToTreeEntry.clear();
ScalarsInSplitNodes.clear();
MustGather.clear();
@@ -4511,6 +4519,13 @@ class BoUpSLP {
/// Maps a specific scalar to its tree entry(ies).
SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarToTreeEntries;
+ /// List of deleted non-profitable nodes.
+ SmallPtrSet<const TreeEntry *, 8> DeletedNodes;
+
+ /// List of nodes, transformed to gathered, with their conservative
+ /// gather/buildvector cost estimation.
+ SmallDenseMap<const TreeEntry *, InstructionCost> TransformedToGatherNodes;
+
/// Maps the operand index and entry to the corresponding tree entry.
SmallDenseMap<std::pair<const TreeEntry *, unsigned>, TreeEntry *>
OperandsToTreeEntry;
@@ -8697,7 +8712,9 @@ void BoUpSLP::buildExternalUses(
TreeEntry *Entry = TEPtr.get();
// No need to handle users of gathered values.
- if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize)
+ if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize ||
+ DeletedNodes.contains(Entry) ||
+ TransformedToGatherNodes.contains(Entry))
continue;
// For each lane:
@@ -8744,7 +8761,11 @@ void BoUpSLP::buildExternalUses(
// Skip in-tree scalars that become vectors
if (ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U);
- !UseEntries.empty()) {
+ !UseEntries.empty() &&
+ any_of(UseEntries, [this](const TreeEntry *UseEntry) {
+ return !DeletedNodes.contains(UseEntry) &&
+ !TransformedToGatherNodes.contains(UseEntry);
+ })) {
// Some in-tree scalars will remain as scalar in vectorized
// instructions. If that is the case, the one in FoundLane will
// be used.
@@ -8752,6 +8773,9 @@ void BoUpSLP::buildExternalUses(
isa<LoadInst, StoreInst>(UserInst)) ||
isa<CallInst>(UserInst)) ||
all_of(UseEntries, [&](TreeEntry *UseEntry) {
+ if (DeletedNodes.contains(UseEntry) ||
+ TransformedToGatherNodes.contains(UseEntry))
+ return true;
return UseEntry->State == TreeEntry::ScatterVectorize ||
!doesInTreeUserNeedToExtract(
Scalar, getRootEntryInstruction(*UseEntry), TLI,
@@ -14208,7 +14232,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
unsigned EntryVF = E->getVectorFactor();
auto *FinalVecTy = getWidenedType(ScalarTy, EntryVF);
- if (E->isGather()) {
+ if (E->isGather() || TransformedToGatherNodes.contains(E)) {
if (allConstant(VL))
return 0;
if (isa<InsertElementInst>(VL[0]))
@@ -15892,26 +15916,16 @@ static T *performExtractsShuffleAction(
return Prev;
}
-namespace {
-/// Data type for handling buildvector sequences with the reused scalars from
-/// other tree entries.
-template <typename T> struct ShuffledInsertData {
- /// List of insertelements to be replaced by shuffles.
- SmallVector<InsertElementInst *> InsertElements;
- /// The parent vectors and shuffle mask for the given list of inserts.
- MapVector<T, SmallVector<int>> ValueMasks;
-};
-} // namespace
-
-InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
- InstructionCost ReductionCost) {
- InstructionCost Cost = ReductionCost;
+InstructionCost BoUpSLP::calculateTreeCostAndTrimNonProfitable(
+ ArrayRef<Value *> VectorizedVals) {
+ SmallDenseMap<const TreeEntry *, InstructionCost> NodesCosts;
+ SmallPtrSet<Value *, 4> CheckedExtracts;
+ SmallPtrSet<const TreeEntry *, 4> GatheredLoadsNodes;
LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
<< VectorizableTree.size() << ".\n");
-
- SmallPtrSet<Value *, 4> CheckedExtracts;
- for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
- TreeEntry &TE = *VectorizableTree[I];
+ InstructionCost Cost = 0;
+ for (const std::unique_ptr<TreeEntry> &Ptr : VectorizableTree) {
+ TreeEntry &TE = *Ptr;
// No need to count the cost for combined entries, they are combined and
// just skip their cost.
if (TE.State == TreeEntry::CombinedVectorize) {
@@ -15919,6 +15933,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
dbgs() << "SLP: Skipping cost for combined node that starts with "
<< *TE.Scalars[0] << ".\n";
TE.dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
+ NodesCosts.try_emplace(&TE);
continue;
}
if (TE.hasState() &&
@@ -15931,6 +15946,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle "
<< shortBundleName(TE.Scalars, TE.Idx) << ".\n"
<< "SLP: Current total cost = " << Cost << "\n");
+ NodesCosts.try_emplace(&TE);
continue;
}
}
@@ -15942,11 +15958,202 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts);
Cost += C;
+ NodesCosts.try_emplace(&TE, C);
LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle "
<< shortBundleName(TE.Scalars, TE.Idx) << ".\n"
<< "SLP: Current total cost = " << Cost << "\n");
+ // Add gathered loads nodes to the set for later processing.
+ if (TE.Idx > 0 && !TE.UserTreeIndex && TE.hasState() &&
+ TE.getOpcode() == Instruction::Load)
+ GatheredLoadsNodes.insert(&TE);
+ }
+ // Bail out if the cost threshold is negative and cost already below it.
+ if (SLPCostThreshold.getNumOccurrences() > 0 && SLPCostThreshold < 0 &&
+ Cost < -SLPCostThreshold)
+ return Cost;
+ // Bail out, if gathered loads nodes are found.
+ // TODO: add analysis for gathered load to include their cost correctly into
+ // the related subtrees.
+ if (!GatheredLoadsNodes.empty())
+ return Cost;
+ SmallVector<std::pair<InstructionCost, SmallVector<unsigned>>> SubtreeCosts(
+ VectorizableTree.size());
+ for (const std::unique_ptr<TreeEntry> &Ptr : VectorizableTree) {
+ TreeEntry &TE = *Ptr;
+ InstructionCost C = NodesCosts.at(&TE);
+ SubtreeCosts[TE.Idx].first += C;
+ const TreeEntry *UserTE = TE.UserTreeIndex.UserTE;
+ while (UserTE) {
+ SubtreeCosts[UserTE->Idx].first += C;
+ SubtreeCosts[UserTE->Idx].second.push_back(TE.Idx);
+ UserTE = UserTE->UserTreeIndex.UserTE;
+ }
+ }
+ using CostIndicesTy =
+ std::pair<TreeEntry *, std::pair<InstructionCost, SmallVector<unsigned>>>;
+ struct FirstGreater {
+ bool operator()(const CostIndicesTy &LHS, const CostIndicesTy &RHS) const {
+ return LHS.second.first < RHS.second.first ||
+ (LHS.second.first == RHS.second.first &&
+ LHS.first->Idx < RHS.first->Idx);
+ }
+ };
+ PriorityQueue<CostIndicesTy, SmallVector<CostIndicesTy>, FirstGreater>
+ Worklist;
+ for (const auto [Idx, P] : enumerate(SubtreeCosts))
+ Worklist.emplace(VectorizableTree[Idx].get(), P);
+
+ // Narrow store trees with non-profitable immediate values - exit.
+ if (!UserIgnoreList && VectorizableTree.front()->getVectorFactor() < 4 &&
+ VectorizableTree.front()->hasState() &&
+ VectorizableTree.front()->getOpcode() == Instruction::Store &&
+ (Worklist.top().first->Idx == 0 || Worklist.top().first->Idx == 1))
+ return Cost;
+
+ bool Changed = false;
+ while (!Worklist.empty() && Worklist.top().second.first > 0) {
+ TreeEntry *TE = Worklist.top().first;
+ if (TE->isGather() || TE->Idx == 0 || DeletedNodes.contains(TE)) {
+ Worklist.pop();
+ continue;
+ }
+
+ // Calculate the gather cost of the root node.
+ InstructionCost SubtreeCost = Worklist.top().second.first;
+ if (SubtreeCost < TE->Scalars.size()) {
+ Worklist.pop();
+ continue;
+ }
+ if (!TransformedToGatherNodes.empty()) {
+ for (unsigned Idx : Worklist.top().second.second) {
+ auto It = TransformedToGatherNodes.find(VectorizableTree[Idx].get());
+ if (It != TransformedToGatherNodes.end()) {
+ SubtreeCost -= SubtreeCosts[Idx].first;
+ SubtreeCost += It->second;
+ }
+ }
+ }
+ if (SubtreeCost < 0 || SubtreeCost < TE->Scalars.size()) {
+ Worklist.pop();
+ continue;
+ }
+ const unsigned Sz = TE->Scalars.size();
+ APInt DemandedElts = APInt::getAllOnes(Sz);
+ for (auto [Idx, V] : enumerate(TE->Scalars)) {
+ if (isConstant(V))
+ DemandedElts.clearBit(Idx);
+ }
+ constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+
+ Type *ScalarTy = getValueType(TE->Scalars.front());
+ auto *VecTy = getWidenedType(ScalarTy, Sz);
+ const unsigned EntryVF = TE->getVectorFactor();
+ auto *FinalVecTy = getWidenedType(ScalarTy, EntryVF);
+ InstructionCost GatherCost = ::getScalarizationOverhead(
+ *TTI, ScalarTy, VecTy, DemandedElts,
+ /*Insert=*/true, /*Extract=*/false, CostKind);
+ SmallVector<int> Mask;
+ if (!TE->ReorderIndices.empty() &&
+ TE->State != TreeEntry::CompressVectorize &&
+ (TE->State != TreeEntry::StridedVectorize ||
+ !isReverseOrder(TE->ReorderIndices))) {
+ SmallVector<int> NewMask;
+ if (TE->getOpcode() == Instruction::Store) {
+ // For stores the order is actually a mask.
+ NewMask.resize(TE->ReorderIndices.size());
+ copy(TE->ReorderIndices, NewMask.begin());
+ } else {
+ inversePermutation(TE->ReorderIndices, NewMask);
+ }
+ ::addMask(Mask, NewMask);
+ }
+ if (!TE->ReuseShuffleIndices.empty())
+ ::addMask(Mask, TE->ReuseShuffleIndices);
+ if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, EntryVF))
+ GatherCost +=
+ ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FinalVecTy, Mask);
+ // If all scalars are reused in gather node(s) or other vector nodes, there
+ // might be extra cost for inserting them.
+ if (all_of(TE->Scalars, [&](Value *V) {
+ return (TE->hasCopyableElements() && TE->isCopyableElement(V)) ||
+ isConstant(V) || isGathered(V) || getTreeEntries(V).size() > 1;
+ }))
+ GatherCost *= 2;
+ // Erase subtree if it is non-profitable.
+ if (SubtreeCost > GatherCost) {
+ // If the remaining tree is just a buildvector - exit, it will cause
+ // endless attempts to vectorize.
+ if (VectorizableTree.front()->hasState() &&
+ VectorizableTree.front()->getOpcode() == Instruction::InsertElement &&
+ TE->Idx == 1)
+ return InstructionCost::getInvalid();
+
+ LLVM_DEBUG(dbgs() << "SLP: Trimming unprofitable subtree at node "
+ << TE->Idx << " with cost "
+ << Worklist.top().second.first << " and gather cost "
+ << GatherCost << ".\n");
+ if (TE->UserTreeIndex) {
+ TransformedToGatherNodes.try_emplace(TE, GatherCost);
+ NodesCosts.erase(TE);
+ } else {
+ DeletedNodes.insert(TE);
+ TransformedToGatherNodes.erase(TE);
+ NodesCosts.erase(TE);
+ }
+ for (unsigned Idx : Worklist.top().second.second) {
+ TreeEntry &ChildTE = *VectorizableTree[Idx];
+ DeletedNodes.insert(&ChildTE);
+ TransformedToGatherNodes.erase(&ChildTE);
+ NodesCosts.erase(&ChildTE);
+ }
+ Changed = true;
+ }
+ Worklist.pop();
+ }
+ if (!Changed)
+ return SubtreeCosts.front().first;
+
+ for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
+ if (DeletedNodes.contains(TE.get()))
+ continue;
+ if (TransformedToGatherNodes.contains(TE.get()) && !TE->UserTreeIndex) {
+ assert(TE->getOpcode() == Instruction::Load && "Expected load only.");
+ continue;
+ }
+ if (!NodesCosts.contains(TE.get())) {
+ InstructionCost C =
+ getEntryCost(TE.get(), VectorizedVals, CheckedExtracts);
+ NodesCosts.try_emplace(TE.get(), C);
+ }
}
+ LLVM_DEBUG(dbgs() << "SLP: Recalculate costs after tree trimming.\n");
+ Cost = 0;
+ for (const auto &P : NodesCosts){
+ Cost += P.second;
+ LLVM_DEBUG(dbgs() << "SLP: Adding cost " << P.second << " for bundle "
+ << shortBundleName(P.first->Scalars, P.first->Idx) << ".\n"
+ << "SLP: Current total cost = " << Cost << "\n");
+ }
+ return Cost;
+}
+
+namespace {
+/// Data type for handling buildvector sequences with the reused scalars from
+/// other tree entries.
+template <typename T> struct ShuffledInsertData {
+ /// List of insertelements to be replaced by shuffles.
+ SmallVector<InsertElementInst *> InsertElements;
+ /// The parent vectors and shuffle mask for the given list of inserts.
+ MapVector<T, SmallVector<int>> ValueMasks;
+};
+} // namespace
+
+InstructionCost BoUpSLP::getTreeCost(InstructionCost TreeCost,
+ ArrayRef<Value *> VectorizedVals,
+ InstructionCost ReductionCost) {
+ InstructionCost Cost = TreeCost + ReductionCost;
+
if (Cost >= -SLPCostThreshold &&
none_of(ExternalUses, [](const ExternalUser &EU) {
return isa_and_nonnull<InsertElementInst>(EU.User);
@@ -16243,8 +16450,15 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
for (Value *V : ScalarOpsFromCasts) {
ExternalUsesAsOriginalScalar.insert(V);
if (ArrayRef<TreeEntry *> TEs = getTreeEntries(V); !TEs.empty()) {
- ExternalUses.emplace_back(V, nullptr, *TEs.front(),
- TEs.front()->findLaneForValue(V));
+ const auto *It = find_if_not(TEs, [&](TreeEntry *TE) {
+ return TransformedToGatherNodes.contains(TE) ||
+ DeletedNodes.contains(TE);
+ });
+ if (It != TEs.end()) {
+ const TreeEntry *UserTE = *It;
+ ExternalUses.emplace_back(V, nullptr, *UserTE,
+ UserTE->findLaneForValue(V));
+ }
}
}
// Add reduced value cost, if resized.
@@ -16710,8 +16924,22 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
continue;
// Build a list of tree entries where V is used.
SmallPtrSet<const TreeEntry *, 4> VToTEs;
- for (const TreeEntry *TEPtr : ValueToGatherNodes.lookup(V)) {
- if (TEPtr == TE || TEPtr->Idx == 0)
+ SmallVector<const TreeEntry *> GatherNodes(
+ ValueToGatherNodes.lookup(V).takeVector());
+ if (TransformedToGatherNodes.contains(TE)) {
+ for (TreeEntry *E : getSplitTreeEntries(V)) {
+ if (TE == E || !TransformedToGatherNodes.contains(E))
+ continue;
+ GatherNodes.push_back(E);
+ }
+ for (TreeEntry *E : getTreeEntries(V)) {
+ if (TE == E || !TransformedToGatherNodes.contains(E))
+ continue;
+ GatherNodes.push_back(E);
+ }
+ }
+ for (const TreeEntry *TEPtr : GatherNodes) {
+ if (TEPtr == TE || TEPtr->Idx == 0 || DeletedNodes.contains(TEPtr))
continue;
assert(any_of(TEPtr->Scalars,
[&](Value *V) { return GatheredScalars.contains(V); }) &&
@@ -16787,8 +17015,10 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
VToTEs.insert(TEPtr);
}
if (ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V); !VTEs.empty()) {
- const auto *It = find_if(
- VTEs, [&](const TreeEntry *MTE) { return MTE != TEUseEI.UserTE; });
+ const auto *It = find_if(VTEs, [&](const TreeEntry *MTE) {
+ return MTE != TE && MTE != TEUseEI.UserTE &&
+ !DeletedNodes.contains(MTE);
+ });
if (It != VTEs.end()) {
const TreeEntry *VTE = *It;
if (none_of(TE->CombinedEntriesWithIndices,
@@ -16804,28 +17034,34 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
}
}
if (ArrayRef<TreeEntry *> VTEs = getTreeEntries(V); !VTEs.empty()) {
- const TreeEntry *VTE = VTEs.front();
- if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst.value_or(0) &&
- VTEs.size() > 1 && VTE->State != TreeEntry::Vectorize) {
- VTEs = VTEs.drop_front();
- // Iterate through all vectorized nodes.
- const auto *MIt = find_if(VTEs, [](const TreeEntry *MTE) {
- return MTE->State == TreeEntry::Vectorize;
- });
- if (MIt == VTEs.end())
- continue;
- VTE = *MIt;
- }
- if (none_of(TE->CombinedEntriesWithIndices,
- [&](const auto &P) { return P.first == VTE->Idx; })) {
- Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
- if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
- continue;
+ const auto *It = find_if(VTEs, [&, MainTE = TE](const TreeEntry *TE) {
+ return TE != MainTE && !DeletedNodes.contains(TE) &&
+ !TransformedToGatherNodes.contains(TE);
+ });
+ if (It != VTEs.end()) {
+ const TreeEntry *VTE = *It;
+ if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst.value_or(0) &&
+ VTEs.size() > 1 && VTE->State != TreeEntry::Vectorize) {
+ VTEs = VTEs.drop_front();
+ // Iterate through all vectorized nodes.
+ const auto *MIt = find_if(VTEs, [](const TreeEntry *MTE) {
+ return MTE->State == TreeEntry::Vectorize;
+ });
+ if (MIt == VTEs.end())
+ continue;
+ VTE = *MIt;
+ }
+ if (none_of(TE->CombinedEntriesWithIndices,
+ [&](const auto &P) { return P.first == VTE->Idx; })) {
+ Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
+ if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
+ continue;
+ }
+ // The node is reused - exit.
+ if (CheckAndUseSameNode(VTE))
+ break;
+ VToTEs.insert(VTE);
}
- // The node is reused - exit.
- if (CheckAndUseSameNode(VTE))
- break;
- VToTEs.insert(VTE);
}
if (VToTEs.empty())
continue;
@@ -17658,7 +17894,12 @@ Value *BoUpSLP::gather(
CSEBlocks.insert(InsElt->getParent());
// Add to our 'need-to-extract' list.
if (isa<Instruction>(V)) {
- if (ArrayRef<TreeEntry *> Entries = getTreeEntries(V); !Entries.empty()) {
+ ArrayRef<TreeEntry *> Entries = getTreeEntries(V);
+ const auto *It = find_if(Entries, [&](const TreeEntry *E) {
+ return !TransformedToGatherNodes.contains(E) &&
+ !DeletedNodes.contains(E);
+ });
+ if (It != Entries.end()) {
// Find which lane we need to extract.
User *UserOp = nullptr;
if (Scalar != V) {
@@ -17690,8 +17931,8 @@ Value *BoUpSLP::gather(
UserOp = InsElt;
}
if (UserOp) {
- unsigned FoundLane = Entries.front()->findLaneForValue(V);
- ExternalUses.emplace_back(V, UserOp, *Entries.front(), FoundLane);
+ unsigned FoundLane = (*It)->findLaneForValue(V);
+ ExternalUses.emplace_back(V, User...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/162018
More information about the llvm-commits
mailing list