[llvm] [SLP]Improve vectorization of gathered loads. (PR #89129)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Apr 17 12:46:52 PDT 2024
llvmbot wrote:
@llvm/pr-subscribers-backend-systemz
Author: Alexey Bataev (alexey-bataev)
<details>
<summary>Changes</summary>
When building the vectorization graph, the compiler may end up with consecutive
loads spread across different branches of the graph, where they end up being
gathered. We can scan these loads, emit them as a single vectorized load, and
then reshuffle the result between the branches to avoid extra scalar loads in
the code.
Part of D57059
Differential Revision: https://reviews.llvm.org/D105986
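As a rough illustration (a hand-written sketch, not taken from the patch's tests; the function names, types and lane split are made up), the pattern this targets looks like the following: four loads that are consecutive in memory, but where each vectorizable subtree only uses a non-consecutive pair of them, so both gather nodes previously kept the loads scalar.

```llvm
; Before: each subtree gathers two non-consecutive lanes, so all four loads
; stay scalar even though they are consecutive in memory.
define i32 @gathered_loads(ptr %p) {
  %p1 = getelementptr inbounds i32, ptr %p, i64 1
  %p2 = getelementptr inbounds i32, ptr %p, i64 2
  %p3 = getelementptr inbounds i32, ptr %p, i64 3
  %l0 = load i32, ptr %p, align 4
  %l1 = load i32, ptr %p1, align 4
  %l2 = load i32, ptr %p2, align 4
  %l3 = load i32, ptr %p3, align 4
  %a = add i32 %l0, %l2        ; first subtree uses lanes {0, 2}
  %b = mul i32 %l1, %l3        ; second subtree uses lanes {1, 3}
  %r = xor i32 %a, %b
  ret i32 %r
}

; After (shape only): one wide load of all four lanes, and each subtree's
; operand is rebuilt from it with a shufflevector instead of scalar loads.
define void @reshuffled_operands(ptr %p, ptr %out0, ptr %out1) {
  %v    = load <4 x i32>, ptr %p, align 4
  %even = shufflevector <4 x i32> %v, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
  %odd  = shufflevector <4 x i32> %v, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
  store <2 x i32> %even, ptr %out0, align 8
  store <2 x i32> %odd, ptr %out1, align 8
  ret void
}
```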
---
Patch is 430.05 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/89129.diff
53 Files Affected:
- (modified) llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp (+643-85)
- (modified) llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll (+108-123)
- (modified) llvm/test/Transforms/SLPVectorizer/AArch64/multiple_reduction.ll (+147-151)
- (modified) llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll (+8-14)
- (modified) llvm/test/Transforms/SLPVectorizer/AArch64/vec3-calls.ll (+25-2)
- (modified) llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll (+21-22)
- (modified) llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll (+180-204)
- (modified) llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll (+18-26)
- (modified) llvm/test/Transforms/SLPVectorizer/SystemZ/pr34619.ll (+6-7)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/PR32086.ll (+11-17)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/PR35777.ll (+13-12)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/addsub.ll (+16-26)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/bottom-to-top-reorder.ll (+15-15)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/buildvector-same-lane-insert.ll (+12-10)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll (+22-28)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll (+18-35)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll (+18-35)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/load-partial-vector-shuffle.ll (+19-20)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll (+64-61)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-user-not-min.ll (+1-1)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll (+20-16)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/phi.ll (+24-19)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll (+177-201)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll (+177-201)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/pr48879-sroa.ll (+54-108)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/reduced-gathered-vectorized.ll (+14-13)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/reduction_loads.ll (+15-15)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/remark-masked-loads-consecutive-loads-same-ptr.ll (+1-1)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/remark-partial-loads-vectorize.ll (+7-17)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll (+1-1)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/remark_horcost.ll (+5-6)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/remark_not_all_parts.ll (+1-1)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/reorder-clustered-node.ll (+13-12)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/reorder-phi-operand.ll (+8-9)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/reorder-possible-strided-node.ll (+16-16)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll (+3-3)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/reorder_phi.ll (+17-21)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/return.ll (+7-7)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll (+9-8)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll (+7-11)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/scheduling.ll (+4-5)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder.ll (+7-8)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/sin-sqrt.ll (+33-66)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll (+15-34)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/split-load8_2_unord_geps.ll (+10-14)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/store-insertelement-minbitwidth.ll (+2-2)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/supernode.ll (+30-51)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/vec3-calls.ll (+6-8)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias-inseltpoison.ll (+20-19)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll (+20-19)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias_external_insert_shuffled.ll (+15-16)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/vect-gather-same-nodes.ll (+13-12)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/vectorize-reordered-list.ll (+7-7)
``````````diff
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 806e8085038b35..6730d0c4db7fea 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1133,6 +1133,8 @@ class BoUpSLP {
MultiNodeScalars.clear();
MustGather.clear();
EntryToLastInstruction.clear();
+ GatheredLoads.clear();
+ GatheredLoadsEntriesFirst = NoGatheredLoads;
ExternalUses.clear();
ExternalUsesAsGEPs.clear();
for (auto &Iter : BlocksSchedules) {
@@ -1170,8 +1172,9 @@ class BoUpSLP {
/// identity order is important, or the actual order.
/// \param TopToBottom If true, include the order of vectorized stores and
/// insertelement nodes, otherwise skip them.
- std::optional<OrdersType> getReorderingData(const TreeEntry &TE,
- bool TopToBottom);
+ std::optional<OrdersType> getReorderingData(
+ const TreeEntry &TE, bool TopToBottom,
+ DenseMap<const TreeEntry *, TreeEntry *> &ScatterVectorizeToReorder);
/// Reorders the current graph to the most profitable order starting from the
/// root node to the leaf nodes. The best order is chosen only from the nodes
@@ -2558,6 +2561,11 @@ class BoUpSLP {
/// be beneficial even the tree height is tiny.
bool isFullyVectorizableTinyTree(bool ForReduction) const;
+ /// Run through the list of all gathered loads in the graph and try to find
+ /// vector loads/masked gathers instead of regular gathers. Later these loads
+  /// are reshuffled to build final gathered nodes.
+ void tryToVectorizeGatheredLoads();
+
/// Reorder commutative or alt operands to get better probability of
/// generating vectorized code.
static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
@@ -3010,6 +3018,14 @@ class BoUpSLP {
CastMaxMinBWSizes =
std::make_pair(std::numeric_limits<unsigned>::max(), 1);
MustGather.insert(VL.begin(), VL.end());
+ if (GatheredLoadsEntriesFirst == NoGatheredLoads ||
+ Last->Idx < GatheredLoadsEntriesFirst || UserTreeIdx.UserTE ||
+ S.getOpcode() != Instruction::Load) {
+ // Build a map for gathered scalars to the nodes where they are used.
+ for (Value *V : VL)
+ if (!isConstant(V))
+ ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
+ }
}
if (UserTreeIdx.UserTE) {
@@ -3085,6 +3101,14 @@ class BoUpSLP {
DenseMap<Value *, SmallPtrSet<const TreeEntry *, 4>>;
ValueToGatherNodesMap ValueToGatherNodes;
+ /// A list of loads to be gathered during the vectorization process. We can
+ /// try to vectorize them at the end, if profitable.
+ SmallVector<SmallVector<std::pair<LoadInst *, int>>> GatheredLoads;
+
+ /// The index of the first gathered load entry in the VectorizeTree.
+ constexpr static int NoGatheredLoads = -1;
+ int GatheredLoadsEntriesFirst = NoGatheredLoads;
+
/// This POD struct describes one external user in the vectorized tree.
struct ExternalUser {
ExternalUser(Value *S, llvm::User *U, int L)
@@ -4604,12 +4628,17 @@ static bool areTwoInsertFromSameBuildVector(
return false;
}
-std::optional<BoUpSLP::OrdersType>
-BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
+std::optional<BoUpSLP::OrdersType> BoUpSLP::getReorderingData(
+ const TreeEntry &TE, bool TopToBottom,
+ DenseMap<const TreeEntry *, TreeEntry *> &ScatterVectorizeToReorder) {
// FIXME: Vectorizing is not supported yet for non-power-of-2 ops.
if (TE.isNonPowOf2Vec())
return std::nullopt;
+ if (GatheredLoadsEntriesFirst != NoGatheredLoads &&
+ TE.Idx >= GatheredLoadsEntriesFirst && TE.UserTreeIndices.empty() &&
+ &TE != VectorizableTree.front().get())
+ return std::nullopt;
// No need to reorder if need to shuffle reuses, still need to shuffle the
// node.
if (!TE.ReuseShuffleIndices.empty()) {
@@ -4781,6 +4810,7 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
return std::nullopt; // No need to reorder.
return std::move(ResOrder);
}
+ bool LoadsScatterVectorize = false;
if (TE.State == TreeEntry::NeedToGather && !TE.isAltShuffle() &&
allSameType(TE.Scalars)) {
// TODO: add analysis of other gather nodes with extractelement
@@ -4843,8 +4873,32 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
if (TE.Scalars.size() >= 4)
if (std::optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
return Order;
- if (std::optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE))
+ if (TE.getOpcode() == Instruction::Load) {
+ SmallVector<Value *> PointerOps;
+ OrdersType CurrentOrder;
+ LoadsState Res = canVectorizeLoads(TE.Scalars, TE.Scalars.front(),
+ CurrentOrder, PointerOps);
+ if (Res == LoadsState::Vectorize || Res == LoadsState::StridedVectorize)
+ return std::move(CurrentOrder);
+ LoadsScatterVectorize = Res == LoadsState::ScatterVectorize;
+ }
+ if (std::optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE)) {
+ if (LoadsScatterVectorize) {
+ if (TreeEntry *ScatterVectorTE = getTreeEntry(TE.Scalars.front());
+ ScatterVectorTE &&
+ ScatterVectorTE->Idx >= GatheredLoadsEntriesFirst &&
+ ScatterVectorTE->UserTreeIndices.empty() &&
+ ScatterVectorTE->State == TreeEntry::ScatterVectorize &&
+ ScatterVectorTE->Scalars.size() == TE.Scalars.size() &&
+ all_of(TE.Scalars, [&](Value *V) {
+ return getTreeEntry(V) == ScatterVectorTE;
+ })) {
+ ScatterVectorizeToReorder.try_emplace(&TE, ScatterVectorTE);
+ return std::nullopt;
+ }
+ }
return CurrentOrder;
+ }
}
return std::nullopt;
}
@@ -4930,73 +4984,83 @@ void BoUpSLP::reorderTopToBottom() {
// Maps a TreeEntry to the reorder indices of external users.
DenseMap<const TreeEntry *, SmallVector<OrdersType, 1>>
ExternalUserReorderMap;
+  // Nodes with masked-gather loads built out of gathered loads that should
+  // be reordered to avoid extra shuffles.
+ DenseMap<const TreeEntry *, TreeEntry *> ScatterVectorizeToReorder;
// Find all reorderable nodes with the given VF.
// Currently the are vectorized stores,loads,extracts + some gathering of
// extracts.
- for_each(VectorizableTree, [&, &TTIRef = *TTI](
- const std::unique_ptr<TreeEntry> &TE) {
- // Look for external users that will probably be vectorized.
- SmallVector<OrdersType, 1> ExternalUserReorderIndices =
- findExternalStoreUsersReorderIndices(TE.get());
- if (!ExternalUserReorderIndices.empty()) {
- VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
- ExternalUserReorderMap.try_emplace(TE.get(),
- std::move(ExternalUserReorderIndices));
- }
-
- // Patterns like [fadd,fsub] can be combined into a single instruction in
- // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need
- // to take into account their order when looking for the most used order.
- if (TE->isAltShuffle()) {
- VectorType *VecTy =
- FixedVectorType::get(TE->Scalars[0]->getType(), TE->Scalars.size());
- unsigned Opcode0 = TE->getOpcode();
- unsigned Opcode1 = TE->getAltOpcode();
- // The opcode mask selects between the two opcodes.
- SmallBitVector OpcodeMask(TE->Scalars.size(), false);
- for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size()))
- if (cast<Instruction>(TE->Scalars[Lane])->getOpcode() == Opcode1)
- OpcodeMask.set(Lane);
- // If this pattern is supported by the target then we consider the order.
- if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
- VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
- AltShufflesToOrders.try_emplace(TE.get(), OrdersType());
- }
- // TODO: Check the reverse order too.
- }
+ for_each(
+ ArrayRef(VectorizableTree)
+ .drop_back(GatheredLoadsEntriesFirst == NoGatheredLoads
+ ? 0
+ : VectorizableTree.size() - GatheredLoadsEntriesFirst),
+ [&, &TTIRef = *TTI](const std::unique_ptr<TreeEntry> &TE) {
+ // Look for external users that will probably be vectorized.
+ SmallVector<OrdersType, 1> ExternalUserReorderIndices =
+ findExternalStoreUsersReorderIndices(TE.get());
+ if (!ExternalUserReorderIndices.empty()) {
+ VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
+ ExternalUserReorderMap.try_emplace(
+ TE.get(), std::move(ExternalUserReorderIndices));
+ }
- if (std::optional<OrdersType> CurrentOrder =
- getReorderingData(*TE, /*TopToBottom=*/true)) {
- // Do not include ordering for nodes used in the alt opcode vectorization,
- // better to reorder them during bottom-to-top stage. If follow the order
- // here, it causes reordering of the whole graph though actually it is
- // profitable just to reorder the subgraph that starts from the alternate
- // opcode vectorization node. Such nodes already end-up with the shuffle
- // instruction and it is just enough to change this shuffle rather than
- // rotate the scalars for the whole graph.
- unsigned Cnt = 0;
- const TreeEntry *UserTE = TE.get();
- while (UserTE && Cnt < RecursionMaxDepth) {
- if (UserTE->UserTreeIndices.size() != 1)
- break;
- if (all_of(UserTE->UserTreeIndices, [](const EdgeInfo &EI) {
- return EI.UserTE->State == TreeEntry::Vectorize &&
- EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0;
- }))
- return;
- UserTE = UserTE->UserTreeIndices.back().UserTE;
- ++Cnt;
- }
- VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
- if (!(TE->State == TreeEntry::Vectorize ||
- TE->State == TreeEntry::StridedVectorize) ||
- !TE->ReuseShuffleIndices.empty())
- GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
- if (TE->State == TreeEntry::Vectorize &&
- TE->getOpcode() == Instruction::PHI)
- PhisToOrders.try_emplace(TE.get(), *CurrentOrder);
- }
- });
+ // Patterns like [fadd,fsub] can be combined into a single instruction
+ // in x86. Reordering them into [fsub,fadd] blocks this pattern. So we
+ // need to take into account their order when looking for the most used
+ // order.
+ if (TE->isAltShuffle()) {
+ VectorType *VecTy = FixedVectorType::get(TE->Scalars[0]->getType(),
+ TE->Scalars.size());
+ unsigned Opcode0 = TE->getOpcode();
+ unsigned Opcode1 = TE->getAltOpcode();
+ // The opcode mask selects between the two opcodes.
+ SmallBitVector OpcodeMask(TE->Scalars.size(), false);
+ for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size()))
+ if (cast<Instruction>(TE->Scalars[Lane])->getOpcode() == Opcode1)
+ OpcodeMask.set(Lane);
+ // If this pattern is supported by the target then we consider the
+ // order.
+ if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
+ VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
+ AltShufflesToOrders.try_emplace(TE.get(), OrdersType());
+ }
+ // TODO: Check the reverse order too.
+ }
+
+ if (std::optional<OrdersType> CurrentOrder = getReorderingData(
+ *TE, /*TopToBottom=*/true, ScatterVectorizeToReorder)) {
+ // Do not include ordering for nodes used in the alt opcode
+ // vectorization, better to reorder them during bottom-to-top stage.
+ // If follow the order here, it causes reordering of the whole graph
+ // though actually it is profitable just to reorder the subgraph that
+ // starts from the alternate opcode vectorization node. Such nodes
+ // already end-up with the shuffle instruction and it is just enough
+ // to change this shuffle rather than rotate the scalars for the whole
+ // graph.
+ unsigned Cnt = 0;
+ const TreeEntry *UserTE = TE.get();
+ while (UserTE && Cnt < RecursionMaxDepth) {
+ if (UserTE->UserTreeIndices.size() != 1)
+ break;
+ if (all_of(UserTE->UserTreeIndices, [](const EdgeInfo &EI) {
+ return EI.UserTE->State == TreeEntry::Vectorize &&
+ EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0;
+ }))
+ return;
+ UserTE = UserTE->UserTreeIndices.back().UserTE;
+ ++Cnt;
+ }
+ VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
+ if (!(TE->State == TreeEntry::Vectorize ||
+ TE->State == TreeEntry::StridedVectorize) ||
+ !TE->ReuseShuffleIndices.empty())
+ GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
+ if (TE->State == TreeEntry::Vectorize &&
+ TE->getOpcode() == Instruction::PHI)
+ PhisToOrders.try_emplace(TE.get(), *CurrentOrder);
+ }
+ });
// Reorder the graph nodes according to their vectorization factor.
for (unsigned VF = VectorizableTree.front()->getVectorFactor(); VF > 1;
@@ -5126,6 +5190,10 @@ void BoUpSLP::reorderTopToBottom() {
});
// Do an actual reordering, if profitable.
for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
+ // Do not reorder gathered loads.
+ if (GatheredLoadsEntriesFirst != NoGatheredLoads &&
+ TE->Idx >= GatheredLoadsEntriesFirst)
+ continue;
// Just do the reordering for the nodes with the given VF.
if (TE->Scalars.size() != VF) {
if (TE->ReuseShuffleIndices.size() == VF) {
@@ -5242,12 +5310,20 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
// Currently the are vectorized loads,extracts without alternate operands +
// some gathering of extracts.
SmallVector<TreeEntry *> NonVectorized;
- for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
+  // Nodes with masked-gather loads built out of gathered loads that should
+  // be reordered to avoid extra shuffles.
+ DenseMap<const TreeEntry *, TreeEntry *> ScatterVectorizeToReorder;
+ for (const std::unique_ptr<TreeEntry> &TE :
+ ArrayRef(VectorizableTree)
+ .drop_back(GatheredLoadsEntriesFirst == NoGatheredLoads
+ ? 0
+ : VectorizableTree.size() -
+ GatheredLoadsEntriesFirst)) {
if (TE->State != TreeEntry::Vectorize &&
TE->State != TreeEntry::StridedVectorize)
NonVectorized.push_back(TE.get());
- if (std::optional<OrdersType> CurrentOrder =
- getReorderingData(*TE, /*TopToBottom=*/false)) {
+ if (std::optional<OrdersType> CurrentOrder = getReorderingData(
+ *TE, /*TopToBottom=*/false, ScatterVectorizeToReorder)) {
OrderedEntries.insert(TE.get());
if (!(TE->State == TreeEntry::Vectorize ||
TE->State == TreeEntry::StridedVectorize) ||
@@ -5284,6 +5360,8 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
// search. The graph currently does not provide this dependency directly.
for (EdgeInfo &EI : TE->UserTreeIndices) {
TreeEntry *UserTE = EI.UserTE;
+ if (!UserTE)
+ continue;
auto It = Users.find(UserTE);
if (It == Users.end())
It = Users.insert({UserTE, {}}).first;
@@ -5300,6 +5378,9 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
return Data1.first->Idx > Data2.first->Idx;
});
for (auto &Data : UsersVec) {
+ if (GatheredLoadsEntriesFirst != NoGatheredLoads &&
+ Data.first->Idx >= GatheredLoadsEntriesFirst)
+ llvm_unreachable("Gathered loads nodes must not be reordered.");
// Check that operands are used only in the User node.
SmallVector<TreeEntry *> GatherOps;
if (!canReorderOperands(Data.first, Data.second, NonVectorized,
@@ -5327,7 +5408,8 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
const auto Order = [&]() -> const OrdersType {
if (OpTE->State == TreeEntry::NeedToGather ||
!OpTE->ReuseShuffleIndices.empty())
- return getReorderingData(*OpTE, /*TopToBottom=*/false)
+ return getReorderingData(*OpTE, /*TopToBottom=*/false,
+ ScatterVectorizeToReorder)
.value_or(OrdersType(1));
return OpTE->ReorderIndices;
}();
@@ -5366,7 +5448,8 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
return true;
if (TE->State == TreeEntry::NeedToGather) {
if (GathersToOrders.contains(TE))
- return !getReorderingData(*TE, /*TopToBottom=*/false)
+ return !getReorderingData(*TE, /*TopToBottom=*/false,
+ ScatterVectorizeToReorder)
.value_or(OrdersType(1))
.empty();
return true;
@@ -5515,6 +5598,71 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
VectorizableTree.front()->ReuseShuffleIndices.empty())
VectorizableTree.front()->ReorderIndices.clear();
+ // Reorder masked gather nodes built out of gathered loads.
+ SmallPtrSet<const TreeEntry *, 4> Processed;
+ for (const auto &SVLoadsData : ScatterVectorizeToReorder) {
+ if (!Processed.insert(SVLoadsData.second).second)
+ continue;
+ std::optional<OrdersType> CurrentOrder =
+ findReusedOrderedScalars(*SVLoadsData.first);
+ assert(CurrentOrder && "Expected order.");
+ if (CurrentOrder->empty() || !SVLoadsData.second->UserTreeIndices.empty())
+ continue;
+ SmallVector<TreeEntry *> Operands;
+ SmallVector<const TreeEntry *> Worklist(1, SVLoadsData.second);
+ while (!Worklist.empty()) {
+ const TreeEntry *CurrentEntry = Worklist.pop_back_val();
+ for (unsigned I = 0, E = CurrentEntry->getNumOperands(); I < E; ++I) {
+ if (CurrentEntry->getOpcode() == Instruction::ExtractElement)
+ continue;
+ if (CurrentEntry->getOpcode() == Instruction::Call) {
+ auto *CI = cast<CallInst>(CurrentEntry->getMainOp());
+ Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+ if (I >= CI->arg_size() ||
+ isVectorIntrinsicWithScalarOpAtArg(ID, I))
+ continue;
+ }
+ const TreeEntry *Op = getOperandEntry(CurrentEntry, I);
+ if (Op->ReuseShuffleIndices.empty())
+ Worklist.push_back(Op);
+ Operands.push_back(const_cast<TreeEntry *>(Op));
+ }
+ }
+ // If there are several users of the pointers tree entry, no need to
+ // reorder the scatter vectorize node, still have same number of shuffles.
+ if (any_of(Operands, [](const TreeEntry *TE) {
+ return TE->UserTreeIndices.size() > 1;
+ }))
+ continue;
+ // Reorder related masked gather node and its operands.
+ SmallVector<int> Mask(CurrentOrder->size(), PoisonMaskElem);
+ unsigned E = CurrentOrder->size();
+ transform(*CurrentOrder, Mask.begin(), [E](unsigned I) {
+ return I < E ? static_cast<int>(I) : PoisonMaskElem;
+ });
+ for (TreeEntry *OpTE : Operands) {
+ if (!OpTE->ReuseShuffleIndices.empty()) {
+ reorderReuses(OpTE->ReuseShuffleIndices, Mask);
+ } else if (OpTE->State == TreeEntry::NeedToGather) {
+ if (OpTE->ReorderIndices.empty())
+ reorderScalars(OpTE->Scalars, Mask);
+ else
+ reorderOrder(OpTE->ReorderIndices, Mask);
+ } else {
+ OpTE->reorderOperands(Mask);
+ if (OpTE->ReorderIndices.empty())
+ reorderScalars(OpTE->Scalars, Mask);
+ else
+ reorderOrder(OpTE->ReorderIndices, Mask);
+ }
+ }
+ SVLoadsData.second->reorderOperands(Mask);
+ ...
[truncated]
``````````
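For context on the masked-gather ("ScatterVectorize") nodes that the reordering code above works with, here is a hedged, hand-written IR sketch (not taken from the patch; the index vector and types are made up) of what such a node corresponds to: non-consecutive loads emitted as a single `llvm.masked.gather` over a vector of pointers rather than a gather of scalar loads.

```llvm
; A ScatterVectorize node: non-consecutive loads become one masked gather.
define <4 x i32> @scatter_vectorized_loads(ptr %p, <4 x i64> %idx) {
  %ptrs = getelementptr inbounds i32, ptr %p, <4 x i64> %idx
  %v = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> poison)
  ret <4 x i32> %v
}

declare <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x i32>)
```

When such a node is built out of gathered loads, the patch reorders the node and its operands so that the reuse order does not require an extra shuffle.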
</details>
https://github.com/llvm/llvm-project/pull/89129
More information about the llvm-commits mailing list