[llvm] [SLP]Represent SLP graph as a tree (PR #126771)
Alexey Bataev via llvm-commits
llvm-commits at lists.llvm.org
Wed Feb 19 10:03:07 PST 2025
https://github.com/alexey-bataev updated https://github.com/llvm/llvm-project/pull/126771
From f2e3a1f7c4d481ed935eaec1e05d08ce8b140bc8 Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev at outlook.com>
Date: Tue, 11 Feb 2025 17:58:20 +0000
Subject: [PATCH] [𝘀𝗽𝗿] initial version
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Created using spr 1.3.5
---
.../Transforms/Vectorize/SLPVectorizer.cpp | 633 +++++++++++-------
.../SLPVectorizer/AArch64/horizontal.ll | 4 +-
.../SLPVectorizer/AArch64/remarks.ll | 2 +-
.../AArch64/reorder-fmuladd-crash.ll | 9 +-
.../RISCV/remarks_cmp_sel_min_max.ll | 8 +-
.../SystemZ/SLP-cmp-cost-query.ll | 2 +-
.../cmp-after-intrinsic-call-minbitwidth.ll | 12 +-
.../X86/full-matched-bv-with-subvectors.ll | 18 +-
.../X86/gather-node-same-as-vect-but-order.ll | 15 +-
.../SLPVectorizer/X86/geps-non-pow-2.ll | 6 +-
.../X86/matching-gather-nodes-phi-users.ll | 2 +-
.../X86/minbitwidth-icmp-to-trunc.ll | 17 +-
.../X86/minbw-node-used-twice.ll | 6 +-
.../X86/reused-mask-with-poison-index.ll | 18 +-
.../SLPVectorizer/X86/shrink_after_reorder.ll | 5 +-
.../X86/slp-schedule-use-order.ll | 6 +-
.../subvector-minbitwidth-unsigned-value.ll | 8 +-
17 files changed, 463 insertions(+), 308 deletions(-)
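The central change, visible throughout the hunks below, is that a TreeEntry's
users collapse from a list of edges (UserTreeIndices) into a single parent
edge (UserTreeIndex), so the SLP graph is represented as a proper tree. A
minimal standalone sketch of the new shape (simplified stand-ins, not the
actual LLVM types):

  #include <cassert>

  struct TreeEntry;

  struct EdgeInfo {
    TreeEntry *UserTE = nullptr; // user (parent) node, or null for the root
    unsigned EdgeIdx = ~0u;      // operand number in the user node
    operator bool() const { return UserTE != nullptr; } // added by this patch
  };

  struct TreeEntry {
    // Before: SmallVector<EdgeInfo, 1> UserTreeIndices; // multiple users
    EdgeInfo UserTreeIndex; // after: at most one user, i.e. a tree, not a DAG
  };

  int main() {
    TreeEntry Root, Child;
    Child.UserTreeIndex = {&Root, /*EdgeIdx=*/0};
    assert(Child.UserTreeIndex && "child has a user edge");
    assert(!Root.UserTreeIndex && "root has no user edge");
  }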
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index e1c08077126db..6f9c62c6218b2 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1409,12 +1409,6 @@ class BoUpSLP {
/// Construct a vectorizable tree that starts at \p Roots.
void buildTree(ArrayRef<Value *> Roots);
- /// Returns whether the root node has in-tree uses.
- bool doesRootHaveInTreeUses() const {
- return !VectorizableTree.empty() &&
- !VectorizableTree.front()->UserTreeIndices.empty();
- }
-
/// Return the scalars of the root node.
ArrayRef<Value *> getRootNodeScalars() const {
assert(!VectorizableTree.empty() && "No graph to get the first node from");
@@ -1524,7 +1518,12 @@ class BoUpSLP {
/// shuffled vector entry + (possibly) permutation with other gathers. It
/// implements the checks only for possibly ordered scalars (Loads,
/// ExtractElement, ExtractValue), which can be part of the graph.
- std::optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE);
+ /// \param TopToBottom If true, used for whole-tree rotation; if false, for
+ /// sub-tree rotations.
+ /// \param IgnoreReorder If true, the order of the root node may be ignored.
+ std::optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE,
+ bool TopToBottom,
+ bool IgnoreReorder);
/// Sort loads into increasing pointers offsets to allow greater clustering.
std::optional<OrdersType> findPartiallyOrderedLoads(const TreeEntry &TE);
@@ -1536,8 +1535,14 @@ class BoUpSLP {
/// identity order is important, or the actual order.
/// \param TopToBottom If true, include the order of vectorized stores and
/// insertelement nodes, otherwise skip them.
- std::optional<OrdersType> getReorderingData(const TreeEntry &TE,
- bool TopToBottom);
+ /// \param IgnoreReorder If true, the root node order can be ignored.
+ std::optional<OrdersType>
+ getReorderingData(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder);
+
+ /// Checks if it is profitable to reorder the current tree.
+ /// If the tree does not contain many profitably reorderable nodes, it is
+ /// better to skip reordering to save compile time.
+ bool isProfitableToReorder() const;
/// Reorders the current graph to the most profitable order starting from the
/// root node to the leaf nodes. The best order is chosen only from the nodes
@@ -1680,6 +1685,8 @@ class BoUpSLP {
bool operator == (const EdgeInfo &Other) const {
return UserTE == Other.UserTE && EdgeIdx == Other.EdgeIdx;
}
+
+ operator bool() const { return UserTE != nullptr; }
};
/// A helper class used for scoring candidates for two consecutive lanes.
@@ -2999,8 +3006,10 @@ class BoUpSLP {
ArrayRef<Value *> VL = UserTE->getOperand(OpIdx);
TreeEntry *TE = nullptr;
const auto *It = find_if(VL, [&](Value *V) {
+ if (!isa<Instruction>(V))
+ return false;
for (TreeEntry *E : getTreeEntries(V)) {
- if (is_contained(E->UserTreeIndices, EdgeInfo(UserTE, OpIdx))) {
+ if (E->UserTreeIndex == EdgeInfo(UserTE, OpIdx)) {
TE = E;
return true;
}
@@ -3031,7 +3040,7 @@ class BoUpSLP {
/// of a vector of (the same) instruction.
TargetTransformInfo::OperandValueInfo getOperandInfo(ArrayRef<Value *> Ops);
- /// \ returns the graph entry for the \p Idx operand of the \p E entry.
+ /// \returns the graph entry for the \p Idx operand of the \p E entry.
const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
/// Gets the root instruction for the given node. If the node is a strided
@@ -3070,10 +3079,15 @@ class BoUpSLP {
/// Returns vectorized operand node, that matches the order of the scalars
/// operand number \p NodeIdx in entry \p E.
- TreeEntry *getMatchedVectorizedOperand(const TreeEntry *E, unsigned NodeIdx);
- const TreeEntry *getMatchedVectorizedOperand(const TreeEntry *E,
- unsigned NodeIdx) const {
- return const_cast<BoUpSLP *>(this)->getMatchedVectorizedOperand(E, NodeIdx);
+ TreeEntry *getMatchedVectorizedOperand(const TreeEntry *E, unsigned NodeIdx,
+ ArrayRef<Value *> VL,
+ const InstructionsState &S);
+ const TreeEntry *
+ getMatchedVectorizedOperand(const TreeEntry *E, unsigned NodeIdx,
+ ArrayRef<Value *> VL,
+ const InstructionsState &S) const {
+ return const_cast<BoUpSLP *>(this)->getMatchedVectorizedOperand(E, NodeIdx,
+ VL, S);
}
/// Vectorize a single entry in the tree, the \p Idx-th operand of the entry
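The const overload of getMatchedVectorizedOperand above delegates to the
non-const version via const_cast, the usual way to avoid duplicating lookup
logic across overloads. A generic sketch of the idiom (illustrative names,
unrelated to the patch):

  #include <cstdio>

  struct Table {
    int Data[4] = {1, 2, 3, 4};
    int *find(unsigned Key) { return Key < 4 ? &Data[Key] : nullptr; }
    const int *find(unsigned Key) const {
      // Safe: the non-const overload does not mutate *this.
      return const_cast<Table *>(this)->find(Key);
    }
  };

  int main() {
    const Table T{};
    if (const int *V = T.find(2))
      std::printf("%d\n", *V); // prints 3
  }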
@@ -3249,9 +3263,8 @@ class BoUpSLP {
}
bool isOperandGatherNode(const EdgeInfo &UserEI) const {
- return isGather() && !UserTreeIndices.empty() &&
- UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx &&
- UserTreeIndices.front().UserTE == UserEI.UserTE;
+ return isGather() && UserTreeIndex.EdgeIdx == UserEI.EdgeIdx &&
+ UserTreeIndex.UserTE == UserEI.UserTE;
}
/// \returns true if current entry has same operands as \p TE.
@@ -3335,7 +3348,7 @@ class BoUpSLP {
/// The TreeEntry index containing the user of this entry. We can actually
/// have multiple users so the data structure is not truly a tree.
- SmallVector<EdgeInfo, 1> UserTreeIndices;
+ EdgeInfo UserTreeIndex;
/// The index of this treeEntry in VectorizableTree.
unsigned Idx = 0;
@@ -3559,9 +3572,9 @@ class BoUpSLP {
for (unsigned ReorderIdx : ReorderIndices)
dbgs() << ReorderIdx << ", ";
dbgs() << "\n";
- dbgs() << "UserTreeIndices: ";
- for (const auto &EInfo : UserTreeIndices)
- dbgs() << EInfo << ", ";
+ dbgs() << "UserTreeIndex: ";
+ if (UserTreeIndex)
+ dbgs() << UserTreeIndex;
dbgs() << "\n";
if (!CombinedEntriesWithIndices.empty()) {
dbgs() << "Combined entries: ";
@@ -3707,7 +3720,7 @@ class BoUpSLP {
}
if (UserTreeIdx.UserTE)
- Last->UserTreeIndices.push_back(UserTreeIdx);
+ Last->UserTreeIndex = UserTreeIdx;
return Last;
}
@@ -4463,11 +4476,11 @@ template <> struct GraphTraits<BoUpSLP *> {
}
static ChildIteratorType child_begin(NodeRef N) {
- return {N->UserTreeIndices.begin(), N->Container};
+ return {&N->UserTreeIndex, N->Container};
}
static ChildIteratorType child_end(NodeRef N) {
- return {N->UserTreeIndices.end(), N->Container};
+ return {&N->UserTreeIndex + 1, N->Container};
}
/// For the node iterator we just need to turn the TreeEntry iterator into a
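The GraphTraits change above treats the single UserTreeIndex member as a
one-element iterator range, [&Member, &Member + 1). A standalone sketch of
that idiom (simplified types, not the actual GraphTraits code):

  #include <cstdio>

  struct Edge { int Id; };
  struct Node { Edge UserTreeIndex{7}; };

  int main() {
    Node N;
    // A half-open pointer range over exactly one element.
    for (Edge *I = &N.UserTreeIndex, *E = &N.UserTreeIndex + 1; I != E; ++I)
      std::printf("edge %d\n", I->Id);
  }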
@@ -4632,7 +4645,8 @@ static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask,
}
std::optional<BoUpSLP::OrdersType>
-BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
+BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE,
+ bool TopToBottom, bool IgnoreReorder) {
assert(TE.isGather() && "Expected gather node only.");
// Try to find subvector extract/insert patterns and reorder only such
// patterns.
@@ -4658,6 +4672,26 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
if (GatherShuffles.size() == 1 &&
*GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
Entries.front().front()->isSame(TE.Scalars)) {
+ // If the node is fully matched during whole-tree rotation, no need to
+ // consider the matching order; the whole tree is being rotated.
+ if (TopToBottom)
+ return std::nullopt;
+ // No need to keep the order for the same user node.
+ if (Entries.front().front()->UserTreeIndex.UserTE ==
+ TE.UserTreeIndex.UserTE)
+ return std::nullopt;
+ // No need to keep the order for the matched root node, if it can be freely
+ // reordered.
+ if (!IgnoreReorder && Entries.front().front()->Idx == 0)
+ return std::nullopt;
+ // If shuffling only 2 elements and the matching node has reversed reuses,
+ // no need to keep the order; both orders work fine.
+ if (!Entries.front().front()->ReuseShuffleIndices.empty() &&
+ TE.getVectorFactor() == 2 && Mask.size() == 2 &&
+ any_of(enumerate(Entries.front().front()->ReuseShuffleIndices),
+ [](const auto &P) { return P.value() % 2 != P.index() % 2; }))
+ return std::nullopt;
+
// Perfect match in the graph, will reuse the previously vectorized
// node. Cost is 0.
std::iota(CurrentOrder.begin(), CurrentOrder.end(), 0);
@@ -5557,7 +5591,8 @@ static bool areTwoInsertFromSameBuildVector(
}
std::optional<BoUpSLP::OrdersType>
-BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
+BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom,
+ bool IgnoreReorder) {
// No need to reorder if need to shuffle reuses, still need to shuffle the
// node.
if (!TE.ReuseShuffleIndices.empty()) {
@@ -5577,7 +5612,7 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
unsigned Sz = TE.Scalars.size();
if (TE.isGather()) {
if (std::optional<OrdersType> CurrentOrder =
- findReusedOrderedScalars(TE)) {
+ findReusedOrderedScalars(TE, TopToBottom, IgnoreReorder)) {
SmallVector<int> Mask;
fixupOrderingIndices(*CurrentOrder);
inversePermutation(*CurrentOrder, Mask);
@@ -5680,10 +5715,8 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
return std::move(ResOrder);
}
if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
- any_of(TE.UserTreeIndices,
- [](const EdgeInfo &EI) {
- return !Instruction::isBinaryOp(EI.UserTE->getOpcode());
- }) &&
+ (!TE.UserTreeIndex ||
+ !Instruction::isBinaryOp(TE.UserTreeIndex.UserTE->getOpcode())) &&
(TE.ReorderIndices.empty() || isReverseOrder(TE.ReorderIndices)))
return std::nullopt;
if ((TE.State == TreeEntry::Vectorize ||
@@ -5873,7 +5906,8 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
// FIXME: Remove the non-power-of-two check once findReusedOrderedScalars
// has been auditted for correctness with non-power-of-two vectors.
if (!VectorizeNonPowerOf2 || !TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
- if (std::optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE))
+ if (std::optional<OrdersType> CurrentOrder =
+ findReusedOrderedScalars(TE, TopToBottom, IgnoreReorder))
return CurrentOrder;
}
return std::nullopt;
@@ -5943,6 +5977,67 @@ static void combineOrders(MutableArrayRef<unsigned> Order,
}
}
+bool BoUpSLP::isProfitableToReorder() const {
+ constexpr unsigned TinyVF = 2;
+ constexpr unsigned TinyTree = 10;
+ if (VectorizableTree.size() <= TinyTree)
+ return true;
+ if (VectorizableTree.front()->hasState() &&
+ !VectorizableTree.front()->isGather() &&
+ (VectorizableTree.front()->getOpcode() == Instruction::Store ||
+ VectorizableTree.front()->getOpcode() == Instruction::PHI ||
+ (VectorizableTree.front()->getVectorFactor() == TinyVF &&
+ (VectorizableTree.front()->getOpcode() == Instruction::PtrToInt ||
+ VectorizableTree.front()->getOpcode() == Instruction::ICmp))) &&
+ VectorizableTree.front()->ReorderIndices.empty()) {
+ constexpr unsigned PhiOpsLimit = 12;
+ // Check if the tree has only a single store and a single (unordered) load
+ // node, and the other nodes are phis or geps/binops combined with phis,
+ // and/or a single gather load node.
+ bool HasPhis = false;
+ if (VectorizableTree.front()->getOpcode() == Instruction::PHI &&
+ VectorizableTree.front()->Scalars.size() == TinyVF &&
+ VectorizableTree.front()->getNumOperands() > PhiOpsLimit)
+ return false;
+ bool HasLoad = true;
+ unsigned GatherLoads = 0;
+ for (const std::unique_ptr<TreeEntry> &TE :
+ ArrayRef(VectorizableTree).drop_front()) {
+ if (!TE->hasState()) {
+ if (all_of(TE->Scalars, IsaPred<Constant, PHINode>) ||
+ all_of(TE->Scalars, IsaPred<BinaryOperator, PHINode>))
+ continue;
+ if (VectorizableTree.front()->Scalars.size() == TinyVF &&
+ any_of(TE->Scalars, IsaPred<PHINode, GEPOperator>))
+ continue;
+ return true;
+ }
+ if (TE->getOpcode() == Instruction::Load && TE->ReorderIndices.empty()) {
+ if (!TE->isGather()) {
+ HasLoad = false;
+ continue;
+ }
+ if (HasLoad)
+ return true;
+ ++GatherLoads;
+ if (GatherLoads >= 2)
+ return true;
+ }
+ if (TE->getOpcode() == Instruction::GetElementPtr ||
+ Instruction::isBinaryOp(TE->getOpcode()))
+ continue;
+ if (TE->getOpcode() != Instruction::PHI)
+ return true;
+ if (VectorizableTree.front()->Scalars.size() == TinyVF &&
+ TE->getNumOperands() > PhiOpsLimit)
+ return false;
+ HasPhis = true;
+ }
+ return !HasPhis;
+ }
+ return true;
+}
+
void BoUpSLP::reorderTopToBottom() {
// Maps VF to the graph nodes.
DenseMap<unsigned, SetVector<TreeEntry *>> VFToOrderedEntries;
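The isProfitableToReorder heuristic added above boils down to: always reorder
small trees, and skip reordering for large store/phi-rooted trees that consist
almost entirely of phis and geps/binops. A condensed sketch of that decision
shape (a simplification: only the TinyTree threshold and the phi/gep/binop
walk are taken from the patch, the remaining checks are omitted):

  #include <vector>

  enum class Kind { Phi, Gep, BinOp, Other };

  bool isProfitableToReorderSketch(const std::vector<Kind> &Nodes) {
    constexpr size_t TinyTree = 10;
    if (Nodes.size() <= TinyTree)
      return true; // small trees: reordering is cheap, always attempt it
    bool HasPhis = false;
    for (Kind K : Nodes) {
      if (K == Kind::Gep || K == Kind::BinOp)
        continue; // geps/binops mixed with phis are tolerated
      if (K != Kind::Phi)
        return true; // anything else: keep the full reordering passes
      HasPhis = true;
    }
    // A large tree of nothing but phis/geps/binops: reordering rarely pays.
    return !HasPhis;
  }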
@@ -5991,8 +6086,12 @@ void BoUpSLP::reorderTopToBottom() {
// TODO: Check the reverse order too.
}
+ bool IgnoreReorder =
+ !UserIgnoreList && VectorizableTree.front()->hasState() &&
+ (VectorizableTree.front()->getOpcode() == Instruction::InsertElement ||
+ VectorizableTree.front()->getOpcode() == Instruction::Store);
if (std::optional<OrdersType> CurrentOrder =
- getReorderingData(*TE, /*TopToBottom=*/true)) {
+ getReorderingData(*TE, /*TopToBottom=*/true, IgnoreReorder)) {
// Do not include ordering for nodes used in the alt opcode vectorization,
// better to reorder them during bottom-to-top stage. If follow the order
// here, it causes reordering of the whole graph though actually it is
@@ -6003,14 +6102,13 @@ void BoUpSLP::reorderTopToBottom() {
unsigned Cnt = 0;
const TreeEntry *UserTE = TE.get();
while (UserTE && Cnt < RecursionMaxDepth) {
- if (UserTE->UserTreeIndices.size() != 1)
+ if (!UserTE->UserTreeIndex)
break;
- if (all_of(UserTE->UserTreeIndices, [](const EdgeInfo &EI) {
- return EI.UserTE->State == TreeEntry::Vectorize &&
- EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0;
- }))
+ if (UserTE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
+ UserTE->UserTreeIndex.UserTE->isAltShuffle() &&
+ UserTE->UserTreeIndex.UserTE->Idx != 0)
return;
- UserTE = UserTE->UserTreeIndices.back().UserTE;
+ UserTE = UserTE->UserTreeIndex.UserTE;
++Cnt;
}
VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
@@ -6156,12 +6254,10 @@ void BoUpSLP::reorderTopToBottom() {
// Need to reorder the reuses masks of the operands with smaller VF to
// be able to find the match between the graph nodes and scalar
// operands of the given node during vectorization/cost estimation.
- assert(all_of(TE->UserTreeIndices,
- [VF, &TE](const EdgeInfo &EI) {
- return EI.UserTE->Scalars.size() == VF ||
- EI.UserTE->Scalars.size() ==
- TE->Scalars.size();
- }) &&
+ assert((!TE->UserTreeIndex ||
+ TE->UserTreeIndex.UserTE->Scalars.size() == VF ||
+ TE->UserTreeIndex.UserTE->Scalars.size() ==
+ TE->Scalars.size()) &&
"All users must be of VF size.");
if (SLPReVec) {
assert(SLPReVec && "Only supported by REVEC.");
@@ -6169,15 +6265,11 @@ void BoUpSLP::reorderTopToBottom() {
// because ShuffleVectorInst supports only a limited set of
// patterns). Only do reorderNodeWithReuses if all of the users are
// not ShuffleVectorInst.
- if (all_of(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
- return isa<ShuffleVectorInst>(EI.UserTE->getMainOp());
- }))
+ if (isa<ShuffleVectorInst>(TE->UserTreeIndex.UserTE->getMainOp()))
continue;
- assert(none_of(TE->UserTreeIndices,
- [&](const EdgeInfo &EI) {
- return isa<ShuffleVectorInst>(
- EI.UserTE->getMainOp());
- }) &&
+ assert((!TE->UserTreeIndex ||
+ !isa<ShuffleVectorInst>(
+ TE->UserTreeIndex.UserTE->getMainOp())) &&
"Does not know how to reorder.");
}
// Update ordering of the operands with the smaller VF than the given
@@ -6232,10 +6324,6 @@ bool BoUpSLP::canReorderOperands(
}))
continue;
if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) {
- // Do not reorder if operand node is used by many user nodes.
- if (any_of(TE->UserTreeIndices,
- [UserTE](const EdgeInfo &EI) { return EI.UserTE != UserTE; }))
- return false;
// Add the node to the list of the ordered nodes with the identity
// order.
Edges.emplace_back(I, TE);
@@ -6256,10 +6344,8 @@ bool BoUpSLP::canReorderOperands(
assert(TE->State != TreeEntry::Vectorize &&
TE->State != TreeEntry::StridedVectorize &&
"Only non-vectorized nodes are expected.");
- if (any_of(TE->UserTreeIndices,
- [UserTE, I](const EdgeInfo &EI) {
- return EI.UserTE == UserTE && EI.EdgeIdx == I;
- })) {
+ if (TE->UserTreeIndex.UserTE == UserTE &&
+ TE->UserTreeIndex.EdgeIdx == I) {
assert(TE->isSame(UserTE->getOperand(I)) &&
"Operand entry does not match operands.");
Gather = TE;
@@ -6286,8 +6372,8 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
if (TE->State != TreeEntry::Vectorize &&
TE->State != TreeEntry::StridedVectorize)
NonVectorized.push_back(TE.get());
- if (std::optional<OrdersType> CurrentOrder =
- getReorderingData(*TE, /*TopToBottom=*/false)) {
+ if (std::optional<OrdersType> CurrentOrder = getReorderingData(
+ *TE, /*TopToBottom=*/false, IgnoreReorder)) {
OrderedEntries.insert(TE.get());
if (!(TE->State == TreeEntry::Vectorize ||
TE->State == TreeEntry::StridedVectorize) ||
@@ -6300,29 +6386,24 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
// I.e., if the node has operands, that are reordered, try to make at least
// one operand order in the natural order and reorder others + reorder the
// user node itself.
- SmallPtrSet<const TreeEntry *, 4> Visited;
+ SmallPtrSet<const TreeEntry *, 4> Visited, RevisitedOps;
while (!OrderedEntries.empty()) {
// 1. Filter out only reordered nodes.
- // 2. If the entry has multiple uses - skip it and jump to the next node.
DenseMap<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users;
SmallVector<TreeEntry *> Filtered;
for (TreeEntry *TE : OrderedEntries) {
if (!(TE->State == TreeEntry::Vectorize ||
TE->State == TreeEntry::StridedVectorize ||
(TE->isGather() && GathersToOrders.contains(TE))) ||
- TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
- !all_of(drop_begin(TE->UserTreeIndices),
- [TE](const EdgeInfo &EI) {
- return EI.UserTE == TE->UserTreeIndices.front().UserTE;
- }) ||
+ !TE->UserTreeIndex || !TE->ReuseShuffleIndices.empty() ||
!Visited.insert(TE).second) {
Filtered.push_back(TE);
continue;
}
// Build a map between user nodes and their operands order to speedup
// search. The graph currently does not provide this dependency directly.
- for (EdgeInfo &EI : TE->UserTreeIndices)
- Users[EI.UserTE].emplace_back(EI.EdgeIdx, TE);
+ Users[TE->UserTreeIndex.UserTE].emplace_back(TE->UserTreeIndex.EdgeIdx,
+ TE);
}
// Erase filtered entries.
for (TreeEntry *TE : Filtered)
@@ -6360,7 +6441,8 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
continue;
const auto Order = [&]() -> const OrdersType {
if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty())
- return getReorderingData(*OpTE, /*TopToBottom=*/false)
+ return getReorderingData(*OpTE, /*TopToBottom=*/false,
+ IgnoreReorder)
.value_or(OrdersType(1));
return OpTE->ReorderIndices;
}();
@@ -6368,6 +6450,86 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
// orders.
if (Order.size() == 1)
continue;
+
+ // Check that the reordering does not increase the number of shuffles, i.e.
+ // same-values nodes have the same parents, or their parents have the same
+ // parents.
+ if (!Order.empty() && !isIdentityOrder(Order)) {
+ Value *Root = OpTE->hasState()
+ ? OpTE->getMainOp()
+ : *find_if_not(OpTE->Scalars, isConstant);
+ auto GetSameNodesUsers = [&](Value *Root) {
+ SmallSetVector<TreeEntry *, 4> Res;
+ for (const TreeEntry *TE : ValueToGatherNodes.lookup(Root)) {
+ if (TE != OpTE && TE->UserTreeIndex &&
+ TE->getVectorFactor() == OpTE->getVectorFactor() &&
+ TE->Scalars.size() == OpTE->Scalars.size() &&
+ ((TE->ReorderIndices.empty() && OpTE->isSame(TE->Scalars)) ||
+ (OpTE->ReorderIndices.empty() && TE->isSame(OpTE->Scalars))))
+ Res.insert(TE->UserTreeIndex.UserTE);
+ }
+ for (const TreeEntry *TE : getTreeEntries(Root)) {
+ if (TE != OpTE && TE->UserTreeIndex &&
+ TE->getVectorFactor() == OpTE->getVectorFactor() &&
+ TE->Scalars.size() == OpTE->Scalars.size() &&
+ ((TE->ReorderIndices.empty() && OpTE->isSame(TE->Scalars)) ||
+ (OpTE->ReorderIndices.empty() && TE->isSame(OpTE->Scalars))))
+ Res.insert(TE->UserTreeIndex.UserTE);
+ }
+ return Res.takeVector();
+ };
+ auto GetNumOperands = [](const TreeEntry *TE) {
+ if (auto *CI = dyn_cast<CallInst>(TE->getMainOp()); CI)
+ return CI->arg_size();
+ return TE->getNumOperands();
+ };
+ auto NodeShouldBeReorderedWithOperands = [&, TTI = TTI](
+ const TreeEntry *TE) {
+ Intrinsic::ID ID = Intrinsic::not_intrinsic;
+ if (auto *CI = dyn_cast<CallInst>(TE->getMainOp()); CI)
+ ID = getVectorIntrinsicIDForCall(CI, TLI);
+ for (unsigned Idx : seq<unsigned>(GetNumOperands(TE))) {
+ if (ID != Intrinsic::not_intrinsic &&
+ isVectorIntrinsicWithScalarOpAtArg(ID, Idx, TTI))
+ continue;
+ const TreeEntry *Op = getOperandEntry(TE, Idx);
+ if (Op->isGather() && Op->hasState()) {
+ const TreeEntry *VecOp =
+ getSameValuesTreeEntry(Op->getMainOp(), Op->Scalars);
+ if (VecOp)
+ Op = VecOp;
+ }
+ if (Op->ReorderIndices.empty() && Op->ReuseShuffleIndices.empty())
+ return false;
+ }
+ return true;
+ };
+ SmallVector<TreeEntry *> Users = GetSameNodesUsers(Root);
+ if (!Users.empty() && !all_of(Users, [&](TreeEntry *UTE) {
+ if (!RevisitedOps.insert(UTE).second)
+ return false;
+ return UTE == Data.first || !UTE->ReorderIndices.empty() ||
+ !UTE->ReuseShuffleIndices.empty() ||
+ (UTE->UserTreeIndex &&
+ UTE->UserTreeIndex.UserTE == Data.first) ||
+ (Data.first->UserTreeIndex &&
+ Data.first->UserTreeIndex.UserTE == UTE) ||
+ NodeShouldBeReorderedWithOperands(UTE);
+ }))
+ continue;
+ for (TreeEntry *UTE : Users) {
+ Intrinsic::ID ID = Intrinsic::not_intrinsic;
+ if (auto *CI = dyn_cast<CallInst>(UTE->getMainOp()); CI)
+ ID = getVectorIntrinsicIDForCall(CI, TLI);
+ for (unsigned Idx : seq<unsigned>(GetNumOperands(UTE))) {
+ if (ID != Intrinsic::not_intrinsic &&
+ isVectorIntrinsicWithScalarOpAtArg(ID, Idx, TTI))
+ continue;
+ const TreeEntry *Op = getOperandEntry(UTE, Idx);
+ Visited.erase(Op);
+ OrderedEntries.insert(const_cast<TreeEntry *>(Op));
+ }
+ }
+ }
unsigned NumOps = count_if(
Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
return P.second == OpTE;
@@ -6399,15 +6561,16 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
return true;
if (TE->isGather()) {
if (GathersToOrders.contains(TE))
- return !getReorderingData(*TE, /*TopToBottom=*/false)
+ return !getReorderingData(*TE, /*TopToBottom=*/false,
+ IgnoreReorder)
.value_or(OrdersType(1))
.empty();
return true;
}
return false;
};
- for (const EdgeInfo &EI : OpTE->UserTreeIndices) {
- TreeEntry *UserTE = EI.UserTE;
+ if (OpTE->UserTreeIndex) {
+ TreeEntry *UserTE = OpTE->UserTreeIndex.UserTE;
if (!VisitedUsers.insert(UserTE).second)
continue;
// May reorder user node if it requires reordering, has reused
@@ -6425,10 +6588,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
Ops, [UserTE, &AllowsReordering](
const std::pair<unsigned, TreeEntry *> &Op) {
return AllowsReordering(Op.second) &&
- all_of(Op.second->UserTreeIndices,
- [UserTE](const EdgeInfo &EI) {
- return EI.UserTE == UserTE;
- });
+ Op.second->UserTreeIndex.UserTE == UserTE;
})) <= Ops.size() / 2)
++Res.first->second;
}
@@ -8253,10 +8413,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.getMainOp() << ".\n");
for (TreeEntry *E : getTreeEntries(S.getMainOp())) {
if (E->isSame(VL)) {
- // Record the reuse of the tree node.
- E->UserTreeIndices.push_back(UserTreeIdx);
LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.getMainOp()
<< ".\n");
+ if (TryToFindDuplicates(S))
+ newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndices);
return;
}
SmallPtrSet<Value *, 8> Values(E->Scalars.begin(), E->Scalars.end());
@@ -9948,18 +10109,15 @@ void BoUpSLP::transformNodes() {
};
for (auto [Cnt, Sz] : Slices) {
ArrayRef<Value *> Slice = VL.slice(Cnt, Sz);
- // If any instruction is vectorized already - do not try again.
- if (TreeEntry *SE = getSameValuesTreeEntry(Slice.front(), Slice,
- /*SameVF=*/true)) {
- SE->UserTreeIndices.emplace_back(&E, UINT_MAX);
- AddCombinedNode(SE->Idx, Cnt, Sz);
- continue;
- }
+ InstructionsState S = getSameOpcode(Slice, *TLI);
+ const TreeEntry *SameTE =
+ S ? getSameValuesTreeEntry(S.getMainOp(), Slice, /*SameVF=*/true)
+ : nullptr;
unsigned PrevSize = VectorizableTree.size();
[[maybe_unused]] unsigned PrevEntriesSize =
LoadEntriesToVectorize.size();
buildTree_rec(Slice, 0, EdgeInfo(&E, UINT_MAX));
- if (PrevSize + 1 == VectorizableTree.size() &&
+ if (PrevSize + 1 == VectorizableTree.size() && !SameTE &&
VectorizableTree[PrevSize]->isGather() &&
VectorizableTree[PrevSize]->hasState() &&
VectorizableTree[PrevSize]->getOpcode() !=
@@ -10073,7 +10231,7 @@ void BoUpSLP::transformNodes() {
// This node is a minmax node.
E.CombinedOp = TreeEntry::MinMax;
TreeEntry *CondEntry = const_cast<TreeEntry *>(getOperandEntry(&E, 0));
- if (SelectOnly && CondEntry->UserTreeIndices.size() == 1 &&
+ if (SelectOnly && CondEntry->UserTreeIndex &&
CondEntry->State == TreeEntry::Vectorize) {
// The condition node is part of the combined minmax node.
CondEntry->State = TreeEntry::CombinedVectorize;
@@ -11063,15 +11221,30 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
unsigned Idx) const {
- if (const TreeEntry *VE = getMatchedVectorizedOperand(E, Idx))
+ ArrayRef<Value *> VL = E->getOperand(Idx);
+ InstructionsState S = getSameOpcode(VL, *TLI);
+ // Special processing for GEPs bundle, which may include non-gep values.
+ if (!S && VL.front()->getType()->isPointerTy()) {
+ const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
+ if (It != VL.end())
+ S = getSameOpcode(*It, *TLI);
+ }
+ if (const TreeEntry *VE = getMatchedVectorizedOperand(E, Idx, VL, S))
return VE;
- const auto *It =
- find_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
- return TE->isGather() &&
- find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
- return EI.EdgeIdx == Idx && EI.UserTE == E;
- }) != TE->UserTreeIndices.end();
- });
+ if (S || !isConstant(VL.front())) {
+ for (const TreeEntry *VE :
+ ValueToGatherNodes.lookup(S ? S.getMainOp() : VL.front()))
+ if (VE->UserTreeIndex.EdgeIdx == Idx && VE->UserTreeIndex.UserTE == E) {
+ assert(VE->isSame(VL) && "Expected gather node with same values.");
+ return VE;
+ }
+ }
+ const auto *It = find_if(ArrayRef(VectorizableTree).drop_front(E->Idx + 1),
+ [&](const std::unique_ptr<TreeEntry> &TE) {
+ return TE->isGather() &&
+ TE->UserTreeIndex.EdgeIdx == Idx &&
+ TE->UserTreeIndex.UserTE == E;
+ });
assert(It != VectorizableTree.end() && "Expected vectorizable entry.");
return It->get();
}
@@ -11200,12 +11373,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
// resized.
if (It != MinBWs.end() && !UnaryInstruction::isCast(E->getOpcode()) &&
E->Idx != 0 &&
- (E->getOpcode() != Instruction::Load ||
- !E->UserTreeIndices.empty())) {
- const EdgeInfo &EI =
- *find_if(E->UserTreeIndices, [](const EdgeInfo &EI) {
- return !EI.UserTE->isGather() || EI.EdgeIdx != UINT_MAX;
- });
+ (E->getOpcode() != Instruction::Load || E->UserTreeIndex)) {
+ const EdgeInfo &EI = E->UserTreeIndex;
if (EI.UserTE->getOpcode() != Instruction::Select ||
EI.EdgeIdx != 0) {
auto UserBWIt = MinBWs.find(EI.UserTE);
@@ -12506,7 +12675,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
// Exclude cost of gather loads nodes which are not used. These nodes were
// built as part of the final attempt to vectorize gathered loads.
- assert((!TE.isGather() || TE.Idx == 0 || !TE.UserTreeIndices.empty()) &&
+ assert((!TE.isGather() || TE.Idx == 0 || TE.UserTreeIndex) &&
"Expected gather nodes with users only.");
InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts);
@@ -13109,7 +13278,7 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
// reused elements too for better cost estimation.
const EdgeInfo &TEUseEI = TE == VectorizableTree.front().get()
? EdgeInfo(const_cast<TreeEntry *>(TE), 0)
- : TE->UserTreeIndices.front();
+ : TE->UserTreeIndex;
const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
const BasicBlock *TEInsertBlock = nullptr;
// Main node of PHI entries keeps the correct order of operands/incoming
@@ -13163,20 +13332,36 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
// have a permutation of 2 input vectors.
SmallVector<SmallPtrSet<const TreeEntry *, 4>> UsedTEs;
DenseMap<Value *, int> UsedValuesEntry;
+ SmallPtrSet<const Value *, 16> VisitedValue;
+ auto CheckAndUseSameNode = [&](const TreeEntry *TEPtr) {
+ // The node is reused - exit.
+ if ((TEPtr->getVectorFactor() != VL.size() &&
+ TEPtr->Scalars.size() != VL.size()) ||
+ (!TEPtr->isSame(VL) && !TEPtr->isSame(TE->Scalars)))
+ return false;
+ UsedTEs.clear();
+ UsedTEs.emplace_back().insert(TEPtr);
+ for (Value *V : VL) {
+ if (isConstant(V))
+ continue;
+ UsedValuesEntry.try_emplace(V, 0);
+ }
+ return true;
+ };
for (Value *V : VL) {
- if (isConstant(V))
+ if (isConstant(V) || !VisitedValue.insert(V).second)
continue;
// Build a list of tree entries where V is used.
SmallPtrSet<const TreeEntry *, 4> VToTEs;
- for (const TreeEntry *TEPtr : ValueToGatherNodes.find(V)->second) {
+ for (const TreeEntry *TEPtr : ValueToGatherNodes.lookup(V)) {
if (TEPtr == TE || TEPtr->Idx == 0)
continue;
assert(any_of(TEPtr->Scalars,
[&](Value *V) { return GatheredScalars.contains(V); }) &&
"Must contain at least single gathered value.");
- assert(TEPtr->UserTreeIndices.size() == 1 &&
+ assert(TEPtr->UserTreeIndex &&
"Expected only single user of a gather node.");
- const EdgeInfo &UseEI = TEPtr->UserTreeIndices.front();
+ const EdgeInfo &UseEI = TEPtr->UserTreeIndex;
PHINode *UserPHI = dyn_cast<PHINode>(UseEI.UserTE->getMainOp());
const Instruction *InsertPt =
@@ -13201,6 +13386,9 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
!CheckOrdering(InsertPt))
continue;
+ // The node is reused - exit.
+ if (CheckAndUseSameNode(TEPtr))
+ break;
VToTEs.insert(TEPtr);
}
if (ArrayRef<TreeEntry *> VTEs = getTreeEntries(V); !VTEs.empty()) {
@@ -13222,6 +13410,9 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
continue;
}
+ // The node is reused - exit.
+ if (CheckAndUseSameNode(VTE))
+ break;
VToTEs.insert(VTE);
}
if (VToTEs.empty())
@@ -13634,18 +13825,18 @@ BoUpSLP::isGatherShuffledEntry(
if (TE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
return {};
Mask.assign(VL.size(), PoisonMaskElem);
- assert((TE->UserTreeIndices.size() == 1 ||
+ assert((TE->UserTreeIndex ||
TE == VectorizableTree.front().get()) &&
"Expected only single user of the gather node.");
assert(VL.size() % NumParts == 0 &&
"Number of scalars must be divisible by NumParts.");
- if (!TE->UserTreeIndices.empty() &&
- TE->UserTreeIndices.front().UserTE->isGather() &&
- TE->UserTreeIndices.front().EdgeIdx == UINT_MAX) {
+ if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->isGather() &&
+ TE->UserTreeIndex.EdgeIdx == UINT_MAX) {
assert(
(TE->Idx == 0 ||
(TE->hasState() && TE->getOpcode() == Instruction::ExtractElement) ||
- isSplat(TE->Scalars)) &&
+ isSplat(TE->Scalars) ||
+ getSameValuesTreeEntry(TE->getMainOp(), TE->Scalars)) &&
"Expected splat or extractelements only node.");
return {};
}
@@ -14272,11 +14463,8 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
(!UTEs.empty() &&
count_if(R.VectorizableTree,
[&](const std::unique_ptr<TreeEntry> &TE) {
- return any_of(TE->UserTreeIndices,
- [&](const EdgeInfo &Edge) {
- return Edge.UserTE ==
- UTEs.front();
- }) &&
+ return TE->UserTreeIndex.UserTE ==
+ UTEs.front() &&
is_contained(VL, EI);
}) != 1);
}))
@@ -14639,32 +14827,15 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
}
};
-BoUpSLP::TreeEntry *BoUpSLP::getMatchedVectorizedOperand(const TreeEntry *E,
- unsigned NodeIdx) {
- ArrayRef<Value *> VL = E->getOperand(NodeIdx);
- InstructionsState S = getSameOpcode(VL, *TLI);
- // Special processing for GEPs bundle, which may include non-gep values.
- if (!S && VL.front()->getType()->isPointerTy()) {
- const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
- if (It != VL.end())
- S = getSameOpcode(*It, *TLI);
- }
+BoUpSLP::TreeEntry *
+BoUpSLP::getMatchedVectorizedOperand(const TreeEntry *E, unsigned NodeIdx,
+ ArrayRef<Value *> VL,
+ const InstructionsState &S) {
if (!S)
return nullptr;
- auto CheckSameVE = [&](const TreeEntry *VE) {
- return any_of(VE->UserTreeIndices,
- [E, NodeIdx](const EdgeInfo &EI) {
- return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
- }) ||
- any_of(VectorizableTree,
- [E, NodeIdx, VE](const std::unique_ptr<TreeEntry> &TE) {
- return TE->isOperandGatherNode(
- {const_cast<TreeEntry *>(E), NodeIdx}) &&
- VE->isSame(TE->Scalars);
- });
- };
- TreeEntry *VE = getSameValuesTreeEntry(S.getMainOp(), VL);
- if (VE && CheckSameVE(VE))
+ if (TreeEntry *VE = getSameValuesTreeEntry(S.getMainOp(), VL);
+ VE && VE->UserTreeIndex.UserTE == E &&
+ VE->UserTreeIndex.EdgeIdx == NodeIdx)
return VE;
return nullptr;
}
@@ -14672,8 +14843,15 @@ BoUpSLP::TreeEntry *BoUpSLP::getMatchedVectorizedOperand(const TreeEntry *E,
Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
bool PostponedPHIs) {
ValueList &VL = E->getOperand(NodeIdx);
+ InstructionsState S = getSameOpcode(VL, *TLI);
+ // Special processing for GEPs bundle, which may include non-gep values.
+ if (!S && VL.front()->getType()->isPointerTy()) {
+ const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
+ if (It != VL.end())
+ S = getSameOpcode(*It, *TLI);
+ }
const unsigned VF = VL.size();
- if (TreeEntry *VE = getMatchedVectorizedOperand(E, NodeIdx)) {
+ if (TreeEntry *VE = getMatchedVectorizedOperand(E, NodeIdx, VL, S)) {
auto FinalShuffle = [&](Value *V, ArrayRef<int> Mask) {
// V may be affected by MinBWs.
// We want ShuffleInstructionBuilder to correctly support REVEC. The key
@@ -14738,14 +14916,13 @@ Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
// Need to update the operand gather node, if actually the operand is not a
// vectorized node, but the buildvector/gather node, which matches one of
// the vectorized nodes.
- if (find_if(VE->UserTreeIndices, [&](const EdgeInfo &EI) {
- return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
- }) == VE->UserTreeIndices.end()) {
- auto *It =
- find_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
- return TE->isGather() && TE->UserTreeIndices.front().UserTE == E &&
- TE->UserTreeIndices.front().EdgeIdx == NodeIdx;
- });
+ if (VE->UserTreeIndex.UserTE != E || VE->UserTreeIndex.EdgeIdx != NodeIdx) {
+ auto *It = find_if(ArrayRef(VectorizableTree).drop_front(E->Idx + 1),
+ [&](const std::unique_ptr<TreeEntry> &TE) {
+ return TE->isGather() &&
+ TE->UserTreeIndex.UserTE == E &&
+ TE->UserTreeIndex.EdgeIdx == NodeIdx;
+ });
assert(It != VectorizableTree.end() && "Expected gather node operand.");
(*It)->VectorizedValue = V;
}
@@ -14755,12 +14932,12 @@ Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
// Find the corresponding gather entry and vectorize it.
// Allows to be more accurate with tree/graph transformations, checks for the
// correctness of the transformations in many cases.
- auto *I = find_if(VectorizableTree,
+ auto *I = find_if(ArrayRef(VectorizableTree).drop_front(E->Idx + 1),
[E, NodeIdx](const std::unique_ptr<TreeEntry> &TE) {
return TE->isOperandGatherNode({E, NodeIdx});
});
assert(I != VectorizableTree.end() && "Gather node is not in the graph.");
- assert(I->get()->UserTreeIndices.size() == 1 &&
+ assert(I->get()->UserTreeIndex &&
"Expected only single user for the gather node.");
assert(I->get()->isSame(VL) && "Expected same list of scalars.");
return vectorizeTree(I->get(), PostponedPHIs);
@@ -14812,17 +14989,16 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
return isa<UndefValue>(V) && !isa<PoisonValue>(V);
}))
return false;
- TreeEntry *UserTE = E->UserTreeIndices.back().UserTE;
- unsigned EdgeIdx = E->UserTreeIndices.back().EdgeIdx;
+ TreeEntry *UserTE = E->UserTreeIndex.UserTE;
+ unsigned EdgeIdx = E->UserTreeIndex.EdgeIdx;
if (UserTE->getNumOperands() != 2)
return false;
if (!IsNotPoisonous) {
- auto *It =
- find_if(VectorizableTree, [=](const std::unique_ptr<TreeEntry> &TE) {
- return find_if(TE->UserTreeIndices, [=](const EdgeInfo &EI) {
- return EI.UserTE == UserTE && EI.EdgeIdx != EdgeIdx;
- }) != TE->UserTreeIndices.end();
- });
+ auto *It = find_if(ArrayRef(VectorizableTree).drop_front(UserTE->Idx + 1),
+ [=](const std::unique_ptr<TreeEntry> &TE) {
+ return TE->UserTreeIndex.UserTE == UserTE &&
+ TE->UserTreeIndex.EdgeIdx != EdgeIdx;
+ });
if (It == VectorizableTree.end())
return false;
SmallVector<Value *> GS((*It)->Scalars.begin(), (*It)->Scalars.end());
@@ -15036,15 +15212,12 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
auto *It = find_if(Scalars, [this, E](Value *V) {
return !isa<UndefValue>(V) &&
(isVectorized(V) || isGuaranteedNotToBePoison(V, AC) ||
- (E->UserTreeIndices.size() == 1 &&
- any_of(V->uses(), [E](const Use &U) {
+ (E->UserTreeIndex && any_of(V->uses(), [E](const Use &U) {
// Check if the value already used in the same operation in
// one of the nodes already.
- return E->UserTreeIndices.front().EdgeIdx !=
- U.getOperandNo() &&
- is_contained(
- E->UserTreeIndices.front().UserTE->Scalars,
- U.getUser());
+ return E->UserTreeIndex.EdgeIdx != U.getOperandNo() &&
+ is_contained(E->UserTreeIndex.UserTE->Scalars,
+ U.getUser());
})));
});
if (It != Scalars.end()) {
@@ -15356,8 +15529,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
switch (ShuffleOrOp) {
case Instruction::PHI: {
assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
- E != VectorizableTree.front().get() ||
- !E->UserTreeIndices.empty()) &&
+ E != VectorizableTree.front().get() || E->UserTreeIndex) &&
"PHI reordering is free.");
if (PostponedPHIs && E->VectorizedValue)
return E->VectorizedValue;
@@ -16336,8 +16508,8 @@ BoUpSLP::vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
if (GatheredLoadsEntriesFirst.has_value() &&
TE->Idx >= *GatheredLoadsEntriesFirst &&
- (!TE->isGather() || !TE->UserTreeIndices.empty())) {
- assert((!TE->UserTreeIndices.empty() ||
+ (!TE->isGather() || TE->UserTreeIndex)) {
+ assert((TE->UserTreeIndex ||
(TE->getOpcode() == Instruction::Load && !TE->isGather())) &&
"Expected gathered load node.");
(void)vectorizeTree(TE.get(), /*PostponedPHIs=*/false);
@@ -16356,17 +16528,9 @@ BoUpSLP::vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
DenseMap<Value *, SmallVector<TreeEntry *>> PostponedValues;
for (const TreeEntry *E : PostponedNodes) {
auto *TE = const_cast<TreeEntry *>(E);
- if (auto *VecTE = getSameValuesTreeEntry(
- TE->Scalars.front(), TE->UserTreeIndices.front().UserTE->getOperand(
- TE->UserTreeIndices.front().EdgeIdx));
- VecTE && VecTE->isSame(TE->Scalars))
- // Found gather node which is absolutely the same as one of the
- // vectorized nodes. It may happen after reordering.
- continue;
auto *PrevVec = cast<Instruction>(TE->VectorizedValue);
TE->VectorizedValue = nullptr;
- auto *UserI =
- cast<Instruction>(TE->UserTreeIndices.front().UserTE->VectorizedValue);
+ auto *UserI = cast<Instruction>(TE->UserTreeIndex.UserTE->VectorizedValue);
// If user is a PHI node, its vector code have to be inserted right before
// block terminator. Since the node was delayed, there were some unresolved
// dependencies at the moment when stab instruction was emitted. In a case
@@ -16438,7 +16602,7 @@ BoUpSLP::vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
}
if (IsSigned.value_or(false)) {
// Final attempt - check user node.
- auto It = MinBWs.find(TE->UserTreeIndices.front().UserTE);
+ auto It = MinBWs.find(TE->UserTreeIndex.UserTE);
if (It != MinBWs.end())
IsSigned = It->second.second;
}
@@ -16904,15 +17068,11 @@ BoUpSLP::vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
for (Instruction *I : RemovedInsts) {
const TreeEntry *IE = getTreeEntries(I).front();
if (IE->Idx != 0 &&
- !(VectorizableTree.front()->isGather() &&
- !IE->UserTreeIndices.empty() &&
+ !(VectorizableTree.front()->isGather() && IE->UserTreeIndex &&
(ValueToGatherNodes.lookup(I).contains(
VectorizableTree.front().get()) ||
- any_of(IE->UserTreeIndices,
- [&](const EdgeInfo &EI) {
- return EI.UserTE == VectorizableTree.front().get() &&
- EI.EdgeIdx == UINT_MAX;
- }))) &&
+ (IE->UserTreeIndex.UserTE == VectorizableTree.front().get() &&
+ IE->UserTreeIndex.EdgeIdx == UINT_MAX))) &&
!(GatheredLoadsEntriesFirst.has_value() &&
IE->Idx >= *GatheredLoadsEntriesFirst &&
VectorizableTree.front()->isGather() &&
@@ -17820,6 +17980,18 @@ bool BoUpSLP::collectValuesToDemote(
E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(BitWidth)));
// Demote gathers.
if (Res && E.isGather()) {
+ if (E.hasState()) {
+ if (const TreeEntry *SameTE =
+ getSameValuesTreeEntry(E.getMainOp(), E.Scalars);
+ SameTE)
+ if (collectValuesToDemote(*SameTE, IsProfitableToDemoteRoot, BitWidth,
+ ToDemote, Visited, NodesToKeepBWs,
+ MaxDepthLevel, IsProfitableToDemote,
+ IsTruncRoot)) {
+ ToDemote.push_back(E.Idx);
+ return true;
+ }
+ }
// Check possible extractelement instructions bases and final vector
// length.
SmallPtrSet<Value *, 4> UniqueBases;
@@ -17832,13 +18004,15 @@ bool BoUpSLP::collectValuesToDemote(
const unsigned VF = E.Scalars.size();
Type *OrigScalarTy = E.Scalars.front()->getType();
if (UniqueBases.size() <= 2 ||
- ::getNumberOfParts(*TTI, getWidenedType(OrigScalarTy, VF)) ==
+ ::getNumberOfParts(*TTI, getWidenedType(OrigScalarTy, VF)) >=
::getNumberOfParts(
*TTI,
getWidenedType(
IntegerType::get(OrigScalarTy->getContext(), BitWidth),
- VF)))
+ VF))) {
ToDemote.push_back(E.Idx);
+ return true;
+ }
}
return Res;
};
@@ -17914,12 +18088,6 @@ bool BoUpSLP::collectValuesToDemote(
(void)for_each(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
std::ref(BitWidth)));
} else {
- // Several vectorized uses? Check if we can truncate it, otherwise -
- // exit.
- if (E.UserTreeIndices.size() > 1 &&
- !all_of(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
- std::ref(BitWidth))))
- return false;
bool NeedToExit = false;
if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
return false;
@@ -18162,11 +18330,9 @@ void BoUpSLP::computeMinimumValueSizes() {
// Ensure the roots of the vectorizable tree don't form a cycle.
if (VectorizableTree[NodeIdx]->isGather() ||
- (NodeIdx == 0 && !VectorizableTree[NodeIdx]->UserTreeIndices.empty()) ||
- (NodeIdx != 0 && any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
- [NodeIdx](const EdgeInfo &EI) {
- return EI.UserTE->Idx > NodeIdx;
- })))
+ (NodeIdx == 0 && VectorizableTree[NodeIdx]->UserTreeIndex) ||
+ (NodeIdx != 0 &&
+ VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->Idx > NodeIdx))
return;
// The first value node for store/insertelement is sext/zext/trunc? Skip it,
@@ -18196,7 +18362,7 @@ void BoUpSLP::computeMinimumValueSizes() {
ToDemote.clear();
// Check if the root is trunc and the next node is gather/buildvector, then
// keep trunc in scalars, which is free in most cases.
- if (E.isGather() && IsTruncRoot && E.UserTreeIndices.size() == 1 &&
+ if (E.isGather() && IsTruncRoot && E.UserTreeIndex &&
!NodesToKeepBWs.contains(E.Idx) &&
E.Idx > (IsStoreOrInsertElt ? 2u : 1u) &&
all_of(E.Scalars, [&](Value *V) {
@@ -18204,7 +18370,7 @@ void BoUpSLP::computeMinimumValueSizes() {
(!V->hasNUsesOrMore(UsesLimit) &&
none_of(V->users(), [&](User *U) {
ArrayRef<TreeEntry *> TEs = getTreeEntries(U);
- const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE;
+ const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
if (TEs.empty() || is_contained(TEs, UserTE))
return false;
if (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
@@ -18224,7 +18390,7 @@ void BoUpSLP::computeMinimumValueSizes() {
}));
})) {
ToDemote.push_back(E.Idx);
- const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE;
+ const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
auto It = MinBWs.find(UserTE);
if (It != MinBWs.end())
return It->second.first;
@@ -18440,29 +18606,26 @@ void BoUpSLP::computeMinimumValueSizes() {
NodeIdx = NewIdx;
IsTruncRoot =
NodeIdx < VectorizableTree.size() &&
- any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
- [](const EdgeInfo &EI) {
- return EI.EdgeIdx == 0 &&
- EI.UserTE->getOpcode() == Instruction::Trunc &&
- !EI.UserTE->isAltShuffle();
- });
+ VectorizableTree[NodeIdx]->UserTreeIndex &&
+ VectorizableTree[NodeIdx]->UserTreeIndex.EdgeIdx == 0 &&
+ VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
+ Instruction::Trunc &&
+ !VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->isAltShuffle();
IsSignedCmp =
NodeIdx < VectorizableTree.size() &&
- any_of(
- VectorizableTree[NodeIdx]->UserTreeIndices,
- [&](const EdgeInfo &EI) {
- return (EI.UserTE->hasState() &&
- EI.UserTE->getOpcode() == Instruction::ICmp) &&
- any_of(EI.UserTE->Scalars, [&](Value *V) {
- auto *IC = dyn_cast<ICmpInst>(V);
- return IC &&
- (IC->isSigned() ||
+ VectorizableTree[NodeIdx]->UserTreeIndex &&
+ VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
+ VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
+ Instruction::ICmp &&
+ any_of(VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->Scalars,
+ [&](Value *V) {
+ auto *IC = dyn_cast<ICmpInst>(V);
+ return IC && (IC->isSigned() ||
!isKnownNonNegative(IC->getOperand(0),
SimplifyQuery(*DL)) ||
!isKnownNonNegative(IC->getOperand(1),
SimplifyQuery(*DL)));
- });
- });
+ });
}
// If the maximum bit width we compute is less than the width of the roots'
@@ -18657,8 +18820,10 @@ SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
Size = R.getCanonicalGraphSize();
return false;
}
- R.reorderTopToBottom();
- R.reorderBottomToTop();
+ if (R.isProfitableToReorder()) {
+ R.reorderTopToBottom();
+ R.reorderBottomToTop();
+ }
R.transformNodes();
R.buildExternalUses();
@@ -18715,7 +18880,7 @@ static bool checkTreeSizes(ArrayRef<std::pair<unsigned, unsigned>> Sizes,
return V + (P - Mean) * (P - Mean);
}) /
Num;
- return Dev * 81 / (Mean * Mean) == 0;
+ return Dev * 96 / (Mean * Mean) == 0;
}
bool SLPVectorizerPass::vectorizeStores(
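For reference on the constant in the hunk above: Dev is the (integer) variance
of the candidate tree sizes, so the check Dev * 96 / (Mean * Mean) == 0 holds
exactly when Dev < Mean * Mean / 96, i.e. when the relative standard deviation
sqrt(Dev) / Mean stays below 1/sqrt(96), about 10.2%; the previous constant 81
allowed up to 1/9, about 11.1%.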
@@ -19251,10 +19416,10 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
R.buildTree(Ops);
if (R.isTreeTinyAndNotFullyVectorizable())
continue;
- R.reorderTopToBottom();
- R.reorderBottomToTop(
- /*IgnoreReorder=*/!isa<InsertElementInst>(Ops.front()) &&
- !R.doesRootHaveInTreeUses());
+ if (R.isProfitableToReorder()) {
+ R.reorderTopToBottom();
+ R.reorderBottomToTop(!isa<InsertElementInst>(Ops.front()));
+ }
R.transformNodes();
R.buildExternalUses();
@@ -20184,7 +20349,7 @@ class HorizontalReduction {
}
V.reorderTopToBottom();
// No need to reorder the root node at all.
- V.reorderBottomToTop(!V.doesRootHaveInTreeUses());
+ V.reorderBottomToTop(/*IgnoreReorder=*/true);
// Keep extracted other reduction values, if they are used in the
// vectorization trees.
BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues(
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll
index 7c5f9847db1f4..8341895ba8268 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll
@@ -17,7 +17,7 @@ target triple = "aarch64--linux"
; YAML-NEXT: - String: 'Vectorized horizontal reduction with cost '
; YAML-NEXT: - Cost: '-19'
; YAML-NEXT: - String: ' and with tree size '
-; YAML-NEXT: - TreeSize: '8'
+; YAML-NEXT: - TreeSize: '10'
define i32 @test_select(ptr noalias nocapture readonly %blk1, ptr noalias nocapture readonly %blk2, i32 %lx, i32 %h) {
; CHECK-LABEL: @test_select(
@@ -230,7 +230,7 @@ for.end: ; preds = %for.end.loopexit, %
; YAML-NEXT: - String: 'Vectorized horizontal reduction with cost '
; YAML-NEXT: - Cost: '-41'
; YAML-NEXT: - String: ' and with tree size '
-; YAML-NEXT: - TreeSize: '10'
+; YAML-NEXT: - TreeSize: '12'
define i32 @test_unrolled_select(ptr noalias nocapture readonly %blk1, ptr noalias nocapture readonly %blk2, i32 %lx, i32 %h, i32 %lim) #0 {
; CHECK-LABEL: @test_unrolled_select(
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/remarks.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/remarks.ll
index 219496fc1ac9b..dcdfc6efbfb92 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/remarks.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/remarks.ll
@@ -7,7 +7,7 @@ define void @f(ptr %r, ptr %w) {
%add0 = fadd double %f0, %f0
%add1 = fadd double %f1, %f1
%w1 = getelementptr inbounds double, ptr %w, i64 1
-; CHECK: remark: /tmp/s.c:5:10: Stores SLP vectorized with cost -3 and with tree size 3
+; CHECK: remark: /tmp/s.c:5:10: Stores SLP vectorized with cost -3 and with tree size 4
store double %add0, ptr %w, !dbg !9
store double %add1, ptr %w1
ret void
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/reorder-fmuladd-crash.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/reorder-fmuladd-crash.ll
index f5e904467baa7..b1bd2546c26f4 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/reorder-fmuladd-crash.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/reorder-fmuladd-crash.ll
@@ -6,7 +6,7 @@ define i32 @foo(i32 %v1, double %v2, i1 %arg, i32 %arg2) {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> <i32 poison, i32 undef>, i32 [[V1:%.*]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = sitofp <2 x i32> [[TMP0]] to <2 x double>
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
; CHECK-NEXT: br label [[FOR_COND15_PREHEADER:%.*]]
; CHECK: for.cond15.preheader:
; CHECK-NEXT: br label [[IF_END:%.*]]
@@ -15,7 +15,7 @@ define i32 @foo(i32 %v1, double %v2, i1 %arg, i32 %arg2) {
; CHECK: if.end:
; CHECK-NEXT: br label [[FOR_COND15:%.*]]
; CHECK: for.end39:
-; CHECK-NEXT: switch i32 %arg2, label [[DO_BODY:%.*]] [
+; CHECK-NEXT: switch i32 [[ARG2:%.*]], label [[DO_BODY:%.*]] [
; CHECK-NEXT: i32 0, label [[SW_BB:%.*]]
; CHECK-NEXT: i32 1, label [[SW_BB195:%.*]]
; CHECK-NEXT: ]
@@ -26,8 +26,7 @@ define i32 @foo(i32 %v1, double %v2, i1 %arg, i32 %arg2) {
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
; CHECK-NEXT: [[TMP6:%.*]] = load <4 x double>, ptr [[ARRAYIDX43]], align 8
; CHECK-NEXT: [[TMP7:%.*]] = fmul <4 x double> [[TMP6]], [[TMP5]]
-; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> poison, <4 x i32> <i32 3, i32 1, i32 2, i32 0>
-; CHECK-NEXT: [[TMP9:%.*]] = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> undef, <4 x double> [[TMP2]], <4 x double> [[TMP8]])
+; CHECK-NEXT: [[TMP9:%.*]] = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> undef, <4 x double> [[TMP2]], <4 x double> [[TMP7]])
; CHECK-NEXT: br label [[SW_EPILOG:%.*]]
; CHECK: sw.bb195:
; CHECK-NEXT: br label [[SW_EPILOG]]
@@ -39,7 +38,7 @@ define i32 @foo(i32 %v1, double %v2, i1 %arg, i32 %arg2) {
; CHECK: if.end.1:
; CHECK-NEXT: br label [[FOR_COND15_1:%.*]]
; CHECK: for.cond15.1:
-; CHECK-NEXT: br i1 %arg, label [[FOR_END39:%.*]], label [[FOR_COND15_PREHEADER]]
+; CHECK-NEXT: br i1 [[ARG:%.*]], label [[FOR_END39:%.*]], label [[FOR_COND15_PREHEADER]]
;
entry:
%conv = sitofp i32 undef to double
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/remarks_cmp_sel_min_max.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/remarks_cmp_sel_min_max.ll
index 644d645b9dc88..ff182ae3f56de 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/remarks_cmp_sel_min_max.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/remarks_cmp_sel_min_max.ll
@@ -10,7 +10,7 @@
; YAML-NEXT: - String: 'Stores SLP vectorized with cost '
; YAML-NEXT: - Cost: '-1'
; YAML-NEXT: - String: ' and with tree size '
-; YAML-NEXT: - TreeSize: '6'
+; YAML-NEXT: - TreeSize: '7'
define i32 @min_double(ptr noalias nocapture %A, ptr noalias nocapture %B) {
; CHECK-LABEL: @min_double(
; CHECK-NEXT: entry:
@@ -44,7 +44,7 @@ entry:
; YAML-NEXT: - String: 'Stores SLP vectorized with cost '
; YAML-NEXT: - Cost: '-2'
; YAML-NEXT: - String: ' and with tree size '
-; YAML-NEXT: - TreeSize: '6'
+; YAML-NEXT: - TreeSize: '7'
define i32 @min_float(ptr noalias nocapture %A, ptr noalias nocapture %B) {
; CHECK-LABEL: @min_float(
; CHECK-NEXT: entry:
@@ -78,7 +78,7 @@ entry:
; YAML-NEXT: - String: 'Stores SLP vectorized with cost '
; YAML-NEXT: - Cost: '-1'
; YAML-NEXT: - String: ' and with tree size '
-; YAML-NEXT: - TreeSize: '6'
+; YAML-NEXT: - TreeSize: '7'
define i32 @max_double(ptr noalias nocapture %A, ptr noalias nocapture %B) {
; CHECK-LABEL: @max_double(
; CHECK-NEXT: entry:
@@ -112,7 +112,7 @@ entry:
; YAML-NEXT: - String: 'Stores SLP vectorized with cost '
; YAML-NEXT: - Cost: '-2'
; YAML-NEXT: - String: ' and with tree size '
-; YAML-NEXT: - TreeSize: '6'
+; YAML-NEXT: - TreeSize: '7'
define i32 @max_float(ptr noalias nocapture %A, ptr noalias nocapture %B) {
; CHECK-LABEL: @max_float(
; CHECK-NEXT: entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/SLP-cmp-cost-query.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/SLP-cmp-cost-query.ll
index b80be40d9fc86..fc62f4d511041 100644
--- a/llvm/test/Transforms/SLPVectorizer/SystemZ/SLP-cmp-cost-query.ll
+++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/SLP-cmp-cost-query.ll
@@ -31,6 +31,6 @@ define void @fun(ptr nocapture, i32 zeroext) local_unnamed_addr #0 {
._crit_edge: ; preds = %.lr.ph
ret void
-; CHECK: SLP: Adding cost -1 for bundle Idx: 3, n=2 [ %4 = icmp ult i32 %2, %1, ..]
+; CHECK: SLP: Adding cost -1 for bundle Idx: 4, n=2 [ %4 = icmp ult i32 %2, %1, ..]
}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/cmp-after-intrinsic-call-minbitwidth.ll b/llvm/test/Transforms/SLPVectorizer/X86/cmp-after-intrinsic-call-minbitwidth.ll
index 9fa88084aaa0a..a05d4fdd6315b 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/cmp-after-intrinsic-call-minbitwidth.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/cmp-after-intrinsic-call-minbitwidth.ll
@@ -5,14 +5,12 @@ define void @test() {
; CHECK-LABEL: define void @test(
; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = call <2 x i2> @llvm.smin.v2i2(<2 x i2> zeroinitializer, <2 x i2> zeroinitializer)
-; CHECK-NEXT: [[TMP1:%.*]] = select <2 x i1> zeroinitializer, <2 x i2> zeroinitializer, <2 x i2> [[TMP0]]
-; CHECK-NEXT: [[TMP2:%.*]] = or <2 x i2> [[TMP1]], zeroinitializer
-; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i2> [[TMP2]], i32 1
-; CHECK-NEXT: [[ADD:%.*]] = zext i2 [[TMP3]] to i32
+; CHECK-NEXT: [[TMP0:%.*]] = call <2 x i32> @llvm.smin.v2i32(<2 x i32> zeroinitializer, <2 x i32> zeroinitializer)
+; CHECK-NEXT: [[TMP1:%.*]] = select <2 x i1> zeroinitializer, <2 x i32> zeroinitializer, <2 x i32> [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = or <2 x i32> [[TMP1]], zeroinitializer
+; CHECK-NEXT: [[ADD:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
; CHECK-NEXT: [[SHR:%.*]] = ashr i32 [[ADD]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i2> [[TMP2]], i32 0
-; CHECK-NEXT: [[ADD45:%.*]] = zext i2 [[TMP5]] to i32
+; CHECK-NEXT: [[ADD45:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0
; CHECK-NEXT: [[ADD152:%.*]] = or i32 [[ADD45]], [[ADD]]
; CHECK-NEXT: [[IDXPROM153:%.*]] = sext i32 [[ADD152]] to i64
; CHECK-NEXT: [[ARRAYIDX154:%.*]] = getelementptr i8, ptr null, i64 [[IDXPROM153]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/full-matched-bv-with-subvectors.ll b/llvm/test/Transforms/SLPVectorizer/X86/full-matched-bv-with-subvectors.ll
index 7576eb7a8f55e..878b2370bfd2a 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/full-matched-bv-with-subvectors.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/full-matched-bv-with-subvectors.ll
@@ -7,29 +7,29 @@ define i32 @test(i64 %l.549) {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: [[CONV3:%.*]] = sext i32 0 to i64
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i64> poison, i64 [[CONV3]], i32 3
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 0, i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> [[TMP3]], i64 0, i32 1
; CHECK-NEXT: br label %[[IF_THEN19:.*]]
; CHECK: [[P:.*]]:
-; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i64> [ zeroinitializer, %[[IF_END29:.*]] ], [ [[TMP13:%.*]], %[[IF_END25:.*]] ]
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
+; CHECK-NEXT: [[TMP5:%.*]] = phi <2 x i64> [ zeroinitializer, %[[IF_END29:.*]] ], [ [[TMP13:%.*]], %[[IF_END25:.*]] ]
+; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> poison, <4 x i32> <i32 1, i32 0, i32 0, i32 1>
; CHECK-NEXT: br i1 false, label %[[S:.*]], label %[[Q:.*]]
; CHECK: [[Q]]:
; CHECK-NEXT: [[XOR39:%.*]] = phi i64 [ 0, %[[P]] ], [ 0, %[[LAND_LHS_TRUE:.*]] ]
-; CHECK-NEXT: [[TMP3:%.*]] = phi <2 x i64> [ zeroinitializer, %[[P]] ], [ zeroinitializer, %[[LAND_LHS_TRUE]] ]
+; CHECK-NEXT: [[TMP6:%.*]] = phi <2 x i64> [ zeroinitializer, %[[P]] ], [ zeroinitializer, %[[LAND_LHS_TRUE]] ]
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i64> [[TMP0]], i64 [[XOR39]], i32 2
-; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> [[TMP4]], <2 x i64> [[TMP3]], i64 0)
+; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> [[TMP4]], <2 x i64> [[TMP6]], i64 0)
; CHECK-NEXT: br i1 false, label %[[LOR_LHS_FALSE:.*]], label %[[R:.*]]
; CHECK: [[LOR_LHS_FALSE]]:
-; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> poison, <4 x i32> <i32 0, i32 1, i32 3, i32 2>
; CHECK-NEXT: br i1 false, label %[[LAND_LHS_TRUE]], label %[[S]]
; CHECK: [[R]]:
-; CHECK-NEXT: [[TMP7:%.*]] = phi <4 x i64> [ [[TMP5]], %[[Q]] ], [ [[TMP16:%.*]], %[[IF_THEN19]] ]
-; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> poison, <4 x i32> <i32 0, i32 1, i32 3, i32 2>
+; CHECK-NEXT: [[TMP8:%.*]] = phi <4 x i64> [ [[TMP7]], %[[Q]] ], [ [[TMP16:%.*]], %[[IF_THEN19]] ]
; CHECK-NEXT: br i1 false, label %[[S]], label %[[LAND_LHS_TRUE]]
; CHECK: [[LAND_LHS_TRUE]]:
; CHECK-NEXT: [[TMP9:%.*]] = phi <4 x i64> [ [[TMP8]], %[[R]] ], [ zeroinitializer, %[[LOR_LHS_FALSE]] ]
; CHECK-NEXT: br i1 false, label %[[Q]], label %[[S]]
; CHECK: [[S]]:
-; CHECK-NEXT: [[TMP10:%.*]] = phi <4 x i64> [ [[TMP9]], %[[LAND_LHS_TRUE]] ], [ [[TMP8]], %[[R]] ], [ [[TMP6]], %[[LOR_LHS_FALSE]] ], [ [[TMP2]], %[[P]] ]
+; CHECK-NEXT: [[TMP10:%.*]] = phi <4 x i64> [ [[TMP9]], %[[LAND_LHS_TRUE]] ], [ [[TMP8]], %[[R]] ], [ [[TMP7]], %[[LOR_LHS_FALSE]] ], [ [[TMP17]], %[[P]] ]
; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP10]], <4 x i64> poison, <2 x i32> <i32 0, i32 1>
; CHECK-NEXT: br label %[[IF_THEN19]]
; CHECK: [[IF_THEN19]]:
@@ -37,7 +37,7 @@ define i32 @test(i64 %l.549) {
; CHECK-NEXT: [[TMP13]] = shufflevector <2 x i64> [[TMP12]], <2 x i64> poison, <2 x i32> <i32 1, i32 0>
; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <2 x i64> [[TMP12]], <2 x i64> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x i64> [[TMP14]], i64 [[L_549]], i32 1
-; CHECK-NEXT: [[TMP16]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> [[TMP15]], <2 x i64> zeroinitializer, i64 2)
+; CHECK-NEXT: [[TMP16]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> [[TMP15]], <2 x i64> [[TMP2]], i64 2)
; CHECK-NEXT: br i1 false, label %[[R]], label %[[IF_END25]]
; CHECK: [[IF_END25]]:
; CHECK-NEXT: br i1 false, label %[[IF_END29]], label %[[P]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/gather-node-same-as-vect-but-order.ll b/llvm/test/Transforms/SLPVectorizer/X86/gather-node-same-as-vect-but-order.ll
index a7a92bad5e5c1..19c29be1ef384 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/gather-node-same-as-vect-but-order.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/gather-node-same-as-vect-but-order.ll
@@ -11,21 +11,19 @@ define void @foo(ptr %i7, i32 %0, i1 %tobool62.not) {
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i32 0
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP4:%.*]] = sitofp <2 x i32> [[TMP3]] to <2 x float>
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
; CHECK-NEXT: [[Y0:%.*]] = getelementptr i8, ptr [[RC21]], i64 8
; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[Y0]], align 4
; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[I7]], align 4
; CHECK-NEXT: [[TMP8:%.*]] = load <2 x float>, ptr [[RC21]], align 4
-; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x float> poison, float [[TMP7]], i32 2
-; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x float> [[TMP9]], float [[TMP6]], i32 3
-; CHECK-NEXT: [[TMP13:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP10]], <2 x float> [[TMP11]], i64 0)
+; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x float> poison, float [[TMP6]], i32 2
+; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x float> [[TMP10]], float [[TMP7]], i32 3
+; CHECK-NEXT: [[TMP13:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP9]], <2 x float> [[TMP8]], i64 0)
; CHECK-NEXT: [[TMP12:%.*]] = fcmp olt <4 x float> [[TMP13]], zeroinitializer
; CHECK-NEXT: [[TMP14:%.*]] = fcmp olt <4 x float> [[TMP5]], zeroinitializer
; CHECK-NEXT: [[TMP15:%.*]] = select <4 x i1> [[TMP14]], <4 x float> [[TMP5]], <4 x float> zeroinitializer
; CHECK-NEXT: [[TMP16:%.*]] = select <4 x i1> [[TMP12]], <4 x float> zeroinitializer, <4 x float> [[TMP15]]
-; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <4 x float> [[TMP16]], <4 x float> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-NEXT: store <4 x float> [[TMP27]], ptr [[RC21]], align 4
+; CHECK-NEXT: store <4 x float> [[TMP16]], ptr [[RC21]], align 4
; CHECK-NEXT: br label [[IF_END:%.*]]
; CHECK: entry.if.end72_crit_edge:
; CHECK-NEXT: br label [[IF_END72:%.*]]
@@ -48,7 +46,8 @@ define void @foo(ptr %i7, i32 %0, i1 %tobool62.not) {
; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP25:%.*]] = mul <4 x i32> [[TMP23]], [[TMP24]]
; CHECK-NEXT: [[TMP26:%.*]] = sitofp <4 x i32> [[TMP25]] to <4 x float>
-; CHECK-NEXT: store <4 x float> [[TMP26]], ptr [[RC21]], align 4
+; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <4 x float> [[TMP26]], <4 x float> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-NEXT: store <4 x float> [[TMP27]], ptr [[RC21]], align 4
; CHECK-NEXT: ret void
;
entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll b/llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll
index 63b41627106e5..53bffe502f3da 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll
@@ -13,7 +13,7 @@ define dso_local i32 @g() local_unnamed_addr {
; CHECK: while.body:
; CHECK-NEXT: [[A_020:%.*]] = phi ptr [ [[A_020_BE:%.*]], [[WHILE_BODY_BACKEDGE:%.*]] ], [ undef, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x ptr> [ [[TMP14:%.*]], [[WHILE_BODY_BACKEDGE]] ], [ undef, [[ENTRY]] ]
-; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 1
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 0
; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i64 1
; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[TMP2]] to i64
; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
@@ -24,7 +24,7 @@ define dso_local i32 @g() local_unnamed_addr {
; CHECK-NEXT: i32 4, label [[SW_BB6:%.*]]
; CHECK-NEXT: ]
; CHECK: sw.bb:
-; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x ptr> [[TMP5]], i32 0
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x ptr> [[TMP5]], i32 1
; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[TMP6]] to i64
; CHECK-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP7]] to i32
; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, ptr [[A_020]], i64 2
@@ -36,7 +36,7 @@ define dso_local i32 @g() local_unnamed_addr {
; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[INCDEC_PTR]] to i64
; CHECK-NEXT: [[TMP11:%.*]] = trunc i64 [[TMP10]] to i32
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, <2 x ptr> [[TMP1]], <2 x i64> splat (i64 2)
-; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x ptr> [[TMP5]], i32 0
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x ptr> [[TMP5]], i32 1
; CHECK-NEXT: store i32 [[TMP11]], ptr [[TMP13]], align 4
; CHECK-NEXT: br label [[WHILE_BODY_BACKEDGE]]
; CHECK: while.body.backedge:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/matching-gather-nodes-phi-users.ll b/llvm/test/Transforms/SLPVectorizer/X86/matching-gather-nodes-phi-users.ll
index 166c819098c8c..7960278d2b21d 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/matching-gather-nodes-phi-users.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/matching-gather-nodes-phi-users.ll
@@ -10,7 +10,7 @@
; YAML: - String: 'Stores SLP vectorized with cost '
; YAML: - Cost: '-6'
; YAML: - String: ' and with tree size '
-; YAML: - TreeSize: '14'
+; YAML: - TreeSize: '16'
; YAML: ...
; Test that SLP cost modeling is able to match gathering tree
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-icmp-to-trunc.ll b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-icmp-to-trunc.ll
index 58ea4f8da01a4..60c067e6555ef 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-icmp-to-trunc.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-icmp-to-trunc.ll
@@ -18,22 +18,17 @@ define i1 @test(ptr noalias %0, i64 %1, ptr noalias %p, ptr %p1) {
; CHECK-NEXT: [[TMP8:%.*]] = and <2 x i24> [[TMP7]], splat (i24 255)
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <2 x i24> [[TMP8]], splat (i24 24)
; CHECK-NEXT: [[TMP10:%.*]] = select <2 x i1> [[TMP9]], <2 x i24> splat (i24 23), <2 x i24> [[TMP8]]
-; CHECK-NEXT: [[TMP23:%.*]] = trunc <2 x i24> [[TMP10]] to <2 x i8>
-; CHECK-NEXT: [[TMP26:%.*]] = zext <2 x i8> [[TMP23]] to <2 x i32>
+; CHECK-NEXT: [[TMP26:%.*]] = zext <2 x i24> [[TMP10]] to <2 x i32>
; CHECK-NEXT: [[TMP13:%.*]] = and <2 x i32> [[TMP26]], splat (i32 254)
; CHECK-NEXT: [[TMP15:%.*]] = icmp eq <2 x i32> [[TMP13]], splat (i32 4)
-; CHECK-NEXT: [[TMP25:%.*]] = select <2 x i1> [[TMP15]], <2 x i8> splat (i8 2), <2 x i8> [[TMP23]]
-; CHECK-NEXT: [[TMP14:%.*]] = zext <2 x i8> [[TMP25]] to <2 x i32>
+; CHECK-NEXT: [[TMP14:%.*]] = select <2 x i1> [[TMP15]], <2 x i32> splat (i32 2), <2 x i32> [[TMP26]]
; CHECK-NEXT: [[TMP17:%.*]] = icmp eq <2 x i32> [[TMP14]], splat (i32 32)
-; CHECK-NEXT: [[TMP18:%.*]] = select <2 x i1> [[TMP17]], <2 x i8> splat (i8 31), <2 x i8> [[TMP25]]
-; CHECK-NEXT: [[TMP16:%.*]] = zext <2 x i8> [[TMP18]] to <2 x i32>
+; CHECK-NEXT: [[TMP16:%.*]] = select <2 x i1> [[TMP17]], <2 x i32> splat (i32 31), <2 x i32> [[TMP14]]
; CHECK-NEXT: [[TMP27:%.*]] = icmp eq <2 x i32> [[TMP16]], splat (i32 54)
-; CHECK-NEXT: [[TMP21:%.*]] = select <2 x i1> [[TMP27]], <2 x i8> splat (i8 53), <2 x i8> [[TMP18]]
-; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x i8> [[TMP21]], i32 0
-; CHECK-NEXT: [[TMP19:%.*]] = zext i8 [[TMP22]] to i32
+; CHECK-NEXT: [[TMP18:%.*]] = select <2 x i1> [[TMP27]], <2 x i32> splat (i32 53), <2 x i32> [[TMP16]]
+; CHECK-NEXT: [[TMP19:%.*]] = extractelement <2 x i32> [[TMP18]], i32 0
; CHECK-NEXT: store i32 [[TMP19]], ptr [[P1]], align 4
-; CHECK-NEXT: [[TMP24:%.*]] = extractelement <2 x i8> [[TMP21]], i32 1
-; CHECK-NEXT: [[TMP20:%.*]] = zext i8 [[TMP24]] to i32
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x i32> [[TMP18]], i32 1
; CHECK-NEXT: [[CMP210_NOT:%.*]] = icmp eq i32 [[TMP19]], [[TMP20]]
; CHECK-NEXT: ret i1 [[CMP210_NOT]]
;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/minbw-node-used-twice.ll b/llvm/test/Transforms/SLPVectorizer/X86/minbw-node-used-twice.ll
index d3d7f21ee1003..55f2b238c07df 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/minbw-node-used-twice.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/minbw-node-used-twice.ll
@@ -8,9 +8,9 @@ define i8 @test() {
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i16> <i16 poison, i16 0>, i16 [[SUB_I_I79_PEEL_I]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = zext <2 x i16> [[TMP0]] to <2 x i32>
; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <2 x i32> zeroinitializer, [[TMP2]]
-; CHECK-NEXT: [[TMP3:%.*]] = zext <2 x i1> [[TMP1]] to <2 x i32>
-; CHECK-NEXT: [[TMP4:%.*]] = or <2 x i32> [[TMP3]], [[TMP2]]
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <2 x i32> [[TMP4]], [[TMP2]]
+; CHECK-NEXT: [[TMP3:%.*]] = zext <2 x i1> [[TMP1]] to <2 x i16>
+; CHECK-NEXT: [[TMP4:%.*]] = or <2 x i16> [[TMP3]], [[TMP0]]
+; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <2 x i16> [[TMP4]], [[TMP0]]
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0
; CHECK-NEXT: [[CONV13_I89_PEEL_I:%.*]] = zext i1 [[TMP5]] to i8
; CHECK-NEXT: ret i8 [[CONV13_I89_PEEL_I]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reused-mask-with-poison-index.ll b/llvm/test/Transforms/SLPVectorizer/X86/reused-mask-with-poison-index.ll
index ea6d96147c951..efd44f5a85664 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reused-mask-with-poison-index.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reused-mask-with-poison-index.ll
@@ -14,9 +14,9 @@ define fastcc i32 @test(ptr %0, <2 x float> %1, i1 %2, float %3, float %4) {
; CHECK-NEXT: [[TMP12:%.*]] = phi float [ [[TMP7]], [[TMP5]] ], [ [[TMP61:%.*]], %[[TMP56]] ]
; CHECK-NEXT: [[TMP13:%.*]] = phi float [ [[TMP6]], [[TMP5]] ], [ [[TMP62:%.*]], %[[TMP56]] ]
; CHECK-NEXT: [[TMP14:%.*]] = phi float [ 0.000000e+00, [[TMP5]] ], [ [[TMP63:%.*]], %[[TMP56]] ]
-; CHECK-NEXT: [[TMP15:%.*]] = phi float [ 0.000000e+00, [[TMP5]] ], [ [[TMP64:%.*]], %[[TMP56]] ]
+; CHECK-NEXT: [[TMP17:%.*]] = phi float [ 0.000000e+00, [[TMP5]] ], [ [[TMP64:%.*]], %[[TMP56]] ]
; CHECK-NEXT: [[TMP16:%.*]] = phi float [ 0.000000e+00, [[TMP5]] ], [ [[TMP65:%.*]], %[[TMP56]] ]
-; CHECK-NEXT: [[TMP17:%.*]] = phi float [ undef, [[TMP5]] ], [ [[TMP66:%.*]], %[[TMP56]] ]
+; CHECK-NEXT: [[TMP15:%.*]] = phi float [ undef, [[TMP5]] ], [ [[TMP66:%.*]], %[[TMP56]] ]
; CHECK-NEXT: [[TMP18:%.*]] = phi float [ 0.000000e+00, [[TMP5]] ], [ [[TMP67:%.*]], %[[TMP56]] ]
; CHECK-NEXT: [[TMP19:%.*]] = phi float [ [[TMP4]], [[TMP5]] ], [ [[TMP68:%.*]], %[[TMP56]] ]
; CHECK-NEXT: [[TMP20:%.*]] = phi float [ [[TMP4]], [[TMP5]] ], [ [[TMP69:%.*]], %[[TMP56]] ]
@@ -37,15 +37,15 @@ define fastcc i32 @test(ptr %0, <2 x float> %1, i1 %2, float %3, float %4) {
; CHECK-NEXT: [[TMP32:%.*]] = insertelement <4 x float> poison, float [[TMP3]], i32 0
; CHECK-NEXT: [[TMP33:%.*]] = shufflevector <4 x float> [[TMP32]], <4 x float> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP34:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[TMP33]], <4 x float> zeroinitializer, <4 x float> zeroinitializer)
-; CHECK-NEXT: [[TMP35:%.*]] = extractelement <4 x float> [[TMP25]], i32 0
-; CHECK-NEXT: [[TMP36:%.*]] = fsub float [[TMP17]], [[TMP35]]
-; CHECK-NEXT: [[TMP37:%.*]] = extractelement <4 x float> [[TMP25]], i32 1
+; CHECK-NEXT: [[TMP37:%.*]] = extractelement <4 x float> [[TMP25]], i32 0
; CHECK-NEXT: [[TMP38:%.*]] = fsub float [[TMP15]], [[TMP37]]
+; CHECK-NEXT: [[TMP48:%.*]] = extractelement <4 x float> [[TMP25]], i32 1
+; CHECK-NEXT: [[TMP49:%.*]] = fsub float [[TMP17]], [[TMP48]]
; CHECK-NEXT: [[TMP39:%.*]] = insertelement <2 x float> zeroinitializer, float [[TMP14]], i64 0
; CHECK-NEXT: [[TMP40:%.*]] = shufflevector <4 x float> [[TMP25]], <4 x float> poison, <2 x i32> <i32 poison, i32 2>
; CHECK-NEXT: [[TMP41:%.*]] = shufflevector <2 x float> [[TMP39]], <2 x float> [[TMP40]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP42:%.*]] = insertelement <2 x float> zeroinitializer, float [[TMP38]], i64 0
-; CHECK-NEXT: [[TMP43:%.*]] = insertelement <2 x float> zeroinitializer, float [[TMP36]], i64 0
+; CHECK-NEXT: [[TMP42:%.*]] = insertelement <2 x float> zeroinitializer, float [[TMP49]], i64 0
+; CHECK-NEXT: [[TMP43:%.*]] = insertelement <2 x float> zeroinitializer, float [[TMP38]], i64 0
; CHECK-NEXT: [[TMP44:%.*]] = fmul <2 x float> [[TMP42]], [[TMP43]]
; CHECK-NEXT: [[TMP45:%.*]] = tail call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP41]], <2 x float> [[TMP26]], <2 x float> [[TMP44]])
; CHECK-NEXT: [[TMP46:%.*]] = extractelement <2 x float> [[TMP45]], i64 0
@@ -72,9 +72,9 @@ define fastcc i32 @test(ptr %0, <2 x float> %1, i1 %2, float %3, float %4) {
; CHECK-NEXT: [[TMP61]] = phi float [ [[TMP12]], %[[BB29]] ], [ 0.000000e+00, %[[BB27]] ], [ 0.000000e+00, %[[BB8]] ], [ 0.000000e+00, %[[BB53]] ], [ 0.000000e+00, %[[BB50]] ]
; CHECK-NEXT: [[TMP62]] = phi float [ [[TMP13]], %[[BB29]] ], [ 0.000000e+00, %[[BB27]] ], [ 0.000000e+00, %[[BB8]] ], [ [[TMP54]], %[[BB53]] ], [ 0.000000e+00, %[[BB50]] ]
; CHECK-NEXT: [[TMP63]] = phi float [ [[TMP14]], %[[BB29]] ], [ 0.000000e+00, %[[BB27]] ], [ 0.000000e+00, %[[BB8]] ], [ 0.000000e+00, %[[BB53]] ], [ [[TMP9]], %[[BB50]] ]
-; CHECK-NEXT: [[TMP64]] = phi float [ [[TMP15]], %[[BB29]] ], [ 0.000000e+00, %[[BB27]] ], [ 0.000000e+00, %[[BB8]] ], [ [[TMP55]], %[[BB53]] ], [ [[TMP10]], %[[BB50]] ]
+; CHECK-NEXT: [[TMP64]] = phi float [ [[TMP17]], %[[BB29]] ], [ 0.000000e+00, %[[BB27]] ], [ 0.000000e+00, %[[BB8]] ], [ [[TMP55]], %[[BB53]] ], [ [[TMP10]], %[[BB50]] ]
; CHECK-NEXT: [[TMP65]] = phi float [ [[TMP16]], %[[BB29]] ], [ 0.000000e+00, %[[BB27]] ], [ 0.000000e+00, %[[BB8]] ], [ 0.000000e+00, %[[BB53]] ], [ [[TMP11]], %[[BB50]] ]
-; CHECK-NEXT: [[TMP66]] = phi float [ [[TMP17]], %[[BB29]] ], [ 0.000000e+00, %[[BB27]] ], [ 0.000000e+00, %[[BB8]] ], [ 0.000000e+00, %[[BB53]] ], [ 0.000000e+00, %[[BB50]] ]
+; CHECK-NEXT: [[TMP66]] = phi float [ [[TMP15]], %[[BB29]] ], [ 0.000000e+00, %[[BB27]] ], [ 0.000000e+00, %[[BB8]] ], [ 0.000000e+00, %[[BB53]] ], [ 0.000000e+00, %[[BB50]] ]
; CHECK-NEXT: [[TMP67]] = phi float [ [[TMP18]], %[[BB29]] ], [ 0.000000e+00, %[[BB27]] ], [ 0.000000e+00, %[[BB8]] ], [ 0.000000e+00, %[[BB53]] ], [ 0.000000e+00, %[[BB50]] ]
; CHECK-NEXT: [[TMP68]] = phi float [ [[TMP19]], %[[BB29]] ], [ 0.000000e+00, %[[BB27]] ], [ 0.000000e+00, %[[BB8]] ], [ 0.000000e+00, %[[BB53]] ], [ [[TMP3]], %[[BB50]] ]
; CHECK-NEXT: [[TMP69]] = phi float [ [[TMP20]], %[[BB29]] ], [ 0.000000e+00, %[[BB27]] ], [ 0.000000e+00, %[[BB8]] ], [ [[TMP54]], %[[BB53]] ], [ 0.000000e+00, %[[BB50]] ]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder.ll b/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder.ll
index 242d66fda569a..648f051db4a52 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder.ll
@@ -7,9 +7,8 @@ define void @wombat(ptr %ptr, ptr %ptr1) {
; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[PTR1:%.*]], i32 3
; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[PTR:%.*]], align 8
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], splat (i32 -1)
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+; CHECK-NEXT: [[TMP2:%.*]] = add nsw <2 x i32> [[TMP0]], splat (i32 -1)
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
; CHECK-NEXT: [[TMP5:%.*]] = icmp sgt <4 x i32> [[TMP1]], undef
; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> undef, <4 x i32> [[TMP4]]
; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> zeroinitializer, <4 x i32> zeroinitializer, <4 x i32> [[TMP6]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/slp-schedule-use-order.ll b/llvm/test/Transforms/SLPVectorizer/X86/slp-schedule-use-order.ll
index de4358c47cfd0..eae0ed466b0c7 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/slp-schedule-use-order.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/slp-schedule-use-order.ll
@@ -9,9 +9,9 @@ define void @test() {
; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x i1> [ [[TMP6:%.*]], [[TMP1:%.*]] ], [ zeroinitializer, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[TMP1]]
; CHECK: 1:
-; CHECK-NEXT: [[TMP2:%.*]] = zext <2 x i1> [[TMP0]] to <2 x i8>
-; CHECK-NEXT: [[TMP3:%.*]] = and <2 x i8> zeroinitializer, [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = or <2 x i8> [[TMP3]], zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i1> zeroinitializer, [[TMP0]]
+; CHECK-NEXT: [[TMP3:%.*]] = or <2 x i1> [[TMP2]], zeroinitializer
+; CHECK-NEXT: [[TMP4:%.*]] = zext <2 x i1> [[TMP3]] to <2 x i8>
; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <2 x i8> [[TMP4]], zeroinitializer
; CHECK-NEXT: [[TMP6]] = and <2 x i1> [[TMP5]], zeroinitializer
; CHECK-NEXT: br label [[FOR_BODY]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/subvector-minbitwidth-unsigned-value.ll b/llvm/test/Transforms/SLPVectorizer/X86/subvector-minbitwidth-unsigned-value.ll
index 83e1bef8fa066..d07353798edc9 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/subvector-minbitwidth-unsigned-value.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/subvector-minbitwidth-unsigned-value.ll
@@ -11,11 +11,11 @@ define i1 @test(i64 %v1, ptr %v2, i32 %v3, i1 %v4) {
; CHECK-NEXT: [[TMP3:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i8>
; CHECK-NEXT: [[TMP4:%.*]] = and <2 x i8> [[TMP3]], <i8 1, i8 -1>
; CHECK-NEXT: [[TMP5:%.*]] = zext <2 x i8> [[TMP4]] to <2 x i32>
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <2 x i32> [[TMP5]], zeroinitializer
-; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> poison, i32 [[V3]], i32 0
-; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 0>
; CHECK-NEXT: [[TMP9:%.*]] = zext <2 x i8> [[TMP4]] to <2 x i32>
-; CHECK-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP8]], <2 x i32> [[TMP9]], i64 0)
+; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <2 x i32> [[TMP9]], zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> poison, i32 [[V3]], i32 0
+; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 0>
+; CHECK-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP30]], <2 x i32> [[TMP5]], i64 0)
; CHECK-NEXT: [[TMP11:%.*]] = uitofp <4 x i32> [[TMP10]] to <4 x float>
; CHECK-NEXT: [[TMP12:%.*]] = fdiv <4 x float> zeroinitializer, [[TMP11]]
; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i1> poison, i1 [[V4]], i32 0