[llvm] c65ec9d - Revert "[SLP]Improve isGatherShuffledEntry by trying per-register shuffle."
Alexey Bataev via llvm-commits
llvm-commits at lists.llvm.org
Thu Oct 26 08:36:58 PDT 2023
Author: Alexey Bataev
Date: 2023-10-26T08:36:50-07:00
New Revision: c65ec9d9195ad4afee2bbf69fd77607697d43480
URL: https://github.com/llvm/llvm-project/commit/c65ec9d9195ad4afee2bbf69fd77607697d43480
DIFF: https://github.com/llvm/llvm-project/commit/c65ec9d9195ad4afee2bbf69fd77607697d43480.diff
LOG: Revert "[SLP]Improve isGatherShuffledEntry by trying per-register shuffle."
This reverts commit 560bad013ebcb8d2c2c1722e35270b9a70ab40ce to fix
a bug reported in https://lab.llvm.org/buildbot/#/builders/5/builds/37763.
Added:
Modified:
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
llvm/test/Transforms/SLPVectorizer/X86/multi-nodes-to-shuffle.ll
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 9b5da445daaabdb..4f82d2d1d6d91a5 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2507,31 +2507,17 @@ class BoUpSLP {
/// instruction in the list).
Instruction &getLastInstructionInBundle(const TreeEntry *E);
- /// Checks if the gathered \p VL can be represented as a single register
- /// shuffle(s) of previous tree entries.
+ /// Checks if the gathered \p VL can be represented as shuffle(s) of previous
+ /// tree entries.
/// \param TE Tree entry checked for permutation.
/// \param VL List of scalars (a subset of the TE scalar), checked for
- /// permutations. Must form single-register vector.
+ /// permutations.
/// \returns ShuffleKind, if gathered values can be represented as shuffles of
- /// previous tree entries. \p Part of \p Mask is filled with the shuffle mask.
+ /// previous tree entries. \p Mask is filled with the shuffle mask.
std::optional<TargetTransformInfo::ShuffleKind>
- isGatherShuffledSingleRegisterEntry(
- const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
- SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part);
-
- /// Checks if the gathered \p VL can be represented as multi-register
- /// shuffle(s) of previous tree entries.
- /// \param TE Tree entry checked for permutation.
- /// \param VL List of scalars (a subset of the TE scalar), checked for
- /// permutations.
- /// \returns per-register series of ShuffleKind, if gathered values can be
- /// represented as shuffles of previous tree entries. \p Mask is filled with
- /// the shuffle mask (also on per-register base).
- SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
- isGatherShuffledEntry(
- const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
- SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,
- unsigned NumParts);
+ isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL,
+ SmallVectorImpl<int> &Mask,
+ SmallVectorImpl<const TreeEntry *> &Entries);
/// \returns the scalarization cost for this list of values. Assuming that
/// this subtree gets vectorized, we may need to extract the values from the
@@ -7004,11 +6990,6 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
BoUpSLP &R;
SmallPtrSetImpl<Value *> &CheckedExtracts;
constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
- /// While set, still trying to estimate the cost for the same nodes and we
- /// can delay actual cost estimation (virtual shuffle instruction emission).
- /// May help better estimate the cost if same nodes must be permuted + allows
- /// to move most of the long shuffles cost estimation to TTI.
- bool SameNodesEstimated = true;
static Constant *getAllOnesValue(const DataLayout &DL, Type *Ty) {
if (Ty->getScalarType()->isPointerTy()) {
@@ -7249,49 +7230,6 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
}
return Cost;
}
- /// Transforms mask \p CommonMask per given \p Mask to make proper set after
- /// shuffle emission.
- static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
- ArrayRef<int> Mask) {
- for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
- if (Mask[Idx] != PoisonMaskElem)
- CommonMask[Idx] = Idx;
- }
- /// Adds the cost of reshuffling \p E1 and \p E2 (if present), using given
- /// mask \p Mask, register number \p Part, that includes \p SliceSize
- /// elements.
- void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2,
- ArrayRef<int> Mask, unsigned Part,
- unsigned SliceSize) {
- if (SameNodesEstimated) {
- // Delay the cost estimation if the same nodes are reshuffling.
- // If we already requested the cost of reshuffling of E1 and E2 before, no
- // need to estimate another cost with the sub-Mask, instead include this
- // sub-Mask into the CommonMask to estimate it later and avoid double cost
- // estimation.
- if ((InVectors.size() == 2 &&
- InVectors.front().get<const TreeEntry *>() == &E1 &&
- InVectors.back().get<const TreeEntry *>() == E2) ||
- (!E2 && InVectors.front().get<const TreeEntry *>() == &E1)) {
- assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, SliceSize),
- [](int Idx) { return Idx == PoisonMaskElem; }) &&
- "Expected all poisoned elements.");
- ArrayRef<int> SubMask =
- ArrayRef(Mask).slice(Part * SliceSize, SliceSize);
- copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));
- return;
- }
- // Found non-matching nodes - need to estimate the cost for the matched
- // and transform mask.
- Cost += createShuffle(InVectors.front(),
- InVectors.size() == 1 ? nullptr : InVectors.back(),
- CommonMask);
- transformMaskAfterShuffle(CommonMask, CommonMask);
- }
- SameNodesEstimated = false;
- Cost += createShuffle(&E1, E2, Mask);
- transformMaskAfterShuffle(CommonMask, Mask);
- }
class ShuffleCostBuilder {
const TargetTransformInfo &TTI;
@@ -7555,74 +7493,31 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
// into a vector and can be represented as a permutation elements in a
// single input vector or of 2 input vectors.
Cost += computeExtractCost(VL, Mask, ShuffleKind);
- InVectors.assign(1, E);
- CommonMask.assign(Mask.begin(), Mask.end());
- transformMaskAfterShuffle(CommonMask, CommonMask);
- SameNodesEstimated = false;
return VecBase;
}
- void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
- if (&E1 == &E2) {
+ void add(const TreeEntry *E1, const TreeEntry *E2, ArrayRef<int> Mask) {
+ if (E1 == E2) {
assert(all_of(Mask,
- [&](int Idx) {
- return Idx < static_cast<int>(E1.getVectorFactor());
+ [=](int Idx) {
+ return Idx < static_cast<int>(E1->getVectorFactor());
}) &&
"Expected single vector shuffle mask.");
add(E1, Mask);
return;
}
- if (InVectors.empty()) {
- CommonMask.assign(Mask.begin(), Mask.end());
- InVectors.assign({&E1, &E2});
- return;
- }
- assert(!CommonMask.empty() && "Expected non-empty common mask.");
- auto *MaskVecTy =
- FixedVectorType::get(E1.Scalars.front()->getType(), Mask.size());
- unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
- assert(NumParts > 0 && NumParts < Mask.size() &&
- "Expected positive number of registers.");
- unsigned SliceSize = Mask.size() / NumParts;
- const auto *It =
- find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
- unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
- estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
+ CommonMask.assign(Mask.begin(), Mask.end());
+ InVectors.assign({E1, E2});
}
- void add(const TreeEntry &E1, ArrayRef<int> Mask) {
- if (InVectors.empty()) {
- CommonMask.assign(Mask.begin(), Mask.end());
- InVectors.assign(1, &E1);
- return;
- }
- assert(!CommonMask.empty() && "Expected non-empty common mask.");
- auto *MaskVecTy =
- FixedVectorType::get(E1.Scalars.front()->getType(), Mask.size());
- unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
- assert(NumParts > 0 && NumParts < Mask.size() &&
- "Expected positive number of registers.");
- unsigned SliceSize = Mask.size() / NumParts;
- const auto *It =
- find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
- unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
- estimateNodesPermuteCost(E1, nullptr, Mask, Part, SliceSize);
- if (!SameNodesEstimated && InVectors.size() == 1)
- InVectors.emplace_back(&E1);
+ void add(const TreeEntry *E1, ArrayRef<int> Mask) {
+ CommonMask.assign(Mask.begin(), Mask.end());
+ InVectors.assign(1, E1);
}
/// Adds another one input vector and the mask for the shuffling.
void add(Value *V1, ArrayRef<int> Mask) {
- if (InVectors.empty()) {
- assert(CommonMask.empty() && "Expected empty input mask/vectors.");
- CommonMask.assign(Mask.begin(), Mask.end());
- InVectors.assign(1, V1);
- return;
- }
- assert(InVectors.size() == 1 && InVectors.front().is<const TreeEntry *>() &&
- !CommonMask.empty() && "Expected only single entry from extracts.");
- InVectors.push_back(V1);
- unsigned VF = CommonMask.size();
- for (unsigned Idx = 0; Idx < VF; ++Idx)
- if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
- CommonMask[Idx] = Mask[Idx] + VF;
+ assert(CommonMask.empty() && InVectors.empty() &&
+ "Expected empty input mask/vectors.");
+ CommonMask.assign(Mask.begin(), Mask.end());
+ InVectors.assign(1, V1);
}
Value *gather(ArrayRef<Value *> VL, Value *Root = nullptr) {
Cost += getBuildVectorCost(VL, Root);
@@ -7684,16 +7579,12 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
ArrayRef<Value *> VL = E->Scalars;
Type *ScalarTy = VL[0]->getType();
- if (E->State != TreeEntry::NeedToGather) {
- if (auto *SI = dyn_cast<StoreInst>(VL[0]))
- ScalarTy = SI->getValueOperand()->getType();
- else if (auto *CI = dyn_cast<CmpInst>(VL[0]))
- ScalarTy = CI->getOperand(0)->getType();
- else if (auto *IE = dyn_cast<InsertElementInst>(VL[0]))
- ScalarTy = IE->getOperand(1)->getType();
- }
- if (!FixedVectorType::isValidElementType(ScalarTy))
- return InstructionCost::getInvalid();
+ if (auto *SI = dyn_cast<StoreInst>(VL[0]))
+ ScalarTy = SI->getValueOperand()->getType();
+ else if (auto *CI = dyn_cast<CmpInst>(VL[0]))
+ ScalarTy = CI->getOperand(0)->getType();
+ else if (auto *IE = dyn_cast<InsertElementInst>(VL[0]))
+ ScalarTy = IE->getOperand(1)->getType();
auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
@@ -7705,7 +7596,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
VecTy = FixedVectorType::get(ScalarTy, VL.size());
}
unsigned EntryVF = E->getVectorFactor();
- auto *FinalVecTy = FixedVectorType::get(ScalarTy, EntryVF);
+ auto *FinalVecTy = FixedVectorType::get(VecTy->getElementType(), EntryVF);
bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
if (E->State == TreeEntry::NeedToGather) {
@@ -7738,28 +7629,20 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
SmallVector<int> Mask;
SmallVector<int> ExtractMask;
std::optional<TargetTransformInfo::ShuffleKind> ExtractShuffle;
- SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> GatherShuffles;
- SmallVector<SmallVector<const TreeEntry *>> Entries;
+ std::optional<TargetTransformInfo::ShuffleKind> GatherShuffle;
+ SmallVector<const TreeEntry *> Entries;
// Check for gathered extracts.
- ExtractShuffle =
- tryToGatherSingleRegisterExtractElements(GatheredScalars, ExtractMask);
+ ExtractShuffle = tryToGatherSingleRegisterExtractElements(GatheredScalars, ExtractMask);
bool Resized = false;
- unsigned NumParts = TTI->getNumberOfParts(VecTy);
- if (NumParts == 0 || NumParts >= GatheredScalars.size())
- NumParts = 1;
if (Value *VecBase = Estimator.adjustExtracts(
- E, ExtractMask, ExtractShuffle.value_or(TTI::SK_PermuteTwoSrc))) {
+ E, ExtractMask, ExtractShuffle.value_or(TTI::SK_PermuteTwoSrc)))
if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
if (VF == VecBaseTy->getNumElements() && GatheredScalars.size() != VF) {
Resized = true;
GatheredScalars.append(VF - GatheredScalars.size(),
PoisonValue::get(ScalarTy));
}
- } else if (ExtractShuffle &&
- TTI->getNumberOfParts(VecTy) == VecTy->getNumElements()) {
- copy(VL, GatheredScalars.begin());
- }
// Do not try to look for reshuffled loads for gathered loads (they will be
// handled later), for vectorized scalars, and cases, which are definitely
@@ -7769,12 +7652,12 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) ||
isSplat(E->Scalars) ||
(E->Scalars != GatheredScalars && GatheredScalars.size() <= 2))
- GatherShuffles =
- isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
- if (!GatherShuffles.empty()) {
- if (GatherShuffles.size() == 1 &&
- *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
- Entries.front().front()->isSame(E->Scalars)) {
+ GatherShuffle = isGatherShuffledEntry(E, GatheredScalars, Mask, Entries);
+ if (GatherShuffle) {
+ assert((Entries.size() == 1 || Entries.size() == 2) &&
+ "Expected shuffle of 1 or 2 entries.");
+ if (*GatherShuffle == TTI::SK_PermuteSingleSrc &&
+ Entries.front()->isSame(E->Scalars)) {
// Perfect match in the graph, will reuse the previously vectorized
// node. Cost is 0.
LLVM_DEBUG(
@@ -7788,18 +7671,15 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
continue;
}
if (Mask[I] == PoisonMaskElem)
- Mask[I] = Entries.front().front()->findLaneForValue(V);
+ Mask[I] = Entries.front()->findLaneForValue(V);
}
- Estimator.add(*Entries.front().front(), Mask);
+ Estimator.add(Entries.front(), Mask);
return Estimator.finalize(E->ReuseShuffleIndices);
}
if (!Resized) {
- if (GatheredScalars.size() != VF &&
- any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
- return any_of(TEs, [&](const TreeEntry *TE) {
- return TE->getVectorFactor() == VF;
- });
- }))
+ unsigned VF1 = Entries.front()->getVectorFactor();
+ unsigned VF2 = Entries.back()->getVectorFactor();
+ if ((VF == VF1 || VF == VF2) && GatheredScalars.size() != VF)
GatheredScalars.append(VF - GatheredScalars.size(),
PoisonValue::get(ScalarTy));
}
@@ -7811,21 +7691,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
LLVM_DEBUG(dbgs() << "SLP: shuffled " << Entries.size()
<< " entries for bundle "
<< shortBundleName(VL) << ".\n");
- unsigned SliceSize = E->Scalars.size() / NumParts;
- SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
- for (const auto [I, TEs] : enumerate(Entries)) {
- if (TEs.empty()) {
- assert(!GatherShuffles[I] &&
- "No shuffles with empty entries list expected.");
- continue;
- }
- assert((TEs.size() == 1 || TEs.size() == 2) &&
- "Expected shuffle of 1 or 2 entries.");
- auto SubMask = ArrayRef(Mask).slice(I * SliceSize, SliceSize);
- VecMask.assign(VecMask.size(), PoisonMaskElem);
- copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
- Estimator.add(*TEs.front(), *TEs.back(), VecMask);
- }
+ Estimator.add(Entries.front(), Entries.back(), Mask);
if (all_of(GatheredScalars, PoisonValue ::classof))
return Estimator.finalize(E->ReuseShuffleIndices);
return Estimator.finalize(
@@ -7839,19 +7705,16 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
if (!all_of(GatheredScalars, PoisonValue::classof)) {
auto Gathers = ArrayRef(GatheredScalars).take_front(VL.size());
bool SameGathers = VL.equals(Gathers);
- if (!SameGathers)
- return Estimator.finalize(
- E->ReuseShuffleIndices, E->Scalars.size(),
- [&](Value *&Vec, SmallVectorImpl<int> &Mask) {
- Vec = Estimator.gather(
- GatheredScalars, Constant::getNullValue(FixedVectorType::get(
- ScalarTy, GatheredScalars.size())));
- });
- Value *BV = Estimator.gather(Gathers);
+ Value *BV = Estimator.gather(
+ Gathers, SameGathers ? nullptr
+ : Constant::getNullValue(FixedVectorType::get(
+ ScalarTy, GatheredScalars.size())));
SmallVector<int> ReuseMask(Gathers.size(), PoisonMaskElem);
std::iota(ReuseMask.begin(), ReuseMask.end(), 0);
Estimator.add(BV, ReuseMask);
}
+ if (ExtractShuffle)
+ Estimator.add(E, std::nullopt);
return Estimator.finalize(E->ReuseShuffleIndices);
}
InstructionCost CommonCost = 0;
@@ -9174,10 +9037,16 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
}
std::optional<TargetTransformInfo::ShuffleKind>
-BoUpSLP::isGatherShuffledSingleRegisterEntry(
- const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
- SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part) {
+BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL,
+ SmallVectorImpl<int> &Mask,
+ SmallVectorImpl<const TreeEntry *> &Entries) {
Entries.clear();
+ // No need to check for the topmost gather node.
+ if (TE == VectorizableTree.front().get())
+ return std::nullopt;
+ Mask.assign(VL.size(), PoisonMaskElem);
+ assert(TE->UserTreeIndices.size() == 1 &&
+ "Expected only single user of the gather node.");
// TODO: currently checking only for Scalars in the tree entry, need to count
// reused elements too for better cost estimation.
const EdgeInfo &TEUseEI = TE->UserTreeIndices.front();
@@ -9252,7 +9121,7 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
UserPHI ? UserPHI->getIncomingBlock(UseEI.EdgeIdx)->getTerminator()
: &getLastInstructionInBundle(UseEI.UserTE);
if (TEInsertPt == InsertPt) {
- // If 2 gathers are operands of the same entry (regardless of whether
+ // If 2 gathers are operands of the same entry (regardless of wether
// user is PHI or else), compare operands indices, use the earlier one
// as the base.
if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
@@ -9317,10 +9186,8 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
}
}
- if (UsedTEs.empty()) {
- Entries.clear();
+ if (UsedTEs.empty())
return std::nullopt;
- }
unsigned VF = 0;
if (UsedTEs.size() == 1) {
@@ -9336,8 +9203,7 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
});
if (It != FirstEntries.end() && (*It)->getVectorFactor() == VL.size()) {
Entries.push_back(*It);
- std::iota(std::next(Mask.begin(), Part * VL.size()),
- std::next(Mask.begin(), (Part + 1) * VL.size()), 0);
+ std::iota(Mask.begin(), Mask.end(), 0);
// Clear undef scalars.
for (int I = 0, Sz = VL.size(); I < Sz; ++I)
if (isa<PoisonValue>(VL[I]))
@@ -9474,10 +9340,7 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
TempEntries.push_back(Entries[I]);
}
Entries.swap(TempEntries);
- if (EntryLanes.size() == Entries.size() &&
- !VL.equals(ArrayRef(TE->Scalars)
- .slice(Part * VL.size(),
- std::min<int>(VL.size(), TE->Scalars.size())))) {
+ if (EntryLanes.size() == Entries.size() && !VL.equals(TE->Scalars)) {
// We may have here 1 or 2 entries only. If the number of scalars is equal
// to the number of entries, no need to do the analysis, it is not very
// profitable. Since VL is not the same as TE->Scalars, it means we already
@@ -9490,10 +9353,9 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
// Pair.first is the offset to the vector, while Pair.second is the index of
// scalar in the list.
for (const std::pair<unsigned, int> &Pair : EntryLanes) {
- unsigned Idx = Part * VL.size() + Pair.second;
- Mask[Idx] = Pair.first * VF +
- Entries[Pair.first]->findLaneForValue(VL[Pair.second]);
- IsIdentity &= Mask[Idx] == Pair.second;
+ Mask[Pair.second] = Pair.first * VF +
+ Entries[Pair.first]->findLaneForValue(VL[Pair.second]);
+ IsIdentity &= Mask[Pair.second] == Pair.second;
}
switch (Entries.size()) {
case 1:
@@ -9508,63 +9370,9 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
break;
}
Entries.clear();
- // Clear the corresponding mask elements.
- std::fill(std::next(Mask.begin(), Part * VL.size()),
- std::next(Mask.begin(), (Part + 1) * VL.size()), PoisonMaskElem);
return std::nullopt;
}
-SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
-BoUpSLP::isGatherShuffledEntry(
- const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
- SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,
- unsigned NumParts) {
- assert(NumParts > 0 && NumParts < VL.size() &&
- "Expected positive number of registers.");
- Entries.clear();
- // No need to check for the topmost gather node.
- if (TE == VectorizableTree.front().get())
- return {};
- Mask.assign(VL.size(), PoisonMaskElem);
- assert(TE->UserTreeIndices.size() == 1 &&
- "Expected only single user of the gather node.");
- assert(VL.size() % NumParts == 0 &&
- "Number of scalars must be divisible by NumParts.");
- unsigned SliceSize = VL.size() / NumParts;
- SmallVector<std::optional<TTI::ShuffleKind>> Res;
- for (unsigned Part = 0; Part < NumParts; ++Part) {
- ArrayRef<Value *> SubVL = VL.slice(Part * SliceSize, SliceSize);
- SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
- std::optional<TTI::ShuffleKind> SubRes =
- isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part);
- if (!SubRes)
- SubEntries.clear();
- Res.push_back(SubRes);
- if (SubEntries.size() == 1 &&
- SubRes.value_or(TTI::SK_PermuteTwoSrc) == TTI::SK_PermuteSingleSrc &&
- SubEntries.front()->getVectorFactor() == VL.size() &&
- (SubEntries.front()->isSame(TE->Scalars) ||
- SubEntries.front()->isSame(VL))) {
- Entries.clear();
- Res.clear();
- std::iota(Mask.begin(), Mask.end(), 0);
- // Clear undef scalars.
- for (int I = 0, Sz = VL.size(); I < Sz; ++I)
- if (isa<PoisonValue>(VL[I]))
- Mask[I] = PoisonMaskElem;
- Entries.emplace_back(1, SubEntries.front());
- Res.push_back(TargetTransformInfo::SK_PermuteSingleSrc);
- return Res;
- }
- }
- if (all_of(Res,
- [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) {
- Entries.clear();
- return {};
- }
- return Res;
-}
-
InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL,
bool ForPoisonSrc) const {
// Find the type of the operands in VL.
@@ -10031,13 +9839,9 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
}
/// Checks if the specified entry \p E needs to be delayed because of its
/// dependency nodes.
- Value *needToDelay(const TreeEntry *E,
- ArrayRef<SmallVector<const TreeEntry *>> Deps) {
+ Value *needToDelay(const TreeEntry *E, ArrayRef<const TreeEntry *> Deps) {
// No need to delay emission if all deps are ready.
- if (all_of(Deps, [](ArrayRef<const TreeEntry *> TEs) {
- return all_of(
- TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; });
- }))
+ if (all_of(Deps, [](const TreeEntry *TE) { return TE->VectorizedValue; }))
return nullptr;
// Postpone gather emission, will be emitted after the end of the
// process to keep correct order.
@@ -10372,13 +10176,9 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
SmallVector<int> Mask;
SmallVector<int> ExtractMask;
std::optional<TargetTransformInfo::ShuffleKind> ExtractShuffle;
- SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> GatherShuffles;
- SmallVector<SmallVector<const TreeEntry *>> Entries;
+ std::optional<TargetTransformInfo::ShuffleKind> GatherShuffle;
+ SmallVector<const TreeEntry *> Entries;
Type *ScalarTy = GatheredScalars.front()->getType();
- unsigned NumParts = TTI->getNumberOfParts(
- FixedVectorType::get(ScalarTy, GatheredScalars.size()));
- if (NumParts == 0 || NumParts >= GatheredScalars.size())
- NumParts = 1;
if (!all_of(GatheredScalars, UndefValue::classof)) {
// Check for gathered extracts.
ExtractShuffle =
@@ -10397,10 +10197,9 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) ||
isSplat(E->Scalars) ||
(E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
- GatherShuffles =
- isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
+ GatherShuffle = isGatherShuffledEntry(E, GatheredScalars, Mask, Entries);
}
- if (!GatherShuffles.empty()) {
+ if (GatherShuffle) {
if (Value *Delayed = ShuffleBuilder.needToDelay(E, Entries)) {
// Delay emission of gathers which are not ready yet.
PostponedGathers.insert(E);
@@ -10408,9 +10207,10 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
// process to keep correct order.
return Delayed;
}
- if (GatherShuffles.size() == 1 &&
- *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
- Entries.front().front()->isSame(E->Scalars)) {
+ assert((Entries.size() == 1 || Entries.size() == 2) &&
+ "Expected shuffle of 1 or 2 entries.");
+ if (*GatherShuffle == TTI::SK_PermuteSingleSrc &&
+ Entries.front()->isSame(E->Scalars)) {
// Perfect match in the graph, will reuse the previously vectorized
// node. Cost is 0.
LLVM_DEBUG(
@@ -10418,11 +10218,11 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
<< "SLP: perfect diamond match for gather bundle "
<< shortBundleName(E->Scalars) << ".\n");
// Restore the mask for previous partially matched values.
- const TreeEntry *FrontTE = Entries.front().front();
- if (FrontTE->ReorderIndices.empty() &&
- ((FrontTE->ReuseShuffleIndices.empty() &&
- E->Scalars.size() == FrontTE->Scalars.size()) ||
- (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
+ if (Entries.front()->ReorderIndices.empty() &&
+ ((Entries.front()->ReuseShuffleIndices.empty() &&
+ E->Scalars.size() == Entries.front()->Scalars.size()) ||
+ (E->Scalars.size() ==
+ Entries.front()->ReuseShuffleIndices.size()))) {
std::iota(Mask.begin(), Mask.end(), 0);
} else {
for (auto [I, V] : enumerate(E->Scalars)) {
@@ -10430,20 +10230,17 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
Mask[I] = PoisonMaskElem;
continue;
}
- Mask[I] = FrontTE->findLaneForValue(V);
+ Mask[I] = Entries.front()->findLaneForValue(V);
}
}
- ShuffleBuilder.add(FrontTE->VectorizedValue, Mask);
+ ShuffleBuilder.add(Entries.front()->VectorizedValue, Mask);
Res = ShuffleBuilder.finalize(E->getCommonMask());
return Res;
}
if (!Resized) {
- if (GatheredScalars.size() != VF &&
- any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
- return any_of(TEs, [&](const TreeEntry *TE) {
- return TE->getVectorFactor() == VF;
- });
- }))
+ unsigned VF1 = Entries.front()->getVectorFactor();
+ unsigned VF2 = Entries.back()->getVectorFactor();
+ if ((VF == VF1 || VF == VF2) && GatheredScalars.size() != VF)
GatheredScalars.append(VF - GatheredScalars.size(),
PoisonValue::get(ScalarTy));
}
@@ -10543,9 +10340,9 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
}
}
};
- if (ExtractShuffle || !GatherShuffles.empty()) {
+ if (ExtractShuffle || GatherShuffle) {
bool IsNonPoisoned = true;
- bool IsUsedInExpr = true;
+ bool IsUsedInExpr = false;
Value *Vec1 = nullptr;
if (ExtractShuffle) {
// Gather of extractelements can be represented as just a shuffle of
@@ -10570,53 +10367,36 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
}
}
if (Vec2) {
- IsUsedInExpr = false;
IsNonPoisoned &=
isGuaranteedNotToBePoison(Vec1) && isGuaranteedNotToBePoison(Vec2);
ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
} else if (Vec1) {
- IsUsedInExpr &= FindReusedSplat(
+ IsUsedInExpr = FindReusedSplat(
ExtractMask,
cast<FixedVectorType>(Vec1->getType())->getNumElements());
ShuffleBuilder.add(Vec1, ExtractMask);
IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1);
} else {
- IsUsedInExpr = false;
ShuffleBuilder.add(PoisonValue::get(FixedVectorType::get(
ScalarTy, GatheredScalars.size())),
ExtractMask);
}
}
- if (!GatherShuffles.empty()) {
- unsigned SliceSize = E->Scalars.size() / NumParts;
- SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
- for (const auto [I, TEs] : enumerate(Entries)) {
- if (TEs.empty()) {
- assert(!GatherShuffles[I] &&
- "No shuffles with empty entries list expected.");
- continue;
- }
- assert((TEs.size() == 1 || TEs.size() == 2) &&
- "Expected shuffle of 1 or 2 entries.");
- auto SubMask = ArrayRef(Mask).slice(I * SliceSize, SliceSize);
- VecMask.assign(VecMask.size(), PoisonMaskElem);
- copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
- if (TEs.size() == 1) {
- IsUsedInExpr &= FindReusedSplat(
- VecMask,
- cast<FixedVectorType>(TEs.front()->VectorizedValue->getType())
- ->getNumElements());
- ShuffleBuilder.add(TEs.front()->VectorizedValue, VecMask);
- IsNonPoisoned &=
- isGuaranteedNotToBePoison(TEs.front()->VectorizedValue);
- } else {
- IsUsedInExpr = false;
- ShuffleBuilder.add(TEs.front()->VectorizedValue,
- TEs.back()->VectorizedValue, VecMask);
- IsNonPoisoned &=
- isGuaranteedNotToBePoison(TEs.front()->VectorizedValue) &&
- isGuaranteedNotToBePoison(TEs.back()->VectorizedValue);
- }
+ if (GatherShuffle) {
+ if (Entries.size() == 1) {
+ IsUsedInExpr = FindReusedSplat(
+ Mask,
+ cast<FixedVectorType>(Entries.front()->VectorizedValue->getType())
+ ->getNumElements());
+ ShuffleBuilder.add(Entries.front()->VectorizedValue, Mask);
+ IsNonPoisoned &=
+ isGuaranteedNotToBePoison(Entries.front()->VectorizedValue);
+ } else {
+ ShuffleBuilder.add(Entries.front()->VectorizedValue,
+ Entries.back()->VectorizedValue, Mask);
+ IsNonPoisoned &=
+ isGuaranteedNotToBePoison(Entries.front()->VectorizedValue) &&
+ isGuaranteedNotToBePoison(Entries.back()->VectorizedValue);
}
}
// Try to figure out best way to combine values: build a shuffle and insert
@@ -10627,18 +10407,14 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
int MSz = Mask.size();
// Try to build constant vector and shuffle with it only if currently we
// have a single permutation and more than 1 scalar constants.
- bool IsSingleShuffle = !ExtractShuffle || GatherShuffles.empty();
+ bool IsSingleShuffle = !ExtractShuffle || !GatherShuffle;
bool IsIdentityShuffle =
(ExtractShuffle.value_or(TTI::SK_PermuteTwoSrc) ==
TTI::SK_PermuteSingleSrc &&
none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
ShuffleVectorInst::isIdentityMask(ExtractMask, EMSz)) ||
- (!GatherShuffles.empty() &&
- all_of(GatherShuffles,
- [](const std::optional<TTI::ShuffleKind> &SK) {
- return SK.value_or(TTI::SK_PermuteTwoSrc) ==
- TTI::SK_PermuteSingleSrc;
- }) &&
+ (GatherShuffle.value_or(TTI::SK_PermuteTwoSrc) ==
+ TTI::SK_PermuteSingleSrc &&
none_of(Mask, [&](int I) { return I >= MSz; }) &&
ShuffleVectorInst::isIdentityMask(Mask, MSz));
bool EnoughConstsForShuffle =
@@ -10814,13 +10590,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
continue;
}
- // if (any_of(E->getOperand(i), [&](Value *V) {
- // auto *I = dyn_cast<Instruction>(V);
- // return I && I->getParent() == IBB;
- // }))
- Builder.SetInsertPoint(IBB->getTerminator());
- // else
- // Builder.SetInsertPoint(IBB->getFirstNonPHIOrDbgOrLifetime());
+ Builder.SetInsertPoint(IBB->getTerminator());
Builder.SetCurrentDebugLocation(PH->getDebugLoc());
Value *Vec = vectorizeOperand(E, i, /*PostponedPHIs=*/true);
NewPhi->addIncoming(Vec, IBB);
@@ -11484,22 +11254,10 @@ Value *BoUpSLP::vectorizeTree(
// The is because source vector that supposed to feed this gather node was
// inserted at the end of the block [after stab instruction]. So we need
// to adjust insertion point again to the end of block.
- if (isa<PHINode>(UserI)) {
- // Insert before all users.
- Instruction *InsertPt = PrevVec->getParent()->getTerminator();
- for (User *U : PrevVec->users()) {
- if (U == UserI)
- continue;
- auto *UI = dyn_cast<Instruction>(U);
- if (!UI || isa<PHINode>(UI) || UI->getParent() != InsertPt->getParent())
- continue;
- if (UI->comesBefore(InsertPt))
- InsertPt = UI;
- }
- Builder.SetInsertPoint(InsertPt);
- } else {
+ if (isa<PHINode>(UserI))
+ Builder.SetInsertPoint(PrevVec->getParent()->getTerminator());
+ else
Builder.SetInsertPoint(PrevVec);
- }
Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
Value *Vec = vectorizeTree(TE, /*PostponedPHIs=*/false);
PrevVec->replaceAllUsesWith(Vec);
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/multi-nodes-to-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/multi-nodes-to-shuffle.ll
index e5b5a5c6c4a00c5..21aac98aa3ece62 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/multi-nodes-to-shuffle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/multi-nodes-to-shuffle.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes=slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux -slp-threshold=-115 | FileCheck %s
-; RUN: opt -passes=slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux -slp-threshold=-115 -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
+; RUN: opt -passes=slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux -slp-threshold=-107 | FileCheck %s
+; RUN: opt -passes=slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux -slp-threshold=-107 -mattr=+avx2 | FileCheck %s
define void @test(i64 %p0, i64 %p1, i64 %p2, i64 %p3) {
; CHECK-LABEL: @test(
@@ -14,43 +14,18 @@ define void @test(i64 %p0, i64 %p1, i64 %p2, i64 %p3) {
; CHECK-NEXT: [[TMP6:%.*]] = sdiv <4 x i64> [[TMP3]], [[TMP3]]
; CHECK-NEXT: [[TMP7:%.*]] = sub <4 x i64> [[TMP5]], [[TMP6]]
; CHECK-NEXT: [[TMP8:%.*]] = shl <4 x i64> [[TMP4]], [[TMP7]]
-; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> [[TMP5]], <4 x i32> <i32 poison, i32 poison, i32 0, i32 4>
-; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> [[TMP10]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
-; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP5]], <4 x i32> <i32 1, i32 5, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> [[TMP5]], <4 x i32> <i32 poison, i32 poison, i32 1, i32 5>
-; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i64> [[TMP12]], <4 x i64> [[TMP13]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
-; CHECK-NEXT: [[TMP15:%.*]] = or <4 x i64> [[TMP11]], [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = trunc <4 x i64> [[TMP15]] to <4 x i32>
+; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 poison, i32 4>
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 3>
+; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP5]], <4 x i32> <i32 1, i32 5, i32 poison, i32 5>
+; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP11]], <4 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 5, i32 3>
+; CHECK-NEXT: [[TMP13:%.*]] = or <4 x i64> [[TMP10]], [[TMP12]]
+; CHECK-NEXT: [[TMP14:%.*]] = trunc <4 x i64> [[TMP13]] to <4 x i32>
; CHECK-NEXT: br label [[BB:%.*]]
; CHECK: bb:
-; CHECK-NEXT: [[TMP17:%.*]] = phi <4 x i32> [ [[TMP18:%.*]], [[BB]] ], [ [[TMP16]], [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[TMP18]] = trunc <4 x i64> [[TMP8]] to <4 x i32>
+; CHECK-NEXT: [[TMP15:%.*]] = phi <4 x i32> [ [[TMP16:%.*]], [[BB]] ], [ [[TMP14]], [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[TMP16]] = trunc <4 x i64> [[TMP8]] to <4 x i32>
; CHECK-NEXT: br label [[BB]]
;
-; AVX2-LABEL: @test(
-; AVX2-NEXT: entry:
-; AVX2-NEXT: [[TMP0:%.*]] = insertelement <4 x i64> poison, i64 [[P0:%.*]], i32 0
-; AVX2-NEXT: [[TMP1:%.*]] = insertelement <4 x i64> [[TMP0]], i64 [[P1:%.*]], i32 1
-; AVX2-NEXT: [[TMP2:%.*]] = insertelement <4 x i64> [[TMP1]], i64 [[P2:%.*]], i32 2
-; AVX2-NEXT: [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[P3:%.*]], i32 3
-; AVX2-NEXT: [[TMP4:%.*]] = add <4 x i64> [[TMP3]], [[TMP3]]
-; AVX2-NEXT: [[TMP5:%.*]] = mul <4 x i64> [[TMP3]], [[TMP3]]
-; AVX2-NEXT: [[TMP6:%.*]] = sdiv <4 x i64> [[TMP3]], [[TMP3]]
-; AVX2-NEXT: [[TMP7:%.*]] = sub <4 x i64> [[TMP5]], [[TMP6]]
-; AVX2-NEXT: [[TMP8:%.*]] = shl <4 x i64> [[TMP4]], [[TMP7]]
-; AVX2-NEXT: [[TMP9:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 poison, i32 4>
-; AVX2-NEXT: [[TMP10:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 3>
-; AVX2-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP5]], <4 x i32> <i32 1, i32 5, i32 poison, i32 5>
-; AVX2-NEXT: [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP11]], <4 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 5, i32 3>
-; AVX2-NEXT: [[TMP13:%.*]] = or <4 x i64> [[TMP10]], [[TMP12]]
-; AVX2-NEXT: [[TMP14:%.*]] = trunc <4 x i64> [[TMP13]] to <4 x i32>
-; AVX2-NEXT: br label [[BB:%.*]]
-; AVX2: bb:
-; AVX2-NEXT: [[TMP15:%.*]] = phi <4 x i32> [ [[TMP16:%.*]], [[BB]] ], [ [[TMP14]], [[ENTRY:%.*]] ]
-; AVX2-NEXT: [[TMP16]] = trunc <4 x i64> [[TMP8]] to <4 x i32>
-; AVX2-NEXT: br label [[BB]]
-;
entry:
%a0 = add i64 %p0, %p0
%a1 = add i64 %p1, %p1
More information about the llvm-commits
mailing list