[llvm] [SLP]Initial support for non-power-of-2 vectorization (PR #151530)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Jul 31 07:57:16 PDT 2025
llvmbot wrote:
@llvm/pr-subscribers-backend-systemz
Author: Alexey Bataev (alexey-bataev)
Changes:
Enables non-power-of-2 vectorization within the SLP tree. The root nodes
are still required to be power-of-2; this will be addressed in follow-up
patches.
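To make the restriction concrete, here is a hypothetical sketch (not taken from the patch's tests; the function and value names are made up for illustration). The root store bundle is 4 elements wide, so it satisfies the power-of-2 requirement on roots, but its `add` operand bundle has only 3 unique scalars, so the interior node is non-power-of-2. With this change such a node is at least no longer rejected up front and can be represented with a reuse shuffle rather than forcing a gather:

```llvm
; Hypothetical input, for illustration only.
define void @reuse3of4(ptr %p, ptr %q) {
entry:
  %p1 = getelementptr inbounds i32, ptr %p, i64 1
  %p2 = getelementptr inbounds i32, ptr %p, i64 2
  %a0 = load i32, ptr %p, align 4
  %a1 = load i32, ptr %p1, align 4
  %a2 = load i32, ptr %p2, align 4
  ; Only 3 unique adds feed the 4 stores below, so the add node is
  ; 3 wide (non-power-of-2) with reuse mask <0, 1, 2, 0>.
  %s0 = add i32 %a0, 1
  %s1 = add i32 %a1, 1
  %s2 = add i32 %a2, 1
  %q1 = getelementptr inbounds i32, ptr %q, i64 1
  %q2 = getelementptr inbounds i32, ptr %q, i64 2
  %q3 = getelementptr inbounds i32, ptr %q, i64 3
  ; Root bundle of 4 consecutive stores: still power-of-2, as required.
  store i32 %s0, ptr %q, align 4
  store i32 %s1, ptr %q1, align 4
  store i32 %s2, ptr %q2, align 4
  store i32 %s0, ptr %q3, align 4
  ret void
}
```

Whether the 3-wide interior node is actually kept in a given compilation still depends on the target's cost model.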
---
Patch is 266.53 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/151530.diff
59 Files Affected:
- (modified) llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp (+270-180)
- (modified) llvm/test/Transforms/SLPVectorizer/AArch64/InstructionsState-is-invalid-0.ll (+13-11)
- (modified) llvm/test/Transforms/SLPVectorizer/AArch64/PR38339.ll (+4-4)
- (modified) llvm/test/Transforms/SLPVectorizer/AArch64/alternate-vectorization-split-node.ll (+4-2)
- (modified) llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll (+16-14)
- (modified) llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll (+10-10)
- (modified) llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll (+10-10)
- (modified) llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll (+7-4)
- (modified) llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll (+8-17)
- (modified) llvm/test/Transforms/SLPVectorizer/RISCV/gather-insert-point-restore.ll (+3-4)
- (modified) llvm/test/Transforms/SLPVectorizer/RISCV/reordered-buildvector-scalars.ll (+58-148)
- (modified) llvm/test/Transforms/SLPVectorizer/RISCV/smin-signed-zextended.ll (+3-3)
- (modified) llvm/test/Transforms/SLPVectorizer/RISCV/unordered-loads-operands.ll (+21-23)
- (modified) llvm/test/Transforms/SLPVectorizer/SystemZ/reuse-non-power-of-2-reorder.ll (+9-9)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll (+8-8)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll (+5-5)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/bool-mask.ll (+4-12)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/bv-shuffle-mask.ll (+1-3)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/cse.ll (+5-4)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/div-possibly-extended-with-poisons.ll (+18-14)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/extractelement-multi-register-use.ll (+6-8)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll (+7-6)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/extractelements-vector-ops-shuffle.ll (+1-1)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/extractelemets-extended-by-poison.ll (+8-8)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/full-match-with-poison-scalar.ll (+4-4)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/full-matched-bv-with-subvectors.ll (+11-15)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/gather-node-same-reduced.ll (+4-2)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/gathered-shuffle-resized.ll (+8-4)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/identity-match-splat-less-defined.ll (+9-20)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/insert-subvector.ll (+9-14)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll (+11-24)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll (+11-24)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/load-partial-vector-shuffle.ll (+4-23)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/malformed_phis.ll (+2-3)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-order-detection.ll (+3-3)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll (-2)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll (-2)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/pr47642.ll (+3-4)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/pr49081.ll (+3-3)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/reduced-val-vectorized-in-transform.ll (+1-1)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather.ll (-1)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll (-1)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/reordered-masked-loads.ll (+6-8)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/replaced-external-in-reduction.ll (+4-3)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/resized-bv-values-non-power-of2-node.ll (+35-31)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/reuse-extracts-in-wider-vect.ll (+3-3)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/revec-reduced-value-vectorized-later.ll (+4-11)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/same-values-sub-node-with-poisons.ll (+4-4)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder.ll (+2-2)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/sin-sqrt.ll (-1)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/whole-registers-compare.ll (+3-3)
- (modified) llvm/test/Transforms/SLPVectorizer/extract-many-users-buildvector.ll (+25-22)
- (modified) llvm/test/Transforms/SLPVectorizer/gathered-consecutive-loads-different-types.ll (+1-1)
- (modified) llvm/test/Transforms/SLPVectorizer/insertelement-across-zero.ll (+1-1)
- (modified) llvm/test/Transforms/SLPVectorizer/reduction-whole-regs-loads.ll (+2-7)
- (modified) llvm/test/Transforms/SLPVectorizer/reorder-clustered-node.ll (+4-4)
- (modified) llvm/test/Transforms/SLPVectorizer/revec-shufflevector.ll (+26-44)
- (modified) llvm/test/Transforms/SLPVectorizer/revec.ll (+7-9)
- (modified) llvm/test/Transforms/SLPVectorizer/shuffle-mask-resized.ll (+31-16)
``````````diff
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 593868fb8811a..e2d10b69fbb0d 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1856,8 +1856,10 @@ getNumberOfParts(const TargetTransformInfo &TTI, VectorType *VecTy,
if (NumParts == 0 || NumParts >= Limit)
return 1;
unsigned Sz = getNumElements(VecTy);
- if (NumParts >= Sz || Sz % NumParts != 0 ||
- !hasFullVectorsOrPowerOf2(TTI, VecTy->getElementType(), Sz / NumParts))
+ unsigned PWSz =
+ getFullVectorNumberOfElements(TTI, VecTy->getElementType(), Sz);
+ if (NumParts >= Sz || PWSz % NumParts != 0 ||
+ !hasFullVectorsOrPowerOf2(TTI, VecTy->getElementType(), PWSz / NumParts))
return 1;
return NumParts;
}
@@ -1994,6 +1996,9 @@ class BoUpSLP {
VectorizableTree.front()->getVectorFactor());
}
+ /// Returns true if the tree is a reduction tree.
+ bool isReductionTree() const { return UserIgnoreList; }
+
/// Builds external uses of the vectorized scalars, i.e. the list of
/// vectorized scalars to be extracted, their lanes and their scalar users. \p
/// ExternallyUsedValues contains additional list of external uses to handle
@@ -2185,6 +2190,21 @@ class BoUpSLP {
unsigned *BestVF = nullptr,
bool TryRecursiveCheck = true) const;
+ /// Checks if the given array of vectorized values has the same node in the
+ /// tree.
+ bool hasSameNode(const InstructionsState &S, ArrayRef<Value *> VL) const {
+ if (S) {
+ if (any_of(getTreeEntries(S.getMainOp()),
+ [&](const TreeEntry *TE) { return TE->isSame(VL); }))
+ return true;
+ return any_of(ValueToGatherNodes.lookup(S.getMainOp()),
+ [&](const TreeEntry *TE) { return TE->isSame(VL); });
+ }
+ return any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
+ return TE->isGather() && TE->isSame(VL);
+ });
+ }
+
/// Registers non-vectorizable sequence of loads
template <typename T> void registerNonVectorizableLoads(ArrayRef<T *> VL) {
ListOfKnonwnNonVectorizableLoads.insert(hash_value(VL));
@@ -3224,11 +3244,7 @@ class BoUpSLP {
}))
return false;
}
- // TODO: Check if we can remove a check for non-power-2 number of
- // scalars after full support of non-power-2 vectorization.
- return UniqueValues.size() != 2 &&
- hasFullVectorsOrPowerOf2(*R.TTI, Op0.front().V->getType(),
- UniqueValues.size());
+ return UniqueValues.size() != 2;
};
// If the initial strategy fails for any of the operand indexes, then we
@@ -3663,8 +3679,8 @@ class BoUpSLP {
std::optional<TargetTransformInfo::ShuffleKind>
isGatherShuffledSingleRegisterEntry(
const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
- SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
- bool ForOrder);
+ SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part, bool ForOrder,
+ unsigned SliceSize);
/// Checks if the gathered \p VL can be represented as multi-register
/// shuffle(s) of previous tree entries.
@@ -4055,17 +4071,6 @@ class BoUpSLP {
return IsNonPowerOf2;
}
- /// Return true if this is a node, which tries to vectorize number of
- /// elements, forming whole vectors.
- bool
- hasNonWholeRegisterOrNonPowerOf2Vec(const TargetTransformInfo &TTI) const {
- bool IsNonPowerOf2 = !hasFullVectorsOrPowerOf2(
- TTI, getValueType(Scalars.front()), Scalars.size());
- assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
- "Reshuffling not supported with non-power-of-2 vectors yet.");
- return IsNonPowerOf2;
- }
-
Value *getOrdered(unsigned Idx) const {
assert(isGather() && "Must be used only for buildvectors/gathers.");
if (ReorderIndices.empty())
@@ -4222,12 +4227,6 @@ class BoUpSLP {
if (UserTreeIdx.UserTE)
OperandsToTreeEntry.try_emplace(
std::make_pair(UserTreeIdx.UserTE, UserTreeIdx.EdgeIdx), Last);
- // FIXME: Remove once support for ReuseShuffleIndices has been implemented
- // for non-power-of-two vectors.
- assert(
- (hasFullVectorsOrPowerOf2(*TTI, getValueType(VL.front()), VL.size()) ||
- ReuseShuffleIndices.empty()) &&
- "Reshuffling scalars not yet supported for nodes with padding");
Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
ReuseShuffleIndices.end());
if (ReorderIndices.empty()) {
@@ -4386,21 +4385,16 @@ class BoUpSLP {
class ScalarsVectorizationLegality {
InstructionsState S;
bool IsLegal;
- bool TryToFindDuplicates;
bool TrySplitVectorize;
public:
ScalarsVectorizationLegality(InstructionsState S, bool IsLegal,
- bool TryToFindDuplicates = true,
bool TrySplitVectorize = false)
- : S(S), IsLegal(IsLegal), TryToFindDuplicates(TryToFindDuplicates),
- TrySplitVectorize(TrySplitVectorize) {
- assert((!IsLegal || (S.valid() && TryToFindDuplicates)) &&
- "Inconsistent state");
+ : S(S), IsLegal(IsLegal), TrySplitVectorize(TrySplitVectorize) {
+ assert((!IsLegal || S.valid()) && "Inconsistent state");
}
const InstructionsState &getInstructionsState() const { return S; };
bool isLegal() const { return IsLegal; }
- bool tryToFindDuplicates() const { return TryToFindDuplicates; }
bool trySplitVectorize() const { return TrySplitVectorize; }
};
@@ -5567,7 +5561,7 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE,
auto TransformMaskToOrder = [&](MutableArrayRef<unsigned> CurrentOrder,
ArrayRef<int> Mask, int PartSz, int NumParts,
function_ref<unsigned(unsigned)> GetVF) {
- for (int I : seq<int>(0, NumParts)) {
+ for (int I : seq<int>(NumParts)) {
if (ShuffledSubMasks.test(I))
continue;
const int VF = GetVF(I);
@@ -5618,6 +5612,8 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE,
SecondVecFound = true;
break;
}
+ if (static_cast<unsigned>(I * PartSz + Idx) >= CurrentOrder.size())
+ break;
if (CurrentOrder[I * PartSz + Idx] >
static_cast<unsigned>(I * PartSz + K) &&
CurrentOrder[I * PartSz + Idx] !=
@@ -5636,12 +5632,14 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE,
if (!ExtractShuffles.empty())
TransformMaskToOrder(
CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
- if (!ExtractShuffles[I])
+ if (I >= ExtractShuffles.size() || !ExtractShuffles[I])
return 0U;
unsigned VF = 0;
unsigned Sz = getNumElems(TE.getVectorFactor(), PartSz, I);
for (unsigned Idx : seq<unsigned>(Sz)) {
int K = I * PartSz + Idx;
+ if (static_cast<unsigned>(K) >= ExtractMask.size())
+ break;
if (ExtractMask[K] == PoisonMaskElem)
continue;
if (!TE.ReuseShuffleIndices.empty())
@@ -5669,7 +5667,7 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE,
}
if (!Entries.empty())
TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) {
- if (!GatherShuffles[I])
+ if (I >= GatherShuffles.size() || !GatherShuffles[I])
return 0U;
return std::max(Entries[I].front()->getVectorFactor(),
Entries[I].back()->getVectorFactor());
@@ -6381,12 +6379,6 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
if (!TryRecursiveCheck || VL.size() < ListLimit)
return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
- // FIXME: The following code has not been updated for non-power-of-2
- // vectors (and not whole registers). The splitting logic here does not
- // cover the original vector if the vector factor is not a power of two.
- if (!hasFullVectorsOrPowerOf2(TTI, ScalarTy, VL.size()))
- return false;
-
unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
unsigned MinVF = getMinVF(2 * Sz);
DemandedElts.clearAllBits();
@@ -6397,8 +6389,8 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
VF >= MinVF;
VF = getFloorFullVectorNumberOfElements(TTI, ScalarTy, VF - 1)) {
SmallVector<LoadsState> States;
- for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End; Cnt += VF) {
- ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
+ for (unsigned Cnt = 0, End = VL.size(); Cnt < End; Cnt += VF) {
+ ArrayRef<Value *> Slice = VL.slice(Cnt, std::min(VF, End - Cnt));
SmallVector<unsigned> Order;
SmallVector<Value *> PointerOps;
LoadsState LS =
@@ -6410,7 +6402,7 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
DemandedElts.setAllBits();
break;
}
- DemandedElts.setBits(Cnt, Cnt + VF);
+ DemandedElts.setBits(Cnt, Cnt + Slice.size());
continue;
}
// If need the reorder - consider as high-cost masked gather for now.
@@ -6436,13 +6428,14 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
VecLdCost +=
TTI.getInstructionCost(cast<Instruction>(VL[Idx]), CostKind);
}
- auto *SubVecTy = getWidenedType(ScalarTy, VF);
for (auto [I, LS] : enumerate(States)) {
+ const unsigned SliceVF = std::min<unsigned>(VF, VL.size() - I * VF);
+ auto *SubVecTy = getWidenedType(ScalarTy, SliceVF);
auto *LI0 = cast<LoadInst>(VL[I * VF]);
InstructionCost VectorGEPCost =
(LS == LoadsState::ScatterVectorize && ProfitableGatherPointers)
? 0
- : getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF),
+ : getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, SliceVF),
LI0->getPointerOperand(),
Instruction::GetElementPtr, CostKind, ScalarTy,
SubVecTy)
@@ -6456,12 +6449,12 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
getUnderlyingObject(PointerOps.front());
}))
VectorGEPCost += getScalarizationOverhead(
- TTI, ScalarTy, SubVecTy, APInt::getAllOnes(VF),
+ TTI, ScalarTy, SubVecTy, APInt::getAllOnes(SliceVF),
/*Insert=*/true, /*Extract=*/false, CostKind);
else
VectorGEPCost +=
getScalarizationOverhead(
- TTI, ScalarTy, SubVecTy, APInt::getOneBitSet(VF, 0),
+ TTI, ScalarTy, SubVecTy, APInt::getOneBitSet(SliceVF, 0),
/*Insert=*/true, /*Extract=*/false, CostKind) +
::getShuffleCost(TTI, TTI::SK_Broadcast, SubVecTy, {},
CostKind);
@@ -6501,7 +6494,7 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
continue;
}
SmallVector<int> ShuffleMask(VL.size());
- for (int Idx : seq<int>(0, VL.size()))
+ for (int Idx : seq<int>(VL.size()))
ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx;
if (I > 0)
VecLdCost +=
@@ -6740,10 +6733,6 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom,
// No need to reorder if need to shuffle reuses, still need to shuffle the
// node.
if (!TE.ReuseShuffleIndices.empty()) {
- // FIXME: Support ReuseShuffleIndices for non-power-of-two vectors.
- assert(!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI) &&
- "Reshuffling scalars not yet supported for nodes with padding");
-
if (isSplat(TE.Scalars))
return std::nullopt;
// Check if reuse shuffle indices can be improved by reordering.
@@ -7082,12 +7071,9 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom,
Res == LoadsState::CompressVectorize)
return std::move(CurrentOrder);
}
- // FIXME: Remove the non-power-of-two check once findReusedOrderedScalars
- // has been auditted for correctness with non-power-of-two vectors.
- if (!VectorizeNonPowerOf2 || !TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
- if (std::optional<OrdersType> CurrentOrder =
- findReusedOrderedScalars(TE, TopToBottom, IgnoreReorder))
- return CurrentOrder;
+ if (std::optional<OrdersType> CurrentOrder =
+ findReusedOrderedScalars(TE, TopToBottom, IgnoreReorder))
+ return CurrentOrder;
}
return std::nullopt;
}
@@ -7338,7 +7324,7 @@ void BoUpSLP::reorderTopToBottom() {
// Reorder the graph nodes according to their vectorization factor.
for (unsigned VF = VectorizableTree.front()->getVectorFactor();
- !VFToOrderedEntries.empty() && VF > 1; VF -= 2 - (VF & 1U)) {
+ !VFToOrderedEntries.empty() && VF > 1; --VF) {
auto It = VFToOrderedEntries.find(VF);
if (It == VFToOrderedEntries.end())
continue;
@@ -8530,17 +8516,15 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
AllowToVectorize = CheckIfAllowed(Slice);
} else {
AllowToVectorize =
- (NumElts >= 3 ||
- any_of(ValueToGatherNodes.at(Slice.front()),
- [=](const TreeEntry *TE) {
- return TE->Scalars.size() == 2 &&
- ((TE->Scalars.front() == Slice.front() &&
- TE->Scalars.back() == Slice.back()) ||
- (TE->Scalars.front() == Slice.back() &&
- TE->Scalars.back() == Slice.front()));
- })) &&
- hasFullVectorsOrPowerOf2(*TTI, Slice.front()->getType(),
- Slice.size());
+ NumElts >= 3 ||
+ any_of(ValueToGatherNodes.at(Slice.front()),
+ [=](const TreeEntry *TE) {
+ return TE->Scalars.size() == 2 &&
+ ((TE->Scalars.front() == Slice.front() &&
+ TE->Scalars.back() == Slice.back()) ||
+ (TE->Scalars.front() == Slice.back() &&
+ TE->Scalars.back() == Slice.front()));
+ });
}
if (AllowToVectorize) {
SmallVector<Value *> PointerOps;
@@ -9194,10 +9178,6 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
[[fallthrough]];
case Instruction::ExtractValue: {
bool Reuse = canReuseExtract(VL, CurrentOrder);
- // FIXME: Vectorizing is not supported yet for non-power-of-2 ops (and
- // non-full registers).
- if (!hasFullVectorsOrPowerOf2(*TTI, VL0->getType(), VL.size()))
- return TreeEntry::NeedToGather;
if (Reuse || !CurrentOrder.empty())
return TreeEntry::Vectorize;
LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
@@ -9705,7 +9685,7 @@ static bool tryToFindDuplicates(SmallVectorImpl<Value *> &VL,
const TargetLibraryInfo &TLI,
const InstructionsState &S,
const BoUpSLP::EdgeInfo &UserTreeIdx,
- bool TryPad = false) {
+ const BoUpSLP &R, bool BuildGatherOnly = true) {
// Check that every instruction appears once in this bundle.
SmallVector<Value *> UniqueValues;
SmallDenseMap<Value *, unsigned, 16> UniquePositions(VL.size());
@@ -9726,66 +9706,151 @@ static bool tryToFindDuplicates(SmallVectorImpl<Value *> &VL,
// Easy case: VL has unique values and a "natural" size
size_t NumUniqueScalarValues = UniqueValues.size();
- bool IsFullVectors = hasFullVectorsOrPowerOf2(
- TTI, getValueType(UniqueValues.front()), NumUniqueScalarValues);
- if (NumUniqueScalarValues == VL.size() &&
- (VectorizeNonPowerOf2 || IsFullVectors)) {
+ if (NumUniqueScalarValues == VL.size()) {
ReuseShuffleIndices.clear();
return true;
}
+ bool AreAllValuesNonConst = UniquePositions.size() == NumUniqueScalarValues;
+
+ // Check if we need to schedule the scalars. If no, can keep original scalars
+ // and avoid extra shuffles.
+ bool RequireScheduling = S && S.getOpcode() != Instruction::PHI &&
+ !isVectorLikeInstWithConstOps(S.getMainOp()) &&
+ (S.areInstructionsWithCopyableElements() ||
+ !doesNotNeedToSchedule(UniqueValues));
+ // Drop tail poisons, if the values can be vectorized.
+ if (RequireScheduling) {
+ const auto EndIt =
+ find_if_not(make_range(UniqueValues.rbegin(), UniqueValues.rend()),
+ IsaPred<PoisonValue>);
+ assert(EndIt != UniqueValues.rend() && "Expected at least one non-poison.");
+ UniqueValues.erase(EndIt.base(), UniqueValues.end());
+ NumUniqueScalarValues = UniqueValues.size();
+ }
+
+ // Checks if unique inserts + shuffle is more profitable than just inserts or
+ // vectorized values.
+ auto EstimatePackPlusShuffleVsInserts = [&]() {
+ // Single instruction/argument insert - no shuffle.
+ if (UniquePositions.size() == 1 &&
+ (NumUniqueScalarValues == 1 ||
+ all_of(UniqueValues, IsaPred<UndefValue, Instruction, Argument>)))
+ return std::make_pair(false, false);
+ // Check if the given list of loads can be effectively vectorized.
+ auto CheckLoads = [&](ArrayRef<Value *> VL, bool IncludeGather) {
+ assert(S && S.getOpcode() == Instruction::Load && "Expected load.");
+ BoUpSLP::OrdersType Order;
+ SmallVector<Value *> PointerOps;
+ // Modified loads are gathered - use the original loads, result is the
+ // same, but cheaper, no shuffle.
+ BoUpSLP::LoadsState Res =
+ R.canVectorizeLoads(VL, S.getMainOp(), Order, PointerOps);
+ return (IncludeGather && Res == BoUpSLP::LoadsState::Gather) ||
+ Res == BoUpSLP::LoadsState::ScatterVectorize;
+ };
+ // If the scalars are the operands of the root node - try to vectorize them
+ // with shuffles, otherwise we end up with the gather node, which may be
+ // non-profitable/small-tree for the vectorization.
+ if (UserTreeIdx.UserTE && UserTreeIdx.UserTE->Idx == 0 &&
+ !BuildGatherOnly) {
+ if (S && S.getOpcode() == Instruction::Load) {
+ // Modified loads are gathered - use the original loads, result is the
+ // same, but cheaper, no shuffle.
+ return std::make_pair(
+ true, CheckLoads(UniqueValues, /*IncludeGather=*/true) &&
+ CheckLoads(VL, /*IncludeGather=*/false));
+ }
+ return std::make_pair(true, !RequireScheduling);
+ }
+ // Mark unique scalars, to be gathered/buildvectorized.
+ APInt DemandedElts = APInt::getZero(VL.size());
+ for_each(enumerate(ReuseShuffleIndices), [&](const auto &P) {
+ // Do not include constants.
+ if (P.value() != PoisonMaskElem &&
+ UniquePositions.contains(UniqueValues[P.value()]))
+ DemandedElts.setBit(P.index());
+ });
+ Type *ScalarTy = UniqueValues.front()->getType();
+ auto *VecTy = getWidenedType(ScalarTy, VL.size());
+ auto *UniquesVecTy = getWidenedType(ScalarTy, NumUniqueScalarValues);
+ // No need to schedule scalars and only single register used? Use original
+ // scalars, do not pack.
+ if (!RequireScheduling) {
+ const unsigned NumParts = ::getNumberOfParts(TTI, VecTy);
+ if (VL.size() / NumUniqueScalarValues == 1 &&
+ (NumParts <= 1 || ::getNumberOfParts(TTI, UniquesVecTy) >= NumParts))
+ return std::make_pair(true, true);
+ }
+ // Check if unique loads more profitable than repeated loads.
+ if (S && S.getOpcode() == Instruction::Load) {
+ bool UniquesVectorized =
+ !CheckLoads(UniqueValues, /*IncludeGather=*/true);
+ if (UniquesVectorized || CheckLoads(VL, /*IncludeGather=...
[truncated]
``````````
https://github.com/llvm/llvm-project/pull/151530
More information about the llvm-commits mailing list