[llvm] [SLP][NFC] Extract preliminary checks from buildTree_rec, NFC (PR #134132)
Alexey Bataev via llvm-commits
llvm-commits at lists.llvm.org
Thu Apr 10 09:31:42 PDT 2025
https://github.com/alexey-bataev updated https://github.com/llvm/llvm-project/pull/134132
From c950a00c9ca2e5f8bfdb53528f39fa784ec59041 Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev at outlook.com>
Date: Wed, 2 Apr 2025 18:07:10 +0000
Subject: [PATCH 1/2] [𝘀𝗽𝗿] initial version
Created using spr 1.3.5
---
.../Transforms/Vectorize/SLPVectorizer.cpp | 577 +++++++++---------
1 file changed, 291 insertions(+), 286 deletions(-)
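
For orientation: the patch moves every early bail-out of buildTree_rec into a
new const predicate, isLegalToVectorizeScalars, which reports through two
out-parameters whether the caller may still pack duplicated scalars or attempt
split vectorization. The following is a minimal sketch of that control-flow
shape, with simplified hypothetical names (Bundle, isLegalToVectorize,
buildTree), not the real LLVM types:

// Sketch only: hypothetical simplified names; the real signatures are in
// SLPVectorizer.cpp in the diff below.
#include <vector>

struct Bundle { std::vector<int> Scalars; };

// Extracted legality check: returns false on every former early-return path
// and tells the caller which fallbacks are still worth trying.
static bool isLegalToVectorize(const Bundle &B, bool &TryToPackDuplicates,
                               bool &TrySplitVectorize) {
  TryToPackDuplicates = true;
  TrySplitVectorize = false;
  if (B.Scalars.empty()) {        // stand-in for e.g. the catchswitch check
    TryToPackDuplicates = false;  // the "do not try to pack" bail-outs
    return false;
  }
  if (B.Scalars.size() == 1) {    // stand-in for the "no common state" case
    TrySplitVectorize = true;     // caller gets a last chance to split
    return false;
  }
  return true;
}

static void buildTree(const Bundle &B) {
  bool TryToPackDuplicates, TrySplitVectorize;
  if (!isLegalToVectorize(B, TryToPackDuplicates, TrySplitVectorize)) {
    if (TrySplitVectorize) {
      // last-chance split-node attempt (TrySplitNode in the patch)
    }
    if (TryToPackDuplicates) {
      // create the single "not vectorized" (gather) tree entry here
    }
    return;
  }
  // ... the usual per-opcode vectorization path continues ...
}

int main() {
  bool Pack, Split;
  (void)isLegalToVectorize(Bundle{{1, 2}}, Pack, Split);
  buildTree(Bundle{{1, 2}});
}

Because the predicate is const it cannot create tree entries itself, so the
"not vectorized" gather entries are now built in one place in the caller.
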
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 838e952c024c0..83d9065d0a95c 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3857,6 +3857,14 @@ class BoUpSLP {
bool areAltOperandsProfitable(const InstructionsState &S,
ArrayRef<Value *> VL) const;
+ /// Checks if the specified list of the instructions/values can be vectorized
+ /// in general.
+ bool isLegalToVectorizeScalars(ArrayRef<Value *> VL, unsigned Depth,
+ const EdgeInfo &UserTreeIdx,
+ InstructionsState &S,
+ bool &TryToFindDuplicates,
+ bool &TrySplitVectorize) const;
+
/// Checks if the specified list of the instructions/values can be vectorized
/// and fills required data before actual scheduling of the instructions.
TreeEntry::EntryState
@@ -8820,99 +8828,25 @@ getMainAltOpsNoStateVL(ArrayRef<Value *> VL) {
return std::make_pair(MainOp, AltOp);
}
-void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
- const EdgeInfo &UserTreeIdx,
- unsigned InterleaveFactor) {
+bool BoUpSLP::isLegalToVectorizeScalars(ArrayRef<Value *> VL, unsigned Depth,
+ const EdgeInfo &UserTreeIdx,
+ InstructionsState &S,
+ bool &TryToFindDuplicates,
+ bool &TrySplitVectorize) const {
assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
- SmallVector<int> ReuseShuffleIndices;
- SmallVector<Value *> UniqueValues;
- SmallVector<Value *> NonUniqueValueVL;
- auto TryToFindDuplicates = [&](const InstructionsState &S,
- bool DoNotFail = false) {
- // Check that every instruction appears once in this bundle.
- SmallDenseMap<Value *, unsigned, 16> UniquePositions(VL.size());
- for (Value *V : VL) {
- if (isConstant(V)) {
- ReuseShuffleIndices.emplace_back(
- isa<PoisonValue>(V) ? PoisonMaskElem : UniqueValues.size());
- UniqueValues.emplace_back(V);
- continue;
- }
- auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
- ReuseShuffleIndices.emplace_back(Res.first->second);
- if (Res.second)
- UniqueValues.emplace_back(V);
- }
- size_t NumUniqueScalarValues = UniqueValues.size();
- bool IsFullVectors = hasFullVectorsOrPowerOf2(
- *TTI, getValueType(UniqueValues.front()), NumUniqueScalarValues);
- if (NumUniqueScalarValues == VL.size() &&
- (VectorizeNonPowerOf2 || IsFullVectors)) {
- ReuseShuffleIndices.clear();
- } else {
- // FIXME: Reshuffing scalars is not supported yet for non-power-of-2 ops.
- if ((UserTreeIdx.UserTE &&
- UserTreeIdx.UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI)) ||
- !hasFullVectorsOrPowerOf2(*TTI, getValueType(VL.front()),
- VL.size())) {
- LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
- "for nodes with padding.\n");
- auto Invalid = ScheduleBundle::invalid();
- newTreeEntry(VL, Invalid /*not vectorized*/, S, UserTreeIdx);
- return false;
- }
- LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
- if (NumUniqueScalarValues <= 1 || !IsFullVectors ||
- (UniquePositions.size() == 1 && all_of(UniqueValues, [](Value *V) {
- return isa<UndefValue>(V) || !isConstant(V);
- }))) {
- if (DoNotFail && UniquePositions.size() > 1 &&
- NumUniqueScalarValues > 1 && S.getMainOp()->isSafeToRemove() &&
- all_of(UniqueValues, IsaPred<Instruction, PoisonValue>)) {
- // Find the number of elements, which forms full vectors.
- unsigned PWSz = getFullVectorNumberOfElements(
- *TTI, UniqueValues.front()->getType(), UniqueValues.size());
- PWSz = std::min<unsigned>(PWSz, VL.size());
- if (PWSz == VL.size()) {
- ReuseShuffleIndices.clear();
- } else {
- NonUniqueValueVL.assign(UniqueValues.begin(), UniqueValues.end());
- NonUniqueValueVL.append(
- PWSz - UniqueValues.size(),
- PoisonValue::get(UniqueValues.front()->getType()));
- // Check that extended with poisons operations are still valid for
- // vectorization (div/rem are not allowed).
- if (!getSameOpcode(NonUniqueValueVL, *TLI).valid()) {
- LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
- auto Invalid = ScheduleBundle::invalid();
- newTreeEntry(VL, Invalid /*not vectorized*/, S, UserTreeIdx);
- return false;
- }
- VL = NonUniqueValueVL;
- }
- return true;
- }
- LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
- auto Invalid = ScheduleBundle::invalid();
- newTreeEntry(VL, Invalid /*not vectorized*/, S, UserTreeIdx);
- return false;
- }
- VL = UniqueValues;
- }
- return true;
- };
-
- InstructionsState S = getSameOpcode(VL, *TLI);
+ S = getSameOpcode(VL, *TLI);
+ TryToFindDuplicates = true;
+ TrySplitVectorize = false;
// Don't go into catchswitch blocks, which can happen with PHIs.
// Such blocks can only have PHIs and the catchswitch. There is no
// place to insert a shuffle if we need to, so just avoid that issue.
if (S && isa<CatchSwitchInst>(S.getMainOp()->getParent()->getTerminator())) {
LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n");
- auto Invalid = ScheduleBundle::invalid();
- newTreeEntry(VL, Invalid /*not vectorized*/, S, UserTreeIdx);
- return;
+ // Do not try to pack; that would add extra instructions here.
+ TryToFindDuplicates = false;
+ return false;
}
// Check if this is a duplicate of another entry.
@@ -8922,24 +8856,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
if (E->isSame(VL)) {
LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.getMainOp()
<< ".\n");
- if (TryToFindDuplicates(S)) {
- auto Invalid = ScheduleBundle::invalid();
- newTreeEntry(VL, Invalid /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndices);
- }
- return;
+ return false;
}
SmallPtrSet<Value *, 8> Values(llvm::from_range, E->Scalars);
if (all_of(VL, [&](Value *V) {
return isa<PoisonValue>(V) || Values.contains(V);
})) {
LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n");
- if (TryToFindDuplicates(S)) {
- auto Invalid = ScheduleBundle::invalid();
- newTreeEntry(VL, Invalid /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndices);
- }
- return;
+ return false;
}
}
}
@@ -8956,12 +8880,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
cast<Instruction>(I)->getOpcode() == S.getOpcode();
})))) {
LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
- if (TryToFindDuplicates(S)) {
- auto Invalid = ScheduleBundle::invalid();
- newTreeEntry(VL, Invalid /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndices);
- }
- return;
+ return false;
}
// Don't handle scalable vectors
@@ -8969,174 +8888,23 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
isa<ScalableVectorType>(
cast<ExtractElementInst>(S.getMainOp())->getVectorOperandType())) {
LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
- if (TryToFindDuplicates(S)) {
- auto Invalid = ScheduleBundle::invalid();
- newTreeEntry(VL, Invalid /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndices);
- }
- return;
+ return false;
}
// Don't handle vectors.
if (!SLPReVec && getValueType(VL.front())->isVectorTy()) {
LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
- auto Invalid = ScheduleBundle::invalid();
- newTreeEntry(VL, Invalid /*not vectorized*/, S, UserTreeIdx);
- return;
+ // Do not try to pack; that would add extra instructions here.
+ TryToFindDuplicates = false;
+ return false;
}
- // Tries to build split node.
- constexpr unsigned SmallNodeSize = 4;
- auto TrySplitNode = [&, &TTI = *TTI](unsigned SmallNodeSize,
- const InstructionsState &LocalState) {
- if (VL.size() <= SmallNodeSize ||
- TTI.preferAlternateOpcodeVectorization() || !SplitAlternateInstructions)
- return false;
-
- // Any value is used in split node already - just gather.
- if (any_of(VL, [&](Value *V) {
- return ScalarsInSplitNodes.contains(V) || isVectorized(V);
- })) {
- if (TryToFindDuplicates(S)) {
- auto Invalid = ScheduleBundle::invalid();
- newTreeEntry(VL, Invalid /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndices);
- }
- return true;
- }
- SmallVector<Value *> Op1, Op2;
- OrdersType ReorderIndices(VL.size(), VL.size());
- SmallBitVector Op1Indices(VL.size());
- for (auto [Idx, V] : enumerate(VL)) {
- auto *I = dyn_cast<Instruction>(V);
- if (!I) {
- Op1.push_back(V);
- Op1Indices.set(Idx);
- continue;
- }
- if ((LocalState.getAltOpcode() != LocalState.getOpcode() &&
- I->getOpcode() == LocalState.getOpcode()) ||
- (LocalState.getAltOpcode() == LocalState.getOpcode() &&
- !isAlternateInstruction(I, LocalState.getMainOp(),
- LocalState.getAltOp(), *TLI))) {
- Op1.push_back(V);
- Op1Indices.set(Idx);
- continue;
- }
- Op2.push_back(V);
- }
- Type *ScalarTy = getValueType(VL.front());
- VectorType *VecTy = getWidenedType(ScalarTy, VL.size());
- unsigned Opcode0 = LocalState.getOpcode();
- unsigned Opcode1 = LocalState.getAltOpcode();
- SmallBitVector OpcodeMask(getAltInstrMask(VL, Opcode0, Opcode1));
- // Enable split node, only if all nodes do not form legal alternate
- // instruction (like X86 addsub).
- SmallPtrSet<Value *, 4> UOp1(llvm::from_range, Op1);
- SmallPtrSet<Value *, 4> UOp2(llvm::from_range, Op2);
- if (UOp1.size() <= 1 || UOp2.size() <= 1 ||
- TTI.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask) ||
- !hasFullVectorsOrPowerOf2(TTI, Op1.front()->getType(), Op1.size()) ||
- !hasFullVectorsOrPowerOf2(TTI, Op2.front()->getType(), Op2.size()))
- return false;
- // Enable split node, only if all nodes are power-of-2/full registers.
- unsigned Op1Cnt = 0, Op2Cnt = Op1.size();
- for (unsigned Idx : seq<unsigned>(VL.size())) {
- if (Op1Indices.test(Idx)) {
- ReorderIndices[Op1Cnt] = Idx;
- ++Op1Cnt;
- } else {
- ReorderIndices[Op2Cnt] = Idx;
- ++Op2Cnt;
- }
- }
- if (isIdentityOrder(ReorderIndices))
- ReorderIndices.clear();
- SmallVector<int> Mask;
- if (!ReorderIndices.empty())
- inversePermutation(ReorderIndices, Mask);
- unsigned NumParts = TTI.getNumberOfParts(VecTy);
- VectorType *Op1VecTy = getWidenedType(ScalarTy, Op1.size());
- VectorType *Op2VecTy = getWidenedType(ScalarTy, Op2.size());
- // Check non-profitable single register ops, which better to be represented
- // as alternate ops.
- if (NumParts >= VL.size())
- return false;
- if ((LocalState.getMainOp()->isBinaryOp() &&
- LocalState.getAltOp()->isBinaryOp() &&
- (LocalState.isShiftOp() || LocalState.isBitwiseLogicOp() ||
- LocalState.isAddSubLikeOp() || LocalState.isMulDivLikeOp())) ||
- (LocalState.getMainOp()->isCast() && LocalState.getAltOp()->isCast()) ||
- (LocalState.getMainOp()->isUnaryOp() &&
- LocalState.getAltOp()->isUnaryOp())) {
- constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
- InstructionCost InsertCost = ::getShuffleCost(
- TTI, TTI::SK_InsertSubvector, VecTy, {}, Kind, Op1.size(), Op2VecTy);
- FixedVectorType *SubVecTy =
- getWidenedType(ScalarTy, std::max(Op1.size(), Op2.size()));
- InstructionCost NewShuffleCost =
- ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc, SubVecTy, Mask, Kind);
- if (NumParts <= 1 && (Mask.empty() || InsertCost >= NewShuffleCost))
- return false;
- InstructionCost OriginalVecOpsCost =
- TTI.getArithmeticInstrCost(Opcode0, VecTy, Kind) +
- TTI.getArithmeticInstrCost(Opcode1, VecTy, Kind);
- SmallVector<int> OriginalMask(VL.size(), PoisonMaskElem);
- for (unsigned Idx : seq<unsigned>(VL.size())) {
- if (isa<PoisonValue>(VL[Idx]))
- continue;
- OriginalMask[Idx] = Idx + (Op1Indices.test(Idx) ? 0 : VL.size());
- }
- InstructionCost OriginalCost =
- OriginalVecOpsCost + ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
- VecTy, OriginalMask, Kind);
- InstructionCost NewVecOpsCost =
- TTI.getArithmeticInstrCost(Opcode0, Op1VecTy, Kind) +
- TTI.getArithmeticInstrCost(Opcode1, Op2VecTy, Kind);
- InstructionCost NewCost =
- NewVecOpsCost + InsertCost +
- (!VectorizableTree.empty() && VectorizableTree.front()->hasState() &&
- VectorizableTree.front()->getOpcode() == Instruction::Store
- ? NewShuffleCost
- : 0);
- // If not profitable to split - exit.
- if (NewCost >= OriginalCost)
- return false;
- }
-
- SmallVector<Value *> NewVL(VL.size());
- copy(Op1, NewVL.begin());
- copy(Op2, std::next(NewVL.begin(), Op1.size()));
- auto Invalid = ScheduleBundle::invalid();
- auto *TE = newTreeEntry(VL, TreeEntry::SplitVectorize, Invalid, LocalState,
- UserTreeIdx, {}, ReorderIndices);
- LLVM_DEBUG(dbgs() << "SLP: split alternate node.\n"; TE->dump());
- auto AddNode = [&](ArrayRef<Value *> Op, unsigned Idx) {
- InstructionsState S = getSameOpcode(Op, *TLI);
- if (S && (isa<LoadInst>(S.getMainOp()) ||
- getSameValuesTreeEntry(S.getMainOp(), Op, /*SameVF=*/true))) {
- // Build gather node for loads, they will be gathered later.
- TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
- Idx == 0 ? 0 : Op1.size());
- (void)newTreeEntry(Op, TreeEntry::NeedToGather, Invalid, S, {TE, Idx});
- } else {
- TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
- Idx == 0 ? 0 : Op1.size());
- buildTree_rec(Op, Depth, {TE, Idx});
- }
- };
- AddNode(Op1, 0);
- AddNode(Op2, 1);
- return true;
- };
-
// If all of the operands are identical or constant we have a simple solution.
// If we deal with insert/extract instructions, they all must have constant
// indices, otherwise we should gather them, not try to vectorize.
// If alternate op node with 2 elements with gathered operands - do not
// vectorize.
- auto &&NotProfitableForVectorization = [&S, this,
- Depth](ArrayRef<Value *> VL) {
+ auto NotProfitableForVectorization = [&S, this, Depth](ArrayRef<Value *> VL) {
if (!S || !S.isAltShuffle() || VL.size() > 2)
return false;
if (VectorizableTree.size() < MinTreeSize)
@@ -9216,19 +8984,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
!all_of(VL, isVectorLikeInstWithConstOps)) ||
NotProfitableForVectorization(VL)) {
if (!S) {
- auto [MainOp, AltOp] = getMainAltOpsNoStateVL(VL);
- // Last chance to try to vectorize alternate node.
- if (MainOp && AltOp &&
- TrySplitNode(SmallNodeSize, InstructionsState(MainOp, AltOp)))
- return;
+ LLVM_DEBUG(dbgs() << "SLP: Try split and if failed, gathering due to "
+ "C,S,B,O, small shuffle. \n");
+ TrySplitVectorize = true;
+ return false;
}
LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n");
- if (TryToFindDuplicates(S)) {
- auto Invalid = ScheduleBundle::invalid();
- newTreeEntry(VL, Invalid /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndices);
- }
- return;
+ return false;
}
// Don't vectorize ephemeral values.
@@ -9237,9 +8999,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
if (EphValues.count(V)) {
LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
<< ") is ephemeral.\n");
- auto Invalid = ScheduleBundle::invalid();
- newTreeEntry(VL, Invalid /*not vectorized*/, S, UserTreeIdx);
- return;
+ // Do not try to pack; that would add extra instructions here.
+ TryToFindDuplicates = false;
+ return false;
}
}
}
@@ -9288,12 +9050,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
if (PreferScalarize) {
LLVM_DEBUG(dbgs() << "SLP: The instructions are in tree and alternate "
"node is not profitable.\n");
- if (TryToFindDuplicates(S)) {
- auto Invalid = ScheduleBundle::invalid();
- newTreeEntry(VL, Invalid /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndices);
- }
- return;
+ return false;
}
}
@@ -9302,12 +9059,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
for (Value *V : VL) {
if (UserIgnoreList->contains(V)) {
LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
- if (TryToFindDuplicates(S)) {
- auto Invalid = ScheduleBundle::invalid();
- newTreeEntry(VL, Invalid /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndices);
- }
- return;
+ return false;
}
}
}
@@ -9337,8 +9089,256 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// Do not vectorize EH and non-returning blocks, not profitable in most
// cases.
LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
+ return false;
+ }
+ return true;
+}
+
+void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
+ const EdgeInfo &UserTreeIdx,
+ unsigned InterleaveFactor) {
+ assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
+
+ SmallVector<int> ReuseShuffleIndices;
+ SmallVector<Value *> UniqueValues;
+ SmallVector<Value *> NonUniqueValueVL;
+ auto TryToFindDuplicates = [&](const InstructionsState &S,
+ bool DoNotFail = false) {
+ // Check that every instruction appears once in this bundle.
+ SmallDenseMap<Value *, unsigned, 16> UniquePositions(VL.size());
+ for (Value *V : VL) {
+ if (isConstant(V)) {
+ ReuseShuffleIndices.emplace_back(
+ isa<PoisonValue>(V) ? PoisonMaskElem : UniqueValues.size());
+ UniqueValues.emplace_back(V);
+ continue;
+ }
+ auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
+ ReuseShuffleIndices.emplace_back(Res.first->second);
+ if (Res.second)
+ UniqueValues.emplace_back(V);
+ }
+ size_t NumUniqueScalarValues = UniqueValues.size();
+ bool IsFullVectors = hasFullVectorsOrPowerOf2(
+ *TTI, getValueType(UniqueValues.front()), NumUniqueScalarValues);
+ if (NumUniqueScalarValues == VL.size() &&
+ (VectorizeNonPowerOf2 || IsFullVectors)) {
+ ReuseShuffleIndices.clear();
+ } else {
+ // FIXME: Reshuffling scalars is not supported yet for non-power-of-2 ops.
+ if ((UserTreeIdx.UserTE &&
+ UserTreeIdx.UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI)) ||
+ !hasFullVectorsOrPowerOf2(*TTI, getValueType(VL.front()),
+ VL.size())) {
+ LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
+ "for nodes with padding.\n");
+ auto Invalid = ScheduleBundle::invalid();
+ newTreeEntry(VL, Invalid /*not vectorized*/, S, UserTreeIdx);
+ return false;
+ }
+ LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
+ if (NumUniqueScalarValues <= 1 || !IsFullVectors ||
+ (UniquePositions.size() == 1 && all_of(UniqueValues, [](Value *V) {
+ return isa<UndefValue>(V) || !isConstant(V);
+ }))) {
+ if (DoNotFail && UniquePositions.size() > 1 &&
+ NumUniqueScalarValues > 1 && S.getMainOp()->isSafeToRemove() &&
+ all_of(UniqueValues, IsaPred<Instruction, PoisonValue>)) {
+ // Find the number of elements which forms full vectors.
+ unsigned PWSz = getFullVectorNumberOfElements(
+ *TTI, UniqueValues.front()->getType(), UniqueValues.size());
+ PWSz = std::min<unsigned>(PWSz, VL.size());
+ if (PWSz == VL.size()) {
+ ReuseShuffleIndices.clear();
+ } else {
+ NonUniqueValueVL.assign(UniqueValues.begin(), UniqueValues.end());
+ NonUniqueValueVL.append(
+ PWSz - UniqueValues.size(),
+ PoisonValue::get(UniqueValues.front()->getType()));
+ // Check that operations extended with poison are still valid for
+ // vectorization (div/rem are not allowed).
+ if (!getSameOpcode(NonUniqueValueVL, *TLI).valid()) {
+ LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
+ auto Invalid = ScheduleBundle::invalid();
+ newTreeEntry(VL, Invalid /*not vectorized*/, S, UserTreeIdx);
+ return false;
+ }
+ VL = NonUniqueValueVL;
+ }
+ return true;
+ }
+ LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
+ auto Invalid = ScheduleBundle::invalid();
+ newTreeEntry(VL, Invalid /*not vectorized*/, S, UserTreeIdx);
+ return false;
+ }
+ VL = UniqueValues;
+ }
+ return true;
+ };
+
+ InstructionsState S = InstructionsState::invalid();
+ // Tries to build split node.
+ constexpr unsigned SmallNodeSize = 4;
+ auto TrySplitNode = [&, &TTI = *TTI](unsigned SmallNodeSize,
+ const InstructionsState &LocalState) {
+ if (VL.size() <= SmallNodeSize ||
+ TTI.preferAlternateOpcodeVectorization() || !SplitAlternateInstructions)
+ return false;
+
+ // Any value is already used in a split node - just gather.
+ if (any_of(VL, [&](Value *V) {
+ return ScalarsInSplitNodes.contains(V) || isVectorized(V);
+ })) {
+ if (TryToFindDuplicates(S)) {
+ auto Invalid = ScheduleBundle::invalid();
+ newTreeEntry(VL, Invalid /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndices);
+ }
+ return true;
+ }
+ SmallVector<Value *> Op1, Op2;
+ OrdersType ReorderIndices(VL.size(), VL.size());
+ SmallBitVector Op1Indices(VL.size());
+ for (auto [Idx, V] : enumerate(VL)) {
+ auto *I = dyn_cast<Instruction>(V);
+ if (!I) {
+ Op1.push_back(V);
+ Op1Indices.set(Idx);
+ continue;
+ }
+ if ((LocalState.getAltOpcode() != LocalState.getOpcode() &&
+ I->getOpcode() == LocalState.getOpcode()) ||
+ (LocalState.getAltOpcode() == LocalState.getOpcode() &&
+ !isAlternateInstruction(I, LocalState.getMainOp(),
+ LocalState.getAltOp(), *TLI))) {
+ Op1.push_back(V);
+ Op1Indices.set(Idx);
+ continue;
+ }
+ Op2.push_back(V);
+ }
+ Type *ScalarTy = getValueType(VL.front());
+ VectorType *VecTy = getWidenedType(ScalarTy, VL.size());
+ unsigned Opcode0 = LocalState.getOpcode();
+ unsigned Opcode1 = LocalState.getAltOpcode();
+ SmallBitVector OpcodeMask(getAltInstrMask(VL, Opcode0, Opcode1));
+ // Enable split node only if the scalars do not form a legal alternate
+ // instruction (like X86 addsub).
+ SmallPtrSet<Value *, 4> UOp1(llvm::from_range, Op1);
+ SmallPtrSet<Value *, 4> UOp2(llvm::from_range, Op2);
+ if (UOp1.size() <= 1 || UOp2.size() <= 1 ||
+ TTI.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask) ||
+ !hasFullVectorsOrPowerOf2(TTI, Op1.front()->getType(), Op1.size()) ||
+ !hasFullVectorsOrPowerOf2(TTI, Op2.front()->getType(), Op2.size()))
+ return false;
+ // Enable split node only if all nodes are power-of-2/full registers.
+ unsigned Op1Cnt = 0, Op2Cnt = Op1.size();
+ for (unsigned Idx : seq<unsigned>(VL.size())) {
+ if (Op1Indices.test(Idx)) {
+ ReorderIndices[Op1Cnt] = Idx;
+ ++Op1Cnt;
+ } else {
+ ReorderIndices[Op2Cnt] = Idx;
+ ++Op2Cnt;
+ }
+ }
+ if (isIdentityOrder(ReorderIndices))
+ ReorderIndices.clear();
+ SmallVector<int> Mask;
+ if (!ReorderIndices.empty())
+ inversePermutation(ReorderIndices, Mask);
+ unsigned NumParts = TTI.getNumberOfParts(VecTy);
+ VectorType *Op1VecTy = getWidenedType(ScalarTy, Op1.size());
+ VectorType *Op2VecTy = getWidenedType(ScalarTy, Op2.size());
+ // Check for non-profitable single-register ops, which are better
+ // represented as alternate ops.
+ if (NumParts >= VL.size())
+ return false;
+ if ((LocalState.getMainOp()->isBinaryOp() &&
+ LocalState.getAltOp()->isBinaryOp() &&
+ (LocalState.isShiftOp() || LocalState.isBitwiseLogicOp() ||
+ LocalState.isAddSubLikeOp() || LocalState.isMulDivLikeOp())) ||
+ (LocalState.getMainOp()->isCast() && LocalState.getAltOp()->isCast()) ||
+ (LocalState.getMainOp()->isUnaryOp() &&
+ LocalState.getAltOp()->isUnaryOp())) {
+ constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
+ InstructionCost InsertCost = ::getShuffleCost(
+ TTI, TTI::SK_InsertSubvector, VecTy, {}, Kind, Op1.size(), Op2VecTy);
+ FixedVectorType *SubVecTy =
+ getWidenedType(ScalarTy, std::max(Op1.size(), Op2.size()));
+ InstructionCost NewShuffleCost =
+ ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc, SubVecTy, Mask, Kind);
+ if (NumParts <= 1 && (Mask.empty() || InsertCost >= NewShuffleCost))
+ return false;
+ InstructionCost OriginalVecOpsCost =
+ TTI.getArithmeticInstrCost(Opcode0, VecTy, Kind) +
+ TTI.getArithmeticInstrCost(Opcode1, VecTy, Kind);
+ SmallVector<int> OriginalMask(VL.size(), PoisonMaskElem);
+ for (unsigned Idx : seq<unsigned>(VL.size())) {
+ if (isa<PoisonValue>(VL[Idx]))
+ continue;
+ OriginalMask[Idx] = Idx + (Op1Indices.test(Idx) ? 0 : VL.size());
+ }
+ InstructionCost OriginalCost =
+ OriginalVecOpsCost + ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
+ VecTy, OriginalMask, Kind);
+ InstructionCost NewVecOpsCost =
+ TTI.getArithmeticInstrCost(Opcode0, Op1VecTy, Kind) +
+ TTI.getArithmeticInstrCost(Opcode1, Op2VecTy, Kind);
+ InstructionCost NewCost =
+ NewVecOpsCost + InsertCost +
+ (!VectorizableTree.empty() && VectorizableTree.front()->hasState() &&
+ VectorizableTree.front()->getOpcode() == Instruction::Store
+ ? NewShuffleCost
+ : 0);
+ // If not profitable to split - exit.
+ if (NewCost >= OriginalCost)
+ return false;
+ }
+
+ SmallVector<Value *> NewVL(VL.size());
+ copy(Op1, NewVL.begin());
+ copy(Op2, std::next(NewVL.begin(), Op1.size()));
auto Invalid = ScheduleBundle::invalid();
- newTreeEntry(VL, Invalid /*not vectorized*/, S, UserTreeIdx);
+ auto *TE = newTreeEntry(VL, TreeEntry::SplitVectorize, Invalid, LocalState,
+ UserTreeIdx, {}, ReorderIndices);
+ LLVM_DEBUG(dbgs() << "SLP: split alternate node.\n"; TE->dump());
+ auto AddNode = [&](ArrayRef<Value *> Op, unsigned Idx) {
+ InstructionsState S = getSameOpcode(Op, *TLI);
+ if (S && (isa<LoadInst>(S.getMainOp()) ||
+ getSameValuesTreeEntry(S.getMainOp(), Op, /*SameVF=*/true))) {
+ // Build a gather node for loads; they will be gathered later.
+ TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
+ Idx == 0 ? 0 : Op1.size());
+ (void)newTreeEntry(Op, TreeEntry::NeedToGather, Invalid, S, {TE, Idx});
+ } else {
+ TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
+ Idx == 0 ? 0 : Op1.size());
+ buildTree_rec(Op, Depth, {TE, Idx});
+ }
+ };
+ AddNode(Op1, 0);
+ AddNode(Op2, 1);
+ return true;
+ };
+
+ bool TryToPackDuplicates;
+ bool TrySplitVectorize;
+ if (!isLegalToVectorizeScalars(VL, Depth, UserTreeIdx, S, TryToPackDuplicates,
+ TrySplitVectorize)) {
+ if (TrySplitVectorize){
+ auto [MainOp, AltOp] = getMainAltOpsNoStateVL(VL);
+ // Last chance to try to vectorize alternate node.
+ if (MainOp && AltOp &&
+ TrySplitNode(SmallNodeSize, InstructionsState(MainOp, AltOp)))
+ return;
+ }
+ if (!TryToPackDuplicates || TryToFindDuplicates(S)) {
+ auto Invalid = ScheduleBundle::invalid();
+ newTreeEntry(VL, Invalid /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndices);
+ }
return;
}
@@ -9351,6 +9351,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
return;
// Perform specific checks for each particular instruction kind.
+ bool IsScatterVectorizeUserTE =
+ UserTreeIdx.UserTE &&
+ UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
OrdersType CurrentOrder;
SmallVector<Value *> PointerOps;
TreeEntry::EntryState State = getScalarsVectorizationState(
@@ -9362,6 +9365,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
return;
}
+ Instruction *VL0 = S.getMainOp();
+ BasicBlock *BB = VL0->getParent();
auto &BSRef = BlocksSchedules[BB];
if (!BSRef)
BSRef = std::make_unique<BlockScheduling>(BB);
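
One more aside before the follow-up formatting patch: the moved TrySplitNode
lambda builds ReorderIndices as a new-position-to-old-lane map, packing the
main-opcode values (Op1) first and the alternate-opcode values (Op2) after
them. A standalone toy version of just that loop, with hypothetical simplified
types in place of the SLP data structures:

#include <cstdio>
#include <vector>

int main() {
  // true = lane holds a main-opcode value (Op1), false = alternate (Op2).
  std::vector<bool> Op1Indices = {true, false, true, false};
  const unsigned N = Op1Indices.size();
  unsigned Op1Size = 0;
  for (bool B : Op1Indices)
    Op1Size += B;

  // ReorderIndices[NewPos] = OldLane, mirroring the loop in TrySplitNode.
  std::vector<unsigned> ReorderIndices(N);
  unsigned Op1Cnt = 0, Op2Cnt = Op1Size;
  for (unsigned Idx = 0; Idx < N; ++Idx) {
    if (Op1Indices[Idx])
      ReorderIndices[Op1Cnt++] = Idx;
    else
      ReorderIndices[Op2Cnt++] = Idx;
  }
  for (unsigned I : ReorderIndices)
    std::printf("%u ", I); // prints: 0 2 1 3
  return 0;
}

In the patch, an identity order is then discarded (isIdentityOrder followed by
ReorderIndices.clear()), and any non-trivial order is inverted into a shuffle
mask with inversePermutation.
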
>From bdd6a0eddf34a4516455e72eca6a0f52a1da9932 Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev at outlook.com>
Date: Wed, 2 Apr 2025 18:11:38 +0000
Subject: [PATCH 2/2] Fix formatting
Created using spr 1.3.5
---
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 83d9065d0a95c..88c0a096ccfcc 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -9327,7 +9327,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
bool TrySplitVectorize;
if (!isLegalToVectorizeScalars(VL, Depth, UserTreeIdx, S, TryToPackDuplicates,
TrySplitVectorize)) {
- if (TrySplitVectorize){
+ if (TrySplitVectorize) {
auto [MainOp, AltOp] = getMainAltOpsNoStateVL(VL);
// Last chance to try to vectorize alternate node.
if (MainOp && AltOp &&