[llvm] d74e42a - [SLP] Attempt to vectorize long stores if short ones failed.
Alexey Bataev via llvm-commits
llvm-commits at lists.llvm.org
Fri Apr 26 06:57:02 PDT 2024
Author: Alexey Bataev
Date: 2024-04-26T06:53:44-07:00
New Revision: d74e42acd2479eb9f3bd8077fd3be2f3395aa638
URL: https://github.com/llvm/llvm-project/commit/d74e42acd2479eb9f3bd8077fd3be2f3395aa638
DIFF: https://github.com/llvm/llvm-project/commit/d74e42acd2479eb9f3bd8077fd3be2f3395aa638.diff
LOG: [SLP] Attempt to vectorize long stores if short ones failed.
Try to vectorize long store sequences when the short ones were rejected as
unprofitable. This should not increase compile time significantly (the stores
are already sorted, so the extra work is n log n), but it allows extra code to
be vectorized.
Metric: size..text

Program                                                                   results      results0     diff
test-suite :: External/SPEC/CINT2006/400.perlbench/400.perlbench.test    1088012.00   1088236.00    0.0%
test-suite :: SingleSource/UnitTests/matrix-types-spec.test                480396.00    480476.00    0.0%
test-suite :: External/SPEC/CINT2017rate/525.x264_r/525.x264_r.test        664613.00    664661.00    0.0%
test-suite :: External/SPEC/CINT2017speed/625.x264_s/625.x264_s.test       664613.00    664661.00    0.0%
test-suite :: External/SPEC/CFP2017rate/510.parest_r/510.parest_r.test    2041105.00   2040961.00   -0.0%
test-suite :: MultiSource/Applications/JM/lencod/lencod.test               836563.00    836387.00   -0.0%
test-suite :: MultiSource/Benchmarks/7zip/7zip-benchmark.test             1035100.00   1032140.00   -0.3%
In all benchmarks, extra code gets vectorized.
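For illustration, here is a reduced, hypothetical IR sketch of the kind of pattern that
now benefits, modeled loosely on the store_i64 test in pr46983.ll below: four adjacent
i64 stores fed by the same scalar recipe. On a 128-bit target the 2-wide chains can be
rejected as unprofitable; with this change the whole sorted store group is retried at
the wider VF. The function and value names are made up for the example.

define void @wide_store_chain(ptr %p, i64 %m) {
entry:
  ; four consecutive i64 stores to %p[0..3], each value produced by the same
  ; scalar computation (load + mul)
  %v0 = load i64, ptr %p, align 8
  %x0 = mul i64 %v0, %m
  store i64 %x0, ptr %p, align 8
  %p1 = getelementptr inbounds i64, ptr %p, i64 1
  %v1 = load i64, ptr %p1, align 8
  %x1 = mul i64 %v1, %m
  store i64 %x1, ptr %p1, align 8
  %p2 = getelementptr inbounds i64, ptr %p, i64 2
  %v2 = load i64, ptr %p2, align 8
  %x2 = mul i64 %v2, %m
  store i64 %x2, ptr %p2, align 8
  %p3 = getelementptr inbounds i64, ptr %p, i64 3
  %v3 = load i64, ptr %p3, align 8
  %x3 = mul i64 %v3, %m
  store i64 %x3, ptr %p3, align 8
  ret void
}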
Reviewers: RKSimon
Reviewed By: RKSimon
Pull Request: https://github.com/llvm/llvm-project/pull/88563
Added:
Modified:
llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll
llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll
Removed:
################################################################################
diff --git a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
index 326006fbb88039..4f99d171469e49 100644
--- a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
+++ b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
@@ -153,10 +153,15 @@ struct SLPVectorizerPass : public PassInfoMixin<SLPVectorizerPass> {
/// a vectorization chain.
bool vectorizeChainsInBlock(BasicBlock *BB, slpvectorizer::BoUpSLP &R);
- bool vectorizeStoreChain(ArrayRef<Value *> Chain, slpvectorizer::BoUpSLP &R,
- unsigned Idx, unsigned MinVF);
-
- bool vectorizeStores(ArrayRef<StoreInst *> Stores, slpvectorizer::BoUpSLP &R);
+ std::optional<bool> vectorizeStoreChain(ArrayRef<Value *> Chain,
+ slpvectorizer::BoUpSLP &R,
+ unsigned Idx, unsigned MinVF,
+ unsigned &Size);
+
+ bool vectorizeStores(
+ ArrayRef<StoreInst *> Stores, slpvectorizer::BoUpSLP &R,
+ DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
+ &Visited);
/// The store instructions in a basic block organized by base pointer.
StoreListMap Stores;
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 0cd7bd77722260..fbece8c0109c38 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1135,6 +1135,7 @@ class BoUpSLP {
ScalarToTreeEntry.clear();
MultiNodeScalars.clear();
MustGather.clear();
+ NonScheduledFirst.clear();
EntryToLastInstruction.clear();
ExternalUses.clear();
ExternalUsesAsGEPs.clear();
@@ -1252,7 +1253,7 @@ class BoUpSLP {
/// effectively impossible for the backend to undo.
/// TODO: If load combining is allowed in the IR optimizer, this analysis
/// may not be necessary.
- bool isLoadCombineCandidate() const;
+ bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const;
/// Checks if the given array of loads can be represented as a vectorized,
/// scatter or just simple gather.
@@ -2356,6 +2357,14 @@ class BoUpSLP {
bool isAnyGathered(const SmallDenseSet<Value *> &Vals) const {
return any_of(MustGather, [&](Value *V) { return Vals.contains(V); });
}
+ /// Checks if the given value is gathered in one of the nodes.
+ bool isGathered(const Value *V) const {
+ return MustGather.contains(V);
+ }
+  /// Checks if the specified value was not scheduled.
+ bool isNotScheduled(const Value *V) const {
+ return NonScheduledFirst.contains(V);
+ }
/// Check if the value is vectorized in the tree.
bool isVectorized(Value *V) const { return getTreeEntry(V); }
@@ -3071,6 +3080,9 @@ class BoUpSLP {
/// A list of scalars that we found that we need to keep as scalars.
ValueSet MustGather;
+ /// A set of first non-schedulable values.
+ ValueSet NonScheduledFirst;
+
/// A map between the vectorized entries and the last instructions in the
/// bundles. The bundles are built in use order, not in the def order of the
/// instructions. So, we cannot rely directly on the last instruction in the
@@ -6646,6 +6658,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
"tryScheduleBundle should cancelScheduling on failure");
newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
+ NonScheduledFirst.insert(VL.front());
return;
}
LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
@@ -9587,11 +9600,11 @@ bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const {
/* MatchOr */ false);
}
-bool BoUpSLP::isLoadCombineCandidate() const {
+bool BoUpSLP::isLoadCombineCandidate(ArrayRef<Value *> Stores) const {
// Peek through a final sequence of stores and check if all operations are
// likely to be load-combined.
- unsigned NumElts = VectorizableTree[0]->Scalars.size();
- for (Value *Scalar : VectorizableTree[0]->Scalars) {
+ unsigned NumElts = Stores.size();
+ for (Value *Scalar : Stores) {
Value *X;
if (!match(Scalar, m_Store(m_Value(X), m_Value())) ||
!isLoadCombineCandidateImpl(X, NumElts, TTI, /* MatchOr */ true))
@@ -15210,8 +15223,11 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
return Changed;
}
-bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
- unsigned Idx, unsigned MinVF) {
+std::optional<bool>
+SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
+ unsigned Idx, unsigned MinVF,
+ unsigned &Size) {
+ Size = 0;
LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size()
<< "\n");
const unsigned Sz = R.getVectorElementSize(Chain[0]);
@@ -15228,11 +15244,42 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
<< "\n");
+ SetVector<Value *> ValOps;
+ for (Value *V : Chain)
+ ValOps.insert(cast<StoreInst>(V)->getValueOperand());
+ // Operands are not same/alt opcodes or non-power-of-2 uniques - exit.
+ InstructionsState S = getSameOpcode(ValOps.getArrayRef(), *TLI);
+ if (all_of(ValOps, IsaPred<Instruction>) && ValOps.size() > 1) {
+ DenseSet<Value *> Stores(Chain.begin(), Chain.end());
+ bool IsPowerOf2 =
+ isPowerOf2_32(ValOps.size()) ||
+ (VectorizeNonPowerOf2 && isPowerOf2_32(ValOps.size() + 1));
+ if ((!IsPowerOf2 && S.getOpcode() && S.getOpcode() != Instruction::Load &&
+ (!S.MainOp->isSafeToRemove() ||
+ any_of(ValOps.getArrayRef(),
+ [&](Value *V) {
+ return !isa<ExtractElementInst>(V) &&
+ (V->getNumUses() > Chain.size() ||
+ any_of(V->users(), [&](User *U) {
+ return !Stores.contains(U);
+ }));
+ }))) ||
+ (ValOps.size() > Chain.size() / 2 && !S.getOpcode())) {
+ Size = (!IsPowerOf2 && S.getOpcode()) ? 1 : 2;
+ return false;
+ }
+ }
+ if (R.isLoadCombineCandidate(Chain))
+ return true;
R.buildTree(Chain);
- if (R.isTreeTinyAndNotFullyVectorizable())
- return false;
- if (R.isLoadCombineCandidate())
+  // Check if the tree is tiny and the store itself or its value is not vectorized.
+ if (R.isTreeTinyAndNotFullyVectorizable()) {
+ if (R.isGathered(Chain.front()) ||
+ R.isNotScheduled(cast<StoreInst>(Chain.front())->getValueOperand()))
+ return std::nullopt;
+ Size = R.getTreeSize();
return false;
+ }
R.reorderTopToBottom();
R.reorderBottomToTop();
R.buildExternalUses();
@@ -15240,6 +15287,9 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
R.computeMinimumValueSizes();
R.transformNodes();
+ Size = R.getTreeSize();
+ if (S.getOpcode() == Instruction::Load)
+ Size = 2; // cut off masked gather small trees
InstructionCost Cost = R.getTreeCost();
LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF << "\n");
@@ -15261,17 +15311,45 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
return false;
}
-bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
- BoUpSLP &R) {
+/// Checks if the quadratic mean deviation is less than 90% of the mean size.
+static bool checkTreeSizes(ArrayRef<std::pair<unsigned, unsigned>> Sizes,
+ bool First) {
+ unsigned Num = 0;
+ uint64_t Sum = std::accumulate(
+ Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
+ [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
+ unsigned Size = First ? Val.first : Val.second;
+ if (Size == 1)
+ return V;
+ ++Num;
+ return V + Size;
+ });
+ if (Num == 0)
+ return true;
+ uint64_t Mean = Sum / Num;
+ if (Mean == 0)
+ return true;
+ uint64_t Dev = std::accumulate(
+ Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
+ [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
+ unsigned P = First ? Val.first : Val.second;
+ if (P == 1)
+ return V;
+ return V + (P - Mean) * (P - Mean);
+ }) /
+ Num;
+ return Dev * 81 / (Mean * Mean) == 0;
+}
+
+bool SLPVectorizerPass::vectorizeStores(
+ ArrayRef<StoreInst *> Stores, BoUpSLP &R,
+ DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
+ &Visited) {
// We may run into multiple chains that merge into a single chain. We mark the
// stores that we vectorized so that we don't visit the same store twice.
BoUpSLP::ValueSet VectorizedStores;
bool Changed = false;
- // Stores the pair of stores (first_store, last_store) in a range, that were
- // already tried to be vectorized. Allows to skip the store ranges that were
- // already tried to be vectorized but the attempts were unsuccessful.
- DenseSet<std::pair<Value *, Value *>> TriedSequences;
struct StoreDistCompare {
bool operator()(const std::pair<unsigned, int> &Op1,
const std::pair<unsigned, int> &Op2) const {
@@ -15299,7 +15377,14 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
PrevDist = DataVar.second;
});
- if (Operands.size() <= 1)
+ if (Operands.size() <= 1 ||
+ !Visited
+ .insert({Operands.front(),
+ cast<StoreInst>(Operands.front())->getValueOperand(),
+ Operands.back(),
+ cast<StoreInst>(Operands.back())->getValueOperand(),
+ Operands.size()})
+ .second)
continue;
unsigned MaxVecRegSize = R.getMaxVecRegSize();
@@ -15308,13 +15393,19 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
unsigned MaxVF =
std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
+ unsigned MaxRegVF = MaxVF;
auto *Store = cast<StoreInst>(Operands[0]);
Type *StoreTy = Store->getValueOperand()->getType();
Type *ValueTy = StoreTy;
if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
ValueTy = Trunc->getSrcTy();
- unsigned MinVF = PowerOf2Ceil(TTI->getStoreMinimumVF(
- R.getMinVF(DL->getTypeStoreSizeInBits(StoreTy)), StoreTy, ValueTy));
+ if (ValueTy == StoreTy &&
+ R.getVectorElementSize(Store->getValueOperand()) <= EltSize)
+ MaxVF = std::min<unsigned>(MaxVF, bit_floor(Operands.size()));
+ unsigned MinVF = std::max<unsigned>(
+ 2, PowerOf2Ceil(TTI->getStoreMinimumVF(
+ R.getMinVF(DL->getTypeStoreSizeInBits(StoreTy)), StoreTy,
+ ValueTy)));
if (MaxVF < MinVF) {
LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
@@ -15329,7 +15420,7 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
// consider cases where VF + 1 is a power-of-2, i.e. almost all vector
// lanes are used.
unsigned CandVF = Operands.size();
- if (isPowerOf2_32(CandVF + 1) && CandVF <= MaxVF)
+ if (isPowerOf2_32(CandVF + 1) && CandVF <= MaxRegVF)
NonPowerOf2VF = CandVF;
}
@@ -15340,40 +15431,184 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
VF = Size > MaxVF ? NonPowerOf2VF : Size;
Size *= 2;
});
- unsigned StartIdx = 0;
- for (unsigned Size : CandidateVFs) {
- for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) {
- ArrayRef<Value *> Slice = ArrayRef(Operands).slice(Cnt, Size);
- assert(
- all_of(
- Slice,
- [&](Value *V) {
- return cast<StoreInst>(V)->getValueOperand()->getType() ==
- cast<StoreInst>(Slice.front())
- ->getValueOperand()
- ->getType();
- }) &&
- "Expected all operands of same type.");
- if (!VectorizedStores.count(Slice.front()) &&
- !VectorizedStores.count(Slice.back()) &&
- TriedSequences.insert(std::make_pair(Slice.front(), Slice.back()))
- .second &&
- vectorizeStoreChain(Slice, R, Cnt, MinVF)) {
- // Mark the vectorized stores so that we don't vectorize them again.
- VectorizedStores.insert(Slice.begin(), Slice.end());
- Changed = true;
- // If we vectorized initial block, no need to try to vectorize it
- // again.
- if (Cnt == StartIdx)
- StartIdx += Size;
- Cnt += Size;
- continue;
+ unsigned End = Operands.size();
+ unsigned Repeat = 0;
+ constexpr unsigned MaxAttempts = 4;
+ OwningArrayRef<std::pair<unsigned, unsigned>> RangeSizes(Operands.size());
+ for_each(RangeSizes, [](std::pair<unsigned, unsigned> &P) {
+ P.first = P.second = 1;
+ });
+ DenseMap<Value *, std::pair<unsigned, unsigned>> NonSchedulable;
+ auto IsNotVectorized = [](bool First,
+ const std::pair<unsigned, unsigned> &P) {
+ return First ? P.first > 0 : P.second > 0;
+ };
+ auto IsVectorized = [](bool First,
+ const std::pair<unsigned, unsigned> &P) {
+ return First ? P.first == 0 : P.second == 0;
+ };
+ auto VFIsProfitable = [](bool First, unsigned Size,
+ const std::pair<unsigned, unsigned> &P) {
+ return First ? Size >= P.first : Size >= P.second;
+ };
+ auto FirstSizeSame = [](unsigned Size,
+ const std::pair<unsigned, unsigned> &P) {
+ return Size == P.first;
+ };
+ while (true) {
+ ++Repeat;
+ bool RepeatChanged = false;
+ bool AnyProfitableGraph;
+ for (unsigned Size : CandidateVFs) {
+ AnyProfitableGraph = false;
+ unsigned StartIdx = std::distance(
+ RangeSizes.begin(),
+ find_if(RangeSizes, std::bind(IsNotVectorized, Size >= MaxRegVF,
+ std::placeholders::_1)));
+ while (StartIdx < End) {
+ unsigned EndIdx =
+ std::distance(RangeSizes.begin(),
+ find_if(RangeSizes.drop_front(StartIdx),
+ std::bind(IsVectorized, Size >= MaxRegVF,
+ std::placeholders::_1)));
+ unsigned Sz = EndIdx >= End ? End : EndIdx;
+ for (unsigned Cnt = StartIdx; Cnt + Size <= Sz;) {
+ if (!checkTreeSizes(RangeSizes.slice(Cnt, Size),
+ Size >= MaxRegVF)) {
+ ++Cnt;
+ continue;
+ }
+ ArrayRef<Value *> Slice = ArrayRef(Operands).slice(Cnt, Size);
+ assert(all_of(Slice,
+ [&](Value *V) {
+ return cast<StoreInst>(V)
+ ->getValueOperand()
+ ->getType() ==
+ cast<StoreInst>(Slice.front())
+ ->getValueOperand()
+ ->getType();
+ }) &&
+ "Expected all operands of same type.");
+ if (!NonSchedulable.empty()) {
+ auto [NonSchedSizeMax, NonSchedSizeMin] =
+ NonSchedulable.lookup(Slice.front());
+ if (NonSchedSizeMax > 0 && NonSchedSizeMin <= Size) {
+ Cnt += NonSchedSizeMax;
+ continue;
+ }
+ }
+ unsigned TreeSize;
+ std::optional<bool> Res =
+ vectorizeStoreChain(Slice, R, Cnt, MinVF, TreeSize);
+ if (!Res) {
+ NonSchedulable
+ .try_emplace(Slice.front(), std::make_pair(Size, Size))
+ .first->getSecond()
+ .second = Size;
+ } else if (*Res) {
+ // Mark the vectorized stores so that we don't vectorize them
+ // again.
+ VectorizedStores.insert(Slice.begin(), Slice.end());
+ // Mark the vectorized stores so that we don't vectorize them
+ // again.
+ AnyProfitableGraph = RepeatChanged = Changed = true;
+ // If we vectorized initial block, no need to try to vectorize
+ // it again.
+ for_each(RangeSizes.slice(Cnt, Size),
+ [](std::pair<unsigned, unsigned> &P) {
+ P.first = P.second = 0;
+ });
+ if (Cnt < StartIdx + MinVF) {
+ for_each(RangeSizes.slice(StartIdx, Cnt - StartIdx),
+ [](std::pair<unsigned, unsigned> &P) {
+ P.first = P.second = 0;
+ });
+ StartIdx = Cnt + Size;
+ }
+ if (Cnt > Sz - Size - MinVF) {
+ for_each(RangeSizes.slice(Cnt + Size, Sz - (Cnt + Size)),
+ [](std::pair<unsigned, unsigned> &P) {
+ P.first = P.second = 0;
+ });
+ if (Sz == End)
+ End = Cnt;
+ Sz = Cnt;
+ }
+ Cnt += Size;
+ continue;
+ }
+ if (Size > 2 && Res &&
+ !all_of(RangeSizes.slice(Cnt, Size),
+ std::bind(VFIsProfitable, Size >= MaxRegVF, TreeSize,
+ std::placeholders::_1))) {
+ Cnt += Size;
+ continue;
+ }
+ // Check for the very big VFs that we're not rebuilding same
+ // trees, just with larger number of elements.
+ if (Size > MaxRegVF && TreeSize > 1 &&
+ all_of(RangeSizes.slice(Cnt, Size),
+ std::bind(FirstSizeSame, TreeSize,
+ std::placeholders::_1))) {
+ Cnt += Size;
+ while (Cnt != Sz && RangeSizes[Cnt].first == TreeSize)
+ ++Cnt;
+ continue;
+ }
+ if (TreeSize > 1)
+ for_each(RangeSizes.slice(Cnt, Size),
+ [&](std::pair<unsigned, unsigned> &P) {
+ if (Size >= MaxRegVF)
+ P.second = std::max(P.second, TreeSize);
+ else
+ P.first = std::max(P.first, TreeSize);
+ });
+ ++Cnt;
+ AnyProfitableGraph = true;
+ }
+ if (StartIdx >= End)
+ break;
+ if (Sz - StartIdx < Size && Sz - StartIdx >= MinVF)
+ AnyProfitableGraph = true;
+ StartIdx = std::distance(
+ RangeSizes.begin(),
+ find_if(RangeSizes.drop_front(Sz),
+ std::bind(IsNotVectorized, Size >= MaxRegVF,
+ std::placeholders::_1)));
}
- ++Cnt;
+ if (!AnyProfitableGraph && Size >= MaxRegVF)
+ break;
}
- // Check if the whole array was vectorized already - exit.
- if (StartIdx >= Operands.size())
+ // All values vectorized - exit.
+ if (all_of(RangeSizes, [](const std::pair<unsigned, unsigned> &P) {
+ return P.first == 0 && P.second == 0;
+ }))
break;
+ // Check if tried all attempts or no need for the last attempts at all.
+ if (Repeat >= MaxAttempts ||
+ (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
+ break;
+ constexpr unsigned StoresLimit = 64;
+ const unsigned MaxTotalNum = bit_floor(std::min<unsigned>(
+ Operands.size(),
+ static_cast<unsigned>(
+ End -
+ std::distance(
+ RangeSizes.begin(),
+ find_if(RangeSizes, std::bind(IsNotVectorized, true,
+ std::placeholders::_1))) +
+ 1)));
+ unsigned VF = PowerOf2Ceil(CandidateVFs.front()) * 2;
+ if (VF > MaxTotalNum || VF >= StoresLimit)
+ break;
+ for_each(RangeSizes, [&](std::pair<unsigned, unsigned> &P) {
+ if (P.first != 0)
+ P.first = std::max(P.second, P.first);
+ });
+ // Last attempt to vectorize max number of elements, if all previous
+ // attempts were unsuccessful because of the cost issues.
+ CandidateVFs.clear();
+ CandidateVFs.push_back(VF);
}
}
};
@@ -18191,6 +18426,7 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
};
// Attempt to sort and vectorize each of the store-groups.
+ DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>> Attempted;
for (auto &Pair : Stores) {
if (Pair.second.size() < 2)
continue;
@@ -18208,8 +18444,8 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
Pair.second.rend());
Changed |= tryToVectorizeSequence<StoreInst>(
ReversedStores, StoreSorter, AreCompatibleStores,
- [this, &R](ArrayRef<StoreInst *> Candidates, bool) {
- return vectorizeStores(Candidates, R);
+ [&](ArrayRef<StoreInst *> Candidates, bool) {
+ return vectorizeStores(Candidates, R, Attempted);
},
/*MaxVFOnly=*/false, R);
}
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll
index 47d918eabdfe2b..9bbd314a27cb95 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll
@@ -537,24 +537,18 @@ entry:
}
define void @vec3_extract(<3 x i16> %pixel.sroa.0.4.vec.insert606, ptr %call3.i536) {
-; NON-POW2-LABEL: define void @vec3_extract(
-; NON-POW2-SAME: <3 x i16> [[PIXEL_SROA_0_4_VEC_INSERT606:%.*]], ptr [[CALL3_I536:%.*]]) {
-; NON-POW2-NEXT: entry:
-; NON-POW2-NEXT: store <3 x i16> [[PIXEL_SROA_0_4_VEC_INSERT606]], ptr [[CALL3_I536]], align 2
-; NON-POW2-NEXT: ret void
-;
-; POW2-ONLY-LABEL: define void @vec3_extract(
-; POW2-ONLY-SAME: <3 x i16> [[PIXEL_SROA_0_4_VEC_INSERT606:%.*]], ptr [[CALL3_I536:%.*]]) {
-; POW2-ONLY-NEXT: entry:
-; POW2-ONLY-NEXT: [[PIXEL_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <3 x i16> [[PIXEL_SROA_0_4_VEC_INSERT606]], i64 2
-; POW2-ONLY-NEXT: [[RED668:%.*]] = getelementptr i16, ptr [[CALL3_I536]], i64 2
-; POW2-ONLY-NEXT: store i16 [[PIXEL_SROA_0_4_VEC_EXTRACT]], ptr [[RED668]], align 2
-; POW2-ONLY-NEXT: [[PIXEL_SROA_0_2_VEC_EXTRACT:%.*]] = extractelement <3 x i16> [[PIXEL_SROA_0_4_VEC_INSERT606]], i64 1
-; POW2-ONLY-NEXT: [[GREEN670:%.*]] = getelementptr i16, ptr [[CALL3_I536]], i64 1
-; POW2-ONLY-NEXT: store i16 [[PIXEL_SROA_0_2_VEC_EXTRACT]], ptr [[GREEN670]], align 2
-; POW2-ONLY-NEXT: [[PIXEL_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <3 x i16> [[PIXEL_SROA_0_4_VEC_INSERT606]], i64 0
-; POW2-ONLY-NEXT: store i16 [[PIXEL_SROA_0_0_VEC_EXTRACT]], ptr [[CALL3_I536]], align 2
-; POW2-ONLY-NEXT: ret void
+; CHECK-LABEL: define void @vec3_extract(
+; CHECK-SAME: <3 x i16> [[PIXEL_SROA_0_4_VEC_INSERT606:%.*]], ptr [[CALL3_I536:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[PIXEL_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <3 x i16> [[PIXEL_SROA_0_4_VEC_INSERT606]], i64 2
+; CHECK-NEXT: [[RED668:%.*]] = getelementptr i16, ptr [[CALL3_I536]], i64 2
+; CHECK-NEXT: store i16 [[PIXEL_SROA_0_4_VEC_EXTRACT]], ptr [[RED668]], align 2
+; CHECK-NEXT: [[PIXEL_SROA_0_2_VEC_EXTRACT:%.*]] = extractelement <3 x i16> [[PIXEL_SROA_0_4_VEC_INSERT606]], i64 1
+; CHECK-NEXT: [[GREEN670:%.*]] = getelementptr i16, ptr [[CALL3_I536]], i64 1
+; CHECK-NEXT: store i16 [[PIXEL_SROA_0_2_VEC_EXTRACT]], ptr [[GREEN670]], align 2
+; CHECK-NEXT: [[PIXEL_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <3 x i16> [[PIXEL_SROA_0_4_VEC_INSERT606]], i64 0
+; CHECK-NEXT: store i16 [[PIXEL_SROA_0_0_VEC_EXTRACT]], ptr [[CALL3_I536]], align 2
+; CHECK-NEXT: ret void
;
entry:
%pixel.sroa.0.4.vec.extract = extractelement <3 x i16> %pixel.sroa.0.4.vec.insert606, i64 2
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll
index 75505f632a43f3..29021150ccd2e3 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE
-; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE
-; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX
-; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX
-; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=CHECK,AVX
+; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2 -slp-threshold=-1 | FileCheck %s
+; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.2 | FileCheck %s
+; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx | FileCheck %s
+; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 | FileCheck %s
+; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512bw,+avx512vl | FileCheck %s
define void @store_i32(ptr nocapture %0, i32 %1, i32 %2) {
; CHECK-LABEL: @store_i32(
@@ -98,58 +98,19 @@ define void @store_i8(ptr nocapture %0, i32 %1, i32 %2) {
}
define void @store_i64(ptr nocapture %0, i32 %1, i32 %2) {
-; SSE-LABEL: @store_i64(
-; SSE-NEXT: [[TMP4:%.*]] = zext i32 [[TMP1:%.*]] to i64
-; SSE-NEXT: [[TMP5:%.*]] = load i64, ptr [[TMP0:%.*]], align 8, !tbaa [[TBAA5:![0-9]+]]
-; SSE-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], [[TMP4]]
-; SSE-NEXT: [[TMP7:%.*]] = lshr i64 [[TMP6]], 15
-; SSE-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP7]] to i32
-; SSE-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], 255
-; SSE-NEXT: [[TMP10:%.*]] = and i64 [[TMP7]], 4294967295
-; SSE-NEXT: [[TMP11:%.*]] = select i1 [[TMP9]], i64 [[TMP10]], i64 255
-; SSE-NEXT: store i64 [[TMP11]], ptr [[TMP0]], align 8, !tbaa [[TBAA5]]
-; SSE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 8
-; SSE-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP12]], align 8, !tbaa [[TBAA5]]
-; SSE-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], [[TMP4]]
-; SSE-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP14]], 15
-; SSE-NEXT: [[TMP16:%.*]] = trunc i64 [[TMP15]] to i32
-; SSE-NEXT: [[TMP17:%.*]] = icmp ult i32 [[TMP16]], 255
-; SSE-NEXT: [[TMP18:%.*]] = and i64 [[TMP15]], 4294967295
-; SSE-NEXT: [[TMP19:%.*]] = select i1 [[TMP17]], i64 [[TMP18]], i64 255
-; SSE-NEXT: store i64 [[TMP19]], ptr [[TMP12]], align 8, !tbaa [[TBAA5]]
-; SSE-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 16
-; SSE-NEXT: [[TMP21:%.*]] = load i64, ptr [[TMP20]], align 8, !tbaa [[TBAA5]]
-; SSE-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], [[TMP4]]
-; SSE-NEXT: [[TMP23:%.*]] = lshr i64 [[TMP22]], 15
-; SSE-NEXT: [[TMP24:%.*]] = trunc i64 [[TMP23]] to i32
-; SSE-NEXT: [[TMP25:%.*]] = icmp ult i32 [[TMP24]], 255
-; SSE-NEXT: [[TMP26:%.*]] = and i64 [[TMP23]], 4294967295
-; SSE-NEXT: [[TMP27:%.*]] = select i1 [[TMP25]], i64 [[TMP26]], i64 255
-; SSE-NEXT: store i64 [[TMP27]], ptr [[TMP20]], align 8, !tbaa [[TBAA5]]
-; SSE-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 24
-; SSE-NEXT: [[TMP29:%.*]] = load i64, ptr [[TMP28]], align 8, !tbaa [[TBAA5]]
-; SSE-NEXT: [[TMP30:%.*]] = mul i64 [[TMP29]], [[TMP4]]
-; SSE-NEXT: [[TMP31:%.*]] = lshr i64 [[TMP30]], 15
-; SSE-NEXT: [[TMP32:%.*]] = trunc i64 [[TMP31]] to i32
-; SSE-NEXT: [[TMP33:%.*]] = icmp ult i32 [[TMP32]], 255
-; SSE-NEXT: [[TMP34:%.*]] = and i64 [[TMP31]], 4294967295
-; SSE-NEXT: [[TMP35:%.*]] = select i1 [[TMP33]], i64 [[TMP34]], i64 255
-; SSE-NEXT: store i64 [[TMP35]], ptr [[TMP28]], align 8, !tbaa [[TBAA5]]
-; SSE-NEXT: ret void
-;
-; AVX-LABEL: @store_i64(
-; AVX-NEXT: [[TMP4:%.*]] = zext i32 [[TMP1:%.*]] to i64
-; AVX-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 8, !tbaa [[TBAA5:![0-9]+]]
-; AVX-NEXT: [[TMP6:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i64 0
-; AVX-NEXT: [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> poison, <4 x i32> zeroinitializer
-; AVX-NEXT: [[TMP8:%.*]] = mul <4 x i64> [[TMP5]], [[TMP7]]
-; AVX-NEXT: [[TMP9:%.*]] = lshr <4 x i64> [[TMP8]], <i64 15, i64 15, i64 15, i64 15>
-; AVX-NEXT: [[TMP10:%.*]] = trunc <4 x i64> [[TMP9]] to <4 x i32>
-; AVX-NEXT: [[TMP11:%.*]] = icmp ult <4 x i32> [[TMP10]], <i32 255, i32 255, i32 255, i32 255>
-; AVX-NEXT: [[TMP12:%.*]] = and <4 x i64> [[TMP9]], <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
-; AVX-NEXT: [[TMP13:%.*]] = select <4 x i1> [[TMP11]], <4 x i64> [[TMP12]], <4 x i64> <i64 255, i64 255, i64 255, i64 255>
-; AVX-NEXT: store <4 x i64> [[TMP13]], ptr [[TMP0]], align 8, !tbaa [[TBAA5]]
-; AVX-NEXT: ret void
+; CHECK-LABEL: @store_i64(
+; CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP1:%.*]] to i64
+; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 8, !tbaa [[TBAA5:![0-9]+]]
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i64 0
+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = mul <4 x i64> [[TMP5]], [[TMP7]]
+; CHECK-NEXT: [[TMP9:%.*]] = lshr <4 x i64> [[TMP8]], <i64 15, i64 15, i64 15, i64 15>
+; CHECK-NEXT: [[TMP10:%.*]] = trunc <4 x i64> [[TMP9]] to <4 x i32>
+; CHECK-NEXT: [[TMP11:%.*]] = icmp ult <4 x i32> [[TMP10]], <i32 255, i32 255, i32 255, i32 255>
+; CHECK-NEXT: [[TMP12:%.*]] = and <4 x i64> [[TMP9]], <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
+; CHECK-NEXT: [[TMP13:%.*]] = select <4 x i1> [[TMP11]], <4 x i64> [[TMP12]], <4 x i64> <i64 255, i64 255, i64 255, i64 255>
+; CHECK-NEXT: store <4 x i64> [[TMP13]], ptr [[TMP0]], align 8, !tbaa [[TBAA5]]
+; CHECK-NEXT: ret void
;
%4 = zext i32 %1 to i64
%5 = load i64, ptr %0, align 8, !tbaa !7