[llvm] 58b0d7c - [SLP]Improve stores vectorization.
Alexey Bataev via llvm-commits
llvm-commits at lists.llvm.org
Thu Aug 3 16:18:17 PDT 2023
Author: Alexey Bataev
Date: 2023-08-03T16:14:21-07:00
New Revision: 58b0d7c34ddd9d2117009a8cd7bd5e34a8276082
URL: https://github.com/llvm/llvm-project/commit/58b0d7c34ddd9d2117009a8cd7bd5e34a8276082
DIFF: https://github.com/llvm/llvm-project/commit/58b0d7c34ddd9d2117009a8cd7bd5e34a8276082.diff
LOG: [SLP]Improve stores vectorization.
Use O(nlogn) instead of O(N2) (N <= 32) sorting approach and do not try
to revectorize all possible combinations of stores, if they
definitely cannot be combined because of mem/data dependencies.
Compile time (O3 + lto, skylake_avx512):
External/SPEC/CINT2006/483.xalancbmk/483.xalancbmk.test 117.15 120.11 2.5%
External/SPEC/CINT2017speed/623.xalancbmk_s/623.xalancbmk_s.test 203.67 207.42 1.8%
External/SPEC/CFP2017rate/526.blender_r/526.blender_r.test 232.43 235.01 1.1%
External/SPEC/CINT2017rate/523.xalancbmk_r/523.xalancbmk_r.test 205.49 207.25 0.9%
External/SPEC/CFP2017rate/510.parest_r/510.parest_r.test 310.46 306.23 -1.4%
Link time (O3+lto, skylake_avx512):
External/SPEC/CFP2017rate/526.blender_r/526.blender_r.test 1383.69 1475.94 6.7%
Other changes are too small, cannot rely on them.
size..text
Program size..text
results results0 diff
test-suite :: SingleSource/Regression/C/Regression-C-sumarray.test 392.00 1439.00 267.1%
test-suite :: MultiSource/Applications/JM/ldecod/ldecod.test 394258.00 394818.00 0.1%
test-suite :: MultiSource/Applications/JM/lencod/lencod.test 846355.00 847075.00 0.1%
test-suite :: External/SPEC/CINT2006/464.h264ref/464.h264ref.test 782816.00 783360.00 0.1%
test-suite :: External/SPEC/CFP2017rate/508.namd_r/508.namd_r.test 779667.00 779923.00 0.0%
test-suite :: MultiSource/Benchmarks/mafft/pairlocalalign.test 224398.00 224446.00 0.0%
test-suite :: MultiSource/Applications/oggenc/oggenc.test 185019.00 185035.00 0.0%
test-suite :: External/SPEC/CFP2017rate/526.blender_r/526.blender_r.test 12487610.00 12488010.00 0.0%
test-suite :: MultiSource/Benchmarks/7zip/7zip-benchmark.test 1051772.00 1051804.00 0.0%
test-suite :: MultiSource/Applications/SPASS/SPASS.test 529586.00 529602.00 0.0%
test-suite :: External/SPEC/CINT2006/400.perlbench/400.perlbench.test 1084684.00 1084716.00 0.0%
test-suite :: MultiSource/Benchmarks/tramp3d-v4/tramp3d-v4.test 1014245.00 1014261.00 0.0%
test-suite :: MultiSource/Benchmarks/MallocBench/espresso/espresso.test 223494.00 223478.00 -0.0%
test-suite :: External/SPEC/CINT2017speed/625.x264_s/625.x264_s.test 660843.00 660795.00 -0.0%
test-suite :: External/SPEC/CINT2017rate/525.x264_r/525.x264_r.test 660843.00 660795.00 -0.0%
test-suite :: MultiSource/Applications/ClamAV/clamscan.test 568824.00 568760.00 -0.0%
espresso - 2 more stores vectorized
x264 - small number of changes in 3-4 functions, generated a bit more
vector stores (2 4x zeroinitializer stores + some other small variations).
clamscan - emitted 32xi8 store instead of several scalar stores + several 4x-8x stores.
Differential Revision: https://reviews.llvm.org/D155246
Added:
Modified:
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
llvm/test/Transforms/SLPVectorizer/X86/many_stores.ll
llvm/test/Transforms/SLPVectorizer/X86/stores-non-ordered.ll
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index b85becc4fdb7ed..f314d537720855 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -140,10 +140,6 @@ static cl::opt<unsigned>
MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden,
cl::desc("Maximum SLP vectorization factor (0=unlimited)"));
-static cl::opt<int>
-MaxStoreLookup("slp-max-store-lookup", cl::init(32), cl::Hidden,
- cl::desc("Maximum depth of the lookup for consecutive stores."));
-
/// Limits the size of scheduling regions in a block.
/// It avoid long compile times for _very_ large blocks where vector
/// instructions are spread over a wide range.
@@ -12439,138 +12435,185 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
BoUpSLP::ValueSet VectorizedStores;
bool Changed = false;
- int E = Stores.size();
- SmallBitVector Tails(E, false);
- int MaxIter = MaxStoreLookup.getValue();
- SmallVector<std::pair<int, int>, 16> ConsecutiveChain(
- E, std::make_pair(E, INT_MAX));
- SmallVector<SmallBitVector, 4> CheckedPairs(E, SmallBitVector(E, false));
- int IterCnt;
- auto &&FindConsecutiveAccess = [this, &Stores, &Tails, &IterCnt, MaxIter,
- &CheckedPairs,
- &ConsecutiveChain](int K, int Idx) {
- if (IterCnt >= MaxIter)
- return true;
- if (CheckedPairs[Idx].test(K))
- return ConsecutiveChain[K].second == 1 &&
- ConsecutiveChain[K].first == Idx;
- ++IterCnt;
- CheckedPairs[Idx].set(K);
- CheckedPairs[K].set(Idx);
- std::optional<int> Diff = getPointersDiff(
- Stores[K]->getValueOperand()->getType(), Stores[K]->getPointerOperand(),
- Stores[Idx]->getValueOperand()->getType(),
- Stores[Idx]->getPointerOperand(), *DL, *SE, /*StrictCheck=*/true);
- if (!Diff || *Diff == 0)
- return false;
- int Val = *Diff;
- if (Val < 0) {
- if (ConsecutiveChain[Idx].second > -Val) {
- Tails.set(K);
- ConsecutiveChain[Idx] = std::make_pair(K, -Val);
- }
- return false;
+ // Stores the pair of stores (first_store, last_store) in a range, that were
+ // already tried to be vectorized. Allows to skip the store ranges that were
+ // already tried to be vectorized but the attempts were unsuccessful.
+ DenseSet<std::pair<Value *, Value *>> TriedSequences;
+ struct StoreDistCompare {
+ bool operator()(const std::pair<unsigned, int> &Op1,
+ const std::pair<unsigned, int> &Op2) const {
+ return Op1.second < Op2.second;
}
- if (ConsecutiveChain[K].second <= Val)
- return false;
-
- Tails.set(Idx);
- ConsecutiveChain[K] = std::make_pair(Idx, Val);
- return Val == 1;
};
- // Do a quadratic search on all of the given stores in reverse order and find
- // all of the pairs of stores that follow each other.
- for (int Idx = E - 1; Idx >= 0; --Idx) {
- // If a store has multiple consecutive store candidates, search according
- // to the sequence: Idx-1, Idx+1, Idx-2, Idx+2, ...
- // This is because usually pairing with immediate succeeding or preceding
- // candidate create the best chance to find slp vectorization opportunity.
- const int MaxLookDepth = std::max(E - Idx, Idx + 1);
- IterCnt = 0;
- for (int Offset = 1, F = MaxLookDepth; Offset < F; ++Offset)
- if ((Idx >= Offset && FindConsecutiveAccess(Idx - Offset, Idx)) ||
- (Idx + Offset < E && FindConsecutiveAccess(Idx + Offset, Idx)))
- break;
- }
-
- // Tracks if we tried to vectorize stores starting from the given tail
- // already.
- SmallBitVector TriedTails(E, false);
- // For stores that start but don't end a link in the chain:
- for (int Cnt = E; Cnt > 0; --Cnt) {
- int I = Cnt - 1;
- if (ConsecutiveChain[I].first == E || Tails.test(I))
- continue;
- // We found a store instr that starts a chain. Now follow the chain and try
- // to vectorize it.
+ // A set of pairs (index of store in Stores array ref, Distance of the store
+ // address relative to base store address in units).
+ using StoreIndexToDistSet =
+ std::set<std::pair<unsigned, int>, StoreDistCompare>;
+ auto TryToVectorize = [&](const StoreIndexToDistSet &Set) {
+ int PrevDist = -1;
BoUpSLP::ValueList Operands;
// Collect the chain into a list.
- while (I != E && !VectorizedStores.count(Stores[I])) {
- Operands.push_back(Stores[I]);
- Tails.set(I);
- if (ConsecutiveChain[I].second != 1) {
- // Mark the new end in the chain and go back, if required. It might be
- // required if the original stores come in reversed order, for example.
- if (ConsecutiveChain[I].first != E &&
- Tails.test(ConsecutiveChain[I].first) && !TriedTails.test(I) &&
- !VectorizedStores.count(Stores[ConsecutiveChain[I].first])) {
- TriedTails.set(I);
- Tails.reset(ConsecutiveChain[I].first);
- if (Cnt < ConsecutiveChain[I].first + 2)
- Cnt = ConsecutiveChain[I].first + 2;
+ for (auto [Idx, Data] : enumerate(Set)) {
+ if (Operands.empty() || Data.second - PrevDist == 1) {
+ Operands.push_back(Stores[Data.first]);
+ PrevDist = Data.second;
+ if (Idx != Set.size() - 1)
+ continue;
+ }
+ if (Operands.size() <= 1) {
+ Operands.clear();
+ Operands.push_back(Stores[Data.first]);
+ PrevDist = Data.second;
+ continue;
+ }
+
+ unsigned MaxVecRegSize = R.getMaxVecRegSize();
+ unsigned EltSize = R.getVectorElementSize(Operands[0]);
+ unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize);
+
+ unsigned MaxVF =
+ std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
+ auto *Store = cast<StoreInst>(Operands[0]);
+ Type *StoreTy = Store->getValueOperand()->getType();
+ Type *ValueTy = StoreTy;
+ if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
+ ValueTy = Trunc->getSrcTy();
+ unsigned MinVF = TTI->getStoreMinimumVF(
+ R.getMinVF(DL->getTypeSizeInBits(ValueTy)), StoreTy, ValueTy);
+
+ if (MaxVF <= MinVF) {
+ LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
+ << ") <= "
+ << "MinVF (" << MinVF << ")\n");
+ }
+
+ // FIXME: Is division-by-2 the correct step? Should we assert that the
+ // register size is a power-of-2?
+ unsigned StartIdx = 0;
+ for (unsigned Size = MaxVF; Size >= MinVF; Size /= 2) {
+ for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) {
+ ArrayRef<Value *> Slice = ArrayRef(Operands).slice(Cnt, Size);
+ if (!VectorizedStores.count(Slice.front()) &&
+ !VectorizedStores.count(Slice.back()) &&
+ TriedSequences.insert(std::make_pair(Slice.front(), Slice.back()))
+ .second &&
+ vectorizeStoreChain(Slice, R, Cnt, MinVF)) {
+ // Mark the vectorized stores so that we don't vectorize them again.
+ VectorizedStores.insert(Slice.begin(), Slice.end());
+ Changed = true;
+ // If we vectorized initial block, no need to try to vectorize it
+ // again.
+ if (Cnt == StartIdx)
+ StartIdx += Size;
+ Cnt += Size;
+ continue;
+ }
+ ++Cnt;
}
- break;
+ // Check if the whole array was vectorized already - exit.
+ if (StartIdx >= Operands.size())
+ break;
}
- // Move to the next value in the chain.
- I = ConsecutiveChain[I].first;
+ Operands.clear();
+ Operands.push_back(Stores[Data.first]);
+ PrevDist = Data.second;
}
- assert(!Operands.empty() && "Expected non-empty list of stores.");
+ };
- unsigned MaxVecRegSize = R.getMaxVecRegSize();
- unsigned EltSize = R.getVectorElementSize(Operands[0]);
- unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize);
-
- unsigned MaxVF = std::min(R.getMaximumVF(EltSize, Instruction::Store),
- MaxElts);
- auto *Store = cast<StoreInst>(Operands[0]);
- Type *StoreTy = Store->getValueOperand()->getType();
- Type *ValueTy = StoreTy;
- if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
- ValueTy = Trunc->getSrcTy();
- unsigned MinVF = TTI->getStoreMinimumVF(
- R.getMinVF(DL->getTypeSizeInBits(ValueTy)), StoreTy, ValueTy);
-
- if (MaxVF <= MinVF) {
- LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF << ") <= "
- << "MinVF (" << MinVF << ")\n");
- }
-
- // FIXME: Is division-by-2 the correct step? Should we assert that the
- // register size is a power-of-2?
- unsigned StartIdx = 0;
- for (unsigned Size = MaxVF; Size >= MinVF; Size /= 2) {
- for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) {
- ArrayRef<Value *> Slice = ArrayRef(Operands).slice(Cnt, Size);
- if (!VectorizedStores.count(Slice.front()) &&
- !VectorizedStores.count(Slice.back()) &&
- vectorizeStoreChain(Slice, R, Cnt, MinVF)) {
- // Mark the vectorized stores so that we don't vectorize them again.
- VectorizedStores.insert(Slice.begin(), Slice.end());
- Changed = true;
- // If we vectorized initial block, no need to try to vectorize it
- // again.
- if (Cnt == StartIdx)
- StartIdx += Size;
- Cnt += Size;
- continue;
- }
- ++Cnt;
+ // Stores pair (first: index of the store into Stores array ref, address of
+ // which taken as base, second: sorted set of pairs {index, dist}, which are
+ // indices of stores in the set and their store location distances relative to
+ // the base address).
+
+ // Need to store the index of the very first store separately, since the set
+ // may be reordered after the insertion and the first store may be moved. This
+ // container allows to reduce number of calls of getPointersDiff() function.
+ SmallVector<std::pair<unsigned, StoreIndexToDistSet>> SortedStores;
+ // Inserts the specified store SI with the given index Idx to the set of the
+ // stores. If the store with the same distance is found already - stop
+ // insertion, try to vectorize already found stores. If some stores from this
+ // sequence were not vectorized - try to vectorize them with the new store
+ // later. But this logic is applied only to the stores, that come before the
+ // previous store with the same distance.
+ // Example:
+ // 1. store x, %p
+ // 2. store y, %p+1
+ // 3. store z, %p+2
+ // 4. store a, %p
+ // 5. store b, %p+3
+ // - Scan this from the last to first store. The very first bunch of stores is
+ // {5, {{4, -3}, {2, -2}, {3, -1}, {5, 0}}} (the element in SortedStores
+ // vector).
+ // - The next store in the list - #1 - has the same distance from store #5 as
+ // the store #4.
+ // - Try to vectorize sequence of stores 4,2,3,5.
+ // - If all these stores are vectorized - just drop them.
+ // - If some of them are not vectorized (say, #3 and #5), do extra analysis.
+ // - Start new stores sequence.
+ // The new bunch of stores is {1, {1, 0}}.
+ // - Add the stores from previous sequence, that were not vectorized.
+ // Here we consider the stores in the reversed order, rather they are used in
+ // the IR (Stores are reversed already, see vectorizeStoreChains() function).
+ // Store #3 can be added -> comes after store #4 with the same distance as
+ // store #1.
+ // Store #5 cannot be added - comes before store #4.
+ // This logic allows to improve the compile time, we assume that the stores
+ // after previous store with the same distance most likely have memory
+ // dependencies and no need to waste compile time to try to vectorize them.
+ // - Try to vectorize the sequence {1, {1, 0}, {3, 2}}.
+ auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) {
+ for (auto &Set : SortedStores) {
+ std::optional<int> Diff = getPointersDiff(
+ Stores[Set.first]->getValueOperand()->getType(),
+ Stores[Set.first]->getPointerOperand(),
+ SI->getValueOperand()->getType(), SI->getPointerOperand(), *DL, *SE,
+ /*StrictCheck=*/true);
+ if (!Diff)
+ continue;
+ auto It = Set.second.find(std::make_pair(Idx, *Diff));
+ if (It == Set.second.end()) {
+ Set.second.emplace(Idx, *Diff);
+ return;
}
- // Check if the whole array was vectorized already - exit.
- if (StartIdx >= Operands.size())
- break;
+ // Try to vectorize the first found set to avoid duplicate analysis.
+ TryToVectorize(Set.second);
+ StoreIndexToDistSet PrevSet;
+ PrevSet.swap(Set.second);
+ Set.first = Idx;
+ Set.second.emplace(Idx, 0);
+ // Insert stores that followed previous match to try to vectorize them
+ // with this store.
+ unsigned StartIdx = It->first + 1;
+ SmallBitVector UsedStores(Idx - StartIdx);
+ // Distances to previously found dup store (or this store, since they
+ // store to the same addresses).
+ SmallVector<int> Dists(Idx - StartIdx, 0);
+ for (const std::pair<unsigned, int> &Pair : reverse(PrevSet)) {
+ // Do not try to vectorize sequences, we already tried.
+ if (Pair.first <= It->first ||
+ VectorizedStores.contains(Stores[Pair.first]))
+ break;
+ unsigned BI = Pair.first - StartIdx;
+ UsedStores.set(BI);
+ Dists[BI] = Pair.second - It->second;
+ }
+ for (unsigned I = StartIdx; I < Idx; ++I) {
+ unsigned BI = I - StartIdx;
+ if (BI < UsedStores.size() && UsedStores.test(BI))
+ Set.second.emplace(I, Dists[BI]);
+ }
+ return;
}
- }
+ auto &Res = SortedStores.emplace_back();
+ Res.first = Idx;
+ Res.second.emplace(Idx, 0);
+ };
+ for (auto [I, SI] : enumerate(Stores))
+ FillStoresSet(I, SI);
+
+ // Final vectorization attempt.
+ for (auto &Set : SortedStores)
+ TryToVectorize(Set.second);
return Changed;
}
@@ -15204,8 +15247,13 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
if (!isValidElementType(Pair.second.front()->getValueOperand()->getType()))
continue;
+ // Reverse stores to do bottom-to-top analysis. This is important if the
+ // values are stores to the same addresses several times, in this case need
+ // to follow the stores order (reversed to meet the memory dependecies).
+ SmallVector<StoreInst *> ReversedStores(Pair.second.rbegin(),
+ Pair.second.rend());
Changed |= tryToVectorizeSequence<StoreInst>(
- Pair.second, StoreSorter, AreCompatibleStores,
+ ReversedStores, StoreSorter, AreCompatibleStores,
[this, &R](ArrayRef<StoreInst *> Candidates, bool) {
return vectorizeStores(Candidates, R);
},
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/many_stores.ll b/llvm/test/Transforms/SLPVectorizer/X86/many_stores.ll
index ffc858c2537923..ca4c01a7aaa81b 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/many_stores.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/many_stores.ll
@@ -5,10 +5,6 @@ define i32 @test(ptr %p) {
; CHECK-LABEL: define i32 @test
; CHECK-SAME: (ptr [[P:%.*]]) {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[IDX2:%.*]] = getelementptr i32, ptr [[P]], i64 4
-; CHECK-NEXT: store i32 0, ptr [[IDX2]], align 4
-; CHECK-NEXT: [[IDX3:%.*]] = getelementptr i32, ptr [[P]], i64 6
-; CHECK-NEXT: store i32 0, ptr [[IDX3]], align 4
; CHECK-NEXT: [[IDX4:%.*]] = getelementptr i32, ptr [[P]], i64 8
; CHECK-NEXT: store i32 0, ptr [[IDX4]], align 4
; CHECK-NEXT: [[IDX5:%.*]] = getelementptr i32, ptr [[P]], i64 10
@@ -69,9 +65,7 @@ define i32 @test(ptr %p) {
; CHECK-NEXT: store i32 0, ptr [[IDX33]], align 4
; CHECK-NEXT: store i32 0, ptr [[P]], align 4
; CHECK-NEXT: [[IDX0:%.*]] = getelementptr i32, ptr [[P]], i64 3
-; CHECK-NEXT: store i32 0, ptr [[IDX0]], align 4
-; CHECK-NEXT: [[IDX1:%.*]] = getelementptr i32, ptr [[P]], i64 5
-; CHECK-NEXT: store i32 0, ptr [[IDX1]], align 4
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr [[IDX0]], align 4
; CHECK-NEXT: ret i32 0
;
entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/stores-non-ordered.ll b/llvm/test/Transforms/SLPVectorizer/X86/stores-non-ordered.ll
index bbc006b67603cf..a9748ca6291ae2 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/stores-non-ordered.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/stores-non-ordered.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -S -mtriple=x86_64-unknown -passes=slp-vectorizer -slp-max-store-lookup=2 -slp-min-reg-size=64 -slp-threshold=-1000 | FileCheck %s
+; RUN: opt < %s -S -mtriple=x86_64-unknown -passes=slp-vectorizer -slp-min-reg-size=64 -slp-threshold=-1000 | FileCheck %s
define i32 @non-ordered-stores(ptr noalias nocapture %in, ptr noalias nocapture %inn, ptr noalias nocapture %out) {
; CHECK-LABEL: @non-ordered-stores(
More information about the llvm-commits
mailing list