[llvm] 7642238 - [SLP] Support reordered buildvector nodes for better clustering
Author: Alexey Bataev
Date: 2024-11-06T10:51:15-05:00
New Revision: 76422385c3081475ed1bf0e23aa2f3913e66c5b8
URL: https://github.com/llvm/llvm-project/commit/76422385c3081475ed1bf0e23aa2f3913e66c5b8
DIFF: https://github.com/llvm/llvm-project/commit/76422385c3081475ed1bf0e23aa2f3913e66c5b8.diff
LOG: [SLP] Support reordered buildvector nodes for better clustering

The patch adds reordering of buildvector nodes for better clustering of
compatible operations and future vectorization. It includes a basic cost
estimation and reverts the transformation if it is not profitable.
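
To make the idea concrete, here is a hypothetical, self-contained C++
sketch of the clustering step (illustration only, not code from the
patch): lanes of a buildvector are grouped by a compatibility key, much
like generateKeySubkey does for the node's scalars, and the resulting
order becomes the node's reorder indices.

  #include <map>
  #include <string>
  #include <vector>

  // Each lane is tagged with a key standing in for "compatible operation"
  // (think opcode + operand kind, as generateKeySubkey computes).
  struct Lane {
    std::string Key;
    unsigned OrigIdx;
  };

  // Cluster lanes by key, keeping the within-group order stable. The
  // result maps each new position to the original lane, analogous to the
  // ReorderIndices the patch stores on the gather node.
  std::vector<unsigned> clusterByKey(const std::vector<Lane> &Lanes) {
    std::map<std::string, std::vector<unsigned>> Groups;
    for (const Lane &L : Lanes)
      Groups[L.Key].push_back(L.OrigIdx);
    std::vector<unsigned> Order;
    for (const auto &KV : Groups)
      Order.insert(Order.end(), KV.second.begin(), KV.second.end());
    return Order;
  }

  // For lanes {add, mul, add, mul} this yields Order = {0, 2, 1, 3}: the
  // adds and the muls become adjacent, so each pair can form a 2-wide
  // vector operation instead of four scalar ones.
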
AVX512, -O3+LTO
Metric: size..text
Program                                                                      results     results0    diff
test-suite :: External/SPEC/CINT2006/401.bzip2/401.bzip2.test 74565.00 75701.00 1.5%
test-suite :: External/SPEC/CINT2017rate/541.leela_r/541.leela_r.test 75773.00 76397.00 0.8%
test-suite :: External/SPEC/CINT2017speed/641.leela_s/641.leela_s.test 75773.00 76397.00 0.8%
test-suite :: External/SPEC/CFP2017rate/510.parest_r/510.parest_r.test 2014462.00 2024494.00 0.5%
test-suite :: MultiSource/Applications/JM/ldecod/ldecod.test 395219.00 396979.00 0.4%
test-suite :: MultiSource/Applications/JM/lencod/lencod.test 857795.00 859667.00 0.2%
test-suite :: External/SPEC/CINT2006/464.h264ref/464.h264ref.test 800472.00 802440.00 0.2%
test-suite :: External/SPEC/CFP2006/447.dealII/447.dealII.test 590699.00 591403.00 0.1%
test-suite :: MultiSource/Benchmarks/MiBench/consumer-lame/consumer-lame.test 203006.00 203102.00 0.0%
test-suite :: MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/miniGMG.test 42408.00 42424.00 0.0%
test-suite :: External/SPEC/CFP2017rate/526.blender_r/526.blender_r.test 12451575.00 12451927.00 0.0%
test-suite :: External/SPEC/CFP2017speed/638.imagick_s/638.imagick_s.test 1396480.00 1396448.00 -0.0%
test-suite :: External/SPEC/CFP2017rate/538.imagick_r/538.imagick_r.test 1396480.00 1396448.00 -0.0%
test-suite :: MultiSource/Benchmarks/7zip/7zip-benchmark.test 1047708.00 1047580.00 -0.0%
test-suite :: MultiSource/Benchmarks/MiBench/consumer-jpeg/consumer-jpeg.test 111344.00 111328.00 -0.0%
test-suite :: External/SPEC/CINT2006/400.perlbench/400.perlbench.test 1087660.00 1087500.00 -0.0%
test-suite :: MultiSource/Benchmarks/Prolangs-C/TimberWolfMC/timberwolfmc.test 280664.00 280616.00 -0.0%
test-suite :: MultiSource/Applications/sqlite3/sqlite3.test 502646.00 502006.00 -0.1%
test-suite :: MultiSource/Benchmarks/tramp3d-v4/tramp3d-v4.test 1033135.00 1031567.00 -0.2%
test-suite :: External/SPEC/CINT2017rate/500.perlbench_r/500.perlbench_r.test 2070917.00 2065845.00 -0.2%
test-suite :: External/SPEC/CINT2017speed/600.perlbench_s/600.perlbench_s.test 2070917.00 2065845.00 -0.2%
test-suite :: External/SPEC/CINT2006/473.astar/473.astar.test 33893.00 33797.00 -0.3%
test-suite :: MultiSource/Benchmarks/MiBench/telecomm-gsm/telecomm-gsm.test 39677.00 39549.00 -0.3%
test-suite :: MultiSource/Benchmarks/mediabench/gsm/toast/toast.test 39674.00 39546.00 -0.3%
test-suite :: MultiSource/Benchmarks/MiBench/security-blowfish/security-blowfish.test 11560.00 11512.00 -0.4%
test-suite :: External/SPEC/CINT2017speed/625.x264_s/625.x264_s.test 653867.00 649275.00 -0.7%
test-suite :: External/SPEC/CINT2017rate/525.x264_r/525.x264_r.test 653867.00 649275.00 -0.7%
CINT2006/401.bzip2 - extra code vectorized
CINT2017rate/541.leela_r
CINT2017speed/641.leela_s - function _ZN9FastBoard25get_pattern3_augment_specEiib
is no longer inlined; better vectorization
CFP2017rate/510.parest_r - better vectorization
JM/ldecod - better vectorization
JM/lencod - same
CINT2006/464.h264ref - extra code vectorized
CFP2006/447.dealII - extra vector code
MiBench/consumer-lame - vectorized 2 loops previously scalar
DOE-ProxyApps-C/miniGMG - small changes
Benchmarks/7zip - extra code vectorized, better vectorization
CFP2017rate/526.blender_r - extra vectorization
CFP2017speed/638.imagick_s
CFP2017rate/538.imagick_r - extra vectorization
MiBench/consumer-jpeg - extra vectorization
CINT2006/400.perlbench - extra vectorization
Prolangs-C/TimberWolfMC - small variations
Applications/sqlite3 - extra function vectorized and inlined
Benchmarks/tramp3d-v4 - extra code vectorized
CINT2017rate/500.perlbench_r
CINT2017speed/600.perlbench_s - extra code vectorized; the function digcpy
gets vectorized and inlined
CINT2006/473.astar - extra code vectorized
MiBench/telecomm-gsm - extra code vectorized, better vector code
mediabench/gsm - same
MiBench/security-blowfish - extra code vectorized
CINT2017speed/625.x264_s
CINT2017rate/525.x264_r - the sub4x4_dct function is vectorized and gets
inlined
RISC-V, SiFive P670, -O3+LTO
CFP2017rate/510.parest_r - extra vectorization
CFP2017rate/526.blender_r - extra vectorization
MiBench/consumer-lame - extra vectorized code
Reviewers: RKSimon
Reviewed By: RKSimon
Pull Request: https://github.com/llvm/llvm-project/pull/114284
Added:
Modified:
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 4454eb3e34d983..b2fc1eb37d4590 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3088,6 +3088,10 @@ class BoUpSLP {
SmallVector<OrdersType, 1>
findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
+ /// Tries to reorder the gathering node for better vectorization
+ /// opportunities.
+ void reorderGatherNode(TreeEntry &TE);
+
struct TreeEntry {
using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
TreeEntry(VecTreeTy &Container) : Container(Container) {}
@@ -3393,6 +3397,15 @@ class BoUpSLP {
return IsNonPowerOf2;
}
+ Value *getOrdered(unsigned Idx) const {
+ assert(isGather() && "Must be used only for buildvectors/gathers.");
+ if (ReorderIndices.empty())
+ return Scalars[Idx];
+ SmallVector<int> Mask;
+ inversePermutation(ReorderIndices, Mask);
+ return Scalars[Mask[Idx]];
+ }
+
#ifndef NDEBUG
/// Debug printer.
LLVM_DUMP_METHOD void dump() const {
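
For context, getOrdered relies on the existing inversePermutation helper;
the following is a sketch consistent with how the helper is used here,
not a verbatim copy of the LLVM source. ReorderIndices[NewPos] holds the
original lane of the scalar now stored at NewPos, and the inverse mask
satisfies Mask[OrigLane] = NewPos, so a lookup by original lane finds the
scalar's new storage position.

  #include "llvm/ADT/ArrayRef.h"
  #include "llvm/ADT/SmallVector.h"
  using namespace llvm;

  constexpr int kPoisonMaskElem = -1; // stands in for llvm::PoisonMaskElem

  // Invert the reorder indices: after this, getOrdered(Idx) can return
  // Scalars[Mask[Idx]], i.e. the scalar that originally occupied lane Idx.
  static void inversePermutationSketch(ArrayRef<unsigned> Indices,
                                       SmallVectorImpl<int> &Mask) {
    Mask.assign(Indices.size(), kPoisonMaskElem);
    for (unsigned I = 0, E = Indices.size(); I < E; ++I)
      Mask[Indices[I]] = I;
  }

  // E.g. ReorderIndices = {2, 0, 1} gives Mask = {1, 2, 0}, so
  // getOrdered(0) returns Scalars[1] -- the element that the clustering
  // moved away from lane 0.
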
@@ -9343,6 +9356,160 @@ getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
return std::make_pair(ScalarCost, VecCost);
}
+void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
+ assert(TE.isGather() && TE.ReorderIndices.empty() &&
+ "Expected gather node without reordering.");
+ DenseMap<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
+ SmallSet<size_t, 2> LoadKeyUsed;
+
+ if (any_of(seq<unsigned>(TE.Idx), [&](unsigned Idx) {
+ return VectorizableTree[Idx]->isSame(TE.Scalars);
+ }))
+ return;
+
+ auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
+ Key = hash_combine(hash_value(LI->getParent()), Key);
+ Value *Ptr =
+ getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth);
+ if (LoadKeyUsed.contains(Key)) {
+ auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
+ if (LIt != LoadsMap.end()) {
+ for (LoadInst *RLI : LIt->second) {
+ if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
+ LI->getType(), LI->getPointerOperand(), *DL, *SE,
+ /*StrictCheck=*/true))
+ return hash_value(RLI->getPointerOperand());
+ }
+ for (LoadInst *RLI : LIt->second) {
+ if (arePointersCompatible(RLI->getPointerOperand(),
+ LI->getPointerOperand(), *TLI)) {
+ hash_code SubKey = hash_value(RLI->getPointerOperand());
+ return SubKey;
+ }
+ }
+ if (LIt->second.size() > 2) {
+ hash_code SubKey =
+ hash_value(LIt->second.back()->getPointerOperand());
+ return SubKey;
+ }
+ }
+ }
+ LoadKeyUsed.insert(Key);
+ LoadsMap.try_emplace(std::make_pair(Key, Ptr)).first->second.push_back(LI);
+ return hash_value(LI->getPointerOperand());
+ };
+ MapVector<size_t, MapVector<size_t, SmallVector<Value *>>> SortedValues;
+ SmallDenseMap<Value *, SmallVector<unsigned>, 8> KeyToIndex;
+ bool IsOrdered = true;
+ unsigned NumInstructions = 0;
+ // Try to "cluster" scalar instructions, to be able to build extra vectorized
+ // nodes.
+ for (auto [I, V] : enumerate(TE.Scalars)) {
+ size_t Key = 1, Idx = 1;
+ if (auto *Inst = dyn_cast<Instruction>(V);
+ Inst && !isa<ExtractElementInst, LoadInst, CastInst>(V) &&
+ !isDeleted(Inst) && !isVectorized(V)) {
+ std::tie(Key, Idx) = generateKeySubkey(V, TLI, GenerateLoadsSubkey,
+ /*AllowAlternate=*/false);
+ ++NumInstructions;
+ }
+ auto &Container = SortedValues[Key];
+ if (IsOrdered && !KeyToIndex.contains(V) &&
+ !(isa<Constant, ExtractElementInst>(V) ||
+ isVectorLikeInstWithConstOps(V)) &&
+ ((Container.contains(Idx) &&
+ KeyToIndex.at(Container[Idx].back()).back() != I - 1) ||
+ (!Container.empty() && !Container.contains(Idx) &&
+ KeyToIndex.at(Container.back().second.back()).back() != I - 1)))
+ IsOrdered = false;
+ auto &KTI = KeyToIndex[V];
+ if (KTI.empty())
+ Container[Idx].push_back(V);
+ KTI.push_back(I);
+ }
+ SmallVector<std::pair<unsigned, unsigned>> SubVectors;
+ APInt DemandedElts = APInt::getAllOnes(TE.Scalars.size());
+ if (!IsOrdered && NumInstructions > 1) {
+ unsigned Cnt = 0;
+ TE.ReorderIndices.resize(TE.Scalars.size(), TE.Scalars.size());
+ for (const auto &D : SortedValues) {
+ for (const auto &P : D.second) {
+ unsigned Sz = 0;
+ for (Value *V : P.second) {
+ ArrayRef<unsigned> Indices = KeyToIndex.at(V);
+ for (auto [K, Idx] : enumerate(Indices)) {
+ TE.ReorderIndices[Cnt + K] = Idx;
+ TE.Scalars[Cnt + K] = V;
+ }
+ Sz += Indices.size();
+ Cnt += Indices.size();
+ }
+ if (Sz > 1 && isa<Instruction>(P.second.front())) {
+ const unsigned SubVF = getFloorFullVectorNumberOfElements(
+ *TTI, TE.Scalars.front()->getType(), Sz);
+ SubVectors.emplace_back(Cnt - Sz, SubVF);
+ for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt - Sz + SubVF))
+ DemandedElts.clearBit(I);
+ } else if (!P.second.empty() && isConstant(P.second.front())) {
+ for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt))
+ DemandedElts.clearBit(I);
+ }
+ }
+ }
+ }
+ // Reuses always require shuffles, so consider it as profitable.
+ if (!TE.ReuseShuffleIndices.empty() || TE.ReorderIndices.empty())
+ return;
+ // Do simple cost estimation.
+ constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+ InstructionCost Cost = 0;
+ auto *ScalarTy = TE.Scalars.front()->getType();
+ auto *VecTy = getWidenedType(ScalarTy, TE.Scalars.size());
+ for (auto [Idx, Sz] : SubVectors) {
+ Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, VecTy, {}, CostKind,
+ Idx, getWidenedType(ScalarTy, Sz));
+ }
+ Cost += TTI->getScalarizationOverhead(VecTy, DemandedElts, /*Insert=*/true,
+ /*Extract=*/false, CostKind);
+ int Sz = TE.Scalars.size();
+ SmallVector<int> ReorderMask(TE.ReorderIndices.begin(),
+ TE.ReorderIndices.end());
+ for (unsigned I : seq<unsigned>(Sz)) {
+ Value *V = TE.getOrdered(I);
+ if (isa<PoisonValue>(V)) {
+ ReorderMask[I] = PoisonMaskElem;
+ } else if (isConstant(V) || DemandedElts[I]) {
+ ReorderMask[I] = I + TE.ReorderIndices.size();
+ }
+ }
+ Cost += ::getShuffleCost(*TTI,
+ any_of(ReorderMask, [&](int I) { return I >= Sz; })
+ ? TTI::SK_PermuteTwoSrc
+ : TTI::SK_PermuteSingleSrc,
+ VecTy, ReorderMask);
+ DemandedElts = APInt::getAllOnes(VecTy->getNumElements());
+ ReorderMask.assign(Sz, PoisonMaskElem);
+ for (unsigned I : seq<unsigned>(Sz)) {
+ Value *V = TE.getOrdered(I);
+ if (isConstant(V)) {
+ DemandedElts.clearBit(I);
+ if (!isa<PoisonValue>(V))
+ ReorderMask[I] = I;
+ } else {
+ ReorderMask[I] = I + Sz;
+ }
+ }
+ InstructionCost BVCost = TTI->getScalarizationOverhead(
+ VecTy, DemandedElts, /*Insert=*/true, /*Extract=*/false, CostKind);
+ if (!DemandedElts.isAllOnes())
+ BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy, ReorderMask);
+ if (Cost >= BVCost) {
+ SmallVector<int> Mask(TE.ReorderIndices.begin(), TE.ReorderIndices.end());
+ reorderScalars(TE.Scalars, Mask);
+ TE.ReorderIndices.clear();
+ }
+}
+
void BoUpSLP::transformNodes() {
constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
BaseGraphSize = VectorizableTree.size();
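
The cost check at the end of reorderGatherNode can be summarized as
follows; this is a schematic sketch with plain integer costs and a
hypothetical helper name (the real code queries TTI for shuffle and
scalarization costs, as shown above).

  // Schematic profitability test, mirroring the structure of the real
  // check: the reordered node pays for inserting the clustered
  // subvectors, for scalarizing whatever lanes remain demanded, and for
  // the final permutation; the plain buildvector pays for scalarizing all
  // non-constant lanes plus an optional blend with the constant lanes.
  using CostTy = long; // stands in for InstructionCost

  bool keepReordering(CostTy SubvectorInsertCost, CostTy RemainingInsertCost,
                      CostTy PermuteCost, CostTy PlainBuildVectorCost) {
    CostTy ReorderedCost =
        SubvectorInsertCost + RemainingInsertCost + PermuteCost;
    // "Cost >= BVCost" in the code above reverts the reorder, so the
    // transformation survives only when it is strictly cheaper.
    return ReorderedCost < PlainBuildVectorCost;
  }
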
@@ -9380,6 +9547,14 @@ void BoUpSLP::transformNodes() {
findBestRootPair(Cand, LookAheadHeuristics::ScoreSplatLoads);
});
};
+
+ // Try to reorder gather nodes for better vectorization opportunities.
+ for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
+ TreeEntry &E = *VectorizableTree[Idx];
+ if (E.isGather())
+ reorderGatherNode(E);
+ }
+
// The tree may grow here, so iterate over nodes, built before.
for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
TreeEntry &E = *VectorizableTree[Idx];
@@ -9522,6 +9697,12 @@ void BoUpSLP::transformNodes() {
AddCombinedNode(PrevSize, Cnt, Sz);
}
}
+ // Restore ordering, if no extra vectorization happened.
+ if (E.CombinedEntriesWithIndices.empty() && !E.ReorderIndices.empty()) {
+ SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
+ reorderScalars(E.Scalars, Mask);
+ E.ReorderIndices.clear();
+ }
}
switch (E.getOpcode()) {
case Instruction::Load: {
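
The restore step above works because reorderScalars scatters: element I
of the current order moves to position Mask[I]. Below is a minimal sketch
of that assumed behavior, with Value* replaced by int to stay
self-contained (not the LLVM source).

  #include <vector>

  // Applying the mask built directly from ReorderIndices sends the scalar
  // stored at clustered position I back to its original lane
  // ReorderIndices[I], after which ReorderIndices can be cleared.
  void reorderScalarsSketch(std::vector<int> &Scalars,
                            const std::vector<int> &Mask) {
    std::vector<int> Prev(Scalars.size(), -1); // -1 as a poison placeholder
    Prev.swap(Scalars);
    for (size_t I = 0; I < Prev.size(); ++I)
      if (Mask[I] != -1)
        Scalars[Mask[I]] = Prev[I];
  }

  // E.g. clustered Scalars = {a, b, c} with ReorderIndices = {2, 0, 1}
  // (a came from lane 2, b from lane 0, c from lane 1) scatter back to
  // {b, c, a}, the original lane order.
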
@@ -10209,7 +10390,12 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
if (Mask.empty())
return nullptr;
Value *VecBase = nullptr;
- ArrayRef<Value *> VL = E->Scalars;
+ SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
+ if (!E->ReorderIndices.empty()) {
+ SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
+ E->ReorderIndices.end());
+ reorderScalars(VL, ReorderMask);
+ }
// Check if it can be considered reused if same extractelements were
// vectorized already.
bool PrevNodeFound = any_of(
@@ -10230,7 +10416,8 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
for (unsigned Part : seq<unsigned>(NumParts)) {
unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
- for (auto [I, V] : enumerate(VL.slice(Part * SliceSize, Limit))) {
+ for (auto [I, V] :
+ enumerate(ArrayRef(VL).slice(Part * SliceSize, Limit))) {
// Ignore non-extractelement scalars.
if (isa<UndefValue>(V) ||
(!SubMask.empty() && SubMask[I] == PoisonMaskElem))
@@ -10367,10 +10554,9 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
[&](auto P) {
if (P.value() == PoisonMaskElem)
return Mask[P.index()] == PoisonMaskElem;
- auto *EI =
- cast<ExtractElementInst>(InVectors.front()
- .get<const TreeEntry *>()
- ->Scalars[P.index()]);
+ auto *EI = cast<ExtractElementInst>(
+ InVectors.front().get<const TreeEntry *>()->getOrdered(
+ P.index()));
return EI->getVectorOperand() == V1 ||
EI->getVectorOperand() == V2;
}) &&
@@ -10387,22 +10573,23 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
}
if (ForExtracts) {
// No need to add vectors here, already handled them in adjustExtracts.
- assert(InVectors.size() == 1 &&
- InVectors.front().is<const TreeEntry *>() && !CommonMask.empty() &&
- all_of(enumerate(CommonMask),
- [&](auto P) {
- Value *Scalar = InVectors.front()
- .get<const TreeEntry *>()
- ->Scalars[P.index()];
- if (P.value() == PoisonMaskElem)
- return P.value() == Mask[P.index()] ||
- isa<UndefValue>(Scalar);
- if (isa<Constant>(V1))
- return true;
- auto *EI = cast<ExtractElementInst>(Scalar);
- return EI->getVectorOperand() == V1;
- }) &&
- "Expected only tree entry for extractelement vectors.");
+ assert(
+ InVectors.size() == 1 && InVectors.front().is<const TreeEntry *>() &&
+ !CommonMask.empty() &&
+ all_of(enumerate(CommonMask),
+ [&](auto P) {
+ Value *Scalar =
+ InVectors.front().get<const TreeEntry *>()->getOrdered(
+ P.index());
+ if (P.value() == PoisonMaskElem)
+ return P.value() == Mask[P.index()] ||
+ isa<UndefValue>(Scalar);
+ if (isa<Constant>(V1))
+ return true;
+ auto *EI = cast<ExtractElementInst>(Scalar);
+ return EI->getVectorOperand() == V1;
+ }) &&
+ "Expected only tree entry for extractelement vectors.");
return;
}
assert(!InVectors.empty() && !CommonMask.empty() &&
@@ -10473,7 +10660,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
InstructionCost
finalize(ArrayRef<int> ExtMask,
ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
- unsigned VF = 0,
+ ArrayRef<int> SubVectorsMask, unsigned VF = 0,
function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
IsFinalized = true;
if (Action) {
@@ -10500,6 +10687,21 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
if (CommonMask[Idx] != PoisonMaskElem)
CommonMask[Idx] = Idx;
+ // Add subvectors permutation cost.
+ if (!SubVectorsMask.empty()) {
+ assert(SubVectorsMask.size() == CommonMask.size() &&
+ "Expected same size of masks for subvectors and common mask.");
+ SmallVector<int> SVMask(SubVectorsMask.begin(), SubVectorsMask.end());
+ for (auto [I1, I2] : zip(SVMask, CommonMask)) {
+ if (I2 != PoisonMaskElem) {
+ assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
+ I1 = I2 + CommonMask.size();
+ }
+ }
+ Cost += ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
+ getWidenedType(ScalarTy, CommonMask.size()),
+ SVMask, CostKind);
+ }
for (auto [E, Idx] : SubVectors) {
Type *EScalarTy = E->Scalars.front()->getType();
bool IsSigned = true;
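
Read as a two-source shuffle, the mask merge above gives each lane
exactly one owner: lanes filled by the inserted subvectors keep their
SubVectorsMask index (first source), and lanes produced by the common
shuffle are redirected to the second source by offsetting with the mask
size. A small sketch with plain ints and a hypothetical helper name:

  #include <cassert>
  #include <vector>

  constexpr int kPoison = -1; // stands in for PoisonMaskElem

  // Merge the subvectors mask with the common mask into one two-source
  // shuffle mask: indices < Size select from the subvector-insert result,
  // indices >= Size select from the vector built by the common shuffle.
  std::vector<int> mergeMasksSketch(std::vector<int> SVMask,
                                    const std::vector<int> &CommonMask) {
    assert(SVMask.size() == CommonMask.size() && "masks must match in size");
    const int Size = static_cast<int>(CommonMask.size());
    for (size_t I = 0; I < SVMask.size(); ++I)
      if (CommonMask[I] != kPoison) {
        assert(SVMask[I] == kPoison && "lane owned by both sources");
        SVMask[I] = CommonMask[I] + Size; // select from the second source
      }
    return SVMask;
  }
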
@@ -13542,11 +13744,17 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
UseVecBaseAsInput = false;
SmallPtrSet<Value *, 4> UniqueBases;
Value *VecBase = nullptr;
+ SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
+ if (!E->ReorderIndices.empty()) {
+ SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
+ E->ReorderIndices.end());
+ reorderScalars(VL, ReorderMask);
+ }
for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
int Idx = Mask[I];
if (Idx == PoisonMaskElem)
continue;
- auto *EI = cast<ExtractElementInst>(E->Scalars[I]);
+ auto *EI = cast<ExtractElementInst>(VL[I]);
VecBase = EI->getVectorOperand();
if (const TreeEntry *TE = R.getTreeEntry(VecBase))
VecBase = TE->VectorizedValue;
@@ -13555,7 +13763,7 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
// If the only one use is vectorized - can delete the extractelement
// itself.
if (!EI->hasOneUse() || R.ExternalUsesAsOriginalScalar.contains(EI) ||
- (NumParts != 1 && count(E->Scalars, EI) > 1) ||
+ (NumParts != 1 && count(VL, EI) > 1) ||
any_of(EI->users(), [&](User *U) {
const TreeEntry *UTE = R.getTreeEntry(U);
return !UTE || R.MultiNodeScalars.contains(U) ||
@@ -13567,7 +13775,7 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
[&](const EdgeInfo &Edge) {
return Edge.UserTE == UTE;
}) &&
- is_contained(TE->Scalars, EI);
+ is_contained(VL, EI);
}) != 1;
}))
continue;
@@ -13589,15 +13797,14 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
// into a long virtual vector register, forming the original vector.
Value *Vec = nullptr;
SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
- unsigned SliceSize = getPartNumElems(E->Scalars.size(), NumParts);
+ unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
for (unsigned Part : seq<unsigned>(NumParts)) {
- unsigned Limit = getNumElems(E->Scalars.size(), SliceSize, Part);
- ArrayRef<Value *> VL =
- ArrayRef(E->Scalars).slice(Part * SliceSize, Limit);
+ unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
+ ArrayRef<Value *> SubVL = ArrayRef(VL).slice(Part * SliceSize, Limit);
MutableArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
constexpr int MaxBases = 2;
SmallVector<Value *, MaxBases> Bases(MaxBases);
- auto VLMask = zip(VL, SubMask);
+ auto VLMask = zip(SubVL, SubMask);
const unsigned VF = std::accumulate(
VLMask.begin(), VLMask.end(), 0U, [&](unsigned S, const auto &D) {
if (std::get<1>(D) == PoisonMaskElem)
@@ -13814,7 +14021,7 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
Value *
finalize(ArrayRef<int> ExtMask,
ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
- unsigned VF = 0,
+ ArrayRef<int> SubVectorsMask, unsigned VF = 0,
function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
IsFinalized = true;
unsigned ScalarTyNumElements = getNumElements(ScalarTy);
@@ -13858,21 +14065,59 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
if (CommonMask[Idx] != PoisonMaskElem)
CommonMask[Idx] = Idx;
- for (auto [E, Idx] : SubVectors) {
- Value *V = E->VectorizedValue;
- if (V->getType()->isIntOrIntVectorTy())
- V = castToScalarTyElem(V, any_of(E->Scalars, [&](Value *V) {
- return !isKnownNonNegative(
- V, SimplifyQuery(*R.DL));
- }));
- unsigned InsertionIndex = Idx * ScalarTyNumElements;
- Vec = Builder.CreateInsertVector(Vec->getType(), Vec, V,
- Builder.getInt64(InsertionIndex));
- if (!CommonMask.empty()) {
- std::iota(std::next(CommonMask.begin(), InsertionIndex),
- std::next(CommonMask.begin(), (Idx + E->getVectorFactor()) *
- ScalarTyNumElements),
- InsertionIndex);
+ auto CreateSubVectors = [&](Value *Vec,
+ SmallVectorImpl<int> &CommonMask) {
+ for (auto [E, Idx] : SubVectors) {
+ Value *V = E->VectorizedValue;
+ if (V->getType()->isIntOrIntVectorTy())
+ V = castToScalarTyElem(V, any_of(E->Scalars, [&](Value *V) {
+ return !isKnownNonNegative(
+ V, SimplifyQuery(*R.DL));
+ }));
+ unsigned InsertionIndex = Idx * ScalarTyNumElements;
+ const unsigned SubVecVF =
+ cast<FixedVectorType>(V->getType())->getNumElements();
+ if (InsertionIndex % SubVecVF == 0) {
+ Vec = Builder.CreateInsertVector(Vec->getType(), Vec, V,
+ Builder.getInt64(InsertionIndex));
+ } else {
+ // Create shuffle, insertvector requires that index is multiple of
+ // the subvectors length.
+ const unsigned VecVF =
+ cast<FixedVectorType>(Vec->getType())->getNumElements();
+ SmallVector<int> Mask(VecVF, PoisonMaskElem);
+ std::iota(Mask.begin(), Mask.end(), 0);
+ for (unsigned I : seq<unsigned>(
+ InsertionIndex, (Idx + SubVecVF) * ScalarTyNumElements))
+ Mask[I] = I - Idx + VecVF;
+ Vec = createShuffle(Vec, V, Mask);
+ }
+ if (!CommonMask.empty()) {
+ std::iota(
+ std::next(CommonMask.begin(), InsertionIndex),
+ std::next(CommonMask.begin(),
+ (Idx + E->getVectorFactor()) * ScalarTyNumElements),
+ InsertionIndex);
+ }
+ }
+ return Vec;
+ };
+ if (SubVectorsMask.empty()) {
+ Vec = CreateSubVectors(Vec, CommonMask);
+ } else {
+ SmallVector<int> SVMask(SubVectorsMask.begin(), SubVectorsMask.end());
+ for (auto [I1, I2] : zip(SVMask, CommonMask)) {
+ if (I2 != PoisonMaskElem) {
+ assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
+ I1 = I2 + CommonMask.size();
+ }
+ }
+ Value *InsertVec =
+ CreateSubVectors(PoisonValue::get(Vec->getType()), CommonMask);
+ Vec = createShuffle(InsertVec, Vec, SVMask);
+ for (unsigned I : seq<unsigned>(CommonMask.size())) {
+ if (SVMask[I] != PoisonMaskElem)
+ CommonMask[I] = I;
}
}
InVectors.front() = Vec;
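
The fallback above exists because llvm.vector.insert requires the
insertion offset to be a multiple of the subvector length; an unaligned
insert is emulated with a two-source shuffle instead. A sketch of the
mask construction (hypothetical helper, simplified index arithmetic):

  #include <vector>

  // Identity mask over the wide vector, except for the inserted range,
  // which selects lanes from the subvector (the second shuffle source,
  // hence the + WideVF offset).
  std::vector<int> unalignedInsertMask(unsigned WideVF, unsigned SubVF,
                                       unsigned Offset) {
    std::vector<int> Mask(WideVF);
    for (unsigned I = 0; I < WideVF; ++I)
      Mask[I] = static_cast<int>(I); // keep the wide vector's lane
    for (unsigned I = 0; I < SubVF; ++I)
      Mask[Offset + I] = static_cast<int>(WideVF + I); // take subvector lane
    return Mask;
  }

  // E.g. WideVF = 8, SubVF = 4, Offset = 2 yields
  // {0, 1, 8, 9, 10, 11, 6, 7}: a 4-element subvector lands at offset 2.
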
@@ -13968,7 +14213,10 @@ Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
return std::make_pair(VectorizableTree[P.first].get(),
P.second);
});
- return ShuffleBuilder.finalize({}, SubVectors);
+ assert((E->CombinedEntriesWithIndices.empty() ||
+ E->ReorderIndices.empty()) &&
+ "Expected either combined subnodes or reordering");
+ return ShuffleBuilder.finalize({}, SubVectors, {});
};
Value *V = vectorizeTree(VE, PostponedPHIs);
if (VF * getNumElements(VL[0]->getType()) !=
@@ -14062,10 +14310,22 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
});
// Build a mask out of the reorder indices and reorder scalars per this
// mask.
- SmallVector<int> ReorderMask;
- inversePermutation(E->ReorderIndices, ReorderMask);
+ SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
+ E->ReorderIndices.end());
if (!ReorderMask.empty())
reorderScalars(GatheredScalars, ReorderMask);
+ SmallVector<int> SubVectorsMask;
+ inversePermutation(E->ReorderIndices, SubVectorsMask);
+ // Transform non-clustered elements in the mask to poison (-1).
+ // "Clustered" operations will be reordered using this mask later.
+ if (!SubVectors.empty() && !SubVectorsMask.empty()) {
+ for (unsigned I : seq<unsigned>(GatheredScalars.size()))
+ if (E->Scalars[I] == GatheredScalars[ReorderMask[I]])
+ SubVectorsMask[ReorderMask[I]] = PoisonMaskElem;
+ } else {
+ SubVectorsMask.clear();
+ }
+ SmallVector<Value *> StoredGS(GatheredScalars);
auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF,
unsigned I, unsigned SliceSize,
bool IsNotPoisonous) {
@@ -14150,7 +14410,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
if (I == PoisonMaskElem)
continue;
if (const auto *TE = getTreeEntry(
- cast<ExtractElementInst>(E->Scalars[Idx])->getVectorOperand()))
+ cast<ExtractElementInst>(StoredGS[Idx])->getVectorOperand()))
ExtractEntries.push_back(TE);
}
if (std::optional<ResTy> Delayed =
@@ -14222,7 +14482,8 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
}
}
ShuffleBuilder.add(*FrontTE, Mask);
- Res = ShuffleBuilder.finalize(E->getCommonMask(), SubVectors);
+ Res = ShuffleBuilder.finalize(E->getCommonMask(), SubVectors,
+ SubVectorsMask);
return Res;
}
if (!Resized) {
@@ -14352,7 +14613,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
continue;
if (isa<UndefValue>(E->Scalars[I]))
continue;
- auto *EI = cast<ExtractElementInst>(E->Scalars[I]);
+ auto *EI = cast<ExtractElementInst>(StoredGS[I]);
Value *VecOp = EI->getVectorOperand();
if (const auto *TE = getTreeEntry(VecOp))
if (TE->VectorizedValue)
@@ -14483,10 +14744,11 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
(IsSingleShuffle && ((IsIdentityShuffle &&
IsNonPoisoned) || IsUsedInExpr) && isa<UndefValue>(V));
}))
- Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors);
+ Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
+ SubVectorsMask);
else
Res = ShuffleBuilder.finalize(
- E->ReuseShuffleIndices, SubVectors, E->Scalars.size(),
+ E->ReuseShuffleIndices, SubVectors, SubVectorsMask, E->Scalars.size(),
[&](Value *&Vec, SmallVectorImpl<int> &Mask) {
TryPackScalars(NonConstants, Mask, /*IsRootPoison=*/false);
Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
@@ -14497,7 +14759,8 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
ShuffleBuilder.add(BV, ReuseMask);
- Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors);
+ Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
+ SubVectorsMask);
} else {
// Gather all constants.
SmallVector<int> Mask(GatheredScalars.size(), PoisonMaskElem);
@@ -14507,7 +14770,8 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
}
Value *BV = ShuffleBuilder.gather(GatheredScalars);
ShuffleBuilder.add(BV, Mask);
- Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors);
+ Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
+ SubVectorsMask);
}
if (NeedFreeze)
@@ -14574,7 +14838,10 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
E->CombinedEntriesWithIndices, SubVectors.begin(), [&](const auto &P) {
return std::make_pair(VectorizableTree[P.first].get(), P.second);
});
- return ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors);
+ assert(
+ (E->CombinedEntriesWithIndices.empty() || E->ReorderIndices.empty()) &&
+ "Expected either combined subnodes or reordering");
+ return ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors, {});
};
assert(!E->isGather() && "Unhandled state");
@@ -15992,7 +16259,7 @@ BoUpSLP::vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
ShuffleBuilder.add(V1, CombinedMask1);
if (V2)
ShuffleBuilder.add(V2, CombinedMask2);
- return ShuffleBuilder.finalize({}, {});
+ return ShuffleBuilder.finalize({}, {}, {});
};
auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask,
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll b/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll
index 6ff03acf85cdfd..c976525b6720eb 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll
@@ -4,12 +4,14 @@
define i64 @foo(i32 %tmp7) {
; CHECK-LABEL: @foo(
; CHECK-NEXT: bb:
-; CHECK-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7:%.*]], 0
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 poison, i32 poison, i32 undef, i32 0>, i32 undef, i32 4
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> <i32 0, i32 0, i32 poison, i32 0>, i32 [[TMP5:%.*]], i32 2
+; CHECK-NEXT: [[TMP3:%.*]] = sub <4 x i32> [[TMP2]], zeroinitializer
+; CHECK-NEXT: [[TMP24:%.*]] = sub i32 undef, 0
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 poison, i32 poison, i32 undef, i32 0>, i32 [[TMP24]], i32 4
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> [[TMP0]], i32 0, i32 5
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> <i32 poison, i32 poison, i32 undef, i32 poison, i32 poison, i32 undef, i32 poison, i32 undef>, i32 [[TMP8]], i32 3
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>, <8 x i32> <i32 poison, i32 poison, i32 2, i32 3, i32 8, i32 5, i32 10, i32 7>
-; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP3]], <2 x i32> zeroinitializer, i64 0)
+; CHECK-NEXT: [[TMP11:%.*]] = insertelement <8 x i32> <i32 poison, i32 poison, i32 undef, i32 poison, i32 poison, i32 undef, i32 poison, i32 undef>, i32 [[TMP24]], i32 6
+; CHECK-NEXT: [[TMP12:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> poison, <4 x i32> [[TMP3]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP12]], <8 x i32> [[TMP11]], <8 x i32> <i32 0, i32 1, i32 poison, i32 2, i32 3, i32 poison, i32 14, i32 poison>
; CHECK-NEXT: [[TMP5:%.*]] = add nsw <8 x i32> [[TMP1]], [[TMP4]]
; CHECK-NEXT: [[TMP6:%.*]] = sub nsw <8 x i32> [[TMP1]], [[TMP4]]
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> [[TMP6]], <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 4, i32 5, i32 14, i32 15>