[llvm] [SLP]Support reordered buildvector nodes for better clustering (PR #114284)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Oct 30 11:20:21 PDT 2024
llvmbot wrote:
@llvm/pr-subscribers-llvm-transforms
Author: Alexey Bataev (alexey-bataev)
Changes:
The patch adds reordering of buildvector (gather) nodes for better clustering of
compatible operations and future vectorization. It includes a basic cost
estimation and reverts the transformation if it is not profitable.
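For illustration, consider a hypothetical source pattern (not taken from the patch or its tests) where the scalars feeding a buildvector interleave compatible operations with unrelated ones. Reordering the gather node so the compatible scalars become adjacent lets SLP vectorize them as a subvector and shuffle the lanes back into the original order, as long as the modeled shuffle/insert cost does not exceed plain scalarization:

```c++
// Hypothetical illustration only: the four stored lanes interleave two
// "compatible" additions with two unrelated multiplications.  With this
// change the gather node can be reordered to {a[0]+b[0], a[1]+b[1], x*3, y*5},
// the adjacent additions vectorized as a 2-wide subvector, and the lanes
// shuffled back to the original order -- provided the estimated shuffle cost
// beats building the vector from scalars.
void store4(int *out, const int *a, const int *b, int x, int y) {
  out[0] = a[0] + b[0];
  out[1] = x * 3;
  out[2] = a[1] + b[1];
  out[3] = y * 5;
}
```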
AVX512, -O3+LTO
Metric: size..text

Program                                                                        results     results0    diff
test-suite :: External/SPEC/CINT2006/401.bzip2/401.bzip2.test 74565.00 75701.00 1.5%
test-suite :: External/SPEC/CINT2017rate/541.leela_r/541.leela_r.test 75773.00 76397.00 0.8%
test-suite :: External/SPEC/CINT2017speed/641.leela_s/641.leela_s.test 75773.00 76397.00 0.8%
test-suite :: External/SPEC/CFP2017rate/510.parest_r/510.parest_r.test 2014462.00 2024494.00 0.5%
test-suite :: MultiSource/Applications/JM/ldecod/ldecod.test 395219.00 396979.00 0.4%
test-suite :: MultiSource/Applications/JM/lencod/lencod.test 857795.00 859667.00 0.2%
test-suite :: External/SPEC/CINT2006/464.h264ref/464.h264ref.test 800472.00 802440.00 0.2%
test-suite :: External/SPEC/CFP2006/447.dealII/447.dealII.test 590699.00 591403.00 0.1%
test-suite :: MultiSource/Benchmarks/MiBench/consumer-lame/consumer-lame.test 203006.00 203102.00 0.0%
test-suite :: MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/miniGMG.test 42408.00 42424.00 0.0%
test-suite :: External/SPEC/CFP2017rate/526.blender_r/526.blender_r.test 12451575.00 12451927.00 0.0%
test-suite :: External/SPEC/CFP2017speed/638.imagick_s/638.imagick_s.test 1396480.00 1396448.00 -0.0%
test-suite :: External/SPEC/CFP2017rate/538.imagick_r/538.imagick_r.test 1396480.00 1396448.00 -0.0%
test-suite :: MultiSource/Benchmarks/7zip/7zip-benchmark.test 1047708.00 1047580.00 -0.0%
test-suite :: MultiSource/Benchmarks/MiBench/consumer-jpeg/consumer-jpeg.test 111344.00 111328.00 -0.0%
test-suite :: External/SPEC/CINT2006/400.perlbench/400.perlbench.test 1087660.00 1087500.00 -0.0%
test-suite :: MultiSource/Benchmarks/Prolangs-C/TimberWolfMC/timberwolfmc.test 280664.00 280616.00 -0.0%
test-suite :: MultiSource/Applications/sqlite3/sqlite3.test 502646.00 502006.00 -0.1%
test-suite :: MultiSource/Benchmarks/tramp3d-v4/tramp3d-v4.test 1033135.00 1031567.00 -0.2%
test-suite :: External/SPEC/CINT2017rate/500.perlbench_r/500.perlbench_r.test 2070917.00 2065845.00 -0.2%
test-suite :: External/SPEC/CINT2017speed/600.perlbench_s/600.perlbench_s.test 2070917.00 2065845.00 -0.2%
test-suite :: External/SPEC/CINT2006/473.astar/473.astar.test 33893.00 33797.00 -0.3%
test-suite :: MultiSource/Benchmarks/MiBench/telecomm-gsm/telecomm-gsm.test 39677.00 39549.00 -0.3%
test-suite :: MultiSource/Benchmarks/mediabench/gsm/toast/toast.test 39674.00 39546.00 -0.3%
test-suite :: MultiSource/Benchmarks/MiBench/security-blowfish/security-blowfish.test 11560.00 11512.00 -0.4%
test-suite :: External/SPEC/CINT2017speed/625.x264_s/625.x264_s.test 653867.00 649275.00 -0.7%
test-suite :: External/SPEC/CINT2017rate/525.x264_r/525.x264_r.test 653867.00 649275.00 -0.7%
CINT2006/401.bzip2 - extra code vectorized
CINT2017rate/541.leela_r
CINT2017speed/641.leela_s - function _ZN9FastBoard25get_pattern3_augment_specEiib not inlined anymore, better vectorization
CFP2017rate/510.parest_r - better vectorization
JM/ldecod - better vectorization
JM/lencod - same
CINT2006/464.h264ref - extra code vectorized
CFP2006/447.dealII - extra vector code
MiBench/consumer-lame - vectorized 2 loops that were previously scalar
DOE-ProxyApps-C/miniGMG - small changes
Benchmarks/7zip - extra code vectorized, better vectorization
CFP2017rate/526.blender_r - extra vectorization
CFP2017speed/638.imagick_s
CFP2017rate/538.imagick_r - extra vectorization
MiBench/consumer-jpeg - extra vectorization
CINT2006/400.perlbench - extra vectorization
Prolangs-C/TimberWolfMC - small variations
Applications/sqlite3 - extra function vectorized and inlined
Benchmarks/tramp3d-v4 - extra code vectorized
CINT2017rate/500.perlbench_r
CINT2017speed/600.perlbench_s - extra code vectorized, function digcpy gets vectorized and inlined
CINT2006/473.astar - extra code vectorized
MiBench/telecomm-gsm - extra code vectorized, better vector code
mediabench/gsm - same
MiBench/security-blowfish - extra code vectorized
CINT2017speed/625.x264_s
CINT2017rate/525.x264_r - sub4x4_dct function vectorized and gets inlined
RISC-V, SiFive-p670, -O3+LTO
CFP2017rate/510.parest_r - extra vectorization
CFP2017rate/526.blender_r - extra vectorization
MiBench/consumer-lame - extra vectorized code
---
Patch is 29.18 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/114284.diff
2 Files Affected:
- (modified) llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp (+320-57)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll (+7-5)
``````````diff
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 268546fe99e138..ca5944df41f9b7 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3088,6 +3088,10 @@ class BoUpSLP {
SmallVector<OrdersType, 1>
findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
+ /// Tries to reorder the gathering node for better vectorization
+ /// opportunities.
+ void reorderGatherNode(TreeEntry &TE);
+
struct TreeEntry {
using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
TreeEntry(VecTreeTy &Container) : Container(Container) {}
@@ -3393,6 +3397,15 @@ class BoUpSLP {
return IsNonPowerOf2;
}
+ Value *getOrdered(unsigned Idx) const {
+ assert(isGather() && "Must be used only for buildvectors/gathers.");
+ if (ReorderIndices.empty())
+ return Scalars[Idx];
+ SmallVector<int> Mask;
+ inversePermutation(ReorderIndices, Mask);
+ return Scalars[Mask[Idx]];
+ }
+
#ifndef NDEBUG
/// Debug printer.
LLVM_DUMP_METHOD void dump() const {
@@ -9336,6 +9349,159 @@ getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
return std::make_pair(ScalarCost, VecCost);
}
+void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
+ assert(TE.isGather() && TE.ReorderIndices.empty() &&
+ "Expected gather node without reordering.");
+ DenseMap<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
+ SmallSet<size_t, 2> LoadKeyUsed;
+
+ if (any_of(seq<unsigned>(TE.Idx), [&](unsigned Idx) {
+ return VectorizableTree[Idx]->isSame(TE.Scalars);
+ }))
+ return;
+
+ auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
+ Key = hash_combine(hash_value(LI->getParent()), Key);
+ Value *Ptr = getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth);
+ if (LoadKeyUsed.contains(Key)) {
+ auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
+ if (LIt != LoadsMap.end()) {
+ for (LoadInst *RLI : LIt->second) {
+ if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
+ LI->getType(), LI->getPointerOperand(), *DL, *SE,
+ /*StrictCheck=*/true))
+ return hash_value(RLI->getPointerOperand());
+ }
+ for (LoadInst *RLI : LIt->second) {
+ if (arePointersCompatible(RLI->getPointerOperand(),
+ LI->getPointerOperand(), *TLI)) {
+ hash_code SubKey = hash_value(RLI->getPointerOperand());
+ return SubKey;
+ }
+ }
+ if (LIt->second.size() > 2) {
+ hash_code SubKey =
+ hash_value(LIt->second.back()->getPointerOperand());
+ return SubKey;
+ }
+ }
+ }
+ LoadKeyUsed.insert(Key);
+ LoadsMap.try_emplace(std::make_pair(Key, Ptr)).first->second.push_back(LI);
+ return hash_value(LI->getPointerOperand());
+ };
+ MapVector<size_t, MapVector<size_t, SmallVector<Value *>>> SortedValues;
+ SmallDenseMap<Value *, SmallVector<unsigned>, 8> KeyToIndex;
+ bool IsOrdered = true;
+ unsigned NumInstructions = 0;
+ // Try to "cluster" scalar instructions, to be able to build extra vectorized
+ // nodes.
+ for (auto [I, V] : enumerate(TE.Scalars)) {
+ size_t Key = 1, Idx = 1;
+ if (auto *Inst = dyn_cast<Instruction>(V);
+ Inst && !isa<ExtractElementInst, LoadInst, CastInst>(V) &&
+ !isDeleted(Inst) && !isVectorized(V)) {
+ std::tie(Key, Idx) = generateKeySubkey(V, TLI, GenerateLoadsSubkey,
+ /*AllowAlternate=*/false);
+ ++NumInstructions;
+ }
+ auto &Container = SortedValues[Key];
+ if (IsOrdered && !KeyToIndex.contains(V) &&
+ !(isa<Constant, ExtractElementInst>(V) ||
+ isVectorLikeInstWithConstOps(V)) &&
+ ((Container.contains(Idx) &&
+ KeyToIndex.at(Container[Idx].back()).back() != I - 1) ||
+ (!Container.empty() && !Container.contains(Idx) &&
+ KeyToIndex.at(Container.back().second.back()).back() != I - 1)))
+ IsOrdered = false;
+ auto &KTI = KeyToIndex[V];
+ if (KTI.empty())
+ Container[Idx].push_back(V);
+ KTI.push_back(I);
+ }
+ SmallVector<std::pair<unsigned, unsigned>> SubVectors;
+ APInt DemandedElts = APInt::getAllOnes(TE.Scalars.size());
+ if (!IsOrdered && NumInstructions > 1) {
+ unsigned Cnt = 0;
+ TE.ReorderIndices.resize(TE.Scalars.size(), TE.Scalars.size());
+ for (const auto &D : SortedValues) {
+ for (const auto &P : D.second) {
+ unsigned Sz = 0;
+ for (Value *V : P.second) {
+ ArrayRef<unsigned> Indices = KeyToIndex.at(V);
+ for (auto [K, Idx] : enumerate(Indices)) {
+ TE.ReorderIndices[Cnt + K] = Idx;
+ TE.Scalars[Cnt + K] = V;
+ }
+ Sz += Indices.size();
+ Cnt += Indices.size();
+ }
+ if (Sz > 1 && isa<Instruction>(P.second.front())) {
+ const unsigned SubVF = getFloorFullVectorNumberOfElements(
+ *TTI, TE.Scalars.front()->getType(), Sz);
+ SubVectors.emplace_back(Cnt - Sz, SubVF);
+ for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt - Sz + SubVF))
+ DemandedElts.clearBit(I);
+ } else if (!P.second.empty() && isConstant(P.second.front())) {
+ for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt))
+ DemandedElts.clearBit(I);
+ }
+ }
+ }
+ }
+ // Reuses always require shuffles, so consider it as profitable.
+ if (!TE.ReuseShuffleIndices.empty() || TE.ReorderIndices.empty())
+ return;
+ // Do simple cost estimation.
+ constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+ InstructionCost Cost = 0;
+ auto *ScalarTy = TE.Scalars.front()->getType();
+ auto *VecTy = getWidenedType(ScalarTy, TE.Scalars.size());
+ for (auto [Idx, Sz] : SubVectors) {
+ Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, VecTy, {}, CostKind,
+ Idx, getWidenedType(ScalarTy, Sz));
+ }
+ Cost += TTI->getScalarizationOverhead(VecTy, DemandedElts, /*Insert=*/true,
+ /*Extract=*/false, CostKind);
+ int Sz = TE.Scalars.size();
+ SmallVector<int> ReorderMask(TE.ReorderIndices.begin(),
+ TE.ReorderIndices.end());
+ for (unsigned I : seq<unsigned>(Sz)) {
+ Value *V = TE.getOrdered(I);
+ if (isa<PoisonValue>(V)) {
+ ReorderMask[I] = PoisonMaskElem;
+ } else if (isConstant(V) || DemandedElts[I]) {
+ ReorderMask[I] = I + TE.ReorderIndices.size();
+ }
+ }
+ Cost += ::getShuffleCost(*TTI,
+ any_of(ReorderMask, [&](int I) { return I >= Sz; })
+ ? TTI::SK_PermuteTwoSrc
+ : TTI::SK_PermuteSingleSrc,
+ VecTy, ReorderMask);
+ DemandedElts = APInt::getAllOnes(VecTy->getNumElements());
+ ReorderMask.assign(Sz, PoisonMaskElem);
+ for (unsigned I : seq<unsigned>(Sz)) {
+ Value *V = TE.getOrdered(I);
+ if (isConstant(V)) {
+ DemandedElts.clearBit(I);
+ if (!isa<PoisonValue>(V))
+ ReorderMask[I] = I;
+ } else {
+ ReorderMask[I] = I + Sz;
+ }
+ }
+ InstructionCost BVCost = TTI->getScalarizationOverhead(
+ VecTy, DemandedElts, /*Insert=*/true, /*Extract=*/false, CostKind);
+ if (!DemandedElts.isAllOnes())
+ BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy, ReorderMask);
+ if (Cost >= BVCost) {
+ SmallVector<int> Mask(TE.ReorderIndices.begin(), TE.ReorderIndices.end());
+ reorderScalars(TE.Scalars, Mask);
+ TE.ReorderIndices.clear();
+ }
+}
+
void BoUpSLP::transformNodes() {
constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
BaseGraphSize = VectorizableTree.size();
@@ -9373,6 +9539,14 @@ void BoUpSLP::transformNodes() {
findBestRootPair(Cand, LookAheadHeuristics::ScoreSplatLoads);
});
};
+
+ // Try to reorder gather nodes for better vectorization opportunities.
+ for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
+ TreeEntry &E = *VectorizableTree[Idx];
+ if (E.isGather())
+ reorderGatherNode(E);
+ }
+
// The tree may grow here, so iterate over nodes, built before.
for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
TreeEntry &E = *VectorizableTree[Idx];
@@ -9515,6 +9689,12 @@ void BoUpSLP::transformNodes() {
AddCombinedNode(PrevSize, Cnt, Sz);
}
}
+ // Restore ordering, if no extra vectorization happened.
+ if (E.CombinedEntriesWithIndices.empty() && !E.ReorderIndices.empty()) {
+ SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
+ reorderScalars(E.Scalars, Mask);
+ E.ReorderIndices.clear();
+ }
}
switch (E.getOpcode()) {
case Instruction::Load: {
@@ -10202,7 +10382,12 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
if (Mask.empty())
return nullptr;
Value *VecBase = nullptr;
- ArrayRef<Value *> VL = E->Scalars;
+ SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
+ if (!E->ReorderIndices.empty()) {
+ SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
+ E->ReorderIndices.end());
+ reorderScalars(VL, ReorderMask);
+ }
// Check if it can be considered reused if same extractelements were
// vectorized already.
bool PrevNodeFound = any_of(
@@ -10223,7 +10408,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
for (unsigned Part : seq<unsigned>(NumParts)) {
unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
- for (auto [I, V] : enumerate(VL.slice(Part * SliceSize, Limit))) {
+ for (auto [I, V] : enumerate(ArrayRef(VL).slice(Part * SliceSize, Limit))) {
// Ignore non-extractelement scalars.
if (isa<UndefValue>(V) ||
(!SubMask.empty() && SubMask[I] == PoisonMaskElem))
@@ -10360,10 +10545,9 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
[&](auto P) {
if (P.value() == PoisonMaskElem)
return Mask[P.index()] == PoisonMaskElem;
- auto *EI =
- cast<ExtractElementInst>(InVectors.front()
- .get<const TreeEntry *>()
- ->Scalars[P.index()]);
+ auto *EI = cast<ExtractElementInst>(
+ InVectors.front().get<const TreeEntry *>()->getOrdered(
+ P.index()));
return EI->getVectorOperand() == V1 ||
EI->getVectorOperand() == V2;
}) &&
@@ -10380,22 +10564,23 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
}
if (ForExtracts) {
// No need to add vectors here, already handled them in adjustExtracts.
- assert(InVectors.size() == 1 &&
- InVectors.front().is<const TreeEntry *>() && !CommonMask.empty() &&
- all_of(enumerate(CommonMask),
- [&](auto P) {
- Value *Scalar = InVectors.front()
- .get<const TreeEntry *>()
- ->Scalars[P.index()];
- if (P.value() == PoisonMaskElem)
- return P.value() == Mask[P.index()] ||
- isa<UndefValue>(Scalar);
- if (isa<Constant>(V1))
- return true;
- auto *EI = cast<ExtractElementInst>(Scalar);
- return EI->getVectorOperand() == V1;
- }) &&
- "Expected only tree entry for extractelement vectors.");
+ assert(
+ InVectors.size() == 1 && InVectors.front().is<const TreeEntry *>() &&
+ !CommonMask.empty() &&
+ all_of(enumerate(CommonMask),
+ [&](auto P) {
+ Value *Scalar =
+ InVectors.front().get<const TreeEntry *>()->getOrdered(
+ P.index());
+ if (P.value() == PoisonMaskElem)
+ return P.value() == Mask[P.index()] ||
+ isa<UndefValue>(Scalar);
+ if (isa<Constant>(V1))
+ return true;
+ auto *EI = cast<ExtractElementInst>(Scalar);
+ return EI->getVectorOperand() == V1;
+ }) &&
+ "Expected only tree entry for extractelement vectors.");
return;
}
assert(!InVectors.empty() && !CommonMask.empty() &&
@@ -10466,7 +10651,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
InstructionCost
finalize(ArrayRef<int> ExtMask,
ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
- unsigned VF = 0,
+ ArrayRef<int> SubVectorsMask, unsigned VF = 0,
function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
IsFinalized = true;
if (Action) {
@@ -10493,6 +10678,21 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
if (CommonMask[Idx] != PoisonMaskElem)
CommonMask[Idx] = Idx;
+ // Add subvectors permutation cost.
+ if (!SubVectorsMask.empty()) {
+ assert(SubVectorsMask.size() == CommonMask.size() &&
+ "Expected same size of masks for subvectors and common mask.");
+ SmallVector<int> SVMask(SubVectorsMask.begin(), SubVectorsMask.end());
+ for (auto [I1, I2] : zip(SVMask, CommonMask)) {
+ if (I2 != PoisonMaskElem) {
+ assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
+ I1 = I2 + CommonMask.size();
+ }
+ }
+ Cost += ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
+ getWidenedType(ScalarTy, CommonMask.size()),
+ SVMask, CostKind);
+ }
for (auto [E, Idx] : SubVectors) {
Type *EScalarTy = E->Scalars.front()->getType();
bool IsSigned = true;
@@ -13533,11 +13733,17 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
UseVecBaseAsInput = false;
SmallPtrSet<Value *, 4> UniqueBases;
Value *VecBase = nullptr;
+ SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
+ if (!E->ReorderIndices.empty()) {
+ SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
+ E->ReorderIndices.end());
+ reorderScalars(VL, ReorderMask);
+ }
for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
int Idx = Mask[I];
if (Idx == PoisonMaskElem)
continue;
- auto *EI = cast<ExtractElementInst>(E->Scalars[I]);
+ auto *EI = cast<ExtractElementInst>(VL[I]);
VecBase = EI->getVectorOperand();
if (const TreeEntry *TE = R.getTreeEntry(VecBase))
VecBase = TE->VectorizedValue;
@@ -13546,7 +13752,7 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
// If the only one use is vectorized - can delete the extractelement
// itself.
if (!EI->hasOneUse() || R.ExternalUsesAsOriginalScalar.contains(EI) ||
- (NumParts != 1 && count(E->Scalars, EI) > 1) ||
+ (NumParts != 1 && count(VL, EI) > 1) ||
any_of(EI->users(), [&](User *U) {
const TreeEntry *UTE = R.getTreeEntry(U);
return !UTE || R.MultiNodeScalars.contains(U) ||
@@ -13558,7 +13764,7 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
[&](const EdgeInfo &Edge) {
return Edge.UserTE == UTE;
}) &&
- is_contained(TE->Scalars, EI);
+ is_contained(VL, EI);
}) != 1;
}))
continue;
@@ -13580,15 +13786,14 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
// into a long virtual vector register, forming the original vector.
Value *Vec = nullptr;
SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
- unsigned SliceSize = getPartNumElems(E->Scalars.size(), NumParts);
+ unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
for (unsigned Part : seq<unsigned>(NumParts)) {
- unsigned Limit = getNumElems(E->Scalars.size(), SliceSize, Part);
- ArrayRef<Value *> VL =
- ArrayRef(E->Scalars).slice(Part * SliceSize, Limit);
+ unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
+ ArrayRef<Value *> SubVL = ArrayRef(VL).slice(Part * SliceSize, Limit);
MutableArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
constexpr int MaxBases = 2;
SmallVector<Value *, MaxBases> Bases(MaxBases);
- auto VLMask = zip(VL, SubMask);
+ auto VLMask = zip(SubVL, SubMask);
const unsigned VF = std::accumulate(
VLMask.begin(), VLMask.end(), 0U, [&](unsigned S, const auto &D) {
if (std::get<1>(D) == PoisonMaskElem)
@@ -13805,7 +14010,7 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
Value *
finalize(ArrayRef<int> ExtMask,
ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
- unsigned VF = 0,
+ ArrayRef<int> SubVectorsMask, unsigned VF = 0,
function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
IsFinalized = true;
SmallVector<int> NewExtMask(ExtMask);
@@ -13850,19 +14055,55 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
if (CommonMask[Idx] != PoisonMaskElem)
CommonMask[Idx] = Idx;
- for (auto [E, Idx] : SubVectors) {
- Value *V = E->VectorizedValue;
- if (V->getType()->isIntOrIntVectorTy())
- V = castToScalarTyElem(V, any_of(E->Scalars, [&](Value *V) {
- return !isKnownNonNegative(
- V, SimplifyQuery(*R.DL));
- }));
- Vec = Builder.CreateInsertVector(Vec->getType(), Vec, V,
- Builder.getInt64(Idx));
- if (!CommonMask.empty()) {
- std::iota(std::next(CommonMask.begin(), Idx),
- std::next(CommonMask.begin(), Idx + E->getVectorFactor()),
- Idx);
+ auto CreateSubVectors = [&](Value *Vec,
+ SmallVectorImpl<int> &CommonMask) {
+ for (auto [E, Idx] : SubVectors) {
+ Value *V = E->VectorizedValue;
+ if (V->getType()->isIntOrIntVectorTy())
+ V = castToScalarTyElem(V, any_of(E->Scalars, [&](Value *V) {
+ return !isKnownNonNegative(
+ V, SimplifyQuery(*R.DL));
+ }));
+ const unsigned SubVecVF =
+ cast<FixedVectorType>(V->getType())->getNumElements();
+ if (Idx % SubVecVF == 0) {
+ Vec = Builder.CreateInsertVector(Vec->getType(), Vec, V,
+ Builder.getInt64(Idx));
+ } else {
+ // Create shuffle, insertvector requires that index is multiple of
+ // the subvectors length.
+ const unsigned VecVF =
+ cast<FixedVectorType>(Vec->getType())->getNumElements();
+ SmallVector<int> Mask(VecVF, PoisonMaskElem);
+ std::iota(Mask.begin(), Mask.end(), 0);
+ for (unsigned I : seq<unsigned>(Idx, Idx + SubVecVF))
+ Mask[I] = I - Idx + VecVF;
+ Vec = createShuffle(Vec, V, Mask);
+ }
+ if (!CommonMask.empty()) {
+ std::iota(std::next(CommonMask.be...
[truncated]
``````````
https://github.com/llvm/llvm-project/pull/114284