[llvm] [SLP]Initial support for (masked)loads + compress and (masked)interleaved (PR #132099)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Mar 19 13:58:22 PDT 2025
llvmbot wrote:
@llvm/pr-subscribers-vectorizers
Author: Alexey Bataev (alexey-bataev)
Changes:
Added initial support for (masked) loads + compress and for (masked) interleaved loads.
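
To make the idea concrete for readers of the truncated diff, below is a minimal hand-written LLVM IR sketch (illustrative only: the function names, element type, and offsets are not taken from the patch or its tests). Non-consecutive scalar loads whose pointers span a small range are replaced by a single wide load over that range, masked when the range is not known to be dereferenceable, followed by a shufflevector that compresses the live lanes; with a constant gap, as below, the cost model can also treat the access as interleaved.

```llvm
; Before: four scalar loads at element offsets 0, 2, 4 and 6 from %p,
; gathered into a vector element by element.
define <4 x float> @gap_loads_scalar(ptr %p) {
  %p1 = getelementptr inbounds float, ptr %p, i64 2
  %p2 = getelementptr inbounds float, ptr %p, i64 4
  %p3 = getelementptr inbounds float, ptr %p, i64 6
  %x0 = load float, ptr %p, align 4
  %x1 = load float, ptr %p1, align 4
  %x2 = load float, ptr %p2, align 4
  %x3 = load float, ptr %p3, align 4
  %v0 = insertelement <4 x float> poison, float %x0, i32 0
  %v1 = insertelement <4 x float> %v0, float %x1, i32 1
  %v2 = insertelement <4 x float> %v1, float %x2, i32 2
  %v3 = insertelement <4 x float> %v2, float %x3, i32 3
  ret <4 x float> %v3
}

; After: one masked load covering elements 0..6 (the mask enables only the
; lanes that are actually read), then a compressing shuffle. If the whole
; 7-element range is known dereferenceable, a plain wide load is used instead.
define <4 x float> @gap_loads_compressed(ptr %p) {
  %wide = call <7 x float> @llvm.masked.load.v7f32.p0(ptr %p, i32 4, <7 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true>, <7 x float> poison)
  %res = shufflevector <7 x float> %wide, <7 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  ret <4 x float> %res
}

declare <7 x float> @llvm.masked.load.v7f32.p0(ptr, i32, <7 x i1>, <7 x float>)
```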
---
Patch is 109.31 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/132099.diff
14 Files Affected:
- (modified) llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp (+325-34)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/entries-shuffled-diff-sizes.ll (+7-10)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/gep-nodes-with-non-gep-inst.ll (+6-16)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll (+6-10)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll (+52-110)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll (+52-110)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll (+5-6)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/reorder-possible-strided-node.ll (+28-24)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather.ll (+5-7)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll (+4-7)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll (+4-8)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/sin-sqrt.ll (+4-4)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll (+3-8)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/split-load8_2_unord_geps.ll (+5-6)
``````````diff
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 1d9d80bd69def..f9905cc7c3307 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -38,6 +38,7 @@
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/IVDescriptors.h"
+#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemoryLocation.h"
@@ -1378,7 +1379,8 @@ class BoUpSLP {
Gather,
Vectorize,
ScatterVectorize,
- StridedVectorize
+ StridedVectorize,
+ MaskedLoadCompressVectorize
};
using ValueList = SmallVector<Value *, 8>;
@@ -3378,6 +3380,7 @@ class BoUpSLP {
Vectorize, ///< The node is regularly vectorized.
ScatterVectorize, ///< Masked scatter/gather node.
StridedVectorize, ///< Strided loads (and stores)
+ MaskedLoadCompressVectorize, ///< Masked load with compress.
NeedToGather, ///< Gather/buildvector node.
CombinedVectorize, ///< Vectorized node, combined with its user into more
///< complex node like select/cmp to minmax, mul/add to
@@ -3604,6 +3607,9 @@ class BoUpSLP {
case StridedVectorize:
dbgs() << "StridedVectorize\n";
break;
+ case MaskedLoadCompressVectorize:
+ dbgs() << "MaskedLoadCompressVectorize\n";
+ break;
case NeedToGather:
dbgs() << "NeedToGather\n";
break;
@@ -4650,7 +4656,8 @@ template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
if (Entry->isGather())
return "color=red";
if (Entry->State == TreeEntry::ScatterVectorize ||
- Entry->State == TreeEntry::StridedVectorize)
+ Entry->State == TreeEntry::StridedVectorize ||
+ Entry->State == TreeEntry::MaskedLoadCompressVectorize)
return "color=blue";
return "";
}
@@ -5214,6 +5221,145 @@ static Value *createExtractVector(IRBuilderBase &Builder, Value *Vec,
return Builder.CreateShuffleVector(Vec, Mask);
}
+/// Builds a compress-like shuffle mask for the given \p PointerOps, ordered
+/// according to \p Order.
+static void buildCompressMask(ArrayRef<Value *> PointerOps,
+ ArrayRef<unsigned> Order, Type *ScalarTy,
+ const DataLayout &DL, ScalarEvolution &SE,
+ SmallVectorImpl<int> &CompressMask) {
+ const unsigned Sz = PointerOps.size();
+ CompressMask.assign(Sz, PoisonMaskElem);
+  // The first element is always set.
+ CompressMask[0] = 0;
+ Value *Ptr0 = Order.empty() ? PointerOps.front() : PointerOps[Order.front()];
+ for (unsigned I : seq<unsigned>(1, Sz)) {
+ Value *Ptr = Order.empty() ? PointerOps[I] : PointerOps[Order[I]];
+ unsigned Pos = *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, DL, SE);
+ CompressMask[I] = Pos;
+ }
+}
+
+/// Checks if the \p VL can be transformed to a (masked) load + compress or a
+/// (masked) interleaved load.
+static bool isMaskedLoadCompress(
+ ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
+ ArrayRef<unsigned> Order, const TargetTransformInfo &TTI,
+ const DataLayout &DL, ScalarEvolution &SE, AssumptionCache &AC,
+ const DominatorTree &DT, const TargetLibraryInfo &TLI,
+ const function_ref<bool(Value *)> AreAllUsersVectorized, bool &IsMasked,
+ unsigned &InterleaveFactor, SmallVectorImpl<int> &CompressMask,
+ VectorType *&LoadVecTy) {
+ InterleaveFactor = 0;
+ Type *ScalarTy = VL.front()->getType();
+ const unsigned Sz = VL.size();
+ auto *VecTy = getWidenedType(ScalarTy, Sz);
+ constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+ // Check external uses.
+ for (const auto [I, V] : enumerate(VL)) {
+ if (AreAllUsersVectorized(V))
+ continue;
+ InstructionCost ExtractCost =
+ TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind, I);
+ InstructionCost ScalarCost =
+ TTI.getInstructionCost(cast<Instruction>(V), CostKind);
+ if (ExtractCost <= ScalarCost)
+ return false;
+ }
+ Value *Ptr0;
+ Value *PtrN;
+ if (Order.empty()) {
+ Ptr0 = PointerOps.front();
+ PtrN = PointerOps.back();
+ } else {
+ Ptr0 = PointerOps[Order.front()];
+ PtrN = PointerOps[Order.back()];
+ }
+ std::optional<int> Diff =
+ getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, DL, SE);
+ if (!Diff)
+ return false;
+ const unsigned MaxRegSize =
+ TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
+ .getFixedValue();
+ // Check for very large distances between elements.
+ if (*Diff / Sz >= MaxRegSize / 8)
+ return false;
+ Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
+ LoadVecTy = getWidenedType(ScalarTy, *Diff + 1);
+ auto *LI = cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()]);
+ bool IsNotMasked = isSafeToLoadUnconditionally(
+ Ptr0, LoadVecTy, CommonAlignment, DL,
+ cast<LoadInst>(Order.empty() ? VL.back() : VL[Order.back()]), &AC, &DT,
+ &TLI);
+ // TODO: perform the analysis of each scalar load for better
+ // safe-load-unconditionally analysis.
+ buildCompressMask(PointerOps, Order, ScalarTy, DL, SE, CompressMask);
+ assert(CompressMask.size() >= 2 && "At least two elements are required");
+ IsMasked = !IsNotMasked;
+ auto [ScalarGEPCost, VectorGEPCost] =
+ getGEPCosts(TTI, PointerOps, PointerOps.front(),
+ Instruction::GetElementPtr, CostKind, ScalarTy, LoadVecTy);
+ // The cost of scalar loads.
+ InstructionCost ScalarLoadsCost =
+ std::accumulate(VL.begin(), VL.end(), InstructionCost(),
+ [&](InstructionCost C, Value *V) {
+ return C + TTI.getInstructionCost(cast<Instruction>(V),
+ CostKind);
+ }) +
+ ScalarGEPCost;
+ APInt DemandedElts = APInt::getAllOnes(Sz);
+ InstructionCost GatherCost =
+ getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
+ /*Insert=*/true,
+ /*Extract=*/false, CostKind) +
+ ScalarLoadsCost;
+ InstructionCost LoadCost = 0;
+ if (IsNotMasked)
+ LoadCost =
+ TTI.getMemoryOpCost(Instruction::Load, LoadVecTy,
+ IsNotMasked ? LI->getAlign() : CommonAlignment,
+ LI->getPointerAddressSpace(), CostKind);
+ else
+ LoadCost =
+ TTI.getMaskedMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment,
+ LI->getPointerAddressSpace(), CostKind);
+ SmallVector<int> Mask;
+ if (!Order.empty())
+ inversePermutation(Order, Mask);
+ if (int Interval = CompressMask[1] - CompressMask[0];
+ Interval > 0 && all_of(enumerate(CompressMask), [&](const auto &D) {
+ return static_cast<unsigned>(D.value()) == D.index() * Interval;
+ })) {
+ // Check for potential segmented(interleaved) loads.
+ if (TTI.isLegalInterleavedAccessType(
+ LoadVecTy, Interval, IsNotMasked ? LI->getAlign() : CommonAlignment,
+ LI->getPointerAddressSpace())) {
+ InstructionCost InterleavedCost = TTI.getInterleavedMemoryOpCost(
+ Instruction::Load, LoadVecTy, Interval, std::nullopt,
+ IsNotMasked ? LI->getAlign() : CommonAlignment,
+ LI->getPointerAddressSpace(), CostKind, !IsNotMasked);
+ if (!Mask.empty())
+ InterleavedCost += ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
+ LoadVecTy, CompressMask, CostKind);
+ if (InterleavedCost < GatherCost) {
+ InterleaveFactor = Interval;
+ return true;
+ }
+ }
+ }
+ if (!Order.empty()) {
+ SmallVector<int> NewMask(Sz, PoisonMaskElem);
+ for (unsigned I : seq<unsigned>(Sz)) {
+ NewMask[I] = CompressMask[Mask[I]];
+ }
+ CompressMask.swap(NewMask);
+ }
+ InstructionCost CompressCost = ::getShuffleCost(
+ TTI, TTI::SK_PermuteSingleSrc, LoadVecTy, CompressMask, CostKind);
+ InstructionCost TotalVecCost = VectorGEPCost + LoadCost + CompressCost;
+ return TotalVecCost < GatherCost;
+}
+
BoUpSLP::LoadsState
BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
SmallVectorImpl<unsigned> &Order,
@@ -5285,9 +5431,6 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
// Check that the sorted loads are consecutive.
if (static_cast<unsigned>(*Diff) == Sz - 1)
return LoadsState::Vectorize;
- if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
- TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
- return LoadsState::Gather;
// Simple check if not a strided access - clear order.
bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
// Try to generate strided load node if:
@@ -5343,7 +5486,22 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
}
}
}
+ [[maybe_unused]] bool IsMasked;
+ [[maybe_unused]] unsigned InterleaveFactor;
+ [[maybe_unused]] SmallVector<int> CompressMask;
+  [[maybe_unused]] VectorType *LoadVecTy;
+ if (isMaskedLoadCompress(
+ VL, PointerOps, Order, *TTI, *DL, *SE, *AC, *DT, *TLI,
+ [&](Value *V) {
+ return areAllUsersVectorized(cast<Instruction>(V),
+ UserIgnoreList);
+ },
+ IsMasked, InterleaveFactor, CompressMask, LoadVecTy))
+ return LoadsState::MaskedLoadCompressVectorize;
}
+ if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
+ TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
+ return LoadsState::Gather;
// Correctly identify compare the cost of loads + shuffles rather than
// strided/masked gather loads. Returns true if vectorized + shuffles
// representation is better than just gather.
@@ -5436,7 +5594,8 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
}
// If need the reorder - consider as high-cost masked gather for now.
if ((LS == LoadsState::Vectorize ||
- LS == LoadsState::StridedVectorize) &&
+ LS == LoadsState::StridedVectorize ||
+ LS == LoadsState::MaskedLoadCompressVectorize) &&
!Order.empty() && !isReverseOrder(Order))
LS = LoadsState::ScatterVectorize;
States.push_back(LS);
@@ -5501,6 +5660,14 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
CommonAlignment, CostKind) +
VectorGEPCost;
break;
+ case LoadsState::MaskedLoadCompressVectorize:
+ VecLdCost += TTI.getMaskedMemoryOpCost(
+ Instruction::Load, SubVecTy, CommonAlignment,
+ LI0->getPointerAddressSpace(), CostKind) +
+ VectorGEPCost +
+ ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc, SubVecTy,
+ {}, CostKind);
+ break;
case LoadsState::ScatterVectorize:
VecLdCost += TTI.getGatherScatterOpCost(Instruction::Load, SubVecTy,
LI0->getPointerOperand(),
@@ -5874,7 +6041,8 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom,
return std::nullopt;
if (TE.State == TreeEntry::SplitVectorize ||
((TE.State == TreeEntry::Vectorize ||
- TE.State == TreeEntry::StridedVectorize) &&
+ TE.State == TreeEntry::StridedVectorize ||
+ TE.State == TreeEntry::MaskedLoadCompressVectorize) &&
(isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
(TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))))) {
assert((TE.State == TreeEntry::SplitVectorize || !TE.isAltShuffle()) &&
@@ -6061,7 +6229,8 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom,
OrdersType CurrentOrder;
LoadsState Res = canVectorizeLoads(TE.Scalars, TE.Scalars.front(),
CurrentOrder, PointerOps);
- if (Res == LoadsState::Vectorize || Res == LoadsState::StridedVectorize)
+ if (Res == LoadsState::Vectorize || Res == LoadsState::StridedVectorize ||
+ Res == LoadsState::MaskedLoadCompressVectorize)
return std::move(CurrentOrder);
}
// FIXME: Remove the non-power-of-two check once findReusedOrderedScalars
@@ -6301,7 +6470,8 @@ void BoUpSLP::reorderTopToBottom() {
VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
if (!(TE->State == TreeEntry::Vectorize ||
TE->State == TreeEntry::StridedVectorize ||
- TE->State == TreeEntry::SplitVectorize) ||
+ TE->State == TreeEntry::SplitVectorize ||
+ TE->State == TreeEntry::MaskedLoadCompressVectorize) ||
!TE->ReuseShuffleIndices.empty())
GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
if (TE->State == TreeEntry::Vectorize &&
@@ -6478,7 +6648,8 @@ void BoUpSLP::reorderTopToBottom() {
if ((TE->State == TreeEntry::SplitVectorize &&
TE->ReuseShuffleIndices.empty()) ||
((TE->State == TreeEntry::Vectorize ||
- TE->State == TreeEntry::StridedVectorize) &&
+ TE->State == TreeEntry::StridedVectorize ||
+ TE->State == TreeEntry::MaskedLoadCompressVectorize) &&
(isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst,
InsertElementInst>(TE->getMainOp()) ||
(SLPReVec && isa<ShuffleVectorInst>(TE->getMainOp()))))) {
@@ -6526,6 +6697,8 @@ bool BoUpSLP::canReorderOperands(
return OpData.first == I &&
(OpData.second->State == TreeEntry::Vectorize ||
OpData.second->State == TreeEntry::StridedVectorize ||
+ OpData.second->State ==
+ TreeEntry::MaskedLoadCompressVectorize ||
OpData.second->State == TreeEntry::SplitVectorize);
}))
continue;
@@ -6540,6 +6713,7 @@ bool BoUpSLP::canReorderOperands(
// node, just reorder reuses mask.
if (TE->State != TreeEntry::Vectorize &&
TE->State != TreeEntry::StridedVectorize &&
+ TE->State != TreeEntry::MaskedLoadCompressVectorize &&
TE->State != TreeEntry::SplitVectorize &&
TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
GatherOps.push_back(TE);
@@ -6550,6 +6724,7 @@ bool BoUpSLP::canReorderOperands(
[&Gather, UserTE, I](TreeEntry *TE) {
assert(TE->State != TreeEntry::Vectorize &&
TE->State != TreeEntry::StridedVectorize &&
+ TE->State != TreeEntry::MaskedLoadCompressVectorize &&
TE->State != TreeEntry::SplitVectorize &&
"Only non-vectorized nodes are expected.");
if (TE->UserTreeIndex.UserTE == UserTE &&
@@ -6586,6 +6761,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
if (TE->State != TreeEntry::Vectorize &&
TE->State != TreeEntry::StridedVectorize &&
+ TE->State != TreeEntry::MaskedLoadCompressVectorize &&
TE->State != TreeEntry::SplitVectorize)
NonVectorized.push_back(TE.get());
if (std::optional<OrdersType> CurrentOrder =
@@ -6593,6 +6769,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
Queue.push(TE.get());
if (!(TE->State == TreeEntry::Vectorize ||
TE->State == TreeEntry::StridedVectorize ||
+ TE->State == TreeEntry::MaskedLoadCompressVectorize ||
TE->State == TreeEntry::SplitVectorize) ||
!TE->ReuseShuffleIndices.empty())
GathersToOrders.insert(TE.get());
@@ -6621,6 +6798,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
for (TreeEntry *TE : OrderedOps) {
if (!(TE->State == TreeEntry::Vectorize ||
TE->State == TreeEntry::StridedVectorize ||
+ TE->State == TreeEntry::MaskedLoadCompressVectorize ||
TE->State == TreeEntry::SplitVectorize ||
(TE->isGather() && GathersToOrders.contains(TE))) ||
!TE->UserTreeIndex || !TE->ReuseShuffleIndices.empty() ||
@@ -6918,6 +7096,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
// Gathers are processed separately.
if (TE->State != TreeEntry::Vectorize &&
TE->State != TreeEntry::StridedVectorize &&
+ TE->State != TreeEntry::MaskedLoadCompressVectorize &&
TE->State != TreeEntry::SplitVectorize &&
(TE->State != TreeEntry::ScatterVectorize ||
TE->ReorderIndices.empty()))
@@ -6950,7 +7129,8 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
Data.first->reorderOperands(Mask);
if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
Data.first->isAltShuffle() ||
- Data.first->State == TreeEntry::StridedVectorize) {
+ Data.first->State == TreeEntry::StridedVectorize ||
+ Data.first->State == TreeEntry::MaskedLoadCompressVectorize) {
reorderScalars(Data.first->Scalars, Mask);
reorderOrder(Data.first->ReorderIndices, MaskOrder,
/*BottomOrder=*/true);
@@ -7722,22 +7902,31 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
// just exit.
unsigned ConsecutiveNodesSize = 0;
if (!LoadEntriesToVectorize.empty() && InterleaveFactor == 0 &&
- any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
- [&, Slice = Slice](const auto &P) {
- const auto *It = find_if(Slice, [&](Value *V) {
- return std::get<1>(P).contains(V);
- });
- if (It == Slice.end())
- return false;
- ArrayRef<Value *> VL =
- VectorizableTree[std::get<0>(P)]->Scalars;
- ConsecutiveNodesSize += VL.size();
- unsigned Start = std::distance(Slice.begin(), It);
- unsigned Sz = Slice.size() - Start;
- return Sz < VL.size() ||
- Slice.slice(std::distance(Slice.begin(), It),
- VL.size()) != VL;
- }))
+ any_of(
+ zip(LoadEntriesToVectorize, LoadSetsToVectorize),
+ [&, Slice = Slice](const auto &P) {
+ const auto *It = find_if(Slice, [&](Value *V) {
+ return std::get<1>(P).contains(V);
+ });
+ if (It == Slice.end())
+ return false;
+ const TreeEntry &TE = *VectorizableTree[std::get<0>(P)];
+ ArrayRef<Value *> VL = TE.Scalars;
+ OrdersType Order;
+ SmallVector<Value *> PointerOps;
+ LoadsState State =
+ canVectorizeLoads(VL, VL.front(), Order,
+ PointerOps);
+                if (State == LoadsState::ScatterVectorize ||
+ State == LoadsState::MaskedLoadCompressVectorize)
+ return false;
+ ConsecutiveNodesSize += VL.size();
+ unsigned Start = std::distance(Slice.begin(), It);
+ unsigned Sz = Slice.size() - Start;
+ return Sz < VL.size() ||
+ Slice.slice(std::distance(Slice.begin(), It),
+ VL.size()) != VL;
+ }))
continue;
// Try to build long masked gather loads.
UserMaxVF = bit_ceil(UserMaxVF);
@@ -8216,6 +8405,13 @@ BoUpSLP::TreeEntr...
[truncated]
``````````
https://github.com/llvm/llvm-project/pull/132099