[llvm] [SLP]Initial support for (masked)loads + compress and (masked)interleaved (PR #132099)
Alexey Bataev via llvm-commits
llvm-commits at lists.llvm.org
Fri Mar 21 11:43:51 PDT 2025
https://github.com/alexey-bataev updated https://github.com/llvm/llvm-project/pull/132099
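For readers skimming the patch, here is a minimal sketch of the transformation it enables, modeled on the gep-nodes-with-non-gep-inst.ll test updated below (the %base/%v* names are illustrative, not taken from the patch). Four i64 loads at element offsets 0, 4, 8 and 12 of one base pointer previously stayed scalar or became a masked gather; with this change SLP can instead emit a single (masked) wide load over the covered range plus a "compress" shufflevector that extracts the loaded lanes:

  ; Scalar input: gaps of 4 elements between consecutive loads.
  %v0 = load i64, ptr %base, align 8
  %p1 = getelementptr i64, ptr %base, i64 4
  %v1 = load i64, ptr %p1, align 8
  %p2 = getelementptr i64, ptr %base, i64 8
  %v2 = load i64, ptr %p2, align 8
  %p3 = getelementptr i64, ptr %base, i64 12
  %v3 = load i64, ptr %p3, align 8

  ; Vectorized form: one masked load covering elements 0..12, then a compress
  ; shuffle picking lanes 0, 4, 8 and 12 (the CompressMask built by the patch).
  %wide = call <13 x i64> @llvm.masked.load.v13i64.p0(ptr %base, i32 8, <13 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true>, <13 x i64> poison)
  %vec = shufflevector <13 x i64> %wide, <13 x i64> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>

When the whole range is known to be dereferenceable, a plain wide load is used instead of llvm.masked.load, and when the gaps form a constant stride the cost model may instead pick a (masked) interleaved load via getInterleavedMemoryOpCost.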
>From 1dbe03a331f296a95bb9c543d9b9c1df0ba7f926 Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev at outlook.com>
Date: Wed, 19 Mar 2025 20:57:30 +0000
Subject: [PATCH 1/2] [𝘀𝗽𝗿] initial version
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Created using spr 1.3.5
---
.../Transforms/Vectorize/SLPVectorizer.cpp | 359 ++++++++++++++++--
.../X86/entries-shuffled-diff-sizes.ll | 17 +-
.../X86/gep-nodes-with-non-gep-inst.ll | 22 +-
.../Transforms/SLPVectorizer/X86/pr47623.ll | 16 +-
.../SLPVectorizer/X86/pr47629-inseltpoison.ll | 162 +++-----
.../Transforms/SLPVectorizer/X86/pr47629.ll | 162 +++-----
.../X86/remark_gather-load-redux-cost.ll | 11 +-
.../X86/reorder-possible-strided-node.ll | 52 +--
.../X86/reorder-reused-masked-gather.ll | 12 +-
.../X86/reorder-reused-masked-gather2.ll | 11 +-
.../X86/scatter-vectorize-reused-pointer.ll | 12 +-
.../Transforms/SLPVectorizer/X86/sin-sqrt.ll | 8 +-
.../SLPVectorizer/X86/split-load8_2-unord.ll | 11 +-
.../X86/split-load8_2_unord_geps.ll | 11 +-
14 files changed, 506 insertions(+), 360 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 1d9d80bd69def..f9905cc7c3307 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -38,6 +38,7 @@
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/IVDescriptors.h"
+#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemoryLocation.h"
@@ -1378,7 +1379,8 @@ class BoUpSLP {
Gather,
Vectorize,
ScatterVectorize,
- StridedVectorize
+ StridedVectorize,
+ MaskedLoadCompressVectorize
};
using ValueList = SmallVector<Value *, 8>;
@@ -3378,6 +3380,7 @@ class BoUpSLP {
Vectorize, ///< The node is regularly vectorized.
ScatterVectorize, ///< Masked scatter/gather node.
StridedVectorize, ///< Strided loads (and stores)
+ MaskedLoadCompressVectorize, ///< Masked load with compress.
NeedToGather, ///< Gather/buildvector node.
CombinedVectorize, ///< Vectorized node, combined with its user into more
///< complex node like select/cmp to minmax, mul/add to
@@ -3604,6 +3607,9 @@ class BoUpSLP {
case StridedVectorize:
dbgs() << "StridedVectorize\n";
break;
+ case MaskedLoadCompressVectorize:
+ dbgs() << "MaskedLoadCompressVectorize\n";
+ break;
case NeedToGather:
dbgs() << "NeedToGather\n";
break;
@@ -4650,7 +4656,8 @@ template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
if (Entry->isGather())
return "color=red";
if (Entry->State == TreeEntry::ScatterVectorize ||
- Entry->State == TreeEntry::StridedVectorize)
+ Entry->State == TreeEntry::StridedVectorize ||
+ Entry->State == TreeEntry::MaskedLoadCompressVectorize)
return "color=blue";
return "";
}
@@ -5214,6 +5221,145 @@ static Value *createExtractVector(IRBuilderBase &Builder, Value *Vec,
return Builder.CreateShuffleVector(Vec, Mask);
}
+/// Builds a compress-like shuffle mask for the given \p PointerOps, ordered
+/// by \p Order.
+static void buildCompressMask(ArrayRef<Value *> PointerOps,
+ ArrayRef<unsigned> Order, Type *ScalarTy,
+ const DataLayout &DL, ScalarEvolution &SE,
+ SmallVectorImpl<int> &CompressMask) {
+ const unsigned Sz = PointerOps.size();
+ CompressMask.assign(Sz, PoisonMaskElem);
+ // The first element is always set.
+ CompressMask[0] = 0;
+ Value *Ptr0 = Order.empty() ? PointerOps.front() : PointerOps[Order.front()];
+ for (unsigned I : seq<unsigned>(1, Sz)) {
+ Value *Ptr = Order.empty() ? PointerOps[I] : PointerOps[Order[I]];
+ unsigned Pos = *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, DL, SE);
+ CompressMask[I] = Pos;
+ }
+}
+
+/// Checks if \p VL can be transformed into a (masked) load + compress or a
+/// (masked) interleaved load.
+static bool isMaskedLoadCompress(
+ ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
+ ArrayRef<unsigned> Order, const TargetTransformInfo &TTI,
+ const DataLayout &DL, ScalarEvolution &SE, AssumptionCache &AC,
+ const DominatorTree &DT, const TargetLibraryInfo &TLI,
+ const function_ref<bool(Value *)> AreAllUsersVectorized, bool &IsMasked,
+ unsigned &InterleaveFactor, SmallVectorImpl<int> &CompressMask,
+ VectorType *&LoadVecTy) {
+ InterleaveFactor = 0;
+ Type *ScalarTy = VL.front()->getType();
+ const unsigned Sz = VL.size();
+ auto *VecTy = getWidenedType(ScalarTy, Sz);
+ constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+ // Check external uses.
+ for (const auto [I, V] : enumerate(VL)) {
+ if (AreAllUsersVectorized(V))
+ continue;
+ InstructionCost ExtractCost =
+ TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind, I);
+ InstructionCost ScalarCost =
+ TTI.getInstructionCost(cast<Instruction>(V), CostKind);
+ if (ExtractCost <= ScalarCost)
+ return false;
+ }
+ Value *Ptr0;
+ Value *PtrN;
+ if (Order.empty()) {
+ Ptr0 = PointerOps.front();
+ PtrN = PointerOps.back();
+ } else {
+ Ptr0 = PointerOps[Order.front()];
+ PtrN = PointerOps[Order.back()];
+ }
+ std::optional<int> Diff =
+ getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, DL, SE);
+ if (!Diff)
+ return false;
+ const unsigned MaxRegSize =
+ TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
+ .getFixedValue();
+ // Check for very large distances between elements.
+ if (*Diff / Sz >= MaxRegSize / 8)
+ return false;
+ Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
+ LoadVecTy = getWidenedType(ScalarTy, *Diff + 1);
+ auto *LI = cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()]);
+ bool IsNotMasked = isSafeToLoadUnconditionally(
+ Ptr0, LoadVecTy, CommonAlignment, DL,
+ cast<LoadInst>(Order.empty() ? VL.back() : VL[Order.back()]), &AC, &DT,
+ &TLI);
+ // TODO: perform the analysis of each scalar load for better
+ // safe-load-unconditionally analysis.
+ buildCompressMask(PointerOps, Order, ScalarTy, DL, SE, CompressMask);
+ assert(CompressMask.size() >= 2 && "At least two elements are required");
+ IsMasked = !IsNotMasked;
+ auto [ScalarGEPCost, VectorGEPCost] =
+ getGEPCosts(TTI, PointerOps, PointerOps.front(),
+ Instruction::GetElementPtr, CostKind, ScalarTy, LoadVecTy);
+ // The cost of scalar loads.
+ InstructionCost ScalarLoadsCost =
+ std::accumulate(VL.begin(), VL.end(), InstructionCost(),
+ [&](InstructionCost C, Value *V) {
+ return C + TTI.getInstructionCost(cast<Instruction>(V),
+ CostKind);
+ }) +
+ ScalarGEPCost;
+ APInt DemandedElts = APInt::getAllOnes(Sz);
+ InstructionCost GatherCost =
+ getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
+ /*Insert=*/true,
+ /*Extract=*/false, CostKind) +
+ ScalarLoadsCost;
+ InstructionCost LoadCost = 0;
+ if (IsNotMasked)
+ LoadCost =
+ TTI.getMemoryOpCost(Instruction::Load, LoadVecTy,
+ IsNotMasked ? LI->getAlign() : CommonAlignment,
+ LI->getPointerAddressSpace(), CostKind);
+ else
+ LoadCost =
+ TTI.getMaskedMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment,
+ LI->getPointerAddressSpace(), CostKind);
+ SmallVector<int> Mask;
+ if (!Order.empty())
+ inversePermutation(Order, Mask);
+ if (int Interval = CompressMask[1] - CompressMask[0];
+ Interval > 0 && all_of(enumerate(CompressMask), [&](const auto &D) {
+ return static_cast<unsigned>(D.value()) == D.index() * Interval;
+ })) {
+ // Check for potential segmented(interleaved) loads.
+ if (TTI.isLegalInterleavedAccessType(
+ LoadVecTy, Interval, IsNotMasked ? LI->getAlign() : CommonAlignment,
+ LI->getPointerAddressSpace())) {
+ InstructionCost InterleavedCost = TTI.getInterleavedMemoryOpCost(
+ Instruction::Load, LoadVecTy, Interval, std::nullopt,
+ IsNotMasked ? LI->getAlign() : CommonAlignment,
+ LI->getPointerAddressSpace(), CostKind, !IsNotMasked);
+ if (!Mask.empty())
+ InterleavedCost += ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
+ LoadVecTy, CompressMask, CostKind);
+ if (InterleavedCost < GatherCost) {
+ InterleaveFactor = Interval;
+ return true;
+ }
+ }
+ }
+ if (!Order.empty()) {
+ SmallVector<int> NewMask(Sz, PoisonMaskElem);
+ for (unsigned I : seq<unsigned>(Sz)) {
+ NewMask[I] = CompressMask[Mask[I]];
+ }
+ CompressMask.swap(NewMask);
+ }
+ InstructionCost CompressCost = ::getShuffleCost(
+ TTI, TTI::SK_PermuteSingleSrc, LoadVecTy, CompressMask, CostKind);
+ InstructionCost TotalVecCost = VectorGEPCost + LoadCost + CompressCost;
+ return TotalVecCost < GatherCost;
+}
+
BoUpSLP::LoadsState
BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
SmallVectorImpl<unsigned> &Order,
@@ -5285,9 +5431,6 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
// Check that the sorted loads are consecutive.
if (static_cast<unsigned>(*Diff) == Sz - 1)
return LoadsState::Vectorize;
- if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
- TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
- return LoadsState::Gather;
// Simple check if not a strided access - clear order.
bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
// Try to generate strided load node if:
@@ -5343,7 +5486,22 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
}
}
}
+ [[maybe_unused]] bool IsMasked;
+ [[maybe_unused]] unsigned InterleaveFactor;
+ [[maybe_unused]] SmallVector<int> CompressMask;
+ [[maybe_unused]] VectorType *LoadVecTy;
+ if (isMaskedLoadCompress(
+ VL, PointerOps, Order, *TTI, *DL, *SE, *AC, *DT, *TLI,
+ [&](Value *V) {
+ return areAllUsersVectorized(cast<Instruction>(V),
+ UserIgnoreList);
+ },
+ IsMasked, InterleaveFactor, CompressMask, LoadVecTy))
+ return LoadsState::MaskedLoadCompressVectorize;
}
+ if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
+ TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
+ return LoadsState::Gather;
// Correctly identify compare the cost of loads + shuffles rather than
// strided/masked gather loads. Returns true if vectorized + shuffles
// representation is better than just gather.
@@ -5436,7 +5594,8 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
}
// If need the reorder - consider as high-cost masked gather for now.
if ((LS == LoadsState::Vectorize ||
- LS == LoadsState::StridedVectorize) &&
+ LS == LoadsState::StridedVectorize ||
+ LS == LoadsState::MaskedLoadCompressVectorize) &&
!Order.empty() && !isReverseOrder(Order))
LS = LoadsState::ScatterVectorize;
States.push_back(LS);
@@ -5501,6 +5660,14 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
CommonAlignment, CostKind) +
VectorGEPCost;
break;
+ case LoadsState::MaskedLoadCompressVectorize:
+ VecLdCost += TTI.getMaskedMemoryOpCost(
+ Instruction::Load, SubVecTy, CommonAlignment,
+ LI0->getPointerAddressSpace(), CostKind) +
+ VectorGEPCost +
+ ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc, SubVecTy,
+ {}, CostKind);
+ break;
case LoadsState::ScatterVectorize:
VecLdCost += TTI.getGatherScatterOpCost(Instruction::Load, SubVecTy,
LI0->getPointerOperand(),
@@ -5874,7 +6041,8 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom,
return std::nullopt;
if (TE.State == TreeEntry::SplitVectorize ||
((TE.State == TreeEntry::Vectorize ||
- TE.State == TreeEntry::StridedVectorize) &&
+ TE.State == TreeEntry::StridedVectorize ||
+ TE.State == TreeEntry::MaskedLoadCompressVectorize) &&
(isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
(TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))))) {
assert((TE.State == TreeEntry::SplitVectorize || !TE.isAltShuffle()) &&
@@ -6061,7 +6229,8 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom,
OrdersType CurrentOrder;
LoadsState Res = canVectorizeLoads(TE.Scalars, TE.Scalars.front(),
CurrentOrder, PointerOps);
- if (Res == LoadsState::Vectorize || Res == LoadsState::StridedVectorize)
+ if (Res == LoadsState::Vectorize || Res == LoadsState::StridedVectorize ||
+ Res == LoadsState::MaskedLoadCompressVectorize)
return std::move(CurrentOrder);
}
// FIXME: Remove the non-power-of-two check once findReusedOrderedScalars
@@ -6301,7 +6470,8 @@ void BoUpSLP::reorderTopToBottom() {
VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
if (!(TE->State == TreeEntry::Vectorize ||
TE->State == TreeEntry::StridedVectorize ||
- TE->State == TreeEntry::SplitVectorize) ||
+ TE->State == TreeEntry::SplitVectorize ||
+ TE->State == TreeEntry::MaskedLoadCompressVectorize) ||
!TE->ReuseShuffleIndices.empty())
GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
if (TE->State == TreeEntry::Vectorize &&
@@ -6478,7 +6648,8 @@ void BoUpSLP::reorderTopToBottom() {
if ((TE->State == TreeEntry::SplitVectorize &&
TE->ReuseShuffleIndices.empty()) ||
((TE->State == TreeEntry::Vectorize ||
- TE->State == TreeEntry::StridedVectorize) &&
+ TE->State == TreeEntry::StridedVectorize ||
+ TE->State == TreeEntry::MaskedLoadCompressVectorize) &&
(isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst,
InsertElementInst>(TE->getMainOp()) ||
(SLPReVec && isa<ShuffleVectorInst>(TE->getMainOp()))))) {
@@ -6526,6 +6697,8 @@ bool BoUpSLP::canReorderOperands(
return OpData.first == I &&
(OpData.second->State == TreeEntry::Vectorize ||
OpData.second->State == TreeEntry::StridedVectorize ||
+ OpData.second->State ==
+ TreeEntry::MaskedLoadCompressVectorize ||
OpData.second->State == TreeEntry::SplitVectorize);
}))
continue;
@@ -6540,6 +6713,7 @@ bool BoUpSLP::canReorderOperands(
// node, just reorder reuses mask.
if (TE->State != TreeEntry::Vectorize &&
TE->State != TreeEntry::StridedVectorize &&
+ TE->State != TreeEntry::MaskedLoadCompressVectorize &&
TE->State != TreeEntry::SplitVectorize &&
TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
GatherOps.push_back(TE);
@@ -6550,6 +6724,7 @@ bool BoUpSLP::canReorderOperands(
[&Gather, UserTE, I](TreeEntry *TE) {
assert(TE->State != TreeEntry::Vectorize &&
TE->State != TreeEntry::StridedVectorize &&
+ TE->State != TreeEntry::MaskedLoadCompressVectorize &&
TE->State != TreeEntry::SplitVectorize &&
"Only non-vectorized nodes are expected.");
if (TE->UserTreeIndex.UserTE == UserTE &&
@@ -6586,6 +6761,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
if (TE->State != TreeEntry::Vectorize &&
TE->State != TreeEntry::StridedVectorize &&
+ TE->State != TreeEntry::MaskedLoadCompressVectorize &&
TE->State != TreeEntry::SplitVectorize)
NonVectorized.push_back(TE.get());
if (std::optional<OrdersType> CurrentOrder =
@@ -6593,6 +6769,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
Queue.push(TE.get());
if (!(TE->State == TreeEntry::Vectorize ||
TE->State == TreeEntry::StridedVectorize ||
+ TE->State == TreeEntry::MaskedLoadCompressVectorize ||
TE->State == TreeEntry::SplitVectorize) ||
!TE->ReuseShuffleIndices.empty())
GathersToOrders.insert(TE.get());
@@ -6621,6 +6798,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
for (TreeEntry *TE : OrderedOps) {
if (!(TE->State == TreeEntry::Vectorize ||
TE->State == TreeEntry::StridedVectorize ||
+ TE->State == TreeEntry::MaskedLoadCompressVectorize ||
TE->State == TreeEntry::SplitVectorize ||
(TE->isGather() && GathersToOrders.contains(TE))) ||
!TE->UserTreeIndex || !TE->ReuseShuffleIndices.empty() ||
@@ -6918,6 +7096,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
// Gathers are processed separately.
if (TE->State != TreeEntry::Vectorize &&
TE->State != TreeEntry::StridedVectorize &&
+ TE->State != TreeEntry::MaskedLoadCompressVectorize &&
TE->State != TreeEntry::SplitVectorize &&
(TE->State != TreeEntry::ScatterVectorize ||
TE->ReorderIndices.empty()))
@@ -6950,7 +7129,8 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
Data.first->reorderOperands(Mask);
if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
Data.first->isAltShuffle() ||
- Data.first->State == TreeEntry::StridedVectorize) {
+ Data.first->State == TreeEntry::StridedVectorize ||
+ Data.first->State == TreeEntry::MaskedLoadCompressVectorize) {
reorderScalars(Data.first->Scalars, Mask);
reorderOrder(Data.first->ReorderIndices, MaskOrder,
/*BottomOrder=*/true);
@@ -7722,22 +7902,31 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
// just exit.
unsigned ConsecutiveNodesSize = 0;
if (!LoadEntriesToVectorize.empty() && InterleaveFactor == 0 &&
- any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
- [&, Slice = Slice](const auto &P) {
- const auto *It = find_if(Slice, [&](Value *V) {
- return std::get<1>(P).contains(V);
- });
- if (It == Slice.end())
- return false;
- ArrayRef<Value *> VL =
- VectorizableTree[std::get<0>(P)]->Scalars;
- ConsecutiveNodesSize += VL.size();
- unsigned Start = std::distance(Slice.begin(), It);
- unsigned Sz = Slice.size() - Start;
- return Sz < VL.size() ||
- Slice.slice(std::distance(Slice.begin(), It),
- VL.size()) != VL;
- }))
+ any_of(
+ zip(LoadEntriesToVectorize, LoadSetsToVectorize),
+ [&, Slice = Slice](const auto &P) {
+ const auto *It = find_if(Slice, [&](Value *V) {
+ return std::get<1>(P).contains(V);
+ });
+ if (It == Slice.end())
+ return false;
+ const TreeEntry &TE = *VectorizableTree[std::get<0>(P)];
+ ArrayRef<Value *> VL = TE.Scalars;
+ OrdersType Order;
+ SmallVector<Value *> PointerOps;
+ LoadsState State =
+ canVectorizeLoads(VL, VL.front(), Order,
+ PointerOps);
+ if (State == LoadsState::ScatterVectorize ||
+ State == LoadsState::MaskedLoadCompressVectorize)
+ return false;
+ ConsecutiveNodesSize += VL.size();
+ unsigned Start = std::distance(Slice.begin(), It);
+ unsigned Sz = Slice.size() - Start;
+ return Sz < VL.size() ||
+ Slice.slice(std::distance(Slice.begin(), It),
+ VL.size()) != VL;
+ }))
continue;
// Try to build long masked gather loads.
UserMaxVF = bit_ceil(UserMaxVF);
@@ -8216,6 +8405,13 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps)) {
case LoadsState::Vectorize:
return TreeEntry::Vectorize;
+ case LoadsState::MaskedLoadCompressVectorize:
+ if (!IsGraphTransformMode && !VectorizableTree.empty()) {
+ // Delay slow vectorized nodes for better vectorization attempts.
+ LoadEntriesToVectorize.insert(VectorizableTree.size());
+ return TreeEntry::NeedToGather;
+ }
+ return TreeEntry::MaskedLoadCompressVectorize;
case LoadsState::ScatterVectorize:
if (!IsGraphTransformMode && !VectorizableTree.empty()) {
// Delay slow vectorized nodes for better vectorization attempts.
@@ -9268,6 +9464,15 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
<< "SLP: added a new TreeEntry (jumbled LoadInst).\n";
TE->dump());
break;
+ case TreeEntry::MaskedLoadCompressVectorize:
+ // Vectorizing non-consecutive loads with (masked)load + compress.
+ TE = newTreeEntry(VL, TreeEntry::MaskedLoadCompressVectorize, Bundle, S,
+ UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
+ LLVM_DEBUG(
+ dbgs()
+ << "SLP: added a new TreeEntry (masked LoadInst + compress).\n";
+ TE->dump());
+ break;
case TreeEntry::StridedVectorize:
// Vectorizing non-consecutive loads with `llvm.masked.gather`.
TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
@@ -11791,6 +11996,8 @@ TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
if (TE.State == TreeEntry::ScatterVectorize ||
TE.State == TreeEntry::StridedVectorize)
return TTI::CastContextHint::GatherScatter;
+ if (TE.State == TreeEntry::MaskedLoadCompressVectorize)
+ return TTI::CastContextHint::Masked;
if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::Load &&
!TE.isAltShuffle()) {
if (TE.ReorderIndices.empty())
@@ -11884,7 +12091,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FinalVecTy, Mask);
assert((E->State == TreeEntry::Vectorize ||
E->State == TreeEntry::ScatterVectorize ||
- E->State == TreeEntry::StridedVectorize) &&
+ E->State == TreeEntry::StridedVectorize ||
+ E->State == TreeEntry::MaskedLoadCompressVectorize) &&
"Unhandled state");
assert(E->getOpcode() &&
((allSameType(VL) && allSameBlock(VL)) ||
@@ -11975,8 +12183,10 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
// Negative value means vectorizing is profitable.
auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
assert((E->State == TreeEntry::Vectorize ||
- E->State == TreeEntry::StridedVectorize) &&
- "Entry state expected to be Vectorize or StridedVectorize here.");
+ E->State == TreeEntry::StridedVectorize ||
+ E->State == TreeEntry::MaskedLoadCompressVectorize) &&
+ "Entry state expected to be Vectorize, StridedVectorize or "
+ "MaskedLoadCompressVectorize here.");
InstructionCost ScalarCost = 0;
InstructionCost VecCost = 0;
std::tie(ScalarCost, VecCost) = getGEPCosts(
@@ -12439,6 +12649,46 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
/*VariableMask=*/false, CommonAlignment, CostKind);
break;
}
+ case TreeEntry::MaskedLoadCompressVectorize: {
+ SmallVector<Value *> PointerOps(VL.size());
+ for (auto [I, V] : enumerate(VL))
+ PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
+ bool IsMasked;
+ unsigned InterleaveFactor;
+ SmallVector<int> CompressMask;
+ VectorType *LoadVecTy;
+ [[maybe_unused]] bool IsVectorized = isMaskedLoadCompress(
+ VL, PointerOps, std::nullopt, *TTI, *DL, *SE, *AC, *DT, *TLI,
+ [](Value *) { return true; }, IsMasked, InterleaveFactor,
+ CompressMask, LoadVecTy);
+ assert(IsVectorized && "Expected to be vectorized");
+ Align CommonAlignment;
+ if (IsMasked)
+ CommonAlignment = computeCommonAlignment<LoadInst>(
+ UniqueValues.getArrayRef());
+ else
+ CommonAlignment = LI0->getAlign();
+ if (InterleaveFactor) {
+ VecLdCost = TTI->getInterleavedMemoryOpCost(
+ Instruction::Load, LoadVecTy, InterleaveFactor, std::nullopt,
+ CommonAlignment, LI0->getPointerAddressSpace(), CostKind);
+ } else if (IsMasked) {
+ VecLdCost = TTI->getMaskedMemoryOpCost(
+ Instruction::Load, LoadVecTy, CommonAlignment,
+ LI0->getPointerAddressSpace(), CostKind);
+ // TODO: include this cost into CommonCost.
+ VecLdCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
+ LoadVecTy, CompressMask, CostKind);
+ } else {
+ VecLdCost = TTI->getMemoryOpCost(
+ Instruction::Load, LoadVecTy, CommonAlignment,
+ LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
+ // TODO: include this cost into CommonCost.
+ VecLdCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
+ LoadVecTy, CompressMask, CostKind);
+ }
+ break;
+ }
case TreeEntry::ScatterVectorize: {
Align CommonAlignment =
computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
@@ -12728,6 +12978,7 @@ bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
if (VectorizableTree.size() == 1 &&
(VectorizableTree[0]->State == TreeEntry::Vectorize ||
VectorizableTree[0]->State == TreeEntry::StridedVectorize ||
+ VectorizableTree[0]->State == TreeEntry::MaskedLoadCompressVectorize ||
(ForReduction &&
AreVectorizableGathers(VectorizableTree[0].get(),
VectorizableTree[0]->Scalars.size()) &&
@@ -12751,7 +13002,8 @@ bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
if (VectorizableTree[0]->isGather() ||
(VectorizableTree[1]->isGather() &&
VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
- VectorizableTree[0]->State != TreeEntry::StridedVectorize))
+ VectorizableTree[0]->State != TreeEntry::StridedVectorize &&
+ VectorizableTree[0]->State != TreeEntry::MaskedLoadCompressVectorize))
return false;
return true;
@@ -16867,6 +17119,40 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
Value *PO = LI->getPointerOperand();
if (E->State == TreeEntry::Vectorize) {
NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
+ } else if (E->State == TreeEntry::MaskedLoadCompressVectorize) {
+ SmallVector<Value *> PointerOps(E->Scalars.size());
+ for (auto [I, V] : enumerate(E->Scalars))
+ PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
+ bool IsMasked;
+ unsigned InterleaveFactor;
+ SmallVector<int> CompressMask;
+ VectorType *LoadVecTy;
+ [[maybe_unused]] bool IsVectorized = isMaskedLoadCompress(
+ E->Scalars, PointerOps, std::nullopt, *TTI, *DL, *SE, *AC, *DT,
+ *TLI, [](Value *) { return true; }, IsMasked, InterleaveFactor,
+ CompressMask, LoadVecTy);
+ assert(IsVectorized && "Expected to be vectorized");
+ Align CommonAlignment;
+ if (IsMasked)
+ CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
+ else
+ CommonAlignment = LI->getAlign();
+ if (IsMasked) {
+ SmallVector<Constant *> MaskValues(
+ getNumElements(LoadVecTy) / getNumElements(LI->getType()),
+ ConstantInt::getFalse(VecTy->getContext()));
+ for (int I : CompressMask)
+ MaskValues[I] = ConstantInt::getTrue(VecTy->getContext());
+ Constant *MaskValue = ConstantVector::get(MaskValues);
+ NewLI = Builder.CreateMaskedLoad(LoadVecTy, PO, CommonAlignment,
+ MaskValue);
+ } else {
+ NewLI = Builder.CreateAlignedLoad(LoadVecTy, PO, CommonAlignment);
+ }
+ NewLI = ::propagateMetadata(NewLI, E->Scalars);
+ // TODO: include this cost into CommonCost.
+ NewLI =
+ cast<Instruction>(Builder.CreateShuffleVector(NewLI, CompressMask));
} else if (E->State == TreeEntry::StridedVectorize) {
Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
@@ -16936,7 +17222,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
}
- Value *V = ::propagateMetadata(NewLI, E->Scalars);
+ Value *V = E->State == TreeEntry::MaskedLoadCompressVectorize
+ ? NewLI
+ : ::propagateMetadata(NewLI, E->Scalars);
V = FinalShuffle(V, E);
E->VectorizedValue = V;
@@ -17538,11 +17826,14 @@ Value *BoUpSLP::vectorizeTree(
ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U);
return !UseEntries.empty() &&
(E->State == TreeEntry::Vectorize ||
- E->State == TreeEntry::StridedVectorize) &&
+ E->State == TreeEntry::StridedVectorize ||
+ E->State == TreeEntry::MaskedLoadCompressVectorize) &&
any_of(UseEntries, [&, TTI = TTI](TreeEntry *UseEntry) {
return (UseEntry->State == TreeEntry::Vectorize ||
UseEntry->State ==
- TreeEntry::StridedVectorize) &&
+ TreeEntry::StridedVectorize ||
+ UseEntry->State ==
+ TreeEntry::MaskedLoadCompressVectorize) &&
doesInTreeUserNeedToExtract(
Scalar, getRootEntryInstruction(*UseEntry),
TLI, TTI);
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/entries-shuffled-diff-sizes.ll b/llvm/test/Transforms/SLPVectorizer/X86/entries-shuffled-diff-sizes.ll
index aa9195f8c48ce..b99a1c2d83394 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/entries-shuffled-diff-sizes.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/entries-shuffled-diff-sizes.ll
@@ -15,19 +15,16 @@ define void @test() {
; CHECK-NEXT: [[TMP2:%.*]] = fmul reassoc ninf nsz arcp contract afn float [[GEPLOAD1612]], [[TMP1]]
; CHECK-NEXT: [[TMP6:%.*]] = fmul reassoc ninf nsz arcp contract afn <16 x float> [[TMP4]], [[TMP0]]
; CHECK-NEXT: store <16 x float> [[TMP6]], ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 2928), align 16
-; CHECK-NEXT: [[TMP7:%.*]] = load <4 x float>, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1272), align 16
-; CHECK-NEXT: [[TMP8:%.*]] = load <2 x float>, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1288), align 16
-; CHECK-NEXT: [[TMP9:%.*]] = load <2 x float>, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1296), align 16
-; CHECK-NEXT: [[TMP13:%.*]] = load <8 x float>, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1304), align 16
+; CHECK-NEXT: [[TMP7:%.*]] = load <16 x float>, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1272), align 16
; CHECK-NEXT: [[TMP11:%.*]] = load <2 x float>, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1620), align 4
-; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> [[TMP8]], <16 x i32> <i32 poison, i32 0, i32 2, i32 1, i32 0, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <16 x float> [[TMP9]], <16 x float> [[TMP7]], <16 x i32> <i32 poison, i32 0, i32 20, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <16 x float> [[TMP19]], <16 x float> [[TMP12]], <16 x i32> <i32 1, i32 1, i32 17, i32 17, i32 18, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 19, i32 19, i32 19, i32 19>
-; CHECK-NEXT: [[TMP15:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v8f32(<16 x float> [[TMP12]], <8 x float> [[TMP13]], i64 8)
-; CHECK-NEXT: [[TMP16:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v4f32(<16 x float> [[TMP15]], <4 x float> [[TMP7]], i64 0)
-; CHECK-NEXT: [[TMP17:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v2f32(<16 x float> [[TMP16]], <2 x float> [[TMP9]], i64 6)
+; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <16 x float> [[TMP19]], <16 x float> [[TMP10]], <16 x i32> <i32 1, i32 1, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 19, i32 19, i32 19, i32 19, i32 18>
+; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <16 x float> [[TMP7]], <16 x float> [[TMP14]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 18>
; CHECK-NEXT: [[TMP18:%.*]] = fmul reassoc ninf nsz arcp contract afn <16 x float> [[TMP14]], [[TMP17]]
-; CHECK-NEXT: store <16 x float> [[TMP18]], ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 2992), align 16
+; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <16 x float> [[TMP18]], <16 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 15, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
+; CHECK-NEXT: store <16 x float> [[TMP15]], ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 2992), align 16
; CHECK-NEXT: ret void
;
alloca_0:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/gep-nodes-with-non-gep-inst.ll b/llvm/test/Transforms/SLPVectorizer/X86/gep-nodes-with-non-gep-inst.ll
index 12263b065d89c..80ba7a40fb193 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/gep-nodes-with-non-gep-inst.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/gep-nodes-with-non-gep-inst.ll
@@ -9,17 +9,9 @@ define void @test() {
; CHECK-NEXT: [[COND_IN_V:%.*]] = select i1 false, ptr null, ptr null
; CHECK-NEXT: br label [[BB:%.*]]
; CHECK: bb:
-; CHECK-NEXT: [[V:%.*]] = load i64, ptr [[COND_IN_V]], align 8
-; CHECK-NEXT: [[BV:%.*]] = icmp eq i64 [[V]], 0
-; CHECK-NEXT: [[IN_1:%.*]] = getelementptr i64, ptr [[COND_IN_V]], i64 4
-; CHECK-NEXT: [[V_1:%.*]] = load i64, ptr [[IN_1]], align 8
-; CHECK-NEXT: [[BV_1:%.*]] = icmp eq i64 [[V_1]], 0
-; CHECK-NEXT: [[IN_2:%.*]] = getelementptr i64, ptr [[COND_IN_V]], i64 8
-; CHECK-NEXT: [[V_2:%.*]] = load i64, ptr [[IN_2]], align 8
-; CHECK-NEXT: [[BV_2:%.*]] = icmp eq i64 [[V_2]], 0
-; CHECK-NEXT: [[IN_3:%.*]] = getelementptr i64, ptr [[COND_IN_V]], i64 12
-; CHECK-NEXT: [[V_3:%.*]] = load i64, ptr [[IN_3]], align 8
-; CHECK-NEXT: [[BV_3:%.*]] = icmp eq i64 [[V_3]], 0
+; CHECK-NEXT: [[TMP0:%.*]] = call <13 x i64> @llvm.masked.load.v13i64.p0(ptr [[COND_IN_V]], i32 8, <13 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true>, <13 x i64> poison)
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <13 x i64> [[TMP0]], <13 x i64> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq <4 x i64> [[TMP1]], zeroinitializer
; CHECK-NEXT: ret void
;
; CHECK-SLP-THRESHOLD-LABEL: define void @test
@@ -28,11 +20,9 @@ define void @test() {
; CHECK-SLP-THRESHOLD-NEXT: [[COND_IN_V:%.*]] = select i1 false, ptr null, ptr null
; CHECK-SLP-THRESHOLD-NEXT: br label [[BB:%.*]]
; CHECK-SLP-THRESHOLD: bb:
-; CHECK-SLP-THRESHOLD-NEXT: [[TMP0:%.*]] = insertelement <4 x ptr> poison, ptr [[COND_IN_V]], i32 0
-; CHECK-SLP-THRESHOLD-NEXT: [[TMP1:%.*]] = shufflevector <4 x ptr> [[TMP0]], <4 x ptr> poison, <4 x i32> zeroinitializer
-; CHECK-SLP-THRESHOLD-NEXT: [[TMP2:%.*]] = getelementptr i64, <4 x ptr> [[TMP1]], <4 x i64> <i64 12, i64 8, i64 4, i64 0>
-; CHECK-SLP-THRESHOLD-NEXT: [[TMP3:%.*]] = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> [[TMP2]], i32 8, <4 x i1> splat (i1 true), <4 x i64> poison)
-; CHECK-SLP-THRESHOLD-NEXT: [[TMP4:%.*]] = icmp eq <4 x i64> [[TMP3]], zeroinitializer
+; CHECK-SLP-THRESHOLD-NEXT: [[TMP0:%.*]] = call <13 x i64> @llvm.masked.load.v13i64.p0(ptr [[COND_IN_V]], i32 8, <13 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true>, <13 x i64> poison)
+; CHECK-SLP-THRESHOLD-NEXT: [[TMP1:%.*]] = shufflevector <13 x i64> [[TMP0]], <13 x i64> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; CHECK-SLP-THRESHOLD-NEXT: [[TMP2:%.*]] = icmp eq <4 x i64> [[TMP1]], zeroinitializer
; CHECK-SLP-THRESHOLD-NEXT: ret void
;
entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll
index f249394c91788..a9c0eb3f9f2b9 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll
@@ -24,20 +24,16 @@ define void @foo() {
; SSE-NEXT: ret void
;
; AVX-LABEL: @foo(
-; AVX-NEXT: [[TMP1:%.*]] = load i32, ptr @b, align 16
-; AVX-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr inbounds nuw (i8, ptr @b, i64 8), align 8
-; AVX-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> poison, i32 [[TMP1]], i64 0
-; AVX-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[TMP2]], i64 1
-; AVX-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+; AVX-NEXT: [[TMP1:%.*]] = load <3 x i32>, ptr @b, align 16
+; AVX-NEXT: [[TMP2:%.*]] = shufflevector <3 x i32> [[TMP1]], <3 x i32> poison, <2 x i32> <i32 0, i32 2>
+; AVX-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
; AVX-NEXT: store <8 x i32> [[TMP5]], ptr @a, align 16
; AVX-NEXT: ret void
;
; AVX512-LABEL: @foo(
-; AVX512-NEXT: [[TMP1:%.*]] = load i32, ptr @b, align 16
-; AVX512-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr inbounds nuw (i8, ptr @b, i64 8), align 8
-; AVX512-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> poison, i32 [[TMP1]], i64 0
-; AVX512-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[TMP2]], i64 1
-; AVX512-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+; AVX512-NEXT: [[TMP1:%.*]] = load <3 x i32>, ptr @b, align 16
+; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <3 x i32> [[TMP1]], <3 x i32> poison, <2 x i32> <i32 0, i32 2>
+; AVX512-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
; AVX512-NEXT: store <8 x i32> [[TMP5]], ptr @a, align 16
; AVX512-NEXT: ret void
;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll
index 925c334cb5f20..a0e52c13ec621 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll
@@ -164,36 +164,20 @@ define void @gather_load_2(ptr noalias nocapture %0, ptr noalias nocapture reado
;
; AVX512F-LABEL: @gather_load_2(
; AVX512F-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 4
-; AVX512F-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 40
-; AVX512F-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 12
-; AVX512F-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 20
-; AVX512F-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0
-; AVX512F-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i64 1
-; AVX512F-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i64 2
-; AVX512F-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3
-; AVX512F-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], <i32 1, i32 2, i32 3, i32 4>
-; AVX512F-NEXT: store <4 x i32> [[TMP15]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: [[TMP4:%.*]] = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr nonnull [[TMP3]], i32 4, <10 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true>, <10 x i32> poison), !tbaa [[TBAA0]]
+; AVX512F-NEXT: [[TMP5:%.*]] = shufflevector <10 x i32> [[TMP4]], <10 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 9>
+; AVX512F-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[TMP5]], <i32 1, i32 3, i32 4, i32 2>
+; AVX512F-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <4 x i32> <i32 0, i32 3, i32 1, i32 2>
+; AVX512F-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
; AVX512F-NEXT: ret void
;
; AVX512VL-LABEL: @gather_load_2(
; AVX512VL-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 4
-; AVX512VL-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]]
-; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 40
-; AVX512VL-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]]
-; AVX512VL-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 12
-; AVX512VL-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]]
-; AVX512VL-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 20
-; AVX512VL-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]]
-; AVX512VL-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0
-; AVX512VL-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i64 1
-; AVX512VL-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i64 2
-; AVX512VL-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3
-; AVX512VL-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], <i32 1, i32 2, i32 3, i32 4>
-; AVX512VL-NEXT: store <4 x i32> [[TMP15]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT: [[TMP4:%.*]] = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr nonnull [[TMP3]], i32 4, <10 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true>, <10 x i32> poison), !tbaa [[TBAA0]]
+; AVX512VL-NEXT: [[TMP5:%.*]] = shufflevector <10 x i32> [[TMP4]], <10 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 9>
+; AVX512VL-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[TMP5]], <i32 1, i32 3, i32 4, i32 2>
+; AVX512VL-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <4 x i32> <i32 0, i32 3, i32 1, i32 2>
+; AVX512VL-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
; AVX512VL-NEXT: ret void
;
%3 = getelementptr inbounds i32, ptr %1, i64 1
@@ -290,49 +274,30 @@ define void @gather_load_3(ptr noalias nocapture %0, ptr noalias nocapture reado
; AVX-NEXT: ret void
;
; AVX2-LABEL: @gather_load_3(
-; AVX2-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 44
-; AVX2-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 16
-; AVX2-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 60
-; AVX2-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 72
-; AVX2-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 36
-; AVX2-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 24
-; AVX2-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP14]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 84
-; AVX2-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT: [[TMP18:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i64 0
-; AVX2-NEXT: [[TMP19:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP5]], i64 1
-; AVX2-NEXT: [[TMP20:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP7]], i64 2
-; AVX2-NEXT: [[TMP21:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP9]], i64 3
-; AVX2-NEXT: [[TMP22:%.*]] = insertelement <8 x i32> [[TMP21]], i32 [[TMP11]], i64 4
-; AVX2-NEXT: [[TMP23:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP13]], i64 5
-; AVX2-NEXT: [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], i32 [[TMP15]], i64 6
-; AVX2-NEXT: [[TMP25:%.*]] = insertelement <8 x i32> [[TMP24]], i32 [[TMP17]], i64 7
+; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 24
+; AVX2-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP1]], i32 4, <16 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true>, <16 x i32> poison), !tbaa [[TBAA0]]
+; AVX2-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> [[TMP4]], <16 x i32> poison, <4 x i32> <i32 0, i32 11, i32 4, i32 15>
+; AVX2-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr nonnull [[TMP14]], i32 4, <16 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true>, <16 x i32> poison), !tbaa [[TBAA0]]
+; AVX2-NEXT: [[TMP7:%.*]] = shufflevector <16 x i32> [[TMP6]], <16 x i32> poison, <4 x i32> <i32 12, i32 3, i32 0, i32 15>
+; AVX2-NEXT: [[TMP25:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; AVX2-NEXT: [[TMP26:%.*]] = add <8 x i32> [[TMP25]], <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
; AVX2-NEXT: store <8 x i32> [[TMP26]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
; AVX2-NEXT: ret void
;
; AVX512F-LABEL: @gather_load_3(
-; AVX512F-NEXT: [[TMP3:%.*]] = insertelement <8 x ptr> poison, ptr [[TMP1:%.*]], i64 0
-; AVX512F-NEXT: [[TMP4:%.*]] = shufflevector <8 x ptr> [[TMP3]], <8 x ptr> poison, <8 x i32> zeroinitializer
-; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr i32, <8 x ptr> [[TMP4]], <8 x i64> <i64 0, i64 11, i64 4, i64 15, i64 18, i64 9, i64 6, i64 21>
-; AVX512F-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP5]], i32 4, <8 x i1> splat (i1 true), <8 x i32> poison), !tbaa [[TBAA0]]
-; AVX512F-NEXT: [[TMP7:%.*]] = add <8 x i32> [[TMP6]], <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
-; AVX512F-NEXT: store <8 x i32> [[TMP7]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: [[TMP3:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[TMP1:%.*]], i32 4, <22 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true>, <22 x i32> poison), !tbaa [[TBAA0]]
+; AVX512F-NEXT: [[TMP4:%.*]] = shufflevector <22 x i32> [[TMP3]], <22 x i32> poison, <8 x i32> <i32 0, i32 4, i32 6, i32 9, i32 11, i32 15, i32 18, i32 21>
+; AVX512F-NEXT: [[TMP5:%.*]] = add <8 x i32> [[TMP4]], <i32 1, i32 3, i32 3, i32 2, i32 2, i32 4, i32 1, i32 4>
+; AVX512F-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 6, i32 3, i32 2, i32 7>
+; AVX512F-NEXT: store <8 x i32> [[TMP6]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
; AVX512F-NEXT: ret void
;
; AVX512VL-LABEL: @gather_load_3(
-; AVX512VL-NEXT: [[TMP3:%.*]] = insertelement <8 x ptr> poison, ptr [[TMP1:%.*]], i64 0
-; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <8 x ptr> [[TMP3]], <8 x ptr> poison, <8 x i32> zeroinitializer
-; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr i32, <8 x ptr> [[TMP4]], <8 x i64> <i64 0, i64 11, i64 4, i64 15, i64 18, i64 9, i64 6, i64 21>
-; AVX512VL-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP5]], i32 4, <8 x i1> splat (i1 true), <8 x i32> poison), !tbaa [[TBAA0]]
-; AVX512VL-NEXT: [[TMP7:%.*]] = add <8 x i32> [[TMP6]], <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
-; AVX512VL-NEXT: store <8 x i32> [[TMP7]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT: [[TMP3:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[TMP1:%.*]], i32 4, <22 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true>, <22 x i32> poison), !tbaa [[TBAA0]]
+; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <22 x i32> [[TMP3]], <22 x i32> poison, <8 x i32> <i32 0, i32 4, i32 6, i32 9, i32 11, i32 15, i32 18, i32 21>
+; AVX512VL-NEXT: [[TMP5:%.*]] = add <8 x i32> [[TMP4]], <i32 1, i32 3, i32 3, i32 2, i32 2, i32 4, i32 1, i32 4>
+; AVX512VL-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 6, i32 3, i32 2, i32 7>
+; AVX512VL-NEXT: store <8 x i32> [[TMP6]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
; AVX512VL-NEXT: ret void
;
%3 = load i32, ptr %1, align 4, !tbaa !2
@@ -447,49 +412,30 @@ define void @gather_load_4(ptr noalias nocapture %t0, ptr noalias nocapture read
; AVX-NEXT: ret void
;
; AVX2-LABEL: @gather_load_4(
-; AVX2-NEXT: [[T6:%.*]] = getelementptr inbounds nuw i8, ptr [[T1:%.*]], i64 44
-; AVX2-NEXT: [[T10:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 16
-; AVX2-NEXT: [[T14:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 60
-; AVX2-NEXT: [[T18:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 72
-; AVX2-NEXT: [[T22:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 36
-; AVX2-NEXT: [[T26:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 24
-; AVX2-NEXT: [[T30:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 84
-; AVX2-NEXT: [[T3:%.*]] = load i32, ptr [[T1]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT: [[T7:%.*]] = load i32, ptr [[T6]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT: [[T11:%.*]] = load i32, ptr [[T10]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT: [[T15:%.*]] = load i32, ptr [[T14]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT: [[T19:%.*]] = load i32, ptr [[T18]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT: [[T23:%.*]] = load i32, ptr [[T22]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT: [[T27:%.*]] = load i32, ptr [[T26]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT: [[T31:%.*]] = load i32, ptr [[T30]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[T3]], i64 0
-; AVX2-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[T7]], i64 1
-; AVX2-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[T11]], i64 2
-; AVX2-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[T15]], i64 3
-; AVX2-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[T19]], i64 4
-; AVX2-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[T23]], i64 5
-; AVX2-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[T27]], i64 6
-; AVX2-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[T31]], i64 7
+; AVX2-NEXT: [[T26:%.*]] = getelementptr inbounds nuw i8, ptr [[T1:%.*]], i64 24
+; AVX2-NEXT: [[TMP1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[T1]], i32 4, <16 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true>, <16 x i32> poison), !tbaa [[TBAA0]]
+; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> poison, <4 x i32> <i32 0, i32 11, i32 4, i32 15>
+; AVX2-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr nonnull [[T26]], i32 4, <16 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true>, <16 x i32> poison), !tbaa [[TBAA0]]
+; AVX2-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> [[TMP3]], <16 x i32> poison, <4 x i32> <i32 12, i32 3, i32 0, i32 15>
+; AVX2-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; AVX2-NEXT: [[TMP9:%.*]] = add <8 x i32> [[TMP8]], <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
; AVX2-NEXT: store <8 x i32> [[TMP9]], ptr [[T0:%.*]], align 4, !tbaa [[TBAA0]]
; AVX2-NEXT: ret void
;
; AVX512F-LABEL: @gather_load_4(
-; AVX512F-NEXT: [[TMP1:%.*]] = insertelement <8 x ptr> poison, ptr [[T1:%.*]], i64 0
-; AVX512F-NEXT: [[TMP2:%.*]] = shufflevector <8 x ptr> [[TMP1]], <8 x ptr> poison, <8 x i32> zeroinitializer
-; AVX512F-NEXT: [[TMP3:%.*]] = getelementptr i32, <8 x ptr> [[TMP2]], <8 x i64> <i64 0, i64 11, i64 4, i64 15, i64 18, i64 9, i64 6, i64 21>
-; AVX512F-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP3]], i32 4, <8 x i1> splat (i1 true), <8 x i32> poison), !tbaa [[TBAA0]]
-; AVX512F-NEXT: [[TMP5:%.*]] = add <8 x i32> [[TMP4]], <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
-; AVX512F-NEXT: store <8 x i32> [[TMP5]], ptr [[T0:%.*]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: [[TMP1:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[T1:%.*]], i32 4, <22 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true>, <22 x i32> poison), !tbaa [[TBAA0]]
+; AVX512F-NEXT: [[TMP2:%.*]] = shufflevector <22 x i32> [[TMP1]], <22 x i32> poison, <8 x i32> <i32 0, i32 4, i32 6, i32 9, i32 11, i32 15, i32 18, i32 21>
+; AVX512F-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], <i32 1, i32 3, i32 3, i32 2, i32 2, i32 4, i32 1, i32 4>
+; AVX512F-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 6, i32 3, i32 2, i32 7>
+; AVX512F-NEXT: store <8 x i32> [[TMP4]], ptr [[T0:%.*]], align 4, !tbaa [[TBAA0]]
; AVX512F-NEXT: ret void
;
; AVX512VL-LABEL: @gather_load_4(
-; AVX512VL-NEXT: [[TMP1:%.*]] = insertelement <8 x ptr> poison, ptr [[T1:%.*]], i64 0
-; AVX512VL-NEXT: [[TMP2:%.*]] = shufflevector <8 x ptr> [[TMP1]], <8 x ptr> poison, <8 x i32> zeroinitializer
-; AVX512VL-NEXT: [[TMP3:%.*]] = getelementptr i32, <8 x ptr> [[TMP2]], <8 x i64> <i64 0, i64 11, i64 4, i64 15, i64 18, i64 9, i64 6, i64 21>
-; AVX512VL-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP3]], i32 4, <8 x i1> splat (i1 true), <8 x i32> poison), !tbaa [[TBAA0]]
-; AVX512VL-NEXT: [[TMP5:%.*]] = add <8 x i32> [[TMP4]], <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
-; AVX512VL-NEXT: store <8 x i32> [[TMP5]], ptr [[T0:%.*]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT: [[TMP1:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[T1:%.*]], i32 4, <22 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true>, <22 x i32> poison), !tbaa [[TBAA0]]
+; AVX512VL-NEXT: [[TMP2:%.*]] = shufflevector <22 x i32> [[TMP1]], <22 x i32> poison, <8 x i32> <i32 0, i32 4, i32 6, i32 9, i32 11, i32 15, i32 18, i32 21>
+; AVX512VL-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], <i32 1, i32 3, i32 3, i32 2, i32 2, i32 4, i32 1, i32 4>
+; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 6, i32 3, i32 2, i32 7>
+; AVX512VL-NEXT: store <8 x i32> [[TMP4]], ptr [[T0:%.*]], align 4, !tbaa [[TBAA0]]
; AVX512VL-NEXT: ret void
;
%t5 = getelementptr inbounds i32, ptr %t0, i64 1
@@ -687,25 +633,21 @@ define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture rea
; AVX2-NEXT: ret void
;
; AVX512F-LABEL: @gather_load_div(
-; AVX512F-NEXT: [[TMP3:%.*]] = insertelement <8 x ptr> poison, ptr [[TMP1:%.*]], i64 0
-; AVX512F-NEXT: [[TMP4:%.*]] = shufflevector <8 x ptr> [[TMP3]], <8 x ptr> poison, <8 x i32> zeroinitializer
-; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr float, <8 x ptr> [[TMP4]], <8 x i64> <i64 4, i64 13, i64 11, i64 44, i64 33, i64 30, i64 27, i64 23>
-; AVX512F-NEXT: [[TMP6:%.*]] = getelementptr float, <8 x ptr> [[TMP4]], <8 x i64> <i64 0, i64 10, i64 3, i64 14, i64 17, i64 8, i64 5, i64 20>
-; AVX512F-NEXT: [[TMP7:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> [[TMP6]], i32 4, <8 x i1> splat (i1 true), <8 x float> poison), !tbaa [[TBAA0]]
-; AVX512F-NEXT: [[TMP8:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> [[TMP5]], i32 4, <8 x i1> splat (i1 true), <8 x float> poison), !tbaa [[TBAA0]]
+; AVX512F-NEXT: [[TMP3:%.*]] = call <45 x float> @llvm.masked.load.v45f32.p0(ptr [[TMP1:%.*]], i32 4, <45 x i1> <i1 true, i1 false, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>, <45 x float> poison), !tbaa [[TBAA0]]
+; AVX512F-NEXT: [[TMP7:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> <i32 0, i32 3, i32 5, i32 8, i32 10, i32 14, i32 17, i32 20>
+; AVX512F-NEXT: [[TMP8:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> <i32 4, i32 11, i32 27, i32 30, i32 13, i32 44, i32 33, i32 23>
; AVX512F-NEXT: [[TMP9:%.*]] = fdiv <8 x float> [[TMP7]], [[TMP8]]
-; AVX512F-NEXT: store <8 x float> [[TMP9]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: [[TMP10:%.*]] = shufflevector <8 x float> [[TMP9]], <8 x float> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 6, i32 3, i32 2, i32 7>
+; AVX512F-NEXT: store <8 x float> [[TMP10]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
; AVX512F-NEXT: ret void
;
; AVX512VL-LABEL: @gather_load_div(
-; AVX512VL-NEXT: [[TMP3:%.*]] = insertelement <8 x ptr> poison, ptr [[TMP1:%.*]], i64 0
-; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <8 x ptr> [[TMP3]], <8 x ptr> poison, <8 x i32> zeroinitializer
-; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr float, <8 x ptr> [[TMP4]], <8 x i64> <i64 4, i64 13, i64 11, i64 44, i64 33, i64 30, i64 27, i64 23>
-; AVX512VL-NEXT: [[TMP6:%.*]] = getelementptr float, <8 x ptr> [[TMP4]], <8 x i64> <i64 0, i64 10, i64 3, i64 14, i64 17, i64 8, i64 5, i64 20>
-; AVX512VL-NEXT: [[TMP7:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> [[TMP6]], i32 4, <8 x i1> splat (i1 true), <8 x float> poison), !tbaa [[TBAA0]]
-; AVX512VL-NEXT: [[TMP8:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> [[TMP5]], i32 4, <8 x i1> splat (i1 true), <8 x float> poison), !tbaa [[TBAA0]]
+; AVX512VL-NEXT: [[TMP3:%.*]] = call <45 x float> @llvm.masked.load.v45f32.p0(ptr [[TMP1:%.*]], i32 4, <45 x i1> <i1 true, i1 false, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>, <45 x float> poison), !tbaa [[TBAA0]]
+; AVX512VL-NEXT: [[TMP7:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> <i32 0, i32 3, i32 5, i32 8, i32 10, i32 14, i32 17, i32 20>
+; AVX512VL-NEXT: [[TMP8:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> <i32 4, i32 11, i32 27, i32 30, i32 13, i32 44, i32 33, i32 23>
; AVX512VL-NEXT: [[TMP9:%.*]] = fdiv <8 x float> [[TMP7]], [[TMP8]]
-; AVX512VL-NEXT: store <8 x float> [[TMP9]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT: [[TMP10:%.*]] = shufflevector <8 x float> [[TMP9]], <8 x float> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 6, i32 3, i32 2, i32 7>
+; AVX512VL-NEXT: store <8 x float> [[TMP10]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
; AVX512VL-NEXT: ret void
;
%3 = load float, ptr %1, align 4, !tbaa !2
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll
index dc1ba4ec7e7ab..6c5638819dcea 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll
@@ -164,36 +164,20 @@ define void @gather_load_2(ptr noalias nocapture %0, ptr noalias nocapture reado
;
; AVX512F-LABEL: @gather_load_2(
; AVX512F-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 4
-; AVX512F-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 40
-; AVX512F-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 12
-; AVX512F-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 20
-; AVX512F-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0
-; AVX512F-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i64 1
-; AVX512F-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i64 2
-; AVX512F-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3
-; AVX512F-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], <i32 1, i32 2, i32 3, i32 4>
-; AVX512F-NEXT: store <4 x i32> [[TMP15]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: [[TMP4:%.*]] = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr nonnull [[TMP3]], i32 4, <10 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true>, <10 x i32> poison), !tbaa [[TBAA0]]
+; AVX512F-NEXT: [[TMP5:%.*]] = shufflevector <10 x i32> [[TMP4]], <10 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 9>
+; AVX512F-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[TMP5]], <i32 1, i32 3, i32 4, i32 2>
+; AVX512F-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <4 x i32> <i32 0, i32 3, i32 1, i32 2>
+; AVX512F-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
; AVX512F-NEXT: ret void
;
; AVX512VL-LABEL: @gather_load_2(
; AVX512VL-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 4
-; AVX512VL-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]]
-; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 40
-; AVX512VL-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]]
-; AVX512VL-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 12
-; AVX512VL-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]]
-; AVX512VL-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 20
-; AVX512VL-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]]
-; AVX512VL-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0
-; AVX512VL-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i64 1
-; AVX512VL-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i64 2
-; AVX512VL-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3
-; AVX512VL-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], <i32 1, i32 2, i32 3, i32 4>
-; AVX512VL-NEXT: store <4 x i32> [[TMP15]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT: [[TMP4:%.*]] = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr nonnull [[TMP3]], i32 4, <10 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true>, <10 x i32> poison), !tbaa [[TBAA0]]
+; AVX512VL-NEXT: [[TMP5:%.*]] = shufflevector <10 x i32> [[TMP4]], <10 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 9>
+; AVX512VL-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[TMP5]], <i32 1, i32 3, i32 4, i32 2>
+; AVX512VL-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <4 x i32> <i32 0, i32 3, i32 1, i32 2>
+; AVX512VL-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
; AVX512VL-NEXT: ret void
;
%3 = getelementptr inbounds i32, ptr %1, i64 1
@@ -290,49 +274,30 @@ define void @gather_load_3(ptr noalias nocapture %0, ptr noalias nocapture reado
; AVX-NEXT: ret void
;
; AVX2-LABEL: @gather_load_3(
-; AVX2-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 44
-; AVX2-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 16
-; AVX2-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 60
-; AVX2-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 72
-; AVX2-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 36
-; AVX2-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 24
-; AVX2-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP14]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 84
-; AVX2-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT: [[TMP18:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i64 0
-; AVX2-NEXT: [[TMP19:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP5]], i64 1
-; AVX2-NEXT: [[TMP20:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP7]], i64 2
-; AVX2-NEXT: [[TMP21:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP9]], i64 3
-; AVX2-NEXT: [[TMP22:%.*]] = insertelement <8 x i32> [[TMP21]], i32 [[TMP11]], i64 4
-; AVX2-NEXT: [[TMP23:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP13]], i64 5
-; AVX2-NEXT: [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], i32 [[TMP15]], i64 6
-; AVX2-NEXT: [[TMP25:%.*]] = insertelement <8 x i32> [[TMP24]], i32 [[TMP17]], i64 7
+; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 24
+; AVX2-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP1]], i32 4, <16 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true>, <16 x i32> poison), !tbaa [[TBAA0]]
+; AVX2-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> [[TMP4]], <16 x i32> poison, <4 x i32> <i32 0, i32 11, i32 4, i32 15>
+; AVX2-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr nonnull [[TMP14]], i32 4, <16 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true>, <16 x i32> poison), !tbaa [[TBAA0]]
+; AVX2-NEXT: [[TMP7:%.*]] = shufflevector <16 x i32> [[TMP6]], <16 x i32> poison, <4 x i32> <i32 12, i32 3, i32 0, i32 15>
+; AVX2-NEXT: [[TMP25:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; AVX2-NEXT: [[TMP26:%.*]] = add <8 x i32> [[TMP25]], <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
; AVX2-NEXT: store <8 x i32> [[TMP26]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
; AVX2-NEXT: ret void
;
; AVX512F-LABEL: @gather_load_3(
-; AVX512F-NEXT: [[TMP3:%.*]] = insertelement <8 x ptr> poison, ptr [[TMP1:%.*]], i64 0
-; AVX512F-NEXT: [[TMP4:%.*]] = shufflevector <8 x ptr> [[TMP3]], <8 x ptr> poison, <8 x i32> zeroinitializer
-; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr i32, <8 x ptr> [[TMP4]], <8 x i64> <i64 0, i64 11, i64 4, i64 15, i64 18, i64 9, i64 6, i64 21>
-; AVX512F-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP5]], i32 4, <8 x i1> splat (i1 true), <8 x i32> poison), !tbaa [[TBAA0]]
-; AVX512F-NEXT: [[TMP7:%.*]] = add <8 x i32> [[TMP6]], <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
-; AVX512F-NEXT: store <8 x i32> [[TMP7]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: [[TMP3:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[TMP1:%.*]], i32 4, <22 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true>, <22 x i32> poison), !tbaa [[TBAA0]]
+; AVX512F-NEXT: [[TMP4:%.*]] = shufflevector <22 x i32> [[TMP3]], <22 x i32> poison, <8 x i32> <i32 0, i32 4, i32 6, i32 9, i32 11, i32 15, i32 18, i32 21>
+; AVX512F-NEXT: [[TMP5:%.*]] = add <8 x i32> [[TMP4]], <i32 1, i32 3, i32 3, i32 2, i32 2, i32 4, i32 1, i32 4>
+; AVX512F-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 6, i32 3, i32 2, i32 7>
+; AVX512F-NEXT: store <8 x i32> [[TMP6]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
; AVX512F-NEXT: ret void
;
; AVX512VL-LABEL: @gather_load_3(
-; AVX512VL-NEXT: [[TMP3:%.*]] = insertelement <8 x ptr> poison, ptr [[TMP1:%.*]], i64 0
-; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <8 x ptr> [[TMP3]], <8 x ptr> poison, <8 x i32> zeroinitializer
-; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr i32, <8 x ptr> [[TMP4]], <8 x i64> <i64 0, i64 11, i64 4, i64 15, i64 18, i64 9, i64 6, i64 21>
-; AVX512VL-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP5]], i32 4, <8 x i1> splat (i1 true), <8 x i32> poison), !tbaa [[TBAA0]]
-; AVX512VL-NEXT: [[TMP7:%.*]] = add <8 x i32> [[TMP6]], <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
-; AVX512VL-NEXT: store <8 x i32> [[TMP7]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT: [[TMP3:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[TMP1:%.*]], i32 4, <22 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true>, <22 x i32> poison), !tbaa [[TBAA0]]
+; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <22 x i32> [[TMP3]], <22 x i32> poison, <8 x i32> <i32 0, i32 4, i32 6, i32 9, i32 11, i32 15, i32 18, i32 21>
+; AVX512VL-NEXT: [[TMP5:%.*]] = add <8 x i32> [[TMP4]], <i32 1, i32 3, i32 3, i32 2, i32 2, i32 4, i32 1, i32 4>
+; AVX512VL-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 6, i32 3, i32 2, i32 7>
+; AVX512VL-NEXT: store <8 x i32> [[TMP6]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
; AVX512VL-NEXT: ret void
;
%3 = load i32, ptr %1, align 4, !tbaa !2
@@ -447,49 +412,30 @@ define void @gather_load_4(ptr noalias nocapture %t0, ptr noalias nocapture read
; AVX-NEXT: ret void
;
; AVX2-LABEL: @gather_load_4(
-; AVX2-NEXT: [[T6:%.*]] = getelementptr inbounds nuw i8, ptr [[T1:%.*]], i64 44
-; AVX2-NEXT: [[T10:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 16
-; AVX2-NEXT: [[T14:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 60
-; AVX2-NEXT: [[T18:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 72
-; AVX2-NEXT: [[T22:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 36
-; AVX2-NEXT: [[T26:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 24
-; AVX2-NEXT: [[T30:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 84
-; AVX2-NEXT: [[T3:%.*]] = load i32, ptr [[T1]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT: [[T7:%.*]] = load i32, ptr [[T6]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT: [[T11:%.*]] = load i32, ptr [[T10]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT: [[T15:%.*]] = load i32, ptr [[T14]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT: [[T19:%.*]] = load i32, ptr [[T18]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT: [[T23:%.*]] = load i32, ptr [[T22]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT: [[T27:%.*]] = load i32, ptr [[T26]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT: [[T31:%.*]] = load i32, ptr [[T30]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[T3]], i64 0
-; AVX2-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[T7]], i64 1
-; AVX2-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[T11]], i64 2
-; AVX2-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[T15]], i64 3
-; AVX2-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[T19]], i64 4
-; AVX2-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[T23]], i64 5
-; AVX2-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[T27]], i64 6
-; AVX2-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[T31]], i64 7
+; AVX2-NEXT: [[T26:%.*]] = getelementptr inbounds nuw i8, ptr [[T1:%.*]], i64 24
+; AVX2-NEXT: [[TMP1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[T1]], i32 4, <16 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true>, <16 x i32> poison), !tbaa [[TBAA0]]
+; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> poison, <4 x i32> <i32 0, i32 11, i32 4, i32 15>
+; AVX2-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr nonnull [[T26]], i32 4, <16 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true>, <16 x i32> poison), !tbaa [[TBAA0]]
+; AVX2-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> [[TMP3]], <16 x i32> poison, <4 x i32> <i32 12, i32 3, i32 0, i32 15>
+; AVX2-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; AVX2-NEXT: [[TMP9:%.*]] = add <8 x i32> [[TMP8]], <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
; AVX2-NEXT: store <8 x i32> [[TMP9]], ptr [[T0:%.*]], align 4, !tbaa [[TBAA0]]
; AVX2-NEXT: ret void
;
; AVX512F-LABEL: @gather_load_4(
-; AVX512F-NEXT: [[TMP1:%.*]] = insertelement <8 x ptr> poison, ptr [[T1:%.*]], i64 0
-; AVX512F-NEXT: [[TMP2:%.*]] = shufflevector <8 x ptr> [[TMP1]], <8 x ptr> poison, <8 x i32> zeroinitializer
-; AVX512F-NEXT: [[TMP3:%.*]] = getelementptr i32, <8 x ptr> [[TMP2]], <8 x i64> <i64 0, i64 11, i64 4, i64 15, i64 18, i64 9, i64 6, i64 21>
-; AVX512F-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP3]], i32 4, <8 x i1> splat (i1 true), <8 x i32> poison), !tbaa [[TBAA0]]
-; AVX512F-NEXT: [[TMP5:%.*]] = add <8 x i32> [[TMP4]], <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
-; AVX512F-NEXT: store <8 x i32> [[TMP5]], ptr [[T0:%.*]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: [[TMP1:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[T1:%.*]], i32 4, <22 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true>, <22 x i32> poison), !tbaa [[TBAA0]]
+; AVX512F-NEXT: [[TMP2:%.*]] = shufflevector <22 x i32> [[TMP1]], <22 x i32> poison, <8 x i32> <i32 0, i32 4, i32 6, i32 9, i32 11, i32 15, i32 18, i32 21>
+; AVX512F-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], <i32 1, i32 3, i32 3, i32 2, i32 2, i32 4, i32 1, i32 4>
+; AVX512F-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 6, i32 3, i32 2, i32 7>
+; AVX512F-NEXT: store <8 x i32> [[TMP4]], ptr [[T0:%.*]], align 4, !tbaa [[TBAA0]]
; AVX512F-NEXT: ret void
;
; AVX512VL-LABEL: @gather_load_4(
-; AVX512VL-NEXT: [[TMP1:%.*]] = insertelement <8 x ptr> poison, ptr [[T1:%.*]], i64 0
-; AVX512VL-NEXT: [[TMP2:%.*]] = shufflevector <8 x ptr> [[TMP1]], <8 x ptr> poison, <8 x i32> zeroinitializer
-; AVX512VL-NEXT: [[TMP3:%.*]] = getelementptr i32, <8 x ptr> [[TMP2]], <8 x i64> <i64 0, i64 11, i64 4, i64 15, i64 18, i64 9, i64 6, i64 21>
-; AVX512VL-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP3]], i32 4, <8 x i1> splat (i1 true), <8 x i32> poison), !tbaa [[TBAA0]]
-; AVX512VL-NEXT: [[TMP5:%.*]] = add <8 x i32> [[TMP4]], <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
-; AVX512VL-NEXT: store <8 x i32> [[TMP5]], ptr [[T0:%.*]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT: [[TMP1:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[T1:%.*]], i32 4, <22 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true>, <22 x i32> poison), !tbaa [[TBAA0]]
+; AVX512VL-NEXT: [[TMP2:%.*]] = shufflevector <22 x i32> [[TMP1]], <22 x i32> poison, <8 x i32> <i32 0, i32 4, i32 6, i32 9, i32 11, i32 15, i32 18, i32 21>
+; AVX512VL-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], <i32 1, i32 3, i32 3, i32 2, i32 2, i32 4, i32 1, i32 4>
+; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 6, i32 3, i32 2, i32 7>
+; AVX512VL-NEXT: store <8 x i32> [[TMP4]], ptr [[T0:%.*]], align 4, !tbaa [[TBAA0]]
; AVX512VL-NEXT: ret void
;
%t5 = getelementptr inbounds i32, ptr %t0, i64 1
@@ -687,25 +633,21 @@ define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture rea
; AVX2-NEXT: ret void
;
; AVX512F-LABEL: @gather_load_div(
-; AVX512F-NEXT: [[TMP3:%.*]] = insertelement <8 x ptr> poison, ptr [[TMP1:%.*]], i64 0
-; AVX512F-NEXT: [[TMP4:%.*]] = shufflevector <8 x ptr> [[TMP3]], <8 x ptr> poison, <8 x i32> zeroinitializer
-; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr float, <8 x ptr> [[TMP4]], <8 x i64> <i64 4, i64 13, i64 11, i64 44, i64 33, i64 30, i64 27, i64 23>
-; AVX512F-NEXT: [[TMP6:%.*]] = getelementptr float, <8 x ptr> [[TMP4]], <8 x i64> <i64 0, i64 10, i64 3, i64 14, i64 17, i64 8, i64 5, i64 20>
-; AVX512F-NEXT: [[TMP7:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> [[TMP6]], i32 4, <8 x i1> splat (i1 true), <8 x float> poison), !tbaa [[TBAA0]]
-; AVX512F-NEXT: [[TMP8:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> [[TMP5]], i32 4, <8 x i1> splat (i1 true), <8 x float> poison), !tbaa [[TBAA0]]
+; AVX512F-NEXT: [[TMP3:%.*]] = call <45 x float> @llvm.masked.load.v45f32.p0(ptr [[TMP1:%.*]], i32 4, <45 x i1> <i1 true, i1 false, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>, <45 x float> poison), !tbaa [[TBAA0]]
+; AVX512F-NEXT: [[TMP7:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> <i32 0, i32 3, i32 5, i32 8, i32 10, i32 14, i32 17, i32 20>
+; AVX512F-NEXT: [[TMP8:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> <i32 4, i32 11, i32 27, i32 30, i32 13, i32 44, i32 33, i32 23>
; AVX512F-NEXT: [[TMP9:%.*]] = fdiv <8 x float> [[TMP7]], [[TMP8]]
-; AVX512F-NEXT: store <8 x float> [[TMP9]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: [[TMP10:%.*]] = shufflevector <8 x float> [[TMP9]], <8 x float> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 6, i32 3, i32 2, i32 7>
+; AVX512F-NEXT: store <8 x float> [[TMP10]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
; AVX512F-NEXT: ret void
;
; AVX512VL-LABEL: @gather_load_div(
-; AVX512VL-NEXT: [[TMP3:%.*]] = insertelement <8 x ptr> poison, ptr [[TMP1:%.*]], i64 0
-; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <8 x ptr> [[TMP3]], <8 x ptr> poison, <8 x i32> zeroinitializer
-; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr float, <8 x ptr> [[TMP4]], <8 x i64> <i64 4, i64 13, i64 11, i64 44, i64 33, i64 30, i64 27, i64 23>
-; AVX512VL-NEXT: [[TMP6:%.*]] = getelementptr float, <8 x ptr> [[TMP4]], <8 x i64> <i64 0, i64 10, i64 3, i64 14, i64 17, i64 8, i64 5, i64 20>
-; AVX512VL-NEXT: [[TMP7:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> [[TMP6]], i32 4, <8 x i1> splat (i1 true), <8 x float> poison), !tbaa [[TBAA0]]
-; AVX512VL-NEXT: [[TMP8:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> [[TMP5]], i32 4, <8 x i1> splat (i1 true), <8 x float> poison), !tbaa [[TBAA0]]
+; AVX512VL-NEXT: [[TMP3:%.*]] = call <45 x float> @llvm.masked.load.v45f32.p0(ptr [[TMP1:%.*]], i32 4, <45 x i1> <i1 true, i1 false, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>, <45 x float> poison), !tbaa [[TBAA0]]
+; AVX512VL-NEXT: [[TMP7:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> <i32 0, i32 3, i32 5, i32 8, i32 10, i32 14, i32 17, i32 20>
+; AVX512VL-NEXT: [[TMP8:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> <i32 4, i32 11, i32 27, i32 30, i32 13, i32 44, i32 33, i32 23>
; AVX512VL-NEXT: [[TMP9:%.*]] = fdiv <8 x float> [[TMP7]], [[TMP8]]
-; AVX512VL-NEXT: store <8 x float> [[TMP9]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT: [[TMP10:%.*]] = shufflevector <8 x float> [[TMP9]], <8 x float> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 6, i32 3, i32 2, i32 7>
+; AVX512VL-NEXT: store <8 x float> [[TMP10]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
; AVX512VL-NEXT: ret void
;
%3 = load float, ptr %1, align 4, !tbaa !2
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll b/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll
index 0807a1bd4cdea..bbb1b87fc3dfa 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll
@@ -5,10 +5,9 @@
define i32 @test(ptr noalias %p, ptr noalias %addr) {
; CHECK-LABEL: @test(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x ptr> poison, ptr [[ADDR:%.*]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x ptr> [[TMP0]], <8 x ptr> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, <8 x ptr> [[TMP1]], <8 x i32> <i32 3, i32 1, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP2]], i32 8, <8 x i1> splat (i1 true), <8 x i32> poison)
+; CHECK-NEXT: [[OFF0_1:%.*]] = getelementptr inbounds i32, ptr [[ADDR:%.*]], i32 1
+; CHECK-NEXT: [[TMP0:%.*]] = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr [[OFF0_1]], i32 8, <15 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true>, <15 x i32> poison)
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <15 x i32> [[TMP0]], <15 x i32> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x ptr> poison, ptr [[P:%.*]], i32 0
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x ptr> [[TMP4]], <8 x ptr> poison, <8 x i32> zeroinitializer
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, <8 x ptr> [[TMP5]], <8 x i32> [[TMP3]]
@@ -22,9 +21,9 @@ define i32 @test(ptr noalias %p, ptr noalias %addr) {
; YAML-NEXT: Function: test
; YAML-NEXT: Args:
; YAML-NEXT: - String: 'Vectorized horizontal reduction with cost '
- ; YAML-NEXT: - Cost: '-1'
+ ; YAML-NEXT: - Cost: '-10'
; YAML-NEXT: - String: ' and with tree size '
- ; YAML-NEXT: - TreeSize: '8'
+ ; YAML-NEXT: - TreeSize: '5'
entry:
%off0.1 = getelementptr inbounds i32, ptr %addr, i32 1
%idx0 = load i32, ptr %off0.1, align 8
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder-possible-strided-node.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder-possible-strided-node.ll
index 5bd954e741d43..02058b1fe8578 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reorder-possible-strided-node.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder-possible-strided-node.ll
@@ -5,16 +5,17 @@ define void @test() {
; CHECK-LABEL: define void @test(
; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i32, ptr null, i64 33
+; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i32, ptr null, i64 1
; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
-; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i32, ptr null, i64 7
+; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i32, ptr null, i64 33
; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX13]], align 4
; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr i32, ptr null, i64 60
-; CHECK-NEXT: [[TMP8:%.*]] = load <2 x i32>, ptr null, align 4
+; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr null, i32 4, <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>, <8 x i32> poison)
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> poison, <2 x i32> <i32 0, i32 7>
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX22]], align 4
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 1>
-; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP2]], i32 1
-; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[TMP9]], i32 2
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 2
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP9]], i32 3
+; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP6]], <2 x i32> [[TMP10]], i64 0)
; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP1]], [[TMP7]]
; CHECK-NEXT: [[TMP4:%.*]] = ashr <4 x i32> [[TMP3]], zeroinitializer
; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@@ -64,16 +65,17 @@ define void @test1() {
; CHECK-LABEL: define void @test1(
; CHECK-SAME: ) #[[ATTR0]] {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i32, ptr null, i64 33
+; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i32, ptr null, i64 1
; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
-; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i32, ptr null, i64 7
+; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i32, ptr null, i64 33
; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX13]], align 4
; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr i32, ptr null, i64 60
-; CHECK-NEXT: [[TMP8:%.*]] = load <2 x i32>, ptr null, align 4
+; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr null, i32 4, <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>, <8 x i32> poison)
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> poison, <2 x i32> <i32 0, i32 7>
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX22]], align 4
-; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 1>
-; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP2]], i32 1
-; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i32 2
+; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 2
+; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP9]], i32 3
+; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP12]], <2 x i32> [[TMP10]], i64 0)
; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP1]], [[TMP7]]
; CHECK-NEXT: [[TMP4:%.*]] = sext <4 x i32> [[TMP3]] to <4 x i64>
; CHECK-NEXT: [[TMP5:%.*]] = lshr <4 x i64> [[TMP4]], zeroinitializer
@@ -125,16 +127,17 @@ define void @test_div() {
; CHECK-LABEL: define void @test_div(
; CHECK-SAME: ) #[[ATTR0]] {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i32, ptr null, i64 33
+; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i32, ptr null, i64 1
; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
-; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i32, ptr null, i64 7
+; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i32, ptr null, i64 33
; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX13]], align 4
; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr i32, ptr null, i64 60
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr null, align 4
+; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr null, i32 4, <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>, <8 x i32> poison)
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> poison, <2 x i32> <i32 0, i32 7>
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX22]], align 4
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 1>
-; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP2]], i32 1
-; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP4]], i32 2
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 2
+; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP4]], i32 3
+; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP11]], <2 x i32> [[TMP3]], i64 0)
; CHECK-NEXT: [[TMP9:%.*]] = mul <4 x i32> [[TMP1]], [[TMP7]]
; CHECK-NEXT: [[TMP10:%.*]] = udiv <4 x i32> [[TMP9]], <i32 2, i32 1, i32 2, i32 1>
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@@ -184,16 +187,17 @@ define void @test_rem() {
; CHECK-LABEL: define void @test_rem(
; CHECK-SAME: ) #[[ATTR0]] {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i32, ptr null, i64 33
+; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i32, ptr null, i64 1
; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
-; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i32, ptr null, i64 7
+; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i32, ptr null, i64 33
; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX13]], align 4
; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr i32, ptr null, i64 60
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr null, align 4
+; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr null, i32 4, <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>, <8 x i32> poison)
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> poison, <2 x i32> <i32 0, i32 7>
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX22]], align 4
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 1>
-; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP2]], i32 1
-; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP4]], i32 2
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 2
+; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP4]], i32 3
+; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP11]], <2 x i32> [[TMP3]], i64 0)
; CHECK-NEXT: [[TMP9:%.*]] = mul <4 x i32> [[TMP1]], [[TMP7]]
; CHECK-NEXT: [[TMP10:%.*]] = urem <4 x i32> [[TMP9]], <i32 1, i32 1, i32 2, i32 1>
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather.ll
index c7c67d31f9ded..73b6c80730935 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather.ll
@@ -3,14 +3,12 @@
define void @test(ptr noalias %0, ptr %p) {
; CHECK-LABEL: @test(
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x ptr> poison, ptr [[P:%.*]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x ptr> [[TMP2]], <8 x ptr> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr float, <8 x ptr> [[TMP3]], <8 x i64> <i64 15, i64 4, i64 5, i64 0, i64 2, i64 6, i64 7, i64 8>
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP0:%.*]], i64 2
-; CHECK-NEXT: [[TMP6:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> [[TMP4]], i32 4, <8 x i1> splat (i1 true), <8 x float> poison)
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x float> [[TMP6]], <8 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 0, i32 1, i32 2, i32 0, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 5, i32 6, i32 7>
-; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x float> [[TMP6]], <8 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <16 x float> [[TMP8]], <16 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison, float 0.000000e+00, float poison, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 1, i32 24, i32 0, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; CHECK-NEXT: [[TMP3:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr [[P:%.*]], i32 4, <16 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>, <16 x float> poison)
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x float> [[TMP3]], <16 x float> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 5, i32 6, i32 7, i32 8, i32 15>
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <16 x float> [[TMP3]], <16 x float> poison, <8 x i32> <i32 15, i32 4, i32 5, i32 0, i32 2, i32 6, i32 7, i32 8>
+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <16 x float> [[TMP3]], <16 x float> poison, <16 x i32> <i32 15, i32 4, i32 5, i32 15, i32 4, i32 5, i32 15, i32 0, i32 5, i32 2, i32 6, i32 7, i32 8, i32 6, i32 7, i32 8>
+; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <16 x float> [[TMP3]], <16 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison, float 0.000000e+00, float poison, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 4, i32 24, i32 15, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
; CHECK-NEXT: [[TMP10:%.*]] = fadd reassoc nsz arcp contract afn <16 x float> [[TMP7]], [[TMP9]]
; CHECK-NEXT: store <16 x float> [[TMP10]], ptr [[TMP5]], align 4
; CHECK-NEXT: ret void
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll
index c114c5dee78e9..92d5506977aeb 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll
@@ -8,14 +8,11 @@ define void @"foo"(ptr addrspace(1) %0, ptr addrspace(1) %1) #0 {
; CHECK-LABEL: @foo(
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP0:%.*]], i64 8
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP1:%.*]], i64 8
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP0]], i64 24
-; CHECK-NEXT: [[TMP6:%.*]] = load <2 x float>, ptr addrspace(1) [[TMP3]], align 4
-; CHECK-NEXT: [[TMP7:%.*]] = load <2 x float>, ptr addrspace(1) [[TMP5]], align 4
-; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT: [[TMP5:%.*]] = call <6 x float> @llvm.masked.load.v6f32.p1(ptr addrspace(1) [[TMP3]], i32 4, <6 x i1> <i1 true, i1 true, i1 false, i1 false, i1 true, i1 true>, <6 x float> poison)
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <6 x float> [[TMP5]], <6 x float> poison, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <6 x float> [[TMP5]], <6 x float> poison, <4 x i32> <i32 0, i32 1, i32 5, i32 4>
; CHECK-NEXT: [[TMP9:%.*]] = load <8 x float>, ptr addrspace(1) [[TMP4]], align 4
-; CHECK-NEXT: [[TMP10:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> poison, <2 x float> [[TMP6]], i64 0)
-; CHECK-NEXT: [[TMP11:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP10]], <2 x float> [[TMP8]], i64 2)
-; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x float> [[TMP11]], <4 x float> poison, <8 x i32> <i32 0, i32 3, i32 0, i32 3, i32 2, i32 1, i32 2, i32 1>
+; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <6 x float> [[TMP5]], <6 x float> poison, <8 x i32> <i32 0, i32 4, i32 0, i32 4, i32 5, i32 1, i32 5, i32 1>
; CHECK-NEXT: [[TMP13:%.*]] = fmul <8 x float> [[TMP12]], [[TMP9]]
; CHECK-NEXT: [[TMP14:%.*]] = fadd <8 x float> [[TMP13]], zeroinitializer
; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <8 x float> [[TMP14]], <8 x float> poison, <8 x i32> <i32 0, i32 5, i32 2, i32 7, i32 4, i32 1, i32 6, i32 3>
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll
index 1294a87ff6967..d487e3616956c 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll
@@ -5,16 +5,12 @@ define void @test(i1 %c, ptr %arg) {
; CHECK-LABEL: @test(
; CHECK-NEXT: br i1 [[C:%.*]], label [[IF:%.*]], label [[ELSE:%.*]]
; CHECK: if:
-; CHECK-NEXT: [[ARG2_2:%.*]] = getelementptr inbounds i8, ptr [[ARG:%.*]], i64 24
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARG]], align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARG2_2]], align 8
-; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP1]], <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-NEXT: [[TMP1:%.*]] = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr [[ARG:%.*]], i32 8, <5 x i1> <i1 true, i1 true, i1 false, i1 true, i1 true>, <5 x i64> poison)
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <5 x i64> [[TMP1]], <5 x i64> poison, <4 x i32> <i32 0, i32 1, i32 3, i32 4>
; CHECK-NEXT: br label [[JOIN:%.*]]
; CHECK: else:
-; CHECK-NEXT: [[ARG_2:%.*]] = getelementptr inbounds i8, ptr [[ARG]], i64 24
-; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i64>, ptr [[ARG]], align 8
-; CHECK-NEXT: [[TMP9:%.*]] = load <2 x i64>, ptr [[ARG_2]], align 8
-; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i64> [[TMP9]], <2 x i64> [[TMP7]], <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-NEXT: [[TMP3:%.*]] = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr [[ARG]], i32 8, <5 x i1> <i1 true, i1 true, i1 false, i1 true, i1 true>, <5 x i64> poison)
+; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <5 x i64> [[TMP3]], <5 x i64> poison, <4 x i32> <i32 0, i32 1, i32 3, i32 4>
; CHECK-NEXT: br label [[JOIN]]
; CHECK: join:
; CHECK-NEXT: [[TMP13:%.*]] = phi <4 x i64> [ [[TMP6]], [[IF]] ], [ [[TMP12]], [[ELSE]] ]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/sin-sqrt.ll b/llvm/test/Transforms/SLPVectorizer/X86/sin-sqrt.ll
index e1e80d96d416d..b4996eb58b47e 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/sin-sqrt.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/sin-sqrt.ll
@@ -19,11 +19,11 @@ define void @test() {
; CHECK-NEXT: [[SIN1:%.*]] = call fast double @llvm.sin.f64(double [[A3]])
; CHECK-NEXT: [[SIN2:%.*]] = call fast double @llvm.sin.f64(double [[A6]])
; CHECK-NEXT: [[SIN3:%.*]] = call fast double @llvm.sin.f64(double [[A7]])
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr @src, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 4), align 8
-; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP2]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT: [[TMP1:%.*]] = load <6 x double>, ptr @src, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP1]], <6 x double> poison, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <6 x double> [[TMP1]], <6 x double> poison, <2 x i32> <i32 0, i32 4>
; CHECK-NEXT: [[TMP7:%.*]] = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP6]])
-; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP2]], <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <6 x double> [[TMP1]], <6 x double> poison, <2 x i32> <i32 1, i32 5>
; CHECK-NEXT: [[TMP9:%.*]] = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP8]])
; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x double> poison, double [[SIN1]], i32 0
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP13]], double [[SIN3]], i32 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll b/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll
index 202ec9633712f..3f684e414c8ba 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll
@@ -8,15 +8,10 @@ define dso_local void @_Z4testP1S(ptr %p) local_unnamed_addr {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], ptr [[P:%.*]], i64 0, i32 1, i64 0
; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 0, i32 2, i64 4
-; CHECK-NEXT: [[ARRAYIDX27:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 0, i32 2, i64 12
-; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 0, i32 2, i64 14
-; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[ARRAYIDX27]], align 4
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[ARRAYIDX41]], align 4
; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX20]], align 4
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP3]], <8 x i32> <i32 1, i32 7, i32 6, i32 4, i32 poison, i32 poison, i32 0, i32 5>
-; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP5]], <2 x i32> [[TMP0]], i64 4)
+; CHECK-NEXT: [[TMP1:%.*]] = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr [[ARRAYIDX20]], i32 4, <12 x i1> <i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>, <12 x i32> poison)
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <12 x i32> [[TMP1]], <12 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <12 x i32> [[TMP1]], <12 x i32> poison, <8 x i32> <i32 11, i32 3, i32 2, i32 0, i32 8, i32 9, i32 10, i32 1>
; CHECK-NEXT: [[TMP7:%.*]] = add nsw <8 x i32> [[TMP6]], [[TMP2]]
; CHECK-NEXT: store <8 x i32> [[TMP7]], ptr [[P]], align 4
; CHECK-NEXT: ret void
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2_unord_geps.ll b/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2_unord_geps.ll
index 8fe7d15b69cb1..fdc0bc0e00eb8 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2_unord_geps.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2_unord_geps.ll
@@ -4,16 +4,15 @@
define void @test(ptr noalias %p, ptr noalias %addr, ptr noalias %s) {
; CHECK-LABEL: @test(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x ptr> poison, ptr [[ADDR:%.*]], i32 0
-; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <8 x ptr> [[TMP0]], <8 x ptr> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, <8 x ptr> [[SHUFFLE1]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, <8 x ptr> [[SHUFFLE1]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP2]], i32 8, <8 x i1> splat (i1 true), <8 x i32> poison)
+; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i32, ptr [[ADDR:%.*]], i32 1
+; CHECK-NEXT: [[TMP0:%.*]] = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr [[ADDR]], i32 8, <15 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true>, <15 x i32> poison)
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <15 x i32> [[TMP0]], <15 x i32> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x ptr> poison, ptr [[P:%.*]], i32 0
; CHECK-NEXT: [[SHUFFLE2:%.*]] = shufflevector <8 x ptr> [[TMP4]], <8 x ptr> poison, <8 x i32> zeroinitializer
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, <8 x ptr> [[SHUFFLE2]], <8 x i32> [[TMP3]]
; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP5]], i32 4, <8 x i1> splat (i1 true), <8 x i32> poison)
-; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP1]], i32 8, <8 x i1> splat (i1 true), <8 x i32> poison)
+; CHECK-NEXT: [[TMP11:%.*]] = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr [[GEP2]], i32 8, <15 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true>, <15 x i32> poison)
+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <15 x i32> [[TMP11]], <15 x i32> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, <8 x ptr> [[SHUFFLE2]], <8 x i32> [[TMP7]]
; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP8]], i32 4, <8 x i1> splat (i1 true), <8 x i32> poison)
; CHECK-NEXT: [[TMP10:%.*]] = add nsw <8 x i32> [[TMP9]], [[TMP6]]
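A quick reader's note on the new check lines above (a standalone sketch with a hypothetical function name and made-up offsets, not code from the patch): a vectorizable load group whose pointers are consecutive but have gaps is now emitted as one wide masked load over the whole address window, followed by a shufflevector that compresses the live lanes into the vector the users consume:

declare <8 x i32> @llvm.masked.load.v8i32.p0(ptr, i32 immarg, <8 x i1>, <8 x i32>)

define <4 x i32> @compress_load_sketch(ptr %p) {
  ; Scalars at i32 offsets 0, 2, 5 and 7 are requested; the gaps stay masked off.
  %wide = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr %p, i32 4, <8 x i1> <i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true>, <8 x i32> poison)
  ; Compress the live lanes into the order the SLP users expect.
  %vec = shufflevector <8 x i32> %wide, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 5, i32 7>
  ret <4 x i32> %vec
}

The compress mask is simply the list of live lane indices, so the group costs one masked load plus one shuffle instead of a masked gather.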
>From 94a4643839336ecdf90ad523deb6bf1936cfc663 Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev at outlook.com>
Date: Wed, 19 Mar 2025 21:37:21 +0000
Subject: [PATCH 2/2] Fix formatting
Created using spr 1.3.5
---
.../Transforms/Vectorize/SLPVectorizer.cpp | 63 +++++++++----------
1 file changed, 31 insertions(+), 32 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index f9905cc7c3307..d6f1e9b0f8033 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3377,11 +3377,11 @@ class BoUpSLP {
/// (either with vector instruction or with scatter/gather
/// intrinsics for store/load)?
enum EntryState {
- Vectorize, ///< The node is regularly vectorized.
- ScatterVectorize, ///< Masked scatter/gather node.
- StridedVectorize, ///< Strided loads (and stores)
+ Vectorize, ///< The node is regularly vectorized.
+ ScatterVectorize, ///< Masked scatter/gather node.
+ StridedVectorize, ///< Strided loads (and stores)
MaskedLoadCompressVectorize, ///< Masked load with compress.
- NeedToGather, ///< Gather/buildvector node.
+ NeedToGather, ///< Gather/buildvector node.
CombinedVectorize, ///< Vectorized node, combined with its user into more
///< complex node like select/cmp to minmax, mul/add to
///< fma, etc. Must be used for the following nodes in
@@ -5489,7 +5489,7 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
[[maybe_unused]] bool IsMasked;
[[maybe_unused]] unsigned InterleaveFactor;
[[maybe_unused]] SmallVector<int> CompressMask;
- [[maybe_unused]] VectorType *LoadVecTy;;
+ [[maybe_unused]] VectorType *LoadVecTy;
if (isMaskedLoadCompress(
VL, PointerOps, Order, *TTI, *DL, *SE, *AC, *DT, *TLI,
[&](Value *V) {
@@ -7902,31 +7902,30 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
// just exit.
unsigned ConsecutiveNodesSize = 0;
if (!LoadEntriesToVectorize.empty() && InterleaveFactor == 0 &&
- any_of(
- zip(LoadEntriesToVectorize, LoadSetsToVectorize),
- [&, Slice = Slice](const auto &P) {
- const auto *It = find_if(Slice, [&](Value *V) {
- return std::get<1>(P).contains(V);
- });
- if (It == Slice.end())
- return false;
- const TreeEntry &TE = *VectorizableTree[std::get<0>(P)];
- ArrayRef<Value *> VL = TE.Scalars;
- OrdersType Order;
- SmallVector<Value *> PointerOps;
- LoadsState State =
- canVectorizeLoads(VL, VL.front(), Order,
- PointerOps);
- if (State == LoadsState::ScatterVectorize||
- State == LoadsState::MaskedLoadCompressVectorize)
- return false;
- ConsecutiveNodesSize += VL.size();
- unsigned Start = std::distance(Slice.begin(), It);
- unsigned Sz = Slice.size() - Start;
- return Sz < VL.size() ||
- Slice.slice(std::distance(Slice.begin(), It),
- VL.size()) != VL;
- }))
+ any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
+ [&, Slice = Slice](const auto &P) {
+ const auto *It = find_if(Slice, [&](Value *V) {
+ return std::get<1>(P).contains(V);
+ });
+ if (It == Slice.end())
+ return false;
+ const TreeEntry &TE =
+ *VectorizableTree[std::get<0>(P)];
+ ArrayRef<Value *> VL = TE.Scalars;
+ OrdersType Order;
+ SmallVector<Value *> PointerOps;
+ LoadsState State = canVectorizeLoads(
+ VL, VL.front(), Order, PointerOps);
+ if (State == LoadsState::ScatterVectorize ||
+ State == LoadsState::MaskedLoadCompressVectorize)
+ return false;
+ ConsecutiveNodesSize += VL.size();
+ unsigned Start = std::distance(Slice.begin(), It);
+ unsigned Sz = Slice.size() - Start;
+ return Sz < VL.size() ||
+ Slice.slice(std::distance(Slice.begin(), It),
+ VL.size()) != VL;
+ }))
continue;
// Try to build long masked gather loads.
UserMaxVF = bit_ceil(UserMaxVF);
@@ -12664,8 +12663,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
assert(IsVectorized && "Expected to be vectorized");
Align CommonAlignment;
if (IsMasked)
- CommonAlignment = computeCommonAlignment<LoadInst>(
- UniqueValues.getArrayRef());
+ CommonAlignment =
+ computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
else
CommonAlignment = LI0->getAlign();
if (InterleaveFactor) {
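For the AVX2 runs of gather_load_3/gather_load_4 earlier in the patch, a single masked load over the whole window is not used (presumably for cost reasons); the group is instead covered by two narrower masked loads, each compressed and then concatenated. A rough sketch along the same lines (hypothetical name and offsets, not code from the patch):

declare <8 x i32> @llvm.masked.load.v8i32.p0(ptr, i32 immarg, <8 x i1>, <8 x i32>)

define <8 x i32> @split_compress_load_sketch(ptr %p) {
  ; Low window: i32 offsets 0, 2, 5 and 7.
  %lo.wide = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr %p, i32 4, <8 x i1> <i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true>, <8 x i32> poison)
  %lo = shufflevector <8 x i32> %lo.wide, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 5, i32 7>
  ; High window: i32 offsets 8, 10, 13 and 15, loaded from a second base pointer.
  %p.hi = getelementptr inbounds i32, ptr %p, i64 8
  %hi.wide = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr %p.hi, i32 4, <8 x i1> <i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true>, <8 x i32> poison)
  %hi = shufflevector <8 x i32> %hi.wide, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 5, i32 7>
  ; Concatenate the two compressed halves into the 8-wide vector the users see.
  %vec = shufflevector <4 x i32> %lo, <4 x i32> %hi, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i32> %vec
}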