[llvm] 0940f90 - [SLP]Add support for strided loads.
via llvm-commits
llvm-commits at lists.llvm.org
Mon Feb 12 04:41:47 PST 2024
Author: Alexey Bataev
Date: 2024-02-12T07:41:42-05:00
New Revision: 0940f9083e68bda78bcbb323c2968a4294092e21
URL: https://github.com/llvm/llvm-project/commit/0940f9083e68bda78bcbb323c2968a4294092e21
DIFF: https://github.com/llvm/llvm-project/commit/0940f9083e68bda78bcbb323c2968a4294092e21.diff
LOG: [SLP]Add support for strided loads.
Added basic support for strided loads in the SLP vectorizer.
Supports constant strides only. If the strided load must be
reversed, applies -stride to avoid extra reverse shuffle.
Reviewers: preames, lukel97
Reviewed By: preames
Pull Request: https://github.com/llvm/llvm-project/pull/80310
Added:
Modified:
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-vectorized.ll
llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-with-external-use-ptr.ll
llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads.ll
llvm/test/Transforms/SLPVectorizer/X86/gep-nodes-with-non-gep-inst.ll
llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index c0b7298f78005d..c94fb71ab220ba 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -175,6 +175,15 @@ static cl::opt<int> RootLookAheadMaxDepth(
"slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden,
cl::desc("The maximum look-ahead depth for searching best rooting option"));
+static cl::opt<unsigned> MinProfitableStridedLoads(
+ "slp-min-strided-loads", cl::init(2), cl::Hidden,
+ cl::desc("The minimum number of loads, which should be considered strided, "
+ "if the stride is > 1 or is runtime value"));
+
+static cl::opt<unsigned> MaxProfitableLoadStride(
+ "slp-max-stride", cl::init(8), cl::Hidden,
+ cl::desc("The maximum stride, considered to be profitable."));
+
static cl::opt<bool>
ViewSLPTree("view-slp-tree", cl::Hidden,
cl::desc("Display the SLP trees with Graphviz"));
@@ -2575,7 +2584,7 @@ class BoUpSLP {
enum EntryState {
Vectorize,
ScatterVectorize,
- PossibleStridedVectorize,
+ StridedVectorize,
NeedToGather
};
EntryState State;
@@ -2753,8 +2762,8 @@ class BoUpSLP {
case ScatterVectorize:
dbgs() << "ScatterVectorize\n";
break;
- case PossibleStridedVectorize:
- dbgs() << "PossibleStridedVectorize\n";
+ case StridedVectorize:
+ dbgs() << "StridedVectorize\n";
break;
case NeedToGather:
dbgs() << "NeedToGather\n";
@@ -3680,7 +3689,7 @@ template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
if (Entry->State == TreeEntry::NeedToGather)
return "color=red";
if (Entry->State == TreeEntry::ScatterVectorize ||
- Entry->State == TreeEntry::PossibleStridedVectorize)
+ Entry->State == TreeEntry::StridedVectorize)
return "color=blue";
return "";
}
@@ -3842,12 +3851,7 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
namespace {
/// Tracks the state we can represent the loads in the given sequence.
-enum class LoadsState {
- Gather,
- Vectorize,
- ScatterVectorize,
- PossibleStridedVectorize
-};
+enum class LoadsState { Gather, Vectorize, ScatterVectorize, StridedVectorize };
} // anonymous namespace
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
@@ -3878,6 +3882,14 @@ static Align computeCommonAlignment(ArrayRef<Value *> VL) {
return CommonAlignment;
}
+/// Check if \p Order represents reverse order.
+static bool isReverseOrder(ArrayRef<unsigned> Order) {
+ unsigned Sz = Order.size();
+ return !Order.empty() && all_of(enumerate(Order), [&](const auto &Pair) {
+ return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
+ });
+}
+
/// Checks if the given array of loads can be represented as a vectorized,
/// scatter or just simple gather.
static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
@@ -3900,7 +3912,8 @@ static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
// Make sure all loads in the bundle are simple - we can't vectorize
// atomic or volatile loads.
PointerOps.clear();
- PointerOps.resize(VL.size());
+ const unsigned Sz = VL.size();
+ PointerOps.resize(Sz);
auto *POIter = PointerOps.begin();
for (Value *V : VL) {
auto *L = cast<LoadInst>(V);
@@ -3911,12 +3924,12 @@ static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
}
Order.clear();
+ auto *VecTy = FixedVectorType::get(ScalarTy, Sz);
// Check the order of pointer operands or that all pointers are the same.
bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, DL, SE, Order);
if (IsSorted || all_of(PointerOps, [&](Value *P) {
return arePointersCompatible(P, PointerOps.front(), TLI);
})) {
- bool IsPossibleStrided = false;
if (IsSorted) {
Value *Ptr0;
Value *PtrN;
@@ -3930,30 +3943,71 @@ static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
std::optional<int> Diff =
getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, DL, SE);
// Check that the sorted loads are consecutive.
- if (static_cast<unsigned>(*Diff) == VL.size() - 1)
+ if (static_cast<unsigned>(*Diff) == Sz - 1)
return LoadsState::Vectorize;
// Simple check if not a strided access - clear order.
- IsPossibleStrided = *Diff % (VL.size() - 1) == 0;
+ bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
+ // Try to generate strided load node if:
+ // 1. Target with strided load support is detected.
+ // 2. The number of loads is greater than MinProfitableStridedLoads,
+ // or the potential stride <= MaxProfitableLoadStride and the
+ // potential stride is power-of-2 (to avoid perf regressions for the very
+ // small number of loads) and max distance > number of loads, or potential
+ // stride is -1.
+ // 3. The loads are ordered, or number of unordered loads <=
+ // MaxProfitableUnorderedLoads, or loads are in reversed order.
+ // (this check is to avoid extra costs for very expensive shuffles).
+ if (IsPossibleStrided && (((Sz > MinProfitableStridedLoads ||
+ (static_cast<unsigned>(std::abs(*Diff)) <=
+ MaxProfitableLoadStride * Sz &&
+ isPowerOf2_32(std::abs(*Diff)))) &&
+ static_cast<unsigned>(std::abs(*Diff)) > Sz) ||
+ *Diff == -(static_cast<int>(Sz) - 1))) {
+ int Stride = *Diff / static_cast<int>(Sz - 1);
+ if (*Diff == Stride * static_cast<int>(Sz - 1)) {
+ Align Alignment =
+ cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()])
+ ->getAlign();
+ if (TTI.isLegalStridedLoadStore(VecTy, Alignment)) {
+ // Iterate through all pointers and check if all distances are
+ // unique multiple of Dist.
+ SmallSet<int, 4> Dists;
+ for (Value *Ptr : PointerOps) {
+ int Dist = 0;
+ if (Ptr == PtrN)
+ Dist = *Diff;
+ else if (Ptr != Ptr0)
+ Dist = *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, DL, SE);
+ // If the strides are not the same or repeated, we can't
+ // vectorize.
+ if (((Dist / Stride) * Stride) != Dist ||
+ !Dists.insert(Dist).second)
+ break;
+ }
+ if (Dists.size() == Sz)
+ return LoadsState::StridedVectorize;
+ }
+ }
+ }
}
// TODO: need to improve analysis of the pointers, if not all of them are
// GEPs or have > 2 operands, we end up with a gather node, which just
// increases the cost.
Loop *L = LI.getLoopFor(cast<LoadInst>(VL0)->getParent());
bool ProfitableGatherPointers =
- static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {
- return L && L->isLoopInvariant(V);
- })) <= VL.size() / 2 && VL.size() > 2;
+ static_cast<unsigned>(count_if(
+ PointerOps,
+ [L](Value *V) { return L && L->isLoopInvariant(V); })) <= Sz / 2 &&
+ Sz > 2;
if (ProfitableGatherPointers || all_of(PointerOps, [IsSorted](Value *P) {
auto *GEP = dyn_cast<GetElementPtrInst>(P);
return (IsSorted && !GEP && doesNotNeedToBeScheduled(P)) ||
(GEP && GEP->getNumOperands() == 2);
})) {
Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
- auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
if (TTI.isLegalMaskedGather(VecTy, CommonAlignment) &&
!TTI.forceScalarizeMaskedGather(VecTy, CommonAlignment))
- return IsPossibleStrided ? LoadsState::PossibleStridedVectorize
- : LoadsState::ScatterVectorize;
+ return LoadsState::ScatterVectorize;
}
}
@@ -4160,7 +4214,7 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
return std::move(ResOrder);
}
if ((TE.State == TreeEntry::Vectorize ||
- TE.State == TreeEntry::PossibleStridedVectorize) &&
+ TE.State == TreeEntry::StridedVectorize) &&
(isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
(TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))) &&
!TE.isAltShuffle())
@@ -4418,7 +4472,7 @@ void BoUpSLP::reorderTopToBottom() {
}
VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
if (!(TE->State == TreeEntry::Vectorize ||
- TE->State == TreeEntry::PossibleStridedVectorize) ||
+ TE->State == TreeEntry::StridedVectorize) ||
!TE->ReuseShuffleIndices.empty())
GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
if (TE->State == TreeEntry::Vectorize &&
@@ -4442,9 +4496,6 @@ void BoUpSLP::reorderTopToBottom() {
MapVector<OrdersType, unsigned,
DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
OrdersUses;
- // Last chance orders - scatter vectorize. Try to use their orders if no
- // other orders or the order is counted already.
- SmallVector<OrdersType> StridedVectorizeOrders;
SmallPtrSet<const TreeEntry *, 4> VisitedOps;
for (const TreeEntry *OpTE : OrderedEntries) {
// No need to reorder this nodes, still need to extend and to use shuffle,
@@ -4491,11 +4542,6 @@ void BoUpSLP::reorderTopToBottom() {
if (Order.empty())
continue;
}
- // Postpone scatter orders.
- if (OpTE->State == TreeEntry::PossibleStridedVectorize) {
- StridedVectorizeOrders.push_back(Order);
- continue;
- }
// Stores actually store the mask, not the order, need to invert.
if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
@@ -4512,22 +4558,6 @@ void BoUpSLP::reorderTopToBottom() {
++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
}
}
- // Set order of the user node.
- if (OrdersUses.empty()) {
- if (StridedVectorizeOrders.empty())
- continue;
- // Add (potentially!) strided vectorize orders.
- for (OrdersType &Order : StridedVectorizeOrders)
- ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
- } else {
- // Account (potentially!) strided vectorize orders only if it was used
- // already.
- for (OrdersType &Order : StridedVectorizeOrders) {
- auto *It = OrdersUses.find(Order);
- if (It != OrdersUses.end())
- ++It->second;
- }
- }
// Choose the most used order.
ArrayRef<unsigned> BestOrder = OrdersUses.front().first;
unsigned Cnt = OrdersUses.front().second;
@@ -4569,7 +4599,7 @@ void BoUpSLP::reorderTopToBottom() {
continue;
}
if ((TE->State == TreeEntry::Vectorize ||
- TE->State == TreeEntry::PossibleStridedVectorize) &&
+ TE->State == TreeEntry::StridedVectorize) &&
isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst,
InsertElementInst>(TE->getMainOp()) &&
!TE->isAltShuffle()) {
@@ -4610,10 +4640,6 @@ bool BoUpSLP::canReorderOperands(
}))
continue;
if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) {
- // FIXME: Do not reorder (possible!) strided vectorized nodes, they
- // require reordering of the operands, which is not implemented yet.
- if (TE->State == TreeEntry::PossibleStridedVectorize)
- return false;
// Do not reorder if operand node is used by many user nodes.
if (any_of(TE->UserTreeIndices,
[UserTE](const EdgeInfo &EI) { return EI.UserTE != UserTE; }))
@@ -4664,13 +4690,13 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
SmallVector<TreeEntry *> NonVectorized;
for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
if (TE->State != TreeEntry::Vectorize &&
- TE->State != TreeEntry::PossibleStridedVectorize)
+ TE->State != TreeEntry::StridedVectorize)
NonVectorized.push_back(TE.get());
if (std::optional<OrdersType> CurrentOrder =
getReorderingData(*TE, /*TopToBottom=*/false)) {
OrderedEntries.insert(TE.get());
if (!(TE->State == TreeEntry::Vectorize ||
- TE->State == TreeEntry::PossibleStridedVectorize) ||
+ TE->State == TreeEntry::StridedVectorize) ||
!TE->ReuseShuffleIndices.empty())
GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
}
@@ -4688,7 +4714,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
SmallVector<TreeEntry *> Filtered;
for (TreeEntry *TE : OrderedEntries) {
if (!(TE->State == TreeEntry::Vectorize ||
- TE->State == TreeEntry::PossibleStridedVectorize ||
+ TE->State == TreeEntry::StridedVectorize ||
(TE->State == TreeEntry::NeedToGather &&
GathersToOrders.count(TE))) ||
TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
@@ -4733,9 +4759,6 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
MapVector<OrdersType, unsigned,
DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
OrdersUses;
- // Last chance orders - scatter vectorize. Try to use their orders if no
- // other orders or the order is counted already.
- SmallVector<std::pair<OrdersType, unsigned>> StridedVectorizeOrders;
// Do the analysis for each tree entry only once, otherwise the order of
// the same node my be considered several times, though might be not
// profitable.
@@ -4757,11 +4780,6 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
return P.second == OpTE;
});
- // Postpone scatter orders.
- if (OpTE->State == TreeEntry::PossibleStridedVectorize) {
- StridedVectorizeOrders.emplace_back(Order, NumOps);
- continue;
- }
// Stores actually store the mask, not the order, need to invert.
if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
@@ -4819,30 +4837,6 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
++Res.first->second;
}
}
- // If no orders - skip current nodes and jump to the next one, if any.
- if (OrdersUses.empty()) {
- if (StridedVectorizeOrders.empty() ||
- (Data.first->ReorderIndices.empty() &&
- Data.first->ReuseShuffleIndices.empty() &&
- !(IgnoreReorder &&
- Data.first == VectorizableTree.front().get()))) {
- for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
- OrderedEntries.remove(Op.second);
- continue;
- }
- // Add (potentially!) strided vectorize orders.
- for (std::pair<OrdersType, unsigned> &Pair : StridedVectorizeOrders)
- OrdersUses.insert(std::make_pair(Pair.first, 0)).first->second +=
- Pair.second;
- } else {
- // Account (potentially!) strided vectorize orders only if it was used
- // already.
- for (std::pair<OrdersType, unsigned> &Pair : StridedVectorizeOrders) {
- auto *It = OrdersUses.find(Pair.first);
- if (It != OrdersUses.end())
- It->second += Pair.second;
- }
- }
// Choose the best order.
ArrayRef<unsigned> BestOrder = OrdersUses.front().first;
unsigned Cnt = OrdersUses.front().second;
@@ -4878,7 +4872,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
}
// Gathers are processed separately.
if (TE->State != TreeEntry::Vectorize &&
- TE->State != TreeEntry::PossibleStridedVectorize &&
+ TE->State != TreeEntry::StridedVectorize &&
(TE->State != TreeEntry::ScatterVectorize ||
TE->ReorderIndices.empty()))
continue;
@@ -4910,7 +4904,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
Data.first->reorderOperands(Mask);
if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
Data.first->isAltShuffle() ||
- Data.first->State == TreeEntry::PossibleStridedVectorize) {
+ Data.first->State == TreeEntry::StridedVectorize) {
reorderScalars(Data.first->Scalars, Mask);
reorderOrder(Data.first->ReorderIndices, MaskOrder,
/*BottomOrder=*/true);
@@ -4973,7 +4967,6 @@ void BoUpSLP::buildExternalUses(
// instructions. If that is the case, the one in FoundLane will
// be used.
if (UseEntry->State == TreeEntry::ScatterVectorize ||
- UseEntry->State == TreeEntry::PossibleStridedVectorize ||
!doesInTreeUserNeedToExtract(
Scalar, cast<Instruction>(UseEntry->Scalars.front()), TLI)) {
LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
@@ -5331,8 +5324,8 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
return TreeEntry::Vectorize;
case LoadsState::ScatterVectorize:
return TreeEntry::ScatterVectorize;
- case LoadsState::PossibleStridedVectorize:
- return TreeEntry::PossibleStridedVectorize;
+ case LoadsState::StridedVectorize:
+ return TreeEntry::StridedVectorize;
case LoadsState::Gather:
#ifndef NDEBUG
Type *ScalarTy = VL0->getType();
@@ -5753,8 +5746,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
BasicBlock *BB = nullptr;
bool IsScatterVectorizeUserTE =
UserTreeIdx.UserTE &&
- (UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize ||
- UserTreeIdx.UserTE->State == TreeEntry::PossibleStridedVectorize);
+ UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
bool AreAllSameInsts =
(S.getOpcode() && allSameBlock(VL)) ||
(S.OpValue->getType()->isPointerTy() && IsScatterVectorizeUserTE &&
@@ -5851,8 +5843,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// Special processing for sorted pointers for ScatterVectorize node with
// constant indeces only.
if (AreAllSameInsts && UserTreeIdx.UserTE &&
- (UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize ||
- UserTreeIdx.UserTE->State == TreeEntry::PossibleStridedVectorize) &&
+ UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize &&
!(S.getOpcode() && allSameBlock(VL))) {
assert(S.OpValue->getType()->isPointerTy() &&
count_if(VL, [](Value *V) { return isa<GetElementPtrInst>(V); }) >=
@@ -6049,18 +6040,17 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
}
TE->setOperandsInOrder();
break;
- case TreeEntry::PossibleStridedVectorize:
+ case TreeEntry::StridedVectorize:
// Vectorizing non-consecutive loads with `llvm.masked.gather`.
if (CurrentOrder.empty()) {
- TE = newTreeEntry(VL, TreeEntry::PossibleStridedVectorize, Bundle, S,
+ TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
UserTreeIdx, ReuseShuffleIndicies);
} else {
- TE = newTreeEntry(VL, TreeEntry::PossibleStridedVectorize, Bundle, S,
+ TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
UserTreeIdx, ReuseShuffleIndicies, CurrentOrder);
}
TE->setOperandsInOrder();
- buildTree_rec(PointerOps, Depth + 1, {TE, 0});
- LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n");
+ LLVM_DEBUG(dbgs() << "SLP: added a vector of strided loads.\n");
break;
case TreeEntry::ScatterVectorize:
// Vectorizing non-consecutive loads with `llvm.masked.gather`.
@@ -7091,7 +7081,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
!isSplat(Gathers)) {
InstructionCost BaseCost = R.getGatherCost(Gathers, !Root);
SetVector<Value *> VectorizedLoads;
- SmallVector<unsigned> VectorizedStarts;
+ SmallVector<std::pair<unsigned, LoadsState>> VectorizedStarts;
SmallVector<unsigned> ScatterVectorized;
unsigned StartIdx = 0;
unsigned VF = VL.size() / 2;
@@ -7115,12 +7105,16 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
switch (LS) {
case LoadsState::Vectorize:
case LoadsState::ScatterVectorize:
- case LoadsState::PossibleStridedVectorize:
+ case LoadsState::StridedVectorize:
// Mark the vectorized loads so that we don't vectorize them
// again.
// TODO: better handling of loads with reorders.
- if (LS == LoadsState::Vectorize && CurrentOrder.empty())
- VectorizedStarts.push_back(Cnt);
+ if (((LS == LoadsState::Vectorize ||
+ LS == LoadsState::StridedVectorize) &&
+ CurrentOrder.empty()) ||
+ (LS == LoadsState::StridedVectorize &&
+ isReverseOrder(CurrentOrder)))
+ VectorizedStarts.emplace_back(Cnt, LS);
else
ScatterVectorized.push_back(Cnt);
VectorizedLoads.insert(Slice.begin(), Slice.end());
@@ -7164,16 +7158,20 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
CostKind, TTI::OperandValueInfo(), LI);
}
auto *LoadTy = FixedVectorType::get(VL.front()->getType(), VF);
- for (unsigned P : VectorizedStarts) {
- auto *LI = cast<LoadInst>(VL[P]);
+ for (const std::pair<unsigned, LoadsState> &P : VectorizedStarts) {
+ auto *LI = cast<LoadInst>(VL[P.first]);
Align Alignment = LI->getAlign();
GatherCost +=
- TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment,
- LI->getPointerAddressSpace(), CostKind,
- TTI::OperandValueInfo(), LI);
+ P.second == LoadsState::Vectorize
+ ? TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment,
+ LI->getPointerAddressSpace(), CostKind,
+ TTI::OperandValueInfo(), LI)
+ : TTI.getStridedMemoryOpCost(
+ Instruction::Load, LoadTy, LI->getPointerOperand(),
+ /*VariableMask=*/false, Alignment, CostKind, LI);
// Estimate GEP cost.
SmallVector<Value *> PointerOps(VF);
- for (auto [I, V] : enumerate(VL.slice(P, VF)))
+ for (auto [I, V] : enumerate(VL.slice(P.first, VF)))
PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
auto [ScalarGEPCost, VectorGEPCost] =
getGEPCosts(TTI, PointerOps, LI->getPointerOperand(),
@@ -7913,8 +7911,9 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
}
InstructionCost CommonCost = 0;
SmallVector<int> Mask;
+ bool IsReverseOrder = isReverseOrder(E->ReorderIndices);
if (!E->ReorderIndices.empty() &&
- E->State != TreeEntry::PossibleStridedVectorize) {
+ (E->State != TreeEntry::StridedVectorize || !IsReverseOrder)) {
SmallVector<int> NewMask;
if (E->getOpcode() == Instruction::Store) {
// For stores the order is actually a mask.
@@ -7932,7 +7931,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, FinalVecTy, Mask);
assert((E->State == TreeEntry::Vectorize ||
E->State == TreeEntry::ScatterVectorize ||
- E->State == TreeEntry::PossibleStridedVectorize) &&
+ E->State == TreeEntry::StridedVectorize) &&
"Unhandled state");
assert(E->getOpcode() &&
((allSameType(VL) && allSameBlock(VL)) ||
@@ -7952,7 +7951,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
}
auto GetCastContextHint = [&](Value *V) {
if (const TreeEntry *OpTE = getTreeEntry(V)) {
- if (OpTE->State == TreeEntry::ScatterVectorize)
+ if (OpTE->State == TreeEntry::ScatterVectorize ||
+ OpTE->State == TreeEntry::StridedVectorize)
return TTI::CastContextHint::GatherScatter;
if (OpTE->State == TreeEntry::Vectorize &&
OpTE->getOpcode() == Instruction::Load && !OpTE->isAltShuffle()) {
@@ -8028,8 +8028,9 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
// Calculate cost difference from vectorizing set of GEPs.
// Negative value means vectorizing is profitable.
auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
- assert(E->State == TreeEntry::Vectorize &&
- "Entry state expected to be Vectorize here.");
+ assert((E->State == TreeEntry::Vectorize ||
+ E->State == TreeEntry::StridedVectorize) &&
+ "Entry state expected to be Vectorize or StridedVectorize here.");
InstructionCost ScalarCost = 0;
InstructionCost VecCost = 0;
std::tie(ScalarCost, VecCost) = getGEPCosts(
@@ -8382,10 +8383,14 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
VecLdCost = TTI->getMemoryOpCost(
Instruction::Load, VecTy, LI0->getAlign(),
LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
+ } else if (E->State == TreeEntry::StridedVectorize) {
+ Align CommonAlignment =
+ computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
+ VecLdCost = TTI->getStridedMemoryOpCost(
+ Instruction::Load, VecTy, LI0->getPointerOperand(),
+ /*VariableMask=*/false, CommonAlignment, CostKind);
} else {
- assert((E->State == TreeEntry::ScatterVectorize ||
- E->State == TreeEntry::PossibleStridedVectorize) &&
- "Unknown EntryState");
+ assert(E->State == TreeEntry::ScatterVectorize && "Unknown EntryState");
Align CommonAlignment =
computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
VecLdCost = TTI->getGatherScatterOpCost(
@@ -8398,8 +8403,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost);
// If this node generates masked gather load then it is not a terminal node.
// Hence address operand cost is estimated separately.
- if (E->State == TreeEntry::ScatterVectorize ||
- E->State == TreeEntry::PossibleStridedVectorize)
+ if (E->State == TreeEntry::ScatterVectorize)
return Cost;
// Estimate cost of GEPs since this tree node is a terminator.
@@ -8608,7 +8612,7 @@ bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
if (VectorizableTree[0]->State == TreeEntry::NeedToGather ||
(VectorizableTree[1]->State == TreeEntry::NeedToGather &&
VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
- VectorizableTree[0]->State != TreeEntry::PossibleStridedVectorize))
+ VectorizableTree[0]->State != TreeEntry::StridedVectorize))
return false;
return true;
@@ -10579,11 +10583,6 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
bool PostponedPHIs) {
ValueList &VL = E->getOperand(NodeIdx);
- if (E->State == TreeEntry::PossibleStridedVectorize &&
- !E->ReorderIndices.empty()) {
- SmallVector<int> Mask(E->ReorderIndices.begin(), E->ReorderIndices.end());
- reorderScalars(VL, Mask);
- }
const unsigned VF = VL.size();
InstructionsState S = getSameOpcode(VL, *TLI);
// Special processing for GEPs bundle, which may include non-gep values.
@@ -11157,6 +11156,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
return Vec;
}
+ bool IsReverseOrder = isReverseOrder(E->ReorderIndices);
auto FinalShuffle = [&](Value *V, const TreeEntry *E, VectorType *VecTy,
bool IsSigned) {
if (V->getType() != VecTy)
@@ -11167,7 +11167,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
E->ReorderIndices.size());
ShuffleBuilder.add(V, Mask);
- } else if (E->State == TreeEntry::PossibleStridedVectorize) {
+ } else if (E->State == TreeEntry::StridedVectorize && IsReverseOrder) {
ShuffleBuilder.addOrdered(V, std::nullopt);
} else {
ShuffleBuilder.addOrdered(V, E->ReorderIndices);
@@ -11177,7 +11177,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
assert((E->State == TreeEntry::Vectorize ||
E->State == TreeEntry::ScatterVectorize ||
- E->State == TreeEntry::PossibleStridedVectorize) &&
+ E->State == TreeEntry::StridedVectorize) &&
"Unhandled state");
unsigned ShuffleOrOp =
E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
@@ -11642,10 +11642,29 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
Value *PO = LI->getPointerOperand();
if (E->State == TreeEntry::Vectorize) {
NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
+ } else if (E->State == TreeEntry::StridedVectorize) {
+ Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
+ Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
+ PO = IsReverseOrder ? PtrN : Ptr0;
+ std::optional<int> Diff = getPointersDiff(
+ VL0->getType(), Ptr0, VL0->getType(), PtrN, *DL, *SE);
+ Type *StrideTy = DL->getIndexType(PO->getType());
+ int Stride = *Diff / (static_cast<int>(E->Scalars.size()) - 1);
+ Value *StrideVal =
+ ConstantInt::get(StrideTy, (IsReverseOrder ? -1 : 1) * Stride *
+ DL->getTypeAllocSize(ScalarTy));
+ Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
+ auto *Inst = Builder.CreateIntrinsic(
+ Intrinsic::experimental_vp_strided_load,
+ {VecTy, PO->getType(), StrideTy},
+ {PO, StrideVal, Builder.getAllOnesMask(VecTy->getElementCount()),
+ Builder.getInt32(E->Scalars.size())});
+ Inst->addParamAttr(
+ /*ArgNo=*/0,
+ Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
+ NewLI = Inst;
} else {
- assert((E->State == TreeEntry::ScatterVectorize ||
- E->State == TreeEntry::PossibleStridedVectorize) &&
- "Unhandled state");
+ assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
Value *VecPtr = vectorizeOperand(E, 0, PostponedPHIs);
if (E->VectorizedValue) {
LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
@@ -12069,8 +12088,11 @@ Value *BoUpSLP::vectorizeTree(
[&](llvm::User *U) {
TreeEntry *UseEntry = getTreeEntry(U);
return UseEntry &&
- UseEntry->State == TreeEntry::Vectorize &&
- E->State == TreeEntry::Vectorize &&
+ (UseEntry->State == TreeEntry::Vectorize ||
+ UseEntry->State ==
+ TreeEntry::StridedVectorize) &&
+ (E->State == TreeEntry::Vectorize ||
+ E->State == TreeEntry::StridedVectorize) &&
doesInTreeUserNeedToExtract(
Scalar,
cast<Instruction>(UseEntry->Scalars.front()),
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
index dc5fb917886347..e167b6a47af592 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
-; RUN: opt -S -mtriple riscv64-unknown-linux-gnu < %s --passes=slp-vectorizer -mattr=+v -slp-threshold=-80 | FileCheck %s
+; RUN: opt -S -mtriple riscv64-unknown-linux-gnu < %s --passes=slp-vectorizer -mattr=+v -slp-threshold=-40 | FileCheck %s
define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.ptr, ptr %add.ptr64) {
; CHECK-LABEL: define i32 @test(
@@ -67,305 +67,303 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; CHECK-NEXT: [[TMP48:%.*]] = extractelement <2 x i32> [[TMP47]], i32 0
; CHECK-NEXT: [[TMP49:%.*]] = extractelement <2 x i32> [[TMP47]], i32 1
; CHECK-NEXT: [[SUB59_2:%.*]] = sub i32 [[TMP48]], [[TMP49]]
-; CHECK-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr i8, ptr null, i64 4
; CHECK-NEXT: [[TMP50:%.*]] = load i8, ptr null, align 1
; CHECK-NEXT: [[ARRAYIDX20_3:%.*]] = getelementptr i8, ptr null, i64 2
; CHECK-NEXT: [[ARRAYIDX22_3:%.*]] = getelementptr i8, ptr null, i64 2
; CHECK-NEXT: [[TMP51:%.*]] = load i8, ptr null, align 1
-; CHECK-NEXT: [[TMP52:%.*]] = insertelement <2 x ptr> <ptr null, ptr poison>, ptr [[ARRAYIDX20_3]], i32 1
+; CHECK-NEXT: [[TMP52:%.*]] = insertelement <2 x ptr> <ptr poison, ptr null>, ptr [[ARRAYIDX20_3]], i32 0
; CHECK-NEXT: [[TMP53:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP52]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
; CHECK-NEXT: [[TMP54:%.*]] = zext <2 x i8> [[TMP53]] to <2 x i32>
-; CHECK-NEXT: [[TMP55:%.*]] = insertelement <2 x ptr> <ptr null, ptr poison>, ptr [[ARRAYIDX22_3]], i32 1
+; CHECK-NEXT: [[TMP55:%.*]] = insertelement <2 x ptr> <ptr poison, ptr null>, ptr [[ARRAYIDX22_3]], i32 0
; CHECK-NEXT: [[TMP56:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP55]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
; CHECK-NEXT: [[TMP57:%.*]] = zext <2 x i8> [[TMP56]] to <2 x i32>
; CHECK-NEXT: [[TMP58:%.*]] = sub <2 x i32> [[TMP54]], [[TMP57]]
-; CHECK-NEXT: [[TMP59:%.*]] = insertelement <2 x ptr> <ptr poison, ptr null>, ptr [[ARRAYIDX3_3]], i32 0
-; CHECK-NEXT: [[TMP60:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP59]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP61:%.*]] = zext <2 x i8> [[TMP60]] to <2 x i32>
-; CHECK-NEXT: [[TMP62:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> getelementptr (i8, <2 x ptr> zeroinitializer, <2 x i64> <i64 4, i64 6>), i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP63:%.*]] = zext <2 x i8> [[TMP62]] to <2 x i32>
-; CHECK-NEXT: [[TMP64:%.*]] = sub <2 x i32> [[TMP61]], [[TMP63]]
-; CHECK-NEXT: [[TMP65:%.*]] = shl <2 x i32> [[TMP64]], <i32 16, i32 16>
-; CHECK-NEXT: [[TMP66:%.*]] = add <2 x i32> [[TMP65]], [[TMP58]]
-; CHECK-NEXT: [[TMP67:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> getelementptr (i8, <2 x ptr> zeroinitializer, <2 x i64> <i64 1, i64 3>), i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP68:%.*]] = zext <2 x i8> [[TMP67]] to <2 x i32>
-; CHECK-NEXT: [[TMP69:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> getelementptr (i8, <2 x ptr> zeroinitializer, <2 x i64> <i64 1, i64 3>), i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP70:%.*]] = zext <2 x i8> [[TMP69]] to <2 x i32>
-; CHECK-NEXT: [[TMP71:%.*]] = sub <2 x i32> [[TMP68]], [[TMP70]]
-; CHECK-NEXT: [[TMP72:%.*]] = insertelement <2 x i8> poison, i8 [[TMP50]], i32 0
-; CHECK-NEXT: [[TMP73:%.*]] = insertelement <2 x i8> [[TMP72]], i8 [[TMP51]], i32 1
-; CHECK-NEXT: [[TMP74:%.*]] = zext <2 x i8> [[TMP73]] to <2 x i32>
-; CHECK-NEXT: [[TMP75:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> getelementptr (i8, <2 x ptr> zeroinitializer, <2 x i64> <i64 5, i64 7>), i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP76:%.*]] = zext <2 x i8> [[TMP75]] to <2 x i32>
-; CHECK-NEXT: [[TMP77:%.*]] = sub <2 x i32> [[TMP74]], [[TMP76]]
-; CHECK-NEXT: [[TMP78:%.*]] = shl <2 x i32> [[TMP77]], <i32 16, i32 16>
-; CHECK-NEXT: [[TMP79:%.*]] = add <2 x i32> [[TMP78]], [[TMP71]]
-; CHECK-NEXT: [[TMP80:%.*]] = sub <2 x i32> [[TMP66]], [[TMP79]]
-; CHECK-NEXT: [[TMP81:%.*]] = shufflevector <2 x i32> [[TMP79]], <2 x i32> [[TMP46]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[TMP82:%.*]] = shufflevector <2 x i32> [[TMP66]], <2 x i32> [[TMP30]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[TMP83:%.*]] = add <2 x i32> [[TMP81]], [[TMP82]]
-; CHECK-NEXT: [[TMP84:%.*]] = shufflevector <2 x i32> [[TMP79]], <2 x i32> [[TMP46]], <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT: [[TMP85:%.*]] = shufflevector <2 x i32> [[TMP66]], <2 x i32> [[TMP30]], <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT: [[TMP86:%.*]] = add <2 x i32> [[TMP84]], [[TMP85]]
-; CHECK-NEXT: [[TMP87:%.*]] = add <2 x i32> [[TMP86]], [[TMP83]]
-; CHECK-NEXT: [[TMP88:%.*]] = sub <2 x i32> [[TMP83]], [[TMP86]]
-; CHECK-NEXT: [[TMP89:%.*]] = extractelement <2 x i32> [[TMP80]], i32 0
-; CHECK-NEXT: [[TMP90:%.*]] = extractelement <2 x i32> [[TMP80]], i32 1
-; CHECK-NEXT: [[SUB59_3:%.*]] = sub i32 [[TMP89]], [[TMP90]]
-; CHECK-NEXT: [[TMP91:%.*]] = extractelement <2 x i32> [[TMP87]], i32 0
-; CHECK-NEXT: [[TMP92:%.*]] = extractelement <2 x i32> [[TMP87]], i32 1
-; CHECK-NEXT: [[ADD94:%.*]] = add i32 [[TMP91]], [[TMP92]]
-; CHECK-NEXT: [[SUB102:%.*]] = sub i32 [[TMP92]], [[TMP91]]
-; CHECK-NEXT: [[TMP93:%.*]] = extractelement <2 x i32> [[TMP54]], i32 0
-; CHECK-NEXT: [[SHR_I:%.*]] = lshr i32 [[TMP93]], 15
+; CHECK-NEXT: [[TMP59:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 4, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP60:%.*]] = zext <2 x i8> [[TMP59]] to <2 x i32>
+; CHECK-NEXT: [[TMP61:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> getelementptr (i8, <2 x ptr> zeroinitializer, <2 x i64> <i64 6, i64 4>), i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT: [[TMP62:%.*]] = zext <2 x i8> [[TMP61]] to <2 x i32>
+; CHECK-NEXT: [[TMP63:%.*]] = sub <2 x i32> [[TMP60]], [[TMP62]]
+; CHECK-NEXT: [[TMP64:%.*]] = shl <2 x i32> [[TMP63]], <i32 16, i32 16>
+; CHECK-NEXT: [[TMP65:%.*]] = add <2 x i32> [[TMP64]], [[TMP58]]
+; CHECK-NEXT: [[TMP66:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> getelementptr (i8, <2 x ptr> zeroinitializer, <2 x i64> <i64 3, i64 1>), i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT: [[TMP67:%.*]] = zext <2 x i8> [[TMP66]] to <2 x i32>
+; CHECK-NEXT: [[TMP68:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> getelementptr (i8, <2 x ptr> zeroinitializer, <2 x i64> <i64 3, i64 1>), i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT: [[TMP69:%.*]] = zext <2 x i8> [[TMP68]] to <2 x i32>
+; CHECK-NEXT: [[TMP70:%.*]] = sub <2 x i32> [[TMP67]], [[TMP69]]
+; CHECK-NEXT: [[TMP71:%.*]] = insertelement <2 x i8> poison, i8 [[TMP51]], i32 0
+; CHECK-NEXT: [[TMP72:%.*]] = insertelement <2 x i8> [[TMP71]], i8 [[TMP50]], i32 1
+; CHECK-NEXT: [[TMP73:%.*]] = zext <2 x i8> [[TMP72]] to <2 x i32>
+; CHECK-NEXT: [[TMP74:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> getelementptr (i8, <2 x ptr> zeroinitializer, <2 x i64> <i64 7, i64 5>), i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT: [[TMP75:%.*]] = zext <2 x i8> [[TMP74]] to <2 x i32>
+; CHECK-NEXT: [[TMP76:%.*]] = sub <2 x i32> [[TMP73]], [[TMP75]]
+; CHECK-NEXT: [[TMP77:%.*]] = shl <2 x i32> [[TMP76]], <i32 16, i32 16>
+; CHECK-NEXT: [[TMP78:%.*]] = add <2 x i32> [[TMP77]], [[TMP70]]
+; CHECK-NEXT: [[TMP79:%.*]] = sub <2 x i32> [[TMP65]], [[TMP78]]
+; CHECK-NEXT: [[TMP80:%.*]] = shufflevector <2 x i32> [[TMP78]], <2 x i32> [[TMP46]], <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT: [[TMP81:%.*]] = shufflevector <2 x i32> [[TMP65]], <2 x i32> [[TMP30]], <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT: [[TMP82:%.*]] = add <2 x i32> [[TMP80]], [[TMP81]]
+; CHECK-NEXT: [[TMP83:%.*]] = shufflevector <2 x i32> [[TMP78]], <2 x i32> [[TMP46]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP84:%.*]] = shufflevector <2 x i32> [[TMP65]], <2 x i32> [[TMP30]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP85:%.*]] = add <2 x i32> [[TMP83]], [[TMP84]]
+; CHECK-NEXT: [[TMP86:%.*]] = add <2 x i32> [[TMP85]], [[TMP82]]
+; CHECK-NEXT: [[TMP87:%.*]] = sub <2 x i32> [[TMP82]], [[TMP85]]
+; CHECK-NEXT: [[TMP88:%.*]] = extractelement <2 x i32> [[TMP79]], i32 0
+; CHECK-NEXT: [[TMP89:%.*]] = extractelement <2 x i32> [[TMP79]], i32 1
+; CHECK-NEXT: [[SUB59_3:%.*]] = sub i32 [[TMP89]], [[TMP88]]
+; CHECK-NEXT: [[TMP90:%.*]] = extractelement <2 x i32> [[TMP86]], i32 0
+; CHECK-NEXT: [[TMP91:%.*]] = extractelement <2 x i32> [[TMP86]], i32 1
+; CHECK-NEXT: [[ADD94:%.*]] = add i32 [[TMP90]], [[TMP91]]
+; CHECK-NEXT: [[SUB102:%.*]] = sub i32 [[TMP91]], [[TMP90]]
+; CHECK-NEXT: [[TMP92:%.*]] = extractelement <2 x i32> [[TMP54]], i32 1
+; CHECK-NEXT: [[SHR_I:%.*]] = lshr i32 [[TMP92]], 15
; CHECK-NEXT: [[AND_I:%.*]] = and i32 [[SHR_I]], 65537
; CHECK-NEXT: [[MUL_I:%.*]] = mul i32 [[AND_I]], 65535
-; CHECK-NEXT: [[TMP94:%.*]] = extractelement <2 x i32> [[TMP86]], i32 1
-; CHECK-NEXT: [[SHR_I49:%.*]] = lshr i32 [[TMP94]], 15
+; CHECK-NEXT: [[TMP93:%.*]] = extractelement <2 x i32> [[TMP85]], i32 1
+; CHECK-NEXT: [[SHR_I49:%.*]] = lshr i32 [[TMP93]], 15
; CHECK-NEXT: [[AND_I50:%.*]] = and i32 [[SHR_I49]], 65537
; CHECK-NEXT: [[MUL_I51:%.*]] = mul i32 [[AND_I50]], 65535
-; CHECK-NEXT: [[TMP95:%.*]] = extractelement <2 x i32> [[TMP88]], i32 0
-; CHECK-NEXT: [[TMP96:%.*]] = extractelement <2 x i32> [[TMP88]], i32 1
-; CHECK-NEXT: [[ADD94_2:%.*]] = add i32 [[TMP95]], [[TMP96]]
-; CHECK-NEXT: [[TMP97:%.*]] = load <2 x i8>, ptr [[ARRAYIDX20]], align 1
-; CHECK-NEXT: [[TMP98:%.*]] = zext <2 x i8> [[TMP97]] to <2 x i32>
-; CHECK-NEXT: [[TMP99:%.*]] = insertelement <2 x i32> poison, i32 [[SUB59_2]], i32 0
-; CHECK-NEXT: [[TMP100:%.*]] = shufflevector <2 x i32> [[TMP99]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP101:%.*]] = insertelement <2 x i32> poison, i32 [[SUB59_3]], i32 0
-; CHECK-NEXT: [[TMP102:%.*]] = shufflevector <2 x i32> [[TMP101]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP103:%.*]] = add <2 x i32> [[TMP100]], [[TMP102]]
-; CHECK-NEXT: [[TMP104:%.*]] = sub <2 x i32> [[TMP100]], [[TMP102]]
-; CHECK-NEXT: [[TMP105:%.*]] = shufflevector <2 x i32> [[TMP103]], <2 x i32> [[TMP104]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP106:%.*]] = load <2 x i8>, ptr [[PIX1]], align 1
-; CHECK-NEXT: [[TMP107:%.*]] = zext <2 x i8> [[TMP106]] to <2 x i32>
-; CHECK-NEXT: [[TMP108:%.*]] = shufflevector <2 x i32> [[TMP107]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT: [[TMP109:%.*]] = insertelement <2 x ptr> [[TMP4]], ptr [[ARRAYIDX22]], i32 1
-; CHECK-NEXT: [[TMP110:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP109]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP111:%.*]] = zext <2 x i8> [[TMP110]] to <2 x i32>
-; CHECK-NEXT: [[TMP112:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP2]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP113:%.*]] = zext <2 x i8> [[TMP112]] to <2 x i32>
-; CHECK-NEXT: [[TMP114:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP5]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP115:%.*]] = zext <2 x i8> [[TMP114]] to <2 x i32>
-; CHECK-NEXT: [[TMP116:%.*]] = sub <2 x i32> [[TMP113]], [[TMP115]]
-; CHECK-NEXT: [[TMP117:%.*]] = shl <2 x i32> [[TMP116]], <i32 16, i32 16>
-; CHECK-NEXT: [[TMP118:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP6]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP119:%.*]] = zext <2 x i8> [[TMP118]] to <2 x i32>
-; CHECK-NEXT: [[TMP120:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP7]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP121:%.*]] = zext <2 x i8> [[TMP120]] to <2 x i32>
-; CHECK-NEXT: [[TMP122:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP8]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP123:%.*]] = zext <2 x i8> [[TMP122]] to <2 x i32>
-; CHECK-NEXT: [[TMP124:%.*]] = sub <2 x i32> [[TMP121]], [[TMP123]]
-; CHECK-NEXT: [[TMP125:%.*]] = shl <2 x i32> [[TMP124]], <i32 16, i32 16>
-; CHECK-NEXT: [[TMP126:%.*]] = shufflevector <2 x i32> [[TMP107]], <2 x i32> [[TMP98]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[TMP127:%.*]] = sub <2 x i32> [[TMP126]], [[TMP111]]
-; CHECK-NEXT: [[TMP128:%.*]] = add <2 x i32> [[TMP117]], [[TMP127]]
-; CHECK-NEXT: [[TMP129:%.*]] = shufflevector <2 x i32> [[TMP108]], <2 x i32> [[TMP98]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP130:%.*]] = sub <2 x i32> [[TMP129]], [[TMP119]]
-; CHECK-NEXT: [[TMP131:%.*]] = add <2 x i32> [[TMP125]], [[TMP130]]
-; CHECK-NEXT: [[TMP132:%.*]] = extractelement <2 x i32> [[TMP128]], i32 1
-; CHECK-NEXT: [[TMP133:%.*]] = extractelement <2 x i32> [[TMP131]], i32 1
-; CHECK-NEXT: [[ADD46:%.*]] = add i32 [[TMP133]], [[TMP132]]
-; CHECK-NEXT: [[TMP134:%.*]] = sub <2 x i32> [[TMP128]], [[TMP131]]
-; CHECK-NEXT: [[TMP135:%.*]] = extractelement <2 x i32> [[TMP128]], i32 0
-; CHECK-NEXT: [[TMP136:%.*]] = extractelement <2 x i32> [[TMP131]], i32 0
-; CHECK-NEXT: [[ADD44:%.*]] = add i32 [[TMP136]], [[TMP135]]
-; CHECK-NEXT: [[TMP137:%.*]] = lshr <2 x i32> [[TMP108]], <i32 15, i32 15>
-; CHECK-NEXT: [[TMP138:%.*]] = and <2 x i32> [[TMP137]], <i32 65537, i32 65537>
-; CHECK-NEXT: [[TMP139:%.*]] = mul <2 x i32> [[TMP138]], <i32 65535, i32 65535>
-; CHECK-NEXT: [[TMP140:%.*]] = extractelement <2 x i32> [[TMP134]], i32 0
-; CHECK-NEXT: [[TMP141:%.*]] = extractelement <2 x i32> [[TMP134]], i32 1
-; CHECK-NEXT: [[SUB59:%.*]] = sub i32 [[TMP140]], [[TMP141]]
-; CHECK-NEXT: [[TMP142:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8_1]], align 1
-; CHECK-NEXT: [[TMP143:%.*]] = zext <2 x i8> [[TMP142]] to <2 x i32>
+; CHECK-NEXT: [[TMP94:%.*]] = extractelement <2 x i32> [[TMP87]], i32 0
+; CHECK-NEXT: [[TMP95:%.*]] = extractelement <2 x i32> [[TMP87]], i32 1
+; CHECK-NEXT: [[ADD94_2:%.*]] = add i32 [[TMP94]], [[TMP95]]
+; CHECK-NEXT: [[TMP96:%.*]] = load <2 x i8>, ptr [[ARRAYIDX20]], align 1
+; CHECK-NEXT: [[TMP97:%.*]] = zext <2 x i8> [[TMP96]] to <2 x i32>
+; CHECK-NEXT: [[TMP98:%.*]] = insertelement <2 x i32> poison, i32 [[SUB59_2]], i32 0
+; CHECK-NEXT: [[TMP99:%.*]] = shufflevector <2 x i32> [[TMP98]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP100:%.*]] = insertelement <2 x i32> poison, i32 [[SUB59_3]], i32 0
+; CHECK-NEXT: [[TMP101:%.*]] = shufflevector <2 x i32> [[TMP100]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP102:%.*]] = add <2 x i32> [[TMP99]], [[TMP101]]
+; CHECK-NEXT: [[TMP103:%.*]] = sub <2 x i32> [[TMP99]], [[TMP101]]
+; CHECK-NEXT: [[TMP104:%.*]] = shufflevector <2 x i32> [[TMP102]], <2 x i32> [[TMP103]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP105:%.*]] = load <2 x i8>, ptr [[PIX1]], align 1
+; CHECK-NEXT: [[TMP106:%.*]] = zext <2 x i8> [[TMP105]] to <2 x i32>
+; CHECK-NEXT: [[TMP107:%.*]] = shufflevector <2 x i32> [[TMP106]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT: [[TMP108:%.*]] = insertelement <2 x ptr> [[TMP4]], ptr [[ARRAYIDX22]], i32 1
+; CHECK-NEXT: [[TMP109:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP108]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT: [[TMP110:%.*]] = zext <2 x i8> [[TMP109]] to <2 x i32>
+; CHECK-NEXT: [[TMP111:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP2]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT: [[TMP112:%.*]] = zext <2 x i8> [[TMP111]] to <2 x i32>
+; CHECK-NEXT: [[TMP113:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP5]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT: [[TMP114:%.*]] = zext <2 x i8> [[TMP113]] to <2 x i32>
+; CHECK-NEXT: [[TMP115:%.*]] = sub <2 x i32> [[TMP112]], [[TMP114]]
+; CHECK-NEXT: [[TMP116:%.*]] = shl <2 x i32> [[TMP115]], <i32 16, i32 16>
+; CHECK-NEXT: [[TMP117:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP6]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT: [[TMP118:%.*]] = zext <2 x i8> [[TMP117]] to <2 x i32>
+; CHECK-NEXT: [[TMP119:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP7]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT: [[TMP120:%.*]] = zext <2 x i8> [[TMP119]] to <2 x i32>
+; CHECK-NEXT: [[TMP121:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP8]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT: [[TMP122:%.*]] = zext <2 x i8> [[TMP121]] to <2 x i32>
+; CHECK-NEXT: [[TMP123:%.*]] = sub <2 x i32> [[TMP120]], [[TMP122]]
+; CHECK-NEXT: [[TMP124:%.*]] = shl <2 x i32> [[TMP123]], <i32 16, i32 16>
+; CHECK-NEXT: [[TMP125:%.*]] = shufflevector <2 x i32> [[TMP106]], <2 x i32> [[TMP97]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT: [[TMP126:%.*]] = sub <2 x i32> [[TMP125]], [[TMP110]]
+; CHECK-NEXT: [[TMP127:%.*]] = add <2 x i32> [[TMP116]], [[TMP126]]
+; CHECK-NEXT: [[TMP128:%.*]] = shufflevector <2 x i32> [[TMP107]], <2 x i32> [[TMP97]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP129:%.*]] = sub <2 x i32> [[TMP128]], [[TMP118]]
+; CHECK-NEXT: [[TMP130:%.*]] = add <2 x i32> [[TMP124]], [[TMP129]]
+; CHECK-NEXT: [[TMP131:%.*]] = extractelement <2 x i32> [[TMP127]], i32 1
+; CHECK-NEXT: [[TMP132:%.*]] = extractelement <2 x i32> [[TMP130]], i32 1
+; CHECK-NEXT: [[ADD46:%.*]] = add i32 [[TMP132]], [[TMP131]]
+; CHECK-NEXT: [[TMP133:%.*]] = sub <2 x i32> [[TMP127]], [[TMP130]]
+; CHECK-NEXT: [[TMP134:%.*]] = extractelement <2 x i32> [[TMP127]], i32 0
+; CHECK-NEXT: [[TMP135:%.*]] = extractelement <2 x i32> [[TMP130]], i32 0
+; CHECK-NEXT: [[ADD44:%.*]] = add i32 [[TMP135]], [[TMP134]]
+; CHECK-NEXT: [[TMP136:%.*]] = lshr <2 x i32> [[TMP107]], <i32 15, i32 15>
+; CHECK-NEXT: [[TMP137:%.*]] = and <2 x i32> [[TMP136]], <i32 65537, i32 65537>
+; CHECK-NEXT: [[TMP138:%.*]] = mul <2 x i32> [[TMP137]], <i32 65535, i32 65535>
+; CHECK-NEXT: [[TMP139:%.*]] = extractelement <2 x i32> [[TMP133]], i32 0
+; CHECK-NEXT: [[TMP140:%.*]] = extractelement <2 x i32> [[TMP133]], i32 1
+; CHECK-NEXT: [[SUB59:%.*]] = sub i32 [[TMP139]], [[TMP140]]
+; CHECK-NEXT: [[TMP141:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8_1]], align 1
+; CHECK-NEXT: [[TMP142:%.*]] = zext <2 x i8> [[TMP141]] to <2 x i32>
; CHECK-NEXT: [[ADD_PTR644:%.*]] = getelementptr i8, ptr [[PIX2]], i64 [[IDX_EXT63]]
; CHECK-NEXT: [[ARRAYIDX22_1:%.*]] = getelementptr i8, ptr [[ADD_PTR644]], i64 2
-; CHECK-NEXT: [[TMP144:%.*]] = insertelement <2 x ptr> poison, ptr [[ADD_PTR644]], i32 0
-; CHECK-NEXT: [[TMP145:%.*]] = insertelement <2 x ptr> [[TMP144]], ptr [[ARRAYIDX22_1]], i32 1
-; CHECK-NEXT: [[TMP146:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP145]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP147:%.*]] = zext <2 x i8> [[TMP146]] to <2 x i32>
-; CHECK-NEXT: [[TMP148:%.*]] = insertelement <2 x ptr> poison, ptr [[ADD_PTR3]], i32 0
-; CHECK-NEXT: [[TMP149:%.*]] = shufflevector <2 x ptr> [[TMP148]], <2 x ptr> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP150:%.*]] = getelementptr i8, <2 x ptr> [[TMP149]], <2 x i64> <i64 4, i64 6>
-; CHECK-NEXT: [[TMP151:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP150]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP152:%.*]] = zext <2 x i8> [[TMP151]] to <2 x i32>
-; CHECK-NEXT: [[TMP153:%.*]] = shufflevector <2 x ptr> [[TMP145]], <2 x ptr> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP154:%.*]] = getelementptr i8, <2 x ptr> [[TMP153]], <2 x i64> <i64 4, i64 6>
-; CHECK-NEXT: [[TMP155:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP154]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP156:%.*]] = zext <2 x i8> [[TMP155]] to <2 x i32>
-; CHECK-NEXT: [[TMP157:%.*]] = sub <2 x i32> [[TMP152]], [[TMP156]]
-; CHECK-NEXT: [[TMP158:%.*]] = shl <2 x i32> [[TMP157]], <i32 16, i32 16>
-; CHECK-NEXT: [[TMP159:%.*]] = getelementptr i8, <2 x ptr> [[TMP153]], <2 x i64> <i64 1, i64 3>
-; CHECK-NEXT: [[TMP160:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP159]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP161:%.*]] = zext <2 x i8> [[TMP160]] to <2 x i32>
-; CHECK-NEXT: [[TMP162:%.*]] = getelementptr i8, <2 x ptr> [[TMP149]], <2 x i64> <i64 5, i64 7>
-; CHECK-NEXT: [[TMP163:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP162]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP164:%.*]] = zext <2 x i8> [[TMP163]] to <2 x i32>
-; CHECK-NEXT: [[TMP165:%.*]] = getelementptr i8, <2 x ptr> [[TMP153]], <2 x i64> <i64 5, i64 7>
-; CHECK-NEXT: [[TMP166:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP165]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP167:%.*]] = zext <2 x i8> [[TMP166]] to <2 x i32>
-; CHECK-NEXT: [[TMP168:%.*]] = sub <2 x i32> [[TMP164]], [[TMP167]]
-; CHECK-NEXT: [[TMP169:%.*]] = shl <2 x i32> [[TMP168]], <i32 16, i32 16>
-; CHECK-NEXT: [[TMP170:%.*]] = insertelement <2 x i32> [[TMP143]], i32 [[CONV33_1]], i32 1
-; CHECK-NEXT: [[TMP171:%.*]] = sub <2 x i32> [[TMP170]], [[TMP161]]
-; CHECK-NEXT: [[TMP172:%.*]] = add <2 x i32> [[TMP169]], [[TMP171]]
-; CHECK-NEXT: [[TMP173:%.*]] = insertelement <2 x i32> [[TMP143]], i32 [[CONV_1]], i32 0
-; CHECK-NEXT: [[TMP174:%.*]] = sub <2 x i32> [[TMP173]], [[TMP147]]
-; CHECK-NEXT: [[TMP175:%.*]] = add <2 x i32> [[TMP158]], [[TMP174]]
-; CHECK-NEXT: [[TMP176:%.*]] = add <2 x i32> [[TMP172]], [[TMP175]]
-; CHECK-NEXT: [[TMP177:%.*]] = sub <2 x i32> [[TMP175]], [[TMP172]]
-; CHECK-NEXT: [[TMP178:%.*]] = extractelement <2 x i32> [[TMP176]], i32 0
-; CHECK-NEXT: [[TMP179:%.*]] = extractelement <2 x i32> [[TMP176]], i32 1
-; CHECK-NEXT: [[SUB51_1:%.*]] = sub i32 [[TMP178]], [[TMP179]]
-; CHECK-NEXT: [[TMP180:%.*]] = shufflevector <2 x i32> [[TMP177]], <2 x i32> [[TMP134]], <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT: [[TMP181:%.*]] = shufflevector <2 x i32> [[TMP177]], <2 x i32> [[TMP134]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[TMP182:%.*]] = add <2 x i32> [[TMP180]], [[TMP181]]
-; CHECK-NEXT: [[TMP183:%.*]] = extractelement <2 x i32> [[TMP177]], i32 0
-; CHECK-NEXT: [[TMP184:%.*]] = extractelement <2 x i32> [[TMP177]], i32 1
-; CHECK-NEXT: [[SUB59_1:%.*]] = sub i32 [[TMP183]], [[TMP184]]
-; CHECK-NEXT: [[SHR_I54:%.*]] = lshr i32 [[TMP179]], 15
+; CHECK-NEXT: [[TMP143:%.*]] = insertelement <2 x ptr> poison, ptr [[ADD_PTR644]], i32 0
+; CHECK-NEXT: [[TMP144:%.*]] = insertelement <2 x ptr> [[TMP143]], ptr [[ARRAYIDX22_1]], i32 1
+; CHECK-NEXT: [[TMP145:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP144]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT: [[TMP146:%.*]] = zext <2 x i8> [[TMP145]] to <2 x i32>
+; CHECK-NEXT: [[TMP147:%.*]] = insertelement <2 x ptr> poison, ptr [[ADD_PTR3]], i32 0
+; CHECK-NEXT: [[TMP148:%.*]] = shufflevector <2 x ptr> [[TMP147]], <2 x ptr> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP149:%.*]] = getelementptr i8, <2 x ptr> [[TMP148]], <2 x i64> <i64 4, i64 6>
+; CHECK-NEXT: [[TMP150:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP149]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT: [[TMP151:%.*]] = zext <2 x i8> [[TMP150]] to <2 x i32>
+; CHECK-NEXT: [[TMP152:%.*]] = shufflevector <2 x ptr> [[TMP144]], <2 x ptr> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP153:%.*]] = getelementptr i8, <2 x ptr> [[TMP152]], <2 x i64> <i64 4, i64 6>
+; CHECK-NEXT: [[TMP154:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP153]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT: [[TMP155:%.*]] = zext <2 x i8> [[TMP154]] to <2 x i32>
+; CHECK-NEXT: [[TMP156:%.*]] = sub <2 x i32> [[TMP151]], [[TMP155]]
+; CHECK-NEXT: [[TMP157:%.*]] = shl <2 x i32> [[TMP156]], <i32 16, i32 16>
+; CHECK-NEXT: [[TMP158:%.*]] = getelementptr i8, <2 x ptr> [[TMP152]], <2 x i64> <i64 1, i64 3>
+; CHECK-NEXT: [[TMP159:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP158]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT: [[TMP160:%.*]] = zext <2 x i8> [[TMP159]] to <2 x i32>
+; CHECK-NEXT: [[TMP161:%.*]] = getelementptr i8, <2 x ptr> [[TMP148]], <2 x i64> <i64 5, i64 7>
+; CHECK-NEXT: [[TMP162:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP161]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT: [[TMP163:%.*]] = zext <2 x i8> [[TMP162]] to <2 x i32>
+; CHECK-NEXT: [[TMP164:%.*]] = getelementptr i8, <2 x ptr> [[TMP152]], <2 x i64> <i64 5, i64 7>
+; CHECK-NEXT: [[TMP165:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP164]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT: [[TMP166:%.*]] = zext <2 x i8> [[TMP165]] to <2 x i32>
+; CHECK-NEXT: [[TMP167:%.*]] = sub <2 x i32> [[TMP163]], [[TMP166]]
+; CHECK-NEXT: [[TMP168:%.*]] = shl <2 x i32> [[TMP167]], <i32 16, i32 16>
+; CHECK-NEXT: [[TMP169:%.*]] = insertelement <2 x i32> [[TMP142]], i32 [[CONV33_1]], i32 1
+; CHECK-NEXT: [[TMP170:%.*]] = sub <2 x i32> [[TMP169]], [[TMP160]]
+; CHECK-NEXT: [[TMP171:%.*]] = add <2 x i32> [[TMP168]], [[TMP170]]
+; CHECK-NEXT: [[TMP172:%.*]] = insertelement <2 x i32> [[TMP142]], i32 [[CONV_1]], i32 0
+; CHECK-NEXT: [[TMP173:%.*]] = sub <2 x i32> [[TMP172]], [[TMP146]]
+; CHECK-NEXT: [[TMP174:%.*]] = add <2 x i32> [[TMP157]], [[TMP173]]
+; CHECK-NEXT: [[TMP175:%.*]] = add <2 x i32> [[TMP171]], [[TMP174]]
+; CHECK-NEXT: [[TMP176:%.*]] = sub <2 x i32> [[TMP174]], [[TMP171]]
+; CHECK-NEXT: [[TMP177:%.*]] = extractelement <2 x i32> [[TMP175]], i32 0
+; CHECK-NEXT: [[TMP178:%.*]] = extractelement <2 x i32> [[TMP175]], i32 1
+; CHECK-NEXT: [[SUB51_1:%.*]] = sub i32 [[TMP177]], [[TMP178]]
+; CHECK-NEXT: [[TMP179:%.*]] = shufflevector <2 x i32> [[TMP176]], <2 x i32> [[TMP133]], <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT: [[TMP180:%.*]] = shufflevector <2 x i32> [[TMP176]], <2 x i32> [[TMP133]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT: [[TMP181:%.*]] = add <2 x i32> [[TMP179]], [[TMP180]]
+; CHECK-NEXT: [[TMP182:%.*]] = extractelement <2 x i32> [[TMP176]], i32 0
+; CHECK-NEXT: [[TMP183:%.*]] = extractelement <2 x i32> [[TMP176]], i32 1
+; CHECK-NEXT: [[SUB59_1:%.*]] = sub i32 [[TMP182]], [[TMP183]]
+; CHECK-NEXT: [[SHR_I54:%.*]] = lshr i32 [[TMP178]], 15
; CHECK-NEXT: [[AND_I55:%.*]] = and i32 [[SHR_I54]], 65537
; CHECK-NEXT: [[MUL_I56:%.*]] = mul i32 [[AND_I55]], 65535
-; CHECK-NEXT: [[TMP185:%.*]] = lshr <2 x i32> [[TMP143]], <i32 15, i32 15>
-; CHECK-NEXT: [[TMP186:%.*]] = and <2 x i32> [[TMP185]], <i32 65537, i32 65537>
-; CHECK-NEXT: [[TMP187:%.*]] = mul <2 x i32> [[TMP186]], <i32 65535, i32 65535>
-; CHECK-NEXT: [[TMP188:%.*]] = insertelement <2 x i32> poison, i32 [[SUB59_1]], i32 0
-; CHECK-NEXT: [[TMP189:%.*]] = shufflevector <2 x i32> [[TMP188]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP190:%.*]] = extractelement <2 x i32> [[TMP182]], i32 0
-; CHECK-NEXT: [[TMP191:%.*]] = extractelement <2 x i32> [[TMP182]], i32 1
-; CHECK-NEXT: [[ADD78_1:%.*]] = add i32 [[TMP190]], [[TMP191]]
-; CHECK-NEXT: [[TMP192:%.*]] = shufflevector <2 x i32> [[TMP33]], <2 x i32> [[TMP177]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP193:%.*]] = lshr <2 x i32> [[TMP192]], <i32 15, i32 15>
-; CHECK-NEXT: [[TMP194:%.*]] = and <2 x i32> [[TMP193]], <i32 65537, i32 65537>
-; CHECK-NEXT: [[TMP195:%.*]] = mul <2 x i32> [[TMP194]], <i32 65535, i32 65535>
-; CHECK-NEXT: [[TMP196:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_1]], i32 0
-; CHECK-NEXT: [[TMP197:%.*]] = shufflevector <2 x i32> [[TMP196]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP198:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_2]], i32 0
-; CHECK-NEXT: [[TMP199:%.*]] = shufflevector <2 x i32> [[TMP198]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP200:%.*]] = insertelement <2 x i32> poison, i32 [[ADD44]], i32 0
-; CHECK-NEXT: [[TMP201:%.*]] = shufflevector <2 x i32> [[TMP200]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP202:%.*]] = insertelement <2 x i32> <i32 15, i32 poison>, i32 [[ADD46]], i32 1
-; CHECK-NEXT: [[TMP203:%.*]] = lshr <2 x i32> [[TMP201]], [[TMP202]]
-; CHECK-NEXT: [[TMP204:%.*]] = sub <2 x i32> [[TMP201]], [[TMP202]]
-; CHECK-NEXT: [[TMP205:%.*]] = shufflevector <2 x i32> [[TMP203]], <2 x i32> [[TMP204]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP206:%.*]] = extractelement <2 x i32> [[TMP205]], i32 1
-; CHECK-NEXT: [[ADD78_2:%.*]] = add i32 [[SUB51_1]], [[TMP206]]
-; CHECK-NEXT: [[TMP207:%.*]] = insertelement <2 x i32> <i32 65537, i32 poison>, i32 [[SUB51_1]], i32 1
-; CHECK-NEXT: [[TMP208:%.*]] = and <2 x i32> [[TMP205]], [[TMP207]]
-; CHECK-NEXT: [[TMP209:%.*]] = sub <2 x i32> [[TMP205]], [[TMP207]]
-; CHECK-NEXT: [[TMP210:%.*]] = shufflevector <2 x i32> [[TMP208]], <2 x i32> [[TMP209]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP211:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_2]], i32 0
-; CHECK-NEXT: [[TMP212:%.*]] = shufflevector <2 x i32> [[TMP211]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP213:%.*]] = add <2 x i32> [[TMP212]], [[TMP199]]
-; CHECK-NEXT: [[TMP214:%.*]] = sub <2 x i32> [[TMP212]], [[TMP199]]
-; CHECK-NEXT: [[TMP215:%.*]] = shufflevector <2 x i32> [[TMP213]], <2 x i32> [[TMP214]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP216:%.*]] = insertelement <2 x i32> [[TMP134]], i32 [[CONV_1]], i32 0
-; CHECK-NEXT: [[TMP217:%.*]] = lshr <2 x i32> [[TMP216]], <i32 15, i32 15>
-; CHECK-NEXT: [[TMP218:%.*]] = and <2 x i32> [[TMP217]], <i32 65537, i32 65537>
-; CHECK-NEXT: [[TMP219:%.*]] = mul <2 x i32> [[TMP218]], <i32 65535, i32 65535>
-; CHECK-NEXT: [[TMP220:%.*]] = shufflevector <2 x i32> [[TMP88]], <2 x i32> poison, <2 x i32> <i32 1, i32 poison>
-; CHECK-NEXT: [[TMP221:%.*]] = shufflevector <2 x i32> [[TMP220]], <2 x i32> [[TMP182]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP222:%.*]] = shufflevector <2 x i32> [[TMP88]], <2 x i32> [[TMP182]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[TMP223:%.*]] = sub <2 x i32> [[TMP221]], [[TMP222]]
-; CHECK-NEXT: [[TMP224:%.*]] = shufflevector <2 x i32> [[TMP47]], <2 x i32> poison, <2 x i32> <i32 1, i32 poison>
-; CHECK-NEXT: [[TMP225:%.*]] = insertelement <2 x i32> [[TMP224]], i32 [[ADD46]], i32 1
-; CHECK-NEXT: [[TMP226:%.*]] = insertelement <2 x i32> [[TMP47]], i32 [[ADD44]], i32 1
-; CHECK-NEXT: [[TMP227:%.*]] = add <2 x i32> [[TMP225]], [[TMP226]]
-; CHECK-NEXT: [[TMP228:%.*]] = shufflevector <2 x i32> [[TMP80]], <2 x i32> [[TMP176]], <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT: [[TMP229:%.*]] = shufflevector <2 x i32> [[TMP80]], <2 x i32> [[TMP176]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[TMP230:%.*]] = add <2 x i32> [[TMP228]], [[TMP229]]
-; CHECK-NEXT: [[TMP231:%.*]] = extractelement <2 x i32> [[TMP227]], i32 0
-; CHECK-NEXT: [[TMP232:%.*]] = extractelement <2 x i32> [[TMP230]], i32 0
-; CHECK-NEXT: [[ADD94_1:%.*]] = add i32 [[TMP232]], [[TMP231]]
-; CHECK-NEXT: [[TMP233:%.*]] = insertelement <2 x i32> [[TMP14]], i32 [[ADD46]], i32 1
-; CHECK-NEXT: [[TMP234:%.*]] = lshr <2 x i32> [[TMP233]], <i32 15, i32 15>
-; CHECK-NEXT: [[TMP235:%.*]] = and <2 x i32> [[TMP234]], <i32 65537, i32 65537>
-; CHECK-NEXT: [[TMP236:%.*]] = mul <2 x i32> [[TMP235]], <i32 65535, i32 65535>
-; CHECK-NEXT: [[TMP237:%.*]] = extractelement <2 x i32> [[TMP227]], i32 1
-; CHECK-NEXT: [[TMP238:%.*]] = extractelement <2 x i32> [[TMP230]], i32 1
-; CHECK-NEXT: [[ADD78:%.*]] = add i32 [[TMP238]], [[TMP237]]
-; CHECK-NEXT: [[TMP239:%.*]] = sub <2 x i32> [[TMP227]], [[TMP230]]
+; CHECK-NEXT: [[TMP184:%.*]] = lshr <2 x i32> [[TMP142]], <i32 15, i32 15>
+; CHECK-NEXT: [[TMP185:%.*]] = and <2 x i32> [[TMP184]], <i32 65537, i32 65537>
+; CHECK-NEXT: [[TMP186:%.*]] = mul <2 x i32> [[TMP185]], <i32 65535, i32 65535>
+; CHECK-NEXT: [[TMP187:%.*]] = insertelement <2 x i32> poison, i32 [[SUB59_1]], i32 0
+; CHECK-NEXT: [[TMP188:%.*]] = shufflevector <2 x i32> [[TMP187]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP189:%.*]] = extractelement <2 x i32> [[TMP181]], i32 0
+; CHECK-NEXT: [[TMP190:%.*]] = extractelement <2 x i32> [[TMP181]], i32 1
+; CHECK-NEXT: [[ADD78_1:%.*]] = add i32 [[TMP189]], [[TMP190]]
+; CHECK-NEXT: [[TMP191:%.*]] = shufflevector <2 x i32> [[TMP33]], <2 x i32> [[TMP176]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP192:%.*]] = lshr <2 x i32> [[TMP191]], <i32 15, i32 15>
+; CHECK-NEXT: [[TMP193:%.*]] = and <2 x i32> [[TMP192]], <i32 65537, i32 65537>
+; CHECK-NEXT: [[TMP194:%.*]] = mul <2 x i32> [[TMP193]], <i32 65535, i32 65535>
+; CHECK-NEXT: [[TMP195:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_1]], i32 0
+; CHECK-NEXT: [[TMP196:%.*]] = shufflevector <2 x i32> [[TMP195]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP197:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_2]], i32 0
+; CHECK-NEXT: [[TMP198:%.*]] = shufflevector <2 x i32> [[TMP197]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP199:%.*]] = insertelement <2 x i32> poison, i32 [[ADD44]], i32 0
+; CHECK-NEXT: [[TMP200:%.*]] = shufflevector <2 x i32> [[TMP199]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP201:%.*]] = insertelement <2 x i32> <i32 15, i32 poison>, i32 [[ADD46]], i32 1
+; CHECK-NEXT: [[TMP202:%.*]] = lshr <2 x i32> [[TMP200]], [[TMP201]]
+; CHECK-NEXT: [[TMP203:%.*]] = sub <2 x i32> [[TMP200]], [[TMP201]]
+; CHECK-NEXT: [[TMP204:%.*]] = shufflevector <2 x i32> [[TMP202]], <2 x i32> [[TMP203]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP205:%.*]] = extractelement <2 x i32> [[TMP204]], i32 1
+; CHECK-NEXT: [[ADD78_2:%.*]] = add i32 [[SUB51_1]], [[TMP205]]
+; CHECK-NEXT: [[TMP206:%.*]] = insertelement <2 x i32> <i32 65537, i32 poison>, i32 [[SUB51_1]], i32 1
+; CHECK-NEXT: [[TMP207:%.*]] = and <2 x i32> [[TMP204]], [[TMP206]]
+; CHECK-NEXT: [[TMP208:%.*]] = sub <2 x i32> [[TMP204]], [[TMP206]]
+; CHECK-NEXT: [[TMP209:%.*]] = shufflevector <2 x i32> [[TMP207]], <2 x i32> [[TMP208]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP210:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_2]], i32 0
+; CHECK-NEXT: [[TMP211:%.*]] = shufflevector <2 x i32> [[TMP210]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP212:%.*]] = add <2 x i32> [[TMP211]], [[TMP198]]
+; CHECK-NEXT: [[TMP213:%.*]] = sub <2 x i32> [[TMP211]], [[TMP198]]
+; CHECK-NEXT: [[TMP214:%.*]] = shufflevector <2 x i32> [[TMP212]], <2 x i32> [[TMP213]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP215:%.*]] = insertelement <2 x i32> [[TMP133]], i32 [[CONV_1]], i32 0
+; CHECK-NEXT: [[TMP216:%.*]] = lshr <2 x i32> [[TMP215]], <i32 15, i32 15>
+; CHECK-NEXT: [[TMP217:%.*]] = and <2 x i32> [[TMP216]], <i32 65537, i32 65537>
+; CHECK-NEXT: [[TMP218:%.*]] = mul <2 x i32> [[TMP217]], <i32 65535, i32 65535>
+; CHECK-NEXT: [[TMP219:%.*]] = shufflevector <2 x i32> [[TMP87]], <2 x i32> poison, <2 x i32> <i32 1, i32 poison>
+; CHECK-NEXT: [[TMP220:%.*]] = shufflevector <2 x i32> [[TMP219]], <2 x i32> [[TMP181]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP221:%.*]] = shufflevector <2 x i32> [[TMP87]], <2 x i32> [[TMP181]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT: [[TMP222:%.*]] = sub <2 x i32> [[TMP220]], [[TMP221]]
+; CHECK-NEXT: [[TMP223:%.*]] = shufflevector <2 x i32> [[TMP47]], <2 x i32> poison, <2 x i32> <i32 1, i32 poison>
+; CHECK-NEXT: [[TMP224:%.*]] = insertelement <2 x i32> [[TMP223]], i32 [[ADD46]], i32 1
+; CHECK-NEXT: [[TMP225:%.*]] = insertelement <2 x i32> [[TMP47]], i32 [[ADD44]], i32 1
+; CHECK-NEXT: [[TMP226:%.*]] = add <2 x i32> [[TMP224]], [[TMP225]]
+; CHECK-NEXT: [[TMP227:%.*]] = shufflevector <2 x i32> [[TMP79]], <2 x i32> [[TMP175]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP228:%.*]] = shufflevector <2 x i32> [[TMP79]], <2 x i32> [[TMP175]], <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT: [[TMP229:%.*]] = add <2 x i32> [[TMP227]], [[TMP228]]
+; CHECK-NEXT: [[TMP230:%.*]] = extractelement <2 x i32> [[TMP226]], i32 0
+; CHECK-NEXT: [[TMP231:%.*]] = extractelement <2 x i32> [[TMP229]], i32 0
+; CHECK-NEXT: [[ADD94_1:%.*]] = add i32 [[TMP231]], [[TMP230]]
+; CHECK-NEXT: [[TMP232:%.*]] = insertelement <2 x i32> [[TMP14]], i32 [[ADD46]], i32 1
+; CHECK-NEXT: [[TMP233:%.*]] = lshr <2 x i32> [[TMP232]], <i32 15, i32 15>
+; CHECK-NEXT: [[TMP234:%.*]] = and <2 x i32> [[TMP233]], <i32 65537, i32 65537>
+; CHECK-NEXT: [[TMP235:%.*]] = mul <2 x i32> [[TMP234]], <i32 65535, i32 65535>
+; CHECK-NEXT: [[TMP236:%.*]] = extractelement <2 x i32> [[TMP226]], i32 1
+; CHECK-NEXT: [[TMP237:%.*]] = extractelement <2 x i32> [[TMP229]], i32 1
+; CHECK-NEXT: [[ADD78:%.*]] = add i32 [[TMP237]], [[TMP236]]
+; CHECK-NEXT: [[TMP238:%.*]] = sub <2 x i32> [[TMP226]], [[TMP229]]
; CHECK-NEXT: [[ADD103:%.*]] = add i32 [[ADD94]], [[ADD78]]
; CHECK-NEXT: [[SUB104:%.*]] = sub i32 [[ADD78]], [[ADD94]]
-; CHECK-NEXT: [[TMP240:%.*]] = extractelement <2 x i32> [[TMP239]], i32 1
-; CHECK-NEXT: [[ADD105:%.*]] = add i32 [[SUB102]], [[TMP240]]
+; CHECK-NEXT: [[TMP239:%.*]] = extractelement <2 x i32> [[TMP238]], i32 1
+; CHECK-NEXT: [[ADD105:%.*]] = add i32 [[SUB102]], [[TMP239]]
; CHECK-NEXT: [[ADD_I:%.*]] = add i32 [[MUL_I]], [[ADD103]]
-; CHECK-NEXT: [[XOR_I:%.*]] = xor i32 [[ADD_I]], [[TMP93]]
+; CHECK-NEXT: [[XOR_I:%.*]] = xor i32 [[ADD_I]], [[TMP92]]
; CHECK-NEXT: [[ADD_I52:%.*]] = add i32 [[MUL_I51]], [[ADD105]]
-; CHECK-NEXT: [[XOR_I53:%.*]] = xor i32 [[ADD_I52]], [[TMP94]]
+; CHECK-NEXT: [[XOR_I53:%.*]] = xor i32 [[ADD_I52]], [[TMP93]]
; CHECK-NEXT: [[ADD_I57:%.*]] = add i32 [[MUL_I56]], [[SUB104]]
-; CHECK-NEXT: [[XOR_I58:%.*]] = xor i32 [[ADD_I57]], [[TMP179]]
+; CHECK-NEXT: [[XOR_I58:%.*]] = xor i32 [[ADD_I57]], [[TMP178]]
; CHECK-NEXT: [[ADD110:%.*]] = add i32 [[XOR_I53]], [[XOR_I]]
; CHECK-NEXT: [[ADD112:%.*]] = add i32 [[ADD110]], [[XOR_I58]]
-; CHECK-NEXT: [[TMP241:%.*]] = shufflevector <2 x i32> [[TMP223]], <2 x i32> poison, <2 x i32> <i32 1, i32 poison>
-; CHECK-NEXT: [[TMP242:%.*]] = insertelement <2 x i32> [[TMP241]], i32 [[SUB102]], i32 1
-; CHECK-NEXT: [[TMP243:%.*]] = add <2 x i32> [[TMP239]], [[TMP242]]
-; CHECK-NEXT: [[TMP244:%.*]] = sub <2 x i32> [[TMP239]], [[TMP242]]
-; CHECK-NEXT: [[TMP245:%.*]] = shufflevector <2 x i32> [[TMP243]], <2 x i32> [[TMP244]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP246:%.*]] = add <2 x i32> [[TMP236]], [[TMP245]]
-; CHECK-NEXT: [[TMP247:%.*]] = xor <2 x i32> [[TMP246]], [[TMP233]]
-; CHECK-NEXT: [[TMP248:%.*]] = extractelement <2 x i32> [[TMP247]], i32 1
-; CHECK-NEXT: [[ADD113:%.*]] = add i32 [[ADD112]], [[TMP248]]
-; CHECK-NEXT: [[TMP249:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_1]], i32 0
-; CHECK-NEXT: [[TMP250:%.*]] = shufflevector <2 x i32> [[TMP249]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP251:%.*]] = add <2 x i32> [[TMP197]], [[TMP250]]
-; CHECK-NEXT: [[TMP252:%.*]] = sub <2 x i32> [[TMP197]], [[TMP250]]
-; CHECK-NEXT: [[TMP253:%.*]] = shufflevector <2 x i32> [[TMP251]], <2 x i32> [[TMP252]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP254:%.*]] = add <2 x i32> [[TMP195]], [[TMP253]]
-; CHECK-NEXT: [[TMP255:%.*]] = xor <2 x i32> [[TMP254]], [[TMP192]]
-; CHECK-NEXT: [[TMP256:%.*]] = extractelement <2 x i32> [[TMP247]], i32 0
-; CHECK-NEXT: [[ADD108_1:%.*]] = add i32 [[TMP256]], [[ADD113]]
-; CHECK-NEXT: [[TMP257:%.*]] = extractelement <2 x i32> [[TMP255]], i32 0
-; CHECK-NEXT: [[ADD110_1:%.*]] = add i32 [[ADD108_1]], [[TMP257]]
-; CHECK-NEXT: [[TMP258:%.*]] = extractelement <2 x i32> [[TMP255]], i32 1
-; CHECK-NEXT: [[ADD112_1:%.*]] = add i32 [[ADD110_1]], [[TMP258]]
-; CHECK-NEXT: [[TMP259:%.*]] = shufflevector <2 x i32> [[TMP210]], <2 x i32> poison, <2 x i32> <i32 1, i32 poison>
-; CHECK-NEXT: [[TMP260:%.*]] = shufflevector <2 x i32> [[TMP259]], <2 x i32> [[TMP239]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[TMP261:%.*]] = add <2 x i32> [[TMP223]], [[TMP260]]
-; CHECK-NEXT: [[TMP262:%.*]] = sub <2 x i32> [[TMP223]], [[TMP260]]
-; CHECK-NEXT: [[TMP263:%.*]] = shufflevector <2 x i32> [[TMP261]], <2 x i32> [[TMP262]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP264:%.*]] = add <2 x i32> [[TMP219]], [[TMP263]]
-; CHECK-NEXT: [[TMP265:%.*]] = xor <2 x i32> [[TMP264]], [[TMP216]]
-; CHECK-NEXT: [[TMP266:%.*]] = extractelement <2 x i32> [[TMP265]], i32 1
-; CHECK-NEXT: [[ADD113_1:%.*]] = add i32 [[ADD112_1]], [[TMP266]]
-; CHECK-NEXT: [[TMP267:%.*]] = shufflevector <2 x i32> <i32 65535, i32 poison>, <2 x i32> [[TMP223]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[TMP268:%.*]] = mul <2 x i32> [[TMP210]], [[TMP267]]
-; CHECK-NEXT: [[TMP269:%.*]] = sub <2 x i32> [[TMP210]], [[TMP267]]
-; CHECK-NEXT: [[TMP270:%.*]] = shufflevector <2 x i32> [[TMP268]], <2 x i32> [[TMP269]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP271:%.*]] = add <2 x i32> [[TMP187]], [[TMP215]]
-; CHECK-NEXT: [[TMP272:%.*]] = xor <2 x i32> [[TMP271]], [[TMP143]]
-; CHECK-NEXT: [[TMP273:%.*]] = extractelement <2 x i32> [[TMP270]], i32 0
-; CHECK-NEXT: [[TMP274:%.*]] = extractelement <2 x i32> [[TMP270]], i32 1
-; CHECK-NEXT: [[ADD_I62_2:%.*]] = add i32 [[TMP273]], [[TMP274]]
+; CHECK-NEXT: [[TMP240:%.*]] = shufflevector <2 x i32> [[TMP222]], <2 x i32> poison, <2 x i32> <i32 1, i32 poison>
+; CHECK-NEXT: [[TMP241:%.*]] = insertelement <2 x i32> [[TMP240]], i32 [[SUB102]], i32 1
+; CHECK-NEXT: [[TMP242:%.*]] = add <2 x i32> [[TMP238]], [[TMP241]]
+; CHECK-NEXT: [[TMP243:%.*]] = sub <2 x i32> [[TMP238]], [[TMP241]]
+; CHECK-NEXT: [[TMP244:%.*]] = shufflevector <2 x i32> [[TMP242]], <2 x i32> [[TMP243]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP245:%.*]] = add <2 x i32> [[TMP235]], [[TMP244]]
+; CHECK-NEXT: [[TMP246:%.*]] = xor <2 x i32> [[TMP245]], [[TMP232]]
+; CHECK-NEXT: [[TMP247:%.*]] = extractelement <2 x i32> [[TMP246]], i32 1
+; CHECK-NEXT: [[ADD113:%.*]] = add i32 [[ADD112]], [[TMP247]]
+; CHECK-NEXT: [[TMP248:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_1]], i32 0
+; CHECK-NEXT: [[TMP249:%.*]] = shufflevector <2 x i32> [[TMP248]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP250:%.*]] = add <2 x i32> [[TMP196]], [[TMP249]]
+; CHECK-NEXT: [[TMP251:%.*]] = sub <2 x i32> [[TMP196]], [[TMP249]]
+; CHECK-NEXT: [[TMP252:%.*]] = shufflevector <2 x i32> [[TMP250]], <2 x i32> [[TMP251]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP253:%.*]] = add <2 x i32> [[TMP194]], [[TMP252]]
+; CHECK-NEXT: [[TMP254:%.*]] = xor <2 x i32> [[TMP253]], [[TMP191]]
+; CHECK-NEXT: [[TMP255:%.*]] = extractelement <2 x i32> [[TMP246]], i32 0
+; CHECK-NEXT: [[ADD108_1:%.*]] = add i32 [[TMP255]], [[ADD113]]
+; CHECK-NEXT: [[TMP256:%.*]] = extractelement <2 x i32> [[TMP254]], i32 0
+; CHECK-NEXT: [[ADD110_1:%.*]] = add i32 [[ADD108_1]], [[TMP256]]
+; CHECK-NEXT: [[TMP257:%.*]] = extractelement <2 x i32> [[TMP254]], i32 1
+; CHECK-NEXT: [[ADD112_1:%.*]] = add i32 [[ADD110_1]], [[TMP257]]
+; CHECK-NEXT: [[TMP258:%.*]] = shufflevector <2 x i32> [[TMP209]], <2 x i32> poison, <2 x i32> <i32 1, i32 poison>
+; CHECK-NEXT: [[TMP259:%.*]] = shufflevector <2 x i32> [[TMP258]], <2 x i32> [[TMP238]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT: [[TMP260:%.*]] = add <2 x i32> [[TMP222]], [[TMP259]]
+; CHECK-NEXT: [[TMP261:%.*]] = sub <2 x i32> [[TMP222]], [[TMP259]]
+; CHECK-NEXT: [[TMP262:%.*]] = shufflevector <2 x i32> [[TMP260]], <2 x i32> [[TMP261]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP263:%.*]] = add <2 x i32> [[TMP218]], [[TMP262]]
+; CHECK-NEXT: [[TMP264:%.*]] = xor <2 x i32> [[TMP263]], [[TMP215]]
+; CHECK-NEXT: [[TMP265:%.*]] = extractelement <2 x i32> [[TMP264]], i32 1
+; CHECK-NEXT: [[ADD113_1:%.*]] = add i32 [[ADD112_1]], [[TMP265]]
+; CHECK-NEXT: [[TMP266:%.*]] = shufflevector <2 x i32> <i32 65535, i32 poison>, <2 x i32> [[TMP222]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT: [[TMP267:%.*]] = mul <2 x i32> [[TMP209]], [[TMP266]]
+; CHECK-NEXT: [[TMP268:%.*]] = sub <2 x i32> [[TMP209]], [[TMP266]]
+; CHECK-NEXT: [[TMP269:%.*]] = shufflevector <2 x i32> [[TMP267]], <2 x i32> [[TMP268]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP270:%.*]] = add <2 x i32> [[TMP186]], [[TMP214]]
+; CHECK-NEXT: [[TMP271:%.*]] = xor <2 x i32> [[TMP270]], [[TMP142]]
+; CHECK-NEXT: [[TMP272:%.*]] = extractelement <2 x i32> [[TMP269]], i32 0
+; CHECK-NEXT: [[TMP273:%.*]] = extractelement <2 x i32> [[TMP269]], i32 1
+; CHECK-NEXT: [[ADD_I62_2:%.*]] = add i32 [[TMP272]], [[TMP273]]
; CHECK-NEXT: [[XOR_I63_2:%.*]] = xor i32 [[ADD_I62_2]], [[ADD44]]
-; CHECK-NEXT: [[TMP275:%.*]] = extractelement <2 x i32> [[TMP265]], i32 0
-; CHECK-NEXT: [[ADD108_2:%.*]] = add i32 [[TMP275]], [[ADD113_1]]
-; CHECK-NEXT: [[TMP276:%.*]] = extractelement <2 x i32> [[TMP272]], i32 0
-; CHECK-NEXT: [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[TMP276]]
-; CHECK-NEXT: [[TMP277:%.*]] = extractelement <2 x i32> [[TMP272]], i32 1
-; CHECK-NEXT: [[ADD112_2:%.*]] = add i32 [[ADD110_2]], [[TMP277]]
+; CHECK-NEXT: [[TMP274:%.*]] = extractelement <2 x i32> [[TMP264]], i32 0
+; CHECK-NEXT: [[ADD108_2:%.*]] = add i32 [[TMP274]], [[ADD113_1]]
+; CHECK-NEXT: [[TMP275:%.*]] = extractelement <2 x i32> [[TMP271]], i32 0
+; CHECK-NEXT: [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[TMP275]]
+; CHECK-NEXT: [[TMP276:%.*]] = extractelement <2 x i32> [[TMP271]], i32 1
+; CHECK-NEXT: [[ADD112_2:%.*]] = add i32 [[ADD110_2]], [[TMP276]]
; CHECK-NEXT: [[ADD113_2:%.*]] = add i32 [[ADD112_2]], [[XOR_I63_2]]
-; CHECK-NEXT: [[TMP278:%.*]] = insertelement <2 x i32> poison, i32 [[SUB59]], i32 0
-; CHECK-NEXT: [[TMP279:%.*]] = shufflevector <2 x i32> [[TMP278]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP280:%.*]] = add <2 x i32> [[TMP279]], [[TMP189]]
-; CHECK-NEXT: [[TMP281:%.*]] = sub <2 x i32> [[TMP279]], [[TMP189]]
-; CHECK-NEXT: [[TMP282:%.*]] = shufflevector <2 x i32> [[TMP280]], <2 x i32> [[TMP281]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP283:%.*]] = add <2 x i32> [[TMP105]], [[TMP282]]
-; CHECK-NEXT: [[TMP284:%.*]] = sub <2 x i32> [[TMP282]], [[TMP105]]
-; CHECK-NEXT: [[TMP285:%.*]] = add <2 x i32> [[TMP139]], [[TMP283]]
-; CHECK-NEXT: [[TMP286:%.*]] = xor <2 x i32> [[TMP285]], [[TMP108]]
-; CHECK-NEXT: [[TMP287:%.*]] = lshr <2 x i32> [[TMP98]], <i32 15, i32 15>
-; CHECK-NEXT: [[TMP288:%.*]] = and <2 x i32> [[TMP287]], <i32 65537, i32 65537>
-; CHECK-NEXT: [[TMP289:%.*]] = mul <2 x i32> [[TMP288]], <i32 65535, i32 65535>
-; CHECK-NEXT: [[TMP290:%.*]] = add <2 x i32> [[TMP289]], [[TMP284]]
-; CHECK-NEXT: [[TMP291:%.*]] = xor <2 x i32> [[TMP290]], [[TMP98]]
-; CHECK-NEXT: [[TMP292:%.*]] = extractelement <2 x i32> [[TMP286]], i32 1
-; CHECK-NEXT: [[ADD108_3:%.*]] = add i32 [[TMP292]], [[ADD113_2]]
-; CHECK-NEXT: [[TMP293:%.*]] = extractelement <2 x i32> [[TMP286]], i32 0
-; CHECK-NEXT: [[ADD110_3:%.*]] = add i32 [[ADD108_3]], [[TMP293]]
-; CHECK-NEXT: [[TMP294:%.*]] = extractelement <2 x i32> [[TMP291]], i32 0
-; CHECK-NEXT: [[ADD112_3:%.*]] = add i32 [[ADD110_3]], [[TMP294]]
-; CHECK-NEXT: [[TMP295:%.*]] = extractelement <2 x i32> [[TMP291]], i32 1
-; CHECK-NEXT: [[ADD113_3:%.*]] = add i32 [[ADD112_3]], [[TMP295]]
+; CHECK-NEXT: [[TMP277:%.*]] = insertelement <2 x i32> poison, i32 [[SUB59]], i32 0
+; CHECK-NEXT: [[TMP278:%.*]] = shufflevector <2 x i32> [[TMP277]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP279:%.*]] = add <2 x i32> [[TMP278]], [[TMP188]]
+; CHECK-NEXT: [[TMP280:%.*]] = sub <2 x i32> [[TMP278]], [[TMP188]]
+; CHECK-NEXT: [[TMP281:%.*]] = shufflevector <2 x i32> [[TMP279]], <2 x i32> [[TMP280]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP282:%.*]] = add <2 x i32> [[TMP104]], [[TMP281]]
+; CHECK-NEXT: [[TMP283:%.*]] = sub <2 x i32> [[TMP281]], [[TMP104]]
+; CHECK-NEXT: [[TMP284:%.*]] = add <2 x i32> [[TMP138]], [[TMP282]]
+; CHECK-NEXT: [[TMP285:%.*]] = xor <2 x i32> [[TMP284]], [[TMP107]]
+; CHECK-NEXT: [[TMP286:%.*]] = lshr <2 x i32> [[TMP97]], <i32 15, i32 15>
+; CHECK-NEXT: [[TMP287:%.*]] = and <2 x i32> [[TMP286]], <i32 65537, i32 65537>
+; CHECK-NEXT: [[TMP288:%.*]] = mul <2 x i32> [[TMP287]], <i32 65535, i32 65535>
+; CHECK-NEXT: [[TMP289:%.*]] = add <2 x i32> [[TMP288]], [[TMP283]]
+; CHECK-NEXT: [[TMP290:%.*]] = xor <2 x i32> [[TMP289]], [[TMP97]]
+; CHECK-NEXT: [[TMP291:%.*]] = extractelement <2 x i32> [[TMP285]], i32 1
+; CHECK-NEXT: [[ADD108_3:%.*]] = add i32 [[TMP291]], [[ADD113_2]]
+; CHECK-NEXT: [[TMP292:%.*]] = extractelement <2 x i32> [[TMP285]], i32 0
+; CHECK-NEXT: [[ADD110_3:%.*]] = add i32 [[ADD108_3]], [[TMP292]]
+; CHECK-NEXT: [[TMP293:%.*]] = extractelement <2 x i32> [[TMP290]], i32 0
+; CHECK-NEXT: [[ADD112_3:%.*]] = add i32 [[ADD110_3]], [[TMP293]]
+; CHECK-NEXT: [[TMP294:%.*]] = extractelement <2 x i32> [[TMP290]], i32 1
+; CHECK-NEXT: [[ADD113_3:%.*]] = add i32 [[ADD112_3]], [[TMP294]]
; CHECK-NEXT: ret i32 [[ADD113_3]]
;
entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-vectorized.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-vectorized.ll
index a4cc311d12a217..4b0b41970bbb4d 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-vectorized.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-vectorized.ll
@@ -5,61 +5,12 @@ define void @test(ptr %p, ptr noalias %s) {
; CHECK-LABEL: @test(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [48 x float], ptr [[P:%.*]], i64 0, i64 0
-; CHECK-NEXT: [[I:%.*]] = load float, ptr [[ARRAYIDX]], align 4
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 30
-; CHECK-NEXT: [[I1:%.*]] = load float, ptr [[ARRAYIDX1]], align 4
-; CHECK-NEXT: [[ADD:%.*]] = fsub fast float [[I1]], [[I]]
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[S:%.*]], i64 0
-; CHECK-NEXT: store float [[ADD]], ptr [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 4
-; CHECK-NEXT: [[I2:%.*]] = load float, ptr [[ARRAYIDX4]], align 4
-; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 26
-; CHECK-NEXT: [[I3:%.*]] = load float, ptr [[ARRAYIDX6]], align 4
-; CHECK-NEXT: [[ADD7:%.*]] = fsub fast float [[I3]], [[I2]]
-; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, ptr [[S]], i64 1
-; CHECK-NEXT: store float [[ADD7]], ptr [[ARRAYIDX9]], align 4
-; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 8
-; CHECK-NEXT: [[I4:%.*]] = load float, ptr [[ARRAYIDX11]], align 4
-; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 22
-; CHECK-NEXT: [[I5:%.*]] = load float, ptr [[ARRAYIDX13]], align 4
-; CHECK-NEXT: [[ADD14:%.*]] = fsub fast float [[I5]], [[I4]]
-; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds float, ptr [[S]], i64 2
-; CHECK-NEXT: store float [[ADD14]], ptr [[ARRAYIDX16]], align 4
-; CHECK-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 12
-; CHECK-NEXT: [[I6:%.*]] = load float, ptr [[ARRAYIDX18]], align 4
-; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 18
-; CHECK-NEXT: [[I7:%.*]] = load float, ptr [[ARRAYIDX20]], align 4
-; CHECK-NEXT: [[ADD21:%.*]] = fsub fast float [[I7]], [[I6]]
-; CHECK-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds float, ptr [[S]], i64 3
-; CHECK-NEXT: store float [[ADD21]], ptr [[ARRAYIDX23]], align 4
-; CHECK-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 16
-; CHECK-NEXT: [[I8:%.*]] = load float, ptr [[ARRAYIDX25]], align 4
-; CHECK-NEXT: [[ARRAYIDX27:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 14
-; CHECK-NEXT: [[I9:%.*]] = load float, ptr [[ARRAYIDX27]], align 4
-; CHECK-NEXT: [[ADD28:%.*]] = fsub fast float [[I9]], [[I8]]
-; CHECK-NEXT: [[ARRAYIDX30:%.*]] = getelementptr inbounds float, ptr [[S]], i64 4
-; CHECK-NEXT: store float [[ADD28]], ptr [[ARRAYIDX30]], align 4
-; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 20
-; CHECK-NEXT: [[I10:%.*]] = load float, ptr [[ARRAYIDX32]], align 4
-; CHECK-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 10
-; CHECK-NEXT: [[I11:%.*]] = load float, ptr [[ARRAYIDX34]], align 4
-; CHECK-NEXT: [[ADD35:%.*]] = fsub fast float [[I11]], [[I10]]
-; CHECK-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds float, ptr [[S]], i64 5
-; CHECK-NEXT: store float [[ADD35]], ptr [[ARRAYIDX37]], align 4
-; CHECK-NEXT: [[ARRAYIDX39:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 24
-; CHECK-NEXT: [[I12:%.*]] = load float, ptr [[ARRAYIDX39]], align 4
-; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 6
-; CHECK-NEXT: [[I13:%.*]] = load float, ptr [[ARRAYIDX41]], align 4
-; CHECK-NEXT: [[ADD42:%.*]] = fsub fast float [[I13]], [[I12]]
-; CHECK-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds float, ptr [[S]], i64 6
-; CHECK-NEXT: store float [[ADD42]], ptr [[ARRAYIDX44]], align 4
-; CHECK-NEXT: [[ARRAYIDX46:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 28
-; CHECK-NEXT: [[I14:%.*]] = load float, ptr [[ARRAYIDX46]], align 4
-; CHECK-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 2
-; CHECK-NEXT: [[I15:%.*]] = load float, ptr [[ARRAYIDX48]], align 4
-; CHECK-NEXT: [[ADD49:%.*]] = fsub fast float [[I15]], [[I14]]
-; CHECK-NEXT: [[ARRAYIDX51:%.*]] = getelementptr inbounds float, ptr [[S]], i64 7
-; CHECK-NEXT: store float [[ADD49]], ptr [[ARRAYIDX51]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = call <8 x float> @llvm.experimental.vp.strided.load.v8f32.p0.i64(ptr align 4 [[ARRAYIDX]], i64 16, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
+; CHECK-NEXT: [[TMP1:%.*]] = call <8 x float> @llvm.experimental.vp.strided.load.v8f32.p0.i64(ptr align 4 [[ARRAYIDX1]], i64 -16, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
+; CHECK-NEXT: [[TMP2:%.*]] = fsub fast <8 x float> [[TMP1]], [[TMP0]]
+; CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[ARRAYIDX2]], align 4
; CHECK-NEXT: ret void
;
entry:
@@ -262,67 +213,40 @@ define void @test2(ptr %p, ptr noalias %s, i32 %stride) {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[STR:%.*]] = zext i32 [[STRIDE:%.*]] to i64
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [48 x float], ptr [[P:%.*]], i64 0, i64 2
-; CHECK-NEXT: [[I:%.*]] = load float, ptr [[ARRAYIDX]], align 4
; CHECK-NEXT: [[ST6:%.*]] = mul i64 [[STR]], 7
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[ST6]]
; CHECK-NEXT: [[I1:%.*]] = load float, ptr [[ARRAYIDX1]], align 4
-; CHECK-NEXT: [[ADD:%.*]] = fsub fast float [[I1]], [[I]]
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[S:%.*]], i64 0
-; CHECK-NEXT: store float [[ADD]], ptr [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 6
-; CHECK-NEXT: [[I2:%.*]] = load float, ptr [[ARRAYIDX4]], align 4
; CHECK-NEXT: [[ST5:%.*]] = mul i64 [[STR]], 6
; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[ST5]]
; CHECK-NEXT: [[I3:%.*]] = load float, ptr [[ARRAYIDX6]], align 4
-; CHECK-NEXT: [[ADD7:%.*]] = fsub fast float [[I3]], [[I2]]
-; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, ptr [[S]], i64 1
-; CHECK-NEXT: store float [[ADD7]], ptr [[ARRAYIDX9]], align 4
-; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 10
-; CHECK-NEXT: [[I4:%.*]] = load float, ptr [[ARRAYIDX11]], align 4
; CHECK-NEXT: [[ST4:%.*]] = mul i64 [[STR]], 5
; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[ST4]]
; CHECK-NEXT: [[I5:%.*]] = load float, ptr [[ARRAYIDX13]], align 4
-; CHECK-NEXT: [[ADD14:%.*]] = fsub fast float [[I5]], [[I4]]
-; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds float, ptr [[S]], i64 2
-; CHECK-NEXT: store float [[ADD14]], ptr [[ARRAYIDX16]], align 4
-; CHECK-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 14
-; CHECK-NEXT: [[I6:%.*]] = load float, ptr [[ARRAYIDX18]], align 4
; CHECK-NEXT: [[ST3:%.*]] = mul i64 [[STR]], 4
; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[ST3]]
; CHECK-NEXT: [[I7:%.*]] = load float, ptr [[ARRAYIDX20]], align 4
-; CHECK-NEXT: [[ADD21:%.*]] = fsub fast float [[I7]], [[I6]]
-; CHECK-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds float, ptr [[S]], i64 3
-; CHECK-NEXT: store float [[ADD21]], ptr [[ARRAYIDX23]], align 4
-; CHECK-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 18
; CHECK-NEXT: [[ST2:%.*]] = mul i64 [[STR]], 3
-; CHECK-NEXT: [[I8:%.*]] = load float, ptr [[ARRAYIDX25]], align 4
; CHECK-NEXT: [[ARRAYIDX27:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[ST2]]
; CHECK-NEXT: [[I9:%.*]] = load float, ptr [[ARRAYIDX27]], align 4
-; CHECK-NEXT: [[ADD28:%.*]] = fsub fast float [[I9]], [[I8]]
-; CHECK-NEXT: [[ARRAYIDX30:%.*]] = getelementptr inbounds float, ptr [[S]], i64 4
-; CHECK-NEXT: store float [[ADD28]], ptr [[ARRAYIDX30]], align 4
-; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 22
-; CHECK-NEXT: [[I10:%.*]] = load float, ptr [[ARRAYIDX32]], align 4
; CHECK-NEXT: [[ST1:%.*]] = mul i64 [[STR]], 2
; CHECK-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[ST1]]
; CHECK-NEXT: [[I11:%.*]] = load float, ptr [[ARRAYIDX34]], align 4
-; CHECK-NEXT: [[ADD35:%.*]] = fsub fast float [[I11]], [[I10]]
-; CHECK-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds float, ptr [[S]], i64 5
-; CHECK-NEXT: store float [[ADD35]], ptr [[ARRAYIDX37]], align 4
-; CHECK-NEXT: [[ARRAYIDX39:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 26
-; CHECK-NEXT: [[I12:%.*]] = load float, ptr [[ARRAYIDX39]], align 4
; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[STR]]
; CHECK-NEXT: [[I13:%.*]] = load float, ptr [[ARRAYIDX41]], align 4
-; CHECK-NEXT: [[ADD42:%.*]] = fsub fast float [[I13]], [[I12]]
-; CHECK-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds float, ptr [[S]], i64 6
-; CHECK-NEXT: store float [[ADD42]], ptr [[ARRAYIDX44]], align 4
-; CHECK-NEXT: [[ARRAYIDX46:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 30
-; CHECK-NEXT: [[I14:%.*]] = load float, ptr [[ARRAYIDX46]], align 4
; CHECK-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 0
; CHECK-NEXT: [[I15:%.*]] = load float, ptr [[ARRAYIDX48]], align 4
-; CHECK-NEXT: [[ADD49:%.*]] = fsub fast float [[I15]], [[I14]]
-; CHECK-NEXT: [[ARRAYIDX51:%.*]] = getelementptr inbounds float, ptr [[S]], i64 7
-; CHECK-NEXT: store float [[ADD49]], ptr [[ARRAYIDX51]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = call <8 x float> @llvm.experimental.vp.strided.load.v8f32.p0.i64(ptr align 4 [[ARRAYIDX]], i64 16, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x float> poison, float [[I1]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x float> [[TMP1]], float [[I3]], i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x float> [[TMP2]], float [[I5]], i32 2
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x float> [[TMP3]], float [[I7]], i32 3
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x float> [[TMP4]], float [[I9]], i32 4
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x float> [[TMP5]], float [[I11]], i32 5
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x float> [[TMP6]], float [[I13]], i32 6
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x float> [[TMP7]], float [[I15]], i32 7
+; CHECK-NEXT: [[TMP9:%.*]] = fsub fast <8 x float> [[TMP8]], [[TMP0]]
+; CHECK-NEXT: store <8 x float> [[TMP9]], ptr [[ARRAYIDX2]], align 4
; CHECK-NEXT: ret void
;
entry:
@@ -397,27 +321,12 @@ define void @test3(ptr %p, ptr noalias %s) {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [48 x float], ptr [[P:%.*]], i64 0, i64 0
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[S:%.*]], i64 0
-; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 4
-; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 8
-; CHECK-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 12
-; CHECK-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 16
-; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 20
-; CHECK-NEXT: [[ARRAYIDX39:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 24
-; CHECK-NEXT: [[ARRAYIDX46:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 28
; CHECK-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 23
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x ptr> poison, ptr [[ARRAYIDX]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x ptr> [[TMP0]], ptr [[ARRAYIDX4]], i32 1
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x ptr> [[TMP1]], ptr [[ARRAYIDX11]], i32 2
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x ptr> [[TMP2]], ptr [[ARRAYIDX18]], i32 3
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x ptr> [[TMP3]], ptr [[ARRAYIDX25]], i32 4
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x ptr> [[TMP4]], ptr [[ARRAYIDX32]], i32 5
-; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x ptr> [[TMP5]], ptr [[ARRAYIDX39]], i32 6
-; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x ptr> [[TMP6]], ptr [[ARRAYIDX46]], i32 7
-; CHECK-NEXT: [[TMP8:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> [[TMP7]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> poison)
-; CHECK-NEXT: [[TMP9:%.*]] = load <8 x float>, ptr [[ARRAYIDX48]], align 4
-; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x float> [[TMP9]], <8 x float> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT: [[TMP11:%.*]] = fsub fast <8 x float> [[TMP10]], [[TMP8]]
-; CHECK-NEXT: store <8 x float> [[TMP11]], ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = call <8 x float> @llvm.experimental.vp.strided.load.v8f32.p0.i64(ptr align 4 [[ARRAYIDX]], i64 16, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, ptr [[ARRAYIDX48]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT: [[TMP3:%.*]] = fsub fast <8 x float> [[TMP2]], [[TMP0]]
+; CHECK-NEXT: store <8 x float> [[TMP3]], ptr [[ARRAYIDX2]], align 4
; CHECK-NEXT: ret void
;
entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-with-external-use-ptr.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-with-external-use-ptr.ll
index 5aba9ea115a4b9..ec152c707eec6b 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-with-external-use-ptr.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-with-external-use-ptr.ll
@@ -8,7 +8,7 @@ define i16 @test() {
; CHECK-SAME: () #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[PPREV_058_I:%.*]] = getelementptr [[S:%.*]], ptr null, i64 -1
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x ptr> <ptr poison, ptr null>, ptr [[PPREV_058_I]], i32 0
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x ptr> <ptr null, ptr poison>, ptr [[PPREV_058_I]], i32 1
; CHECK-NEXT: br label [[WHILE_BODY_I:%.*]]
; CHECK: while.body.i:
; CHECK-NEXT: [[TMP1:%.*]] = phi i16 [ 0, [[WHILE_BODY_I]] ], [ 0, [[ENTRY:%.*]] ]
@@ -17,7 +17,7 @@ define i16 @test() {
; CHECK-NEXT: [[TMP4:%.*]] = call <2 x i16> @llvm.masked.gather.v2i16.v2p0(<2 x ptr> [[TMP3]], i32 2, <2 x i1> <i1 true, i1 true>, <2 x i16> poison)
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i16> [[TMP4]], i32 0
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i16> [[TMP4]], i32 1
-; CHECK-NEXT: [[CMP_I178:%.*]] = icmp ult i16 [[TMP6]], [[TMP5]]
+; CHECK-NEXT: [[CMP_I178:%.*]] = icmp ult i16 [[TMP5]], [[TMP6]]
; CHECK-NEXT: br label [[WHILE_BODY_I]]
;
entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads.ll
index 8f2c72bb4c6856..8ab57cc73e646f 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads.ll
@@ -5,14 +5,11 @@ define i32 @sum_of_abs(ptr noalias %a, ptr noalias %b) {
; CHECK-LABEL: define i32 @sum_of_abs
; CHECK-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x ptr> poison, ptr [[A]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x ptr> [[TMP0]], <8 x ptr> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, <8 x ptr> [[TMP1]], <8 x i64> <i64 0, i64 64, i64 128, i64 192, i64 256, i64 320, i64 384, i64 448>
-; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> [[TMP2]], i32 1, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i8> poison)
-; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i8> @llvm.abs.v8i8(<8 x i8> [[TMP3]], i1 false)
-; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i8> [[TMP4]] to <8 x i32>
-; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP5]])
-; CHECK-NEXT: ret i32 [[TMP6]]
+; CHECK-NEXT: [[TMP0:%.*]] = call <8 x i8> @llvm.experimental.vp.strided.load.v8i8.p0.i64(ptr align 1 [[A]], i64 64, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
+; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i8> @llvm.abs.v8i8(<8 x i8> [[TMP0]], i1 false)
+; CHECK-NEXT: [[TMP2:%.*]] = sext <8 x i8> [[TMP1]] to <8 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP2]])
+; CHECK-NEXT: ret i32 [[TMP3]]
;
entry:
%0 = load i8, ptr %a, align 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/gep-nodes-with-non-gep-inst.ll b/llvm/test/Transforms/SLPVectorizer/X86/gep-nodes-with-non-gep-inst.ll
index 96d4c307f1c67f..9e43cefef2801d 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/gep-nodes-with-non-gep-inst.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/gep-nodes-with-non-gep-inst.ll
@@ -30,7 +30,7 @@ define void @test() {
; CHECK-SLP-THRESHOLD: bb:
; CHECK-SLP-THRESHOLD-NEXT: [[TMP0:%.*]] = insertelement <4 x ptr> poison, ptr [[COND_IN_V]], i32 0
; CHECK-SLP-THRESHOLD-NEXT: [[TMP1:%.*]] = shufflevector <4 x ptr> [[TMP0]], <4 x ptr> poison, <4 x i32> zeroinitializer
-; CHECK-SLP-THRESHOLD-NEXT: [[TMP2:%.*]] = getelementptr i64, <4 x ptr> [[TMP1]], <4 x i64> <i64 0, i64 4, i64 8, i64 12>
+; CHECK-SLP-THRESHOLD-NEXT: [[TMP2:%.*]] = getelementptr i64, <4 x ptr> [[TMP1]], <4 x i64> <i64 12, i64 8, i64 4, i64 0>
; CHECK-SLP-THRESHOLD-NEXT: [[TMP3:%.*]] = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> [[TMP2]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i64> poison)
; CHECK-SLP-THRESHOLD-NEXT: [[TMP4:%.*]] = icmp eq <4 x i64> [[TMP3]], zeroinitializer
; CHECK-SLP-THRESHOLD-NEXT: ret void
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll b/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll
index 1add732d32e85c..3bc6e64606e399 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll
@@ -7,7 +7,7 @@ define i32 @test(ptr noalias %p, ptr noalias %addr) {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x ptr> poison, ptr [[ADDR:%.*]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x ptr> [[TMP0]], <8 x ptr> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, <8 x ptr> [[TMP1]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, <8 x ptr> [[TMP1]], <8 x i32> <i32 15, i32 13, i32 11, i32 9, i32 7, i32 5, i32 3, i32 1>
; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP2]], i32 8, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> poison)
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x ptr> poison, ptr [[P:%.*]], i32 0
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x ptr> [[TMP4]], <8 x ptr> poison, <8 x i32> zeroinitializer
More information about the llvm-commits
mailing list