[llvm] 6a3a5ca - Revert "[SLP]Add support for strided loads."
Alexey Bataev via llvm-commits
llvm-commits at lists.llvm.org
Mon Feb 12 08:47:40 PST 2024
Author: Alexey Bataev
Date: 2024-02-12T08:47:28-08:00
New Revision: 6a3a5cad2e1249f1b685546ebe71b2ead9a27541
URL: https://github.com/llvm/llvm-project/commit/6a3a5cad2e1249f1b685546ebe71b2ead9a27541
DIFF: https://github.com/llvm/llvm-project/commit/6a3a5cad2e1249f1b685546ebe71b2ead9a27541.diff
LOG: Revert "[SLP]Add support for strided loads."
This reverts commit 0940f9083e68bda78bcbb323c2968a4294092e21 to fix
issues reported in https://github.com/llvm/llvm-project/pull/80310.
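For context on what is being reverted: the original patch taught canVectorizeLoads to classify constant-stride load bundles as strided and made codegen emit llvm.experimental.vp.strided.load for them instead of a masked gather. A condensed sketch of that emission path, lifted from the removed SLPVectorizer.cpp hunk further down (E, PO, DL, Builder, Diff, VecTy, ScalarTy and IsReverseOrder are as in the surrounding code; this is illustrative, not the full patch):

  // Derive a constant stride from the distance between the first and last
  // sorted pointers, then load the whole bundle with one strided load.
  int Stride = *Diff / (static_cast<int>(E->Scalars.size()) - 1);
  Type *StrideTy = DL->getIndexType(PO->getType());
  Value *StrideVal = ConstantInt::get(
      StrideTy,
      (IsReverseOrder ? -1 : 1) * Stride * DL->getTypeAllocSize(ScalarTy));
  Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
  auto *Inst = Builder.CreateIntrinsic(
      Intrinsic::experimental_vp_strided_load, {VecTy, PO->getType(), StrideTy},
      {PO, StrideVal, Builder.getAllOnesMask(VecTy->getElementCount()),
       Builder.getInt32(E->Scalars.size())});
  Inst->addParamAttr(/*ArgNo=*/0, Attribute::getWithAlignment(
                                      Inst->getContext(), CommonAlignment));

The revert removes this path, so such bundles again go through the masked-gather (ScatterVectorize) route, as the updated RISC-V tests below show.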
Added:
Modified:
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-vectorized.ll
llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-with-external-use-ptr.ll
llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads.ll
llvm/test/Transforms/SLPVectorizer/X86/gep-nodes-with-non-gep-inst.ll
llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index c94fb71ab220ba..c0b7298f78005d 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -175,15 +175,6 @@ static cl::opt<int> RootLookAheadMaxDepth(
"slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden,
cl::desc("The maximum look-ahead depth for searching best rooting option"));
-static cl::opt<unsigned> MinProfitableStridedLoads(
- "slp-min-strided-loads", cl::init(2), cl::Hidden,
- cl::desc("The minimum number of loads, which should be considered strided, "
- "if the stride is > 1 or is runtime value"));
-
-static cl::opt<unsigned> MaxProfitableLoadStride(
- "slp-max-stride", cl::init(8), cl::Hidden,
- cl::desc("The maximum stride, considered to be profitable."));
-
static cl::opt<bool>
ViewSLPTree("view-slp-tree", cl::Hidden,
cl::desc("Display the SLP trees with Graphviz"));
@@ -2584,7 +2575,7 @@ class BoUpSLP {
enum EntryState {
Vectorize,
ScatterVectorize,
- StridedVectorize,
+ PossibleStridedVectorize,
NeedToGather
};
EntryState State;
@@ -2762,8 +2753,8 @@ class BoUpSLP {
case ScatterVectorize:
dbgs() << "ScatterVectorize\n";
break;
- case StridedVectorize:
- dbgs() << "StridedVectorize\n";
+ case PossibleStridedVectorize:
+ dbgs() << "PossibleStridedVectorize\n";
break;
case NeedToGather:
dbgs() << "NeedToGather\n";
@@ -3689,7 +3680,7 @@ template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
if (Entry->State == TreeEntry::NeedToGather)
return "color=red";
if (Entry->State == TreeEntry::ScatterVectorize ||
- Entry->State == TreeEntry::StridedVectorize)
+ Entry->State == TreeEntry::PossibleStridedVectorize)
return "color=blue";
return "";
}
@@ -3851,7 +3842,12 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
namespace {
/// Tracks the state in which we can represent the loads in the given sequence.
-enum class LoadsState { Gather, Vectorize, ScatterVectorize, StridedVectorize };
+enum class LoadsState {
+ Gather,
+ Vectorize,
+ ScatterVectorize,
+ PossibleStridedVectorize
+};
} // anonymous namespace
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
@@ -3882,14 +3878,6 @@ static Align computeCommonAlignment(ArrayRef<Value *> VL) {
return CommonAlignment;
}
-/// Check if \p Order represents reverse order.
-static bool isReverseOrder(ArrayRef<unsigned> Order) {
- unsigned Sz = Order.size();
- return !Order.empty() && all_of(enumerate(Order), [&](const auto &Pair) {
- return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
- });
-}
-
/// Checks if the given array of loads can be represented as a vectorized load,
/// a scatter, or just a simple gather.
static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
@@ -3912,8 +3900,7 @@ static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
// Make sure all loads in the bundle are simple - we can't vectorize
// atomic or volatile loads.
PointerOps.clear();
- const unsigned Sz = VL.size();
- PointerOps.resize(Sz);
+ PointerOps.resize(VL.size());
auto *POIter = PointerOps.begin();
for (Value *V : VL) {
auto *L = cast<LoadInst>(V);
@@ -3924,12 +3911,12 @@ static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
}
Order.clear();
- auto *VecTy = FixedVectorType::get(ScalarTy, Sz);
// Check the order of pointer operands or that all pointers are the same.
bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, DL, SE, Order);
if (IsSorted || all_of(PointerOps, [&](Value *P) {
return arePointersCompatible(P, PointerOps.front(), TLI);
})) {
+ bool IsPossibleStrided = false;
if (IsSorted) {
Value *Ptr0;
Value *PtrN;
@@ -3943,71 +3930,30 @@ static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
std::optional<int> Diff =
getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, DL, SE);
// Check that the sorted loads are consecutive.
- if (static_cast<unsigned>(*Diff) == Sz - 1)
+ if (static_cast<unsigned>(*Diff) == VL.size() - 1)
return LoadsState::Vectorize;
// Simple check if not a strided access - clear order.
- bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
- // Try to generate strided load node if:
- // 1. Target with strided load support is detected.
- // 2. The number of loads is greater than MinProfitableStridedLoads,
- // or the potential stride <= MaxProfitableLoadStride and the
- // potential stride is power-of-2 (to avoid perf regressions for the very
- // small number of loads) and max distance > number of loads, or potential
- // stride is -1.
- // 3. The loads are ordered, or number of unordered loads <=
- // MaxProfitableUnorderedLoads, or loads are in reversed order.
- // (this check is to avoid extra costs for very expensive shuffles).
- if (IsPossibleStrided && (((Sz > MinProfitableStridedLoads ||
- (static_cast<unsigned>(std::abs(*Diff)) <=
- MaxProfitableLoadStride * Sz &&
- isPowerOf2_32(std::abs(*Diff)))) &&
- static_cast<unsigned>(std::abs(*Diff)) > Sz) ||
- *Diff == -(static_cast<int>(Sz) - 1))) {
- int Stride = *Diff / static_cast<int>(Sz - 1);
- if (*Diff == Stride * static_cast<int>(Sz - 1)) {
- Align Alignment =
- cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()])
- ->getAlign();
- if (TTI.isLegalStridedLoadStore(VecTy, Alignment)) {
- // Iterate through all pointers and check if all distances are
- // unique multiple of Dist.
- SmallSet<int, 4> Dists;
- for (Value *Ptr : PointerOps) {
- int Dist = 0;
- if (Ptr == PtrN)
- Dist = *Diff;
- else if (Ptr != Ptr0)
- Dist = *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, DL, SE);
- // If the strides are not the same or repeated, we can't
- // vectorize.
- if (((Dist / Stride) * Stride) != Dist ||
- !Dists.insert(Dist).second)
- break;
- }
- if (Dists.size() == Sz)
- return LoadsState::StridedVectorize;
- }
- }
- }
+ IsPossibleStrided = *Diff % (VL.size() - 1) == 0;
}
// TODO: need to improve analysis of the pointers, if not all of them are
// GEPs or have > 2 operands, we end up with a gather node, which just
// increases the cost.
Loop *L = LI.getLoopFor(cast<LoadInst>(VL0)->getParent());
bool ProfitableGatherPointers =
- static_cast<unsigned>(count_if(
- PointerOps,
- [L](Value *V) { return L && L->isLoopInvariant(V); })) <= Sz / 2 &&
- Sz > 2;
+ static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {
+ return L && L->isLoopInvariant(V);
+ })) <= VL.size() / 2 && VL.size() > 2;
if (ProfitableGatherPointers || all_of(PointerOps, [IsSorted](Value *P) {
auto *GEP = dyn_cast<GetElementPtrInst>(P);
return (IsSorted && !GEP && doesNotNeedToBeScheduled(P)) ||
(GEP && GEP->getNumOperands() == 2);
})) {
Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
+ auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
if (TTI.isLegalMaskedGather(VecTy, CommonAlignment) &&
!TTI.forceScalarizeMaskedGather(VecTy, CommonAlignment))
- return LoadsState::ScatterVectorize;
+ return IsPossibleStrided ? LoadsState::PossibleStridedVectorize
+ : LoadsState::ScatterVectorize;
}
}
@@ -4214,7 +4160,7 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
return std::move(ResOrder);
}
if ((TE.State == TreeEntry::Vectorize ||
- TE.State == TreeEntry::StridedVectorize) &&
+ TE.State == TreeEntry::PossibleStridedVectorize) &&
(isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
(TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))) &&
!TE.isAltShuffle())
@@ -4472,7 +4418,7 @@ void BoUpSLP::reorderTopToBottom() {
}
VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
if (!(TE->State == TreeEntry::Vectorize ||
- TE->State == TreeEntry::StridedVectorize) ||
+ TE->State == TreeEntry::PossibleStridedVectorize) ||
!TE->ReuseShuffleIndices.empty())
GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
if (TE->State == TreeEntry::Vectorize &&
@@ -4496,6 +4442,9 @@ void BoUpSLP::reorderTopToBottom() {
MapVector<OrdersType, unsigned,
DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
OrdersUses;
+ // Last chance orders - scatter vectorize. Try to use their orders if there
+ // are no other orders or the order is counted already.
+ SmallVector<OrdersType> StridedVectorizeOrders;
SmallPtrSet<const TreeEntry *, 4> VisitedOps;
for (const TreeEntry *OpTE : OrderedEntries) {
// No need to reorder these nodes; still need to extend and to use shuffle,
@@ -4542,6 +4491,11 @@ void BoUpSLP::reorderTopToBottom() {
if (Order.empty())
continue;
}
+ // Postpone scatter orders.
+ if (OpTE->State == TreeEntry::PossibleStridedVectorize) {
+ StridedVectorizeOrders.push_back(Order);
+ continue;
+ }
// Stores actually store the mask, not the order, need to invert.
if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
@@ -4558,6 +4512,22 @@ void BoUpSLP::reorderTopToBottom() {
++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
}
}
+ // Set order of the user node.
+ if (OrdersUses.empty()) {
+ if (StridedVectorizeOrders.empty())
+ continue;
+ // Add (potentially!) strided vectorize orders.
+ for (OrdersType &Order : StridedVectorizeOrders)
+ ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
+ } else {
+ // Account for (potentially!) strided vectorize orders only if they were
+ // used already.
+ for (OrdersType &Order : StridedVectorizeOrders) {
+ auto *It = OrdersUses.find(Order);
+ if (It != OrdersUses.end())
+ ++It->second;
+ }
+ }
// Choose the most used order.
ArrayRef<unsigned> BestOrder = OrdersUses.front().first;
unsigned Cnt = OrdersUses.front().second;
@@ -4599,7 +4569,7 @@ void BoUpSLP::reorderTopToBottom() {
continue;
}
if ((TE->State == TreeEntry::Vectorize ||
- TE->State == TreeEntry::StridedVectorize) &&
+ TE->State == TreeEntry::PossibleStridedVectorize) &&
isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst,
InsertElementInst>(TE->getMainOp()) &&
!TE->isAltShuffle()) {
@@ -4640,6 +4610,10 @@ bool BoUpSLP::canReorderOperands(
}))
continue;
if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) {
+ // FIXME: Do not reorder (possible!) strided vectorized nodes, they
+ // require reordering of the operands, which is not implemented yet.
+ if (TE->State == TreeEntry::PossibleStridedVectorize)
+ return false;
// Do not reorder if operand node is used by many user nodes.
if (any_of(TE->UserTreeIndices,
[UserTE](const EdgeInfo &EI) { return EI.UserTE != UserTE; }))
@@ -4690,13 +4664,13 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
SmallVector<TreeEntry *> NonVectorized;
for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
if (TE->State != TreeEntry::Vectorize &&
- TE->State != TreeEntry::StridedVectorize)
+ TE->State != TreeEntry::PossibleStridedVectorize)
NonVectorized.push_back(TE.get());
if (std::optional<OrdersType> CurrentOrder =
getReorderingData(*TE, /*TopToBottom=*/false)) {
OrderedEntries.insert(TE.get());
if (!(TE->State == TreeEntry::Vectorize ||
- TE->State == TreeEntry::StridedVectorize) ||
+ TE->State == TreeEntry::PossibleStridedVectorize) ||
!TE->ReuseShuffleIndices.empty())
GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
}
@@ -4714,7 +4688,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
SmallVector<TreeEntry *> Filtered;
for (TreeEntry *TE : OrderedEntries) {
if (!(TE->State == TreeEntry::Vectorize ||
- TE->State == TreeEntry::StridedVectorize ||
+ TE->State == TreeEntry::PossibleStridedVectorize ||
(TE->State == TreeEntry::NeedToGather &&
GathersToOrders.count(TE))) ||
TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
@@ -4759,6 +4733,9 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
MapVector<OrdersType, unsigned,
DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
OrdersUses;
+ // Last chance orders - scatter vectorize. Try to use their orders if there
+ // are no other orders or the order is counted already.
+ SmallVector<std::pair<OrdersType, unsigned>> StridedVectorizeOrders;
// Do the analysis for each tree entry only once, otherwise the order of
// the same node may be considered several times, though it might not be
// profitable.
@@ -4780,6 +4757,11 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
return P.second == OpTE;
});
+ // Postpone scatter orders.
+ if (OpTE->State == TreeEntry::PossibleStridedVectorize) {
+ StridedVectorizeOrders.emplace_back(Order, NumOps);
+ continue;
+ }
// Stores actually store the mask, not the order, need to invert.
if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
@@ -4837,6 +4819,30 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
++Res.first->second;
}
}
+ // If there are no orders, skip the current nodes and jump to the next one, if any.
+ if (OrdersUses.empty()) {
+ if (StridedVectorizeOrders.empty() ||
+ (Data.first->ReorderIndices.empty() &&
+ Data.first->ReuseShuffleIndices.empty() &&
+ !(IgnoreReorder &&
+ Data.first == VectorizableTree.front().get()))) {
+ for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
+ OrderedEntries.remove(Op.second);
+ continue;
+ }
+ // Add (potentially!) strided vectorize orders.
+ for (std::pair<OrdersType, unsigned> &Pair : StridedVectorizeOrders)
+ OrdersUses.insert(std::make_pair(Pair.first, 0)).first->second +=
+ Pair.second;
+ } else {
+ // Account for (potentially!) strided vectorize orders only if they were
+ // used already.
+ for (std::pair<OrdersType, unsigned> &Pair : StridedVectorizeOrders) {
+ auto *It = OrdersUses.find(Pair.first);
+ if (It != OrdersUses.end())
+ It->second += Pair.second;
+ }
+ }
// Choose the best order.
ArrayRef<unsigned> BestOrder = OrdersUses.front().first;
unsigned Cnt = OrdersUses.front().second;
@@ -4872,7 +4878,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
}
// Gathers are processed separately.
if (TE->State != TreeEntry::Vectorize &&
- TE->State != TreeEntry::StridedVectorize &&
+ TE->State != TreeEntry::PossibleStridedVectorize &&
(TE->State != TreeEntry::ScatterVectorize ||
TE->ReorderIndices.empty()))
continue;
@@ -4904,7 +4910,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
Data.first->reorderOperands(Mask);
if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
Data.first->isAltShuffle() ||
- Data.first->State == TreeEntry::StridedVectorize) {
+ Data.first->State == TreeEntry::PossibleStridedVectorize) {
reorderScalars(Data.first->Scalars, Mask);
reorderOrder(Data.first->ReorderIndices, MaskOrder,
/*BottomOrder=*/true);
@@ -4967,6 +4973,7 @@ void BoUpSLP::buildExternalUses(
// instructions. If that is the case, the one in FoundLane will
// be used.
if (UseEntry->State == TreeEntry::ScatterVectorize ||
+ UseEntry->State == TreeEntry::PossibleStridedVectorize ||
!doesInTreeUserNeedToExtract(
Scalar, cast<Instruction>(UseEntry->Scalars.front()), TLI)) {
LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
@@ -5324,8 +5331,8 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
return TreeEntry::Vectorize;
case LoadsState::ScatterVectorize:
return TreeEntry::ScatterVectorize;
- case LoadsState::StridedVectorize:
- return TreeEntry::StridedVectorize;
+ case LoadsState::PossibleStridedVectorize:
+ return TreeEntry::PossibleStridedVectorize;
case LoadsState::Gather:
#ifndef NDEBUG
Type *ScalarTy = VL0->getType();
@@ -5746,7 +5753,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
BasicBlock *BB = nullptr;
bool IsScatterVectorizeUserTE =
UserTreeIdx.UserTE &&
- UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
+ (UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize ||
+ UserTreeIdx.UserTE->State == TreeEntry::PossibleStridedVectorize);
bool AreAllSameInsts =
(S.getOpcode() && allSameBlock(VL)) ||
(S.OpValue->getType()->isPointerTy() && IsScatterVectorizeUserTE &&
@@ -5843,7 +5851,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// Special processing for sorted pointers for ScatterVectorize node with
// constant indices only.
if (AreAllSameInsts && UserTreeIdx.UserTE &&
- UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize &&
+ (UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize ||
+ UserTreeIdx.UserTE->State == TreeEntry::PossibleStridedVectorize) &&
!(S.getOpcode() && allSameBlock(VL))) {
assert(S.OpValue->getType()->isPointerTy() &&
count_if(VL, [](Value *V) { return isa<GetElementPtrInst>(V); }) >=
@@ -6040,17 +6049,18 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
}
TE->setOperandsInOrder();
break;
- case TreeEntry::StridedVectorize:
+ case TreeEntry::PossibleStridedVectorize:
// Vectorizing non-consecutive loads with `llvm.masked.gather`.
if (CurrentOrder.empty()) {
- TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
+ TE = newTreeEntry(VL, TreeEntry::PossibleStridedVectorize, Bundle, S,
UserTreeIdx, ReuseShuffleIndicies);
} else {
- TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
+ TE = newTreeEntry(VL, TreeEntry::PossibleStridedVectorize, Bundle, S,
UserTreeIdx, ReuseShuffleIndicies, CurrentOrder);
}
TE->setOperandsInOrder();
- LLVM_DEBUG(dbgs() << "SLP: added a vector of strided loads.\n");
+ buildTree_rec(PointerOps, Depth + 1, {TE, 0});
+ LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n");
break;
case TreeEntry::ScatterVectorize:
// Vectorizing non-consecutive loads with `llvm.masked.gather`.
@@ -7081,7 +7091,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
!isSplat(Gathers)) {
InstructionCost BaseCost = R.getGatherCost(Gathers, !Root);
SetVector<Value *> VectorizedLoads;
- SmallVector<std::pair<unsigned, LoadsState>> VectorizedStarts;
+ SmallVector<unsigned> VectorizedStarts;
SmallVector<unsigned> ScatterVectorized;
unsigned StartIdx = 0;
unsigned VF = VL.size() / 2;
@@ -7105,16 +7115,12 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
switch (LS) {
case LoadsState::Vectorize:
case LoadsState::ScatterVectorize:
- case LoadsState::StridedVectorize:
+ case LoadsState::PossibleStridedVectorize:
// Mark the vectorized loads so that we don't vectorize them
// again.
// TODO: better handling of loads with reorders.
- if (((LS == LoadsState::Vectorize ||
- LS == LoadsState::StridedVectorize) &&
- CurrentOrder.empty()) ||
- (LS == LoadsState::StridedVectorize &&
- isReverseOrder(CurrentOrder)))
- VectorizedStarts.emplace_back(Cnt, LS);
+ if (LS == LoadsState::Vectorize && CurrentOrder.empty())
+ VectorizedStarts.push_back(Cnt);
else
ScatterVectorized.push_back(Cnt);
VectorizedLoads.insert(Slice.begin(), Slice.end());
@@ -7158,20 +7164,16 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
CostKind, TTI::OperandValueInfo(), LI);
}
auto *LoadTy = FixedVectorType::get(VL.front()->getType(), VF);
- for (const std::pair<unsigned, LoadsState> &P : VectorizedStarts) {
- auto *LI = cast<LoadInst>(VL[P.first]);
+ for (unsigned P : VectorizedStarts) {
+ auto *LI = cast<LoadInst>(VL[P]);
Align Alignment = LI->getAlign();
GatherCost +=
- P.second == LoadsState::Vectorize
- ? TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment,
- LI->getPointerAddressSpace(), CostKind,
- TTI::OperandValueInfo(), LI)
- : TTI.getStridedMemoryOpCost(
- Instruction::Load, LoadTy, LI->getPointerOperand(),
- /*VariableMask=*/false, Alignment, CostKind, LI);
+ TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment,
+ LI->getPointerAddressSpace(), CostKind,
+ TTI::OperandValueInfo(), LI);
// Estimate GEP cost.
SmallVector<Value *> PointerOps(VF);
- for (auto [I, V] : enumerate(VL.slice(P.first, VF)))
+ for (auto [I, V] : enumerate(VL.slice(P, VF)))
PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
auto [ScalarGEPCost, VectorGEPCost] =
getGEPCosts(TTI, PointerOps, LI->getPointerOperand(),
@@ -7911,9 +7913,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
}
InstructionCost CommonCost = 0;
SmallVector<int> Mask;
- bool IsReverseOrder = isReverseOrder(E->ReorderIndices);
if (!E->ReorderIndices.empty() &&
- (E->State != TreeEntry::StridedVectorize || !IsReverseOrder)) {
+ E->State != TreeEntry::PossibleStridedVectorize) {
SmallVector<int> NewMask;
if (E->getOpcode() == Instruction::Store) {
// For stores the order is actually a mask.
@@ -7931,7 +7932,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, FinalVecTy, Mask);
assert((E->State == TreeEntry::Vectorize ||
E->State == TreeEntry::ScatterVectorize ||
- E->State == TreeEntry::StridedVectorize) &&
+ E->State == TreeEntry::PossibleStridedVectorize) &&
"Unhandled state");
assert(E->getOpcode() &&
((allSameType(VL) && allSameBlock(VL)) ||
@@ -7951,8 +7952,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
}
auto GetCastContextHint = [&](Value *V) {
if (const TreeEntry *OpTE = getTreeEntry(V)) {
- if (OpTE->State == TreeEntry::ScatterVectorize ||
- OpTE->State == TreeEntry::StridedVectorize)
+ if (OpTE->State == TreeEntry::ScatterVectorize)
return TTI::CastContextHint::GatherScatter;
if (OpTE->State == TreeEntry::Vectorize &&
OpTE->getOpcode() == Instruction::Load && !OpTE->isAltShuffle()) {
@@ -8028,9 +8028,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
// Calculate cost difference from vectorizing set of GEPs.
// Negative value means vectorizing is profitable.
auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
- assert((E->State == TreeEntry::Vectorize ||
- E->State == TreeEntry::StridedVectorize) &&
- "Entry state expected to be Vectorize or StridedVectorize here.");
+ assert(E->State == TreeEntry::Vectorize &&
+ "Entry state expected to be Vectorize here.");
InstructionCost ScalarCost = 0;
InstructionCost VecCost = 0;
std::tie(ScalarCost, VecCost) = getGEPCosts(
@@ -8383,14 +8382,10 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
VecLdCost = TTI->getMemoryOpCost(
Instruction::Load, VecTy, LI0->getAlign(),
LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
- } else if (E->State == TreeEntry::StridedVectorize) {
- Align CommonAlignment =
- computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
- VecLdCost = TTI->getStridedMemoryOpCost(
- Instruction::Load, VecTy, LI0->getPointerOperand(),
- /*VariableMask=*/false, CommonAlignment, CostKind);
} else {
- assert(E->State == TreeEntry::ScatterVectorize && "Unknown EntryState");
+ assert((E->State == TreeEntry::ScatterVectorize ||
+ E->State == TreeEntry::PossibleStridedVectorize) &&
+ "Unknown EntryState");
Align CommonAlignment =
computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
VecLdCost = TTI->getGatherScatterOpCost(
@@ -8403,7 +8398,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost);
// If this node generates masked gather load then it is not a terminal node.
// Hence address operand cost is estimated separately.
- if (E->State == TreeEntry::ScatterVectorize)
+ if (E->State == TreeEntry::ScatterVectorize ||
+ E->State == TreeEntry::PossibleStridedVectorize)
return Cost;
// Estimate cost of GEPs since this tree node is a terminator.
@@ -8612,7 +8608,7 @@ bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
if (VectorizableTree[0]->State == TreeEntry::NeedToGather ||
(VectorizableTree[1]->State == TreeEntry::NeedToGather &&
VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
- VectorizableTree[0]->State != TreeEntry::StridedVectorize))
+ VectorizableTree[0]->State != TreeEntry::PossibleStridedVectorize))
return false;
return true;
@@ -10583,6 +10579,11 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
bool PostponedPHIs) {
ValueList &VL = E->getOperand(NodeIdx);
+ if (E->State == TreeEntry::PossibleStridedVectorize &&
+ !E->ReorderIndices.empty()) {
+ SmallVector<int> Mask(E->ReorderIndices.begin(), E->ReorderIndices.end());
+ reorderScalars(VL, Mask);
+ }
const unsigned VF = VL.size();
InstructionsState S = getSameOpcode(VL, *TLI);
// Special processing for GEPs bundle, which may include non-gep values.
@@ -11156,7 +11157,6 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
return Vec;
}
- bool IsReverseOrder = isReverseOrder(E->ReorderIndices);
auto FinalShuffle = [&](Value *V, const TreeEntry *E, VectorType *VecTy,
bool IsSigned) {
if (V->getType() != VecTy)
@@ -11167,7 +11167,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
E->ReorderIndices.size());
ShuffleBuilder.add(V, Mask);
- } else if (E->State == TreeEntry::StridedVectorize && IsReverseOrder) {
+ } else if (E->State == TreeEntry::PossibleStridedVectorize) {
ShuffleBuilder.addOrdered(V, std::nullopt);
} else {
ShuffleBuilder.addOrdered(V, E->ReorderIndices);
@@ -11177,7 +11177,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
assert((E->State == TreeEntry::Vectorize ||
E->State == TreeEntry::ScatterVectorize ||
- E->State == TreeEntry::StridedVectorize) &&
+ E->State == TreeEntry::PossibleStridedVectorize) &&
"Unhandled state");
unsigned ShuffleOrOp =
E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
@@ -11642,29 +11642,10 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
Value *PO = LI->getPointerOperand();
if (E->State == TreeEntry::Vectorize) {
NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
- } else if (E->State == TreeEntry::StridedVectorize) {
- Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
- Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
- PO = IsReverseOrder ? PtrN : Ptr0;
- std::optional<int> Diff = getPointersDiff(
- VL0->getType(), Ptr0, VL0->getType(), PtrN, *DL, *SE);
- Type *StrideTy = DL->getIndexType(PO->getType());
- int Stride = *Diff / (static_cast<int>(E->Scalars.size()) - 1);
- Value *StrideVal =
- ConstantInt::get(StrideTy, (IsReverseOrder ? -1 : 1) * Stride *
- DL->getTypeAllocSize(ScalarTy));
- Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
- auto *Inst = Builder.CreateIntrinsic(
- Intrinsic::experimental_vp_strided_load,
- {VecTy, PO->getType(), StrideTy},
- {PO, StrideVal, Builder.getAllOnesMask(VecTy->getElementCount()),
- Builder.getInt32(E->Scalars.size())});
- Inst->addParamAttr(
- /*ArgNo=*/0,
- Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
- NewLI = Inst;
} else {
- assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
+ assert((E->State == TreeEntry::ScatterVectorize ||
+ E->State == TreeEntry::PossibleStridedVectorize) &&
+ "Unhandled state");
Value *VecPtr = vectorizeOperand(E, 0, PostponedPHIs);
if (E->VectorizedValue) {
LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
@@ -12088,11 +12069,8 @@ Value *BoUpSLP::vectorizeTree(
[&](llvm::User *U) {
TreeEntry *UseEntry = getTreeEntry(U);
return UseEntry &&
- (UseEntry->State == TreeEntry::Vectorize ||
- UseEntry->State ==
- TreeEntry::StridedVectorize) &&
- (E->State == TreeEntry::Vectorize ||
- E->State == TreeEntry::StridedVectorize) &&
+ UseEntry->State == TreeEntry::Vectorize &&
+ E->State == TreeEntry::Vectorize &&
doesInTreeUserNeedToExtract(
Scalar,
cast<Instruction>(UseEntry->Scalars.front()),
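The cost-model changes mirror the codegen ones: with the revert, PossibleStridedVectorize nodes are priced through the generic gather/scatter hook again rather than the dedicated strided-load hook. A rough sketch using the two TTI calls that appear in the getEntryCost hunks above (VecTy, LI0, CommonAlignment and CostKind as in that code):

  // Reverted patch: dedicated pricing for constant-stride load bundles.
  InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
      Instruction::Load, VecTy, LI0->getPointerOperand(),
      /*VariableMask=*/false, CommonAlignment, CostKind);
  // After the revert: strided candidates share the masked-gather pricing
  // with ScatterVectorize nodes.
  InstructionCost GatherCost = TTI->getGatherScatterOpCost(
      Instruction::Load, VecTy, LI0->getPointerOperand(),
      /*VariableMask=*/false, CommonAlignment, CostKind);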
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
index e167b6a47af592..dc5fb917886347 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
-; RUN: opt -S -mtriple riscv64-unknown-linux-gnu < %s --passes=slp-vectorizer -mattr=+v -slp-threshold=-40 | FileCheck %s
+; RUN: opt -S -mtriple riscv64-unknown-linux-gnu < %s --passes=slp-vectorizer -mattr=+v -slp-threshold=-80 | FileCheck %s
define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.ptr, ptr %add.ptr64) {
; CHECK-LABEL: define i32 @test(
@@ -67,303 +67,305 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; CHECK-NEXT: [[TMP48:%.*]] = extractelement <2 x i32> [[TMP47]], i32 0
; CHECK-NEXT: [[TMP49:%.*]] = extractelement <2 x i32> [[TMP47]], i32 1
; CHECK-NEXT: [[SUB59_2:%.*]] = sub i32 [[TMP48]], [[TMP49]]
+; CHECK-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr i8, ptr null, i64 4
; CHECK-NEXT: [[TMP50:%.*]] = load i8, ptr null, align 1
; CHECK-NEXT: [[ARRAYIDX20_3:%.*]] = getelementptr i8, ptr null, i64 2
; CHECK-NEXT: [[ARRAYIDX22_3:%.*]] = getelementptr i8, ptr null, i64 2
; CHECK-NEXT: [[TMP51:%.*]] = load i8, ptr null, align 1
-; CHECK-NEXT: [[TMP52:%.*]] = insertelement <2 x ptr> <ptr poison, ptr null>, ptr [[ARRAYIDX20_3]], i32 0
+; CHECK-NEXT: [[TMP52:%.*]] = insertelement <2 x ptr> <ptr null, ptr poison>, ptr [[ARRAYIDX20_3]], i32 1
; CHECK-NEXT: [[TMP53:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP52]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
; CHECK-NEXT: [[TMP54:%.*]] = zext <2 x i8> [[TMP53]] to <2 x i32>
-; CHECK-NEXT: [[TMP55:%.*]] = insertelement <2 x ptr> <ptr poison, ptr null>, ptr [[ARRAYIDX22_3]], i32 0
+; CHECK-NEXT: [[TMP55:%.*]] = insertelement <2 x ptr> <ptr null, ptr poison>, ptr [[ARRAYIDX22_3]], i32 1
; CHECK-NEXT: [[TMP56:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP55]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
; CHECK-NEXT: [[TMP57:%.*]] = zext <2 x i8> [[TMP56]] to <2 x i32>
; CHECK-NEXT: [[TMP58:%.*]] = sub <2 x i32> [[TMP54]], [[TMP57]]
-; CHECK-NEXT: [[TMP59:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 4, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP60:%.*]] = zext <2 x i8> [[TMP59]] to <2 x i32>
-; CHECK-NEXT: [[TMP61:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> getelementptr (i8, <2 x ptr> zeroinitializer, <2 x i64> <i64 6, i64 4>), i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP62:%.*]] = zext <2 x i8> [[TMP61]] to <2 x i32>
-; CHECK-NEXT: [[TMP63:%.*]] = sub <2 x i32> [[TMP60]], [[TMP62]]
-; CHECK-NEXT: [[TMP64:%.*]] = shl <2 x i32> [[TMP63]], <i32 16, i32 16>
-; CHECK-NEXT: [[TMP65:%.*]] = add <2 x i32> [[TMP64]], [[TMP58]]
-; CHECK-NEXT: [[TMP66:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> getelementptr (i8, <2 x ptr> zeroinitializer, <2 x i64> <i64 3, i64 1>), i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP67:%.*]] = zext <2 x i8> [[TMP66]] to <2 x i32>
-; CHECK-NEXT: [[TMP68:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> getelementptr (i8, <2 x ptr> zeroinitializer, <2 x i64> <i64 3, i64 1>), i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP69:%.*]] = zext <2 x i8> [[TMP68]] to <2 x i32>
-; CHECK-NEXT: [[TMP70:%.*]] = sub <2 x i32> [[TMP67]], [[TMP69]]
-; CHECK-NEXT: [[TMP71:%.*]] = insertelement <2 x i8> poison, i8 [[TMP51]], i32 0
-; CHECK-NEXT: [[TMP72:%.*]] = insertelement <2 x i8> [[TMP71]], i8 [[TMP50]], i32 1
-; CHECK-NEXT: [[TMP73:%.*]] = zext <2 x i8> [[TMP72]] to <2 x i32>
-; CHECK-NEXT: [[TMP74:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> getelementptr (i8, <2 x ptr> zeroinitializer, <2 x i64> <i64 7, i64 5>), i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP75:%.*]] = zext <2 x i8> [[TMP74]] to <2 x i32>
-; CHECK-NEXT: [[TMP76:%.*]] = sub <2 x i32> [[TMP73]], [[TMP75]]
-; CHECK-NEXT: [[TMP77:%.*]] = shl <2 x i32> [[TMP76]], <i32 16, i32 16>
-; CHECK-NEXT: [[TMP78:%.*]] = add <2 x i32> [[TMP77]], [[TMP70]]
-; CHECK-NEXT: [[TMP79:%.*]] = sub <2 x i32> [[TMP65]], [[TMP78]]
-; CHECK-NEXT: [[TMP80:%.*]] = shufflevector <2 x i32> [[TMP78]], <2 x i32> [[TMP46]], <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT: [[TMP81:%.*]] = shufflevector <2 x i32> [[TMP65]], <2 x i32> [[TMP30]], <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT: [[TMP82:%.*]] = add <2 x i32> [[TMP80]], [[TMP81]]
-; CHECK-NEXT: [[TMP83:%.*]] = shufflevector <2 x i32> [[TMP78]], <2 x i32> [[TMP46]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP84:%.*]] = shufflevector <2 x i32> [[TMP65]], <2 x i32> [[TMP30]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP85:%.*]] = add <2 x i32> [[TMP83]], [[TMP84]]
-; CHECK-NEXT: [[TMP86:%.*]] = add <2 x i32> [[TMP85]], [[TMP82]]
-; CHECK-NEXT: [[TMP87:%.*]] = sub <2 x i32> [[TMP82]], [[TMP85]]
-; CHECK-NEXT: [[TMP88:%.*]] = extractelement <2 x i32> [[TMP79]], i32 0
-; CHECK-NEXT: [[TMP89:%.*]] = extractelement <2 x i32> [[TMP79]], i32 1
-; CHECK-NEXT: [[SUB59_3:%.*]] = sub i32 [[TMP89]], [[TMP88]]
-; CHECK-NEXT: [[TMP90:%.*]] = extractelement <2 x i32> [[TMP86]], i32 0
-; CHECK-NEXT: [[TMP91:%.*]] = extractelement <2 x i32> [[TMP86]], i32 1
-; CHECK-NEXT: [[ADD94:%.*]] = add i32 [[TMP90]], [[TMP91]]
-; CHECK-NEXT: [[SUB102:%.*]] = sub i32 [[TMP91]], [[TMP90]]
-; CHECK-NEXT: [[TMP92:%.*]] = extractelement <2 x i32> [[TMP54]], i32 1
-; CHECK-NEXT: [[SHR_I:%.*]] = lshr i32 [[TMP92]], 15
+; CHECK-NEXT: [[TMP59:%.*]] = insertelement <2 x ptr> <ptr poison, ptr null>, ptr [[ARRAYIDX3_3]], i32 0
+; CHECK-NEXT: [[TMP60:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP59]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT: [[TMP61:%.*]] = zext <2 x i8> [[TMP60]] to <2 x i32>
+; CHECK-NEXT: [[TMP62:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> getelementptr (i8, <2 x ptr> zeroinitializer, <2 x i64> <i64 4, i64 6>), i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT: [[TMP63:%.*]] = zext <2 x i8> [[TMP62]] to <2 x i32>
+; CHECK-NEXT: [[TMP64:%.*]] = sub <2 x i32> [[TMP61]], [[TMP63]]
+; CHECK-NEXT: [[TMP65:%.*]] = shl <2 x i32> [[TMP64]], <i32 16, i32 16>
+; CHECK-NEXT: [[TMP66:%.*]] = add <2 x i32> [[TMP65]], [[TMP58]]
+; CHECK-NEXT: [[TMP67:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> getelementptr (i8, <2 x ptr> zeroinitializer, <2 x i64> <i64 1, i64 3>), i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT: [[TMP68:%.*]] = zext <2 x i8> [[TMP67]] to <2 x i32>
+; CHECK-NEXT: [[TMP69:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> getelementptr (i8, <2 x ptr> zeroinitializer, <2 x i64> <i64 1, i64 3>), i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT: [[TMP70:%.*]] = zext <2 x i8> [[TMP69]] to <2 x i32>
+; CHECK-NEXT: [[TMP71:%.*]] = sub <2 x i32> [[TMP68]], [[TMP70]]
+; CHECK-NEXT: [[TMP72:%.*]] = insertelement <2 x i8> poison, i8 [[TMP50]], i32 0
+; CHECK-NEXT: [[TMP73:%.*]] = insertelement <2 x i8> [[TMP72]], i8 [[TMP51]], i32 1
+; CHECK-NEXT: [[TMP74:%.*]] = zext <2 x i8> [[TMP73]] to <2 x i32>
+; CHECK-NEXT: [[TMP75:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> getelementptr (i8, <2 x ptr> zeroinitializer, <2 x i64> <i64 5, i64 7>), i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT: [[TMP76:%.*]] = zext <2 x i8> [[TMP75]] to <2 x i32>
+; CHECK-NEXT: [[TMP77:%.*]] = sub <2 x i32> [[TMP74]], [[TMP76]]
+; CHECK-NEXT: [[TMP78:%.*]] = shl <2 x i32> [[TMP77]], <i32 16, i32 16>
+; CHECK-NEXT: [[TMP79:%.*]] = add <2 x i32> [[TMP78]], [[TMP71]]
+; CHECK-NEXT: [[TMP80:%.*]] = sub <2 x i32> [[TMP66]], [[TMP79]]
+; CHECK-NEXT: [[TMP81:%.*]] = shufflevector <2 x i32> [[TMP79]], <2 x i32> [[TMP46]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT: [[TMP82:%.*]] = shufflevector <2 x i32> [[TMP66]], <2 x i32> [[TMP30]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT: [[TMP83:%.*]] = add <2 x i32> [[TMP81]], [[TMP82]]
+; CHECK-NEXT: [[TMP84:%.*]] = shufflevector <2 x i32> [[TMP79]], <2 x i32> [[TMP46]], <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT: [[TMP85:%.*]] = shufflevector <2 x i32> [[TMP66]], <2 x i32> [[TMP30]], <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT: [[TMP86:%.*]] = add <2 x i32> [[TMP84]], [[TMP85]]
+; CHECK-NEXT: [[TMP87:%.*]] = add <2 x i32> [[TMP86]], [[TMP83]]
+; CHECK-NEXT: [[TMP88:%.*]] = sub <2 x i32> [[TMP83]], [[TMP86]]
+; CHECK-NEXT: [[TMP89:%.*]] = extractelement <2 x i32> [[TMP80]], i32 0
+; CHECK-NEXT: [[TMP90:%.*]] = extractelement <2 x i32> [[TMP80]], i32 1
+; CHECK-NEXT: [[SUB59_3:%.*]] = sub i32 [[TMP89]], [[TMP90]]
+; CHECK-NEXT: [[TMP91:%.*]] = extractelement <2 x i32> [[TMP87]], i32 0
+; CHECK-NEXT: [[TMP92:%.*]] = extractelement <2 x i32> [[TMP87]], i32 1
+; CHECK-NEXT: [[ADD94:%.*]] = add i32 [[TMP91]], [[TMP92]]
+; CHECK-NEXT: [[SUB102:%.*]] = sub i32 [[TMP92]], [[TMP91]]
+; CHECK-NEXT: [[TMP93:%.*]] = extractelement <2 x i32> [[TMP54]], i32 0
+; CHECK-NEXT: [[SHR_I:%.*]] = lshr i32 [[TMP93]], 15
; CHECK-NEXT: [[AND_I:%.*]] = and i32 [[SHR_I]], 65537
; CHECK-NEXT: [[MUL_I:%.*]] = mul i32 [[AND_I]], 65535
-; CHECK-NEXT: [[TMP93:%.*]] = extractelement <2 x i32> [[TMP85]], i32 1
-; CHECK-NEXT: [[SHR_I49:%.*]] = lshr i32 [[TMP93]], 15
+; CHECK-NEXT: [[TMP94:%.*]] = extractelement <2 x i32> [[TMP86]], i32 1
+; CHECK-NEXT: [[SHR_I49:%.*]] = lshr i32 [[TMP94]], 15
; CHECK-NEXT: [[AND_I50:%.*]] = and i32 [[SHR_I49]], 65537
; CHECK-NEXT: [[MUL_I51:%.*]] = mul i32 [[AND_I50]], 65535
-; CHECK-NEXT: [[TMP94:%.*]] = extractelement <2 x i32> [[TMP87]], i32 0
-; CHECK-NEXT: [[TMP95:%.*]] = extractelement <2 x i32> [[TMP87]], i32 1
-; CHECK-NEXT: [[ADD94_2:%.*]] = add i32 [[TMP94]], [[TMP95]]
-; CHECK-NEXT: [[TMP96:%.*]] = load <2 x i8>, ptr [[ARRAYIDX20]], align 1
-; CHECK-NEXT: [[TMP97:%.*]] = zext <2 x i8> [[TMP96]] to <2 x i32>
-; CHECK-NEXT: [[TMP98:%.*]] = insertelement <2 x i32> poison, i32 [[SUB59_2]], i32 0
-; CHECK-NEXT: [[TMP99:%.*]] = shufflevector <2 x i32> [[TMP98]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP100:%.*]] = insertelement <2 x i32> poison, i32 [[SUB59_3]], i32 0
-; CHECK-NEXT: [[TMP101:%.*]] = shufflevector <2 x i32> [[TMP100]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP102:%.*]] = add <2 x i32> [[TMP99]], [[TMP101]]
-; CHECK-NEXT: [[TMP103:%.*]] = sub <2 x i32> [[TMP99]], [[TMP101]]
-; CHECK-NEXT: [[TMP104:%.*]] = shufflevector <2 x i32> [[TMP102]], <2 x i32> [[TMP103]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP105:%.*]] = load <2 x i8>, ptr [[PIX1]], align 1
-; CHECK-NEXT: [[TMP106:%.*]] = zext <2 x i8> [[TMP105]] to <2 x i32>
-; CHECK-NEXT: [[TMP107:%.*]] = shufflevector <2 x i32> [[TMP106]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT: [[TMP108:%.*]] = insertelement <2 x ptr> [[TMP4]], ptr [[ARRAYIDX22]], i32 1
-; CHECK-NEXT: [[TMP109:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP108]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP110:%.*]] = zext <2 x i8> [[TMP109]] to <2 x i32>
-; CHECK-NEXT: [[TMP111:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP2]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP112:%.*]] = zext <2 x i8> [[TMP111]] to <2 x i32>
-; CHECK-NEXT: [[TMP113:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP5]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP114:%.*]] = zext <2 x i8> [[TMP113]] to <2 x i32>
-; CHECK-NEXT: [[TMP115:%.*]] = sub <2 x i32> [[TMP112]], [[TMP114]]
-; CHECK-NEXT: [[TMP116:%.*]] = shl <2 x i32> [[TMP115]], <i32 16, i32 16>
-; CHECK-NEXT: [[TMP117:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP6]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP118:%.*]] = zext <2 x i8> [[TMP117]] to <2 x i32>
-; CHECK-NEXT: [[TMP119:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP7]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP120:%.*]] = zext <2 x i8> [[TMP119]] to <2 x i32>
-; CHECK-NEXT: [[TMP121:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP8]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP122:%.*]] = zext <2 x i8> [[TMP121]] to <2 x i32>
-; CHECK-NEXT: [[TMP123:%.*]] = sub <2 x i32> [[TMP120]], [[TMP122]]
-; CHECK-NEXT: [[TMP124:%.*]] = shl <2 x i32> [[TMP123]], <i32 16, i32 16>
-; CHECK-NEXT: [[TMP125:%.*]] = shufflevector <2 x i32> [[TMP106]], <2 x i32> [[TMP97]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[TMP126:%.*]] = sub <2 x i32> [[TMP125]], [[TMP110]]
-; CHECK-NEXT: [[TMP127:%.*]] = add <2 x i32> [[TMP116]], [[TMP126]]
-; CHECK-NEXT: [[TMP128:%.*]] = shufflevector <2 x i32> [[TMP107]], <2 x i32> [[TMP97]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP129:%.*]] = sub <2 x i32> [[TMP128]], [[TMP118]]
-; CHECK-NEXT: [[TMP130:%.*]] = add <2 x i32> [[TMP124]], [[TMP129]]
-; CHECK-NEXT: [[TMP131:%.*]] = extractelement <2 x i32> [[TMP127]], i32 1
-; CHECK-NEXT: [[TMP132:%.*]] = extractelement <2 x i32> [[TMP130]], i32 1
-; CHECK-NEXT: [[ADD46:%.*]] = add i32 [[TMP132]], [[TMP131]]
-; CHECK-NEXT: [[TMP133:%.*]] = sub <2 x i32> [[TMP127]], [[TMP130]]
-; CHECK-NEXT: [[TMP134:%.*]] = extractelement <2 x i32> [[TMP127]], i32 0
-; CHECK-NEXT: [[TMP135:%.*]] = extractelement <2 x i32> [[TMP130]], i32 0
-; CHECK-NEXT: [[ADD44:%.*]] = add i32 [[TMP135]], [[TMP134]]
-; CHECK-NEXT: [[TMP136:%.*]] = lshr <2 x i32> [[TMP107]], <i32 15, i32 15>
-; CHECK-NEXT: [[TMP137:%.*]] = and <2 x i32> [[TMP136]], <i32 65537, i32 65537>
-; CHECK-NEXT: [[TMP138:%.*]] = mul <2 x i32> [[TMP137]], <i32 65535, i32 65535>
-; CHECK-NEXT: [[TMP139:%.*]] = extractelement <2 x i32> [[TMP133]], i32 0
-; CHECK-NEXT: [[TMP140:%.*]] = extractelement <2 x i32> [[TMP133]], i32 1
-; CHECK-NEXT: [[SUB59:%.*]] = sub i32 [[TMP139]], [[TMP140]]
-; CHECK-NEXT: [[TMP141:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8_1]], align 1
-; CHECK-NEXT: [[TMP142:%.*]] = zext <2 x i8> [[TMP141]] to <2 x i32>
+; CHECK-NEXT: [[TMP95:%.*]] = extractelement <2 x i32> [[TMP88]], i32 0
+; CHECK-NEXT: [[TMP96:%.*]] = extractelement <2 x i32> [[TMP88]], i32 1
+; CHECK-NEXT: [[ADD94_2:%.*]] = add i32 [[TMP95]], [[TMP96]]
+; CHECK-NEXT: [[TMP97:%.*]] = load <2 x i8>, ptr [[ARRAYIDX20]], align 1
+; CHECK-NEXT: [[TMP98:%.*]] = zext <2 x i8> [[TMP97]] to <2 x i32>
+; CHECK-NEXT: [[TMP99:%.*]] = insertelement <2 x i32> poison, i32 [[SUB59_2]], i32 0
+; CHECK-NEXT: [[TMP100:%.*]] = shufflevector <2 x i32> [[TMP99]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP101:%.*]] = insertelement <2 x i32> poison, i32 [[SUB59_3]], i32 0
+; CHECK-NEXT: [[TMP102:%.*]] = shufflevector <2 x i32> [[TMP101]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP103:%.*]] = add <2 x i32> [[TMP100]], [[TMP102]]
+; CHECK-NEXT: [[TMP104:%.*]] = sub <2 x i32> [[TMP100]], [[TMP102]]
+; CHECK-NEXT: [[TMP105:%.*]] = shufflevector <2 x i32> [[TMP103]], <2 x i32> [[TMP104]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP106:%.*]] = load <2 x i8>, ptr [[PIX1]], align 1
+; CHECK-NEXT: [[TMP107:%.*]] = zext <2 x i8> [[TMP106]] to <2 x i32>
+; CHECK-NEXT: [[TMP108:%.*]] = shufflevector <2 x i32> [[TMP107]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT: [[TMP109:%.*]] = insertelement <2 x ptr> [[TMP4]], ptr [[ARRAYIDX22]], i32 1
+; CHECK-NEXT: [[TMP110:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP109]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT: [[TMP111:%.*]] = zext <2 x i8> [[TMP110]] to <2 x i32>
+; CHECK-NEXT: [[TMP112:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP2]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT: [[TMP113:%.*]] = zext <2 x i8> [[TMP112]] to <2 x i32>
+; CHECK-NEXT: [[TMP114:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP5]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT: [[TMP115:%.*]] = zext <2 x i8> [[TMP114]] to <2 x i32>
+; CHECK-NEXT: [[TMP116:%.*]] = sub <2 x i32> [[TMP113]], [[TMP115]]
+; CHECK-NEXT: [[TMP117:%.*]] = shl <2 x i32> [[TMP116]], <i32 16, i32 16>
+; CHECK-NEXT: [[TMP118:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP6]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT: [[TMP119:%.*]] = zext <2 x i8> [[TMP118]] to <2 x i32>
+; CHECK-NEXT: [[TMP120:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP7]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT: [[TMP121:%.*]] = zext <2 x i8> [[TMP120]] to <2 x i32>
+; CHECK-NEXT: [[TMP122:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP8]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT: [[TMP123:%.*]] = zext <2 x i8> [[TMP122]] to <2 x i32>
+; CHECK-NEXT: [[TMP124:%.*]] = sub <2 x i32> [[TMP121]], [[TMP123]]
+; CHECK-NEXT: [[TMP125:%.*]] = shl <2 x i32> [[TMP124]], <i32 16, i32 16>
+; CHECK-NEXT: [[TMP126:%.*]] = shufflevector <2 x i32> [[TMP107]], <2 x i32> [[TMP98]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT: [[TMP127:%.*]] = sub <2 x i32> [[TMP126]], [[TMP111]]
+; CHECK-NEXT: [[TMP128:%.*]] = add <2 x i32> [[TMP117]], [[TMP127]]
+; CHECK-NEXT: [[TMP129:%.*]] = shufflevector <2 x i32> [[TMP108]], <2 x i32> [[TMP98]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP130:%.*]] = sub <2 x i32> [[TMP129]], [[TMP119]]
+; CHECK-NEXT: [[TMP131:%.*]] = add <2 x i32> [[TMP125]], [[TMP130]]
+; CHECK-NEXT: [[TMP132:%.*]] = extractelement <2 x i32> [[TMP128]], i32 1
+; CHECK-NEXT: [[TMP133:%.*]] = extractelement <2 x i32> [[TMP131]], i32 1
+; CHECK-NEXT: [[ADD46:%.*]] = add i32 [[TMP133]], [[TMP132]]
+; CHECK-NEXT: [[TMP134:%.*]] = sub <2 x i32> [[TMP128]], [[TMP131]]
+; CHECK-NEXT: [[TMP135:%.*]] = extractelement <2 x i32> [[TMP128]], i32 0
+; CHECK-NEXT: [[TMP136:%.*]] = extractelement <2 x i32> [[TMP131]], i32 0
+; CHECK-NEXT: [[ADD44:%.*]] = add i32 [[TMP136]], [[TMP135]]
+; CHECK-NEXT: [[TMP137:%.*]] = lshr <2 x i32> [[TMP108]], <i32 15, i32 15>
+; CHECK-NEXT: [[TMP138:%.*]] = and <2 x i32> [[TMP137]], <i32 65537, i32 65537>
+; CHECK-NEXT: [[TMP139:%.*]] = mul <2 x i32> [[TMP138]], <i32 65535, i32 65535>
+; CHECK-NEXT: [[TMP140:%.*]] = extractelement <2 x i32> [[TMP134]], i32 0
+; CHECK-NEXT: [[TMP141:%.*]] = extractelement <2 x i32> [[TMP134]], i32 1
+; CHECK-NEXT: [[SUB59:%.*]] = sub i32 [[TMP140]], [[TMP141]]
+; CHECK-NEXT: [[TMP142:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8_1]], align 1
+; CHECK-NEXT: [[TMP143:%.*]] = zext <2 x i8> [[TMP142]] to <2 x i32>
; CHECK-NEXT: [[ADD_PTR644:%.*]] = getelementptr i8, ptr [[PIX2]], i64 [[IDX_EXT63]]
; CHECK-NEXT: [[ARRAYIDX22_1:%.*]] = getelementptr i8, ptr [[ADD_PTR644]], i64 2
-; CHECK-NEXT: [[TMP143:%.*]] = insertelement <2 x ptr> poison, ptr [[ADD_PTR644]], i32 0
-; CHECK-NEXT: [[TMP144:%.*]] = insertelement <2 x ptr> [[TMP143]], ptr [[ARRAYIDX22_1]], i32 1
-; CHECK-NEXT: [[TMP145:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP144]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP146:%.*]] = zext <2 x i8> [[TMP145]] to <2 x i32>
-; CHECK-NEXT: [[TMP147:%.*]] = insertelement <2 x ptr> poison, ptr [[ADD_PTR3]], i32 0
-; CHECK-NEXT: [[TMP148:%.*]] = shufflevector <2 x ptr> [[TMP147]], <2 x ptr> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP149:%.*]] = getelementptr i8, <2 x ptr> [[TMP148]], <2 x i64> <i64 4, i64 6>
-; CHECK-NEXT: [[TMP150:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP149]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP151:%.*]] = zext <2 x i8> [[TMP150]] to <2 x i32>
-; CHECK-NEXT: [[TMP152:%.*]] = shufflevector <2 x ptr> [[TMP144]], <2 x ptr> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP153:%.*]] = getelementptr i8, <2 x ptr> [[TMP152]], <2 x i64> <i64 4, i64 6>
-; CHECK-NEXT: [[TMP154:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP153]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP155:%.*]] = zext <2 x i8> [[TMP154]] to <2 x i32>
-; CHECK-NEXT: [[TMP156:%.*]] = sub <2 x i32> [[TMP151]], [[TMP155]]
-; CHECK-NEXT: [[TMP157:%.*]] = shl <2 x i32> [[TMP156]], <i32 16, i32 16>
-; CHECK-NEXT: [[TMP158:%.*]] = getelementptr i8, <2 x ptr> [[TMP152]], <2 x i64> <i64 1, i64 3>
-; CHECK-NEXT: [[TMP159:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP158]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP160:%.*]] = zext <2 x i8> [[TMP159]] to <2 x i32>
-; CHECK-NEXT: [[TMP161:%.*]] = getelementptr i8, <2 x ptr> [[TMP148]], <2 x i64> <i64 5, i64 7>
-; CHECK-NEXT: [[TMP162:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP161]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP163:%.*]] = zext <2 x i8> [[TMP162]] to <2 x i32>
-; CHECK-NEXT: [[TMP164:%.*]] = getelementptr i8, <2 x ptr> [[TMP152]], <2 x i64> <i64 5, i64 7>
-; CHECK-NEXT: [[TMP165:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP164]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP166:%.*]] = zext <2 x i8> [[TMP165]] to <2 x i32>
-; CHECK-NEXT: [[TMP167:%.*]] = sub <2 x i32> [[TMP163]], [[TMP166]]
-; CHECK-NEXT: [[TMP168:%.*]] = shl <2 x i32> [[TMP167]], <i32 16, i32 16>
-; CHECK-NEXT: [[TMP169:%.*]] = insertelement <2 x i32> [[TMP142]], i32 [[CONV33_1]], i32 1
-; CHECK-NEXT: [[TMP170:%.*]] = sub <2 x i32> [[TMP169]], [[TMP160]]
-; CHECK-NEXT: [[TMP171:%.*]] = add <2 x i32> [[TMP168]], [[TMP170]]
-; CHECK-NEXT: [[TMP172:%.*]] = insertelement <2 x i32> [[TMP142]], i32 [[CONV_1]], i32 0
-; CHECK-NEXT: [[TMP173:%.*]] = sub <2 x i32> [[TMP172]], [[TMP146]]
-; CHECK-NEXT: [[TMP174:%.*]] = add <2 x i32> [[TMP157]], [[TMP173]]
-; CHECK-NEXT: [[TMP175:%.*]] = add <2 x i32> [[TMP171]], [[TMP174]]
-; CHECK-NEXT: [[TMP176:%.*]] = sub <2 x i32> [[TMP174]], [[TMP171]]
-; CHECK-NEXT: [[TMP177:%.*]] = extractelement <2 x i32> [[TMP175]], i32 0
-; CHECK-NEXT: [[TMP178:%.*]] = extractelement <2 x i32> [[TMP175]], i32 1
-; CHECK-NEXT: [[SUB51_1:%.*]] = sub i32 [[TMP177]], [[TMP178]]
-; CHECK-NEXT: [[TMP179:%.*]] = shufflevector <2 x i32> [[TMP176]], <2 x i32> [[TMP133]], <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT: [[TMP180:%.*]] = shufflevector <2 x i32> [[TMP176]], <2 x i32> [[TMP133]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[TMP181:%.*]] = add <2 x i32> [[TMP179]], [[TMP180]]
-; CHECK-NEXT: [[TMP182:%.*]] = extractelement <2 x i32> [[TMP176]], i32 0
-; CHECK-NEXT: [[TMP183:%.*]] = extractelement <2 x i32> [[TMP176]], i32 1
-; CHECK-NEXT: [[SUB59_1:%.*]] = sub i32 [[TMP182]], [[TMP183]]
-; CHECK-NEXT: [[SHR_I54:%.*]] = lshr i32 [[TMP178]], 15
+; CHECK-NEXT: [[TMP144:%.*]] = insertelement <2 x ptr> poison, ptr [[ADD_PTR644]], i32 0
+; CHECK-NEXT: [[TMP145:%.*]] = insertelement <2 x ptr> [[TMP144]], ptr [[ARRAYIDX22_1]], i32 1
+; CHECK-NEXT: [[TMP146:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP145]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT: [[TMP147:%.*]] = zext <2 x i8> [[TMP146]] to <2 x i32>
+; CHECK-NEXT: [[TMP148:%.*]] = insertelement <2 x ptr> poison, ptr [[ADD_PTR3]], i32 0
+; CHECK-NEXT: [[TMP149:%.*]] = shufflevector <2 x ptr> [[TMP148]], <2 x ptr> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP150:%.*]] = getelementptr i8, <2 x ptr> [[TMP149]], <2 x i64> <i64 4, i64 6>
+; CHECK-NEXT: [[TMP151:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP150]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT: [[TMP152:%.*]] = zext <2 x i8> [[TMP151]] to <2 x i32>
+; CHECK-NEXT: [[TMP153:%.*]] = shufflevector <2 x ptr> [[TMP145]], <2 x ptr> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP154:%.*]] = getelementptr i8, <2 x ptr> [[TMP153]], <2 x i64> <i64 4, i64 6>
+; CHECK-NEXT: [[TMP155:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP154]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT: [[TMP156:%.*]] = zext <2 x i8> [[TMP155]] to <2 x i32>
+; CHECK-NEXT: [[TMP157:%.*]] = sub <2 x i32> [[TMP152]], [[TMP156]]
+; CHECK-NEXT: [[TMP158:%.*]] = shl <2 x i32> [[TMP157]], <i32 16, i32 16>
+; CHECK-NEXT: [[TMP159:%.*]] = getelementptr i8, <2 x ptr> [[TMP153]], <2 x i64> <i64 1, i64 3>
+; CHECK-NEXT: [[TMP160:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP159]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT: [[TMP161:%.*]] = zext <2 x i8> [[TMP160]] to <2 x i32>
+; CHECK-NEXT: [[TMP162:%.*]] = getelementptr i8, <2 x ptr> [[TMP149]], <2 x i64> <i64 5, i64 7>
+; CHECK-NEXT: [[TMP163:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP162]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT: [[TMP164:%.*]] = zext <2 x i8> [[TMP163]] to <2 x i32>
+; CHECK-NEXT: [[TMP165:%.*]] = getelementptr i8, <2 x ptr> [[TMP153]], <2 x i64> <i64 5, i64 7>
+; CHECK-NEXT: [[TMP166:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP165]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT: [[TMP167:%.*]] = zext <2 x i8> [[TMP166]] to <2 x i32>
+; CHECK-NEXT: [[TMP168:%.*]] = sub <2 x i32> [[TMP164]], [[TMP167]]
+; CHECK-NEXT: [[TMP169:%.*]] = shl <2 x i32> [[TMP168]], <i32 16, i32 16>
+; CHECK-NEXT: [[TMP170:%.*]] = insertelement <2 x i32> [[TMP143]], i32 [[CONV33_1]], i32 1
+; CHECK-NEXT: [[TMP171:%.*]] = sub <2 x i32> [[TMP170]], [[TMP161]]
+; CHECK-NEXT: [[TMP172:%.*]] = add <2 x i32> [[TMP169]], [[TMP171]]
+; CHECK-NEXT: [[TMP173:%.*]] = insertelement <2 x i32> [[TMP143]], i32 [[CONV_1]], i32 0
+; CHECK-NEXT: [[TMP174:%.*]] = sub <2 x i32> [[TMP173]], [[TMP147]]
+; CHECK-NEXT: [[TMP175:%.*]] = add <2 x i32> [[TMP158]], [[TMP174]]
+; CHECK-NEXT: [[TMP176:%.*]] = add <2 x i32> [[TMP172]], [[TMP175]]
+; CHECK-NEXT: [[TMP177:%.*]] = sub <2 x i32> [[TMP175]], [[TMP172]]
+; CHECK-NEXT: [[TMP178:%.*]] = extractelement <2 x i32> [[TMP176]], i32 0
+; CHECK-NEXT: [[TMP179:%.*]] = extractelement <2 x i32> [[TMP176]], i32 1
+; CHECK-NEXT: [[SUB51_1:%.*]] = sub i32 [[TMP178]], [[TMP179]]
+; CHECK-NEXT: [[TMP180:%.*]] = shufflevector <2 x i32> [[TMP177]], <2 x i32> [[TMP134]], <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT: [[TMP181:%.*]] = shufflevector <2 x i32> [[TMP177]], <2 x i32> [[TMP134]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT: [[TMP182:%.*]] = add <2 x i32> [[TMP180]], [[TMP181]]
+; CHECK-NEXT: [[TMP183:%.*]] = extractelement <2 x i32> [[TMP177]], i32 0
+; CHECK-NEXT: [[TMP184:%.*]] = extractelement <2 x i32> [[TMP177]], i32 1
+; CHECK-NEXT: [[SUB59_1:%.*]] = sub i32 [[TMP183]], [[TMP184]]
+; CHECK-NEXT: [[SHR_I54:%.*]] = lshr i32 [[TMP179]], 15
; CHECK-NEXT: [[AND_I55:%.*]] = and i32 [[SHR_I54]], 65537
; CHECK-NEXT: [[MUL_I56:%.*]] = mul i32 [[AND_I55]], 65535
-; CHECK-NEXT: [[TMP184:%.*]] = lshr <2 x i32> [[TMP142]], <i32 15, i32 15>
-; CHECK-NEXT: [[TMP185:%.*]] = and <2 x i32> [[TMP184]], <i32 65537, i32 65537>
-; CHECK-NEXT: [[TMP186:%.*]] = mul <2 x i32> [[TMP185]], <i32 65535, i32 65535>
-; CHECK-NEXT: [[TMP187:%.*]] = insertelement <2 x i32> poison, i32 [[SUB59_1]], i32 0
-; CHECK-NEXT: [[TMP188:%.*]] = shufflevector <2 x i32> [[TMP187]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP189:%.*]] = extractelement <2 x i32> [[TMP181]], i32 0
-; CHECK-NEXT: [[TMP190:%.*]] = extractelement <2 x i32> [[TMP181]], i32 1
-; CHECK-NEXT: [[ADD78_1:%.*]] = add i32 [[TMP189]], [[TMP190]]
-; CHECK-NEXT: [[TMP191:%.*]] = shufflevector <2 x i32> [[TMP33]], <2 x i32> [[TMP176]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP192:%.*]] = lshr <2 x i32> [[TMP191]], <i32 15, i32 15>
-; CHECK-NEXT: [[TMP193:%.*]] = and <2 x i32> [[TMP192]], <i32 65537, i32 65537>
-; CHECK-NEXT: [[TMP194:%.*]] = mul <2 x i32> [[TMP193]], <i32 65535, i32 65535>
-; CHECK-NEXT: [[TMP195:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_1]], i32 0
-; CHECK-NEXT: [[TMP196:%.*]] = shufflevector <2 x i32> [[TMP195]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP197:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_2]], i32 0
-; CHECK-NEXT: [[TMP198:%.*]] = shufflevector <2 x i32> [[TMP197]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP199:%.*]] = insertelement <2 x i32> poison, i32 [[ADD44]], i32 0
-; CHECK-NEXT: [[TMP200:%.*]] = shufflevector <2 x i32> [[TMP199]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP201:%.*]] = insertelement <2 x i32> <i32 15, i32 poison>, i32 [[ADD46]], i32 1
-; CHECK-NEXT: [[TMP202:%.*]] = lshr <2 x i32> [[TMP200]], [[TMP201]]
-; CHECK-NEXT: [[TMP203:%.*]] = sub <2 x i32> [[TMP200]], [[TMP201]]
-; CHECK-NEXT: [[TMP204:%.*]] = shufflevector <2 x i32> [[TMP202]], <2 x i32> [[TMP203]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP205:%.*]] = extractelement <2 x i32> [[TMP204]], i32 1
-; CHECK-NEXT: [[ADD78_2:%.*]] = add i32 [[SUB51_1]], [[TMP205]]
-; CHECK-NEXT: [[TMP206:%.*]] = insertelement <2 x i32> <i32 65537, i32 poison>, i32 [[SUB51_1]], i32 1
-; CHECK-NEXT: [[TMP207:%.*]] = and <2 x i32> [[TMP204]], [[TMP206]]
-; CHECK-NEXT: [[TMP208:%.*]] = sub <2 x i32> [[TMP204]], [[TMP206]]
-; CHECK-NEXT: [[TMP209:%.*]] = shufflevector <2 x i32> [[TMP207]], <2 x i32> [[TMP208]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP210:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_2]], i32 0
-; CHECK-NEXT: [[TMP211:%.*]] = shufflevector <2 x i32> [[TMP210]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP212:%.*]] = add <2 x i32> [[TMP211]], [[TMP198]]
-; CHECK-NEXT: [[TMP213:%.*]] = sub <2 x i32> [[TMP211]], [[TMP198]]
-; CHECK-NEXT: [[TMP214:%.*]] = shufflevector <2 x i32> [[TMP212]], <2 x i32> [[TMP213]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP215:%.*]] = insertelement <2 x i32> [[TMP133]], i32 [[CONV_1]], i32 0
-; CHECK-NEXT: [[TMP216:%.*]] = lshr <2 x i32> [[TMP215]], <i32 15, i32 15>
-; CHECK-NEXT: [[TMP217:%.*]] = and <2 x i32> [[TMP216]], <i32 65537, i32 65537>
-; CHECK-NEXT: [[TMP218:%.*]] = mul <2 x i32> [[TMP217]], <i32 65535, i32 65535>
-; CHECK-NEXT: [[TMP219:%.*]] = shufflevector <2 x i32> [[TMP87]], <2 x i32> poison, <2 x i32> <i32 1, i32 poison>
-; CHECK-NEXT: [[TMP220:%.*]] = shufflevector <2 x i32> [[TMP219]], <2 x i32> [[TMP181]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP221:%.*]] = shufflevector <2 x i32> [[TMP87]], <2 x i32> [[TMP181]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[TMP222:%.*]] = sub <2 x i32> [[TMP220]], [[TMP221]]
-; CHECK-NEXT: [[TMP223:%.*]] = shufflevector <2 x i32> [[TMP47]], <2 x i32> poison, <2 x i32> <i32 1, i32 poison>
-; CHECK-NEXT: [[TMP224:%.*]] = insertelement <2 x i32> [[TMP223]], i32 [[ADD46]], i32 1
-; CHECK-NEXT: [[TMP225:%.*]] = insertelement <2 x i32> [[TMP47]], i32 [[ADD44]], i32 1
-; CHECK-NEXT: [[TMP226:%.*]] = add <2 x i32> [[TMP224]], [[TMP225]]
-; CHECK-NEXT: [[TMP227:%.*]] = shufflevector <2 x i32> [[TMP79]], <2 x i32> [[TMP175]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP228:%.*]] = shufflevector <2 x i32> [[TMP79]], <2 x i32> [[TMP175]], <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT: [[TMP229:%.*]] = add <2 x i32> [[TMP227]], [[TMP228]]
-; CHECK-NEXT: [[TMP230:%.*]] = extractelement <2 x i32> [[TMP226]], i32 0
-; CHECK-NEXT: [[TMP231:%.*]] = extractelement <2 x i32> [[TMP229]], i32 0
-; CHECK-NEXT: [[ADD94_1:%.*]] = add i32 [[TMP231]], [[TMP230]]
-; CHECK-NEXT: [[TMP232:%.*]] = insertelement <2 x i32> [[TMP14]], i32 [[ADD46]], i32 1
-; CHECK-NEXT: [[TMP233:%.*]] = lshr <2 x i32> [[TMP232]], <i32 15, i32 15>
-; CHECK-NEXT: [[TMP234:%.*]] = and <2 x i32> [[TMP233]], <i32 65537, i32 65537>
-; CHECK-NEXT: [[TMP235:%.*]] = mul <2 x i32> [[TMP234]], <i32 65535, i32 65535>
-; CHECK-NEXT: [[TMP236:%.*]] = extractelement <2 x i32> [[TMP226]], i32 1
-; CHECK-NEXT: [[TMP237:%.*]] = extractelement <2 x i32> [[TMP229]], i32 1
-; CHECK-NEXT: [[ADD78:%.*]] = add i32 [[TMP237]], [[TMP236]]
-; CHECK-NEXT: [[TMP238:%.*]] = sub <2 x i32> [[TMP226]], [[TMP229]]
+; CHECK-NEXT: [[TMP185:%.*]] = lshr <2 x i32> [[TMP143]], <i32 15, i32 15>
+; CHECK-NEXT: [[TMP186:%.*]] = and <2 x i32> [[TMP185]], <i32 65537, i32 65537>
+; CHECK-NEXT: [[TMP187:%.*]] = mul <2 x i32> [[TMP186]], <i32 65535, i32 65535>
+; CHECK-NEXT: [[TMP188:%.*]] = insertelement <2 x i32> poison, i32 [[SUB59_1]], i32 0
+; CHECK-NEXT: [[TMP189:%.*]] = shufflevector <2 x i32> [[TMP188]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP190:%.*]] = extractelement <2 x i32> [[TMP182]], i32 0
+; CHECK-NEXT: [[TMP191:%.*]] = extractelement <2 x i32> [[TMP182]], i32 1
+; CHECK-NEXT: [[ADD78_1:%.*]] = add i32 [[TMP190]], [[TMP191]]
+; CHECK-NEXT: [[TMP192:%.*]] = shufflevector <2 x i32> [[TMP33]], <2 x i32> [[TMP177]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP193:%.*]] = lshr <2 x i32> [[TMP192]], <i32 15, i32 15>
+; CHECK-NEXT: [[TMP194:%.*]] = and <2 x i32> [[TMP193]], <i32 65537, i32 65537>
+; CHECK-NEXT: [[TMP195:%.*]] = mul <2 x i32> [[TMP194]], <i32 65535, i32 65535>
+; CHECK-NEXT: [[TMP196:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_1]], i32 0
+; CHECK-NEXT: [[TMP197:%.*]] = shufflevector <2 x i32> [[TMP196]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP198:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_2]], i32 0
+; CHECK-NEXT: [[TMP199:%.*]] = shufflevector <2 x i32> [[TMP198]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP200:%.*]] = insertelement <2 x i32> poison, i32 [[ADD44]], i32 0
+; CHECK-NEXT: [[TMP201:%.*]] = shufflevector <2 x i32> [[TMP200]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP202:%.*]] = insertelement <2 x i32> <i32 15, i32 poison>, i32 [[ADD46]], i32 1
+; CHECK-NEXT: [[TMP203:%.*]] = lshr <2 x i32> [[TMP201]], [[TMP202]]
+; CHECK-NEXT: [[TMP204:%.*]] = sub <2 x i32> [[TMP201]], [[TMP202]]
+; CHECK-NEXT: [[TMP205:%.*]] = shufflevector <2 x i32> [[TMP203]], <2 x i32> [[TMP204]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP206:%.*]] = extractelement <2 x i32> [[TMP205]], i32 1
+; CHECK-NEXT: [[ADD78_2:%.*]] = add i32 [[SUB51_1]], [[TMP206]]
+; CHECK-NEXT: [[TMP207:%.*]] = insertelement <2 x i32> <i32 65537, i32 poison>, i32 [[SUB51_1]], i32 1
+; CHECK-NEXT: [[TMP208:%.*]] = and <2 x i32> [[TMP205]], [[TMP207]]
+; CHECK-NEXT: [[TMP209:%.*]] = sub <2 x i32> [[TMP205]], [[TMP207]]
+; CHECK-NEXT: [[TMP210:%.*]] = shufflevector <2 x i32> [[TMP208]], <2 x i32> [[TMP209]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP211:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_2]], i32 0
+; CHECK-NEXT: [[TMP212:%.*]] = shufflevector <2 x i32> [[TMP211]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP213:%.*]] = add <2 x i32> [[TMP212]], [[TMP199]]
+; CHECK-NEXT: [[TMP214:%.*]] = sub <2 x i32> [[TMP212]], [[TMP199]]
+; CHECK-NEXT: [[TMP215:%.*]] = shufflevector <2 x i32> [[TMP213]], <2 x i32> [[TMP214]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP216:%.*]] = insertelement <2 x i32> [[TMP134]], i32 [[CONV_1]], i32 0
+; CHECK-NEXT: [[TMP217:%.*]] = lshr <2 x i32> [[TMP216]], <i32 15, i32 15>
+; CHECK-NEXT: [[TMP218:%.*]] = and <2 x i32> [[TMP217]], <i32 65537, i32 65537>
+; CHECK-NEXT: [[TMP219:%.*]] = mul <2 x i32> [[TMP218]], <i32 65535, i32 65535>
+; CHECK-NEXT: [[TMP220:%.*]] = shufflevector <2 x i32> [[TMP88]], <2 x i32> poison, <2 x i32> <i32 1, i32 poison>
+; CHECK-NEXT: [[TMP221:%.*]] = shufflevector <2 x i32> [[TMP220]], <2 x i32> [[TMP182]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP222:%.*]] = shufflevector <2 x i32> [[TMP88]], <2 x i32> [[TMP182]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT: [[TMP223:%.*]] = sub <2 x i32> [[TMP221]], [[TMP222]]
+; CHECK-NEXT: [[TMP224:%.*]] = shufflevector <2 x i32> [[TMP47]], <2 x i32> poison, <2 x i32> <i32 1, i32 poison>
+; CHECK-NEXT: [[TMP225:%.*]] = insertelement <2 x i32> [[TMP224]], i32 [[ADD46]], i32 1
+; CHECK-NEXT: [[TMP226:%.*]] = insertelement <2 x i32> [[TMP47]], i32 [[ADD44]], i32 1
+; CHECK-NEXT: [[TMP227:%.*]] = add <2 x i32> [[TMP225]], [[TMP226]]
+; CHECK-NEXT: [[TMP228:%.*]] = shufflevector <2 x i32> [[TMP80]], <2 x i32> [[TMP176]], <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT: [[TMP229:%.*]] = shufflevector <2 x i32> [[TMP80]], <2 x i32> [[TMP176]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT: [[TMP230:%.*]] = add <2 x i32> [[TMP228]], [[TMP229]]
+; CHECK-NEXT: [[TMP231:%.*]] = extractelement <2 x i32> [[TMP227]], i32 0
+; CHECK-NEXT: [[TMP232:%.*]] = extractelement <2 x i32> [[TMP230]], i32 0
+; CHECK-NEXT: [[ADD94_1:%.*]] = add i32 [[TMP232]], [[TMP231]]
+; CHECK-NEXT: [[TMP233:%.*]] = insertelement <2 x i32> [[TMP14]], i32 [[ADD46]], i32 1
+; CHECK-NEXT: [[TMP234:%.*]] = lshr <2 x i32> [[TMP233]], <i32 15, i32 15>
+; CHECK-NEXT: [[TMP235:%.*]] = and <2 x i32> [[TMP234]], <i32 65537, i32 65537>
+; CHECK-NEXT: [[TMP236:%.*]] = mul <2 x i32> [[TMP235]], <i32 65535, i32 65535>
+; CHECK-NEXT: [[TMP237:%.*]] = extractelement <2 x i32> [[TMP227]], i32 1
+; CHECK-NEXT: [[TMP238:%.*]] = extractelement <2 x i32> [[TMP230]], i32 1
+; CHECK-NEXT: [[ADD78:%.*]] = add i32 [[TMP238]], [[TMP237]]
+; CHECK-NEXT: [[TMP239:%.*]] = sub <2 x i32> [[TMP227]], [[TMP230]]
; CHECK-NEXT: [[ADD103:%.*]] = add i32 [[ADD94]], [[ADD78]]
; CHECK-NEXT: [[SUB104:%.*]] = sub i32 [[ADD78]], [[ADD94]]
-; CHECK-NEXT: [[TMP239:%.*]] = extractelement <2 x i32> [[TMP238]], i32 1
-; CHECK-NEXT: [[ADD105:%.*]] = add i32 [[SUB102]], [[TMP239]]
+; CHECK-NEXT: [[TMP240:%.*]] = extractelement <2 x i32> [[TMP239]], i32 1
+; CHECK-NEXT: [[ADD105:%.*]] = add i32 [[SUB102]], [[TMP240]]
; CHECK-NEXT: [[ADD_I:%.*]] = add i32 [[MUL_I]], [[ADD103]]
-; CHECK-NEXT: [[XOR_I:%.*]] = xor i32 [[ADD_I]], [[TMP92]]
+; CHECK-NEXT: [[XOR_I:%.*]] = xor i32 [[ADD_I]], [[TMP93]]
; CHECK-NEXT: [[ADD_I52:%.*]] = add i32 [[MUL_I51]], [[ADD105]]
-; CHECK-NEXT: [[XOR_I53:%.*]] = xor i32 [[ADD_I52]], [[TMP93]]
+; CHECK-NEXT: [[XOR_I53:%.*]] = xor i32 [[ADD_I52]], [[TMP94]]
; CHECK-NEXT: [[ADD_I57:%.*]] = add i32 [[MUL_I56]], [[SUB104]]
-; CHECK-NEXT: [[XOR_I58:%.*]] = xor i32 [[ADD_I57]], [[TMP178]]
+; CHECK-NEXT: [[XOR_I58:%.*]] = xor i32 [[ADD_I57]], [[TMP179]]
; CHECK-NEXT: [[ADD110:%.*]] = add i32 [[XOR_I53]], [[XOR_I]]
; CHECK-NEXT: [[ADD112:%.*]] = add i32 [[ADD110]], [[XOR_I58]]
-; CHECK-NEXT: [[TMP240:%.*]] = shufflevector <2 x i32> [[TMP222]], <2 x i32> poison, <2 x i32> <i32 1, i32 poison>
-; CHECK-NEXT: [[TMP241:%.*]] = insertelement <2 x i32> [[TMP240]], i32 [[SUB102]], i32 1
-; CHECK-NEXT: [[TMP242:%.*]] = add <2 x i32> [[TMP238]], [[TMP241]]
-; CHECK-NEXT: [[TMP243:%.*]] = sub <2 x i32> [[TMP238]], [[TMP241]]
-; CHECK-NEXT: [[TMP244:%.*]] = shufflevector <2 x i32> [[TMP242]], <2 x i32> [[TMP243]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP245:%.*]] = add <2 x i32> [[TMP235]], [[TMP244]]
-; CHECK-NEXT: [[TMP246:%.*]] = xor <2 x i32> [[TMP245]], [[TMP232]]
-; CHECK-NEXT: [[TMP247:%.*]] = extractelement <2 x i32> [[TMP246]], i32 1
-; CHECK-NEXT: [[ADD113:%.*]] = add i32 [[ADD112]], [[TMP247]]
-; CHECK-NEXT: [[TMP248:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_1]], i32 0
-; CHECK-NEXT: [[TMP249:%.*]] = shufflevector <2 x i32> [[TMP248]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP250:%.*]] = add <2 x i32> [[TMP196]], [[TMP249]]
-; CHECK-NEXT: [[TMP251:%.*]] = sub <2 x i32> [[TMP196]], [[TMP249]]
-; CHECK-NEXT: [[TMP252:%.*]] = shufflevector <2 x i32> [[TMP250]], <2 x i32> [[TMP251]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP253:%.*]] = add <2 x i32> [[TMP194]], [[TMP252]]
-; CHECK-NEXT: [[TMP254:%.*]] = xor <2 x i32> [[TMP253]], [[TMP191]]
-; CHECK-NEXT: [[TMP255:%.*]] = extractelement <2 x i32> [[TMP246]], i32 0
-; CHECK-NEXT: [[ADD108_1:%.*]] = add i32 [[TMP255]], [[ADD113]]
-; CHECK-NEXT: [[TMP256:%.*]] = extractelement <2 x i32> [[TMP254]], i32 0
-; CHECK-NEXT: [[ADD110_1:%.*]] = add i32 [[ADD108_1]], [[TMP256]]
-; CHECK-NEXT: [[TMP257:%.*]] = extractelement <2 x i32> [[TMP254]], i32 1
-; CHECK-NEXT: [[ADD112_1:%.*]] = add i32 [[ADD110_1]], [[TMP257]]
-; CHECK-NEXT: [[TMP258:%.*]] = shufflevector <2 x i32> [[TMP209]], <2 x i32> poison, <2 x i32> <i32 1, i32 poison>
-; CHECK-NEXT: [[TMP259:%.*]] = shufflevector <2 x i32> [[TMP258]], <2 x i32> [[TMP238]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[TMP260:%.*]] = add <2 x i32> [[TMP222]], [[TMP259]]
-; CHECK-NEXT: [[TMP261:%.*]] = sub <2 x i32> [[TMP222]], [[TMP259]]
-; CHECK-NEXT: [[TMP262:%.*]] = shufflevector <2 x i32> [[TMP260]], <2 x i32> [[TMP261]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP263:%.*]] = add <2 x i32> [[TMP218]], [[TMP262]]
-; CHECK-NEXT: [[TMP264:%.*]] = xor <2 x i32> [[TMP263]], [[TMP215]]
-; CHECK-NEXT: [[TMP265:%.*]] = extractelement <2 x i32> [[TMP264]], i32 1
-; CHECK-NEXT: [[ADD113_1:%.*]] = add i32 [[ADD112_1]], [[TMP265]]
-; CHECK-NEXT: [[TMP266:%.*]] = shufflevector <2 x i32> <i32 65535, i32 poison>, <2 x i32> [[TMP222]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[TMP267:%.*]] = mul <2 x i32> [[TMP209]], [[TMP266]]
-; CHECK-NEXT: [[TMP268:%.*]] = sub <2 x i32> [[TMP209]], [[TMP266]]
-; CHECK-NEXT: [[TMP269:%.*]] = shufflevector <2 x i32> [[TMP267]], <2 x i32> [[TMP268]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP270:%.*]] = add <2 x i32> [[TMP186]], [[TMP214]]
-; CHECK-NEXT: [[TMP271:%.*]] = xor <2 x i32> [[TMP270]], [[TMP142]]
-; CHECK-NEXT: [[TMP272:%.*]] = extractelement <2 x i32> [[TMP269]], i32 0
-; CHECK-NEXT: [[TMP273:%.*]] = extractelement <2 x i32> [[TMP269]], i32 1
-; CHECK-NEXT: [[ADD_I62_2:%.*]] = add i32 [[TMP272]], [[TMP273]]
+; CHECK-NEXT: [[TMP241:%.*]] = shufflevector <2 x i32> [[TMP223]], <2 x i32> poison, <2 x i32> <i32 1, i32 poison>
+; CHECK-NEXT: [[TMP242:%.*]] = insertelement <2 x i32> [[TMP241]], i32 [[SUB102]], i32 1
+; CHECK-NEXT: [[TMP243:%.*]] = add <2 x i32> [[TMP239]], [[TMP242]]
+; CHECK-NEXT: [[TMP244:%.*]] = sub <2 x i32> [[TMP239]], [[TMP242]]
+; CHECK-NEXT: [[TMP245:%.*]] = shufflevector <2 x i32> [[TMP243]], <2 x i32> [[TMP244]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP246:%.*]] = add <2 x i32> [[TMP236]], [[TMP245]]
+; CHECK-NEXT: [[TMP247:%.*]] = xor <2 x i32> [[TMP246]], [[TMP233]]
+; CHECK-NEXT: [[TMP248:%.*]] = extractelement <2 x i32> [[TMP247]], i32 1
+; CHECK-NEXT: [[ADD113:%.*]] = add i32 [[ADD112]], [[TMP248]]
+; CHECK-NEXT: [[TMP249:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_1]], i32 0
+; CHECK-NEXT: [[TMP250:%.*]] = shufflevector <2 x i32> [[TMP249]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP251:%.*]] = add <2 x i32> [[TMP197]], [[TMP250]]
+; CHECK-NEXT: [[TMP252:%.*]] = sub <2 x i32> [[TMP197]], [[TMP250]]
+; CHECK-NEXT: [[TMP253:%.*]] = shufflevector <2 x i32> [[TMP251]], <2 x i32> [[TMP252]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP254:%.*]] = add <2 x i32> [[TMP195]], [[TMP253]]
+; CHECK-NEXT: [[TMP255:%.*]] = xor <2 x i32> [[TMP254]], [[TMP192]]
+; CHECK-NEXT: [[TMP256:%.*]] = extractelement <2 x i32> [[TMP247]], i32 0
+; CHECK-NEXT: [[ADD108_1:%.*]] = add i32 [[TMP256]], [[ADD113]]
+; CHECK-NEXT: [[TMP257:%.*]] = extractelement <2 x i32> [[TMP255]], i32 0
+; CHECK-NEXT: [[ADD110_1:%.*]] = add i32 [[ADD108_1]], [[TMP257]]
+; CHECK-NEXT: [[TMP258:%.*]] = extractelement <2 x i32> [[TMP255]], i32 1
+; CHECK-NEXT: [[ADD112_1:%.*]] = add i32 [[ADD110_1]], [[TMP258]]
+; CHECK-NEXT: [[TMP259:%.*]] = shufflevector <2 x i32> [[TMP210]], <2 x i32> poison, <2 x i32> <i32 1, i32 poison>
+; CHECK-NEXT: [[TMP260:%.*]] = shufflevector <2 x i32> [[TMP259]], <2 x i32> [[TMP239]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT: [[TMP261:%.*]] = add <2 x i32> [[TMP223]], [[TMP260]]
+; CHECK-NEXT: [[TMP262:%.*]] = sub <2 x i32> [[TMP223]], [[TMP260]]
+; CHECK-NEXT: [[TMP263:%.*]] = shufflevector <2 x i32> [[TMP261]], <2 x i32> [[TMP262]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP264:%.*]] = add <2 x i32> [[TMP219]], [[TMP263]]
+; CHECK-NEXT: [[TMP265:%.*]] = xor <2 x i32> [[TMP264]], [[TMP216]]
+; CHECK-NEXT: [[TMP266:%.*]] = extractelement <2 x i32> [[TMP265]], i32 1
+; CHECK-NEXT: [[ADD113_1:%.*]] = add i32 [[ADD112_1]], [[TMP266]]
+; CHECK-NEXT: [[TMP267:%.*]] = shufflevector <2 x i32> <i32 65535, i32 poison>, <2 x i32> [[TMP223]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT: [[TMP268:%.*]] = mul <2 x i32> [[TMP210]], [[TMP267]]
+; CHECK-NEXT: [[TMP269:%.*]] = sub <2 x i32> [[TMP210]], [[TMP267]]
+; CHECK-NEXT: [[TMP270:%.*]] = shufflevector <2 x i32> [[TMP268]], <2 x i32> [[TMP269]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP271:%.*]] = add <2 x i32> [[TMP187]], [[TMP215]]
+; CHECK-NEXT: [[TMP272:%.*]] = xor <2 x i32> [[TMP271]], [[TMP143]]
+; CHECK-NEXT: [[TMP273:%.*]] = extractelement <2 x i32> [[TMP270]], i32 0
+; CHECK-NEXT: [[TMP274:%.*]] = extractelement <2 x i32> [[TMP270]], i32 1
+; CHECK-NEXT: [[ADD_I62_2:%.*]] = add i32 [[TMP273]], [[TMP274]]
; CHECK-NEXT: [[XOR_I63_2:%.*]] = xor i32 [[ADD_I62_2]], [[ADD44]]
-; CHECK-NEXT: [[TMP274:%.*]] = extractelement <2 x i32> [[TMP264]], i32 0
-; CHECK-NEXT: [[ADD108_2:%.*]] = add i32 [[TMP274]], [[ADD113_1]]
-; CHECK-NEXT: [[TMP275:%.*]] = extractelement <2 x i32> [[TMP271]], i32 0
-; CHECK-NEXT: [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[TMP275]]
-; CHECK-NEXT: [[TMP276:%.*]] = extractelement <2 x i32> [[TMP271]], i32 1
-; CHECK-NEXT: [[ADD112_2:%.*]] = add i32 [[ADD110_2]], [[TMP276]]
+; CHECK-NEXT: [[TMP275:%.*]] = extractelement <2 x i32> [[TMP265]], i32 0
+; CHECK-NEXT: [[ADD108_2:%.*]] = add i32 [[TMP275]], [[ADD113_1]]
+; CHECK-NEXT: [[TMP276:%.*]] = extractelement <2 x i32> [[TMP272]], i32 0
+; CHECK-NEXT: [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[TMP276]]
+; CHECK-NEXT: [[TMP277:%.*]] = extractelement <2 x i32> [[TMP272]], i32 1
+; CHECK-NEXT: [[ADD112_2:%.*]] = add i32 [[ADD110_2]], [[TMP277]]
; CHECK-NEXT: [[ADD113_2:%.*]] = add i32 [[ADD112_2]], [[XOR_I63_2]]
-; CHECK-NEXT: [[TMP277:%.*]] = insertelement <2 x i32> poison, i32 [[SUB59]], i32 0
-; CHECK-NEXT: [[TMP278:%.*]] = shufflevector <2 x i32> [[TMP277]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP279:%.*]] = add <2 x i32> [[TMP278]], [[TMP188]]
-; CHECK-NEXT: [[TMP280:%.*]] = sub <2 x i32> [[TMP278]], [[TMP188]]
-; CHECK-NEXT: [[TMP281:%.*]] = shufflevector <2 x i32> [[TMP279]], <2 x i32> [[TMP280]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP282:%.*]] = add <2 x i32> [[TMP104]], [[TMP281]]
-; CHECK-NEXT: [[TMP283:%.*]] = sub <2 x i32> [[TMP281]], [[TMP104]]
-; CHECK-NEXT: [[TMP284:%.*]] = add <2 x i32> [[TMP138]], [[TMP282]]
-; CHECK-NEXT: [[TMP285:%.*]] = xor <2 x i32> [[TMP284]], [[TMP107]]
-; CHECK-NEXT: [[TMP286:%.*]] = lshr <2 x i32> [[TMP97]], <i32 15, i32 15>
-; CHECK-NEXT: [[TMP287:%.*]] = and <2 x i32> [[TMP286]], <i32 65537, i32 65537>
-; CHECK-NEXT: [[TMP288:%.*]] = mul <2 x i32> [[TMP287]], <i32 65535, i32 65535>
-; CHECK-NEXT: [[TMP289:%.*]] = add <2 x i32> [[TMP288]], [[TMP283]]
-; CHECK-NEXT: [[TMP290:%.*]] = xor <2 x i32> [[TMP289]], [[TMP97]]
-; CHECK-NEXT: [[TMP291:%.*]] = extractelement <2 x i32> [[TMP285]], i32 1
-; CHECK-NEXT: [[ADD108_3:%.*]] = add i32 [[TMP291]], [[ADD113_2]]
-; CHECK-NEXT: [[TMP292:%.*]] = extractelement <2 x i32> [[TMP285]], i32 0
-; CHECK-NEXT: [[ADD110_3:%.*]] = add i32 [[ADD108_3]], [[TMP292]]
-; CHECK-NEXT: [[TMP293:%.*]] = extractelement <2 x i32> [[TMP290]], i32 0
-; CHECK-NEXT: [[ADD112_3:%.*]] = add i32 [[ADD110_3]], [[TMP293]]
-; CHECK-NEXT: [[TMP294:%.*]] = extractelement <2 x i32> [[TMP290]], i32 1
-; CHECK-NEXT: [[ADD113_3:%.*]] = add i32 [[ADD112_3]], [[TMP294]]
+; CHECK-NEXT: [[TMP278:%.*]] = insertelement <2 x i32> poison, i32 [[SUB59]], i32 0
+; CHECK-NEXT: [[TMP279:%.*]] = shufflevector <2 x i32> [[TMP278]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP280:%.*]] = add <2 x i32> [[TMP279]], [[TMP189]]
+; CHECK-NEXT: [[TMP281:%.*]] = sub <2 x i32> [[TMP279]], [[TMP189]]
+; CHECK-NEXT: [[TMP282:%.*]] = shufflevector <2 x i32> [[TMP280]], <2 x i32> [[TMP281]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP283:%.*]] = add <2 x i32> [[TMP105]], [[TMP282]]
+; CHECK-NEXT: [[TMP284:%.*]] = sub <2 x i32> [[TMP282]], [[TMP105]]
+; CHECK-NEXT: [[TMP285:%.*]] = add <2 x i32> [[TMP139]], [[TMP283]]
+; CHECK-NEXT: [[TMP286:%.*]] = xor <2 x i32> [[TMP285]], [[TMP108]]
+; CHECK-NEXT: [[TMP287:%.*]] = lshr <2 x i32> [[TMP98]], <i32 15, i32 15>
+; CHECK-NEXT: [[TMP288:%.*]] = and <2 x i32> [[TMP287]], <i32 65537, i32 65537>
+; CHECK-NEXT: [[TMP289:%.*]] = mul <2 x i32> [[TMP288]], <i32 65535, i32 65535>
+; CHECK-NEXT: [[TMP290:%.*]] = add <2 x i32> [[TMP289]], [[TMP284]]
+; CHECK-NEXT: [[TMP291:%.*]] = xor <2 x i32> [[TMP290]], [[TMP98]]
+; CHECK-NEXT: [[TMP292:%.*]] = extractelement <2 x i32> [[TMP286]], i32 1
+; CHECK-NEXT: [[ADD108_3:%.*]] = add i32 [[TMP292]], [[ADD113_2]]
+; CHECK-NEXT: [[TMP293:%.*]] = extractelement <2 x i32> [[TMP286]], i32 0
+; CHECK-NEXT: [[ADD110_3:%.*]] = add i32 [[ADD108_3]], [[TMP293]]
+; CHECK-NEXT: [[TMP294:%.*]] = extractelement <2 x i32> [[TMP291]], i32 0
+; CHECK-NEXT: [[ADD112_3:%.*]] = add i32 [[ADD110_3]], [[TMP294]]
+; CHECK-NEXT: [[TMP295:%.*]] = extractelement <2 x i32> [[TMP291]], i32 1
+; CHECK-NEXT: [[ADD113_3:%.*]] = add i32 [[ADD112_3]], [[TMP295]]
; CHECK-NEXT: ret i32 [[ADD113_3]]
;
entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-vectorized.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-vectorized.ll
index 4b0b41970bbb4d..a4cc311d12a217 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-vectorized.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-vectorized.ll
@@ -5,12 +5,61 @@ define void @test(ptr %p, ptr noalias %s) {
; CHECK-LABEL: @test(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [48 x float], ptr [[P:%.*]], i64 0, i64 0
+; CHECK-NEXT: [[I:%.*]] = load float, ptr [[ARRAYIDX]], align 4
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 30
+; CHECK-NEXT: [[I1:%.*]] = load float, ptr [[ARRAYIDX1]], align 4
+; CHECK-NEXT: [[ADD:%.*]] = fsub fast float [[I1]], [[I]]
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[S:%.*]], i64 0
-; CHECK-NEXT: [[TMP0:%.*]] = call <8 x float> @llvm.experimental.vp.strided.load.v8f32.p0.i64(ptr align 4 [[ARRAYIDX]], i64 16, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
-; CHECK-NEXT: [[TMP1:%.*]] = call <8 x float> @llvm.experimental.vp.strided.load.v8f32.p0.i64(ptr align 4 [[ARRAYIDX1]], i64 -16, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
-; CHECK-NEXT: [[TMP2:%.*]] = fsub fast <8 x float> [[TMP1]], [[TMP0]]
-; CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT: store float [[ADD]], ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 4
+; CHECK-NEXT: [[I2:%.*]] = load float, ptr [[ARRAYIDX4]], align 4
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 26
+; CHECK-NEXT: [[I3:%.*]] = load float, ptr [[ARRAYIDX6]], align 4
+; CHECK-NEXT: [[ADD7:%.*]] = fsub fast float [[I3]], [[I2]]
+; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, ptr [[S]], i64 1
+; CHECK-NEXT: store float [[ADD7]], ptr [[ARRAYIDX9]], align 4
+; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 8
+; CHECK-NEXT: [[I4:%.*]] = load float, ptr [[ARRAYIDX11]], align 4
+; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 22
+; CHECK-NEXT: [[I5:%.*]] = load float, ptr [[ARRAYIDX13]], align 4
+; CHECK-NEXT: [[ADD14:%.*]] = fsub fast float [[I5]], [[I4]]
+; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds float, ptr [[S]], i64 2
+; CHECK-NEXT: store float [[ADD14]], ptr [[ARRAYIDX16]], align 4
+; CHECK-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 12
+; CHECK-NEXT: [[I6:%.*]] = load float, ptr [[ARRAYIDX18]], align 4
+; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 18
+; CHECK-NEXT: [[I7:%.*]] = load float, ptr [[ARRAYIDX20]], align 4
+; CHECK-NEXT: [[ADD21:%.*]] = fsub fast float [[I7]], [[I6]]
+; CHECK-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds float, ptr [[S]], i64 3
+; CHECK-NEXT: store float [[ADD21]], ptr [[ARRAYIDX23]], align 4
+; CHECK-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 16
+; CHECK-NEXT: [[I8:%.*]] = load float, ptr [[ARRAYIDX25]], align 4
+; CHECK-NEXT: [[ARRAYIDX27:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 14
+; CHECK-NEXT: [[I9:%.*]] = load float, ptr [[ARRAYIDX27]], align 4
+; CHECK-NEXT: [[ADD28:%.*]] = fsub fast float [[I9]], [[I8]]
+; CHECK-NEXT: [[ARRAYIDX30:%.*]] = getelementptr inbounds float, ptr [[S]], i64 4
+; CHECK-NEXT: store float [[ADD28]], ptr [[ARRAYIDX30]], align 4
+; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 20
+; CHECK-NEXT: [[I10:%.*]] = load float, ptr [[ARRAYIDX32]], align 4
+; CHECK-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 10
+; CHECK-NEXT: [[I11:%.*]] = load float, ptr [[ARRAYIDX34]], align 4
+; CHECK-NEXT: [[ADD35:%.*]] = fsub fast float [[I11]], [[I10]]
+; CHECK-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds float, ptr [[S]], i64 5
+; CHECK-NEXT: store float [[ADD35]], ptr [[ARRAYIDX37]], align 4
+; CHECK-NEXT: [[ARRAYIDX39:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 24
+; CHECK-NEXT: [[I12:%.*]] = load float, ptr [[ARRAYIDX39]], align 4
+; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 6
+; CHECK-NEXT: [[I13:%.*]] = load float, ptr [[ARRAYIDX41]], align 4
+; CHECK-NEXT: [[ADD42:%.*]] = fsub fast float [[I13]], [[I12]]
+; CHECK-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds float, ptr [[S]], i64 6
+; CHECK-NEXT: store float [[ADD42]], ptr [[ARRAYIDX44]], align 4
+; CHECK-NEXT: [[ARRAYIDX46:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 28
+; CHECK-NEXT: [[I14:%.*]] = load float, ptr [[ARRAYIDX46]], align 4
+; CHECK-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 2
+; CHECK-NEXT: [[I15:%.*]] = load float, ptr [[ARRAYIDX48]], align 4
+; CHECK-NEXT: [[ADD49:%.*]] = fsub fast float [[I15]], [[I14]]
+; CHECK-NEXT: [[ARRAYIDX51:%.*]] = getelementptr inbounds float, ptr [[S]], i64 7
+; CHECK-NEXT: store float [[ADD49]], ptr [[ARRAYIDX51]], align 4
; CHECK-NEXT: ret void
;
entry:
@@ -213,40 +262,67 @@ define void @test2(ptr %p, ptr noalias %s, i32 %stride) {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[STR:%.*]] = zext i32 [[STRIDE:%.*]] to i64
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [48 x float], ptr [[P:%.*]], i64 0, i64 2
+; CHECK-NEXT: [[I:%.*]] = load float, ptr [[ARRAYIDX]], align 4
; CHECK-NEXT: [[ST6:%.*]] = mul i64 [[STR]], 7
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[ST6]]
; CHECK-NEXT: [[I1:%.*]] = load float, ptr [[ARRAYIDX1]], align 4
+; CHECK-NEXT: [[ADD:%.*]] = fsub fast float [[I1]], [[I]]
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[S:%.*]], i64 0
+; CHECK-NEXT: store float [[ADD]], ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 6
+; CHECK-NEXT: [[I2:%.*]] = load float, ptr [[ARRAYIDX4]], align 4
; CHECK-NEXT: [[ST5:%.*]] = mul i64 [[STR]], 6
; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[ST5]]
; CHECK-NEXT: [[I3:%.*]] = load float, ptr [[ARRAYIDX6]], align 4
+; CHECK-NEXT: [[ADD7:%.*]] = fsub fast float [[I3]], [[I2]]
+; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, ptr [[S]], i64 1
+; CHECK-NEXT: store float [[ADD7]], ptr [[ARRAYIDX9]], align 4
+; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 10
+; CHECK-NEXT: [[I4:%.*]] = load float, ptr [[ARRAYIDX11]], align 4
; CHECK-NEXT: [[ST4:%.*]] = mul i64 [[STR]], 5
; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[ST4]]
; CHECK-NEXT: [[I5:%.*]] = load float, ptr [[ARRAYIDX13]], align 4
+; CHECK-NEXT: [[ADD14:%.*]] = fsub fast float [[I5]], [[I4]]
+; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds float, ptr [[S]], i64 2
+; CHECK-NEXT: store float [[ADD14]], ptr [[ARRAYIDX16]], align 4
+; CHECK-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 14
+; CHECK-NEXT: [[I6:%.*]] = load float, ptr [[ARRAYIDX18]], align 4
; CHECK-NEXT: [[ST3:%.*]] = mul i64 [[STR]], 4
; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[ST3]]
; CHECK-NEXT: [[I7:%.*]] = load float, ptr [[ARRAYIDX20]], align 4
+; CHECK-NEXT: [[ADD21:%.*]] = fsub fast float [[I7]], [[I6]]
+; CHECK-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds float, ptr [[S]], i64 3
+; CHECK-NEXT: store float [[ADD21]], ptr [[ARRAYIDX23]], align 4
+; CHECK-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 18
; CHECK-NEXT: [[ST2:%.*]] = mul i64 [[STR]], 3
+; CHECK-NEXT: [[I8:%.*]] = load float, ptr [[ARRAYIDX25]], align 4
; CHECK-NEXT: [[ARRAYIDX27:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[ST2]]
; CHECK-NEXT: [[I9:%.*]] = load float, ptr [[ARRAYIDX27]], align 4
+; CHECK-NEXT: [[ADD28:%.*]] = fsub fast float [[I9]], [[I8]]
+; CHECK-NEXT: [[ARRAYIDX30:%.*]] = getelementptr inbounds float, ptr [[S]], i64 4
+; CHECK-NEXT: store float [[ADD28]], ptr [[ARRAYIDX30]], align 4
+; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 22
+; CHECK-NEXT: [[I10:%.*]] = load float, ptr [[ARRAYIDX32]], align 4
; CHECK-NEXT: [[ST1:%.*]] = mul i64 [[STR]], 2
; CHECK-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[ST1]]
; CHECK-NEXT: [[I11:%.*]] = load float, ptr [[ARRAYIDX34]], align 4
+; CHECK-NEXT: [[ADD35:%.*]] = fsub fast float [[I11]], [[I10]]
+; CHECK-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds float, ptr [[S]], i64 5
+; CHECK-NEXT: store float [[ADD35]], ptr [[ARRAYIDX37]], align 4
+; CHECK-NEXT: [[ARRAYIDX39:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 26
+; CHECK-NEXT: [[I12:%.*]] = load float, ptr [[ARRAYIDX39]], align 4
; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[STR]]
; CHECK-NEXT: [[I13:%.*]] = load float, ptr [[ARRAYIDX41]], align 4
+; CHECK-NEXT: [[ADD42:%.*]] = fsub fast float [[I13]], [[I12]]
+; CHECK-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds float, ptr [[S]], i64 6
+; CHECK-NEXT: store float [[ADD42]], ptr [[ARRAYIDX44]], align 4
+; CHECK-NEXT: [[ARRAYIDX46:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 30
+; CHECK-NEXT: [[I14:%.*]] = load float, ptr [[ARRAYIDX46]], align 4
; CHECK-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 0
; CHECK-NEXT: [[I15:%.*]] = load float, ptr [[ARRAYIDX48]], align 4
-; CHECK-NEXT: [[TMP0:%.*]] = call <8 x float> @llvm.experimental.vp.strided.load.v8f32.p0.i64(ptr align 4 [[ARRAYIDX]], i64 16, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x float> poison, float [[I1]], i32 0
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x float> [[TMP1]], float [[I3]], i32 1
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x float> [[TMP2]], float [[I5]], i32 2
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x float> [[TMP3]], float [[I7]], i32 3
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x float> [[TMP4]], float [[I9]], i32 4
-; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x float> [[TMP5]], float [[I11]], i32 5
-; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x float> [[TMP6]], float [[I13]], i32 6
-; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x float> [[TMP7]], float [[I15]], i32 7
-; CHECK-NEXT: [[TMP9:%.*]] = fsub fast <8 x float> [[TMP8]], [[TMP0]]
-; CHECK-NEXT: store <8 x float> [[TMP9]], ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT: [[ADD49:%.*]] = fsub fast float [[I15]], [[I14]]
+; CHECK-NEXT: [[ARRAYIDX51:%.*]] = getelementptr inbounds float, ptr [[S]], i64 7
+; CHECK-NEXT: store float [[ADD49]], ptr [[ARRAYIDX51]], align 4
; CHECK-NEXT: ret void
;
entry:
@@ -321,12 +397,27 @@ define void @test3(ptr %p, ptr noalias %s) {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [48 x float], ptr [[P:%.*]], i64 0, i64 0
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[S:%.*]], i64 0
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 4
+; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 8
+; CHECK-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 12
+; CHECK-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 16
+; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 20
+; CHECK-NEXT: [[ARRAYIDX39:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 24
+; CHECK-NEXT: [[ARRAYIDX46:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 28
; CHECK-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 23
-; CHECK-NEXT: [[TMP0:%.*]] = call <8 x float> @llvm.experimental.vp.strided.load.v8f32.p0.i64(ptr align 4 [[ARRAYIDX]], i64 16, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, ptr [[ARRAYIDX48]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT: [[TMP3:%.*]] = fsub fast <8 x float> [[TMP2]], [[TMP0]]
-; CHECK-NEXT: store <8 x float> [[TMP3]], ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x ptr> poison, ptr [[ARRAYIDX]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x ptr> [[TMP0]], ptr [[ARRAYIDX4]], i32 1
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x ptr> [[TMP1]], ptr [[ARRAYIDX11]], i32 2
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x ptr> [[TMP2]], ptr [[ARRAYIDX18]], i32 3
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x ptr> [[TMP3]], ptr [[ARRAYIDX25]], i32 4
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x ptr> [[TMP4]], ptr [[ARRAYIDX32]], i32 5
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x ptr> [[TMP5]], ptr [[ARRAYIDX39]], i32 6
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x ptr> [[TMP6]], ptr [[ARRAYIDX46]], i32 7
+; CHECK-NEXT: [[TMP8:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> [[TMP7]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> poison)
+; CHECK-NEXT: [[TMP9:%.*]] = load <8 x float>, ptr [[ARRAYIDX48]], align 4
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x float> [[TMP9]], <8 x float> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT: [[TMP11:%.*]] = fsub fast <8 x float> [[TMP10]], [[TMP8]]
+; CHECK-NEXT: store <8 x float> [[TMP11]], ptr [[ARRAYIDX2]], align 4
; CHECK-NEXT: ret void
;
entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-with-external-use-ptr.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-with-external-use-ptr.ll
index ec152c707eec6b..5aba9ea115a4b9 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-with-external-use-ptr.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-with-external-use-ptr.ll
@@ -8,7 +8,7 @@ define i16 @test() {
; CHECK-SAME: () #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[PPREV_058_I:%.*]] = getelementptr [[S:%.*]], ptr null, i64 -1
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x ptr> <ptr null, ptr poison>, ptr [[PPREV_058_I]], i32 1
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x ptr> <ptr poison, ptr null>, ptr [[PPREV_058_I]], i32 0
; CHECK-NEXT: br label [[WHILE_BODY_I:%.*]]
; CHECK: while.body.i:
; CHECK-NEXT: [[TMP1:%.*]] = phi i16 [ 0, [[WHILE_BODY_I]] ], [ 0, [[ENTRY:%.*]] ]
@@ -17,7 +17,7 @@ define i16 @test() {
; CHECK-NEXT: [[TMP4:%.*]] = call <2 x i16> @llvm.masked.gather.v2i16.v2p0(<2 x ptr> [[TMP3]], i32 2, <2 x i1> <i1 true, i1 true>, <2 x i16> poison)
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i16> [[TMP4]], i32 0
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i16> [[TMP4]], i32 1
-; CHECK-NEXT: [[CMP_I178:%.*]] = icmp ult i16 [[TMP5]], [[TMP6]]
+; CHECK-NEXT: [[CMP_I178:%.*]] = icmp ult i16 [[TMP6]], [[TMP5]]
; CHECK-NEXT: br label [[WHILE_BODY_I]]
;
entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads.ll
index 8ab57cc73e646f..8f2c72bb4c6856 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads.ll
@@ -5,11 +5,14 @@ define i32 @sum_of_abs(ptr noalias %a, ptr noalias %b) {
; CHECK-LABEL: define i32 @sum_of_abs
; CHECK-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = call <8 x i8> @llvm.experimental.vp.strided.load.v8i8.p0.i64(ptr align 1 [[A]], i64 64, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
-; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i8> @llvm.abs.v8i8(<8 x i8> [[TMP0]], i1 false)
-; CHECK-NEXT: [[TMP2:%.*]] = sext <8 x i8> [[TMP1]] to <8 x i32>
-; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP2]])
-; CHECK-NEXT: ret i32 [[TMP3]]
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x ptr> poison, ptr [[A]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x ptr> [[TMP0]], <8 x ptr> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, <8 x ptr> [[TMP1]], <8 x i64> <i64 0, i64 64, i64 128, i64 192, i64 256, i64 320, i64 384, i64 448>
+; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> [[TMP2]], i32 1, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i8> poison)
+; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i8> @llvm.abs.v8i8(<8 x i8> [[TMP3]], i1 false)
+; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i8> [[TMP4]] to <8 x i32>
+; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP5]])
+; CHECK-NEXT: ret i32 [[TMP6]]
;
entry:
%0 = load i8, ptr %a, align 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/gep-nodes-with-non-gep-inst.ll b/llvm/test/Transforms/SLPVectorizer/X86/gep-nodes-with-non-gep-inst.ll
index 9e43cefef2801d..96d4c307f1c67f 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/gep-nodes-with-non-gep-inst.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/gep-nodes-with-non-gep-inst.ll
@@ -30,7 +30,7 @@ define void @test() {
; CHECK-SLP-THRESHOLD: bb:
; CHECK-SLP-THRESHOLD-NEXT: [[TMP0:%.*]] = insertelement <4 x ptr> poison, ptr [[COND_IN_V]], i32 0
; CHECK-SLP-THRESHOLD-NEXT: [[TMP1:%.*]] = shufflevector <4 x ptr> [[TMP0]], <4 x ptr> poison, <4 x i32> zeroinitializer
-; CHECK-SLP-THRESHOLD-NEXT: [[TMP2:%.*]] = getelementptr i64, <4 x ptr> [[TMP1]], <4 x i64> <i64 12, i64 8, i64 4, i64 0>
+; CHECK-SLP-THRESHOLD-NEXT: [[TMP2:%.*]] = getelementptr i64, <4 x ptr> [[TMP1]], <4 x i64> <i64 0, i64 4, i64 8, i64 12>
; CHECK-SLP-THRESHOLD-NEXT: [[TMP3:%.*]] = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> [[TMP2]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i64> poison)
; CHECK-SLP-THRESHOLD-NEXT: [[TMP4:%.*]] = icmp eq <4 x i64> [[TMP3]], zeroinitializer
; CHECK-SLP-THRESHOLD-NEXT: ret void
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll b/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll
index 3bc6e64606e399..1add732d32e85c 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll
@@ -7,7 +7,7 @@ define i32 @test(ptr noalias %p, ptr noalias %addr) {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x ptr> poison, ptr [[ADDR:%.*]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x ptr> [[TMP0]], <8 x ptr> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, <8 x ptr> [[TMP1]], <8 x i32> <i32 15, i32 13, i32 11, i32 9, i32 7, i32 5, i32 3, i32 1>
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, <8 x ptr> [[TMP1]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP2]], i32 8, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> poison)
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x ptr> poison, ptr [[P:%.*]], i32 0
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x ptr> [[TMP4]], <8 x ptr> poison, <8 x i32> zeroinitializer