[llvm] a65a5fe - [SLP]Improve masked loads vectorization, attempting gathered loads
Author: Alexey Bataev
Date: 2024-10-08T16:43:10-04:00
New Revision: a65a5feb1a20581c85ee817dae8826f65fef62af
URL: https://github.com/llvm/llvm-project/commit/a65a5feb1a20581c85ee817dae8826f65fef62af
DIFF: https://github.com/llvm/llvm-project/commit/a65a5feb1a20581c85ee817dae8826f65fef62af.diff
LOG: [SLP]Improve masked loads vectorization, attempting gathered loads
If a vector of loads can only be vectorized as a masked gather and there are
several other masked-gather nodes, the compiler can check whether such nodes
can be combined into a single wide consecutive or strided load node, which
provides better performance.
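As an illustration, here is a hedged LLVM IR sketch (not taken from the patch
or its tests; the function names, the i8 element type and the stride-2 layout
are invented for this example): two 2-wide load nodes that, on their own, only
qualify as masked gathers, and the single 4-wide strided load they can be
merged into once the scalars of both nodes are considered together.

declare <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr>, i32 immarg, <2 x i1>, <2 x i8>)
declare <4 x i8> @llvm.experimental.vp.strided.load.v4i8.p0.i64(ptr, i64, <4 x i1>, i32)

; Before: each 2-element SLP node is emitted as a separate masked gather.
define <4 x i8> @two_masked_gathers(ptr %base) {
  %p1 = getelementptr i8, ptr %base, i64 2
  %ptrs0.0 = insertelement <2 x ptr> poison, ptr %base, i32 0
  %ptrs0 = insertelement <2 x ptr> %ptrs0.0, ptr %p1, i32 1
  %g0 = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> %ptrs0, i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
  %p2 = getelementptr i8, ptr %base, i64 4
  %p3 = getelementptr i8, ptr %base, i64 6
  %ptrs1.0 = insertelement <2 x ptr> poison, ptr %p2, i32 0
  %ptrs1 = insertelement <2 x ptr> %ptrs1.0, ptr %p3, i32 1
  %g1 = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> %ptrs1, i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
  %r = shufflevector <2 x i8> %g0, <2 x i8> %g1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i8> %r
}

; After: the same eight bytes, loaded as one 4-lane strided load with stride 2.
define <4 x i8> @one_strided_load(ptr %base) {
  %v = call <4 x i8> @llvm.experimental.vp.strided.load.v4i8.p0.i64(ptr align 1 %base, i64 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 4)
  ret <4 x i8> %v
}

In the patch this is done by recording scatter-vectorized load nodes in
LoadEntriesToVectorize while building the tree and re-attempting them as wide
consecutive or strided loads in tryToVectorizeGatheredLoads.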
Reviewers: RKSimon
Reviewed By: RKSimon
Pull Request: https://github.com/llvm/llvm-project/pull/110151
Added:
Modified:
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
llvm/test/Transforms/SLPVectorizer/RISCV/remarks-insert-into-small-vector.ll
llvm/test/Transforms/SLPVectorizer/RISCV/reversed-strided-node-with-external-ptr.ll
llvm/test/Transforms/SLPVectorizer/RISCV/scatter-vectorize-reversed.ll
llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 7ced7a6d8eadbc..318a61208fe393 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1371,6 +1371,8 @@ class BoUpSLP {
MustGather.clear();
NonScheduledFirst.clear();
EntryToLastInstruction.clear();
+ LoadEntriesToVectorize.clear();
+ IsGraphTransformMode = false;
GatheredLoadsEntriesFirst.reset();
ExternalUses.clear();
ExternalUsesAsOriginalScalar.clear();
@@ -3613,6 +3615,14 @@ class BoUpSLP {
DenseMap<Value *, SmallPtrSet<const TreeEntry *, 4>>;
ValueToGatherNodesMap ValueToGatherNodes;
+ /// A list of load entries (node indices) that can be vectorized using a
+ /// strided or masked-gather approach, but which are first attempted to be
+ /// represented as contiguous loads.
+ SetVector<unsigned> LoadEntriesToVectorize;
+
+ /// true if graph nodes transforming mode is on.
+ bool IsGraphTransformMode = false;
+
/// The index of the first gathered load entry in the VectorizeTree.
std::optional<unsigned> GatheredLoadsEntriesFirst;
@@ -4618,17 +4628,15 @@ static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
if (getUnderlyingObject(Ptr1) != getUnderlyingObject(Ptr2))
return false;
auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
- if (!GEP1)
- return false;
auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
- if (!GEP2)
- return false;
- return GEP1->getNumOperands() == 2 && GEP2->getNumOperands() == 2 &&
- ((isConstant(GEP1->getOperand(1)) &&
- isConstant(GEP2->getOperand(1))) ||
+ return (!GEP1 || GEP1->getNumOperands() == 2) &&
+ (!GEP2 || GEP2->getNumOperands() == 2) &&
+ (((!GEP1 || isConstant(GEP1->getOperand(1))) &&
+ (!GEP2 || isConstant(GEP2->getOperand(1)))) ||
!CompareOpcodes ||
- getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)
- .getOpcode());
+ (GEP1 && GEP2 &&
+ getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)
+ .getOpcode()));
}
/// Calculates minimal alignment as a common alignment.
@@ -5118,9 +5126,9 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
L && Sz > 2 && static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {
return L->isLoopInvariant(V);
})) <= Sz / 2;
- if (ProfitableGatherPointers || all_of(PointerOps, [IsSorted](Value *P) {
+ if (ProfitableGatherPointers || all_of(PointerOps, [](Value *P) {
auto *GEP = dyn_cast<GetElementPtrInst>(P);
- return (IsSorted && !GEP && doesNotNeedToBeScheduled(P)) ||
+ return (!GEP && doesNotNeedToBeScheduled(P)) ||
(GEP && GEP->getNumOperands() == 2 &&
isa<Constant, Instruction>(GEP->getOperand(1)));
})) {
@@ -6667,6 +6675,12 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
ArrayRef<SmallVector<std::pair<LoadInst *, int>>> GatheredLoads) {
GatheredLoadsEntriesFirst = VectorizableTree.size();
+ SmallVector<SmallPtrSet<const Value *, 4>> LoadSetsToVectorize(
+ LoadEntriesToVectorize.size());
+ for (auto [Idx, Set] : zip(LoadEntriesToVectorize, LoadSetsToVectorize))
+ Set.insert(VectorizableTree[Idx]->Scalars.begin(),
+ VectorizableTree[Idx]->Scalars.end());
+
// Sort loads by distance.
auto LoadSorter = [](const std::pair<LoadInst *, int> &L1,
const std::pair<LoadInst *, int> &L2) {
@@ -6924,8 +6938,42 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
}
}
}
+ // Cannot represent the loads as consecutive vectorizable nodes -
+ // just exit.
+ unsigned ConsecutiveNodesSize = 0;
+ if (!LoadEntriesToVectorize.empty() &&
+ any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
+ [&, Slice = Slice](const auto &P) {
+ const auto *It = find_if(Slice, [&](Value *V) {
+ return std::get<1>(P).contains(V);
+ });
+ if (It == Slice.end())
+ return false;
+ ArrayRef<Value *> VL =
+ VectorizableTree[std::get<0>(P)]->Scalars;
+ ConsecutiveNodesSize += VL.size();
+ unsigned Start = std::distance(Slice.begin(), It);
+ unsigned Sz = Slice.size() - Start;
+ return Sz < VL.size() ||
+ Slice.slice(std::distance(Slice.begin(), It),
+ VL.size()) != VL;
+ }))
+ continue;
// Try to build long masked gather loads.
UserMaxVF = bit_ceil(UserMaxVF);
+ if (any_of(seq<unsigned>(Slice.size() / UserMaxVF),
+ [&, Slice = Slice](unsigned Idx) {
+ OrdersType Order;
+ SmallVector<Value *> PointerOps;
+ return canVectorizeLoads(
+ Slice.slice(Idx * UserMaxVF, UserMaxVF),
+ Slice[Idx * UserMaxVF], Order,
+ PointerOps) ==
+ LoadsState::ScatterVectorize;
+ }))
+ UserMaxVF = MaxVF;
+ if (Slice.size() != ConsecutiveNodesSize)
+ MaxVF = std::min<unsigned>(MaxVF, UserMaxVF);
}
for (unsigned VF = MaxVF; VF >= 2; VF /= 2) {
bool IsVectorized = true;
@@ -6934,6 +6982,16 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
Slice.slice(I, std::min(VF, E - I));
if (getTreeEntry(SubSlice.front()))
continue;
+ // Check if the subslice is a subset of a to-be-vectorized load entry
+ // without being equal to that entry; if so, skip it.
+ if (any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
+ [&](const auto &P) {
+ return !SubSlice.equals(
+ VectorizableTree[std::get<0>(P)]
+ ->Scalars) &&
+ set_is_subset(SubSlice, std::get<1>(P));
+ }))
+ continue;
unsigned Sz = VectorizableTree.size();
buildTree_rec(SubSlice, 0, EdgeInfo());
if (Sz == VectorizableTree.size()) {
@@ -6968,6 +7026,20 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
// Final attempt to vectorize non-vectorized loads.
(void)ProcessGatheredLoads(FinalGatheredLoads, /*Final=*/true);
}
+ // Try to vectorize postponed load entries, previously marked as gathered.
+ for (unsigned Idx : LoadEntriesToVectorize) {
+ const TreeEntry &E = *VectorizableTree[Idx];
+ SmallVector<Value *> GatheredScalars(E.Scalars.begin(), E.Scalars.end());
+ // Avoid reordering, if possible.
+ if (!E.ReorderIndices.empty()) {
+ // Build a mask out of the reorder indices and reorder scalars per this
+ // mask.
+ SmallVector<int> ReorderMask;
+ inversePermutation(E.ReorderIndices, ReorderMask);
+ reorderScalars(GatheredScalars, ReorderMask);
+ }
+ buildTree_rec(GatheredScalars, 0, EdgeInfo());
+ }
// If no new entries created, consider it as no gathered loads entries must be
// handled.
if (static_cast<unsigned>(*GatheredLoadsEntriesFirst) ==
@@ -7280,6 +7352,11 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
case LoadsState::Vectorize:
return TreeEntry::Vectorize;
case LoadsState::ScatterVectorize:
+ if (!IsGraphTransformMode && !VectorizableTree.empty()) {
+ // Delay slow vectorized nodes for better vectorization attempts.
+ LoadEntriesToVectorize.insert(VectorizableTree.size());
+ return TreeEntry::NeedToGather;
+ }
return TreeEntry::ScatterVectorize;
case LoadsState::StridedVectorize:
return TreeEntry::StridedVectorize;
@@ -9117,6 +9194,17 @@ getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
void BoUpSLP::transformNodes() {
constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
BaseGraphSize = VectorizableTree.size();
+ // Turn the graph transforming mode on, and turn it off when done.
+ class GraphTransformModeRAAI {
+ bool &SavedIsGraphTransformMode;
+
+ public:
+ GraphTransformModeRAAI(bool &IsGraphTransformMode)
+ : SavedIsGraphTransformMode(IsGraphTransformMode) {
+ IsGraphTransformMode = true;
+ }
+ ~GraphTransformModeRAAI() { SavedIsGraphTransformMode = false; }
+ } TransformContext(IsGraphTransformMode);
// Operands are profitable if they are:
// 1. At least one constant
// or
@@ -9149,7 +9237,7 @@ void BoUpSLP::transformNodes() {
unsigned MinVF = getMinVF(2 * Sz);
// Do not try partial vectorization for small nodes (<= 2), nodes with the
// same opcode and same parent block or all constants.
- if (VL.size() <= 2 ||
+ if (VL.size() <= 2 || LoadEntriesToVectorize.contains(Idx) ||
!(!E.getOpcode() || E.getOpcode() == Instruction::Load ||
E.isAltShuffle() || !allSameBlock(VL)) ||
allConstant(VL) || isSplat(VL))
@@ -9248,6 +9336,8 @@ void BoUpSLP::transformNodes() {
continue;
}
unsigned PrevSize = VectorizableTree.size();
+ [[maybe_unused]] unsigned PrevEntriesSize =
+ LoadEntriesToVectorize.size();
buildTree_rec(Slice, 0, EdgeInfo(&E, UINT_MAX));
if (PrevSize + 1 == VectorizableTree.size() &&
VectorizableTree[PrevSize]->isGather() &&
@@ -9255,6 +9345,8 @@ void BoUpSLP::transformNodes() {
Instruction::ExtractElement &&
!isSplat(Slice)) {
VectorizableTree.pop_back();
+ assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
+ "LoadEntriesToVectorize expected to remain the same");
continue;
}
AddCombinedNode(PrevSize, Cnt);
@@ -9340,17 +9432,19 @@ void BoUpSLP::transformNodes() {
}
}
- // Single load node - exit.
- if (VectorizableTree.size() <= 1 &&
- VectorizableTree.front()->getOpcode() == Instruction::Load)
- return;
- // Small graph with small VF - exit.
- constexpr unsigned SmallTree = 3;
- constexpr unsigned SmallVF = 2;
- if ((VectorizableTree.size() <= SmallTree &&
- VectorizableTree.front()->Scalars.size() == SmallVF) ||
- (VectorizableTree.size() <= 2 && UserIgnoreList))
- return;
+ if (LoadEntriesToVectorize.empty()) {
+ // Single load node - exit.
+ if (VectorizableTree.size() <= 1 &&
+ VectorizableTree.front()->getOpcode() == Instruction::Load)
+ return;
+ // Small graph with small VF - exit.
+ constexpr unsigned SmallTree = 3;
+ constexpr unsigned SmallVF = 2;
+ if ((VectorizableTree.size() <= SmallTree &&
+ VectorizableTree.front()->Scalars.size() == SmallVF) ||
+ (VectorizableTree.size() <= 2 && UserIgnoreList))
+ return;
+ }
// A list of loads to be gathered during the vectorization process. We can
// try to vectorize them at the end, if profitable.
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
index b38c636ccaf5da..443f17a9c09e7a 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
@@ -58,83 +58,80 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; CHECK-NEXT: [[TMP21:%.*]] = zext <2 x i8> [[TMP19]] to <2 x i32>
; CHECK-NEXT: [[TMP84:%.*]] = zext i8 [[TMP29]] to i32
; CHECK-NEXT: [[TMP22:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ADD_PTR64_1]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP23:%.*]] = zext <2 x i8> [[TMP22]] to <2 x i32>
-; CHECK-NEXT: [[TMP30:%.*]] = sub <2 x i32> [[TMP21]], [[TMP23]]
-; CHECK-NEXT: [[TMP42:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX3_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP26:%.*]] = zext <2 x i8> [[TMP42]] to <2 x i32>
+; CHECK-NEXT: [[TMP31:%.*]] = zext <2 x i8> [[TMP22]] to <2 x i32>
+; CHECK-NEXT: [[TMP23:%.*]] = sub <2 x i32> [[TMP21]], [[TMP31]]
+; CHECK-NEXT: [[TMP49:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX3_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP26:%.*]] = zext <2 x i8> [[TMP49]] to <2 x i32>
; CHECK-NEXT: [[TMP27:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP49:%.*]] = zext <2 x i8> [[TMP27]] to <2 x i32>
-; CHECK-NEXT: [[TMP24:%.*]] = sub <2 x i32> [[TMP26]], [[TMP49]]
+; CHECK-NEXT: [[TMP50:%.*]] = zext <2 x i8> [[TMP27]] to <2 x i32>
+; CHECK-NEXT: [[TMP24:%.*]] = sub <2 x i32> [[TMP26]], [[TMP50]]
; CHECK-NEXT: [[TMP25:%.*]] = shl <2 x i32> [[TMP24]], <i32 16, i32 16>
-; CHECK-NEXT: [[TMP31:%.*]] = add <2 x i32> [[TMP25]], [[TMP30]]
+; CHECK-NEXT: [[TMP30:%.*]] = add <2 x i32> [[TMP25]], [[TMP23]]
; CHECK-NEXT: [[TMP32:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX8_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
; CHECK-NEXT: [[TMP33:%.*]] = load i8, ptr [[ARRAYIDX8_2]], align 1
-; CHECK-NEXT: [[TMP50:%.*]] = zext <2 x i8> [[TMP32]] to <2 x i32>
+; CHECK-NEXT: [[TMP51:%.*]] = zext <2 x i8> [[TMP32]] to <2 x i32>
; CHECK-NEXT: [[TMP83:%.*]] = zext i8 [[TMP33]] to i32
-; CHECK-NEXT: [[TMP35:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX10_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP51:%.*]] = zext <2 x i8> [[TMP35]] to <2 x i32>
-; CHECK-NEXT: [[TMP52:%.*]] = sub <2 x i32> [[TMP50]], [[TMP51]]
+; CHECK-NEXT: [[TMP56:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX10_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP57:%.*]] = zext <2 x i8> [[TMP56]] to <2 x i32>
+; CHECK-NEXT: [[TMP35:%.*]] = sub <2 x i32> [[TMP51]], [[TMP57]]
; CHECK-NEXT: [[TMP38:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX13_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
; CHECK-NEXT: [[TMP39:%.*]] = zext <2 x i8> [[TMP38]] to <2 x i32>
; CHECK-NEXT: [[TMP40:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX15_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP56:%.*]] = zext <2 x i8> [[TMP40]] to <2 x i32>
-; CHECK-NEXT: [[TMP36:%.*]] = sub <2 x i32> [[TMP39]], [[TMP56]]
+; CHECK-NEXT: [[TMP61:%.*]] = zext <2 x i8> [[TMP40]] to <2 x i32>
+; CHECK-NEXT: [[TMP36:%.*]] = sub <2 x i32> [[TMP39]], [[TMP61]]
; CHECK-NEXT: [[TMP37:%.*]] = shl <2 x i32> [[TMP36]], <i32 16, i32 16>
-; CHECK-NEXT: [[TMP59:%.*]] = add <2 x i32> [[TMP37]], [[TMP52]]
-; CHECK-NEXT: [[TMP63:%.*]] = add <2 x i32> [[TMP59]], [[TMP31]]
-; CHECK-NEXT: [[TMP72:%.*]] = sub <2 x i32> [[TMP31]], [[TMP59]]
-; CHECK-NEXT: [[TMP73:%.*]] = extractelement <2 x i32> [[TMP63]], i32 0
-; CHECK-NEXT: [[TMP34:%.*]] = extractelement <2 x i32> [[TMP63]], i32 1
+; CHECK-NEXT: [[TMP42:%.*]] = add <2 x i32> [[TMP37]], [[TMP35]]
+; CHECK-NEXT: [[TMP43:%.*]] = add <2 x i32> [[TMP42]], [[TMP30]]
+; CHECK-NEXT: [[TMP44:%.*]] = sub <2 x i32> [[TMP30]], [[TMP42]]
+; CHECK-NEXT: [[TMP73:%.*]] = extractelement <2 x i32> [[TMP43]], i32 0
+; CHECK-NEXT: [[TMP34:%.*]] = extractelement <2 x i32> [[TMP43]], i32 1
; CHECK-NEXT: [[ADD48_2:%.*]] = add i32 [[TMP34]], [[TMP73]]
; CHECK-NEXT: [[SUB51_2:%.*]] = sub i32 [[TMP73]], [[TMP34]]
-; CHECK-NEXT: [[TMP47:%.*]] = extractelement <2 x i32> [[TMP72]], i32 0
-; CHECK-NEXT: [[TMP48:%.*]] = extractelement <2 x i32> [[TMP72]], i32 1
+; CHECK-NEXT: [[TMP47:%.*]] = extractelement <2 x i32> [[TMP44]], i32 0
+; CHECK-NEXT: [[TMP48:%.*]] = extractelement <2 x i32> [[TMP44]], i32 1
; CHECK-NEXT: [[ADD55_2:%.*]] = add i32 [[TMP48]], [[TMP47]]
-; CHECK-NEXT: [[SUB59_2:%.*]] = sub i32 [[TMP47]], [[TMP48]]
+; CHECK-NEXT: [[TMP68:%.*]] = sub i32 [[TMP47]], [[TMP48]]
; CHECK-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr i8, ptr null, i64 4
; CHECK-NEXT: [[ARRAYIDX5_3:%.*]] = getelementptr i8, ptr null, i64 4
; CHECK-NEXT: [[ARRAYIDX8_3:%.*]] = getelementptr i8, ptr null, i64 1
; CHECK-NEXT: [[ARRAYIDX10_3:%.*]] = getelementptr i8, ptr null, i64 1
-; CHECK-NEXT: [[TMP44:%.*]] = load i8, ptr null, align 1
; CHECK-NEXT: [[ARRAYIDX15_3:%.*]] = getelementptr i8, ptr null, i64 5
-; CHECK-NEXT: [[TMP43:%.*]] = load i8, ptr null, align 1
; CHECK-NEXT: [[TMP53:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP76:%.*]] = load i8, ptr null, align 1
-; CHECK-NEXT: [[TMP55:%.*]] = zext <2 x i8> [[TMP53]] to <2 x i32>
-; CHECK-NEXT: [[TMP77:%.*]] = zext i8 [[TMP76]] to i32
+; CHECK-NEXT: [[TMP52:%.*]] = load i8, ptr null, align 1
+; CHECK-NEXT: [[TMP62:%.*]] = zext <2 x i8> [[TMP53]] to <2 x i32>
+; CHECK-NEXT: [[TMP77:%.*]] = zext i8 [[TMP52]] to i32
; CHECK-NEXT: [[TMP54:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP57:%.*]] = zext <2 x i8> [[TMP54]] to <2 x i32>
-; CHECK-NEXT: [[TMP58:%.*]] = sub <2 x i32> [[TMP55]], [[TMP57]]
+; CHECK-NEXT: [[TMP55:%.*]] = zext <2 x i8> [[TMP54]] to <2 x i32>
+; CHECK-NEXT: [[TMP59:%.*]] = sub <2 x i32> [[TMP62]], [[TMP55]]
; CHECK-NEXT: [[TMP41:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX3_3]], i64 -4, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP60:%.*]] = zext <2 x i8> [[TMP41]] to <2 x i32>
-; CHECK-NEXT: [[TMP61:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP62:%.*]] = zext <2 x i8> [[TMP61]] to <2 x i32>
-; CHECK-NEXT: [[TMP45:%.*]] = sub <2 x i32> [[TMP60]], [[TMP62]]
+; CHECK-NEXT: [[TMP58:%.*]] = zext <2 x i8> [[TMP41]] to <2 x i32>
+; CHECK-NEXT: [[TMP63:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP76:%.*]] = zext <2 x i8> [[TMP63]] to <2 x i32>
+; CHECK-NEXT: [[TMP45:%.*]] = sub <2 x i32> [[TMP58]], [[TMP76]]
; CHECK-NEXT: [[TMP46:%.*]] = shl <2 x i32> [[TMP45]], <i32 16, i32 16>
-; CHECK-NEXT: [[TMP82:%.*]] = add <2 x i32> [[TMP46]], [[TMP58]]
-; CHECK-NEXT: [[TMP85:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX8_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP94:%.*]] = zext <2 x i8> [[TMP85]] to <2 x i32>
-; CHECK-NEXT: [[TMP68:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX10_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP103:%.*]] = zext <2 x i8> [[TMP68]] to <2 x i32>
-; CHECK-NEXT: [[TMP106:%.*]] = sub <2 x i32> [[TMP94]], [[TMP103]]
-; CHECK-NEXT: [[TMP64:%.*]] = insertelement <2 x i8> poison, i8 [[TMP44]], i32 0
-; CHECK-NEXT: [[TMP65:%.*]] = insertelement <2 x i8> [[TMP64]], i8 [[TMP43]], i32 1
-; CHECK-NEXT: [[TMP108:%.*]] = zext <2 x i8> [[TMP65]] to <2 x i32>
-; CHECK-NEXT: [[TMP74:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX15_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP75:%.*]] = zext <2 x i8> [[TMP74]] to <2 x i32>
-; CHECK-NEXT: [[TMP69:%.*]] = sub <2 x i32> [[TMP108]], [[TMP75]]
+; CHECK-NEXT: [[TMP60:%.*]] = add <2 x i32> [[TMP46]], [[TMP59]]
+; CHECK-NEXT: [[TMP64:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX8_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP79:%.*]] = zext <2 x i8> [[TMP64]] to <2 x i32>
+; CHECK-NEXT: [[TMP82:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX10_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP91:%.*]] = zext <2 x i8> [[TMP82]] to <2 x i32>
+; CHECK-NEXT: [[TMP65:%.*]] = sub <2 x i32> [[TMP79]], [[TMP91]]
+; CHECK-NEXT: [[TMP75:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> zeroinitializer, i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT: [[TMP98:%.*]] = zext <2 x i8> [[TMP75]] to <2 x i32>
+; CHECK-NEXT: [[TMP100:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX15_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP103:%.*]] = zext <2 x i8> [[TMP100]] to <2 x i32>
+; CHECK-NEXT: [[TMP69:%.*]] = sub <2 x i32> [[TMP98]], [[TMP103]]
; CHECK-NEXT: [[TMP70:%.*]] = shl <2 x i32> [[TMP69]], <i32 16, i32 16>
-; CHECK-NEXT: [[TMP109:%.*]] = add <2 x i32> [[TMP70]], [[TMP106]]
-; CHECK-NEXT: [[TMP79:%.*]] = add <2 x i32> [[TMP109]], [[TMP82]]
-; CHECK-NEXT: [[TMP115:%.*]] = sub <2 x i32> [[TMP82]], [[TMP109]]
-; CHECK-NEXT: [[TMP78:%.*]] = extractelement <2 x i32> [[TMP79]], i32 0
-; CHECK-NEXT: [[TMP71:%.*]] = extractelement <2 x i32> [[TMP79]], i32 1
+; CHECK-NEXT: [[TMP72:%.*]] = add <2 x i32> [[TMP70]], [[TMP65]]
+; CHECK-NEXT: [[TMP90:%.*]] = add <2 x i32> [[TMP72]], [[TMP60]]
+; CHECK-NEXT: [[TMP74:%.*]] = sub <2 x i32> [[TMP60]], [[TMP72]]
+; CHECK-NEXT: [[TMP78:%.*]] = extractelement <2 x i32> [[TMP90]], i32 0
+; CHECK-NEXT: [[TMP71:%.*]] = extractelement <2 x i32> [[TMP90]], i32 1
; CHECK-NEXT: [[ADD48_3:%.*]] = add i32 [[TMP71]], [[TMP78]]
; CHECK-NEXT: [[SUB51_3:%.*]] = sub i32 [[TMP78]], [[TMP71]]
-; CHECK-NEXT: [[TMP80:%.*]] = extractelement <2 x i32> [[TMP115]], i32 0
-; CHECK-NEXT: [[TMP81:%.*]] = extractelement <2 x i32> [[TMP115]], i32 1
+; CHECK-NEXT: [[TMP80:%.*]] = extractelement <2 x i32> [[TMP74]], i32 0
+; CHECK-NEXT: [[TMP81:%.*]] = extractelement <2 x i32> [[TMP74]], i32 1
; CHECK-NEXT: [[ADD55_3:%.*]] = add i32 [[TMP81]], [[TMP80]]
-; CHECK-NEXT: [[SUB59_3:%.*]] = sub i32 [[TMP80]], [[TMP81]]
+; CHECK-NEXT: [[TMP107:%.*]] = sub i32 [[TMP80]], [[TMP81]]
; CHECK-NEXT: [[ADD94:%.*]] = add i32 [[ADD48_3]], [[ADD48_2]]
; CHECK-NEXT: [[SUB102:%.*]] = sub i32 [[ADD48_2]], [[ADD48_3]]
; CHECK-NEXT: [[SHR_I49_3:%.*]] = lshr i32 [[TMP77]], 15
@@ -151,39 +148,39 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; CHECK-NEXT: [[SHR_I49_1:%.*]] = lshr i32 [[TMP84]], 15
; CHECK-NEXT: [[AND_I50_1:%.*]] = and i32 [[SHR_I49_1]], 65537
; CHECK-NEXT: [[ADD94_2:%.*]] = mul i32 [[AND_I50_1]], 65535
-; CHECK-NEXT: [[ADD94_6:%.*]] = add i32 [[SUB51_3]], [[SUB51_2]]
+; CHECK-NEXT: [[ADD94_4:%.*]] = add i32 [[SUB51_3]], [[SUB51_2]]
; CHECK-NEXT: [[SHR_I49_2:%.*]] = lshr i32 [[CONV_1]], 15
; CHECK-NEXT: [[AND_I50_2:%.*]] = and i32 [[SHR_I49_2]], 65537
; CHECK-NEXT: [[MUL_I51_2:%.*]] = mul i32 [[AND_I50_2]], 65535
-; CHECK-NEXT: [[ADD94_5:%.*]] = add i32 [[SUB59_3]], [[SUB59_2]]
-; CHECK-NEXT: [[SUB102_3:%.*]] = sub i32 [[SUB59_2]], [[SUB59_3]]
+; CHECK-NEXT: [[ADD94_3:%.*]] = add i32 [[TMP107]], [[TMP68]]
+; CHECK-NEXT: [[SUB102_3:%.*]] = sub i32 [[TMP68]], [[TMP107]]
; CHECK-NEXT: [[SHR_I49_4:%.*]] = lshr i32 [[CONV1]], 15
; CHECK-NEXT: [[AND_I50_4:%.*]] = and i32 [[SHR_I49_4]], 65537
; CHECK-NEXT: [[MUL_I51_4:%.*]] = mul i32 [[AND_I50_4]], 65535
; CHECK-NEXT: [[TMP66:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8]], align 1
; CHECK-NEXT: [[TMP102:%.*]] = zext <2 x i8> [[TMP66]] to <2 x i32>
; CHECK-NEXT: [[TMP67:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[PIX2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP112:%.*]] = zext <2 x i8> [[TMP67]] to <2 x i32>
-; CHECK-NEXT: [[TMP89:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[TMP1]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP90:%.*]] = zext <2 x i8> [[TMP89]] to <2 x i32>
-; CHECK-NEXT: [[TMP91:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP117:%.*]] = zext <2 x i8> [[TMP91]] to <2 x i32>
-; CHECK-NEXT: [[TMP87:%.*]] = sub <2 x i32> [[TMP90]], [[TMP117]]
+; CHECK-NEXT: [[TMP85:%.*]] = zext <2 x i8> [[TMP67]] to <2 x i32>
+; CHECK-NEXT: [[TMP106:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[TMP1]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP108:%.*]] = zext <2 x i8> [[TMP106]] to <2 x i32>
+; CHECK-NEXT: [[TMP109:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP89:%.*]] = zext <2 x i8> [[TMP109]] to <2 x i32>
+; CHECK-NEXT: [[TMP87:%.*]] = sub <2 x i32> [[TMP108]], [[TMP89]]
; CHECK-NEXT: [[TMP88:%.*]] = shl <2 x i32> [[TMP87]], <i32 16, i32 16>
-; CHECK-NEXT: [[TMP119:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX22]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP121:%.*]] = zext <2 x i8> [[TMP119]] to <2 x i32>
-; CHECK-NEXT: [[TMP128:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX25]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP131:%.*]] = zext <2 x i8> [[TMP128]] to <2 x i32>
-; CHECK-NEXT: [[TMP132:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX27]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP100:%.*]] = zext <2 x i8> [[TMP132]] to <2 x i32>
-; CHECK-NEXT: [[TMP95:%.*]] = sub <2 x i32> [[TMP131]], [[TMP100]]
+; CHECK-NEXT: [[TMP112:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX22]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP120:%.*]] = zext <2 x i8> [[TMP112]] to <2 x i32>
+; CHECK-NEXT: [[TMP94:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX25]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP128:%.*]] = zext <2 x i8> [[TMP94]] to <2 x i32>
+; CHECK-NEXT: [[TMP131:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX27]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP132:%.*]] = zext <2 x i8> [[TMP131]] to <2 x i32>
+; CHECK-NEXT: [[TMP95:%.*]] = sub <2 x i32> [[TMP128]], [[TMP132]]
; CHECK-NEXT: [[TMP96:%.*]] = shl <2 x i32> [[TMP95]], <i32 16, i32 16>
; CHECK-NEXT: [[TMP97:%.*]] = insertelement <2 x i32> [[TMP102]], i32 [[CONV33]], i32 1
-; CHECK-NEXT: [[TMP133:%.*]] = sub <2 x i32> [[TMP97]], [[TMP121]]
-; CHECK-NEXT: [[TMP105:%.*]] = add <2 x i32> [[TMP96]], [[TMP133]]
+; CHECK-NEXT: [[TMP117:%.*]] = sub <2 x i32> [[TMP97]], [[TMP120]]
+; CHECK-NEXT: [[TMP105:%.*]] = add <2 x i32> [[TMP96]], [[TMP117]]
; CHECK-NEXT: [[TMP86:%.*]] = insertelement <2 x i32> [[TMP102]], i32 [[CONV1]], i32 0
-; CHECK-NEXT: [[TMP107:%.*]] = sub <2 x i32> [[TMP86]], [[TMP112]]
-; CHECK-NEXT: [[TMP92:%.*]] = add <2 x i32> [[TMP88]], [[TMP107]]
+; CHECK-NEXT: [[TMP119:%.*]] = sub <2 x i32> [[TMP86]], [[TMP85]]
+; CHECK-NEXT: [[TMP92:%.*]] = add <2 x i32> [[TMP88]], [[TMP119]]
; CHECK-NEXT: [[TMP93:%.*]] = shufflevector <2 x i32> [[TMP105]], <2 x i32> [[TMP92]], <2 x i32> <i32 0, i32 2>
; CHECK-NEXT: [[TMP101:%.*]] = sub <2 x i32> [[TMP92]], [[TMP105]]
; CHECK-NEXT: [[TMP111:%.*]] = extractelement <2 x i32> [[TMP101]], i32 0
@@ -195,22 +192,22 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; CHECK-NEXT: [[TMP104:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8_1]], align 1
; CHECK-NEXT: [[TMP113:%.*]] = zext <2 x i8> [[TMP104]] to <2 x i32>
; CHECK-NEXT: [[TMP114:%.*]] = load <2 x i8>, ptr [[ADD_PTR644]], align 1
-; CHECK-NEXT: [[TMP134:%.*]] = zext <2 x i8> [[TMP114]] to <2 x i32>
+; CHECK-NEXT: [[TMP133:%.*]] = zext <2 x i8> [[TMP114]] to <2 x i32>
; CHECK-NEXT: [[TMP116:%.*]] = load <2 x i8>, ptr [[ARRAYIDX3_1]], align 1
-; CHECK-NEXT: [[TMP145:%.*]] = zext <2 x i8> [[TMP116]] to <2 x i32>
+; CHECK-NEXT: [[TMP115:%.*]] = zext <2 x i8> [[TMP116]] to <2 x i32>
; CHECK-NEXT: [[TMP118:%.*]] = load <2 x i8>, ptr [[ARRAYIDX5_1]], align 1
-; CHECK-NEXT: [[TMP120:%.*]] = zext <2 x i8> [[TMP118]] to <2 x i32>
-; CHECK-NEXT: [[TMP124:%.*]] = sub <2 x i32> [[TMP145]], [[TMP120]]
+; CHECK-NEXT: [[TMP134:%.*]] = zext <2 x i8> [[TMP118]] to <2 x i32>
+; CHECK-NEXT: [[TMP124:%.*]] = sub <2 x i32> [[TMP115]], [[TMP134]]
; CHECK-NEXT: [[TMP125:%.*]] = shl <2 x i32> [[TMP124]], <i32 16, i32 16>
; CHECK-NEXT: [[TMP122:%.*]] = shufflevector <2 x i32> [[TMP113]], <2 x i32> poison, <2 x i32> <i32 poison, i32 0>
; CHECK-NEXT: [[TMP123:%.*]] = insertelement <2 x i32> [[TMP122]], i32 [[CONV_1]], i32 0
-; CHECK-NEXT: [[TMP157:%.*]] = sub <2 x i32> [[TMP123]], [[TMP134]]
-; CHECK-NEXT: [[TMP156:%.*]] = add <2 x i32> [[TMP125]], [[TMP157]]
+; CHECK-NEXT: [[TMP121:%.*]] = sub <2 x i32> [[TMP123]], [[TMP133]]
+; CHECK-NEXT: [[TMP156:%.*]] = add <2 x i32> [[TMP125]], [[TMP121]]
; CHECK-NEXT: [[TMP126:%.*]] = shufflevector <2 x i8> [[TMP7]], <2 x i8> poison, <2 x i32> <i32 1, i32 poison>
; CHECK-NEXT: [[TMP127:%.*]] = insertelement <2 x i8> [[TMP126]], i8 [[TMP14]], i32 1
-; CHECK-NEXT: [[TMP158:%.*]] = zext <2 x i8> [[TMP127]] to <2 x i32>
+; CHECK-NEXT: [[TMP191:%.*]] = zext <2 x i8> [[TMP127]] to <2 x i32>
; CHECK-NEXT: [[TMP129:%.*]] = insertelement <2 x i32> [[TMP113]], i32 [[TMP9]], i32 0
-; CHECK-NEXT: [[TMP130:%.*]] = sub <2 x i32> [[TMP129]], [[TMP158]]
+; CHECK-NEXT: [[TMP130:%.*]] = sub <2 x i32> [[TMP129]], [[TMP191]]
; CHECK-NEXT: [[TMP135:%.*]] = insertelement <2 x i32> [[TMP130]], i32 [[TMP16]], i32 1
; CHECK-NEXT: [[TMP136:%.*]] = shl <2 x i32> [[TMP135]], <i32 16, i32 16>
; CHECK-NEXT: [[TMP110:%.*]] = shufflevector <2 x i32> [[TMP130]], <2 x i32> poison, <2 x i32> <i32 1, i32 poison>
@@ -224,34 +221,34 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; CHECK-NEXT: [[SUB47_1:%.*]] = sub i32 [[TMP138]], [[TMP171]]
; CHECK-NEXT: [[TMP140:%.*]] = shufflevector <2 x i32> [[TMP156]], <2 x i32> [[TMP105]], <2 x i32> <i32 1, i32 2>
; CHECK-NEXT: [[TMP153:%.*]] = shufflevector <2 x i32> [[TMP156]], <2 x i32> [[TMP92]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[TMP163:%.*]] = add <2 x i32> [[TMP140]], [[TMP153]]
-; CHECK-NEXT: [[TMP143:%.*]] = shufflevector <2 x i32> [[TMP105]], <2 x i32> [[TMP155]], <2 x i32> <i32 3, i32 1>
-; CHECK-NEXT: [[TMP144:%.*]] = shufflevector <2 x i32> [[TMP92]], <2 x i32> [[TMP155]], <2 x i32> <i32 2, i32 1>
-; CHECK-NEXT: [[TMP165:%.*]] = add <2 x i32> [[TMP143]], [[TMP144]]
-; CHECK-NEXT: [[TMP98:%.*]] = extractelement <2 x i32> [[TMP163]], i32 1
-; CHECK-NEXT: [[TMP146:%.*]] = extractelement <2 x i32> [[TMP165]], i32 1
-; CHECK-NEXT: [[ADD48:%.*]] = add i32 [[TMP146]], [[TMP98]]
-; CHECK-NEXT: [[SHR_I54_1:%.*]] = lshr i32 [[TMP146]], 15
-; CHECK-NEXT: [[AND_I55_1:%.*]] = and i32 [[SHR_I54_1]], 65537
-; CHECK-NEXT: [[MUL_I56_1:%.*]] = mul i32 [[AND_I55_1]], 65535
-; CHECK-NEXT: [[TMP167:%.*]] = extractelement <2 x i32> [[TMP163]], i32 0
-; CHECK-NEXT: [[TMP166:%.*]] = extractelement <2 x i32> [[TMP165]], i32 0
-; CHECK-NEXT: [[ADD48_1:%.*]] = add i32 [[TMP166]], [[TMP167]]
-; CHECK-NEXT: [[TMP141:%.*]] = sub <2 x i32> [[TMP163]], [[TMP165]]
+; CHECK-NEXT: [[TMP192:%.*]] = add <2 x i32> [[TMP140]], [[TMP153]]
+; CHECK-NEXT: [[TMP141:%.*]] = shufflevector <2 x i32> [[TMP105]], <2 x i32> [[TMP155]], <2 x i32> <i32 3, i32 1>
+; CHECK-NEXT: [[TMP193:%.*]] = shufflevector <2 x i32> [[TMP92]], <2 x i32> [[TMP155]], <2 x i32> <i32 2, i32 1>
+; CHECK-NEXT: [[TMP143:%.*]] = add <2 x i32> [[TMP141]], [[TMP193]]
+; CHECK-NEXT: [[TMP144:%.*]] = extractelement <2 x i32> [[TMP192]], i32 1
+; CHECK-NEXT: [[TMP145:%.*]] = extractelement <2 x i32> [[TMP143]], i32 1
+; CHECK-NEXT: [[ADD48:%.*]] = add i32 [[TMP145]], [[TMP144]]
+; CHECK-NEXT: [[SHR_I59:%.*]] = lshr i32 [[TMP145]], 15
+; CHECK-NEXT: [[AND_I60:%.*]] = and i32 [[SHR_I59]], 65537
+; CHECK-NEXT: [[MUL_I61:%.*]] = mul i32 [[AND_I60]], 65535
+; CHECK-NEXT: [[TMP146:%.*]] = extractelement <2 x i32> [[TMP192]], i32 0
+; CHECK-NEXT: [[TMP147:%.*]] = extractelement <2 x i32> [[TMP143]], i32 0
+; CHECK-NEXT: [[ADD48_1:%.*]] = add i32 [[TMP147]], [[TMP146]]
+; CHECK-NEXT: [[TMP148:%.*]] = sub <2 x i32> [[TMP192]], [[TMP143]]
; CHECK-NEXT: [[ADD55_1:%.*]] = add i32 [[SUB47_1]], [[SUB45_1]]
-; CHECK-NEXT: [[TMP151:%.*]] = shufflevector <2 x i32> [[TMP101]], <2 x i32> poison, <2 x i32> <i32 poison, i32 0>
-; CHECK-NEXT: [[TMP152:%.*]] = insertelement <2 x i32> [[TMP151]], i32 [[SUB45_1]], i32 0
-; CHECK-NEXT: [[TMP154:%.*]] = insertelement <2 x i32> [[TMP101]], i32 [[SUB47_1]], i32 0
-; CHECK-NEXT: [[TMP168:%.*]] = sub <2 x i32> [[TMP152]], [[TMP154]]
-; CHECK-NEXT: [[SHR_I54:%.*]] = lshr i32 [[TMP166]], 15
+; CHECK-NEXT: [[TMP149:%.*]] = shufflevector <2 x i32> [[TMP101]], <2 x i32> poison, <2 x i32> <i32 poison, i32 0>
+; CHECK-NEXT: [[TMP150:%.*]] = insertelement <2 x i32> [[TMP149]], i32 [[SUB45_1]], i32 0
+; CHECK-NEXT: [[TMP151:%.*]] = insertelement <2 x i32> [[TMP101]], i32 [[SUB47_1]], i32 0
+; CHECK-NEXT: [[TMP152:%.*]] = sub <2 x i32> [[TMP150]], [[TMP151]]
+; CHECK-NEXT: [[SHR_I54:%.*]] = lshr i32 [[TMP147]], 15
; CHECK-NEXT: [[AND_I55:%.*]] = and i32 [[SHR_I54]], 65537
; CHECK-NEXT: [[MUL_I56:%.*]] = mul i32 [[AND_I55]], 65535
-; CHECK-NEXT: [[SHR_I54_2:%.*]] = lshr i32 [[SUB47_1]], 15
-; CHECK-NEXT: [[AND_I55_2:%.*]] = and i32 [[SHR_I54_2]], 65537
-; CHECK-NEXT: [[MUL_I56_2:%.*]] = mul i32 [[AND_I55_2]], 65535
-; CHECK-NEXT: [[TMP147:%.*]] = lshr <2 x i32> [[TMP113]], <i32 15, i32 15>
-; CHECK-NEXT: [[TMP148:%.*]] = and <2 x i32> [[TMP147]], <i32 65537, i32 65537>
-; CHECK-NEXT: [[TMP149:%.*]] = mul <2 x i32> [[TMP148]], <i32 65535, i32 65535>
+; CHECK-NEXT: [[SHR_I54_1:%.*]] = lshr i32 [[SUB47_1]], 15
+; CHECK-NEXT: [[AND_I55_1:%.*]] = and i32 [[SHR_I54_1]], 65537
+; CHECK-NEXT: [[MUL_I56_1:%.*]] = mul i32 [[AND_I55_1]], 65535
+; CHECK-NEXT: [[TMP194:%.*]] = lshr <2 x i32> [[TMP113]], <i32 15, i32 15>
+; CHECK-NEXT: [[TMP154:%.*]] = and <2 x i32> [[TMP194]], <i32 65537, i32 65537>
+; CHECK-NEXT: [[TMP195:%.*]] = mul <2 x i32> [[TMP154]], <i32 65535, i32 65535>
; CHECK-NEXT: [[ADD78:%.*]] = add i32 [[ADD48_1]], [[ADD48]]
; CHECK-NEXT: [[SUB86:%.*]] = sub i32 [[ADD48]], [[ADD48_1]]
; CHECK-NEXT: [[ADD103:%.*]] = add i32 [[ADD94]], [[ADD78]]
@@ -263,9 +260,9 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; CHECK-NEXT: [[ADD_I52:%.*]] = add i32 [[MUL_I_1]], [[ADD105]]
; CHECK-NEXT: [[XOR_I53:%.*]] = xor i32 [[ADD_I52]], [[TMP34]]
; CHECK-NEXT: [[ADD_I57:%.*]] = add i32 [[MUL_I56]], [[SUB104]]
-; CHECK-NEXT: [[XOR_I58:%.*]] = xor i32 [[ADD_I57]], [[TMP166]]
-; CHECK-NEXT: [[ADD_I62:%.*]] = add i32 [[MUL_I56_1]], [[SUB106]]
-; CHECK-NEXT: [[XOR_I63:%.*]] = xor i32 [[ADD_I62]], [[TMP146]]
+; CHECK-NEXT: [[XOR_I58:%.*]] = xor i32 [[ADD_I57]], [[TMP147]]
+; CHECK-NEXT: [[ADD_I62:%.*]] = add i32 [[MUL_I61]], [[SUB106]]
+; CHECK-NEXT: [[XOR_I63:%.*]] = xor i32 [[ADD_I62]], [[TMP145]]
; CHECK-NEXT: [[ADD110:%.*]] = add i32 [[XOR_I53]], [[XOR_I]]
; CHECK-NEXT: [[ADD112:%.*]] = add i32 [[ADD110]], [[XOR_I58]]
; CHECK-NEXT: [[ADD113:%.*]] = add i32 [[ADD112]], [[XOR_I63]]
@@ -275,11 +272,11 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; CHECK-NEXT: [[SUB104_1:%.*]] = sub i32 [[ADD78_1]], [[ADD94_1]]
; CHECK-NEXT: [[ADD105_1:%.*]] = add i32 [[SUB102_1]], [[SUB86_1]]
; CHECK-NEXT: [[SUB106_1:%.*]] = sub i32 [[SUB86_1]], [[SUB102_1]]
-; CHECK-NEXT: [[TMP203:%.*]] = add i32 [[MUL_I_2]], [[ADD103_1]]
-; CHECK-NEXT: [[XOR_I_1:%.*]] = xor i32 [[TMP203]], [[TMP83]]
+; CHECK-NEXT: [[ADD_I_1:%.*]] = add i32 [[MUL_I_2]], [[ADD103_1]]
+; CHECK-NEXT: [[XOR_I_1:%.*]] = xor i32 [[ADD_I_1]], [[TMP83]]
; CHECK-NEXT: [[ADD_I52_1:%.*]] = add i32 [[ADD94_2]], [[ADD105_1]]
; CHECK-NEXT: [[XOR_I53_1:%.*]] = xor i32 [[ADD_I52_1]], [[TMP84]]
-; CHECK-NEXT: [[ADD_I57_1:%.*]] = add i32 [[MUL_I56_2]], [[SUB104_1]]
+; CHECK-NEXT: [[ADD_I57_1:%.*]] = add i32 [[MUL_I56_1]], [[SUB104_1]]
; CHECK-NEXT: [[XOR_I58_1:%.*]] = xor i32 [[ADD_I57_1]], [[SUB47_1]]
; CHECK-NEXT: [[ADD_I62_1:%.*]] = add i32 [[MUL_I61_1]], [[SUB106_1]]
; CHECK-NEXT: [[XOR_I63_1:%.*]] = xor i32 [[ADD_I62_1]], [[TMP99]]
@@ -287,69 +284,69 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; CHECK-NEXT: [[ADD110_1:%.*]] = add i32 [[ADD108_1]], [[XOR_I_1]]
; CHECK-NEXT: [[ADD112_1:%.*]] = add i32 [[ADD110_1]], [[XOR_I58_1]]
; CHECK-NEXT: [[ADD113_1:%.*]] = add i32 [[ADD112_1]], [[XOR_I63_1]]
-; CHECK-NEXT: [[TMP169:%.*]] = extractelement <2 x i32> [[TMP141]], i32 0
-; CHECK-NEXT: [[TMP160:%.*]] = extractelement <2 x i32> [[TMP141]], i32 1
-; CHECK-NEXT: [[ADD78_2:%.*]] = add i32 [[TMP169]], [[TMP160]]
-; CHECK-NEXT: [[TMP196:%.*]] = insertelement <2 x i32> [[TMP141]], i32 [[SUB51_2]], i32 0
-; CHECK-NEXT: [[TMP194:%.*]] = shufflevector <2 x i32> [[TMP141]], <2 x i32> poison, <2 x i32> <i32 poison, i32 0>
-; CHECK-NEXT: [[TMP195:%.*]] = insertelement <2 x i32> [[TMP194]], i32 [[SUB51_3]], i32 0
-; CHECK-NEXT: [[TMP164:%.*]] = sub <2 x i32> [[TMP196]], [[TMP195]]
-; CHECK-NEXT: [[TMP201:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_2]], i32 0
-; CHECK-NEXT: [[TMP198:%.*]] = shufflevector <2 x i32> [[TMP201]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP199:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_6]], i32 0
-; CHECK-NEXT: [[TMP200:%.*]] = shufflevector <2 x i32> [[TMP199]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP225:%.*]] = add <2 x i32> [[TMP198]], [[TMP200]]
-; CHECK-NEXT: [[TMP226:%.*]] = sub <2 x i32> [[TMP198]], [[TMP200]]
-; CHECK-NEXT: [[TMP227:%.*]] = shufflevector <2 x i32> [[TMP225]], <2 x i32> [[TMP226]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP204:%.*]] = extractelement <2 x i32> [[TMP164]], i32 0
-; CHECK-NEXT: [[TMP212:%.*]] = extractelement <2 x i32> [[TMP164]], i32 1
-; CHECK-NEXT: [[ADD105_2:%.*]] = add i32 [[TMP204]], [[TMP212]]
-; CHECK-NEXT: [[SUB106_2:%.*]] = sub i32 [[TMP212]], [[TMP204]]
+; CHECK-NEXT: [[TMP196:%.*]] = extractelement <2 x i32> [[TMP148]], i32 0
+; CHECK-NEXT: [[TMP157:%.*]] = extractelement <2 x i32> [[TMP148]], i32 1
+; CHECK-NEXT: [[ADD78_2:%.*]] = add i32 [[TMP196]], [[TMP157]]
+; CHECK-NEXT: [[TMP158:%.*]] = insertelement <2 x i32> [[TMP148]], i32 [[SUB51_2]], i32 0
+; CHECK-NEXT: [[TMP159:%.*]] = shufflevector <2 x i32> [[TMP148]], <2 x i32> poison, <2 x i32> <i32 poison, i32 0>
+; CHECK-NEXT: [[TMP160:%.*]] = insertelement <2 x i32> [[TMP159]], i32 [[SUB51_3]], i32 0
+; CHECK-NEXT: [[TMP161:%.*]] = sub <2 x i32> [[TMP158]], [[TMP160]]
+; CHECK-NEXT: [[TMP162:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_2]], i32 0
+; CHECK-NEXT: [[TMP163:%.*]] = shufflevector <2 x i32> [[TMP162]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP164:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_4]], i32 0
+; CHECK-NEXT: [[TMP165:%.*]] = shufflevector <2 x i32> [[TMP164]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP166:%.*]] = add <2 x i32> [[TMP163]], [[TMP165]]
+; CHECK-NEXT: [[TMP167:%.*]] = sub <2 x i32> [[TMP163]], [[TMP165]]
+; CHECK-NEXT: [[TMP168:%.*]] = shufflevector <2 x i32> [[TMP166]], <2 x i32> [[TMP167]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP169:%.*]] = extractelement <2 x i32> [[TMP161]], i32 0
+; CHECK-NEXT: [[TMP170:%.*]] = extractelement <2 x i32> [[TMP161]], i32 1
+; CHECK-NEXT: [[ADD105_2:%.*]] = add i32 [[TMP169]], [[TMP170]]
+; CHECK-NEXT: [[SUB106_2:%.*]] = sub i32 [[TMP170]], [[TMP169]]
; CHECK-NEXT: [[ADD_I52_2:%.*]] = add i32 [[MUL_I51_2]], [[ADD105_2]]
; CHECK-NEXT: [[XOR_I53_2:%.*]] = xor i32 [[ADD_I52_2]], [[CONV_1]]
-; CHECK-NEXT: [[TMP207:%.*]] = add <2 x i32> [[TMP149]], [[TMP227]]
-; CHECK-NEXT: [[TMP213:%.*]] = xor <2 x i32> [[TMP207]], [[TMP113]]
-; CHECK-NEXT: [[SHR_I59_2:%.*]] = lshr i32 [[TMP98]], 15
+; CHECK-NEXT: [[TMP197:%.*]] = add <2 x i32> [[TMP195]], [[TMP168]]
+; CHECK-NEXT: [[TMP172:%.*]] = xor <2 x i32> [[TMP197]], [[TMP113]]
+; CHECK-NEXT: [[SHR_I59_2:%.*]] = lshr i32 [[TMP144]], 15
; CHECK-NEXT: [[AND_I60_2:%.*]] = and i32 [[SHR_I59_2]], 65537
; CHECK-NEXT: [[MUL_I61_2:%.*]] = mul i32 [[AND_I60_2]], 65535
; CHECK-NEXT: [[ADD_I62_2:%.*]] = add i32 [[MUL_I61_2]], [[SUB106_2]]
-; CHECK-NEXT: [[XOR_I63_2:%.*]] = xor i32 [[ADD_I62_2]], [[TMP98]]
+; CHECK-NEXT: [[XOR_I63_2:%.*]] = xor i32 [[ADD_I62_2]], [[TMP144]]
; CHECK-NEXT: [[ADD108_2:%.*]] = add i32 [[XOR_I53_2]], [[ADD113_1]]
-; CHECK-NEXT: [[TMP176:%.*]] = extractelement <2 x i32> [[TMP213]], i32 0
-; CHECK-NEXT: [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[TMP176]]
-; CHECK-NEXT: [[TMP177:%.*]] = extractelement <2 x i32> [[TMP213]], i32 1
-; CHECK-NEXT: [[ADD112_2:%.*]] = add i32 [[ADD110_2]], [[TMP177]]
+; CHECK-NEXT: [[TMP173:%.*]] = extractelement <2 x i32> [[TMP172]], i32 0
+; CHECK-NEXT: [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[TMP173]]
+; CHECK-NEXT: [[TMP174:%.*]] = extractelement <2 x i32> [[TMP172]], i32 1
+; CHECK-NEXT: [[ADD112_2:%.*]] = add i32 [[ADD110_2]], [[TMP174]]
; CHECK-NEXT: [[ADD113_2:%.*]] = add i32 [[ADD112_2]], [[XOR_I63_2]]
-; CHECK-NEXT: [[SUB59_1:%.*]] = extractelement <2 x i32> [[TMP168]], i32 0
-; CHECK-NEXT: [[SUB59:%.*]] = extractelement <2 x i32> [[TMP168]], i32 1
-; CHECK-NEXT: [[ADD94_3:%.*]] = add i32 [[SUB59_1]], [[SUB59]]
-; CHECK-NEXT: [[SUB86_3:%.*]] = sub i32 [[SUB59]], [[SUB59_1]]
-; CHECK-NEXT: [[TMP223:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_3]], i32 0
-; CHECK-NEXT: [[TMP224:%.*]] = shufflevector <2 x i32> [[TMP223]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP241:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_5]], i32 0
-; CHECK-NEXT: [[TMP242:%.*]] = shufflevector <2 x i32> [[TMP241]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP261:%.*]] = add <2 x i32> [[TMP224]], [[TMP242]]
-; CHECK-NEXT: [[TMP262:%.*]] = sub <2 x i32> [[TMP224]], [[TMP242]]
-; CHECK-NEXT: [[TMP220:%.*]] = shufflevector <2 x i32> [[TMP261]], <2 x i32> [[TMP262]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP175:%.*]] = extractelement <2 x i32> [[TMP152]], i32 0
+; CHECK-NEXT: [[TMP176:%.*]] = extractelement <2 x i32> [[TMP152]], i32 1
+; CHECK-NEXT: [[ADD78_3:%.*]] = add i32 [[TMP175]], [[TMP176]]
+; CHECK-NEXT: [[SUB86_3:%.*]] = sub i32 [[TMP176]], [[TMP175]]
+; CHECK-NEXT: [[TMP177:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_3]], i32 0
+; CHECK-NEXT: [[TMP178:%.*]] = shufflevector <2 x i32> [[TMP177]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP179:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_3]], i32 0
+; CHECK-NEXT: [[TMP180:%.*]] = shufflevector <2 x i32> [[TMP179]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP181:%.*]] = add <2 x i32> [[TMP178]], [[TMP180]]
+; CHECK-NEXT: [[TMP182:%.*]] = sub <2 x i32> [[TMP178]], [[TMP180]]
+; CHECK-NEXT: [[TMP183:%.*]] = shufflevector <2 x i32> [[TMP181]], <2 x i32> [[TMP182]], <2 x i32> <i32 0, i32 3>
; CHECK-NEXT: [[ADD105_3:%.*]] = add i32 [[SUB102_3]], [[SUB86_3]]
; CHECK-NEXT: [[SUB106_3:%.*]] = sub i32 [[SUB86_3]], [[SUB102_3]]
; CHECK-NEXT: [[ADD_I52_3:%.*]] = add i32 [[MUL_I51_4]], [[ADD105_3]]
; CHECK-NEXT: [[XOR_I53_3:%.*]] = xor i32 [[ADD_I52_3]], [[CONV1]]
-; CHECK-NEXT: [[TMP230:%.*]] = lshr <2 x i32> [[TMP102]], <i32 15, i32 15>
-; CHECK-NEXT: [[TMP231:%.*]] = and <2 x i32> [[TMP230]], <i32 65537, i32 65537>
-; CHECK-NEXT: [[TMP232:%.*]] = mul <2 x i32> [[TMP231]], <i32 65535, i32 65535>
-; CHECK-NEXT: [[TMP150:%.*]] = add <2 x i32> [[TMP232]], [[TMP220]]
-; CHECK-NEXT: [[TMP234:%.*]] = xor <2 x i32> [[TMP150]], [[TMP102]]
+; CHECK-NEXT: [[TMP184:%.*]] = lshr <2 x i32> [[TMP102]], <i32 15, i32 15>
+; CHECK-NEXT: [[TMP185:%.*]] = and <2 x i32> [[TMP184]], <i32 65537, i32 65537>
+; CHECK-NEXT: [[TMP186:%.*]] = mul <2 x i32> [[TMP185]], <i32 65535, i32 65535>
+; CHECK-NEXT: [[TMP187:%.*]] = add <2 x i32> [[TMP186]], [[TMP183]]
+; CHECK-NEXT: [[TMP188:%.*]] = xor <2 x i32> [[TMP187]], [[TMP102]]
; CHECK-NEXT: [[SHR_I59_3:%.*]] = lshr i32 [[CONV33]], 15
; CHECK-NEXT: [[AND_I60_3:%.*]] = and i32 [[SHR_I59_3]], 65537
; CHECK-NEXT: [[MUL_I61_3:%.*]] = mul i32 [[AND_I60_3]], 65535
; CHECK-NEXT: [[ADD_I62_3:%.*]] = add i32 [[MUL_I61_3]], [[SUB106_3]]
; CHECK-NEXT: [[XOR_I63_3:%.*]] = xor i32 [[ADD_I62_3]], [[CONV33]]
; CHECK-NEXT: [[ADD108_3:%.*]] = add i32 [[XOR_I53_3]], [[ADD113_2]]
-; CHECK-NEXT: [[TMP192:%.*]] = extractelement <2 x i32> [[TMP234]], i32 0
-; CHECK-NEXT: [[ADD110_3:%.*]] = add i32 [[ADD108_3]], [[TMP192]]
-; CHECK-NEXT: [[TMP193:%.*]] = extractelement <2 x i32> [[TMP234]], i32 1
-; CHECK-NEXT: [[ADD112_3:%.*]] = add i32 [[ADD110_3]], [[TMP193]]
+; CHECK-NEXT: [[TMP189:%.*]] = extractelement <2 x i32> [[TMP188]], i32 0
+; CHECK-NEXT: [[ADD110_3:%.*]] = add i32 [[ADD108_3]], [[TMP189]]
+; CHECK-NEXT: [[TMP190:%.*]] = extractelement <2 x i32> [[TMP188]], i32 1
+; CHECK-NEXT: [[ADD112_3:%.*]] = add i32 [[ADD110_3]], [[TMP190]]
; CHECK-NEXT: [[ADD113_3:%.*]] = add i32 [[ADD112_3]], [[XOR_I63_3]]
; CHECK-NEXT: ret i32 [[ADD113_3]]
;
@@ -361,9 +358,9 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; THR15-NEXT: [[ARRAYIDX3:%.*]] = getelementptr i8, ptr [[PIX1]], i64 4
; THR15-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i8, ptr [[PIX2]], i64 4
; THR15-NEXT: [[ARRAYIDX8:%.*]] = getelementptr i8, ptr [[PIX1]], i64 1
-; THR15-NEXT: [[ARRAYIDX22:%.*]] = getelementptr i8, ptr [[PIX2]], i64 1
-; THR15-NEXT: [[ARRAYIDX25:%.*]] = getelementptr i8, ptr [[PIX1]], i64 5
-; THR15-NEXT: [[ARRAYIDX27:%.*]] = getelementptr i8, ptr [[PIX2]], i64 5
+; THR15-NEXT: [[ARRAYIDX10:%.*]] = getelementptr i8, ptr [[PIX2]], i64 1
+; THR15-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i8, ptr [[PIX1]], i64 5
+; THR15-NEXT: [[ARRAYIDX15:%.*]] = getelementptr i8, ptr [[PIX2]], i64 5
; THR15-NEXT: [[ARRAYIDX32:%.*]] = getelementptr i8, ptr [[PIX1]], i64 3
; THR15-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX32]], align 1
; THR15-NEXT: [[CONV33:%.*]] = zext i8 [[TMP1]] to i32
@@ -374,8 +371,8 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; THR15-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 4
; THR15-NEXT: [[ARRAYIDX5_1:%.*]] = getelementptr i8, ptr [[ADD_PTR644]], i64 4
; THR15-NEXT: [[ARRAYIDX8_1:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 1
-; THR15-NEXT: [[ARRAYIDX32_1:%.*]] = getelementptr i8, ptr [[ADD_PTR644]], i64 2
-; THR15-NEXT: [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX32_1]], align 1
+; THR15-NEXT: [[ARRAYIDX22_1:%.*]] = getelementptr i8, ptr [[ADD_PTR644]], i64 2
+; THR15-NEXT: [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX22_1]], align 1
; THR15-NEXT: [[ARRAYIDX25_1:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 6
; THR15-NEXT: [[TMP4:%.*]] = load i8, ptr [[ARRAYIDX25_1]], align 1
; THR15-NEXT: [[ARRAYIDX27_1:%.*]] = getelementptr i8, ptr [[ADD_PTR644]], i64 6
@@ -384,17 +381,17 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; THR15-NEXT: [[TMP7:%.*]] = insertelement <2 x i8> [[TMP6]], i8 [[TMP5]], i32 1
; THR15-NEXT: [[TMP8:%.*]] = extractelement <2 x i8> [[TMP7]], i32 0
; THR15-NEXT: [[TMP9:%.*]] = zext i8 [[TMP8]] to i32
-; THR15-NEXT: [[ARRAYIDX32_2:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 3
+; THR15-NEXT: [[ARRAYIDX32_1:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 3
; THR15-NEXT: [[ARRAYIDX34_1:%.*]] = getelementptr i8, ptr [[ADD_PTR644]], i64 3
-; THR15-NEXT: [[TMP10:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX32_2]], i64 4, <2 x i1> <i1 true, i1 true>, i32 2)
+; THR15-NEXT: [[TMP10:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX32_1]], i64 4, <2 x i1> <i1 true, i1 true>, i32 2)
; THR15-NEXT: [[TMP11:%.*]] = zext <2 x i8> [[TMP10]] to <2 x i16>
; THR15-NEXT: [[TMP12:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX34_1]], i64 4, <2 x i1> <i1 true, i1 true>, i32 2)
-; THR15-NEXT: [[TMP20:%.*]] = zext <2 x i8> [[TMP12]] to <2 x i16>
-; THR15-NEXT: [[TMP40:%.*]] = sub <2 x i16> [[TMP11]], [[TMP20]]
-; THR15-NEXT: [[TMP46:%.*]] = extractelement <2 x i16> [[TMP40]], i32 1
-; THR15-NEXT: [[TMP16:%.*]] = sext i16 [[TMP46]] to i32
+; THR15-NEXT: [[TMP13:%.*]] = zext <2 x i8> [[TMP12]] to <2 x i16>
+; THR15-NEXT: [[TMP14:%.*]] = sub <2 x i16> [[TMP11]], [[TMP13]]
+; THR15-NEXT: [[TMP15:%.*]] = extractelement <2 x i16> [[TMP14]], i32 1
+; THR15-NEXT: [[TMP16:%.*]] = sext i16 [[TMP15]] to i32
; THR15-NEXT: [[SHL42_1:%.*]] = shl i32 [[TMP16]], 16
-; THR15-NEXT: [[TMP17:%.*]] = extractelement <2 x i16> [[TMP40]], i32 0
+; THR15-NEXT: [[TMP17:%.*]] = extractelement <2 x i16> [[TMP14]], i32 0
; THR15-NEXT: [[TMP18:%.*]] = sext i16 [[TMP17]] to i32
; THR15-NEXT: [[ADD43_1:%.*]] = add i32 [[SHL42_1]], [[TMP18]]
; THR15-NEXT: [[ADD_PTR_1:%.*]] = getelementptr i8, ptr [[ADD_PTR]], i64 [[IDX_EXT]]
@@ -402,94 +399,93 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; THR15-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 4
; THR15-NEXT: [[ARRAYIDX5_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 4
; THR15-NEXT: [[TMP19:%.*]] = load <2 x i8>, ptr [[ADD_PTR_1]], align 1
-; THR15-NEXT: [[TMP66:%.*]] = zext <2 x i8> [[TMP19]] to <2 x i32>
+; THR15-NEXT: [[TMP20:%.*]] = zext <2 x i8> [[TMP19]] to <2 x i32>
; THR15-NEXT: [[TMP21:%.*]] = load <2 x i8>, ptr [[ADD_PTR64_1]], align 1
; THR15-NEXT: [[TMP22:%.*]] = zext <2 x i8> [[TMP21]] to <2 x i32>
-; THR15-NEXT: [[TMP23:%.*]] = sub <2 x i32> [[TMP66]], [[TMP22]]
+; THR15-NEXT: [[TMP23:%.*]] = sub <2 x i32> [[TMP20]], [[TMP22]]
; THR15-NEXT: [[TMP24:%.*]] = load <2 x i8>, ptr [[ARRAYIDX3_2]], align 1
-; THR15-NEXT: [[TMP28:%.*]] = zext <2 x i8> [[TMP24]] to <2 x i32>
-; THR15-NEXT: [[TMP29:%.*]] = load <2 x i8>, ptr [[ARRAYIDX5_2]], align 1
-; THR15-NEXT: [[TMP30:%.*]] = zext <2 x i8> [[TMP29]] to <2 x i32>
-; THR15-NEXT: [[TMP13:%.*]] = sub <2 x i32> [[TMP28]], [[TMP30]]
-; THR15-NEXT: [[TMP14:%.*]] = shl <2 x i32> [[TMP13]], <i32 16, i32 16>
-; THR15-NEXT: [[TMP15:%.*]] = add <2 x i32> [[TMP14]], [[TMP23]]
+; THR15-NEXT: [[TMP25:%.*]] = zext <2 x i8> [[TMP24]] to <2 x i32>
+; THR15-NEXT: [[TMP26:%.*]] = load <2 x i8>, ptr [[ARRAYIDX5_2]], align 1
+; THR15-NEXT: [[TMP27:%.*]] = zext <2 x i8> [[TMP26]] to <2 x i32>
+; THR15-NEXT: [[TMP28:%.*]] = sub <2 x i32> [[TMP25]], [[TMP27]]
+; THR15-NEXT: [[TMP29:%.*]] = shl <2 x i32> [[TMP28]], <i32 16, i32 16>
+; THR15-NEXT: [[TMP30:%.*]] = add <2 x i32> [[TMP29]], [[TMP23]]
; THR15-NEXT: [[ARRAYIDX20_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 2
; THR15-NEXT: [[ARRAYIDX22_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 2
; THR15-NEXT: [[ARRAYIDX25_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 6
; THR15-NEXT: [[ARRAYIDX27_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 6
; THR15-NEXT: [[TMP31:%.*]] = load <2 x i8>, ptr [[ARRAYIDX20_2]], align 1
-; THR15-NEXT: [[TMP47:%.*]] = zext <2 x i8> [[TMP31]] to <2 x i32>
+; THR15-NEXT: [[TMP32:%.*]] = zext <2 x i8> [[TMP31]] to <2 x i32>
; THR15-NEXT: [[TMP33:%.*]] = load <2 x i8>, ptr [[ARRAYIDX22_2]], align 1
-; THR15-NEXT: [[TMP50:%.*]] = zext <2 x i8> [[TMP33]] to <2 x i32>
-; THR15-NEXT: [[TMP35:%.*]] = sub <2 x i32> [[TMP47]], [[TMP50]]
+; THR15-NEXT: [[TMP34:%.*]] = zext <2 x i8> [[TMP33]] to <2 x i32>
+; THR15-NEXT: [[TMP35:%.*]] = sub <2 x i32> [[TMP32]], [[TMP34]]
; THR15-NEXT: [[TMP36:%.*]] = load <2 x i8>, ptr [[ARRAYIDX25_2]], align 1
-; THR15-NEXT: [[TMP53:%.*]] = zext <2 x i8> [[TMP36]] to <2 x i32>
+; THR15-NEXT: [[TMP37:%.*]] = zext <2 x i8> [[TMP36]] to <2 x i32>
; THR15-NEXT: [[TMP38:%.*]] = load <2 x i8>, ptr [[ARRAYIDX27_2]], align 1
; THR15-NEXT: [[TMP39:%.*]] = zext <2 x i8> [[TMP38]] to <2 x i32>
-; THR15-NEXT: [[TMP25:%.*]] = sub <2 x i32> [[TMP53]], [[TMP39]]
-; THR15-NEXT: [[TMP26:%.*]] = shl <2 x i32> [[TMP25]], <i32 16, i32 16>
-; THR15-NEXT: [[TMP27:%.*]] = add <2 x i32> [[TMP26]], [[TMP35]]
-; THR15-NEXT: [[TMP68:%.*]] = extractelement <2 x i32> [[TMP15]], i32 0
-; THR15-NEXT: [[TMP59:%.*]] = extractelement <2 x i32> [[TMP15]], i32 1
-; THR15-NEXT: [[ADD44_2:%.*]] = add i32 [[TMP59]], [[TMP68]]
-; THR15-NEXT: [[SUB45_2:%.*]] = sub i32 [[TMP68]], [[TMP59]]
-; THR15-NEXT: [[TMP76:%.*]] = extractelement <2 x i32> [[TMP27]], i32 0
-; THR15-NEXT: [[TMP60:%.*]] = extractelement <2 x i32> [[TMP27]], i32 1
-; THR15-NEXT: [[ADD46_2:%.*]] = add i32 [[TMP60]], [[TMP76]]
-; THR15-NEXT: [[SUB47_2:%.*]] = sub i32 [[TMP76]], [[TMP60]]
+; THR15-NEXT: [[TMP40:%.*]] = sub <2 x i32> [[TMP37]], [[TMP39]]
+; THR15-NEXT: [[TMP41:%.*]] = shl <2 x i32> [[TMP40]], <i32 16, i32 16>
+; THR15-NEXT: [[TMP42:%.*]] = add <2 x i32> [[TMP41]], [[TMP35]]
+; THR15-NEXT: [[TMP43:%.*]] = extractelement <2 x i32> [[TMP30]], i32 0
+; THR15-NEXT: [[TMP44:%.*]] = extractelement <2 x i32> [[TMP30]], i32 1
+; THR15-NEXT: [[ADD44_2:%.*]] = add i32 [[TMP44]], [[TMP43]]
+; THR15-NEXT: [[SUB45_2:%.*]] = sub i32 [[TMP43]], [[TMP44]]
+; THR15-NEXT: [[TMP45:%.*]] = extractelement <2 x i32> [[TMP42]], i32 0
+; THR15-NEXT: [[TMP46:%.*]] = extractelement <2 x i32> [[TMP42]], i32 1
+; THR15-NEXT: [[ADD46_2:%.*]] = add i32 [[TMP46]], [[TMP45]]
+; THR15-NEXT: [[SUB47_2:%.*]] = sub i32 [[TMP45]], [[TMP46]]
; THR15-NEXT: [[ADD48_2:%.*]] = add i32 [[ADD46_2]], [[ADD44_2]]
-; THR15-NEXT: [[SUB51_2:%.*]] = sub i32 [[ADD44_2]], [[ADD46_2]]
; THR15-NEXT: [[ADD55_2:%.*]] = add i32 [[SUB47_2]], [[SUB45_2]]
-; THR15-NEXT: [[SUB59_2:%.*]] = sub i32 [[SUB45_2]], [[SUB47_2]]
; THR15-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr i8, ptr null, i64 4
; THR15-NEXT: [[ARRAYIDX5_3:%.*]] = getelementptr i8, ptr null, i64 4
-; THR15-NEXT: [[TMP32:%.*]] = load <2 x i8>, ptr null, align 1
+; THR15-NEXT: [[ARRAYIDX8_3:%.*]] = getelementptr i8, ptr null, i64 1
+; THR15-NEXT: [[ARRAYIDX10_3:%.*]] = getelementptr i8, ptr null, i64 1
+; THR15-NEXT: [[ARRAYIDX15_3:%.*]] = getelementptr i8, ptr null, i64 5
+; THR15-NEXT: [[TMP47:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
; THR15-NEXT: [[TMP48:%.*]] = load i8, ptr null, align 1
-; THR15-NEXT: [[TMP49:%.*]] = zext <2 x i8> [[TMP32]] to <2 x i32>
-; THR15-NEXT: [[TMP63:%.*]] = zext i8 [[TMP48]] to i32
-; THR15-NEXT: [[TMP34:%.*]] = load <2 x i8>, ptr null, align 1
-; THR15-NEXT: [[TMP61:%.*]] = zext <2 x i8> [[TMP34]] to <2 x i32>
-; THR15-NEXT: [[TMP93:%.*]] = sub <2 x i32> [[TMP49]], [[TMP61]]
-; THR15-NEXT: [[TMP37:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX3_3]], i64 -4, <2 x i1> <i1 true, i1 true>, i32 2)
-; THR15-NEXT: [[TMP54:%.*]] = zext <2 x i8> [[TMP37]] to <2 x i32>
-; THR15-NEXT: [[TMP55:%.*]] = load <2 x i8>, ptr [[ARRAYIDX5_3]], align 1
-; THR15-NEXT: [[TMP80:%.*]] = zext <2 x i8> [[TMP55]] to <2 x i32>
-; THR15-NEXT: [[TMP41:%.*]] = sub <2 x i32> [[TMP54]], [[TMP80]]
-; THR15-NEXT: [[TMP42:%.*]] = shl <2 x i32> [[TMP41]], <i32 16, i32 16>
-; THR15-NEXT: [[TMP43:%.*]] = add <2 x i32> [[TMP42]], [[TMP93]]
-; THR15-NEXT: [[ARRAYIDX20_3:%.*]] = getelementptr i8, ptr null, i64 2
-; THR15-NEXT: [[ARRAYIDX22_3:%.*]] = getelementptr i8, ptr null, i64 2
-; THR15-NEXT: [[TMP44:%.*]] = load i8, ptr null, align 1
-; THR15-NEXT: [[ARRAYIDX27_3:%.*]] = getelementptr i8, ptr null, i64 6
-; THR15-NEXT: [[TMP45:%.*]] = load i8, ptr null, align 1
-; THR15-NEXT: [[TMP62:%.*]] = load <2 x i8>, ptr [[ARRAYIDX20_3]], align 1
-; THR15-NEXT: [[TMP83:%.*]] = zext <2 x i8> [[TMP62]] to <2 x i32>
-; THR15-NEXT: [[TMP87:%.*]] = load <2 x i8>, ptr [[ARRAYIDX22_3]], align 1
-; THR15-NEXT: [[TMP98:%.*]] = zext <2 x i8> [[TMP87]] to <2 x i32>
-; THR15-NEXT: [[TMP65:%.*]] = sub <2 x i32> [[TMP83]], [[TMP98]]
-; THR15-NEXT: [[TMP51:%.*]] = insertelement <2 x i8> poison, i8 [[TMP44]], i32 0
-; THR15-NEXT: [[TMP52:%.*]] = insertelement <2 x i8> [[TMP51]], i8 [[TMP45]], i32 1
-; THR15-NEXT: [[TMP99:%.*]] = zext <2 x i8> [[TMP52]] to <2 x i32>
-; THR15-NEXT: [[TMP70:%.*]] = load <2 x i8>, ptr [[ARRAYIDX27_3]], align 1
-; THR15-NEXT: [[TMP101:%.*]] = zext <2 x i8> [[TMP70]] to <2 x i32>
-; THR15-NEXT: [[TMP56:%.*]] = sub <2 x i32> [[TMP99]], [[TMP101]]
-; THR15-NEXT: [[TMP57:%.*]] = shl <2 x i32> [[TMP56]], <i32 16, i32 16>
-; THR15-NEXT: [[TMP58:%.*]] = add <2 x i32> [[TMP57]], [[TMP65]]
-; THR15-NEXT: [[TMP104:%.*]] = extractelement <2 x i32> [[TMP43]], i32 0
-; THR15-NEXT: [[TMP102:%.*]] = extractelement <2 x i32> [[TMP43]], i32 1
-; THR15-NEXT: [[ADD44_3:%.*]] = add i32 [[TMP102]], [[TMP104]]
-; THR15-NEXT: [[SUB45_3:%.*]] = sub i32 [[TMP104]], [[TMP102]]
-; THR15-NEXT: [[TMP107:%.*]] = extractelement <2 x i32> [[TMP58]], i32 0
-; THR15-NEXT: [[TMP78:%.*]] = extractelement <2 x i32> [[TMP58]], i32 1
-; THR15-NEXT: [[ADD46_3:%.*]] = add i32 [[TMP78]], [[TMP107]]
-; THR15-NEXT: [[SUB47_3:%.*]] = sub i32 [[TMP107]], [[TMP78]]
-; THR15-NEXT: [[ADD48_3:%.*]] = add i32 [[ADD46_3]], [[ADD44_3]]
-; THR15-NEXT: [[SUB51_3:%.*]] = sub i32 [[ADD44_3]], [[ADD46_3]]
-; THR15-NEXT: [[ADD55_3:%.*]] = add i32 [[SUB47_3]], [[SUB45_3]]
-; THR15-NEXT: [[SUB59_3:%.*]] = sub i32 [[SUB45_3]], [[SUB47_3]]
+; THR15-NEXT: [[TMP49:%.*]] = zext <2 x i8> [[TMP47]] to <2 x i32>
+; THR15-NEXT: [[CONV_3:%.*]] = zext i8 [[TMP48]] to i32
+; THR15-NEXT: [[TMP50:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; THR15-NEXT: [[TMP51:%.*]] = zext <2 x i8> [[TMP50]] to <2 x i32>
+; THR15-NEXT: [[TMP52:%.*]] = sub <2 x i32> [[TMP49]], [[TMP51]]
+; THR15-NEXT: [[TMP53:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX3_3]], i64 -4, <2 x i1> <i1 true, i1 true>, i32 2)
+; THR15-NEXT: [[TMP54:%.*]] = zext <2 x i8> [[TMP53]] to <2 x i32>
+; THR15-NEXT: [[TMP55:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; THR15-NEXT: [[TMP56:%.*]] = zext <2 x i8> [[TMP55]] to <2 x i32>
+; THR15-NEXT: [[TMP57:%.*]] = sub <2 x i32> [[TMP54]], [[TMP56]]
+; THR15-NEXT: [[TMP58:%.*]] = shl <2 x i32> [[TMP57]], <i32 16, i32 16>
+; THR15-NEXT: [[TMP59:%.*]] = add <2 x i32> [[TMP58]], [[TMP52]]
+; THR15-NEXT: [[TMP60:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX8_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; THR15-NEXT: [[TMP61:%.*]] = zext <2 x i8> [[TMP60]] to <2 x i32>
+; THR15-NEXT: [[TMP62:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX10_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; THR15-NEXT: [[TMP63:%.*]] = zext <2 x i8> [[TMP62]] to <2 x i32>
+; THR15-NEXT: [[TMP64:%.*]] = sub <2 x i32> [[TMP61]], [[TMP63]]
+; THR15-NEXT: [[TMP65:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> zeroinitializer, i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; THR15-NEXT: [[TMP66:%.*]] = zext <2 x i8> [[TMP65]] to <2 x i32>
+; THR15-NEXT: [[TMP67:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX15_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; THR15-NEXT: [[TMP68:%.*]] = zext <2 x i8> [[TMP67]] to <2 x i32>
+; THR15-NEXT: [[TMP69:%.*]] = sub <2 x i32> [[TMP66]], [[TMP68]]
+; THR15-NEXT: [[TMP70:%.*]] = shl <2 x i32> [[TMP69]], <i32 16, i32 16>
+; THR15-NEXT: [[TMP71:%.*]] = add <2 x i32> [[TMP70]], [[TMP64]]
+; THR15-NEXT: [[TMP72:%.*]] = add <2 x i32> [[TMP71]], [[TMP59]]
+; THR15-NEXT: [[TMP73:%.*]] = sub <2 x i32> [[TMP59]], [[TMP71]]
+; THR15-NEXT: [[TMP74:%.*]] = extractelement <2 x i32> [[TMP72]], i32 0
+; THR15-NEXT: [[TMP75:%.*]] = extractelement <2 x i32> [[TMP72]], i32 1
+; THR15-NEXT: [[ADD48_3:%.*]] = add i32 [[TMP75]], [[TMP74]]
+; THR15-NEXT: [[TMP76:%.*]] = insertelement <2 x i32> [[TMP72]], i32 [[ADD44_2]], i32 1
+; THR15-NEXT: [[TMP77:%.*]] = shufflevector <2 x i32> [[TMP72]], <2 x i32> poison, <2 x i32> <i32 1, i32 poison>
+; THR15-NEXT: [[TMP78:%.*]] = insertelement <2 x i32> [[TMP77]], i32 [[ADD46_2]], i32 1
+; THR15-NEXT: [[TMP79:%.*]] = sub <2 x i32> [[TMP76]], [[TMP78]]
+; THR15-NEXT: [[TMP80:%.*]] = extractelement <2 x i32> [[TMP73]], i32 0
+; THR15-NEXT: [[TMP81:%.*]] = extractelement <2 x i32> [[TMP73]], i32 1
+; THR15-NEXT: [[ADD55_3:%.*]] = add i32 [[TMP81]], [[TMP80]]
+; THR15-NEXT: [[TMP82:%.*]] = insertelement <2 x i32> [[TMP73]], i32 [[SUB45_2]], i32 1
+; THR15-NEXT: [[TMP83:%.*]] = shufflevector <2 x i32> [[TMP73]], <2 x i32> poison, <2 x i32> <i32 1, i32 poison>
+; THR15-NEXT: [[TMP84:%.*]] = insertelement <2 x i32> [[TMP83]], i32 [[SUB47_2]], i32 1
+; THR15-NEXT: [[TMP85:%.*]] = sub <2 x i32> [[TMP82]], [[TMP84]]
; THR15-NEXT: [[ADD94:%.*]] = add i32 [[ADD48_3]], [[ADD48_2]]
; THR15-NEXT: [[SUB102:%.*]] = sub i32 [[ADD48_2]], [[ADD48_3]]
-; THR15-NEXT: [[SHR_I:%.*]] = lshr i32 [[TMP63]], 15
+; THR15-NEXT: [[SHR_I:%.*]] = lshr i32 [[CONV_3]], 15
; THR15-NEXT: [[AND_I:%.*]] = and i32 [[SHR_I]], 65537
; THR15-NEXT: [[MUL_I:%.*]] = mul i32 [[AND_I]], 65535
; THR15-NEXT: [[SHR_I49:%.*]] = lshr i32 [[ADD46_2]], 15
@@ -497,113 +493,117 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; THR15-NEXT: [[MUL_I51:%.*]] = mul i32 [[AND_I50]], 65535
; THR15-NEXT: [[ADD94_1:%.*]] = add i32 [[ADD55_3]], [[ADD55_2]]
; THR15-NEXT: [[SUB102_1:%.*]] = sub i32 [[ADD55_2]], [[ADD55_3]]
-; THR15-NEXT: [[TMP105:%.*]] = extractelement <2 x i32> [[TMP66]], i32 1
-; THR15-NEXT: [[SHR_I_1:%.*]] = lshr i32 [[TMP105]], 15
+; THR15-NEXT: [[TMP86:%.*]] = extractelement <2 x i32> [[TMP20]], i32 1
+; THR15-NEXT: [[SHR_I_1:%.*]] = lshr i32 [[TMP86]], 15
; THR15-NEXT: [[AND_I_1:%.*]] = and i32 [[SHR_I_1]], 65537
; THR15-NEXT: [[MUL_I_1:%.*]] = mul i32 [[AND_I_1]], 65535
-; THR15-NEXT: [[TMP64:%.*]] = extractelement <2 x i32> [[TMP66]], i32 0
-; THR15-NEXT: [[SHR_I49_2:%.*]] = lshr i32 [[TMP64]], 15
+; THR15-NEXT: [[TMP87:%.*]] = extractelement <2 x i32> [[TMP20]], i32 0
+; THR15-NEXT: [[SHR_I49_1:%.*]] = lshr i32 [[TMP87]], 15
+; THR15-NEXT: [[AND_I50_1:%.*]] = and i32 [[SHR_I49_1]], 65537
+; THR15-NEXT: [[MUL_I51_1:%.*]] = mul i32 [[AND_I50_1]], 65535
+; THR15-NEXT: [[TMP88:%.*]] = extractelement <2 x i32> [[TMP79]], i32 0
+; THR15-NEXT: [[TMP89:%.*]] = extractelement <2 x i32> [[TMP79]], i32 1
+; THR15-NEXT: [[ADD94_2:%.*]] = add i32 [[TMP88]], [[TMP89]]
+; THR15-NEXT: [[SUB102_2:%.*]] = sub i32 [[TMP89]], [[TMP88]]
+; THR15-NEXT: [[SHR_I49_2:%.*]] = lshr i32 [[CONV_1]], 15
; THR15-NEXT: [[AND_I50_2:%.*]] = and i32 [[SHR_I49_2]], 65537
; THR15-NEXT: [[MUL_I51_2:%.*]] = mul i32 [[AND_I50_2]], 65535
-; THR15-NEXT: [[ADD94_2:%.*]] = add i32 [[SUB51_3]], [[SUB51_2]]
-; THR15-NEXT: [[SUB102_2:%.*]] = sub i32 [[SUB51_2]], [[SUB51_3]]
-; THR15-NEXT: [[SHR_I49_3:%.*]] = lshr i32 [[CONV_1]], 15
+; THR15-NEXT: [[TMP90:%.*]] = extractelement <2 x i32> [[TMP85]], i32 0
+; THR15-NEXT: [[TMP91:%.*]] = extractelement <2 x i32> [[TMP85]], i32 1
+; THR15-NEXT: [[ADD94_3:%.*]] = add i32 [[TMP90]], [[TMP91]]
+; THR15-NEXT: [[SUB102_3:%.*]] = sub i32 [[TMP91]], [[TMP90]]
+; THR15-NEXT: [[SHR_I49_3:%.*]] = lshr i32 [[CONV]], 15
; THR15-NEXT: [[AND_I50_3:%.*]] = and i32 [[SHR_I49_3]], 65537
; THR15-NEXT: [[MUL_I51_3:%.*]] = mul i32 [[AND_I50_3]], 65535
-; THR15-NEXT: [[ADD94_3:%.*]] = add i32 [[SUB59_3]], [[SUB59_2]]
-; THR15-NEXT: [[SUB102_3:%.*]] = sub i32 [[SUB59_2]], [[SUB59_3]]
-; THR15-NEXT: [[SHR_I49_4:%.*]] = lshr i32 [[CONV]], 15
-; THR15-NEXT: [[AND_I50_4:%.*]] = and i32 [[SHR_I49_4]], 65537
-; THR15-NEXT: [[MUL_I51_4:%.*]] = mul i32 [[AND_I50_4]], 65535
-; THR15-NEXT: [[TMP81:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8]], align 1
-; THR15-NEXT: [[TMP74:%.*]] = zext <2 x i8> [[TMP81]] to <2 x i32>
-; THR15-NEXT: [[TMP67:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[PIX2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; THR15-NEXT: [[TMP108:%.*]] = zext <2 x i8> [[TMP67]] to <2 x i32>
-; THR15-NEXT: [[TMP69:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; THR15-NEXT: [[TMP109:%.*]] = zext <2 x i8> [[TMP69]] to <2 x i32>
-; THR15-NEXT: [[TMP71:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; THR15-NEXT: [[TMP111:%.*]] = zext <2 x i8> [[TMP71]] to <2 x i32>
-; THR15-NEXT: [[TMP72:%.*]] = sub <2 x i32> [[TMP109]], [[TMP111]]
-; THR15-NEXT: [[TMP73:%.*]] = shl <2 x i32> [[TMP72]], <i32 16, i32 16>
-; THR15-NEXT: [[TMP75:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX22]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; THR15-NEXT: [[TMP125:%.*]] = zext <2 x i8> [[TMP75]] to <2 x i32>
-; THR15-NEXT: [[TMP82:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX25]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; THR15-NEXT: [[TMP94:%.*]] = zext <2 x i8> [[TMP82]] to <2 x i32>
-; THR15-NEXT: [[TMP79:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX27]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; THR15-NEXT: [[TMP96:%.*]] = zext <2 x i8> [[TMP79]] to <2 x i32>
-; THR15-NEXT: [[TMP84:%.*]] = sub <2 x i32> [[TMP94]], [[TMP96]]
-; THR15-NEXT: [[TMP85:%.*]] = shl <2 x i32> [[TMP84]], <i32 16, i32 16>
-; THR15-NEXT: [[TMP86:%.*]] = insertelement <2 x i32> [[TMP74]], i32 [[CONV33]], i32 1
-; THR15-NEXT: [[TMP100:%.*]] = sub <2 x i32> [[TMP86]], [[TMP125]]
-; THR15-NEXT: [[TMP88:%.*]] = add <2 x i32> [[TMP85]], [[TMP100]]
-; THR15-NEXT: [[TMP92:%.*]] = insertelement <2 x i32> [[TMP74]], i32 [[CONV]], i32 0
-; THR15-NEXT: [[TMP120:%.*]] = sub <2 x i32> [[TMP92]], [[TMP108]]
-; THR15-NEXT: [[TMP95:%.*]] = add <2 x i32> [[TMP73]], [[TMP120]]
-; THR15-NEXT: [[TMP97:%.*]] = shufflevector <2 x i32> [[TMP88]], <2 x i32> [[TMP95]], <2 x i32> <i32 0, i32 2>
-; THR15-NEXT: [[TMP77:%.*]] = add <2 x i32> [[TMP88]], [[TMP95]]
-; THR15-NEXT: [[TMP91:%.*]] = sub <2 x i32> [[TMP95]], [[TMP88]]
-; THR15-NEXT: [[TMP89:%.*]] = extractelement <2 x i32> [[TMP77]], i32 0
-; THR15-NEXT: [[TMP90:%.*]] = extractelement <2 x i32> [[TMP77]], i32 1
-; THR15-NEXT: [[ADD48:%.*]] = add i32 [[TMP90]], [[TMP89]]
-; THR15-NEXT: [[SUB51:%.*]] = sub i32 [[TMP89]], [[TMP90]]
-; THR15-NEXT: [[TMP110:%.*]] = extractelement <2 x i32> [[TMP91]], i32 0
-; THR15-NEXT: [[SUB47:%.*]] = extractelement <2 x i32> [[TMP91]], i32 1
-; THR15-NEXT: [[ADD55:%.*]] = add i32 [[SUB47]], [[TMP110]]
-; THR15-NEXT: [[SHR_I59:%.*]] = lshr i32 [[TMP90]], 15
+; THR15-NEXT: [[TMP92:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8]], align 1
+; THR15-NEXT: [[TMP93:%.*]] = zext <2 x i8> [[TMP92]] to <2 x i32>
+; THR15-NEXT: [[TMP94:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[PIX2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; THR15-NEXT: [[TMP95:%.*]] = zext <2 x i8> [[TMP94]] to <2 x i32>
+; THR15-NEXT: [[TMP96:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; THR15-NEXT: [[TMP97:%.*]] = zext <2 x i8> [[TMP96]] to <2 x i32>
+; THR15-NEXT: [[TMP98:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; THR15-NEXT: [[TMP99:%.*]] = zext <2 x i8> [[TMP98]] to <2 x i32>
+; THR15-NEXT: [[TMP100:%.*]] = sub <2 x i32> [[TMP97]], [[TMP99]]
+; THR15-NEXT: [[TMP101:%.*]] = shl <2 x i32> [[TMP100]], <i32 16, i32 16>
+; THR15-NEXT: [[TMP102:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX10]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; THR15-NEXT: [[TMP103:%.*]] = zext <2 x i8> [[TMP102]] to <2 x i32>
+; THR15-NEXT: [[TMP104:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX13]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; THR15-NEXT: [[TMP105:%.*]] = zext <2 x i8> [[TMP104]] to <2 x i32>
+; THR15-NEXT: [[TMP106:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX15]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; THR15-NEXT: [[TMP107:%.*]] = zext <2 x i8> [[TMP106]] to <2 x i32>
+; THR15-NEXT: [[TMP108:%.*]] = sub <2 x i32> [[TMP105]], [[TMP107]]
+; THR15-NEXT: [[TMP109:%.*]] = shl <2 x i32> [[TMP108]], <i32 16, i32 16>
+; THR15-NEXT: [[TMP110:%.*]] = insertelement <2 x i32> [[TMP93]], i32 [[CONV33]], i32 1
+; THR15-NEXT: [[TMP111:%.*]] = sub <2 x i32> [[TMP110]], [[TMP103]]
+; THR15-NEXT: [[TMP112:%.*]] = add <2 x i32> [[TMP109]], [[TMP111]]
+; THR15-NEXT: [[TMP113:%.*]] = insertelement <2 x i32> [[TMP93]], i32 [[CONV]], i32 0
+; THR15-NEXT: [[TMP114:%.*]] = sub <2 x i32> [[TMP113]], [[TMP95]]
+; THR15-NEXT: [[TMP115:%.*]] = add <2 x i32> [[TMP101]], [[TMP114]]
+; THR15-NEXT: [[TMP116:%.*]] = shufflevector <2 x i32> [[TMP112]], <2 x i32> [[TMP115]], <2 x i32> <i32 0, i32 2>
+; THR15-NEXT: [[TMP117:%.*]] = add <2 x i32> [[TMP112]], [[TMP115]]
+; THR15-NEXT: [[TMP118:%.*]] = sub <2 x i32> [[TMP115]], [[TMP112]]
+; THR15-NEXT: [[TMP119:%.*]] = extractelement <2 x i32> [[TMP117]], i32 0
+; THR15-NEXT: [[TMP120:%.*]] = extractelement <2 x i32> [[TMP117]], i32 1
+; THR15-NEXT: [[ADD48:%.*]] = add i32 [[TMP120]], [[TMP119]]
+; THR15-NEXT: [[SUB51:%.*]] = sub i32 [[TMP119]], [[TMP120]]
+; THR15-NEXT: [[TMP121:%.*]] = extractelement <2 x i32> [[TMP118]], i32 0
+; THR15-NEXT: [[TMP122:%.*]] = extractelement <2 x i32> [[TMP118]], i32 1
+; THR15-NEXT: [[ADD55:%.*]] = add i32 [[TMP122]], [[TMP121]]
+; THR15-NEXT: [[SHR_I59:%.*]] = lshr i32 [[TMP120]], 15
; THR15-NEXT: [[AND_I60:%.*]] = and i32 [[SHR_I59]], 65537
; THR15-NEXT: [[MUL_I61:%.*]] = mul i32 [[AND_I60]], 65535
-; THR15-NEXT: [[SHR_I59_1:%.*]] = lshr i32 [[SUB47]], 15
+; THR15-NEXT: [[SHR_I59_1:%.*]] = lshr i32 [[TMP122]], 15
; THR15-NEXT: [[AND_I60_1:%.*]] = and i32 [[SHR_I59_1]], 65537
; THR15-NEXT: [[MUL_I61_1:%.*]] = mul i32 [[AND_I60_1]], 65535
-; THR15-NEXT: [[TMP112:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8_1]], align 1
-; THR15-NEXT: [[TMP130:%.*]] = zext <2 x i8> [[TMP112]] to <2 x i32>
-; THR15-NEXT: [[TMP129:%.*]] = load <2 x i8>, ptr [[ADD_PTR644]], align 1
-; THR15-NEXT: [[TMP115:%.*]] = zext <2 x i8> [[TMP129]] to <2 x i32>
-; THR15-NEXT: [[TMP116:%.*]] = load <2 x i8>, ptr [[ARRAYIDX3_1]], align 1
-; THR15-NEXT: [[TMP117:%.*]] = zext <2 x i8> [[TMP116]] to <2 x i32>
-; THR15-NEXT: [[TMP131:%.*]] = load <2 x i8>, ptr [[ARRAYIDX5_1]], align 1
-; THR15-NEXT: [[TMP119:%.*]] = zext <2 x i8> [[TMP131]] to <2 x i32>
-; THR15-NEXT: [[TMP113:%.*]] = sub <2 x i32> [[TMP117]], [[TMP119]]
-; THR15-NEXT: [[TMP114:%.*]] = shl <2 x i32> [[TMP113]], <i32 16, i32 16>
-; THR15-NEXT: [[TMP103:%.*]] = shufflevector <2 x i32> [[TMP130]], <2 x i32> poison, <2 x i32> <i32 poison, i32 0>
-; THR15-NEXT: [[TMP126:%.*]] = insertelement <2 x i32> [[TMP103]], i32 [[CONV_1]], i32 0
-; THR15-NEXT: [[TMP134:%.*]] = sub <2 x i32> [[TMP126]], [[TMP115]]
-; THR15-NEXT: [[TMP121:%.*]] = add <2 x i32> [[TMP114]], [[TMP134]]
-; THR15-NEXT: [[TMP145:%.*]] = shufflevector <2 x i8> [[TMP7]], <2 x i8> poison, <2 x i32> <i32 1, i32 poison>
-; THR15-NEXT: [[TMP127:%.*]] = insertelement <2 x i8> [[TMP145]], i8 [[TMP3]], i32 1
+; THR15-NEXT: [[TMP123:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8_1]], align 1
+; THR15-NEXT: [[TMP124:%.*]] = zext <2 x i8> [[TMP123]] to <2 x i32>
+; THR15-NEXT: [[TMP125:%.*]] = load <2 x i8>, ptr [[ADD_PTR644]], align 1
+; THR15-NEXT: [[TMP126:%.*]] = zext <2 x i8> [[TMP125]] to <2 x i32>
+; THR15-NEXT: [[TMP127:%.*]] = load <2 x i8>, ptr [[ARRAYIDX3_1]], align 1
; THR15-NEXT: [[TMP128:%.*]] = zext <2 x i8> [[TMP127]] to <2 x i32>
-; THR15-NEXT: [[TMP146:%.*]] = insertelement <2 x i32> [[TMP130]], i32 [[TMP9]], i32 0
-; THR15-NEXT: [[TMP106:%.*]] = sub <2 x i32> [[TMP146]], [[TMP128]]
-; THR15-NEXT: [[TMP118:%.*]] = extractelement <2 x i32> [[TMP106]], i32 0
-; THR15-NEXT: [[SHL30_1:%.*]] = shl i32 [[TMP118]], 16
-; THR15-NEXT: [[TMP132:%.*]] = extractelement <2 x i32> [[TMP106]], i32 1
-; THR15-NEXT: [[ADD31_1:%.*]] = add i32 [[SHL30_1]], [[TMP132]]
-; THR15-NEXT: [[TMP133:%.*]] = extractelement <2 x i32> [[TMP121]], i32 0
-; THR15-NEXT: [[TMP147:%.*]] = extractelement <2 x i32> [[TMP121]], i32 1
-; THR15-NEXT: [[SUB45_1:%.*]] = sub i32 [[TMP133]], [[TMP147]]
-; THR15-NEXT: [[TMP135:%.*]] = shufflevector <2 x i32> [[TMP121]], <2 x i32> poison, <2 x i32> <i32 1, i32 poison>
-; THR15-NEXT: [[TMP136:%.*]] = insertelement <2 x i32> [[TMP135]], i32 [[ADD43_1]], i32 1
-; THR15-NEXT: [[TMP137:%.*]] = insertelement <2 x i32> [[TMP121]], i32 [[ADD31_1]], i32 1
-; THR15-NEXT: [[TMP138:%.*]] = add <2 x i32> [[TMP136]], [[TMP137]]
+; THR15-NEXT: [[TMP129:%.*]] = load <2 x i8>, ptr [[ARRAYIDX5_1]], align 1
+; THR15-NEXT: [[TMP130:%.*]] = zext <2 x i8> [[TMP129]] to <2 x i32>
+; THR15-NEXT: [[TMP131:%.*]] = sub <2 x i32> [[TMP128]], [[TMP130]]
+; THR15-NEXT: [[TMP132:%.*]] = shl <2 x i32> [[TMP131]], <i32 16, i32 16>
+; THR15-NEXT: [[TMP133:%.*]] = shufflevector <2 x i32> [[TMP124]], <2 x i32> poison, <2 x i32> <i32 poison, i32 0>
+; THR15-NEXT: [[TMP134:%.*]] = insertelement <2 x i32> [[TMP133]], i32 [[CONV_1]], i32 0
+; THR15-NEXT: [[TMP135:%.*]] = sub <2 x i32> [[TMP134]], [[TMP126]]
+; THR15-NEXT: [[TMP136:%.*]] = add <2 x i32> [[TMP132]], [[TMP135]]
+; THR15-NEXT: [[TMP137:%.*]] = shufflevector <2 x i8> [[TMP7]], <2 x i8> poison, <2 x i32> <i32 1, i32 poison>
+; THR15-NEXT: [[TMP138:%.*]] = insertelement <2 x i8> [[TMP137]], i8 [[TMP3]], i32 1
+; THR15-NEXT: [[TMP139:%.*]] = zext <2 x i8> [[TMP138]] to <2 x i32>
+; THR15-NEXT: [[TMP140:%.*]] = insertelement <2 x i32> [[TMP124]], i32 [[TMP9]], i32 0
+; THR15-NEXT: [[TMP141:%.*]] = sub <2 x i32> [[TMP140]], [[TMP139]]
+; THR15-NEXT: [[TMP142:%.*]] = extractelement <2 x i32> [[TMP141]], i32 0
+; THR15-NEXT: [[SHL30_1:%.*]] = shl i32 [[TMP142]], 16
+; THR15-NEXT: [[TMP143:%.*]] = extractelement <2 x i32> [[TMP141]], i32 1
+; THR15-NEXT: [[ADD31_1:%.*]] = add i32 [[SHL30_1]], [[TMP143]]
+; THR15-NEXT: [[TMP144:%.*]] = extractelement <2 x i32> [[TMP136]], i32 0
+; THR15-NEXT: [[TMP145:%.*]] = extractelement <2 x i32> [[TMP136]], i32 1
+; THR15-NEXT: [[SUB45_1:%.*]] = sub i32 [[TMP144]], [[TMP145]]
+; THR15-NEXT: [[TMP146:%.*]] = shufflevector <2 x i32> [[TMP136]], <2 x i32> poison, <2 x i32> <i32 1, i32 poison>
+; THR15-NEXT: [[TMP147:%.*]] = insertelement <2 x i32> [[TMP146]], i32 [[ADD43_1]], i32 1
+; THR15-NEXT: [[TMP148:%.*]] = insertelement <2 x i32> [[TMP136]], i32 [[ADD31_1]], i32 1
+; THR15-NEXT: [[TMP149:%.*]] = add <2 x i32> [[TMP147]], [[TMP148]]
; THR15-NEXT: [[SUB47_1:%.*]] = sub i32 [[ADD31_1]], [[ADD43_1]]
-; THR15-NEXT: [[TMP139:%.*]] = extractelement <2 x i32> [[TMP138]], i32 0
-; THR15-NEXT: [[TMP140:%.*]] = extractelement <2 x i32> [[TMP138]], i32 1
-; THR15-NEXT: [[ADD48_1:%.*]] = add i32 [[TMP140]], [[TMP139]]
-; THR15-NEXT: [[SUB51_1:%.*]] = sub i32 [[TMP139]], [[TMP140]]
+; THR15-NEXT: [[TMP150:%.*]] = extractelement <2 x i32> [[TMP149]], i32 0
+; THR15-NEXT: [[TMP151:%.*]] = extractelement <2 x i32> [[TMP149]], i32 1
+; THR15-NEXT: [[ADD48_1:%.*]] = add i32 [[TMP151]], [[TMP150]]
+; THR15-NEXT: [[SUB51_1:%.*]] = sub i32 [[TMP150]], [[TMP151]]
; THR15-NEXT: [[ADD55_1:%.*]] = add i32 [[SUB47_1]], [[SUB45_1]]
-; THR15-NEXT: [[TMP141:%.*]] = shufflevector <2 x i32> [[TMP91]], <2 x i32> poison, <2 x i32> <i32 poison, i32 0>
-; THR15-NEXT: [[TMP142:%.*]] = insertelement <2 x i32> [[TMP141]], i32 [[SUB45_1]], i32 0
-; THR15-NEXT: [[TMP143:%.*]] = insertelement <2 x i32> [[TMP91]], i32 [[SUB47_1]], i32 0
-; THR15-NEXT: [[TMP144:%.*]] = sub <2 x i32> [[TMP142]], [[TMP143]]
-; THR15-NEXT: [[SHR_I54_1:%.*]] = lshr i32 [[TMP140]], 15
+; THR15-NEXT: [[TMP152:%.*]] = shufflevector <2 x i32> [[TMP118]], <2 x i32> poison, <2 x i32> <i32 poison, i32 0>
+; THR15-NEXT: [[TMP153:%.*]] = insertelement <2 x i32> [[TMP152]], i32 [[SUB45_1]], i32 0
+; THR15-NEXT: [[TMP154:%.*]] = insertelement <2 x i32> [[TMP118]], i32 [[SUB47_1]], i32 0
+; THR15-NEXT: [[TMP155:%.*]] = sub <2 x i32> [[TMP153]], [[TMP154]]
+; THR15-NEXT: [[SHR_I54:%.*]] = lshr i32 [[TMP151]], 15
+; THR15-NEXT: [[AND_I55:%.*]] = and i32 [[SHR_I54]], 65537
+; THR15-NEXT: [[MUL_I56:%.*]] = mul i32 [[AND_I55]], 65535
+; THR15-NEXT: [[SHR_I54_1:%.*]] = lshr i32 [[SUB47_1]], 15
; THR15-NEXT: [[AND_I55_1:%.*]] = and i32 [[SHR_I54_1]], 65537
; THR15-NEXT: [[MUL_I56_1:%.*]] = mul i32 [[AND_I55_1]], 65535
-; THR15-NEXT: [[SHR_I54_2:%.*]] = lshr i32 [[SUB47_1]], 15
-; THR15-NEXT: [[AND_I55_2:%.*]] = and i32 [[SHR_I54_2]], 65537
-; THR15-NEXT: [[MUL_I56_2:%.*]] = mul i32 [[AND_I55_2]], 65535
-; THR15-NEXT: [[TMP122:%.*]] = lshr <2 x i32> [[TMP130]], <i32 15, i32 15>
-; THR15-NEXT: [[TMP123:%.*]] = and <2 x i32> [[TMP122]], <i32 65537, i32 65537>
-; THR15-NEXT: [[TMP124:%.*]] = mul <2 x i32> [[TMP123]], <i32 65535, i32 65535>
+; THR15-NEXT: [[TMP156:%.*]] = lshr <2 x i32> [[TMP124]], <i32 15, i32 15>
+; THR15-NEXT: [[TMP157:%.*]] = and <2 x i32> [[TMP156]], <i32 65537, i32 65537>
+; THR15-NEXT: [[TMP158:%.*]] = mul <2 x i32> [[TMP157]], <i32 65535, i32 65535>
; THR15-NEXT: [[ADD78:%.*]] = add i32 [[ADD48_1]], [[ADD48]]
; THR15-NEXT: [[SUB86:%.*]] = sub i32 [[ADD48]], [[ADD48_1]]
; THR15-NEXT: [[ADD103:%.*]] = add i32 [[ADD94]], [[ADD78]]
@@ -611,13 +611,13 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; THR15-NEXT: [[ADD105:%.*]] = add i32 [[SUB102]], [[SUB86]]
; THR15-NEXT: [[SUB106:%.*]] = sub i32 [[SUB86]], [[SUB102]]
; THR15-NEXT: [[ADD_I:%.*]] = add i32 [[MUL_I]], [[ADD103]]
-; THR15-NEXT: [[XOR_I:%.*]] = xor i32 [[ADD_I]], [[TMP63]]
+; THR15-NEXT: [[XOR_I:%.*]] = xor i32 [[ADD_I]], [[CONV_3]]
; THR15-NEXT: [[ADD_I52:%.*]] = add i32 [[MUL_I51]], [[ADD105]]
; THR15-NEXT: [[XOR_I53:%.*]] = xor i32 [[ADD_I52]], [[ADD46_2]]
-; THR15-NEXT: [[ADD_I57:%.*]] = add i32 [[MUL_I56_1]], [[SUB104]]
-; THR15-NEXT: [[XOR_I58:%.*]] = xor i32 [[ADD_I57]], [[TMP140]]
+; THR15-NEXT: [[ADD_I57:%.*]] = add i32 [[MUL_I56]], [[SUB104]]
+; THR15-NEXT: [[XOR_I58:%.*]] = xor i32 [[ADD_I57]], [[TMP151]]
; THR15-NEXT: [[ADD_I62:%.*]] = add i32 [[MUL_I61]], [[SUB106]]
-; THR15-NEXT: [[XOR_I63:%.*]] = xor i32 [[ADD_I62]], [[TMP90]]
+; THR15-NEXT: [[XOR_I63:%.*]] = xor i32 [[ADD_I62]], [[TMP120]]
; THR15-NEXT: [[ADD110:%.*]] = add i32 [[XOR_I53]], [[XOR_I]]
; THR15-NEXT: [[ADD112:%.*]] = add i32 [[ADD110]], [[XOR_I58]]
; THR15-NEXT: [[ADD113:%.*]] = add i32 [[ADD112]], [[XOR_I63]]
@@ -628,73 +628,73 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; THR15-NEXT: [[ADD105_1:%.*]] = add i32 [[SUB102_1]], [[SUB86_1]]
; THR15-NEXT: [[SUB106_1:%.*]] = sub i32 [[SUB86_1]], [[SUB102_1]]
; THR15-NEXT: [[ADD_I_1:%.*]] = add i32 [[MUL_I_1]], [[ADD103_1]]
-; THR15-NEXT: [[XOR_I_1:%.*]] = xor i32 [[ADD_I_1]], [[TMP105]]
-; THR15-NEXT: [[ADD_I52_1:%.*]] = add i32 [[MUL_I51_2]], [[ADD105_1]]
-; THR15-NEXT: [[XOR_I53_1:%.*]] = xor i32 [[ADD_I52_1]], [[TMP64]]
-; THR15-NEXT: [[ADD_I57_1:%.*]] = add i32 [[MUL_I56_2]], [[SUB104_1]]
+; THR15-NEXT: [[XOR_I_1:%.*]] = xor i32 [[ADD_I_1]], [[TMP86]]
+; THR15-NEXT: [[ADD_I52_1:%.*]] = add i32 [[MUL_I51_1]], [[ADD105_1]]
+; THR15-NEXT: [[XOR_I53_1:%.*]] = xor i32 [[ADD_I52_1]], [[TMP87]]
+; THR15-NEXT: [[ADD_I57_1:%.*]] = add i32 [[MUL_I56_1]], [[SUB104_1]]
; THR15-NEXT: [[XOR_I58_1:%.*]] = xor i32 [[ADD_I57_1]], [[SUB47_1]]
; THR15-NEXT: [[ADD_I62_1:%.*]] = add i32 [[MUL_I61_1]], [[SUB106_1]]
-; THR15-NEXT: [[XOR_I63_1:%.*]] = xor i32 [[ADD_I62_1]], [[SUB47]]
+; THR15-NEXT: [[XOR_I63_1:%.*]] = xor i32 [[ADD_I62_1]], [[TMP122]]
; THR15-NEXT: [[ADD108_1:%.*]] = add i32 [[XOR_I53_1]], [[ADD113]]
; THR15-NEXT: [[ADD110_1:%.*]] = add i32 [[ADD108_1]], [[XOR_I_1]]
; THR15-NEXT: [[ADD112_1:%.*]] = add i32 [[ADD110_1]], [[XOR_I58_1]]
; THR15-NEXT: [[ADD113_1:%.*]] = add i32 [[ADD112_1]], [[XOR_I63_1]]
; THR15-NEXT: [[ADD78_2:%.*]] = add i32 [[SUB51_1]], [[SUB51]]
; THR15-NEXT: [[SUB86_2:%.*]] = sub i32 [[SUB51]], [[SUB51_1]]
-; THR15-NEXT: [[TMP152:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_2]], i32 0
-; THR15-NEXT: [[TMP153:%.*]] = shufflevector <2 x i32> [[TMP152]], <2 x i32> poison, <2 x i32> zeroinitializer
-; THR15-NEXT: [[TMP154:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_2]], i32 0
-; THR15-NEXT: [[TMP155:%.*]] = shufflevector <2 x i32> [[TMP154]], <2 x i32> poison, <2 x i32> zeroinitializer
-; THR15-NEXT: [[TMP156:%.*]] = add <2 x i32> [[TMP153]], [[TMP155]]
-; THR15-NEXT: [[TMP157:%.*]] = sub <2 x i32> [[TMP153]], [[TMP155]]
-; THR15-NEXT: [[TMP158:%.*]] = shufflevector <2 x i32> [[TMP156]], <2 x i32> [[TMP157]], <2 x i32> <i32 0, i32 3>
+; THR15-NEXT: [[TMP159:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_2]], i32 0
+; THR15-NEXT: [[TMP160:%.*]] = shufflevector <2 x i32> [[TMP159]], <2 x i32> poison, <2 x i32> zeroinitializer
+; THR15-NEXT: [[TMP161:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_2]], i32 0
+; THR15-NEXT: [[TMP162:%.*]] = shufflevector <2 x i32> [[TMP161]], <2 x i32> poison, <2 x i32> zeroinitializer
+; THR15-NEXT: [[TMP163:%.*]] = add <2 x i32> [[TMP160]], [[TMP162]]
+; THR15-NEXT: [[TMP164:%.*]] = sub <2 x i32> [[TMP160]], [[TMP162]]
+; THR15-NEXT: [[TMP165:%.*]] = shufflevector <2 x i32> [[TMP163]], <2 x i32> [[TMP164]], <2 x i32> <i32 0, i32 3>
; THR15-NEXT: [[ADD105_2:%.*]] = add i32 [[SUB102_2]], [[SUB86_2]]
; THR15-NEXT: [[SUB106_2:%.*]] = sub i32 [[SUB86_2]], [[SUB102_2]]
-; THR15-NEXT: [[ADD_I52_2:%.*]] = add i32 [[MUL_I51_3]], [[ADD105_2]]
+; THR15-NEXT: [[ADD_I52_2:%.*]] = add i32 [[MUL_I51_2]], [[ADD105_2]]
; THR15-NEXT: [[XOR_I53_2:%.*]] = xor i32 [[ADD_I52_2]], [[CONV_1]]
-; THR15-NEXT: [[TMP177:%.*]] = add <2 x i32> [[TMP124]], [[TMP158]]
-; THR15-NEXT: [[TMP160:%.*]] = xor <2 x i32> [[TMP177]], [[TMP130]]
-; THR15-NEXT: [[SHR_I59_2:%.*]] = lshr i32 [[TMP89]], 15
+; THR15-NEXT: [[TMP166:%.*]] = add <2 x i32> [[TMP158]], [[TMP165]]
+; THR15-NEXT: [[TMP167:%.*]] = xor <2 x i32> [[TMP166]], [[TMP124]]
+; THR15-NEXT: [[SHR_I59_2:%.*]] = lshr i32 [[TMP119]], 15
; THR15-NEXT: [[AND_I60_2:%.*]] = and i32 [[SHR_I59_2]], 65537
; THR15-NEXT: [[MUL_I61_2:%.*]] = mul i32 [[AND_I60_2]], 65535
; THR15-NEXT: [[ADD_I62_2:%.*]] = add i32 [[MUL_I61_2]], [[SUB106_2]]
-; THR15-NEXT: [[XOR_I63_2:%.*]] = xor i32 [[ADD_I62_2]], [[TMP89]]
+; THR15-NEXT: [[XOR_I63_2:%.*]] = xor i32 [[ADD_I62_2]], [[TMP119]]
; THR15-NEXT: [[ADD108_2:%.*]] = add i32 [[XOR_I53_2]], [[ADD113_1]]
-; THR15-NEXT: [[TMP161:%.*]] = extractelement <2 x i32> [[TMP160]], i32 0
-; THR15-NEXT: [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[TMP161]]
-; THR15-NEXT: [[TMP162:%.*]] = extractelement <2 x i32> [[TMP160]], i32 1
-; THR15-NEXT: [[ADD112_2:%.*]] = add i32 [[ADD110_2]], [[TMP162]]
+; THR15-NEXT: [[TMP168:%.*]] = extractelement <2 x i32> [[TMP167]], i32 0
+; THR15-NEXT: [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[TMP168]]
+; THR15-NEXT: [[TMP169:%.*]] = extractelement <2 x i32> [[TMP167]], i32 1
+; THR15-NEXT: [[ADD112_2:%.*]] = add i32 [[ADD110_2]], [[TMP169]]
; THR15-NEXT: [[ADD113_2:%.*]] = add i32 [[ADD112_2]], [[XOR_I63_2]]
-; THR15-NEXT: [[TMP159:%.*]] = extractelement <2 x i32> [[TMP144]], i32 0
-; THR15-NEXT: [[TMP176:%.*]] = extractelement <2 x i32> [[TMP144]], i32 1
-; THR15-NEXT: [[ADD78_3:%.*]] = add i32 [[TMP159]], [[TMP176]]
-; THR15-NEXT: [[SUB86_3:%.*]] = sub i32 [[TMP176]], [[TMP159]]
-; THR15-NEXT: [[TMP163:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_3]], i32 0
-; THR15-NEXT: [[TMP164:%.*]] = shufflevector <2 x i32> [[TMP163]], <2 x i32> poison, <2 x i32> zeroinitializer
-; THR15-NEXT: [[TMP165:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_3]], i32 0
-; THR15-NEXT: [[TMP166:%.*]] = shufflevector <2 x i32> [[TMP165]], <2 x i32> poison, <2 x i32> zeroinitializer
-; THR15-NEXT: [[TMP167:%.*]] = add <2 x i32> [[TMP164]], [[TMP166]]
-; THR15-NEXT: [[TMP168:%.*]] = sub <2 x i32> [[TMP164]], [[TMP166]]
-; THR15-NEXT: [[TMP169:%.*]] = shufflevector <2 x i32> [[TMP167]], <2 x i32> [[TMP168]], <2 x i32> <i32 0, i32 3>
+; THR15-NEXT: [[TMP170:%.*]] = extractelement <2 x i32> [[TMP155]], i32 0
+; THR15-NEXT: [[TMP171:%.*]] = extractelement <2 x i32> [[TMP155]], i32 1
+; THR15-NEXT: [[ADD78_3:%.*]] = add i32 [[TMP170]], [[TMP171]]
+; THR15-NEXT: [[SUB86_3:%.*]] = sub i32 [[TMP171]], [[TMP170]]
+; THR15-NEXT: [[TMP172:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_3]], i32 0
+; THR15-NEXT: [[TMP173:%.*]] = shufflevector <2 x i32> [[TMP172]], <2 x i32> poison, <2 x i32> zeroinitializer
+; THR15-NEXT: [[TMP174:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_3]], i32 0
+; THR15-NEXT: [[TMP175:%.*]] = shufflevector <2 x i32> [[TMP174]], <2 x i32> poison, <2 x i32> zeroinitializer
+; THR15-NEXT: [[TMP176:%.*]] = add <2 x i32> [[TMP173]], [[TMP175]]
+; THR15-NEXT: [[TMP177:%.*]] = sub <2 x i32> [[TMP173]], [[TMP175]]
+; THR15-NEXT: [[TMP178:%.*]] = shufflevector <2 x i32> [[TMP176]], <2 x i32> [[TMP177]], <2 x i32> <i32 0, i32 3>
; THR15-NEXT: [[ADD105_3:%.*]] = add i32 [[SUB102_3]], [[SUB86_3]]
; THR15-NEXT: [[SUB106_3:%.*]] = sub i32 [[SUB86_3]], [[SUB102_3]]
-; THR15-NEXT: [[ADD_I52_3:%.*]] = add i32 [[MUL_I51_4]], [[ADD105_3]]
+; THR15-NEXT: [[ADD_I52_3:%.*]] = add i32 [[MUL_I51_3]], [[ADD105_3]]
; THR15-NEXT: [[XOR_I53_3:%.*]] = xor i32 [[ADD_I52_3]], [[CONV]]
-; THR15-NEXT: [[TMP170:%.*]] = lshr <2 x i32> [[TMP74]], <i32 15, i32 15>
-; THR15-NEXT: [[TMP171:%.*]] = and <2 x i32> [[TMP170]], <i32 65537, i32 65537>
-; THR15-NEXT: [[TMP172:%.*]] = mul <2 x i32> [[TMP171]], <i32 65535, i32 65535>
-; THR15-NEXT: [[TMP173:%.*]] = add <2 x i32> [[TMP172]], [[TMP169]]
-; THR15-NEXT: [[TMP174:%.*]] = xor <2 x i32> [[TMP173]], [[TMP74]]
+; THR15-NEXT: [[TMP179:%.*]] = lshr <2 x i32> [[TMP93]], <i32 15, i32 15>
+; THR15-NEXT: [[TMP180:%.*]] = and <2 x i32> [[TMP179]], <i32 65537, i32 65537>
+; THR15-NEXT: [[TMP181:%.*]] = mul <2 x i32> [[TMP180]], <i32 65535, i32 65535>
+; THR15-NEXT: [[TMP182:%.*]] = add <2 x i32> [[TMP181]], [[TMP178]]
+; THR15-NEXT: [[TMP183:%.*]] = xor <2 x i32> [[TMP182]], [[TMP93]]
; THR15-NEXT: [[SHR_I59_3:%.*]] = lshr i32 [[CONV33]], 15
; THR15-NEXT: [[AND_I60_3:%.*]] = and i32 [[SHR_I59_3]], 65537
; THR15-NEXT: [[MUL_I61_3:%.*]] = mul i32 [[AND_I60_3]], 65535
; THR15-NEXT: [[ADD_I62_3:%.*]] = add i32 [[MUL_I61_3]], [[SUB106_3]]
; THR15-NEXT: [[XOR_I63_3:%.*]] = xor i32 [[ADD_I62_3]], [[CONV33]]
; THR15-NEXT: [[ADD108_3:%.*]] = add i32 [[XOR_I53_3]], [[ADD113_2]]
-; THR15-NEXT: [[TMP175:%.*]] = extractelement <2 x i32> [[TMP174]], i32 0
-; THR15-NEXT: [[ADD110_3:%.*]] = add i32 [[ADD108_3]], [[TMP175]]
-; THR15-NEXT: [[TMP178:%.*]] = extractelement <2 x i32> [[TMP174]], i32 1
-; THR15-NEXT: [[ADD112_3:%.*]] = add i32 [[ADD110_3]], [[TMP178]]
+; THR15-NEXT: [[TMP184:%.*]] = extractelement <2 x i32> [[TMP183]], i32 0
+; THR15-NEXT: [[ADD110_3:%.*]] = add i32 [[ADD108_3]], [[TMP184]]
+; THR15-NEXT: [[TMP185:%.*]] = extractelement <2 x i32> [[TMP183]], i32 1
+; THR15-NEXT: [[ADD112_3:%.*]] = add i32 [[ADD110_3]], [[TMP185]]
; THR15-NEXT: [[ADD113_3:%.*]] = add i32 [[ADD112_3]], [[XOR_I63_3]]
; THR15-NEXT: ret i32 [[ADD113_3]]
;
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/remarks-insert-into-small-vector.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/remarks-insert-into-small-vector.ll
index bb806be15c71ca..09612444afd205 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/remarks-insert-into-small-vector.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/remarks-insert-into-small-vector.ll
@@ -8,20 +8,17 @@
; YAML-NEXT: Function: test
; YAML-NEXT: Args:
; YAML-NEXT: - String: 'Stores SLP vectorized with cost '
-; YAML-NEXT: - Cost: '2'
+; YAML-NEXT: - Cost: '0'
; YAML-NEXT: - String: ' and with tree size '
-; YAML-NEXT: - TreeSize: '7'
+; YAML-NEXT: - TreeSize: '9'
define void @test() {
; CHECK-LABEL: define void @test(
; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr null, align 4
-; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr null, align 4
; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr null, align 4
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> <float poison, float 0.000000e+00>, float [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x float> [[TMP4]], float [[TMP2]], i32 1
+; CHECK-NEXT: [[TMP5:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> zeroinitializer, i32 4, <2 x i1> <i1 true, i1 true>, <2 x float> poison)
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> <float poison, float 0.000000e+00>, float [[TMP2]], i32 0
; CHECK-NEXT: [[TMP6:%.*]] = fcmp ogt <2 x float> [[TMP3]], [[TMP5]]
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i1> [[TMP6]], <2 x i1> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/reversed-strided-node-with-external-ptr.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/reversed-strided-node-with-external-ptr.ll
index 3fa42047162e45..9c1da08c64b7b7 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/reversed-strided-node-with-external-ptr.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/reversed-strided-node-with-external-ptr.ll
@@ -10,10 +10,10 @@ define void @test(ptr %a, i64 %0) {
; CHECK-NEXT: br label %[[BB:.*]]
; CHECK: [[BB]]:
; CHECK-NEXT: [[TMP3:%.*]] = or disjoint i64 [[TMP0]], 1
-; CHECK-NEXT: [[ARRAYIDX17_I28_1:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP3]]
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> poison, i64 [[TMP3]], i32 0
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> [[TMP4]], i64 0, i32 1
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr double, <2 x ptr> [[TMP2]], <2 x i64> [[TMP5]]
+; CHECK-NEXT: [[ARRAYIDX17_I28_1:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP3]]
; CHECK-NEXT: [[TMP7:%.*]] = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> [[TMP6]], i32 8, <2 x i1> <i1 true, i1 true>, <2 x double> poison)
; CHECK-NEXT: [[TMP8:%.*]] = load <2 x double>, ptr [[A]], align 8
; CHECK-NEXT: [[TMP9:%.*]] = load <2 x double>, ptr [[A]], align 8
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/scatter-vectorize-reversed.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/scatter-vectorize-reversed.ll
index 2daa3b58e5c3ac..98333c7b420cf0 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/scatter-vectorize-reversed.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/scatter-vectorize-reversed.ll
@@ -5,12 +5,12 @@ define <4 x i32> @test(<2 x i64> %v, ptr %p) {
; CHECK-LABEL: define <4 x i32> @test(
; CHECK-SAME: <2 x i64> [[V:%.*]], ptr [[P:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[V]], <2 x i64> poison, <2 x i32> <i32 1, i32 0>
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x ptr> poison, ptr [[P]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x ptr> [[TMP0]], <2 x ptr> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i16, <2 x ptr> [[TMP1]], <2 x i64> [[V]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i16, <2 x ptr> [[TMP1]], <2 x i64> [[TMP4]]
; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i16> @llvm.masked.gather.v2i16.v2p0(<2 x ptr> [[TMP2]], i32 2, <2 x i1> <i1 true, i1 true>, <2 x i16> poison)
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT: [[TMP7:%.*]] = zext <2 x i16> [[TMP4]] to <2 x i32>
+; CHECK-NEXT: [[TMP7:%.*]] = zext <2 x i16> [[TMP3]] to <2 x i32>
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> zeroinitializer, <4 x i32> [[TMP6]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
; CHECK-NEXT: ret <4 x i32> [[TMP5]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll b/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll
index 26c4d55436d22b..59b0352a825929 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll
@@ -24,7 +24,7 @@ define i32 @test(ptr noalias %p, ptr noalias %addr) {
; YAML-NEXT: - String: 'Vectorized horizontal reduction with cost '
; YAML-NEXT: - Cost: '-1'
; YAML-NEXT: - String: ' and with tree size '
- ; YAML-NEXT: - TreeSize: '7'
+ ; YAML-NEXT: - TreeSize: '8'
entry:
%off0.1 = getelementptr inbounds i32, ptr %addr, i32 1
%idx0 = load i32, ptr %off0.1, align 8