[llvm] [SLP]Improve masked loads vectorization, attempting gathered loads (PR #110151)
Alexey Bataev via llvm-commits
llvm-commits at lists.llvm.org
Thu Sep 26 11:10:06 PDT 2024
https://github.com/alexey-bataev created https://github.com/llvm/llvm-project/pull/110151
If the vector of loads can be vectorized as a masked gather and there are
several other masked gather nodes, the compiler can attempt to check
whether it is possible to combine such nodes into a single big
consecutive/strided loads node, which provides better performance.
>From e4768e045c2f56ccfdf0cbeeaacb0cacd754fb22 Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev at outlook.com>
Date: Thu, 26 Sep 2024 18:09:51 +0000
Subject: [PATCH] =?UTF-8?q?[=F0=9D=98=80=F0=9D=97=BD=F0=9D=97=BF]=20initia?=
=?UTF-8?q?l=20version?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Created using spr 1.3.5
---
.../Transforms/Vectorize/SLPVectorizer.cpp | 142 +++++--
.../SLPVectorizer/RISCV/complex-loads.ll | 358 +++++++++---------
.../RISCV/remarks-insert-into-small-vector.ll | 11 +-
...reversed-strided-node-with-external-ptr.ll | 2 +-
.../RISCV/scatter-vectorize-reversed.ll | 6 +-
.../X86/remark_gather-load-redux-cost.ll | 2 +-
6 files changed, 306 insertions(+), 215 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 53d7ae606ffeea..62c77704d92eb5 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1368,6 +1368,8 @@ class BoUpSLP {
MustGather.clear();
NonScheduledFirst.clear();
EntryToLastInstruction.clear();
+ LoadEntriesToVectorize.clear();
+ IsGraphTransformMode = false;
GatheredLoadsEntriesFirst = NoGatheredLoads;
ExternalUses.clear();
ExternalUsesAsOriginalScalar.clear();
@@ -3610,6 +3612,13 @@ class BoUpSLP {
DenseMap<Value *, SmallPtrSet<const TreeEntry *, 4>>;
ValueToGatherNodesMap ValueToGatherNodes;
+ /// A list of the loads, which can be vectorized using strided or masked
+ /// gather approach, but attempted to be represented as contiguous loads.
+ SetVector<unsigned> LoadEntriesToVectorize;
+
+ /// true if graph nodes transforming mode is on.
+ bool IsGraphTransformMode = false;
+
/// The index of the first gathered load entry in the VectorizeTree.
constexpr static int NoGatheredLoads = -1;
int GatheredLoadsEntriesFirst = NoGatheredLoads;
@@ -4612,17 +4621,15 @@ static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
if (getUnderlyingObject(Ptr1) != getUnderlyingObject(Ptr2))
return false;
auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
- if (!GEP1)
- return false;
auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
- if (!GEP2)
- return false;
- return GEP1->getNumOperands() == 2 && GEP2->getNumOperands() == 2 &&
- ((isConstant(GEP1->getOperand(1)) &&
- isConstant(GEP2->getOperand(1))) ||
+ return (!GEP1 || GEP1->getNumOperands() == 2) &&
+ (!GEP2 || GEP2->getNumOperands() == 2) &&
+ (((!GEP1 || isConstant(GEP1->getOperand(1))) &&
+ (!GEP2 || isConstant(GEP2->getOperand(1)))) ||
!CompareOpcodes ||
- getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)
- .getOpcode());
+ (GEP1 && GEP2 &&
+ getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)
+ .getOpcode()));
}
/// Calculates minimal alignment as a common alignment.
@@ -5112,10 +5119,10 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
L && Sz > 2 && static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {
return L->isLoopInvariant(V);
})) <= Sz / 2;
- if (ProfitableGatherPointers || all_of(PointerOps, [IsSorted](Value *P) {
+ if (ProfitableGatherPointers || all_of(PointerOps, [](Value *P) {
auto *GEP = dyn_cast<GetElementPtrInst>(P);
- return (IsSorted && !GEP && doesNotNeedToBeScheduled(P)) ||
- (GEP && GEP->getNumOperands() == 2 &&
+ return (!GEP && doesNotNeedToBeScheduled(P)) ||
+ (GEP->getNumOperands() == 2 &&
isa<Constant, Instruction>(GEP->getOperand(1)));
})) {
// Check if potential masked gather can be represented as series
@@ -6607,6 +6614,12 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
ArrayRef<SmallVector<std::pair<LoadInst *, int>>> GatheredLoads) {
GatheredLoadsEntriesFirst = VectorizableTree.size();
+ SmallVector<SmallPtrSet<const Value *, 4>> LoadSetsToVectorize(
+ LoadEntriesToVectorize.size());
+ for (auto [Idx, Set] : zip(LoadEntriesToVectorize, LoadSetsToVectorize))
+ Set.insert(VectorizableTree[Idx]->Scalars.begin(),
+ VectorizableTree[Idx]->Scalars.end());
+
// Sort loads by distance.
auto LoadSorter = [](const std::pair<LoadInst *, int> &L1,
const std::pair<LoadInst *, int> &L2) {
@@ -6864,8 +6877,42 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
}
}
}
+ // Cannot represent the loads as consecutive vectorizable nodes -
+ // just exit.
+ unsigned ConsecutiveNodesSize = 0;
+ if (!LoadEntriesToVectorize.empty() &&
+ any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
+ [&, Slice = Slice](const auto &P) {
+ const auto *It = find_if(Slice, [&](Value *V) {
+ return std::get<1>(P).contains(V);
+ });
+ if (It == Slice.end())
+ return false;
+ ArrayRef<Value *> VL =
+ VectorizableTree[std::get<0>(P)]->Scalars;
+ ConsecutiveNodesSize += VL.size();
+ unsigned Start = std::distance(Slice.begin(), It);
+ unsigned Sz = Slice.size() - Start;
+ return Sz < VL.size() ||
+ Slice.slice(std::distance(Slice.begin(), It),
+ VL.size()) != VL;
+ }))
+ continue;
// Try to build long masked gather loads.
UserMaxVF = bit_ceil(UserMaxVF);
+ if (any_of(seq<unsigned>(Slice.size() / UserMaxVF),
+ [&, Slice = Slice](unsigned Idx) {
+ OrdersType Order;
+ SmallVector<Value *> PointerOps;
+ return canVectorizeLoads(
+ Slice.slice(Idx * UserMaxVF, UserMaxVF),
+ Slice[Idx * UserMaxVF], Order,
+ PointerOps) ==
+ LoadsState::ScatterVectorize;
+ }))
+ UserMaxVF = MaxVF;
+ if (Slice.size() != ConsecutiveNodesSize)
+ MaxVF = std::min<unsigned>(MaxVF, UserMaxVF);
}
for (unsigned VF = MaxVF; VF >= 2; VF /= 2) {
bool IsVectorized = true;
@@ -6874,6 +6921,16 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
Slice.slice(I, std::min(VF, E - I));
if (getTreeEntry(SubSlice.front()))
continue;
+ // Check if the subslice is to be-vectorized entry, which is not
+ // equal to entry.
+ if (any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
+ [&](const auto &P) {
+ return !SubSlice.equals(
+ VectorizableTree[std::get<0>(P)]
+ ->Scalars) &&
+ set_is_subset(SubSlice, std::get<1>(P));
+ }))
+ continue;
unsigned Sz = VectorizableTree.size();
buildTree_rec(SubSlice, 0, EdgeInfo());
if (Sz == VectorizableTree.size()) {
@@ -6908,6 +6965,21 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
// Final attempt to vectorize non-vectorized loads.
(void)ProcessGatheredLoads(FinalGatheredLoads, /*Final=*/true);
}
+ // Try to vectorize postponed load entries, previously marked as gathered.
+ for (unsigned Idx : LoadEntriesToVectorize) {
+ const TreeEntry &E = *VectorizableTree[Idx];
+ SmallVector<Value *> GatheredScalars(E.Scalars.begin(),
+ E.Scalars.end());
+ // Avoid reordering, if possible.
+ if (!E.ReorderIndices.empty()) {
+ // Build a mask out of the reorder indices and reorder scalars per this
+ // mask.
+ SmallVector<int> ReorderMask;
+ inversePermutation(E.ReorderIndices, ReorderMask);
+ reorderScalars(GatheredScalars, ReorderMask);
+ }
+ buildTree_rec(GatheredScalars, 0, EdgeInfo());
+ }
// If no new entries created, consider it as no gathered loads entries must be
// handled.
if (static_cast<unsigned>(GatheredLoadsEntriesFirst) ==
@@ -7220,6 +7292,11 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
case LoadsState::Vectorize:
return TreeEntry::Vectorize;
case LoadsState::ScatterVectorize:
+ if (!IsGraphTransformMode && !VectorizableTree.empty()) {
+ // Delay slow vectorized nodes for better vectorization attempts.
+ LoadEntriesToVectorize.insert(VectorizableTree.size());
+ return TreeEntry::NeedToGather;
+ }
return TreeEntry::ScatterVectorize;
case LoadsState::StridedVectorize:
return TreeEntry::StridedVectorize;
@@ -9057,6 +9134,17 @@ getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
void BoUpSLP::transformNodes() {
constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
BaseGraphSize = VectorizableTree.size();
+ // Turn graph transforming mode on and off, when done.
+ class GraphTransformModeRAAI {
+ bool &SavedIsGraphTransformMode;
+
+ public:
+ GraphTransformModeRAAI(bool &IsGraphTransformMode)
+ : SavedIsGraphTransformMode(IsGraphTransformMode) {
+ IsGraphTransformMode = true;
+ }
+ ~GraphTransformModeRAAI() { SavedIsGraphTransformMode = false; }
+ } TransformContext(IsGraphTransformMode);
// Operands are profitable if they are:
// 1. At least one constant
// or
@@ -9089,7 +9177,7 @@ void BoUpSLP::transformNodes() {
unsigned MinVF = getMinVF(2 * Sz);
// Do not try partial vectorization for small nodes (<= 2), nodes with the
// same opcode and same parent block or all constants.
- if (VL.size() <= 2 ||
+ if (VL.size() <= 2 || LoadEntriesToVectorize.contains(Idx) ||
!(!E.getOpcode() || E.getOpcode() == Instruction::Load ||
E.isAltShuffle() || !allSameBlock(VL)) ||
allConstant(VL) || isSplat(VL))
@@ -9187,6 +9275,8 @@ void BoUpSLP::transformNodes() {
continue;
}
unsigned PrevSize = VectorizableTree.size();
+ [[maybe_unused]] unsigned PrevEntriesSize =
+ LoadEntriesToVectorize.size();
buildTree_rec(Slice, 0, EdgeInfo(&E, UINT_MAX));
if (PrevSize + 1 == VectorizableTree.size() &&
VectorizableTree[PrevSize]->isGather() &&
@@ -9194,6 +9284,8 @@ void BoUpSLP::transformNodes() {
Instruction::ExtractElement &&
!isSplat(Slice)) {
VectorizableTree.pop_back();
+ assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
+ "LoadEntriesToVectorize expected to remain the same");
continue;
}
AddCombinedNode(PrevSize, Cnt);
@@ -9279,17 +9371,19 @@ void BoUpSLP::transformNodes() {
}
}
- // Single load node - exit.
- if (VectorizableTree.size() <= 1 &&
- VectorizableTree.front()->getOpcode() == Instruction::Load)
- return;
- // Small graph with small VF - exit.
- constexpr unsigned SmallTree = 3;
- constexpr unsigned SmallVF = 2;
- if ((VectorizableTree.size() <= SmallTree &&
- VectorizableTree.front()->Scalars.size() == SmallVF) ||
- (VectorizableTree.size() <= 2 && UserIgnoreList))
- return;
+ if (LoadEntriesToVectorize.empty()) {
+ // Single load node - exit.
+ if (VectorizableTree.size() <= 1 &&
+ VectorizableTree.front()->getOpcode() == Instruction::Load)
+ return;
+ // Small graph with small VF - exit.
+ constexpr unsigned SmallTree = 3;
+ constexpr unsigned SmallVF = 2;
+ if ((VectorizableTree.size() <= SmallTree &&
+ VectorizableTree.front()->Scalars.size() == SmallVF) ||
+ (VectorizableTree.size() <= 2 && UserIgnoreList))
+ return;
+ }
// A list of loads to be gathered during the vectorization process. We can
// try to vectorize them at the end, if profitable.
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
index 803af4d166b213..823ba8f6b8b6aa 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
@@ -70,65 +70,62 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; CHECK-NEXT: [[TMP33:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX10_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
; CHECK-NEXT: [[TMP49:%.*]] = zext <2 x i8> [[TMP33]] to <2 x i32>
; CHECK-NEXT: [[TMP35:%.*]] = sub <2 x i32> [[TMP32]], [[TMP49]]
-; CHECK-NEXT: [[TMP50:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX13_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP51:%.*]] = zext <2 x i8> [[TMP50]] to <2 x i32>
+; CHECK-NEXT: [[TMP51:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX13_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP58:%.*]] = zext <2 x i8> [[TMP51]] to <2 x i32>
; CHECK-NEXT: [[TMP38:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX15_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
; CHECK-NEXT: [[TMP39:%.*]] = zext <2 x i8> [[TMP38]] to <2 x i32>
-; CHECK-NEXT: [[TMP36:%.*]] = sub <2 x i32> [[TMP51]], [[TMP39]]
+; CHECK-NEXT: [[TMP36:%.*]] = sub <2 x i32> [[TMP58]], [[TMP39]]
; CHECK-NEXT: [[TMP37:%.*]] = shl <2 x i32> [[TMP36]], <i32 16, i32 16>
; CHECK-NEXT: [[TMP42:%.*]] = add <2 x i32> [[TMP37]], [[TMP35]]
-; CHECK-NEXT: [[TMP56:%.*]] = add <2 x i32> [[TMP42]], [[TMP30]]
-; CHECK-NEXT: [[TMP60:%.*]] = sub <2 x i32> [[TMP30]], [[TMP42]]
-; CHECK-NEXT: [[TMP73:%.*]] = extractelement <2 x i32> [[TMP56]], i32 0
-; CHECK-NEXT: [[TMP34:%.*]] = extractelement <2 x i32> [[TMP56]], i32 1
+; CHECK-NEXT: [[TMP43:%.*]] = add <2 x i32> [[TMP42]], [[TMP30]]
+; CHECK-NEXT: [[TMP44:%.*]] = sub <2 x i32> [[TMP30]], [[TMP42]]
+; CHECK-NEXT: [[TMP73:%.*]] = extractelement <2 x i32> [[TMP43]], i32 0
+; CHECK-NEXT: [[TMP34:%.*]] = extractelement <2 x i32> [[TMP43]], i32 1
; CHECK-NEXT: [[ADD48_2:%.*]] = add i32 [[TMP34]], [[TMP73]]
; CHECK-NEXT: [[SUB51_2:%.*]] = sub i32 [[TMP73]], [[TMP34]]
-; CHECK-NEXT: [[TMP47:%.*]] = extractelement <2 x i32> [[TMP60]], i32 0
-; CHECK-NEXT: [[TMP48:%.*]] = extractelement <2 x i32> [[TMP60]], i32 1
+; CHECK-NEXT: [[TMP47:%.*]] = extractelement <2 x i32> [[TMP44]], i32 0
+; CHECK-NEXT: [[TMP48:%.*]] = extractelement <2 x i32> [[TMP44]], i32 1
; CHECK-NEXT: [[TMP68:%.*]] = sub i32 [[TMP47]], [[TMP48]]
; CHECK-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr i8, ptr null, i64 4
; CHECK-NEXT: [[ARRAYIDX5_3:%.*]] = getelementptr i8, ptr null, i64 4
; CHECK-NEXT: [[ARRAYIDX8_3:%.*]] = getelementptr i8, ptr null, i64 1
; CHECK-NEXT: [[ARRAYIDX10_3:%.*]] = getelementptr i8, ptr null, i64 1
-; CHECK-NEXT: [[TMP44:%.*]] = load i8, ptr null, align 1
; CHECK-NEXT: [[ARRAYIDX15_3:%.*]] = getelementptr i8, ptr null, i64 5
-; CHECK-NEXT: [[TMP43:%.*]] = load i8, ptr null, align 1
; CHECK-NEXT: [[TMP53:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP52:%.*]] = zext <2 x i8> [[TMP53]] to <2 x i32>
+; CHECK-NEXT: [[TMP50:%.*]] = zext <2 x i8> [[TMP53]] to <2 x i32>
; CHECK-NEXT: [[TMP54:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP61:%.*]] = zext <2 x i8> [[TMP54]] to <2 x i32>
-; CHECK-NEXT: [[TMP55:%.*]] = sub <2 x i32> [[TMP52]], [[TMP61]]
+; CHECK-NEXT: [[TMP52:%.*]] = zext <2 x i8> [[TMP54]] to <2 x i32>
+; CHECK-NEXT: [[TMP59:%.*]] = sub <2 x i32> [[TMP50]], [[TMP52]]
; CHECK-NEXT: [[TMP41:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX3_3]], i64 -4, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP57:%.*]] = zext <2 x i8> [[TMP41]] to <2 x i32>
-; CHECK-NEXT: [[TMP58:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP59:%.*]] = zext <2 x i8> [[TMP58]] to <2 x i32>
-; CHECK-NEXT: [[TMP45:%.*]] = sub <2 x i32> [[TMP57]], [[TMP59]]
+; CHECK-NEXT: [[TMP55:%.*]] = zext <2 x i8> [[TMP41]] to <2 x i32>
+; CHECK-NEXT: [[TMP56:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP57:%.*]] = zext <2 x i8> [[TMP56]] to <2 x i32>
+; CHECK-NEXT: [[TMP45:%.*]] = sub <2 x i32> [[TMP55]], [[TMP57]]
; CHECK-NEXT: [[TMP46:%.*]] = shl <2 x i32> [[TMP45]], <i32 16, i32 16>
-; CHECK-NEXT: [[TMP62:%.*]] = add <2 x i32> [[TMP46]], [[TMP55]]
-; CHECK-NEXT: [[TMP63:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX8_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP74:%.*]] = zext <2 x i8> [[TMP63]] to <2 x i32>
-; CHECK-NEXT: [[TMP79:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX10_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP60:%.*]] = add <2 x i32> [[TMP46]], [[TMP59]]
+; CHECK-NEXT: [[TMP61:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX8_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP62:%.*]] = zext <2 x i8> [[TMP61]] to <2 x i32>
+; CHECK-NEXT: [[TMP63:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX10_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP64:%.*]] = zext <2 x i8> [[TMP63]] to <2 x i32>
+; CHECK-NEXT: [[TMP65:%.*]] = sub <2 x i32> [[TMP62]], [[TMP64]]
+; CHECK-NEXT: [[TMP75:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> zeroinitializer, i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT: [[TMP76:%.*]] = zext <2 x i8> [[TMP75]] to <2 x i32>
+; CHECK-NEXT: [[TMP79:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX15_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
; CHECK-NEXT: [[TMP82:%.*]] = zext <2 x i8> [[TMP79]] to <2 x i32>
-; CHECK-NEXT: [[TMP85:%.*]] = sub <2 x i32> [[TMP74]], [[TMP82]]
-; CHECK-NEXT: [[TMP64:%.*]] = insertelement <2 x i8> poison, i8 [[TMP44]], i32 0
-; CHECK-NEXT: [[TMP65:%.*]] = insertelement <2 x i8> [[TMP64]], i8 [[TMP43]], i32 1
-; CHECK-NEXT: [[TMP94:%.*]] = zext <2 x i8> [[TMP65]] to <2 x i32>
-; CHECK-NEXT: [[TMP103:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX15_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP72:%.*]] = zext <2 x i8> [[TMP103]] to <2 x i32>
-; CHECK-NEXT: [[TMP69:%.*]] = sub <2 x i32> [[TMP94]], [[TMP72]]
+; CHECK-NEXT: [[TMP69:%.*]] = sub <2 x i32> [[TMP76]], [[TMP82]]
; CHECK-NEXT: [[TMP70:%.*]] = shl <2 x i32> [[TMP69]], <i32 16, i32 16>
-; CHECK-NEXT: [[TMP75:%.*]] = add <2 x i32> [[TMP70]], [[TMP85]]
-; CHECK-NEXT: [[TMP76:%.*]] = add <2 x i32> [[TMP75]], [[TMP62]]
-; CHECK-NEXT: [[TMP106:%.*]] = sub <2 x i32> [[TMP62]], [[TMP75]]
-; CHECK-NEXT: [[TMP78:%.*]] = extractelement <2 x i32> [[TMP76]], i32 0
-; CHECK-NEXT: [[TMP71:%.*]] = extractelement <2 x i32> [[TMP76]], i32 1
+; CHECK-NEXT: [[TMP72:%.*]] = add <2 x i32> [[TMP70]], [[TMP65]]
+; CHECK-NEXT: [[TMP90:%.*]] = add <2 x i32> [[TMP72]], [[TMP60]]
+; CHECK-NEXT: [[TMP74:%.*]] = sub <2 x i32> [[TMP60]], [[TMP72]]
+; CHECK-NEXT: [[TMP78:%.*]] = extractelement <2 x i32> [[TMP90]], i32 0
+; CHECK-NEXT: [[TMP71:%.*]] = extractelement <2 x i32> [[TMP90]], i32 1
; CHECK-NEXT: [[ADD48_3:%.*]] = add i32 [[TMP71]], [[TMP78]]
; CHECK-NEXT: [[SUB51_3:%.*]] = sub i32 [[TMP78]], [[TMP71]]
-; CHECK-NEXT: [[TMP80:%.*]] = extractelement <2 x i32> [[TMP106]], i32 0
-; CHECK-NEXT: [[TMP81:%.*]] = extractelement <2 x i32> [[TMP106]], i32 1
+; CHECK-NEXT: [[TMP80:%.*]] = extractelement <2 x i32> [[TMP74]], i32 0
+; CHECK-NEXT: [[TMP81:%.*]] = extractelement <2 x i32> [[TMP74]], i32 1
; CHECK-NEXT: [[ADD55_3:%.*]] = add i32 [[TMP81]], [[TMP80]]
; CHECK-NEXT: [[TMP107:%.*]] = sub i32 [[TMP80]], [[TMP81]]
-; CHECK-NEXT: [[TMP77:%.*]] = extractelement <2 x i32> [[TMP52]], i32 0
+; CHECK-NEXT: [[TMP77:%.*]] = extractelement <2 x i32> [[TMP50]], i32 0
; CHECK-NEXT: [[SHR_I49_3:%.*]] = lshr i32 [[TMP77]], 15
; CHECK-NEXT: [[AND_I50_3:%.*]] = and i32 [[SHR_I49_3]], 65537
; CHECK-NEXT: [[MUL_I51_3:%.*]] = mul i32 [[AND_I50_3]], 65535
@@ -155,27 +152,27 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; CHECK-NEXT: [[TMP66:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8]], align 1
; CHECK-NEXT: [[TMP102:%.*]] = zext <2 x i8> [[TMP66]] to <2 x i32>
; CHECK-NEXT: [[TMP67:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[PIX2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP108:%.*]] = zext <2 x i8> [[TMP67]] to <2 x i32>
-; CHECK-NEXT: [[TMP89:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[TMP1]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP90:%.*]] = zext <2 x i8> [[TMP89]] to <2 x i32>
-; CHECK-NEXT: [[TMP91:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP109:%.*]] = zext <2 x i8> [[TMP91]] to <2 x i32>
-; CHECK-NEXT: [[TMP87:%.*]] = sub <2 x i32> [[TMP90]], [[TMP109]]
+; CHECK-NEXT: [[TMP85:%.*]] = zext <2 x i8> [[TMP67]] to <2 x i32>
+; CHECK-NEXT: [[TMP91:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[TMP1]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP100:%.*]] = zext <2 x i8> [[TMP91]] to <2 x i32>
+; CHECK-NEXT: [[TMP103:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP89:%.*]] = zext <2 x i8> [[TMP103]] to <2 x i32>
+; CHECK-NEXT: [[TMP87:%.*]] = sub <2 x i32> [[TMP100]], [[TMP89]]
; CHECK-NEXT: [[TMP88:%.*]] = shl <2 x i32> [[TMP87]], <i32 16, i32 16>
-; CHECK-NEXT: [[TMP111:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX22]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP112:%.*]] = zext <2 x i8> [[TMP111]] to <2 x i32>
-; CHECK-NEXT: [[TMP120:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX25]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP121:%.*]] = zext <2 x i8> [[TMP120]] to <2 x i32>
-; CHECK-NEXT: [[TMP131:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX27]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP100:%.*]] = zext <2 x i8> [[TMP131]] to <2 x i32>
-; CHECK-NEXT: [[TMP95:%.*]] = sub <2 x i32> [[TMP121]], [[TMP100]]
+; CHECK-NEXT: [[TMP106:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX22]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP108:%.*]] = zext <2 x i8> [[TMP106]] to <2 x i32>
+; CHECK-NEXT: [[TMP94:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX25]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP109:%.*]] = zext <2 x i8> [[TMP94]] to <2 x i32>
+; CHECK-NEXT: [[TMP111:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX27]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP115:%.*]] = zext <2 x i8> [[TMP111]] to <2 x i32>
+; CHECK-NEXT: [[TMP95:%.*]] = sub <2 x i32> [[TMP109]], [[TMP115]]
; CHECK-NEXT: [[TMP96:%.*]] = shl <2 x i32> [[TMP95]], <i32 16, i32 16>
; CHECK-NEXT: [[TMP97:%.*]] = insertelement <2 x i32> [[TMP102]], i32 [[CONV33]], i32 1
-; CHECK-NEXT: [[TMP132:%.*]] = sub <2 x i32> [[TMP97]], [[TMP112]]
-; CHECK-NEXT: [[TMP105:%.*]] = add <2 x i32> [[TMP96]], [[TMP132]]
+; CHECK-NEXT: [[TMP117:%.*]] = sub <2 x i32> [[TMP97]], [[TMP108]]
+; CHECK-NEXT: [[TMP105:%.*]] = add <2 x i32> [[TMP96]], [[TMP117]]
; CHECK-NEXT: [[TMP86:%.*]] = insertelement <2 x i32> [[TMP102]], i32 [[CONV1]], i32 0
-; CHECK-NEXT: [[TMP133:%.*]] = sub <2 x i32> [[TMP86]], [[TMP108]]
-; CHECK-NEXT: [[TMP92:%.*]] = add <2 x i32> [[TMP88]], [[TMP133]]
+; CHECK-NEXT: [[TMP119:%.*]] = sub <2 x i32> [[TMP86]], [[TMP85]]
+; CHECK-NEXT: [[TMP92:%.*]] = add <2 x i32> [[TMP88]], [[TMP119]]
; CHECK-NEXT: [[TMP93:%.*]] = shufflevector <2 x i32> [[TMP105]], <2 x i32> [[TMP92]], <2 x i32> <i32 0, i32 2>
; CHECK-NEXT: [[TMP101:%.*]] = sub <2 x i32> [[TMP92]], [[TMP105]]
; CHECK-NEXT: [[TMP99:%.*]] = extractelement <2 x i32> [[TMP101]], i32 1
@@ -185,22 +182,22 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; CHECK-NEXT: [[TMP104:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8_1]], align 1
; CHECK-NEXT: [[TMP113:%.*]] = zext <2 x i8> [[TMP104]] to <2 x i32>
; CHECK-NEXT: [[TMP114:%.*]] = load <2 x i8>, ptr [[ADD_PTR644]], align 1
-; CHECK-NEXT: [[TMP115:%.*]] = zext <2 x i8> [[TMP114]] to <2 x i32>
+; CHECK-NEXT: [[TMP112:%.*]] = zext <2 x i8> [[TMP114]] to <2 x i32>
; CHECK-NEXT: [[TMP116:%.*]] = load <2 x i8>, ptr [[ARRAYIDX3_1]], align 1
-; CHECK-NEXT: [[TMP117:%.*]] = zext <2 x i8> [[TMP116]] to <2 x i32>
+; CHECK-NEXT: [[TMP120:%.*]] = zext <2 x i8> [[TMP116]] to <2 x i32>
; CHECK-NEXT: [[TMP118:%.*]] = load <2 x i8>, ptr [[ARRAYIDX5_1]], align 1
-; CHECK-NEXT: [[TMP119:%.*]] = zext <2 x i8> [[TMP118]] to <2 x i32>
-; CHECK-NEXT: [[TMP124:%.*]] = sub <2 x i32> [[TMP117]], [[TMP119]]
+; CHECK-NEXT: [[TMP128:%.*]] = zext <2 x i8> [[TMP118]] to <2 x i32>
+; CHECK-NEXT: [[TMP124:%.*]] = sub <2 x i32> [[TMP120]], [[TMP128]]
; CHECK-NEXT: [[TMP125:%.*]] = shl <2 x i32> [[TMP124]], <i32 16, i32 16>
; CHECK-NEXT: [[TMP122:%.*]] = shufflevector <2 x i32> [[TMP113]], <2 x i32> poison, <2 x i32> <i32 poison, i32 0>
; CHECK-NEXT: [[TMP123:%.*]] = insertelement <2 x i32> [[TMP122]], i32 [[CONV_1]], i32 0
-; CHECK-NEXT: [[TMP134:%.*]] = sub <2 x i32> [[TMP123]], [[TMP115]]
-; CHECK-NEXT: [[TMP156:%.*]] = add <2 x i32> [[TMP125]], [[TMP134]]
+; CHECK-NEXT: [[TMP121:%.*]] = sub <2 x i32> [[TMP123]], [[TMP112]]
+; CHECK-NEXT: [[TMP156:%.*]] = add <2 x i32> [[TMP125]], [[TMP121]]
; CHECK-NEXT: [[TMP126:%.*]] = shufflevector <2 x i8> [[TMP7]], <2 x i8> poison, <2 x i32> <i32 1, i32 poison>
; CHECK-NEXT: [[TMP127:%.*]] = insertelement <2 x i8> [[TMP126]], i8 [[TMP14]], i32 1
-; CHECK-NEXT: [[TMP128:%.*]] = zext <2 x i8> [[TMP127]] to <2 x i32>
+; CHECK-NEXT: [[TMP131:%.*]] = zext <2 x i8> [[TMP127]] to <2 x i32>
; CHECK-NEXT: [[TMP129:%.*]] = insertelement <2 x i32> [[TMP113]], i32 [[TMP9]], i32 0
-; CHECK-NEXT: [[TMP130:%.*]] = sub <2 x i32> [[TMP129]], [[TMP128]]
+; CHECK-NEXT: [[TMP130:%.*]] = sub <2 x i32> [[TMP129]], [[TMP131]]
; CHECK-NEXT: [[TMP135:%.*]] = insertelement <2 x i32> [[TMP130]], i32 [[TMP16]], i32 1
; CHECK-NEXT: [[TMP136:%.*]] = shl <2 x i32> [[TMP135]], <i32 16, i32 16>
; CHECK-NEXT: [[TMP110:%.*]] = shufflevector <2 x i32> [[TMP130]], <2 x i32> poison, <2 x i32> <i32 1, i32 poison>
@@ -214,23 +211,23 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; CHECK-NEXT: [[SUB47_1:%.*]] = sub i32 [[TMP138]], [[TMP171]]
; CHECK-NEXT: [[TMP140:%.*]] = shufflevector <2 x i32> [[TMP156]], <2 x i32> [[TMP105]], <2 x i32> <i32 1, i32 2>
; CHECK-NEXT: [[TMP153:%.*]] = shufflevector <2 x i32> [[TMP156]], <2 x i32> [[TMP92]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[TMP157:%.*]] = add <2 x i32> [[TMP140]], [[TMP153]]
+; CHECK-NEXT: [[TMP145:%.*]] = add <2 x i32> [[TMP140]], [[TMP153]]
; CHECK-NEXT: [[TMP143:%.*]] = shufflevector <2 x i32> [[TMP105]], <2 x i32> [[TMP155]], <2 x i32> <i32 3, i32 1>
; CHECK-NEXT: [[TMP144:%.*]] = shufflevector <2 x i32> [[TMP92]], <2 x i32> [[TMP155]], <2 x i32> <i32 2, i32 1>
-; CHECK-NEXT: [[TMP145:%.*]] = add <2 x i32> [[TMP143]], [[TMP144]]
-; CHECK-NEXT: [[TMP98:%.*]] = extractelement <2 x i32> [[TMP157]], i32 1
-; CHECK-NEXT: [[TMP146:%.*]] = extractelement <2 x i32> [[TMP145]], i32 1
+; CHECK-NEXT: [[TMP154:%.*]] = add <2 x i32> [[TMP143]], [[TMP144]]
+; CHECK-NEXT: [[TMP98:%.*]] = extractelement <2 x i32> [[TMP145]], i32 1
+; CHECK-NEXT: [[TMP146:%.*]] = extractelement <2 x i32> [[TMP154]], i32 1
; CHECK-NEXT: [[SHR_I54_1:%.*]] = lshr i32 [[TMP146]], 15
; CHECK-NEXT: [[AND_I55_1:%.*]] = and i32 [[SHR_I54_1]], 65537
; CHECK-NEXT: [[MUL_I56_1:%.*]] = mul i32 [[AND_I55_1]], 65535
-; CHECK-NEXT: [[TMP164:%.*]] = add <2 x i32> [[TMP145]], [[TMP157]]
-; CHECK-NEXT: [[TMP141:%.*]] = sub <2 x i32> [[TMP157]], [[TMP145]]
+; CHECK-NEXT: [[TMP164:%.*]] = add <2 x i32> [[TMP154]], [[TMP145]]
+; CHECK-NEXT: [[TMP141:%.*]] = sub <2 x i32> [[TMP145]], [[TMP154]]
; CHECK-NEXT: [[TMP165:%.*]] = insertelement <2 x i32> [[TMP101]], i32 [[SUB47_1]], i32 0
; CHECK-NEXT: [[TMP151:%.*]] = shufflevector <2 x i32> [[TMP101]], <2 x i32> poison, <2 x i32> <i32 poison, i32 0>
; CHECK-NEXT: [[TMP152:%.*]] = insertelement <2 x i32> [[TMP151]], i32 [[SUB45_1]], i32 0
; CHECK-NEXT: [[TMP180:%.*]] = add <2 x i32> [[TMP165]], [[TMP152]]
-; CHECK-NEXT: [[TMP154:%.*]] = sub <2 x i32> [[TMP152]], [[TMP165]]
-; CHECK-NEXT: [[TMP166:%.*]] = extractelement <2 x i32> [[TMP145]], i32 0
+; CHECK-NEXT: [[TMP157:%.*]] = sub <2 x i32> [[TMP152]], [[TMP165]]
+; CHECK-NEXT: [[TMP166:%.*]] = extractelement <2 x i32> [[TMP154]], i32 0
; CHECK-NEXT: [[SHR_I54:%.*]] = lshr i32 [[TMP166]], 15
; CHECK-NEXT: [[AND_I55:%.*]] = and i32 [[SHR_I54]], 65537
; CHECK-NEXT: [[MUL_I56:%.*]] = mul i32 [[AND_I55]], 65535
@@ -297,17 +294,17 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; CHECK-NEXT: [[XOR_I58_1:%.*]] = xor i32 [[ADD_I57_1]], [[SUB47_1]]
; CHECK-NEXT: [[ADD_I62_1:%.*]] = add i32 [[MUL_I61_1]], [[SUB106_1]]
; CHECK-NEXT: [[XOR_I63_1:%.*]] = xor i32 [[ADD_I62_1]], [[TMP99]]
-; CHECK-NEXT: [[TMP190:%.*]] = extractelement <2 x i32> [[TMP202]], i32 0
-; CHECK-NEXT: [[ADD110_1:%.*]] = add i32 [[TMP190]], [[XOR_I_1]]
+; CHECK-NEXT: [[TMP187:%.*]] = extractelement <2 x i32> [[TMP202]], i32 0
+; CHECK-NEXT: [[ADD110_1:%.*]] = add i32 [[TMP187]], [[XOR_I_1]]
; CHECK-NEXT: [[ADD112_1:%.*]] = add i32 [[ADD110_1]], [[XOR_I58_1]]
; CHECK-NEXT: [[ADD113_1:%.*]] = add i32 [[ADD112_1]], [[XOR_I63_1]]
-; CHECK-NEXT: [[TMP191:%.*]] = extractelement <2 x i32> [[TMP141]], i32 0
-; CHECK-NEXT: [[TMP205:%.*]] = extractelement <2 x i32> [[TMP141]], i32 1
-; CHECK-NEXT: [[ADD78_2:%.*]] = add i32 [[TMP191]], [[TMP205]]
+; CHECK-NEXT: [[TMP188:%.*]] = extractelement <2 x i32> [[TMP141]], i32 0
+; CHECK-NEXT: [[TMP190:%.*]] = extractelement <2 x i32> [[TMP141]], i32 1
+; CHECK-NEXT: [[ADD78_2:%.*]] = add i32 [[TMP188]], [[TMP190]]
; CHECK-NEXT: [[TMP196:%.*]] = insertelement <2 x i32> [[TMP141]], i32 [[SUB51_2]], i32 0
; CHECK-NEXT: [[TMP194:%.*]] = shufflevector <2 x i32> [[TMP141]], <2 x i32> poison, <2 x i32> <i32 poison, i32 0>
; CHECK-NEXT: [[TMP195:%.*]] = insertelement <2 x i32> [[TMP194]], i32 [[SUB51_3]], i32 0
-; CHECK-NEXT: [[TMP206:%.*]] = sub <2 x i32> [[TMP196]], [[TMP195]]
+; CHECK-NEXT: [[TMP208:%.*]] = sub <2 x i32> [[TMP196]], [[TMP195]]
; CHECK-NEXT: [[TMP201:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_2]], i32 0
; CHECK-NEXT: [[TMP198:%.*]] = shufflevector <2 x i32> [[TMP201]], <2 x i32> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP199:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_6]], i32 0
@@ -315,8 +312,8 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; CHECK-NEXT: [[TMP225:%.*]] = add <2 x i32> [[TMP198]], [[TMP200]]
; CHECK-NEXT: [[TMP226:%.*]] = sub <2 x i32> [[TMP198]], [[TMP200]]
; CHECK-NEXT: [[TMP227:%.*]] = shufflevector <2 x i32> [[TMP225]], <2 x i32> [[TMP226]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP204:%.*]] = extractelement <2 x i32> [[TMP206]], i32 0
-; CHECK-NEXT: [[TMP212:%.*]] = extractelement <2 x i32> [[TMP206]], i32 1
+; CHECK-NEXT: [[TMP204:%.*]] = extractelement <2 x i32> [[TMP208]], i32 0
+; CHECK-NEXT: [[TMP212:%.*]] = extractelement <2 x i32> [[TMP208]], i32 1
; CHECK-NEXT: [[ADD105_2:%.*]] = add i32 [[TMP204]], [[TMP212]]
; CHECK-NEXT: [[SUB106_2:%.*]] = sub i32 [[TMP212]], [[TMP204]]
; CHECK-NEXT: [[ADD_I52_2:%.*]] = add i32 [[MUL_I51_2]], [[ADD105_2]]
@@ -329,13 +326,13 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; CHECK-NEXT: [[ADD_I62_2:%.*]] = add i32 [[MUL_I61_2]], [[SUB106_2]]
; CHECK-NEXT: [[XOR_I63_2:%.*]] = xor i32 [[ADD_I62_2]], [[TMP98]]
; CHECK-NEXT: [[ADD108_2:%.*]] = add i32 [[XOR_I53_2]], [[ADD113_1]]
-; CHECK-NEXT: [[TMP208:%.*]] = extractelement <2 x i32> [[TMP213]], i32 0
-; CHECK-NEXT: [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[TMP208]]
-; CHECK-NEXT: [[TMP209:%.*]] = extractelement <2 x i32> [[TMP213]], i32 1
-; CHECK-NEXT: [[ADD112_2:%.*]] = add i32 [[ADD110_2]], [[TMP209]]
+; CHECK-NEXT: [[TMP205:%.*]] = extractelement <2 x i32> [[TMP213]], i32 0
+; CHECK-NEXT: [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[TMP205]]
+; CHECK-NEXT: [[TMP206:%.*]] = extractelement <2 x i32> [[TMP213]], i32 1
+; CHECK-NEXT: [[ADD112_2:%.*]] = add i32 [[ADD110_2]], [[TMP206]]
; CHECK-NEXT: [[ADD113_2:%.*]] = add i32 [[ADD112_2]], [[XOR_I63_2]]
-; CHECK-NEXT: [[SUB59_1:%.*]] = extractelement <2 x i32> [[TMP154]], i32 0
-; CHECK-NEXT: [[SUB59:%.*]] = extractelement <2 x i32> [[TMP154]], i32 1
+; CHECK-NEXT: [[SUB59_1:%.*]] = extractelement <2 x i32> [[TMP157]], i32 0
+; CHECK-NEXT: [[SUB59:%.*]] = extractelement <2 x i32> [[TMP157]], i32 1
; CHECK-NEXT: [[ADD94_3:%.*]] = add i32 [[SUB59_1]], [[SUB59]]
; CHECK-NEXT: [[SUB86_3:%.*]] = sub i32 [[SUB59]], [[SUB59_1]]
; CHECK-NEXT: [[TMP223:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_3]], i32 0
@@ -360,10 +357,10 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; CHECK-NEXT: [[ADD_I62_3:%.*]] = add i32 [[MUL_I61_3]], [[SUB106_3]]
; CHECK-NEXT: [[XOR_I63_3:%.*]] = xor i32 [[ADD_I62_3]], [[CONV33]]
; CHECK-NEXT: [[ADD108_3:%.*]] = add i32 [[XOR_I53_3]], [[ADD113_2]]
-; CHECK-NEXT: [[TMP228:%.*]] = extractelement <2 x i32> [[TMP234]], i32 0
-; CHECK-NEXT: [[ADD110_3:%.*]] = add i32 [[ADD108_3]], [[TMP228]]
-; CHECK-NEXT: [[TMP229:%.*]] = extractelement <2 x i32> [[TMP234]], i32 1
-; CHECK-NEXT: [[ADD112_3:%.*]] = add i32 [[ADD110_3]], [[TMP229]]
+; CHECK-NEXT: [[TMP221:%.*]] = extractelement <2 x i32> [[TMP234]], i32 0
+; CHECK-NEXT: [[ADD110_3:%.*]] = add i32 [[ADD108_3]], [[TMP221]]
+; CHECK-NEXT: [[TMP222:%.*]] = extractelement <2 x i32> [[TMP234]], i32 1
+; CHECK-NEXT: [[ADD112_3:%.*]] = add i32 [[ADD110_3]], [[TMP222]]
; CHECK-NEXT: [[ADD113_3:%.*]] = add i32 [[ADD112_3]], [[XOR_I63_3]]
; CHECK-NEXT: ret i32 [[ADD113_3]]
;
@@ -423,82 +420,81 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; THR15-NEXT: [[TMP24:%.*]] = load <2 x i8>, ptr [[ARRAYIDX3_2]], align 1
; THR15-NEXT: [[TMP28:%.*]] = zext <2 x i8> [[TMP24]] to <2 x i32>
; THR15-NEXT: [[TMP30:%.*]] = load <2 x i8>, ptr [[ARRAYIDX5_2]], align 1
-; THR15-NEXT: [[TMP47:%.*]] = zext <2 x i8> [[TMP30]] to <2 x i32>
-; THR15-NEXT: [[TMP13:%.*]] = sub <2 x i32> [[TMP28]], [[TMP47]]
+; THR15-NEXT: [[TMP44:%.*]] = zext <2 x i8> [[TMP30]] to <2 x i32>
+; THR15-NEXT: [[TMP13:%.*]] = sub <2 x i32> [[TMP28]], [[TMP44]]
; THR15-NEXT: [[TMP14:%.*]] = shl <2 x i32> [[TMP13]], <i32 16, i32 16>
; THR15-NEXT: [[TMP15:%.*]] = add <2 x i32> [[TMP14]], [[TMP23]]
; THR15-NEXT: [[ARRAYIDX20_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 2
; THR15-NEXT: [[ARRAYIDX22_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 2
; THR15-NEXT: [[ARRAYIDX25_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 6
; THR15-NEXT: [[ARRAYIDX27_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 6
-; THR15-NEXT: [[TMP49:%.*]] = load <2 x i8>, ptr [[ARRAYIDX20_2]], align 1
-; THR15-NEXT: [[TMP59:%.*]] = zext <2 x i8> [[TMP49]] to <2 x i32>
+; THR15-NEXT: [[TMP51:%.*]] = load <2 x i8>, ptr [[ARRAYIDX20_2]], align 1
+; THR15-NEXT: [[TMP32:%.*]] = zext <2 x i8> [[TMP51]] to <2 x i32>
; THR15-NEXT: [[TMP33:%.*]] = load <2 x i8>, ptr [[ARRAYIDX22_2]], align 1
-; THR15-NEXT: [[TMP78:%.*]] = zext <2 x i8> [[TMP33]] to <2 x i32>
-; THR15-NEXT: [[TMP35:%.*]] = sub <2 x i32> [[TMP59]], [[TMP78]]
+; THR15-NEXT: [[TMP34:%.*]] = zext <2 x i8> [[TMP33]] to <2 x i32>
+; THR15-NEXT: [[TMP35:%.*]] = sub <2 x i32> [[TMP32]], [[TMP34]]
; THR15-NEXT: [[TMP36:%.*]] = load <2 x i8>, ptr [[ARRAYIDX25_2]], align 1
-; THR15-NEXT: [[TMP80:%.*]] = zext <2 x i8> [[TMP36]] to <2 x i32>
+; THR15-NEXT: [[TMP52:%.*]] = zext <2 x i8> [[TMP36]] to <2 x i32>
; THR15-NEXT: [[TMP38:%.*]] = load <2 x i8>, ptr [[ARRAYIDX27_2]], align 1
; THR15-NEXT: [[TMP39:%.*]] = zext <2 x i8> [[TMP38]] to <2 x i32>
-; THR15-NEXT: [[TMP25:%.*]] = sub <2 x i32> [[TMP80]], [[TMP39]]
+; THR15-NEXT: [[TMP25:%.*]] = sub <2 x i32> [[TMP52]], [[TMP39]]
; THR15-NEXT: [[TMP26:%.*]] = shl <2 x i32> [[TMP25]], <i32 16, i32 16>
; THR15-NEXT: [[TMP27:%.*]] = add <2 x i32> [[TMP26]], [[TMP35]]
-; THR15-NEXT: [[TMP83:%.*]] = extractelement <2 x i32> [[TMP15]], i32 0
+; THR15-NEXT: [[TMP68:%.*]] = extractelement <2 x i32> [[TMP15]], i32 0
; THR15-NEXT: [[TMP29:%.*]] = extractelement <2 x i32> [[TMP15]], i32 1
-; THR15-NEXT: [[ADD44_2:%.*]] = add i32 [[TMP29]], [[TMP83]]
-; THR15-NEXT: [[SUB45_2:%.*]] = sub i32 [[TMP83]], [[TMP29]]
-; THR15-NEXT: [[TMP87:%.*]] = extractelement <2 x i32> [[TMP27]], i32 0
+; THR15-NEXT: [[ADD44_2:%.*]] = add i32 [[TMP29]], [[TMP68]]
+; THR15-NEXT: [[SUB45_2:%.*]] = sub i32 [[TMP68]], [[TMP29]]
+; THR15-NEXT: [[TMP45:%.*]] = extractelement <2 x i32> [[TMP27]], i32 0
; THR15-NEXT: [[TMP31:%.*]] = extractelement <2 x i32> [[TMP27]], i32 1
-; THR15-NEXT: [[ADD46_2:%.*]] = add i32 [[TMP31]], [[TMP87]]
-; THR15-NEXT: [[SUB47_2:%.*]] = sub i32 [[TMP87]], [[TMP31]]
+; THR15-NEXT: [[ADD46_2:%.*]] = add i32 [[TMP31]], [[TMP45]]
+; THR15-NEXT: [[SUB47_2:%.*]] = sub i32 [[TMP45]], [[TMP31]]
; THR15-NEXT: [[ADD48_2:%.*]] = add i32 [[ADD46_2]], [[ADD44_2]]
-; THR15-NEXT: [[SUB51_2:%.*]] = sub i32 [[ADD44_2]], [[ADD46_2]]
; THR15-NEXT: [[ADD55_2:%.*]] = add i32 [[SUB47_2]], [[SUB45_2]]
-; THR15-NEXT: [[SUB59_2:%.*]] = sub i32 [[SUB45_2]], [[SUB47_2]]
; THR15-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr i8, ptr null, i64 4
; THR15-NEXT: [[ARRAYIDX5_3:%.*]] = getelementptr i8, ptr null, i64 4
-; THR15-NEXT: [[TMP32:%.*]] = load <2 x i8>, ptr null, align 1
-; THR15-NEXT: [[TMP48:%.*]] = zext <2 x i8> [[TMP32]] to <2 x i32>
-; THR15-NEXT: [[TMP34:%.*]] = load <2 x i8>, ptr null, align 1
-; THR15-NEXT: [[TMP50:%.*]] = zext <2 x i8> [[TMP34]] to <2 x i32>
+; THR15-NEXT: [[ARRAYIDX8_3:%.*]] = getelementptr i8, ptr null, i64 1
+; THR15-NEXT: [[ARRAYIDX10_3:%.*]] = getelementptr i8, ptr null, i64 1
+; THR15-NEXT: [[ARRAYIDX15_3:%.*]] = getelementptr i8, ptr null, i64 5
+; THR15-NEXT: [[TMP47:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; THR15-NEXT: [[TMP48:%.*]] = zext <2 x i8> [[TMP47]] to <2 x i32>
+; THR15-NEXT: [[TMP49:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; THR15-NEXT: [[TMP50:%.*]] = zext <2 x i8> [[TMP49]] to <2 x i32>
; THR15-NEXT: [[TMP93:%.*]] = sub <2 x i32> [[TMP48]], [[TMP50]]
; THR15-NEXT: [[TMP37:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX3_3]], i64 -4, <2 x i1> <i1 true, i1 true>, i32 2)
; THR15-NEXT: [[TMP53:%.*]] = zext <2 x i8> [[TMP37]] to <2 x i32>
-; THR15-NEXT: [[TMP54:%.*]] = load <2 x i8>, ptr [[ARRAYIDX5_3]], align 1
+; THR15-NEXT: [[TMP54:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
; THR15-NEXT: [[TMP55:%.*]] = zext <2 x i8> [[TMP54]] to <2 x i32>
; THR15-NEXT: [[TMP41:%.*]] = sub <2 x i32> [[TMP53]], [[TMP55]]
; THR15-NEXT: [[TMP42:%.*]] = shl <2 x i32> [[TMP41]], <i32 16, i32 16>
; THR15-NEXT: [[TMP43:%.*]] = add <2 x i32> [[TMP42]], [[TMP93]]
-; THR15-NEXT: [[ARRAYIDX20_3:%.*]] = getelementptr i8, ptr null, i64 2
-; THR15-NEXT: [[ARRAYIDX22_3:%.*]] = getelementptr i8, ptr null, i64 2
-; THR15-NEXT: [[TMP44:%.*]] = load i8, ptr null, align 1
-; THR15-NEXT: [[ARRAYIDX27_3:%.*]] = getelementptr i8, ptr null, i64 6
-; THR15-NEXT: [[TMP45:%.*]] = load i8, ptr null, align 1
-; THR15-NEXT: [[TMP61:%.*]] = load <2 x i8>, ptr [[ARRAYIDX20_3]], align 1
-; THR15-NEXT: [[TMP98:%.*]] = zext <2 x i8> [[TMP61]] to <2 x i32>
-; THR15-NEXT: [[TMP99:%.*]] = load <2 x i8>, ptr [[ARRAYIDX22_3]], align 1
-; THR15-NEXT: [[TMP101:%.*]] = zext <2 x i8> [[TMP99]] to <2 x i32>
-; THR15-NEXT: [[TMP65:%.*]] = sub <2 x i32> [[TMP98]], [[TMP101]]
-; THR15-NEXT: [[TMP51:%.*]] = insertelement <2 x i8> poison, i8 [[TMP44]], i32 0
-; THR15-NEXT: [[TMP52:%.*]] = insertelement <2 x i8> [[TMP51]], i8 [[TMP45]], i32 1
-; THR15-NEXT: [[TMP68:%.*]] = zext <2 x i8> [[TMP52]] to <2 x i32>
-; THR15-NEXT: [[TMP102:%.*]] = load <2 x i8>, ptr [[ARRAYIDX27_3]], align 1
-; THR15-NEXT: [[TMP70:%.*]] = zext <2 x i8> [[TMP102]] to <2 x i32>
-; THR15-NEXT: [[TMP56:%.*]] = sub <2 x i32> [[TMP68]], [[TMP70]]
+; THR15-NEXT: [[TMP59:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX8_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; THR15-NEXT: [[TMP60:%.*]] = zext <2 x i8> [[TMP59]] to <2 x i32>
+; THR15-NEXT: [[TMP61:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX10_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; THR15-NEXT: [[TMP62:%.*]] = zext <2 x i8> [[TMP61]] to <2 x i32>
+; THR15-NEXT: [[TMP65:%.*]] = sub <2 x i32> [[TMP60]], [[TMP62]]
+; THR15-NEXT: [[TMP70:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> zeroinitializer, i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; THR15-NEXT: [[TMP87:%.*]] = zext <2 x i8> [[TMP70]] to <2 x i32>
+; THR15-NEXT: [[TMP94:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX15_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; THR15-NEXT: [[TMP96:%.*]] = zext <2 x i8> [[TMP94]] to <2 x i32>
+; THR15-NEXT: [[TMP56:%.*]] = sub <2 x i32> [[TMP87]], [[TMP96]]
; THR15-NEXT: [[TMP57:%.*]] = shl <2 x i32> [[TMP56]], <i32 16, i32 16>
; THR15-NEXT: [[TMP58:%.*]] = add <2 x i32> [[TMP57]], [[TMP65]]
-; THR15-NEXT: [[TMP104:%.*]] = extractelement <2 x i32> [[TMP43]], i32 0
-; THR15-NEXT: [[TMP60:%.*]] = extractelement <2 x i32> [[TMP43]], i32 1
-; THR15-NEXT: [[ADD44_3:%.*]] = add i32 [[TMP60]], [[TMP104]]
-; THR15-NEXT: [[SUB45_3:%.*]] = sub i32 [[TMP104]], [[TMP60]]
-; THR15-NEXT: [[TMP76:%.*]] = extractelement <2 x i32> [[TMP58]], i32 0
-; THR15-NEXT: [[TMP62:%.*]] = extractelement <2 x i32> [[TMP58]], i32 1
-; THR15-NEXT: [[ADD46_3:%.*]] = add i32 [[TMP62]], [[TMP76]]
-; THR15-NEXT: [[SUB47_3:%.*]] = sub i32 [[TMP76]], [[TMP62]]
-; THR15-NEXT: [[ADD48_3:%.*]] = add i32 [[ADD46_3]], [[ADD44_3]]
-; THR15-NEXT: [[SUB51_3:%.*]] = sub i32 [[ADD44_3]], [[ADD46_3]]
-; THR15-NEXT: [[ADD55_3:%.*]] = add i32 [[SUB47_3]], [[SUB45_3]]
-; THR15-NEXT: [[SUB59_3:%.*]] = sub i32 [[SUB45_3]], [[SUB47_3]]
+; THR15-NEXT: [[TMP98:%.*]] = add <2 x i32> [[TMP58]], [[TMP43]]
+; THR15-NEXT: [[TMP101:%.*]] = sub <2 x i32> [[TMP43]], [[TMP58]]
+; THR15-NEXT: [[TMP102:%.*]] = extractelement <2 x i32> [[TMP98]], i32 0
+; THR15-NEXT: [[TMP104:%.*]] = extractelement <2 x i32> [[TMP98]], i32 1
+; THR15-NEXT: [[ADD48_3:%.*]] = add i32 [[TMP104]], [[TMP102]]
+; THR15-NEXT: [[TMP108:%.*]] = insertelement <2 x i32> [[TMP98]], i32 [[ADD44_2]], i32 1
+; THR15-NEXT: [[TMP76:%.*]] = shufflevector <2 x i32> [[TMP98]], <2 x i32> poison, <2 x i32> <i32 1, i32 poison>
+; THR15-NEXT: [[TMP109:%.*]] = insertelement <2 x i32> [[TMP76]], i32 [[ADD46_2]], i32 1
+; THR15-NEXT: [[TMP78:%.*]] = sub <2 x i32> [[TMP108]], [[TMP109]]
+; THR15-NEXT: [[TMP110:%.*]] = extractelement <2 x i32> [[TMP101]], i32 0
+; THR15-NEXT: [[TMP80:%.*]] = extractelement <2 x i32> [[TMP101]], i32 1
+; THR15-NEXT: [[ADD55_3:%.*]] = add i32 [[TMP80]], [[TMP110]]
+; THR15-NEXT: [[TMP81:%.*]] = insertelement <2 x i32> [[TMP101]], i32 [[SUB45_2]], i32 1
+; THR15-NEXT: [[TMP111:%.*]] = shufflevector <2 x i32> [[TMP101]], <2 x i32> poison, <2 x i32> <i32 1, i32 poison>
+; THR15-NEXT: [[TMP83:%.*]] = insertelement <2 x i32> [[TMP111]], i32 [[SUB47_2]], i32 1
+; THR15-NEXT: [[TMP112:%.*]] = sub <2 x i32> [[TMP81]], [[TMP83]]
; THR15-NEXT: [[ADD94:%.*]] = add i32 [[ADD48_3]], [[ADD48_2]]
; THR15-NEXT: [[SUB102:%.*]] = sub i32 [[ADD48_2]], [[ADD48_3]]
; THR15-NEXT: [[TMP63:%.*]] = extractelement <2 x i32> [[TMP48]], i32 0
@@ -518,39 +514,43 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; THR15-NEXT: [[SHR_I49_2:%.*]] = lshr i32 [[TMP64]], 15
; THR15-NEXT: [[AND_I50_2:%.*]] = and i32 [[SHR_I49_2]], 65537
; THR15-NEXT: [[MUL_I51_2:%.*]] = mul i32 [[AND_I50_2]], 65535
-; THR15-NEXT: [[ADD94_2:%.*]] = add i32 [[SUB51_3]], [[SUB51_2]]
-; THR15-NEXT: [[SUB102_2:%.*]] = sub i32 [[SUB51_2]], [[SUB51_3]]
+; THR15-NEXT: [[TMP115:%.*]] = extractelement <2 x i32> [[TMP78]], i32 0
+; THR15-NEXT: [[TMP116:%.*]] = extractelement <2 x i32> [[TMP78]], i32 1
+; THR15-NEXT: [[ADD94_2:%.*]] = add i32 [[TMP115]], [[TMP116]]
+; THR15-NEXT: [[SUB102_2:%.*]] = sub i32 [[TMP116]], [[TMP115]]
; THR15-NEXT: [[SHR_I49_3:%.*]] = lshr i32 [[CONV_1]], 15
; THR15-NEXT: [[AND_I50_3:%.*]] = and i32 [[SHR_I49_3]], 65537
; THR15-NEXT: [[MUL_I51_3:%.*]] = mul i32 [[AND_I50_3]], 65535
-; THR15-NEXT: [[ADD94_3:%.*]] = add i32 [[SUB59_3]], [[SUB59_2]]
-; THR15-NEXT: [[SUB102_3:%.*]] = sub i32 [[SUB59_2]], [[SUB59_3]]
+; THR15-NEXT: [[TMP117:%.*]] = extractelement <2 x i32> [[TMP112]], i32 0
+; THR15-NEXT: [[TMP131:%.*]] = extractelement <2 x i32> [[TMP112]], i32 1
+; THR15-NEXT: [[ADD94_3:%.*]] = add i32 [[TMP117]], [[TMP131]]
+; THR15-NEXT: [[SUB102_3:%.*]] = sub i32 [[TMP131]], [[TMP117]]
; THR15-NEXT: [[SHR_I49_4:%.*]] = lshr i32 [[CONV]], 15
; THR15-NEXT: [[AND_I50_4:%.*]] = and i32 [[SHR_I49_4]], 65537
; THR15-NEXT: [[MUL_I51_4:%.*]] = mul i32 [[AND_I50_4]], 65535
-; THR15-NEXT: [[TMP81:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8]], align 1
-; THR15-NEXT: [[TMP74:%.*]] = zext <2 x i8> [[TMP81]] to <2 x i32>
+; THR15-NEXT: [[TMP132:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8]], align 1
+; THR15-NEXT: [[TMP74:%.*]] = zext <2 x i8> [[TMP132]] to <2 x i32>
; THR15-NEXT: [[TMP67:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[PIX2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; THR15-NEXT: [[TMP107:%.*]] = zext <2 x i8> [[TMP67]] to <2 x i32>
+; THR15-NEXT: [[TMP133:%.*]] = zext <2 x i8> [[TMP67]] to <2 x i32>
; THR15-NEXT: [[TMP69:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; THR15-NEXT: [[TMP108:%.*]] = zext <2 x i8> [[TMP69]] to <2 x i32>
+; THR15-NEXT: [[TMP147:%.*]] = zext <2 x i8> [[TMP69]] to <2 x i32>
; THR15-NEXT: [[TMP71:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; THR15-NEXT: [[TMP109:%.*]] = zext <2 x i8> [[TMP71]] to <2 x i32>
-; THR15-NEXT: [[TMP72:%.*]] = sub <2 x i32> [[TMP108]], [[TMP109]]
+; THR15-NEXT: [[TMP99:%.*]] = zext <2 x i8> [[TMP71]] to <2 x i32>
+; THR15-NEXT: [[TMP72:%.*]] = sub <2 x i32> [[TMP147]], [[TMP99]]
; THR15-NEXT: [[TMP73:%.*]] = shl <2 x i32> [[TMP72]], <i32 16, i32 16>
; THR15-NEXT: [[TMP75:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX22]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; THR15-NEXT: [[TMP111:%.*]] = zext <2 x i8> [[TMP75]] to <2 x i32>
+; THR15-NEXT: [[TMP148:%.*]] = zext <2 x i8> [[TMP75]] to <2 x i32>
; THR15-NEXT: [[TMP82:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX25]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; THR15-NEXT: [[TMP94:%.*]] = zext <2 x i8> [[TMP82]] to <2 x i32>
+; THR15-NEXT: [[TMP149:%.*]] = zext <2 x i8> [[TMP82]] to <2 x i32>
; THR15-NEXT: [[TMP79:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX27]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; THR15-NEXT: [[TMP96:%.*]] = zext <2 x i8> [[TMP79]] to <2 x i32>
-; THR15-NEXT: [[TMP84:%.*]] = sub <2 x i32> [[TMP94]], [[TMP96]]
+; THR15-NEXT: [[TMP107:%.*]] = zext <2 x i8> [[TMP79]] to <2 x i32>
+; THR15-NEXT: [[TMP84:%.*]] = sub <2 x i32> [[TMP149]], [[TMP107]]
; THR15-NEXT: [[TMP85:%.*]] = shl <2 x i32> [[TMP84]], <i32 16, i32 16>
; THR15-NEXT: [[TMP86:%.*]] = insertelement <2 x i32> [[TMP74]], i32 [[CONV33]], i32 1
-; THR15-NEXT: [[TMP100:%.*]] = sub <2 x i32> [[TMP86]], [[TMP111]]
+; THR15-NEXT: [[TMP100:%.*]] = sub <2 x i32> [[TMP86]], [[TMP148]]
; THR15-NEXT: [[TMP88:%.*]] = add <2 x i32> [[TMP85]], [[TMP100]]
; THR15-NEXT: [[TMP92:%.*]] = insertelement <2 x i32> [[TMP74]], i32 [[CONV]], i32 0
-; THR15-NEXT: [[TMP120:%.*]] = sub <2 x i32> [[TMP92]], [[TMP107]]
+; THR15-NEXT: [[TMP120:%.*]] = sub <2 x i32> [[TMP92]], [[TMP133]]
; THR15-NEXT: [[TMP95:%.*]] = add <2 x i32> [[TMP73]], [[TMP120]]
; THR15-NEXT: [[TMP97:%.*]] = shufflevector <2 x i32> [[TMP88]], <2 x i32> [[TMP95]], <2 x i32> <i32 0, i32 2>
; THR15-NEXT: [[TMP77:%.*]] = add <2 x i32> [[TMP88]], [[TMP95]]
@@ -559,50 +559,50 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; THR15-NEXT: [[TMP90:%.*]] = extractelement <2 x i32> [[TMP77]], i32 1
; THR15-NEXT: [[ADD48:%.*]] = add i32 [[TMP90]], [[TMP89]]
; THR15-NEXT: [[SUB51:%.*]] = sub i32 [[TMP89]], [[TMP90]]
-; THR15-NEXT: [[TMP110:%.*]] = extractelement <2 x i32> [[TMP91]], i32 0
+; THR15-NEXT: [[TMP151:%.*]] = extractelement <2 x i32> [[TMP91]], i32 0
; THR15-NEXT: [[SUB47:%.*]] = extractelement <2 x i32> [[TMP91]], i32 1
-; THR15-NEXT: [[ADD55:%.*]] = add i32 [[SUB47]], [[TMP110]]
+; THR15-NEXT: [[ADD55:%.*]] = add i32 [[SUB47]], [[TMP151]]
; THR15-NEXT: [[SHR_I59:%.*]] = lshr i32 [[TMP90]], 15
; THR15-NEXT: [[AND_I60:%.*]] = and i32 [[SHR_I59]], 65537
; THR15-NEXT: [[MUL_I61:%.*]] = mul i32 [[AND_I60]], 65535
; THR15-NEXT: [[SHR_I59_1:%.*]] = lshr i32 [[SUB47]], 15
; THR15-NEXT: [[AND_I60_1:%.*]] = and i32 [[SHR_I59_1]], 65537
; THR15-NEXT: [[MUL_I61_1:%.*]] = mul i32 [[AND_I60_1]], 65535
-; THR15-NEXT: [[TMP112:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8_1]], align 1
-; THR15-NEXT: [[TMP130:%.*]] = zext <2 x i8> [[TMP112]] to <2 x i32>
-; THR15-NEXT: [[TMP129:%.*]] = load <2 x i8>, ptr [[ADD_PTR644]], align 1
-; THR15-NEXT: [[TMP115:%.*]] = zext <2 x i8> [[TMP129]] to <2 x i32>
-; THR15-NEXT: [[TMP116:%.*]] = load <2 x i8>, ptr [[ARRAYIDX3_1]], align 1
-; THR15-NEXT: [[TMP117:%.*]] = zext <2 x i8> [[TMP116]] to <2 x i32>
-; THR15-NEXT: [[TMP131:%.*]] = load <2 x i8>, ptr [[ARRAYIDX5_1]], align 1
-; THR15-NEXT: [[TMP132:%.*]] = zext <2 x i8> [[TMP131]] to <2 x i32>
-; THR15-NEXT: [[TMP113:%.*]] = sub <2 x i32> [[TMP117]], [[TMP132]]
+; THR15-NEXT: [[TMP159:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8_1]], align 1
+; THR15-NEXT: [[TMP130:%.*]] = zext <2 x i8> [[TMP159]] to <2 x i32>
+; THR15-NEXT: [[TMP161:%.*]] = load <2 x i8>, ptr [[ADD_PTR644]], align 1
+; THR15-NEXT: [[TMP175:%.*]] = zext <2 x i8> [[TMP161]] to <2 x i32>
+; THR15-NEXT: [[TMP179:%.*]] = load <2 x i8>, ptr [[ARRAYIDX3_1]], align 1
+; THR15-NEXT: [[TMP128:%.*]] = zext <2 x i8> [[TMP179]] to <2 x i32>
+; THR15-NEXT: [[TMP129:%.*]] = load <2 x i8>, ptr [[ARRAYIDX5_1]], align 1
+; THR15-NEXT: [[TMP180:%.*]] = zext <2 x i8> [[TMP129]] to <2 x i32>
+; THR15-NEXT: [[TMP113:%.*]] = sub <2 x i32> [[TMP128]], [[TMP180]]
; THR15-NEXT: [[TMP114:%.*]] = shl <2 x i32> [[TMP113]], <i32 16, i32 16>
; THR15-NEXT: [[TMP103:%.*]] = shufflevector <2 x i32> [[TMP130]], <2 x i32> poison, <2 x i32> <i32 poison, i32 0>
; THR15-NEXT: [[TMP126:%.*]] = insertelement <2 x i32> [[TMP103]], i32 [[CONV_1]], i32 0
-; THR15-NEXT: [[TMP134:%.*]] = sub <2 x i32> [[TMP126]], [[TMP115]]
+; THR15-NEXT: [[TMP134:%.*]] = sub <2 x i32> [[TMP126]], [[TMP175]]
; THR15-NEXT: [[TMP121:%.*]] = add <2 x i32> [[TMP114]], [[TMP134]]
; THR15-NEXT: [[TMP145:%.*]] = shufflevector <2 x i8> [[TMP7]], <2 x i8> poison, <2 x i32> <i32 1, i32 poison>
; THR15-NEXT: [[TMP127:%.*]] = insertelement <2 x i8> [[TMP145]], i8 [[TMP3]], i32 1
-; THR15-NEXT: [[TMP128:%.*]] = zext <2 x i8> [[TMP127]] to <2 x i32>
+; THR15-NEXT: [[TMP139:%.*]] = zext <2 x i8> [[TMP127]] to <2 x i32>
; THR15-NEXT: [[TMP146:%.*]] = insertelement <2 x i32> [[TMP130]], i32 [[TMP9]], i32 0
-; THR15-NEXT: [[TMP106:%.*]] = sub <2 x i32> [[TMP146]], [[TMP128]]
+; THR15-NEXT: [[TMP106:%.*]] = sub <2 x i32> [[TMP146]], [[TMP139]]
; THR15-NEXT: [[TMP118:%.*]] = extractelement <2 x i32> [[TMP106]], i32 0
; THR15-NEXT: [[SHL30_1:%.*]] = shl i32 [[TMP118]], 16
; THR15-NEXT: [[TMP119:%.*]] = extractelement <2 x i32> [[TMP106]], i32 1
; THR15-NEXT: [[ADD31_1:%.*]] = add i32 [[SHL30_1]], [[TMP119]]
-; THR15-NEXT: [[TMP133:%.*]] = extractelement <2 x i32> [[TMP121]], i32 0
+; THR15-NEXT: [[TMP181:%.*]] = extractelement <2 x i32> [[TMP121]], i32 0
; THR15-NEXT: [[TMP125:%.*]] = extractelement <2 x i32> [[TMP121]], i32 1
-; THR15-NEXT: [[SUB45_1:%.*]] = sub i32 [[TMP133]], [[TMP125]]
+; THR15-NEXT: [[SUB45_1:%.*]] = sub i32 [[TMP181]], [[TMP125]]
; THR15-NEXT: [[TMP135:%.*]] = shufflevector <2 x i32> [[TMP121]], <2 x i32> poison, <2 x i32> <i32 1, i32 poison>
; THR15-NEXT: [[TMP136:%.*]] = insertelement <2 x i32> [[TMP135]], i32 [[ADD43_1]], i32 1
; THR15-NEXT: [[TMP137:%.*]] = insertelement <2 x i32> [[TMP121]], i32 [[ADD31_1]], i32 1
; THR15-NEXT: [[TMP138:%.*]] = add <2 x i32> [[TMP136]], [[TMP137]]
; THR15-NEXT: [[SUB47_1:%.*]] = sub i32 [[ADD31_1]], [[ADD43_1]]
-; THR15-NEXT: [[TMP139:%.*]] = extractelement <2 x i32> [[TMP138]], i32 0
+; THR15-NEXT: [[TMP150:%.*]] = extractelement <2 x i32> [[TMP138]], i32 0
; THR15-NEXT: [[TMP140:%.*]] = extractelement <2 x i32> [[TMP138]], i32 1
-; THR15-NEXT: [[ADD48_1:%.*]] = add i32 [[TMP140]], [[TMP139]]
-; THR15-NEXT: [[SUB51_1:%.*]] = sub i32 [[TMP139]], [[TMP140]]
+; THR15-NEXT: [[ADD48_1:%.*]] = add i32 [[TMP140]], [[TMP150]]
+; THR15-NEXT: [[SUB51_1:%.*]] = sub i32 [[TMP150]], [[TMP140]]
; THR15-NEXT: [[ADD55_1:%.*]] = add i32 [[SUB47_1]], [[SUB45_1]]
; THR15-NEXT: [[TMP141:%.*]] = shufflevector <2 x i32> [[TMP91]], <2 x i32> poison, <2 x i32> <i32 poison, i32 0>
; THR15-NEXT: [[TMP142:%.*]] = insertelement <2 x i32> [[TMP141]], i32 [[SUB45_1]], i32 0
@@ -673,15 +673,15 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; THR15-NEXT: [[ADD_I62_2:%.*]] = add i32 [[MUL_I61_2]], [[SUB106_2]]
; THR15-NEXT: [[XOR_I63_2:%.*]] = xor i32 [[ADD_I62_2]], [[TMP89]]
; THR15-NEXT: [[ADD108_2:%.*]] = add i32 [[XOR_I53_2]], [[ADD113_1]]
-; THR15-NEXT: [[TMP161:%.*]] = extractelement <2 x i32> [[TMP160]], i32 0
-; THR15-NEXT: [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[TMP161]]
+; THR15-NEXT: [[TMP182:%.*]] = extractelement <2 x i32> [[TMP160]], i32 0
+; THR15-NEXT: [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[TMP182]]
; THR15-NEXT: [[TMP162:%.*]] = extractelement <2 x i32> [[TMP160]], i32 1
; THR15-NEXT: [[ADD112_2:%.*]] = add i32 [[ADD110_2]], [[TMP162]]
; THR15-NEXT: [[ADD113_2:%.*]] = add i32 [[ADD112_2]], [[XOR_I63_2]]
-; THR15-NEXT: [[TMP159:%.*]] = extractelement <2 x i32> [[TMP144]], i32 0
+; THR15-NEXT: [[TMP183:%.*]] = extractelement <2 x i32> [[TMP144]], i32 0
; THR15-NEXT: [[TMP178:%.*]] = extractelement <2 x i32> [[TMP144]], i32 1
-; THR15-NEXT: [[ADD78_3:%.*]] = add i32 [[TMP159]], [[TMP178]]
-; THR15-NEXT: [[SUB86_3:%.*]] = sub i32 [[TMP178]], [[TMP159]]
+; THR15-NEXT: [[ADD78_3:%.*]] = add i32 [[TMP183]], [[TMP178]]
+; THR15-NEXT: [[SUB86_3:%.*]] = sub i32 [[TMP178]], [[TMP183]]
; THR15-NEXT: [[TMP163:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_3]], i32 0
; THR15-NEXT: [[TMP164:%.*]] = shufflevector <2 x i32> [[TMP163]], <2 x i32> poison, <2 x i32> zeroinitializer
; THR15-NEXT: [[TMP165:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_3]], i32 0
@@ -704,8 +704,8 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; THR15-NEXT: [[ADD_I62_3:%.*]] = add i32 [[MUL_I61_3]], [[SUB106_3]]
; THR15-NEXT: [[XOR_I63_3:%.*]] = xor i32 [[ADD_I62_3]], [[CONV33]]
; THR15-NEXT: [[ADD108_3:%.*]] = add i32 [[XOR_I53_3]], [[ADD113_2]]
-; THR15-NEXT: [[TMP175:%.*]] = extractelement <2 x i32> [[TMP174]], i32 0
-; THR15-NEXT: [[ADD110_3:%.*]] = add i32 [[ADD108_3]], [[TMP175]]
+; THR15-NEXT: [[TMP184:%.*]] = extractelement <2 x i32> [[TMP174]], i32 0
+; THR15-NEXT: [[ADD110_3:%.*]] = add i32 [[ADD108_3]], [[TMP184]]
; THR15-NEXT: [[TMP176:%.*]] = extractelement <2 x i32> [[TMP174]], i32 1
; THR15-NEXT: [[ADD112_3:%.*]] = add i32 [[ADD110_3]], [[TMP176]]
; THR15-NEXT: [[ADD113_3:%.*]] = add i32 [[ADD112_3]], [[XOR_I63_3]]
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/remarks-insert-into-small-vector.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/remarks-insert-into-small-vector.ll
index bb806be15c71ca..09612444afd205 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/remarks-insert-into-small-vector.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/remarks-insert-into-small-vector.ll
@@ -8,20 +8,17 @@
; YAML-NEXT: Function: test
; YAML-NEXT: Args:
; YAML-NEXT: - String: 'Stores SLP vectorized with cost '
-; YAML-NEXT: - Cost: '2'
+; YAML-NEXT: - Cost: '0'
; YAML-NEXT: - String: ' and with tree size '
-; YAML-NEXT: - TreeSize: '7'
+; YAML-NEXT: - TreeSize: '9'
define void @test() {
; CHECK-LABEL: define void @test(
; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr null, align 4
-; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr null, align 4
; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr null, align 4
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> <float poison, float 0.000000e+00>, float [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x float> [[TMP4]], float [[TMP2]], i32 1
+; CHECK-NEXT: [[TMP5:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> zeroinitializer, i32 4, <2 x i1> <i1 true, i1 true>, <2 x float> poison)
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> <float poison, float 0.000000e+00>, float [[TMP2]], i32 0
; CHECK-NEXT: [[TMP6:%.*]] = fcmp ogt <2 x float> [[TMP3]], [[TMP5]]
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i1> [[TMP6]], <2 x i1> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/reversed-strided-node-with-external-ptr.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/reversed-strided-node-with-external-ptr.ll
index 3fa42047162e45..9c1da08c64b7b7 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/reversed-strided-node-with-external-ptr.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/reversed-strided-node-with-external-ptr.ll
@@ -10,10 +10,10 @@ define void @test(ptr %a, i64 %0) {
; CHECK-NEXT: br label %[[BB:.*]]
; CHECK: [[BB]]:
; CHECK-NEXT: [[TMP3:%.*]] = or disjoint i64 [[TMP0]], 1
-; CHECK-NEXT: [[ARRAYIDX17_I28_1:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP3]]
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> poison, i64 [[TMP3]], i32 0
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> [[TMP4]], i64 0, i32 1
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr double, <2 x ptr> [[TMP2]], <2 x i64> [[TMP5]]
+; CHECK-NEXT: [[ARRAYIDX17_I28_1:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP3]]
; CHECK-NEXT: [[TMP7:%.*]] = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> [[TMP6]], i32 8, <2 x i1> <i1 true, i1 true>, <2 x double> poison)
; CHECK-NEXT: [[TMP8:%.*]] = load <2 x double>, ptr [[A]], align 8
; CHECK-NEXT: [[TMP9:%.*]] = load <2 x double>, ptr [[A]], align 8
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/scatter-vectorize-reversed.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/scatter-vectorize-reversed.ll
index 2daa3b58e5c3ac..98333c7b420cf0 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/scatter-vectorize-reversed.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/scatter-vectorize-reversed.ll
@@ -5,12 +5,12 @@ define <4 x i32> @test(<2 x i64> %v, ptr %p) {
; CHECK-LABEL: define <4 x i32> @test(
; CHECK-SAME: <2 x i64> [[V:%.*]], ptr [[P:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[V]], <2 x i64> poison, <2 x i32> <i32 1, i32 0>
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x ptr> poison, ptr [[P]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x ptr> [[TMP0]], <2 x ptr> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i16, <2 x ptr> [[TMP1]], <2 x i64> [[V]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i16, <2 x ptr> [[TMP1]], <2 x i64> [[TMP4]]
; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i16> @llvm.masked.gather.v2i16.v2p0(<2 x ptr> [[TMP2]], i32 2, <2 x i1> <i1 true, i1 true>, <2 x i16> poison)
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT: [[TMP7:%.*]] = zext <2 x i16> [[TMP4]] to <2 x i32>
+; CHECK-NEXT: [[TMP7:%.*]] = zext <2 x i16> [[TMP3]] to <2 x i32>
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> zeroinitializer, <4 x i32> [[TMP6]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
; CHECK-NEXT: ret <4 x i32> [[TMP5]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll b/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll
index 26c4d55436d22b..59b0352a825929 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll
@@ -24,7 +24,7 @@ define i32 @test(ptr noalias %p, ptr noalias %addr) {
; YAML-NEXT: - String: 'Vectorized horizontal reduction with cost '
; YAML-NEXT: - Cost: '-1'
; YAML-NEXT: - String: ' and with tree size '
- ; YAML-NEXT: - TreeSize: '7'
+ ; YAML-NEXT: - TreeSize: '8'
entry:
%off0.1 = getelementptr inbounds i32, ptr %addr, i32 1
%idx0 = load i32, ptr %off0.1, align 8
More information about the llvm-commits
mailing list