[llvm] [SLP]Improve masked loads vectorization, attempting gathered loads (PR #110151)
Alexey Bataev via llvm-commits
llvm-commits at lists.llvm.org
Tue Oct 8 09:07:57 PDT 2024
https://github.com/alexey-bataev updated https://github.com/llvm/llvm-project/pull/110151
>From e4768e045c2f56ccfdf0cbeeaacb0cacd754fb22 Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev at outlook.com>
Date: Thu, 26 Sep 2024 18:09:51 +0000
Subject: [PATCH 1/2] =?UTF-8?q?[=F0=9D=98=80=F0=9D=97=BD=F0=9D=97=BF]=20in?=
=?UTF-8?q?itial=20version?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Created using spr 1.3.5
---
.../Transforms/Vectorize/SLPVectorizer.cpp | 142 +++++--
.../SLPVectorizer/RISCV/complex-loads.ll | 358 +++++++++---------
.../RISCV/remarks-insert-into-small-vector.ll | 11 +-
...reversed-strided-node-with-external-ptr.ll | 2 +-
.../RISCV/scatter-vectorize-reversed.ll | 6 +-
.../X86/remark_gather-load-redux-cost.ll | 2 +-
6 files changed, 306 insertions(+), 215 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 53d7ae606ffeea..62c77704d92eb5 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1368,6 +1368,8 @@ class BoUpSLP {
MustGather.clear();
NonScheduledFirst.clear();
EntryToLastInstruction.clear();
+ LoadEntriesToVectorize.clear();
+ IsGraphTransformMode = false;
GatheredLoadsEntriesFirst = NoGatheredLoads;
ExternalUses.clear();
ExternalUsesAsOriginalScalar.clear();
@@ -3610,6 +3612,13 @@ class BoUpSLP {
DenseMap<Value *, SmallPtrSet<const TreeEntry *, 4>>;
ValueToGatherNodesMap ValueToGatherNodes;
+ /// A list of loads that can be vectorized using a strided or masked-gather
+ /// approach, but which we first attempt to represent as contiguous loads.
+ SetVector<unsigned> LoadEntriesToVectorize;
+
+ /// True if the graph-nodes transforming mode is on.
+ bool IsGraphTransformMode = false;
+
/// The index of the first gathered load entry in the VectorizeTree.
constexpr static int NoGatheredLoads = -1;
int GatheredLoadsEntriesFirst = NoGatheredLoads;
@@ -4612,17 +4621,15 @@ static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
if (getUnderlyingObject(Ptr1) != getUnderlyingObject(Ptr2))
return false;
auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
- if (!GEP1)
- return false;
auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
- if (!GEP2)
- return false;
- return GEP1->getNumOperands() == 2 && GEP2->getNumOperands() == 2 &&
- ((isConstant(GEP1->getOperand(1)) &&
- isConstant(GEP2->getOperand(1))) ||
+ return (!GEP1 || GEP1->getNumOperands() == 2) &&
+ (!GEP2 || GEP2->getNumOperands() == 2) &&
+ (((!GEP1 || isConstant(GEP1->getOperand(1))) &&
+ (!GEP2 || isConstant(GEP2->getOperand(1)))) ||
!CompareOpcodes ||
- getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)
- .getOpcode());
+ (GEP1 && GEP2 &&
+ getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)
+ .getOpcode()));
}
/// Calculates minimal alignment as a common alignment.
@@ -5112,10 +5119,10 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
L && Sz > 2 && static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {
return L->isLoopInvariant(V);
})) <= Sz / 2;
- if (ProfitableGatherPointers || all_of(PointerOps, [IsSorted](Value *P) {
+ if (ProfitableGatherPointers || all_of(PointerOps, [](Value *P) {
auto *GEP = dyn_cast<GetElementPtrInst>(P);
- return (IsSorted && !GEP && doesNotNeedToBeScheduled(P)) ||
- (GEP && GEP->getNumOperands() == 2 &&
+ return (!GEP && doesNotNeedToBeScheduled(P)) ||
+ (GEP->getNumOperands() == 2 &&
isa<Constant, Instruction>(GEP->getOperand(1)));
})) {
// Check if potential masked gather can be represented as series
@@ -6607,6 +6614,12 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
ArrayRef<SmallVector<std::pair<LoadInst *, int>>> GatheredLoads) {
GatheredLoadsEntriesFirst = VectorizableTree.size();
+ SmallVector<SmallPtrSet<const Value *, 4>> LoadSetsToVectorize(
+ LoadEntriesToVectorize.size());
+ for (auto [Idx, Set] : zip(LoadEntriesToVectorize, LoadSetsToVectorize))
+ Set.insert(VectorizableTree[Idx]->Scalars.begin(),
+ VectorizableTree[Idx]->Scalars.end());
+
// Sort loads by distance.
auto LoadSorter = [](const std::pair<LoadInst *, int> &L1,
const std::pair<LoadInst *, int> &L2) {
@@ -6864,8 +6877,42 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
}
}
}
+ // If the loads cannot be represented as consecutive vectorizable
+ // nodes, skip this slice.
+ unsigned ConsecutiveNodesSize = 0;
+ if (!LoadEntriesToVectorize.empty() &&
+ any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
+ [&, Slice = Slice](const auto &P) {
+ const auto *It = find_if(Slice, [&](Value *V) {
+ return std::get<1>(P).contains(V);
+ });
+ if (It == Slice.end())
+ return false;
+ ArrayRef<Value *> VL =
+ VectorizableTree[std::get<0>(P)]->Scalars;
+ ConsecutiveNodesSize += VL.size();
+ unsigned Start = std::distance(Slice.begin(), It);
+ unsigned Sz = Slice.size() - Start;
+ return Sz < VL.size() ||
+ Slice.slice(std::distance(Slice.begin(), It),
+ VL.size()) != VL;
+ }))
+ continue;
// Try to build long masked gather loads.
UserMaxVF = bit_ceil(UserMaxVF);
+ if (any_of(seq<unsigned>(Slice.size() / UserMaxVF),
+ [&, Slice = Slice](unsigned Idx) {
+ OrdersType Order;
+ SmallVector<Value *> PointerOps;
+ return canVectorizeLoads(
+ Slice.slice(Idx * UserMaxVF, UserMaxVF),
+ Slice[Idx * UserMaxVF], Order,
+ PointerOps) ==
+ LoadsState::ScatterVectorize;
+ }))
+ UserMaxVF = MaxVF;
+ if (Slice.size() != ConsecutiveNodesSize)
+ MaxVF = std::min<unsigned>(MaxVF, UserMaxVF);
}
for (unsigned VF = MaxVF; VF >= 2; VF /= 2) {
bool IsVectorized = true;
@@ -6874,6 +6921,16 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
Slice.slice(I, std::min(VF, E - I));
if (getTreeEntry(SubSlice.front()))
continue;
+ // Skip the subslice if it belongs to a to-be-vectorized entry
+ // but is not equal to that entry's scalars.
+ if (any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
+ [&](const auto &P) {
+ return !SubSlice.equals(
+ VectorizableTree[std::get<0>(P)]
+ ->Scalars) &&
+ set_is_subset(SubSlice, std::get<1>(P));
+ }))
+ continue;
unsigned Sz = VectorizableTree.size();
buildTree_rec(SubSlice, 0, EdgeInfo());
if (Sz == VectorizableTree.size()) {
@@ -6908,6 +6965,21 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
// Final attempt to vectorize non-vectorized loads.
(void)ProcessGatheredLoads(FinalGatheredLoads, /*Final=*/true);
}
+ // Try to vectorize postponed load entries, previously marked as gathered.
+ for (unsigned Idx : LoadEntriesToVectorize) {
+ const TreeEntry &E = *VectorizableTree[Idx];
+ SmallVector<Value *> GatheredScalars(E.Scalars.begin(),
+ E.Scalars.end());
+ // Avoid reordering, if possible.
+ if (!E.ReorderIndices.empty()) {
+ // Build a mask out of the reorder indices and reorder scalars per this
+ // mask.
+ SmallVector<int> ReorderMask;
+ inversePermutation(E.ReorderIndices, ReorderMask);
+ reorderScalars(GatheredScalars, ReorderMask);
+ }
+ buildTree_rec(GatheredScalars, 0, EdgeInfo());
+ }
// If no new entries created, consider it as no gathered loads entries must be
// handled.
if (static_cast<unsigned>(GatheredLoadsEntriesFirst) ==
@@ -7220,6 +7292,11 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
case LoadsState::Vectorize:
return TreeEntry::Vectorize;
case LoadsState::ScatterVectorize:
+ if (!IsGraphTransformMode && !VectorizableTree.empty()) {
+ // Delay slow vectorized nodes for better vectorization attempts.
+ LoadEntriesToVectorize.insert(VectorizableTree.size());
+ return TreeEntry::NeedToGather;
+ }
return TreeEntry::ScatterVectorize;
case LoadsState::StridedVectorize:
return TreeEntry::StridedVectorize;
@@ -9057,6 +9134,17 @@ getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
void BoUpSLP::transformNodes() {
constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
BaseGraphSize = VectorizableTree.size();
+ // RAII-style helper: turns the graph-transforming mode on, and back off when done.
+ class GraphTransformModeRAAI {
+ bool &SavedIsGraphTransformMode;
+
+ public:
+ GraphTransformModeRAAI(bool &IsGraphTransformMode)
+ : SavedIsGraphTransformMode(IsGraphTransformMode) {
+ IsGraphTransformMode = true;
+ }
+ ~GraphTransformModeRAAI() { SavedIsGraphTransformMode = false; }
+ } TransformContext(IsGraphTransformMode);
// Operands are profitable if they are:
// 1. At least one constant
// or
@@ -9089,7 +9177,7 @@ void BoUpSLP::transformNodes() {
unsigned MinVF = getMinVF(2 * Sz);
// Do not try partial vectorization for small nodes (<= 2), nodes with the
// same opcode and same parent block or all constants.
- if (VL.size() <= 2 ||
+ if (VL.size() <= 2 || LoadEntriesToVectorize.contains(Idx) ||
!(!E.getOpcode() || E.getOpcode() == Instruction::Load ||
E.isAltShuffle() || !allSameBlock(VL)) ||
allConstant(VL) || isSplat(VL))
@@ -9187,6 +9275,8 @@ void BoUpSLP::transformNodes() {
continue;
}
unsigned PrevSize = VectorizableTree.size();
+ [[maybe_unused]] unsigned PrevEntriesSize =
+ LoadEntriesToVectorize.size();
buildTree_rec(Slice, 0, EdgeInfo(&E, UINT_MAX));
if (PrevSize + 1 == VectorizableTree.size() &&
VectorizableTree[PrevSize]->isGather() &&
@@ -9194,6 +9284,8 @@ void BoUpSLP::transformNodes() {
Instruction::ExtractElement &&
!isSplat(Slice)) {
VectorizableTree.pop_back();
+ assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
+ "LoadEntriesToVectorize expected to remain the same");
continue;
}
AddCombinedNode(PrevSize, Cnt);
@@ -9279,17 +9371,19 @@ void BoUpSLP::transformNodes() {
}
}
- // Single load node - exit.
- if (VectorizableTree.size() <= 1 &&
- VectorizableTree.front()->getOpcode() == Instruction::Load)
- return;
- // Small graph with small VF - exit.
- constexpr unsigned SmallTree = 3;
- constexpr unsigned SmallVF = 2;
- if ((VectorizableTree.size() <= SmallTree &&
- VectorizableTree.front()->Scalars.size() == SmallVF) ||
- (VectorizableTree.size() <= 2 && UserIgnoreList))
- return;
+ if (LoadEntriesToVectorize.empty()) {
+ // Single load node - exit.
+ if (VectorizableTree.size() <= 1 &&
+ VectorizableTree.front()->getOpcode() == Instruction::Load)
+ return;
+ // Small graph with small VF - exit.
+ constexpr unsigned SmallTree = 3;
+ constexpr unsigned SmallVF = 2;
+ if ((VectorizableTree.size() <= SmallTree &&
+ VectorizableTree.front()->Scalars.size() == SmallVF) ||
+ (VectorizableTree.size() <= 2 && UserIgnoreList))
+ return;
+ }
// A list of loads to be gathered during the vectorization process. We can
// try to vectorize them at the end, if profitable.
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
index 803af4d166b213..823ba8f6b8b6aa 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
@@ -70,65 +70,62 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; CHECK-NEXT: [[TMP33:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX10_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
; CHECK-NEXT: [[TMP49:%.*]] = zext <2 x i8> [[TMP33]] to <2 x i32>
; CHECK-NEXT: [[TMP35:%.*]] = sub <2 x i32> [[TMP32]], [[TMP49]]
-; CHECK-NEXT: [[TMP50:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX13_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP51:%.*]] = zext <2 x i8> [[TMP50]] to <2 x i32>
+; CHECK-NEXT: [[TMP51:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX13_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP58:%.*]] = zext <2 x i8> [[TMP51]] to <2 x i32>
; CHECK-NEXT: [[TMP38:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX15_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
; CHECK-NEXT: [[TMP39:%.*]] = zext <2 x i8> [[TMP38]] to <2 x i32>
-; CHECK-NEXT: [[TMP36:%.*]] = sub <2 x i32> [[TMP51]], [[TMP39]]
+; CHECK-NEXT: [[TMP36:%.*]] = sub <2 x i32> [[TMP58]], [[TMP39]]
; CHECK-NEXT: [[TMP37:%.*]] = shl <2 x i32> [[TMP36]], <i32 16, i32 16>
; CHECK-NEXT: [[TMP42:%.*]] = add <2 x i32> [[TMP37]], [[TMP35]]
-; CHECK-NEXT: [[TMP56:%.*]] = add <2 x i32> [[TMP42]], [[TMP30]]
-; CHECK-NEXT: [[TMP60:%.*]] = sub <2 x i32> [[TMP30]], [[TMP42]]
-; CHECK-NEXT: [[TMP73:%.*]] = extractelement <2 x i32> [[TMP56]], i32 0
-; CHECK-NEXT: [[TMP34:%.*]] = extractelement <2 x i32> [[TMP56]], i32 1
+; CHECK-NEXT: [[TMP43:%.*]] = add <2 x i32> [[TMP42]], [[TMP30]]
+; CHECK-NEXT: [[TMP44:%.*]] = sub <2 x i32> [[TMP30]], [[TMP42]]
+; CHECK-NEXT: [[TMP73:%.*]] = extractelement <2 x i32> [[TMP43]], i32 0
+; CHECK-NEXT: [[TMP34:%.*]] = extractelement <2 x i32> [[TMP43]], i32 1
; CHECK-NEXT: [[ADD48_2:%.*]] = add i32 [[TMP34]], [[TMP73]]
; CHECK-NEXT: [[SUB51_2:%.*]] = sub i32 [[TMP73]], [[TMP34]]
-; CHECK-NEXT: [[TMP47:%.*]] = extractelement <2 x i32> [[TMP60]], i32 0
-; CHECK-NEXT: [[TMP48:%.*]] = extractelement <2 x i32> [[TMP60]], i32 1
+; CHECK-NEXT: [[TMP47:%.*]] = extractelement <2 x i32> [[TMP44]], i32 0
+; CHECK-NEXT: [[TMP48:%.*]] = extractelement <2 x i32> [[TMP44]], i32 1
; CHECK-NEXT: [[TMP68:%.*]] = sub i32 [[TMP47]], [[TMP48]]
; CHECK-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr i8, ptr null, i64 4
; CHECK-NEXT: [[ARRAYIDX5_3:%.*]] = getelementptr i8, ptr null, i64 4
; CHECK-NEXT: [[ARRAYIDX8_3:%.*]] = getelementptr i8, ptr null, i64 1
; CHECK-NEXT: [[ARRAYIDX10_3:%.*]] = getelementptr i8, ptr null, i64 1
-; CHECK-NEXT: [[TMP44:%.*]] = load i8, ptr null, align 1
; CHECK-NEXT: [[ARRAYIDX15_3:%.*]] = getelementptr i8, ptr null, i64 5
-; CHECK-NEXT: [[TMP43:%.*]] = load i8, ptr null, align 1
; CHECK-NEXT: [[TMP53:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP52:%.*]] = zext <2 x i8> [[TMP53]] to <2 x i32>
+; CHECK-NEXT: [[TMP50:%.*]] = zext <2 x i8> [[TMP53]] to <2 x i32>
; CHECK-NEXT: [[TMP54:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP61:%.*]] = zext <2 x i8> [[TMP54]] to <2 x i32>
-; CHECK-NEXT: [[TMP55:%.*]] = sub <2 x i32> [[TMP52]], [[TMP61]]
+; CHECK-NEXT: [[TMP52:%.*]] = zext <2 x i8> [[TMP54]] to <2 x i32>
+; CHECK-NEXT: [[TMP59:%.*]] = sub <2 x i32> [[TMP50]], [[TMP52]]
; CHECK-NEXT: [[TMP41:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX3_3]], i64 -4, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP57:%.*]] = zext <2 x i8> [[TMP41]] to <2 x i32>
-; CHECK-NEXT: [[TMP58:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP59:%.*]] = zext <2 x i8> [[TMP58]] to <2 x i32>
-; CHECK-NEXT: [[TMP45:%.*]] = sub <2 x i32> [[TMP57]], [[TMP59]]
+; CHECK-NEXT: [[TMP55:%.*]] = zext <2 x i8> [[TMP41]] to <2 x i32>
+; CHECK-NEXT: [[TMP56:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP57:%.*]] = zext <2 x i8> [[TMP56]] to <2 x i32>
+; CHECK-NEXT: [[TMP45:%.*]] = sub <2 x i32> [[TMP55]], [[TMP57]]
; CHECK-NEXT: [[TMP46:%.*]] = shl <2 x i32> [[TMP45]], <i32 16, i32 16>
-; CHECK-NEXT: [[TMP62:%.*]] = add <2 x i32> [[TMP46]], [[TMP55]]
-; CHECK-NEXT: [[TMP63:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX8_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP74:%.*]] = zext <2 x i8> [[TMP63]] to <2 x i32>
-; CHECK-NEXT: [[TMP79:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX10_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP60:%.*]] = add <2 x i32> [[TMP46]], [[TMP59]]
+; CHECK-NEXT: [[TMP61:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX8_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP62:%.*]] = zext <2 x i8> [[TMP61]] to <2 x i32>
+; CHECK-NEXT: [[TMP63:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX10_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP64:%.*]] = zext <2 x i8> [[TMP63]] to <2 x i32>
+; CHECK-NEXT: [[TMP65:%.*]] = sub <2 x i32> [[TMP62]], [[TMP64]]
+; CHECK-NEXT: [[TMP75:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> zeroinitializer, i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT: [[TMP76:%.*]] = zext <2 x i8> [[TMP75]] to <2 x i32>
+; CHECK-NEXT: [[TMP79:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX15_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
; CHECK-NEXT: [[TMP82:%.*]] = zext <2 x i8> [[TMP79]] to <2 x i32>
-; CHECK-NEXT: [[TMP85:%.*]] = sub <2 x i32> [[TMP74]], [[TMP82]]
-; CHECK-NEXT: [[TMP64:%.*]] = insertelement <2 x i8> poison, i8 [[TMP44]], i32 0
-; CHECK-NEXT: [[TMP65:%.*]] = insertelement <2 x i8> [[TMP64]], i8 [[TMP43]], i32 1
-; CHECK-NEXT: [[TMP94:%.*]] = zext <2 x i8> [[TMP65]] to <2 x i32>
-; CHECK-NEXT: [[TMP103:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX15_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP72:%.*]] = zext <2 x i8> [[TMP103]] to <2 x i32>
-; CHECK-NEXT: [[TMP69:%.*]] = sub <2 x i32> [[TMP94]], [[TMP72]]
+; CHECK-NEXT: [[TMP69:%.*]] = sub <2 x i32> [[TMP76]], [[TMP82]]
; CHECK-NEXT: [[TMP70:%.*]] = shl <2 x i32> [[TMP69]], <i32 16, i32 16>
-; CHECK-NEXT: [[TMP75:%.*]] = add <2 x i32> [[TMP70]], [[TMP85]]
-; CHECK-NEXT: [[TMP76:%.*]] = add <2 x i32> [[TMP75]], [[TMP62]]
-; CHECK-NEXT: [[TMP106:%.*]] = sub <2 x i32> [[TMP62]], [[TMP75]]
-; CHECK-NEXT: [[TMP78:%.*]] = extractelement <2 x i32> [[TMP76]], i32 0
-; CHECK-NEXT: [[TMP71:%.*]] = extractelement <2 x i32> [[TMP76]], i32 1
+; CHECK-NEXT: [[TMP72:%.*]] = add <2 x i32> [[TMP70]], [[TMP65]]
+; CHECK-NEXT: [[TMP90:%.*]] = add <2 x i32> [[TMP72]], [[TMP60]]
+; CHECK-NEXT: [[TMP74:%.*]] = sub <2 x i32> [[TMP60]], [[TMP72]]
+; CHECK-NEXT: [[TMP78:%.*]] = extractelement <2 x i32> [[TMP90]], i32 0
+; CHECK-NEXT: [[TMP71:%.*]] = extractelement <2 x i32> [[TMP90]], i32 1
; CHECK-NEXT: [[ADD48_3:%.*]] = add i32 [[TMP71]], [[TMP78]]
; CHECK-NEXT: [[SUB51_3:%.*]] = sub i32 [[TMP78]], [[TMP71]]
-; CHECK-NEXT: [[TMP80:%.*]] = extractelement <2 x i32> [[TMP106]], i32 0
-; CHECK-NEXT: [[TMP81:%.*]] = extractelement <2 x i32> [[TMP106]], i32 1
+; CHECK-NEXT: [[TMP80:%.*]] = extractelement <2 x i32> [[TMP74]], i32 0
+; CHECK-NEXT: [[TMP81:%.*]] = extractelement <2 x i32> [[TMP74]], i32 1
; CHECK-NEXT: [[ADD55_3:%.*]] = add i32 [[TMP81]], [[TMP80]]
; CHECK-NEXT: [[TMP107:%.*]] = sub i32 [[TMP80]], [[TMP81]]
-; CHECK-NEXT: [[TMP77:%.*]] = extractelement <2 x i32> [[TMP52]], i32 0
+; CHECK-NEXT: [[TMP77:%.*]] = extractelement <2 x i32> [[TMP50]], i32 0
; CHECK-NEXT: [[SHR_I49_3:%.*]] = lshr i32 [[TMP77]], 15
; CHECK-NEXT: [[AND_I50_3:%.*]] = and i32 [[SHR_I49_3]], 65537
; CHECK-NEXT: [[MUL_I51_3:%.*]] = mul i32 [[AND_I50_3]], 65535
@@ -155,27 +152,27 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; CHECK-NEXT: [[TMP66:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8]], align 1
; CHECK-NEXT: [[TMP102:%.*]] = zext <2 x i8> [[TMP66]] to <2 x i32>
; CHECK-NEXT: [[TMP67:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[PIX2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP108:%.*]] = zext <2 x i8> [[TMP67]] to <2 x i32>
-; CHECK-NEXT: [[TMP89:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[TMP1]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP90:%.*]] = zext <2 x i8> [[TMP89]] to <2 x i32>
-; CHECK-NEXT: [[TMP91:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP109:%.*]] = zext <2 x i8> [[TMP91]] to <2 x i32>
-; CHECK-NEXT: [[TMP87:%.*]] = sub <2 x i32> [[TMP90]], [[TMP109]]
+; CHECK-NEXT: [[TMP85:%.*]] = zext <2 x i8> [[TMP67]] to <2 x i32>
+; CHECK-NEXT: [[TMP91:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[TMP1]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP100:%.*]] = zext <2 x i8> [[TMP91]] to <2 x i32>
+; CHECK-NEXT: [[TMP103:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP89:%.*]] = zext <2 x i8> [[TMP103]] to <2 x i32>
+; CHECK-NEXT: [[TMP87:%.*]] = sub <2 x i32> [[TMP100]], [[TMP89]]
; CHECK-NEXT: [[TMP88:%.*]] = shl <2 x i32> [[TMP87]], <i32 16, i32 16>
-; CHECK-NEXT: [[TMP111:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX22]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP112:%.*]] = zext <2 x i8> [[TMP111]] to <2 x i32>
-; CHECK-NEXT: [[TMP120:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX25]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP121:%.*]] = zext <2 x i8> [[TMP120]] to <2 x i32>
-; CHECK-NEXT: [[TMP131:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX27]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP100:%.*]] = zext <2 x i8> [[TMP131]] to <2 x i32>
-; CHECK-NEXT: [[TMP95:%.*]] = sub <2 x i32> [[TMP121]], [[TMP100]]
+; CHECK-NEXT: [[TMP106:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX22]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP108:%.*]] = zext <2 x i8> [[TMP106]] to <2 x i32>
+; CHECK-NEXT: [[TMP94:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX25]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP109:%.*]] = zext <2 x i8> [[TMP94]] to <2 x i32>
+; CHECK-NEXT: [[TMP111:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX27]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP115:%.*]] = zext <2 x i8> [[TMP111]] to <2 x i32>
+; CHECK-NEXT: [[TMP95:%.*]] = sub <2 x i32> [[TMP109]], [[TMP115]]
; CHECK-NEXT: [[TMP96:%.*]] = shl <2 x i32> [[TMP95]], <i32 16, i32 16>
; CHECK-NEXT: [[TMP97:%.*]] = insertelement <2 x i32> [[TMP102]], i32 [[CONV33]], i32 1
-; CHECK-NEXT: [[TMP132:%.*]] = sub <2 x i32> [[TMP97]], [[TMP112]]
-; CHECK-NEXT: [[TMP105:%.*]] = add <2 x i32> [[TMP96]], [[TMP132]]
+; CHECK-NEXT: [[TMP117:%.*]] = sub <2 x i32> [[TMP97]], [[TMP108]]
+; CHECK-NEXT: [[TMP105:%.*]] = add <2 x i32> [[TMP96]], [[TMP117]]
; CHECK-NEXT: [[TMP86:%.*]] = insertelement <2 x i32> [[TMP102]], i32 [[CONV1]], i32 0
-; CHECK-NEXT: [[TMP133:%.*]] = sub <2 x i32> [[TMP86]], [[TMP108]]
-; CHECK-NEXT: [[TMP92:%.*]] = add <2 x i32> [[TMP88]], [[TMP133]]
+; CHECK-NEXT: [[TMP119:%.*]] = sub <2 x i32> [[TMP86]], [[TMP85]]
+; CHECK-NEXT: [[TMP92:%.*]] = add <2 x i32> [[TMP88]], [[TMP119]]
; CHECK-NEXT: [[TMP93:%.*]] = shufflevector <2 x i32> [[TMP105]], <2 x i32> [[TMP92]], <2 x i32> <i32 0, i32 2>
; CHECK-NEXT: [[TMP101:%.*]] = sub <2 x i32> [[TMP92]], [[TMP105]]
; CHECK-NEXT: [[TMP99:%.*]] = extractelement <2 x i32> [[TMP101]], i32 1
@@ -185,22 +182,22 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; CHECK-NEXT: [[TMP104:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8_1]], align 1
; CHECK-NEXT: [[TMP113:%.*]] = zext <2 x i8> [[TMP104]] to <2 x i32>
; CHECK-NEXT: [[TMP114:%.*]] = load <2 x i8>, ptr [[ADD_PTR644]], align 1
-; CHECK-NEXT: [[TMP115:%.*]] = zext <2 x i8> [[TMP114]] to <2 x i32>
+; CHECK-NEXT: [[TMP112:%.*]] = zext <2 x i8> [[TMP114]] to <2 x i32>
; CHECK-NEXT: [[TMP116:%.*]] = load <2 x i8>, ptr [[ARRAYIDX3_1]], align 1
-; CHECK-NEXT: [[TMP117:%.*]] = zext <2 x i8> [[TMP116]] to <2 x i32>
+; CHECK-NEXT: [[TMP120:%.*]] = zext <2 x i8> [[TMP116]] to <2 x i32>
; CHECK-NEXT: [[TMP118:%.*]] = load <2 x i8>, ptr [[ARRAYIDX5_1]], align 1
-; CHECK-NEXT: [[TMP119:%.*]] = zext <2 x i8> [[TMP118]] to <2 x i32>
-; CHECK-NEXT: [[TMP124:%.*]] = sub <2 x i32> [[TMP117]], [[TMP119]]
+; CHECK-NEXT: [[TMP128:%.*]] = zext <2 x i8> [[TMP118]] to <2 x i32>
+; CHECK-NEXT: [[TMP124:%.*]] = sub <2 x i32> [[TMP120]], [[TMP128]]
; CHECK-NEXT: [[TMP125:%.*]] = shl <2 x i32> [[TMP124]], <i32 16, i32 16>
; CHECK-NEXT: [[TMP122:%.*]] = shufflevector <2 x i32> [[TMP113]], <2 x i32> poison, <2 x i32> <i32 poison, i32 0>
; CHECK-NEXT: [[TMP123:%.*]] = insertelement <2 x i32> [[TMP122]], i32 [[CONV_1]], i32 0
-; CHECK-NEXT: [[TMP134:%.*]] = sub <2 x i32> [[TMP123]], [[TMP115]]
-; CHECK-NEXT: [[TMP156:%.*]] = add <2 x i32> [[TMP125]], [[TMP134]]
+; CHECK-NEXT: [[TMP121:%.*]] = sub <2 x i32> [[TMP123]], [[TMP112]]
+; CHECK-NEXT: [[TMP156:%.*]] = add <2 x i32> [[TMP125]], [[TMP121]]
; CHECK-NEXT: [[TMP126:%.*]] = shufflevector <2 x i8> [[TMP7]], <2 x i8> poison, <2 x i32> <i32 1, i32 poison>
; CHECK-NEXT: [[TMP127:%.*]] = insertelement <2 x i8> [[TMP126]], i8 [[TMP14]], i32 1
-; CHECK-NEXT: [[TMP128:%.*]] = zext <2 x i8> [[TMP127]] to <2 x i32>
+; CHECK-NEXT: [[TMP131:%.*]] = zext <2 x i8> [[TMP127]] to <2 x i32>
; CHECK-NEXT: [[TMP129:%.*]] = insertelement <2 x i32> [[TMP113]], i32 [[TMP9]], i32 0
-; CHECK-NEXT: [[TMP130:%.*]] = sub <2 x i32> [[TMP129]], [[TMP128]]
+; CHECK-NEXT: [[TMP130:%.*]] = sub <2 x i32> [[TMP129]], [[TMP131]]
; CHECK-NEXT: [[TMP135:%.*]] = insertelement <2 x i32> [[TMP130]], i32 [[TMP16]], i32 1
; CHECK-NEXT: [[TMP136:%.*]] = shl <2 x i32> [[TMP135]], <i32 16, i32 16>
; CHECK-NEXT: [[TMP110:%.*]] = shufflevector <2 x i32> [[TMP130]], <2 x i32> poison, <2 x i32> <i32 1, i32 poison>
@@ -214,23 +211,23 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; CHECK-NEXT: [[SUB47_1:%.*]] = sub i32 [[TMP138]], [[TMP171]]
; CHECK-NEXT: [[TMP140:%.*]] = shufflevector <2 x i32> [[TMP156]], <2 x i32> [[TMP105]], <2 x i32> <i32 1, i32 2>
; CHECK-NEXT: [[TMP153:%.*]] = shufflevector <2 x i32> [[TMP156]], <2 x i32> [[TMP92]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[TMP157:%.*]] = add <2 x i32> [[TMP140]], [[TMP153]]
+; CHECK-NEXT: [[TMP145:%.*]] = add <2 x i32> [[TMP140]], [[TMP153]]
; CHECK-NEXT: [[TMP143:%.*]] = shufflevector <2 x i32> [[TMP105]], <2 x i32> [[TMP155]], <2 x i32> <i32 3, i32 1>
; CHECK-NEXT: [[TMP144:%.*]] = shufflevector <2 x i32> [[TMP92]], <2 x i32> [[TMP155]], <2 x i32> <i32 2, i32 1>
-; CHECK-NEXT: [[TMP145:%.*]] = add <2 x i32> [[TMP143]], [[TMP144]]
-; CHECK-NEXT: [[TMP98:%.*]] = extractelement <2 x i32> [[TMP157]], i32 1
-; CHECK-NEXT: [[TMP146:%.*]] = extractelement <2 x i32> [[TMP145]], i32 1
+; CHECK-NEXT: [[TMP154:%.*]] = add <2 x i32> [[TMP143]], [[TMP144]]
+; CHECK-NEXT: [[TMP98:%.*]] = extractelement <2 x i32> [[TMP145]], i32 1
+; CHECK-NEXT: [[TMP146:%.*]] = extractelement <2 x i32> [[TMP154]], i32 1
; CHECK-NEXT: [[SHR_I54_1:%.*]] = lshr i32 [[TMP146]], 15
; CHECK-NEXT: [[AND_I55_1:%.*]] = and i32 [[SHR_I54_1]], 65537
; CHECK-NEXT: [[MUL_I56_1:%.*]] = mul i32 [[AND_I55_1]], 65535
-; CHECK-NEXT: [[TMP164:%.*]] = add <2 x i32> [[TMP145]], [[TMP157]]
-; CHECK-NEXT: [[TMP141:%.*]] = sub <2 x i32> [[TMP157]], [[TMP145]]
+; CHECK-NEXT: [[TMP164:%.*]] = add <2 x i32> [[TMP154]], [[TMP145]]
+; CHECK-NEXT: [[TMP141:%.*]] = sub <2 x i32> [[TMP145]], [[TMP154]]
; CHECK-NEXT: [[TMP165:%.*]] = insertelement <2 x i32> [[TMP101]], i32 [[SUB47_1]], i32 0
; CHECK-NEXT: [[TMP151:%.*]] = shufflevector <2 x i32> [[TMP101]], <2 x i32> poison, <2 x i32> <i32 poison, i32 0>
; CHECK-NEXT: [[TMP152:%.*]] = insertelement <2 x i32> [[TMP151]], i32 [[SUB45_1]], i32 0
; CHECK-NEXT: [[TMP180:%.*]] = add <2 x i32> [[TMP165]], [[TMP152]]
-; CHECK-NEXT: [[TMP154:%.*]] = sub <2 x i32> [[TMP152]], [[TMP165]]
-; CHECK-NEXT: [[TMP166:%.*]] = extractelement <2 x i32> [[TMP145]], i32 0
+; CHECK-NEXT: [[TMP157:%.*]] = sub <2 x i32> [[TMP152]], [[TMP165]]
+; CHECK-NEXT: [[TMP166:%.*]] = extractelement <2 x i32> [[TMP154]], i32 0
; CHECK-NEXT: [[SHR_I54:%.*]] = lshr i32 [[TMP166]], 15
; CHECK-NEXT: [[AND_I55:%.*]] = and i32 [[SHR_I54]], 65537
; CHECK-NEXT: [[MUL_I56:%.*]] = mul i32 [[AND_I55]], 65535
@@ -297,17 +294,17 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; CHECK-NEXT: [[XOR_I58_1:%.*]] = xor i32 [[ADD_I57_1]], [[SUB47_1]]
; CHECK-NEXT: [[ADD_I62_1:%.*]] = add i32 [[MUL_I61_1]], [[SUB106_1]]
; CHECK-NEXT: [[XOR_I63_1:%.*]] = xor i32 [[ADD_I62_1]], [[TMP99]]
-; CHECK-NEXT: [[TMP190:%.*]] = extractelement <2 x i32> [[TMP202]], i32 0
-; CHECK-NEXT: [[ADD110_1:%.*]] = add i32 [[TMP190]], [[XOR_I_1]]
+; CHECK-NEXT: [[TMP187:%.*]] = extractelement <2 x i32> [[TMP202]], i32 0
+; CHECK-NEXT: [[ADD110_1:%.*]] = add i32 [[TMP187]], [[XOR_I_1]]
; CHECK-NEXT: [[ADD112_1:%.*]] = add i32 [[ADD110_1]], [[XOR_I58_1]]
; CHECK-NEXT: [[ADD113_1:%.*]] = add i32 [[ADD112_1]], [[XOR_I63_1]]
-; CHECK-NEXT: [[TMP191:%.*]] = extractelement <2 x i32> [[TMP141]], i32 0
-; CHECK-NEXT: [[TMP205:%.*]] = extractelement <2 x i32> [[TMP141]], i32 1
-; CHECK-NEXT: [[ADD78_2:%.*]] = add i32 [[TMP191]], [[TMP205]]
+; CHECK-NEXT: [[TMP188:%.*]] = extractelement <2 x i32> [[TMP141]], i32 0
+; CHECK-NEXT: [[TMP190:%.*]] = extractelement <2 x i32> [[TMP141]], i32 1
+; CHECK-NEXT: [[ADD78_2:%.*]] = add i32 [[TMP188]], [[TMP190]]
; CHECK-NEXT: [[TMP196:%.*]] = insertelement <2 x i32> [[TMP141]], i32 [[SUB51_2]], i32 0
; CHECK-NEXT: [[TMP194:%.*]] = shufflevector <2 x i32> [[TMP141]], <2 x i32> poison, <2 x i32> <i32 poison, i32 0>
; CHECK-NEXT: [[TMP195:%.*]] = insertelement <2 x i32> [[TMP194]], i32 [[SUB51_3]], i32 0
-; CHECK-NEXT: [[TMP206:%.*]] = sub <2 x i32> [[TMP196]], [[TMP195]]
+; CHECK-NEXT: [[TMP208:%.*]] = sub <2 x i32> [[TMP196]], [[TMP195]]
; CHECK-NEXT: [[TMP201:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_2]], i32 0
; CHECK-NEXT: [[TMP198:%.*]] = shufflevector <2 x i32> [[TMP201]], <2 x i32> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP199:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_6]], i32 0
@@ -315,8 +312,8 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; CHECK-NEXT: [[TMP225:%.*]] = add <2 x i32> [[TMP198]], [[TMP200]]
; CHECK-NEXT: [[TMP226:%.*]] = sub <2 x i32> [[TMP198]], [[TMP200]]
; CHECK-NEXT: [[TMP227:%.*]] = shufflevector <2 x i32> [[TMP225]], <2 x i32> [[TMP226]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP204:%.*]] = extractelement <2 x i32> [[TMP206]], i32 0
-; CHECK-NEXT: [[TMP212:%.*]] = extractelement <2 x i32> [[TMP206]], i32 1
+; CHECK-NEXT: [[TMP204:%.*]] = extractelement <2 x i32> [[TMP208]], i32 0
+; CHECK-NEXT: [[TMP212:%.*]] = extractelement <2 x i32> [[TMP208]], i32 1
; CHECK-NEXT: [[ADD105_2:%.*]] = add i32 [[TMP204]], [[TMP212]]
; CHECK-NEXT: [[SUB106_2:%.*]] = sub i32 [[TMP212]], [[TMP204]]
; CHECK-NEXT: [[ADD_I52_2:%.*]] = add i32 [[MUL_I51_2]], [[ADD105_2]]
@@ -329,13 +326,13 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; CHECK-NEXT: [[ADD_I62_2:%.*]] = add i32 [[MUL_I61_2]], [[SUB106_2]]
; CHECK-NEXT: [[XOR_I63_2:%.*]] = xor i32 [[ADD_I62_2]], [[TMP98]]
; CHECK-NEXT: [[ADD108_2:%.*]] = add i32 [[XOR_I53_2]], [[ADD113_1]]
-; CHECK-NEXT: [[TMP208:%.*]] = extractelement <2 x i32> [[TMP213]], i32 0
-; CHECK-NEXT: [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[TMP208]]
-; CHECK-NEXT: [[TMP209:%.*]] = extractelement <2 x i32> [[TMP213]], i32 1
-; CHECK-NEXT: [[ADD112_2:%.*]] = add i32 [[ADD110_2]], [[TMP209]]
+; CHECK-NEXT: [[TMP205:%.*]] = extractelement <2 x i32> [[TMP213]], i32 0
+; CHECK-NEXT: [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[TMP205]]
+; CHECK-NEXT: [[TMP206:%.*]] = extractelement <2 x i32> [[TMP213]], i32 1
+; CHECK-NEXT: [[ADD112_2:%.*]] = add i32 [[ADD110_2]], [[TMP206]]
; CHECK-NEXT: [[ADD113_2:%.*]] = add i32 [[ADD112_2]], [[XOR_I63_2]]
-; CHECK-NEXT: [[SUB59_1:%.*]] = extractelement <2 x i32> [[TMP154]], i32 0
-; CHECK-NEXT: [[SUB59:%.*]] = extractelement <2 x i32> [[TMP154]], i32 1
+; CHECK-NEXT: [[SUB59_1:%.*]] = extractelement <2 x i32> [[TMP157]], i32 0
+; CHECK-NEXT: [[SUB59:%.*]] = extractelement <2 x i32> [[TMP157]], i32 1
; CHECK-NEXT: [[ADD94_3:%.*]] = add i32 [[SUB59_1]], [[SUB59]]
; CHECK-NEXT: [[SUB86_3:%.*]] = sub i32 [[SUB59]], [[SUB59_1]]
; CHECK-NEXT: [[TMP223:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_3]], i32 0
@@ -360,10 +357,10 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; CHECK-NEXT: [[ADD_I62_3:%.*]] = add i32 [[MUL_I61_3]], [[SUB106_3]]
; CHECK-NEXT: [[XOR_I63_3:%.*]] = xor i32 [[ADD_I62_3]], [[CONV33]]
; CHECK-NEXT: [[ADD108_3:%.*]] = add i32 [[XOR_I53_3]], [[ADD113_2]]
-; CHECK-NEXT: [[TMP228:%.*]] = extractelement <2 x i32> [[TMP234]], i32 0
-; CHECK-NEXT: [[ADD110_3:%.*]] = add i32 [[ADD108_3]], [[TMP228]]
-; CHECK-NEXT: [[TMP229:%.*]] = extractelement <2 x i32> [[TMP234]], i32 1
-; CHECK-NEXT: [[ADD112_3:%.*]] = add i32 [[ADD110_3]], [[TMP229]]
+; CHECK-NEXT: [[TMP221:%.*]] = extractelement <2 x i32> [[TMP234]], i32 0
+; CHECK-NEXT: [[ADD110_3:%.*]] = add i32 [[ADD108_3]], [[TMP221]]
+; CHECK-NEXT: [[TMP222:%.*]] = extractelement <2 x i32> [[TMP234]], i32 1
+; CHECK-NEXT: [[ADD112_3:%.*]] = add i32 [[ADD110_3]], [[TMP222]]
; CHECK-NEXT: [[ADD113_3:%.*]] = add i32 [[ADD112_3]], [[XOR_I63_3]]
; CHECK-NEXT: ret i32 [[ADD113_3]]
;
@@ -423,82 +420,81 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; THR15-NEXT: [[TMP24:%.*]] = load <2 x i8>, ptr [[ARRAYIDX3_2]], align 1
; THR15-NEXT: [[TMP28:%.*]] = zext <2 x i8> [[TMP24]] to <2 x i32>
; THR15-NEXT: [[TMP30:%.*]] = load <2 x i8>, ptr [[ARRAYIDX5_2]], align 1
-; THR15-NEXT: [[TMP47:%.*]] = zext <2 x i8> [[TMP30]] to <2 x i32>
-; THR15-NEXT: [[TMP13:%.*]] = sub <2 x i32> [[TMP28]], [[TMP47]]
+; THR15-NEXT: [[TMP44:%.*]] = zext <2 x i8> [[TMP30]] to <2 x i32>
+; THR15-NEXT: [[TMP13:%.*]] = sub <2 x i32> [[TMP28]], [[TMP44]]
; THR15-NEXT: [[TMP14:%.*]] = shl <2 x i32> [[TMP13]], <i32 16, i32 16>
; THR15-NEXT: [[TMP15:%.*]] = add <2 x i32> [[TMP14]], [[TMP23]]
; THR15-NEXT: [[ARRAYIDX20_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 2
; THR15-NEXT: [[ARRAYIDX22_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 2
; THR15-NEXT: [[ARRAYIDX25_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 6
; THR15-NEXT: [[ARRAYIDX27_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 6
-; THR15-NEXT: [[TMP49:%.*]] = load <2 x i8>, ptr [[ARRAYIDX20_2]], align 1
-; THR15-NEXT: [[TMP59:%.*]] = zext <2 x i8> [[TMP49]] to <2 x i32>
+; THR15-NEXT: [[TMP51:%.*]] = load <2 x i8>, ptr [[ARRAYIDX20_2]], align 1
+; THR15-NEXT: [[TMP32:%.*]] = zext <2 x i8> [[TMP51]] to <2 x i32>
; THR15-NEXT: [[TMP33:%.*]] = load <2 x i8>, ptr [[ARRAYIDX22_2]], align 1
-; THR15-NEXT: [[TMP78:%.*]] = zext <2 x i8> [[TMP33]] to <2 x i32>
-; THR15-NEXT: [[TMP35:%.*]] = sub <2 x i32> [[TMP59]], [[TMP78]]
+; THR15-NEXT: [[TMP34:%.*]] = zext <2 x i8> [[TMP33]] to <2 x i32>
+; THR15-NEXT: [[TMP35:%.*]] = sub <2 x i32> [[TMP32]], [[TMP34]]
; THR15-NEXT: [[TMP36:%.*]] = load <2 x i8>, ptr [[ARRAYIDX25_2]], align 1
-; THR15-NEXT: [[TMP80:%.*]] = zext <2 x i8> [[TMP36]] to <2 x i32>
+; THR15-NEXT: [[TMP52:%.*]] = zext <2 x i8> [[TMP36]] to <2 x i32>
; THR15-NEXT: [[TMP38:%.*]] = load <2 x i8>, ptr [[ARRAYIDX27_2]], align 1
; THR15-NEXT: [[TMP39:%.*]] = zext <2 x i8> [[TMP38]] to <2 x i32>
-; THR15-NEXT: [[TMP25:%.*]] = sub <2 x i32> [[TMP80]], [[TMP39]]
+; THR15-NEXT: [[TMP25:%.*]] = sub <2 x i32> [[TMP52]], [[TMP39]]
; THR15-NEXT: [[TMP26:%.*]] = shl <2 x i32> [[TMP25]], <i32 16, i32 16>
; THR15-NEXT: [[TMP27:%.*]] = add <2 x i32> [[TMP26]], [[TMP35]]
-; THR15-NEXT: [[TMP83:%.*]] = extractelement <2 x i32> [[TMP15]], i32 0
+; THR15-NEXT: [[TMP68:%.*]] = extractelement <2 x i32> [[TMP15]], i32 0
; THR15-NEXT: [[TMP29:%.*]] = extractelement <2 x i32> [[TMP15]], i32 1
-; THR15-NEXT: [[ADD44_2:%.*]] = add i32 [[TMP29]], [[TMP83]]
-; THR15-NEXT: [[SUB45_2:%.*]] = sub i32 [[TMP83]], [[TMP29]]
-; THR15-NEXT: [[TMP87:%.*]] = extractelement <2 x i32> [[TMP27]], i32 0
+; THR15-NEXT: [[ADD44_2:%.*]] = add i32 [[TMP29]], [[TMP68]]
+; THR15-NEXT: [[SUB45_2:%.*]] = sub i32 [[TMP68]], [[TMP29]]
+; THR15-NEXT: [[TMP45:%.*]] = extractelement <2 x i32> [[TMP27]], i32 0
; THR15-NEXT: [[TMP31:%.*]] = extractelement <2 x i32> [[TMP27]], i32 1
-; THR15-NEXT: [[ADD46_2:%.*]] = add i32 [[TMP31]], [[TMP87]]
-; THR15-NEXT: [[SUB47_2:%.*]] = sub i32 [[TMP87]], [[TMP31]]
+; THR15-NEXT: [[ADD46_2:%.*]] = add i32 [[TMP31]], [[TMP45]]
+; THR15-NEXT: [[SUB47_2:%.*]] = sub i32 [[TMP45]], [[TMP31]]
; THR15-NEXT: [[ADD48_2:%.*]] = add i32 [[ADD46_2]], [[ADD44_2]]
-; THR15-NEXT: [[SUB51_2:%.*]] = sub i32 [[ADD44_2]], [[ADD46_2]]
; THR15-NEXT: [[ADD55_2:%.*]] = add i32 [[SUB47_2]], [[SUB45_2]]
-; THR15-NEXT: [[SUB59_2:%.*]] = sub i32 [[SUB45_2]], [[SUB47_2]]
; THR15-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr i8, ptr null, i64 4
; THR15-NEXT: [[ARRAYIDX5_3:%.*]] = getelementptr i8, ptr null, i64 4
-; THR15-NEXT: [[TMP32:%.*]] = load <2 x i8>, ptr null, align 1
-; THR15-NEXT: [[TMP48:%.*]] = zext <2 x i8> [[TMP32]] to <2 x i32>
-; THR15-NEXT: [[TMP34:%.*]] = load <2 x i8>, ptr null, align 1
-; THR15-NEXT: [[TMP50:%.*]] = zext <2 x i8> [[TMP34]] to <2 x i32>
+; THR15-NEXT: [[ARRAYIDX8_3:%.*]] = getelementptr i8, ptr null, i64 1
+; THR15-NEXT: [[ARRAYIDX10_3:%.*]] = getelementptr i8, ptr null, i64 1
+; THR15-NEXT: [[ARRAYIDX15_3:%.*]] = getelementptr i8, ptr null, i64 5
+; THR15-NEXT: [[TMP47:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; THR15-NEXT: [[TMP48:%.*]] = zext <2 x i8> [[TMP47]] to <2 x i32>
+; THR15-NEXT: [[TMP49:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; THR15-NEXT: [[TMP50:%.*]] = zext <2 x i8> [[TMP49]] to <2 x i32>
; THR15-NEXT: [[TMP93:%.*]] = sub <2 x i32> [[TMP48]], [[TMP50]]
; THR15-NEXT: [[TMP37:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX3_3]], i64 -4, <2 x i1> <i1 true, i1 true>, i32 2)
; THR15-NEXT: [[TMP53:%.*]] = zext <2 x i8> [[TMP37]] to <2 x i32>
-; THR15-NEXT: [[TMP54:%.*]] = load <2 x i8>, ptr [[ARRAYIDX5_3]], align 1
+; THR15-NEXT: [[TMP54:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
; THR15-NEXT: [[TMP55:%.*]] = zext <2 x i8> [[TMP54]] to <2 x i32>
; THR15-NEXT: [[TMP41:%.*]] = sub <2 x i32> [[TMP53]], [[TMP55]]
; THR15-NEXT: [[TMP42:%.*]] = shl <2 x i32> [[TMP41]], <i32 16, i32 16>
; THR15-NEXT: [[TMP43:%.*]] = add <2 x i32> [[TMP42]], [[TMP93]]
-; THR15-NEXT: [[ARRAYIDX20_3:%.*]] = getelementptr i8, ptr null, i64 2
-; THR15-NEXT: [[ARRAYIDX22_3:%.*]] = getelementptr i8, ptr null, i64 2
-; THR15-NEXT: [[TMP44:%.*]] = load i8, ptr null, align 1
-; THR15-NEXT: [[ARRAYIDX27_3:%.*]] = getelementptr i8, ptr null, i64 6
-; THR15-NEXT: [[TMP45:%.*]] = load i8, ptr null, align 1
-; THR15-NEXT: [[TMP61:%.*]] = load <2 x i8>, ptr [[ARRAYIDX20_3]], align 1
-; THR15-NEXT: [[TMP98:%.*]] = zext <2 x i8> [[TMP61]] to <2 x i32>
-; THR15-NEXT: [[TMP99:%.*]] = load <2 x i8>, ptr [[ARRAYIDX22_3]], align 1
-; THR15-NEXT: [[TMP101:%.*]] = zext <2 x i8> [[TMP99]] to <2 x i32>
-; THR15-NEXT: [[TMP65:%.*]] = sub <2 x i32> [[TMP98]], [[TMP101]]
-; THR15-NEXT: [[TMP51:%.*]] = insertelement <2 x i8> poison, i8 [[TMP44]], i32 0
-; THR15-NEXT: [[TMP52:%.*]] = insertelement <2 x i8> [[TMP51]], i8 [[TMP45]], i32 1
-; THR15-NEXT: [[TMP68:%.*]] = zext <2 x i8> [[TMP52]] to <2 x i32>
-; THR15-NEXT: [[TMP102:%.*]] = load <2 x i8>, ptr [[ARRAYIDX27_3]], align 1
-; THR15-NEXT: [[TMP70:%.*]] = zext <2 x i8> [[TMP102]] to <2 x i32>
-; THR15-NEXT: [[TMP56:%.*]] = sub <2 x i32> [[TMP68]], [[TMP70]]
+; THR15-NEXT: [[TMP59:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX8_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; THR15-NEXT: [[TMP60:%.*]] = zext <2 x i8> [[TMP59]] to <2 x i32>
+; THR15-NEXT: [[TMP61:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX10_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; THR15-NEXT: [[TMP62:%.*]] = zext <2 x i8> [[TMP61]] to <2 x i32>
+; THR15-NEXT: [[TMP65:%.*]] = sub <2 x i32> [[TMP60]], [[TMP62]]
+; THR15-NEXT: [[TMP70:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> zeroinitializer, i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; THR15-NEXT: [[TMP87:%.*]] = zext <2 x i8> [[TMP70]] to <2 x i32>
+; THR15-NEXT: [[TMP94:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX15_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; THR15-NEXT: [[TMP96:%.*]] = zext <2 x i8> [[TMP94]] to <2 x i32>
+; THR15-NEXT: [[TMP56:%.*]] = sub <2 x i32> [[TMP87]], [[TMP96]]
; THR15-NEXT: [[TMP57:%.*]] = shl <2 x i32> [[TMP56]], <i32 16, i32 16>
; THR15-NEXT: [[TMP58:%.*]] = add <2 x i32> [[TMP57]], [[TMP65]]
-; THR15-NEXT: [[TMP104:%.*]] = extractelement <2 x i32> [[TMP43]], i32 0
-; THR15-NEXT: [[TMP60:%.*]] = extractelement <2 x i32> [[TMP43]], i32 1
-; THR15-NEXT: [[ADD44_3:%.*]] = add i32 [[TMP60]], [[TMP104]]
-; THR15-NEXT: [[SUB45_3:%.*]] = sub i32 [[TMP104]], [[TMP60]]
-; THR15-NEXT: [[TMP76:%.*]] = extractelement <2 x i32> [[TMP58]], i32 0
-; THR15-NEXT: [[TMP62:%.*]] = extractelement <2 x i32> [[TMP58]], i32 1
-; THR15-NEXT: [[ADD46_3:%.*]] = add i32 [[TMP62]], [[TMP76]]
-; THR15-NEXT: [[SUB47_3:%.*]] = sub i32 [[TMP76]], [[TMP62]]
-; THR15-NEXT: [[ADD48_3:%.*]] = add i32 [[ADD46_3]], [[ADD44_3]]
-; THR15-NEXT: [[SUB51_3:%.*]] = sub i32 [[ADD44_3]], [[ADD46_3]]
-; THR15-NEXT: [[ADD55_3:%.*]] = add i32 [[SUB47_3]], [[SUB45_3]]
-; THR15-NEXT: [[SUB59_3:%.*]] = sub i32 [[SUB45_3]], [[SUB47_3]]
+; THR15-NEXT: [[TMP98:%.*]] = add <2 x i32> [[TMP58]], [[TMP43]]
+; THR15-NEXT: [[TMP101:%.*]] = sub <2 x i32> [[TMP43]], [[TMP58]]
+; THR15-NEXT: [[TMP102:%.*]] = extractelement <2 x i32> [[TMP98]], i32 0
+; THR15-NEXT: [[TMP104:%.*]] = extractelement <2 x i32> [[TMP98]], i32 1
+; THR15-NEXT: [[ADD48_3:%.*]] = add i32 [[TMP104]], [[TMP102]]
+; THR15-NEXT: [[TMP108:%.*]] = insertelement <2 x i32> [[TMP98]], i32 [[ADD44_2]], i32 1
+; THR15-NEXT: [[TMP76:%.*]] = shufflevector <2 x i32> [[TMP98]], <2 x i32> poison, <2 x i32> <i32 1, i32 poison>
+; THR15-NEXT: [[TMP109:%.*]] = insertelement <2 x i32> [[TMP76]], i32 [[ADD46_2]], i32 1
+; THR15-NEXT: [[TMP78:%.*]] = sub <2 x i32> [[TMP108]], [[TMP109]]
+; THR15-NEXT: [[TMP110:%.*]] = extractelement <2 x i32> [[TMP101]], i32 0
+; THR15-NEXT: [[TMP80:%.*]] = extractelement <2 x i32> [[TMP101]], i32 1
+; THR15-NEXT: [[ADD55_3:%.*]] = add i32 [[TMP80]], [[TMP110]]
+; THR15-NEXT: [[TMP81:%.*]] = insertelement <2 x i32> [[TMP101]], i32 [[SUB45_2]], i32 1
+; THR15-NEXT: [[TMP111:%.*]] = shufflevector <2 x i32> [[TMP101]], <2 x i32> poison, <2 x i32> <i32 1, i32 poison>
+; THR15-NEXT: [[TMP83:%.*]] = insertelement <2 x i32> [[TMP111]], i32 [[SUB47_2]], i32 1
+; THR15-NEXT: [[TMP112:%.*]] = sub <2 x i32> [[TMP81]], [[TMP83]]
; THR15-NEXT: [[ADD94:%.*]] = add i32 [[ADD48_3]], [[ADD48_2]]
; THR15-NEXT: [[SUB102:%.*]] = sub i32 [[ADD48_2]], [[ADD48_3]]
; THR15-NEXT: [[TMP63:%.*]] = extractelement <2 x i32> [[TMP48]], i32 0
@@ -518,39 +514,43 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; THR15-NEXT: [[SHR_I49_2:%.*]] = lshr i32 [[TMP64]], 15
; THR15-NEXT: [[AND_I50_2:%.*]] = and i32 [[SHR_I49_2]], 65537
; THR15-NEXT: [[MUL_I51_2:%.*]] = mul i32 [[AND_I50_2]], 65535
-; THR15-NEXT: [[ADD94_2:%.*]] = add i32 [[SUB51_3]], [[SUB51_2]]
-; THR15-NEXT: [[SUB102_2:%.*]] = sub i32 [[SUB51_2]], [[SUB51_3]]
+; THR15-NEXT: [[TMP115:%.*]] = extractelement <2 x i32> [[TMP78]], i32 0
+; THR15-NEXT: [[TMP116:%.*]] = extractelement <2 x i32> [[TMP78]], i32 1
+; THR15-NEXT: [[ADD94_2:%.*]] = add i32 [[TMP115]], [[TMP116]]
+; THR15-NEXT: [[SUB102_2:%.*]] = sub i32 [[TMP116]], [[TMP115]]
; THR15-NEXT: [[SHR_I49_3:%.*]] = lshr i32 [[CONV_1]], 15
; THR15-NEXT: [[AND_I50_3:%.*]] = and i32 [[SHR_I49_3]], 65537
; THR15-NEXT: [[MUL_I51_3:%.*]] = mul i32 [[AND_I50_3]], 65535
-; THR15-NEXT: [[ADD94_3:%.*]] = add i32 [[SUB59_3]], [[SUB59_2]]
-; THR15-NEXT: [[SUB102_3:%.*]] = sub i32 [[SUB59_2]], [[SUB59_3]]
+; THR15-NEXT: [[TMP117:%.*]] = extractelement <2 x i32> [[TMP112]], i32 0
+; THR15-NEXT: [[TMP131:%.*]] = extractelement <2 x i32> [[TMP112]], i32 1
+; THR15-NEXT: [[ADD94_3:%.*]] = add i32 [[TMP117]], [[TMP131]]
+; THR15-NEXT: [[SUB102_3:%.*]] = sub i32 [[TMP131]], [[TMP117]]
; THR15-NEXT: [[SHR_I49_4:%.*]] = lshr i32 [[CONV]], 15
; THR15-NEXT: [[AND_I50_4:%.*]] = and i32 [[SHR_I49_4]], 65537
; THR15-NEXT: [[MUL_I51_4:%.*]] = mul i32 [[AND_I50_4]], 65535
-; THR15-NEXT: [[TMP81:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8]], align 1
-; THR15-NEXT: [[TMP74:%.*]] = zext <2 x i8> [[TMP81]] to <2 x i32>
+; THR15-NEXT: [[TMP132:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8]], align 1
+; THR15-NEXT: [[TMP74:%.*]] = zext <2 x i8> [[TMP132]] to <2 x i32>
; THR15-NEXT: [[TMP67:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[PIX2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; THR15-NEXT: [[TMP107:%.*]] = zext <2 x i8> [[TMP67]] to <2 x i32>
+; THR15-NEXT: [[TMP133:%.*]] = zext <2 x i8> [[TMP67]] to <2 x i32>
; THR15-NEXT: [[TMP69:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; THR15-NEXT: [[TMP108:%.*]] = zext <2 x i8> [[TMP69]] to <2 x i32>
+; THR15-NEXT: [[TMP147:%.*]] = zext <2 x i8> [[TMP69]] to <2 x i32>
; THR15-NEXT: [[TMP71:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; THR15-NEXT: [[TMP109:%.*]] = zext <2 x i8> [[TMP71]] to <2 x i32>
-; THR15-NEXT: [[TMP72:%.*]] = sub <2 x i32> [[TMP108]], [[TMP109]]
+; THR15-NEXT: [[TMP99:%.*]] = zext <2 x i8> [[TMP71]] to <2 x i32>
+; THR15-NEXT: [[TMP72:%.*]] = sub <2 x i32> [[TMP147]], [[TMP99]]
; THR15-NEXT: [[TMP73:%.*]] = shl <2 x i32> [[TMP72]], <i32 16, i32 16>
; THR15-NEXT: [[TMP75:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX22]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; THR15-NEXT: [[TMP111:%.*]] = zext <2 x i8> [[TMP75]] to <2 x i32>
+; THR15-NEXT: [[TMP148:%.*]] = zext <2 x i8> [[TMP75]] to <2 x i32>
; THR15-NEXT: [[TMP82:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX25]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; THR15-NEXT: [[TMP94:%.*]] = zext <2 x i8> [[TMP82]] to <2 x i32>
+; THR15-NEXT: [[TMP149:%.*]] = zext <2 x i8> [[TMP82]] to <2 x i32>
; THR15-NEXT: [[TMP79:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX27]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; THR15-NEXT: [[TMP96:%.*]] = zext <2 x i8> [[TMP79]] to <2 x i32>
-; THR15-NEXT: [[TMP84:%.*]] = sub <2 x i32> [[TMP94]], [[TMP96]]
+; THR15-NEXT: [[TMP107:%.*]] = zext <2 x i8> [[TMP79]] to <2 x i32>
+; THR15-NEXT: [[TMP84:%.*]] = sub <2 x i32> [[TMP149]], [[TMP107]]
; THR15-NEXT: [[TMP85:%.*]] = shl <2 x i32> [[TMP84]], <i32 16, i32 16>
; THR15-NEXT: [[TMP86:%.*]] = insertelement <2 x i32> [[TMP74]], i32 [[CONV33]], i32 1
-; THR15-NEXT: [[TMP100:%.*]] = sub <2 x i32> [[TMP86]], [[TMP111]]
+; THR15-NEXT: [[TMP100:%.*]] = sub <2 x i32> [[TMP86]], [[TMP148]]
; THR15-NEXT: [[TMP88:%.*]] = add <2 x i32> [[TMP85]], [[TMP100]]
; THR15-NEXT: [[TMP92:%.*]] = insertelement <2 x i32> [[TMP74]], i32 [[CONV]], i32 0
-; THR15-NEXT: [[TMP120:%.*]] = sub <2 x i32> [[TMP92]], [[TMP107]]
+; THR15-NEXT: [[TMP120:%.*]] = sub <2 x i32> [[TMP92]], [[TMP133]]
; THR15-NEXT: [[TMP95:%.*]] = add <2 x i32> [[TMP73]], [[TMP120]]
; THR15-NEXT: [[TMP97:%.*]] = shufflevector <2 x i32> [[TMP88]], <2 x i32> [[TMP95]], <2 x i32> <i32 0, i32 2>
; THR15-NEXT: [[TMP77:%.*]] = add <2 x i32> [[TMP88]], [[TMP95]]
@@ -559,50 +559,50 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; THR15-NEXT: [[TMP90:%.*]] = extractelement <2 x i32> [[TMP77]], i32 1
; THR15-NEXT: [[ADD48:%.*]] = add i32 [[TMP90]], [[TMP89]]
; THR15-NEXT: [[SUB51:%.*]] = sub i32 [[TMP89]], [[TMP90]]
-; THR15-NEXT: [[TMP110:%.*]] = extractelement <2 x i32> [[TMP91]], i32 0
+; THR15-NEXT: [[TMP151:%.*]] = extractelement <2 x i32> [[TMP91]], i32 0
; THR15-NEXT: [[SUB47:%.*]] = extractelement <2 x i32> [[TMP91]], i32 1
-; THR15-NEXT: [[ADD55:%.*]] = add i32 [[SUB47]], [[TMP110]]
+; THR15-NEXT: [[ADD55:%.*]] = add i32 [[SUB47]], [[TMP151]]
; THR15-NEXT: [[SHR_I59:%.*]] = lshr i32 [[TMP90]], 15
; THR15-NEXT: [[AND_I60:%.*]] = and i32 [[SHR_I59]], 65537
; THR15-NEXT: [[MUL_I61:%.*]] = mul i32 [[AND_I60]], 65535
; THR15-NEXT: [[SHR_I59_1:%.*]] = lshr i32 [[SUB47]], 15
; THR15-NEXT: [[AND_I60_1:%.*]] = and i32 [[SHR_I59_1]], 65537
; THR15-NEXT: [[MUL_I61_1:%.*]] = mul i32 [[AND_I60_1]], 65535
-; THR15-NEXT: [[TMP112:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8_1]], align 1
-; THR15-NEXT: [[TMP130:%.*]] = zext <2 x i8> [[TMP112]] to <2 x i32>
-; THR15-NEXT: [[TMP129:%.*]] = load <2 x i8>, ptr [[ADD_PTR644]], align 1
-; THR15-NEXT: [[TMP115:%.*]] = zext <2 x i8> [[TMP129]] to <2 x i32>
-; THR15-NEXT: [[TMP116:%.*]] = load <2 x i8>, ptr [[ARRAYIDX3_1]], align 1
-; THR15-NEXT: [[TMP117:%.*]] = zext <2 x i8> [[TMP116]] to <2 x i32>
-; THR15-NEXT: [[TMP131:%.*]] = load <2 x i8>, ptr [[ARRAYIDX5_1]], align 1
-; THR15-NEXT: [[TMP132:%.*]] = zext <2 x i8> [[TMP131]] to <2 x i32>
-; THR15-NEXT: [[TMP113:%.*]] = sub <2 x i32> [[TMP117]], [[TMP132]]
+; THR15-NEXT: [[TMP159:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8_1]], align 1
+; THR15-NEXT: [[TMP130:%.*]] = zext <2 x i8> [[TMP159]] to <2 x i32>
+; THR15-NEXT: [[TMP161:%.*]] = load <2 x i8>, ptr [[ADD_PTR644]], align 1
+; THR15-NEXT: [[TMP175:%.*]] = zext <2 x i8> [[TMP161]] to <2 x i32>
+; THR15-NEXT: [[TMP179:%.*]] = load <2 x i8>, ptr [[ARRAYIDX3_1]], align 1
+; THR15-NEXT: [[TMP128:%.*]] = zext <2 x i8> [[TMP179]] to <2 x i32>
+; THR15-NEXT: [[TMP129:%.*]] = load <2 x i8>, ptr [[ARRAYIDX5_1]], align 1
+; THR15-NEXT: [[TMP180:%.*]] = zext <2 x i8> [[TMP129]] to <2 x i32>
+; THR15-NEXT: [[TMP113:%.*]] = sub <2 x i32> [[TMP128]], [[TMP180]]
; THR15-NEXT: [[TMP114:%.*]] = shl <2 x i32> [[TMP113]], <i32 16, i32 16>
; THR15-NEXT: [[TMP103:%.*]] = shufflevector <2 x i32> [[TMP130]], <2 x i32> poison, <2 x i32> <i32 poison, i32 0>
; THR15-NEXT: [[TMP126:%.*]] = insertelement <2 x i32> [[TMP103]], i32 [[CONV_1]], i32 0
-; THR15-NEXT: [[TMP134:%.*]] = sub <2 x i32> [[TMP126]], [[TMP115]]
+; THR15-NEXT: [[TMP134:%.*]] = sub <2 x i32> [[TMP126]], [[TMP175]]
; THR15-NEXT: [[TMP121:%.*]] = add <2 x i32> [[TMP114]], [[TMP134]]
; THR15-NEXT: [[TMP145:%.*]] = shufflevector <2 x i8> [[TMP7]], <2 x i8> poison, <2 x i32> <i32 1, i32 poison>
; THR15-NEXT: [[TMP127:%.*]] = insertelement <2 x i8> [[TMP145]], i8 [[TMP3]], i32 1
-; THR15-NEXT: [[TMP128:%.*]] = zext <2 x i8> [[TMP127]] to <2 x i32>
+; THR15-NEXT: [[TMP139:%.*]] = zext <2 x i8> [[TMP127]] to <2 x i32>
; THR15-NEXT: [[TMP146:%.*]] = insertelement <2 x i32> [[TMP130]], i32 [[TMP9]], i32 0
-; THR15-NEXT: [[TMP106:%.*]] = sub <2 x i32> [[TMP146]], [[TMP128]]
+; THR15-NEXT: [[TMP106:%.*]] = sub <2 x i32> [[TMP146]], [[TMP139]]
; THR15-NEXT: [[TMP118:%.*]] = extractelement <2 x i32> [[TMP106]], i32 0
; THR15-NEXT: [[SHL30_1:%.*]] = shl i32 [[TMP118]], 16
; THR15-NEXT: [[TMP119:%.*]] = extractelement <2 x i32> [[TMP106]], i32 1
; THR15-NEXT: [[ADD31_1:%.*]] = add i32 [[SHL30_1]], [[TMP119]]
-; THR15-NEXT: [[TMP133:%.*]] = extractelement <2 x i32> [[TMP121]], i32 0
+; THR15-NEXT: [[TMP181:%.*]] = extractelement <2 x i32> [[TMP121]], i32 0
; THR15-NEXT: [[TMP125:%.*]] = extractelement <2 x i32> [[TMP121]], i32 1
-; THR15-NEXT: [[SUB45_1:%.*]] = sub i32 [[TMP133]], [[TMP125]]
+; THR15-NEXT: [[SUB45_1:%.*]] = sub i32 [[TMP181]], [[TMP125]]
; THR15-NEXT: [[TMP135:%.*]] = shufflevector <2 x i32> [[TMP121]], <2 x i32> poison, <2 x i32> <i32 1, i32 poison>
; THR15-NEXT: [[TMP136:%.*]] = insertelement <2 x i32> [[TMP135]], i32 [[ADD43_1]], i32 1
; THR15-NEXT: [[TMP137:%.*]] = insertelement <2 x i32> [[TMP121]], i32 [[ADD31_1]], i32 1
; THR15-NEXT: [[TMP138:%.*]] = add <2 x i32> [[TMP136]], [[TMP137]]
; THR15-NEXT: [[SUB47_1:%.*]] = sub i32 [[ADD31_1]], [[ADD43_1]]
-; THR15-NEXT: [[TMP139:%.*]] = extractelement <2 x i32> [[TMP138]], i32 0
+; THR15-NEXT: [[TMP150:%.*]] = extractelement <2 x i32> [[TMP138]], i32 0
; THR15-NEXT: [[TMP140:%.*]] = extractelement <2 x i32> [[TMP138]], i32 1
-; THR15-NEXT: [[ADD48_1:%.*]] = add i32 [[TMP140]], [[TMP139]]
-; THR15-NEXT: [[SUB51_1:%.*]] = sub i32 [[TMP139]], [[TMP140]]
+; THR15-NEXT: [[ADD48_1:%.*]] = add i32 [[TMP140]], [[TMP150]]
+; THR15-NEXT: [[SUB51_1:%.*]] = sub i32 [[TMP150]], [[TMP140]]
; THR15-NEXT: [[ADD55_1:%.*]] = add i32 [[SUB47_1]], [[SUB45_1]]
; THR15-NEXT: [[TMP141:%.*]] = shufflevector <2 x i32> [[TMP91]], <2 x i32> poison, <2 x i32> <i32 poison, i32 0>
; THR15-NEXT: [[TMP142:%.*]] = insertelement <2 x i32> [[TMP141]], i32 [[SUB45_1]], i32 0
@@ -673,15 +673,15 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; THR15-NEXT: [[ADD_I62_2:%.*]] = add i32 [[MUL_I61_2]], [[SUB106_2]]
; THR15-NEXT: [[XOR_I63_2:%.*]] = xor i32 [[ADD_I62_2]], [[TMP89]]
; THR15-NEXT: [[ADD108_2:%.*]] = add i32 [[XOR_I53_2]], [[ADD113_1]]
-; THR15-NEXT: [[TMP161:%.*]] = extractelement <2 x i32> [[TMP160]], i32 0
-; THR15-NEXT: [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[TMP161]]
+; THR15-NEXT: [[TMP182:%.*]] = extractelement <2 x i32> [[TMP160]], i32 0
+; THR15-NEXT: [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[TMP182]]
; THR15-NEXT: [[TMP162:%.*]] = extractelement <2 x i32> [[TMP160]], i32 1
; THR15-NEXT: [[ADD112_2:%.*]] = add i32 [[ADD110_2]], [[TMP162]]
; THR15-NEXT: [[ADD113_2:%.*]] = add i32 [[ADD112_2]], [[XOR_I63_2]]
-; THR15-NEXT: [[TMP159:%.*]] = extractelement <2 x i32> [[TMP144]], i32 0
+; THR15-NEXT: [[TMP183:%.*]] = extractelement <2 x i32> [[TMP144]], i32 0
; THR15-NEXT: [[TMP178:%.*]] = extractelement <2 x i32> [[TMP144]], i32 1
-; THR15-NEXT: [[ADD78_3:%.*]] = add i32 [[TMP159]], [[TMP178]]
-; THR15-NEXT: [[SUB86_3:%.*]] = sub i32 [[TMP178]], [[TMP159]]
+; THR15-NEXT: [[ADD78_3:%.*]] = add i32 [[TMP183]], [[TMP178]]
+; THR15-NEXT: [[SUB86_3:%.*]] = sub i32 [[TMP178]], [[TMP183]]
; THR15-NEXT: [[TMP163:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_3]], i32 0
; THR15-NEXT: [[TMP164:%.*]] = shufflevector <2 x i32> [[TMP163]], <2 x i32> poison, <2 x i32> zeroinitializer
; THR15-NEXT: [[TMP165:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_3]], i32 0
@@ -704,8 +704,8 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; THR15-NEXT: [[ADD_I62_3:%.*]] = add i32 [[MUL_I61_3]], [[SUB106_3]]
; THR15-NEXT: [[XOR_I63_3:%.*]] = xor i32 [[ADD_I62_3]], [[CONV33]]
; THR15-NEXT: [[ADD108_3:%.*]] = add i32 [[XOR_I53_3]], [[ADD113_2]]
-; THR15-NEXT: [[TMP175:%.*]] = extractelement <2 x i32> [[TMP174]], i32 0
-; THR15-NEXT: [[ADD110_3:%.*]] = add i32 [[ADD108_3]], [[TMP175]]
+; THR15-NEXT: [[TMP184:%.*]] = extractelement <2 x i32> [[TMP174]], i32 0
+; THR15-NEXT: [[ADD110_3:%.*]] = add i32 [[ADD108_3]], [[TMP184]]
; THR15-NEXT: [[TMP176:%.*]] = extractelement <2 x i32> [[TMP174]], i32 1
; THR15-NEXT: [[ADD112_3:%.*]] = add i32 [[ADD110_3]], [[TMP176]]
; THR15-NEXT: [[ADD113_3:%.*]] = add i32 [[ADD112_3]], [[XOR_I63_3]]
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/remarks-insert-into-small-vector.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/remarks-insert-into-small-vector.ll
index bb806be15c71ca..09612444afd205 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/remarks-insert-into-small-vector.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/remarks-insert-into-small-vector.ll
@@ -8,20 +8,17 @@
; YAML-NEXT: Function: test
; YAML-NEXT: Args:
; YAML-NEXT: - String: 'Stores SLP vectorized with cost '
-; YAML-NEXT: - Cost: '2'
+; YAML-NEXT: - Cost: '0'
; YAML-NEXT: - String: ' and with tree size '
-; YAML-NEXT: - TreeSize: '7'
+; YAML-NEXT: - TreeSize: '9'
define void @test() {
; CHECK-LABEL: define void @test(
; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr null, align 4
-; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr null, align 4
; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr null, align 4
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> <float poison, float 0.000000e+00>, float [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x float> [[TMP4]], float [[TMP2]], i32 1
+; CHECK-NEXT: [[TMP5:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> zeroinitializer, i32 4, <2 x i1> <i1 true, i1 true>, <2 x float> poison)
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> <float poison, float 0.000000e+00>, float [[TMP2]], i32 0
; CHECK-NEXT: [[TMP6:%.*]] = fcmp ogt <2 x float> [[TMP3]], [[TMP5]]
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i1> [[TMP6]], <2 x i1> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/reversed-strided-node-with-external-ptr.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/reversed-strided-node-with-external-ptr.ll
index 3fa42047162e45..9c1da08c64b7b7 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/reversed-strided-node-with-external-ptr.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/reversed-strided-node-with-external-ptr.ll
@@ -10,10 +10,10 @@ define void @test(ptr %a, i64 %0) {
; CHECK-NEXT: br label %[[BB:.*]]
; CHECK: [[BB]]:
; CHECK-NEXT: [[TMP3:%.*]] = or disjoint i64 [[TMP0]], 1
-; CHECK-NEXT: [[ARRAYIDX17_I28_1:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP3]]
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> poison, i64 [[TMP3]], i32 0
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> [[TMP4]], i64 0, i32 1
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr double, <2 x ptr> [[TMP2]], <2 x i64> [[TMP5]]
+; CHECK-NEXT: [[ARRAYIDX17_I28_1:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP3]]
; CHECK-NEXT: [[TMP7:%.*]] = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> [[TMP6]], i32 8, <2 x i1> <i1 true, i1 true>, <2 x double> poison)
; CHECK-NEXT: [[TMP8:%.*]] = load <2 x double>, ptr [[A]], align 8
; CHECK-NEXT: [[TMP9:%.*]] = load <2 x double>, ptr [[A]], align 8
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/scatter-vectorize-reversed.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/scatter-vectorize-reversed.ll
index 2daa3b58e5c3ac..98333c7b420cf0 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/scatter-vectorize-reversed.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/scatter-vectorize-reversed.ll
@@ -5,12 +5,12 @@ define <4 x i32> @test(<2 x i64> %v, ptr %p) {
; CHECK-LABEL: define <4 x i32> @test(
; CHECK-SAME: <2 x i64> [[V:%.*]], ptr [[P:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[V]], <2 x i64> poison, <2 x i32> <i32 1, i32 0>
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x ptr> poison, ptr [[P]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x ptr> [[TMP0]], <2 x ptr> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i16, <2 x ptr> [[TMP1]], <2 x i64> [[V]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i16, <2 x ptr> [[TMP1]], <2 x i64> [[TMP4]]
; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i16> @llvm.masked.gather.v2i16.v2p0(<2 x ptr> [[TMP2]], i32 2, <2 x i1> <i1 true, i1 true>, <2 x i16> poison)
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT: [[TMP7:%.*]] = zext <2 x i16> [[TMP4]] to <2 x i32>
+; CHECK-NEXT: [[TMP7:%.*]] = zext <2 x i16> [[TMP3]] to <2 x i32>
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> zeroinitializer, <4 x i32> [[TMP6]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
; CHECK-NEXT: ret <4 x i32> [[TMP5]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll b/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll
index 26c4d55436d22b..59b0352a825929 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll
@@ -24,7 +24,7 @@ define i32 @test(ptr noalias %p, ptr noalias %addr) {
; YAML-NEXT: - String: 'Vectorized horizontal reduction with cost '
; YAML-NEXT: - Cost: '-1'
; YAML-NEXT: - String: ' and with tree size '
- ; YAML-NEXT: - TreeSize: '7'
+ ; YAML-NEXT: - TreeSize: '8'
entry:
%off0.1 = getelementptr inbounds i32, ptr %addr, i32 1
%idx0 = load i32, ptr %off0.1, align 8
>From 651c220ab3e50fbb43568b8ae7c0c05dfd34a451 Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev at outlook.com>
Date: Thu, 26 Sep 2024 18:27:26 +0000
Subject: [PATCH 2/2] Fix formatting
Created using spr 1.3.5
---
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 62c77704d92eb5..875e5156915dbd 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -6968,8 +6968,7 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
// Try to vectorize postponed load entries, previously marked as gathered.
for (unsigned Idx : LoadEntriesToVectorize) {
const TreeEntry &E = *VectorizableTree[Idx];
- SmallVector<Value *> GatheredScalars(E.Scalars.begin(),
- E.Scalars.end());
+ SmallVector<Value *> GatheredScalars(E.Scalars.begin(), E.Scalars.end());
// Avoid reordering, if possible.
if (!E.ReorderIndices.empty()) {
// Build a mask out of the reorder indices and reorder scalars per this
More information about the llvm-commits
mailing list