[llvm] [SLP]Improve masked loads vectorization, attempting gathered loads (PR #110151)

Alexey Bataev via llvm-commits llvm-commits at lists.llvm.org
Tue Oct 8 09:07:57 PDT 2024


https://github.com/alexey-bataev updated https://github.com/llvm/llvm-project/pull/110151

>From e4768e045c2f56ccfdf0cbeeaacb0cacd754fb22 Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev at outlook.com>
Date: Thu, 26 Sep 2024 18:09:51 +0000
Subject: [PATCH 1/2] =?UTF-8?q?[=F0=9D=98=80=F0=9D=97=BD=F0=9D=97=BF]=20in?=
 =?UTF-8?q?itial=20version?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Created using spr 1.3.5
---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 142 +++++--
 .../SLPVectorizer/RISCV/complex-loads.ll      | 358 +++++++++---------
 .../RISCV/remarks-insert-into-small-vector.ll |  11 +-
 ...reversed-strided-node-with-external-ptr.ll |   2 +-
 .../RISCV/scatter-vectorize-reversed.ll       |   6 +-
 .../X86/remark_gather-load-redux-cost.ll      |   2 +-
 6 files changed, 306 insertions(+), 215 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 53d7ae606ffeea..62c77704d92eb5 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1368,6 +1368,8 @@ class BoUpSLP {
     MustGather.clear();
     NonScheduledFirst.clear();
     EntryToLastInstruction.clear();
+    LoadEntriesToVectorize.clear();
+    IsGraphTransformMode = false;
     GatheredLoadsEntriesFirst = NoGatheredLoads;
     ExternalUses.clear();
     ExternalUsesAsOriginalScalar.clear();
@@ -3610,6 +3612,13 @@ class BoUpSLP {
       DenseMap<Value *, SmallPtrSet<const TreeEntry *, 4>>;
   ValueToGatherNodesMap ValueToGatherNodes;
 
+  /// Indices of load entries that could be vectorized as strided or masked
+  /// gather loads, but are first attempted as contiguous loads.
+  SetVector<unsigned> LoadEntriesToVectorize;
+
+  /// True while the graph node transformation mode is on.
+  bool IsGraphTransformMode = false;
+
   /// The index of the first gathered load entry in the VectorizeTree.
   constexpr static int NoGatheredLoads = -1;
   int GatheredLoadsEntriesFirst = NoGatheredLoads;
@@ -4612,17 +4621,15 @@ static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
   if (getUnderlyingObject(Ptr1) != getUnderlyingObject(Ptr2))
     return false;
   auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
-  if (!GEP1)
-    return false;
   auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
-  if (!GEP2)
-    return false;
-  return GEP1->getNumOperands() == 2 && GEP2->getNumOperands() == 2 &&
-         ((isConstant(GEP1->getOperand(1)) &&
-           isConstant(GEP2->getOperand(1))) ||
+  return (!GEP1 || GEP1->getNumOperands() == 2) &&
+         (!GEP2 || GEP2->getNumOperands() == 2) &&
+         (((!GEP1 || isConstant(GEP1->getOperand(1))) &&
+           (!GEP2 || isConstant(GEP2->getOperand(1)))) ||
           !CompareOpcodes ||
-          getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)
-              .getOpcode());
+          (GEP1 && GEP2 &&
+           getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)
+               .getOpcode()));
 }
 
 /// Calculates minimal alignment as a common alignment.
@@ -5112,10 +5119,10 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
       L && Sz > 2 && static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {
                        return L->isLoopInvariant(V);
                      })) <= Sz / 2;
-  if (ProfitableGatherPointers || all_of(PointerOps, [IsSorted](Value *P) {
+  if (ProfitableGatherPointers || all_of(PointerOps, [](Value *P) {
         auto *GEP = dyn_cast<GetElementPtrInst>(P);
-        return (IsSorted && !GEP && doesNotNeedToBeScheduled(P)) ||
-               (GEP && GEP->getNumOperands() == 2 &&
+        return (!GEP && doesNotNeedToBeScheduled(P)) ||
+               (GEP->getNumOperands() == 2 &&
                 isa<Constant, Instruction>(GEP->getOperand(1)));
       })) {
     // Check if potential masked gather can be represented as series
@@ -6607,6 +6614,12 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
     ArrayRef<SmallVector<std::pair<LoadInst *, int>>> GatheredLoads) {
   GatheredLoadsEntriesFirst = VectorizableTree.size();
 
+  SmallVector<SmallPtrSet<const Value *, 4>> LoadSetsToVectorize(
+      LoadEntriesToVectorize.size());
+  for (auto [Idx, Set] : zip(LoadEntriesToVectorize, LoadSetsToVectorize))
+    Set.insert(VectorizableTree[Idx]->Scalars.begin(),
+               VectorizableTree[Idx]->Scalars.end());
+
   // Sort loads by distance.
   auto LoadSorter = [](const std::pair<LoadInst *, int> &L1,
                        const std::pair<LoadInst *, int> &L2) {
@@ -6864,8 +6877,42 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
                   }
                 }
               }
+              // Cannot represent the loads as consecutive vectorizable nodes -
+              // just exit.
+              unsigned ConsecutiveNodesSize = 0;
+              if (!LoadEntriesToVectorize.empty() &&
+                  any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
+                         [&, Slice = Slice](const auto &P) {
+                           const auto *It = find_if(Slice, [&](Value *V) {
+                             return std::get<1>(P).contains(V);
+                           });
+                           if (It == Slice.end())
+                             return false;
+                           ArrayRef<Value *> VL =
+                               VectorizableTree[std::get<0>(P)]->Scalars;
+                           ConsecutiveNodesSize += VL.size();
+                           unsigned Start = std::distance(Slice.begin(), It);
+                           unsigned Sz = Slice.size() - Start;
+                           return Sz < VL.size() ||
+                                  Slice.slice(std::distance(Slice.begin(), It),
+                                              VL.size()) != VL;
+                         }))
+                continue;
               // Try to build long masked gather loads.
               UserMaxVF = bit_ceil(UserMaxVF);
+              if (any_of(seq<unsigned>(Slice.size() / UserMaxVF),
+                         [&, Slice = Slice](unsigned Idx) {
+                           OrdersType Order;
+                           SmallVector<Value *> PointerOps;
+                           return canVectorizeLoads(
+                                      Slice.slice(Idx * UserMaxVF, UserMaxVF),
+                                      Slice[Idx * UserMaxVF], Order,
+                                      PointerOps) ==
+                                  LoadsState::ScatterVectorize;
+                         }))
+                UserMaxVF = MaxVF;
+              if (Slice.size() != ConsecutiveNodesSize)
+                MaxVF = std::min<unsigned>(MaxVF, UserMaxVF);
             }
             for (unsigned VF = MaxVF; VF >= 2; VF /= 2) {
               bool IsVectorized = true;
@@ -6874,6 +6921,16 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
                     Slice.slice(I, std::min(VF, E - I));
                 if (getTreeEntry(SubSlice.front()))
                   continue;
+                // Skip a subslice that belongs to a to-be-vectorized entry
+                // but does not match that entry's scalars exactly.
+                if (any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
+                           [&](const auto &P) {
+                             return !SubSlice.equals(
+                                        VectorizableTree[std::get<0>(P)]
+                                            ->Scalars) &&
+                                    set_is_subset(SubSlice, std::get<1>(P));
+                           }))
+                  continue;
                 unsigned Sz = VectorizableTree.size();
                 buildTree_rec(SubSlice, 0, EdgeInfo());
                 if (Sz == VectorizableTree.size()) {
@@ -6908,6 +6965,21 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
     // Final attempt to vectorize non-vectorized loads.
     (void)ProcessGatheredLoads(FinalGatheredLoads, /*Final=*/true);
   }
+  // Try to vectorize postponed load entries, previously marked as gathered.
+  for (unsigned Idx : LoadEntriesToVectorize) {
+    const TreeEntry &E = *VectorizableTree[Idx];
+    SmallVector<Value *> GatheredScalars(E.Scalars.begin(),
+                                         E.Scalars.end());
+    // Avoid reordering, if possible.
+    if (!E.ReorderIndices.empty()) {
+      // Build a mask out of the reorder indices and reorder scalars per this
+      // mask.
+      SmallVector<int> ReorderMask;
+      inversePermutation(E.ReorderIndices, ReorderMask);
+      reorderScalars(GatheredScalars, ReorderMask);
+    }
+    buildTree_rec(GatheredScalars, 0, EdgeInfo());
+  }
   // If no new entries created, consider it as no gathered loads entries must be
   // handled.
   if (static_cast<unsigned>(GatheredLoadsEntriesFirst) ==
@@ -7220,6 +7292,11 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
     case LoadsState::Vectorize:
       return TreeEntry::Vectorize;
     case LoadsState::ScatterVectorize:
+      if (!IsGraphTransformMode && !VectorizableTree.empty()) {
+        // Delay slow vectorized nodes for better vectorization attempts.
+        LoadEntriesToVectorize.insert(VectorizableTree.size());
+        return TreeEntry::NeedToGather;
+      }
       return TreeEntry::ScatterVectorize;
     case LoadsState::StridedVectorize:
       return TreeEntry::StridedVectorize;
@@ -9057,6 +9134,17 @@ getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
 void BoUpSLP::transformNodes() {
   constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
   BaseGraphSize = VectorizableTree.size();
+  // Turn graph transforming mode on now and back off when the scope exits.
+  class GraphTransformModeRAAI {
+    bool &SavedIsGraphTransformMode;
+
+  public:
+    GraphTransformModeRAAI(bool &IsGraphTransformMode)
+        : SavedIsGraphTransformMode(IsGraphTransformMode) {
+      IsGraphTransformMode = true;
+    }
+    ~GraphTransformModeRAAI() { SavedIsGraphTransformMode = false; }
+  } TransformContext(IsGraphTransformMode);
   // Operands are profitable if they are:
   // 1. At least one constant
   // or
@@ -9089,7 +9177,7 @@ void BoUpSLP::transformNodes() {
       unsigned MinVF = getMinVF(2 * Sz);
       // Do not try partial vectorization for small nodes (<= 2), nodes with the
       // same opcode and same parent block or all constants.
-      if (VL.size() <= 2 ||
+      if (VL.size() <= 2 || LoadEntriesToVectorize.contains(Idx) ||
           !(!E.getOpcode() || E.getOpcode() == Instruction::Load ||
             E.isAltShuffle() || !allSameBlock(VL)) ||
           allConstant(VL) || isSplat(VL))
@@ -9187,6 +9275,8 @@ void BoUpSLP::transformNodes() {
             continue;
           }
           unsigned PrevSize = VectorizableTree.size();
+          [[maybe_unused]] unsigned PrevEntriesSize =
+              LoadEntriesToVectorize.size();
           buildTree_rec(Slice, 0, EdgeInfo(&E, UINT_MAX));
           if (PrevSize + 1 == VectorizableTree.size() &&
               VectorizableTree[PrevSize]->isGather() &&
@@ -9194,6 +9284,8 @@ void BoUpSLP::transformNodes() {
                   Instruction::ExtractElement &&
               !isSplat(Slice)) {
             VectorizableTree.pop_back();
+            assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
+                   "LoadEntriesToVectorize expected to remain the same");
             continue;
           }
           AddCombinedNode(PrevSize, Cnt);
@@ -9279,17 +9371,19 @@ void BoUpSLP::transformNodes() {
     }
   }
 
-  // Single load node - exit.
-  if (VectorizableTree.size() <= 1 &&
-      VectorizableTree.front()->getOpcode() == Instruction::Load)
-    return;
-  // Small graph with small VF - exit.
-  constexpr unsigned SmallTree = 3;
-  constexpr unsigned SmallVF = 2;
-  if ((VectorizableTree.size() <= SmallTree &&
-       VectorizableTree.front()->Scalars.size() == SmallVF) ||
-      (VectorizableTree.size() <= 2 && UserIgnoreList))
-    return;
+  if (LoadEntriesToVectorize.empty()) {
+    // Single load node - exit.
+    if (VectorizableTree.size() <= 1 &&
+        VectorizableTree.front()->getOpcode() == Instruction::Load)
+      return;
+    // Small graph with small VF - exit.
+    constexpr unsigned SmallTree = 3;
+    constexpr unsigned SmallVF = 2;
+    if ((VectorizableTree.size() <= SmallTree &&
+         VectorizableTree.front()->Scalars.size() == SmallVF) ||
+        (VectorizableTree.size() <= 2 && UserIgnoreList))
+      return;
+  }
 
   // A list of loads to be gathered during the vectorization process. We can
   // try to vectorize them at the end, if profitable.
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
index 803af4d166b213..823ba8f6b8b6aa 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
@@ -70,65 +70,62 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[TMP33:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX10_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
 ; CHECK-NEXT:    [[TMP49:%.*]] = zext <2 x i8> [[TMP33]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP35:%.*]] = sub <2 x i32> [[TMP32]], [[TMP49]]
-; CHECK-NEXT:    [[TMP50:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX13_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT:    [[TMP51:%.*]] = zext <2 x i8> [[TMP50]] to <2 x i32>
+; CHECK-NEXT:    [[TMP51:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX13_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT:    [[TMP58:%.*]] = zext <2 x i8> [[TMP51]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP38:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX15_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
 ; CHECK-NEXT:    [[TMP39:%.*]] = zext <2 x i8> [[TMP38]] to <2 x i32>
-; CHECK-NEXT:    [[TMP36:%.*]] = sub <2 x i32> [[TMP51]], [[TMP39]]
+; CHECK-NEXT:    [[TMP36:%.*]] = sub <2 x i32> [[TMP58]], [[TMP39]]
 ; CHECK-NEXT:    [[TMP37:%.*]] = shl <2 x i32> [[TMP36]], <i32 16, i32 16>
 ; CHECK-NEXT:    [[TMP42:%.*]] = add <2 x i32> [[TMP37]], [[TMP35]]
-; CHECK-NEXT:    [[TMP56:%.*]] = add <2 x i32> [[TMP42]], [[TMP30]]
-; CHECK-NEXT:    [[TMP60:%.*]] = sub <2 x i32> [[TMP30]], [[TMP42]]
-; CHECK-NEXT:    [[TMP73:%.*]] = extractelement <2 x i32> [[TMP56]], i32 0
-; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <2 x i32> [[TMP56]], i32 1
+; CHECK-NEXT:    [[TMP43:%.*]] = add <2 x i32> [[TMP42]], [[TMP30]]
+; CHECK-NEXT:    [[TMP44:%.*]] = sub <2 x i32> [[TMP30]], [[TMP42]]
+; CHECK-NEXT:    [[TMP73:%.*]] = extractelement <2 x i32> [[TMP43]], i32 0
+; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <2 x i32> [[TMP43]], i32 1
 ; CHECK-NEXT:    [[ADD48_2:%.*]] = add i32 [[TMP34]], [[TMP73]]
 ; CHECK-NEXT:    [[SUB51_2:%.*]] = sub i32 [[TMP73]], [[TMP34]]
-; CHECK-NEXT:    [[TMP47:%.*]] = extractelement <2 x i32> [[TMP60]], i32 0
-; CHECK-NEXT:    [[TMP48:%.*]] = extractelement <2 x i32> [[TMP60]], i32 1
+; CHECK-NEXT:    [[TMP47:%.*]] = extractelement <2 x i32> [[TMP44]], i32 0
+; CHECK-NEXT:    [[TMP48:%.*]] = extractelement <2 x i32> [[TMP44]], i32 1
 ; CHECK-NEXT:    [[TMP68:%.*]] = sub i32 [[TMP47]], [[TMP48]]
 ; CHECK-NEXT:    [[ARRAYIDX3_3:%.*]] = getelementptr i8, ptr null, i64 4
 ; CHECK-NEXT:    [[ARRAYIDX5_3:%.*]] = getelementptr i8, ptr null, i64 4
 ; CHECK-NEXT:    [[ARRAYIDX8_3:%.*]] = getelementptr i8, ptr null, i64 1
 ; CHECK-NEXT:    [[ARRAYIDX10_3:%.*]] = getelementptr i8, ptr null, i64 1
-; CHECK-NEXT:    [[TMP44:%.*]] = load i8, ptr null, align 1
 ; CHECK-NEXT:    [[ARRAYIDX15_3:%.*]] = getelementptr i8, ptr null, i64 5
-; CHECK-NEXT:    [[TMP43:%.*]] = load i8, ptr null, align 1
 ; CHECK-NEXT:    [[TMP53:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT:    [[TMP52:%.*]] = zext <2 x i8> [[TMP53]] to <2 x i32>
+; CHECK-NEXT:    [[TMP50:%.*]] = zext <2 x i8> [[TMP53]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP54:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT:    [[TMP61:%.*]] = zext <2 x i8> [[TMP54]] to <2 x i32>
-; CHECK-NEXT:    [[TMP55:%.*]] = sub <2 x i32> [[TMP52]], [[TMP61]]
+; CHECK-NEXT:    [[TMP52:%.*]] = zext <2 x i8> [[TMP54]] to <2 x i32>
+; CHECK-NEXT:    [[TMP59:%.*]] = sub <2 x i32> [[TMP50]], [[TMP52]]
 ; CHECK-NEXT:    [[TMP41:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX3_3]], i64 -4, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT:    [[TMP57:%.*]] = zext <2 x i8> [[TMP41]] to <2 x i32>
-; CHECK-NEXT:    [[TMP58:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT:    [[TMP59:%.*]] = zext <2 x i8> [[TMP58]] to <2 x i32>
-; CHECK-NEXT:    [[TMP45:%.*]] = sub <2 x i32> [[TMP57]], [[TMP59]]
+; CHECK-NEXT:    [[TMP55:%.*]] = zext <2 x i8> [[TMP41]] to <2 x i32>
+; CHECK-NEXT:    [[TMP56:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT:    [[TMP57:%.*]] = zext <2 x i8> [[TMP56]] to <2 x i32>
+; CHECK-NEXT:    [[TMP45:%.*]] = sub <2 x i32> [[TMP55]], [[TMP57]]
 ; CHECK-NEXT:    [[TMP46:%.*]] = shl <2 x i32> [[TMP45]], <i32 16, i32 16>
-; CHECK-NEXT:    [[TMP62:%.*]] = add <2 x i32> [[TMP46]], [[TMP55]]
-; CHECK-NEXT:    [[TMP63:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX8_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT:    [[TMP74:%.*]] = zext <2 x i8> [[TMP63]] to <2 x i32>
-; CHECK-NEXT:    [[TMP79:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX10_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT:    [[TMP60:%.*]] = add <2 x i32> [[TMP46]], [[TMP59]]
+; CHECK-NEXT:    [[TMP61:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX8_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT:    [[TMP62:%.*]] = zext <2 x i8> [[TMP61]] to <2 x i32>
+; CHECK-NEXT:    [[TMP63:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX10_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT:    [[TMP64:%.*]] = zext <2 x i8> [[TMP63]] to <2 x i32>
+; CHECK-NEXT:    [[TMP65:%.*]] = sub <2 x i32> [[TMP62]], [[TMP64]]
+; CHECK-NEXT:    [[TMP75:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> zeroinitializer, i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT:    [[TMP76:%.*]] = zext <2 x i8> [[TMP75]] to <2 x i32>
+; CHECK-NEXT:    [[TMP79:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX15_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
 ; CHECK-NEXT:    [[TMP82:%.*]] = zext <2 x i8> [[TMP79]] to <2 x i32>
-; CHECK-NEXT:    [[TMP85:%.*]] = sub <2 x i32> [[TMP74]], [[TMP82]]
-; CHECK-NEXT:    [[TMP64:%.*]] = insertelement <2 x i8> poison, i8 [[TMP44]], i32 0
-; CHECK-NEXT:    [[TMP65:%.*]] = insertelement <2 x i8> [[TMP64]], i8 [[TMP43]], i32 1
-; CHECK-NEXT:    [[TMP94:%.*]] = zext <2 x i8> [[TMP65]] to <2 x i32>
-; CHECK-NEXT:    [[TMP103:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX15_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT:    [[TMP72:%.*]] = zext <2 x i8> [[TMP103]] to <2 x i32>
-; CHECK-NEXT:    [[TMP69:%.*]] = sub <2 x i32> [[TMP94]], [[TMP72]]
+; CHECK-NEXT:    [[TMP69:%.*]] = sub <2 x i32> [[TMP76]], [[TMP82]]
 ; CHECK-NEXT:    [[TMP70:%.*]] = shl <2 x i32> [[TMP69]], <i32 16, i32 16>
-; CHECK-NEXT:    [[TMP75:%.*]] = add <2 x i32> [[TMP70]], [[TMP85]]
-; CHECK-NEXT:    [[TMP76:%.*]] = add <2 x i32> [[TMP75]], [[TMP62]]
-; CHECK-NEXT:    [[TMP106:%.*]] = sub <2 x i32> [[TMP62]], [[TMP75]]
-; CHECK-NEXT:    [[TMP78:%.*]] = extractelement <2 x i32> [[TMP76]], i32 0
-; CHECK-NEXT:    [[TMP71:%.*]] = extractelement <2 x i32> [[TMP76]], i32 1
+; CHECK-NEXT:    [[TMP72:%.*]] = add <2 x i32> [[TMP70]], [[TMP65]]
+; CHECK-NEXT:    [[TMP90:%.*]] = add <2 x i32> [[TMP72]], [[TMP60]]
+; CHECK-NEXT:    [[TMP74:%.*]] = sub <2 x i32> [[TMP60]], [[TMP72]]
+; CHECK-NEXT:    [[TMP78:%.*]] = extractelement <2 x i32> [[TMP90]], i32 0
+; CHECK-NEXT:    [[TMP71:%.*]] = extractelement <2 x i32> [[TMP90]], i32 1
 ; CHECK-NEXT:    [[ADD48_3:%.*]] = add i32 [[TMP71]], [[TMP78]]
 ; CHECK-NEXT:    [[SUB51_3:%.*]] = sub i32 [[TMP78]], [[TMP71]]
-; CHECK-NEXT:    [[TMP80:%.*]] = extractelement <2 x i32> [[TMP106]], i32 0
-; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <2 x i32> [[TMP106]], i32 1
+; CHECK-NEXT:    [[TMP80:%.*]] = extractelement <2 x i32> [[TMP74]], i32 0
+; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <2 x i32> [[TMP74]], i32 1
 ; CHECK-NEXT:    [[ADD55_3:%.*]] = add i32 [[TMP81]], [[TMP80]]
 ; CHECK-NEXT:    [[TMP107:%.*]] = sub i32 [[TMP80]], [[TMP81]]
-; CHECK-NEXT:    [[TMP77:%.*]] = extractelement <2 x i32> [[TMP52]], i32 0
+; CHECK-NEXT:    [[TMP77:%.*]] = extractelement <2 x i32> [[TMP50]], i32 0
 ; CHECK-NEXT:    [[SHR_I49_3:%.*]] = lshr i32 [[TMP77]], 15
 ; CHECK-NEXT:    [[AND_I50_3:%.*]] = and i32 [[SHR_I49_3]], 65537
 ; CHECK-NEXT:    [[MUL_I51_3:%.*]] = mul i32 [[AND_I50_3]], 65535
@@ -155,27 +152,27 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[TMP66:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8]], align 1
 ; CHECK-NEXT:    [[TMP102:%.*]] = zext <2 x i8> [[TMP66]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP67:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[PIX2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT:    [[TMP108:%.*]] = zext <2 x i8> [[TMP67]] to <2 x i32>
-; CHECK-NEXT:    [[TMP89:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[TMP1]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT:    [[TMP90:%.*]] = zext <2 x i8> [[TMP89]] to <2 x i32>
-; CHECK-NEXT:    [[TMP91:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT:    [[TMP109:%.*]] = zext <2 x i8> [[TMP91]] to <2 x i32>
-; CHECK-NEXT:    [[TMP87:%.*]] = sub <2 x i32> [[TMP90]], [[TMP109]]
+; CHECK-NEXT:    [[TMP85:%.*]] = zext <2 x i8> [[TMP67]] to <2 x i32>
+; CHECK-NEXT:    [[TMP91:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[TMP1]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT:    [[TMP100:%.*]] = zext <2 x i8> [[TMP91]] to <2 x i32>
+; CHECK-NEXT:    [[TMP103:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT:    [[TMP89:%.*]] = zext <2 x i8> [[TMP103]] to <2 x i32>
+; CHECK-NEXT:    [[TMP87:%.*]] = sub <2 x i32> [[TMP100]], [[TMP89]]
 ; CHECK-NEXT:    [[TMP88:%.*]] = shl <2 x i32> [[TMP87]], <i32 16, i32 16>
-; CHECK-NEXT:    [[TMP111:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX22]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT:    [[TMP112:%.*]] = zext <2 x i8> [[TMP111]] to <2 x i32>
-; CHECK-NEXT:    [[TMP120:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX25]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT:    [[TMP121:%.*]] = zext <2 x i8> [[TMP120]] to <2 x i32>
-; CHECK-NEXT:    [[TMP131:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX27]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT:    [[TMP100:%.*]] = zext <2 x i8> [[TMP131]] to <2 x i32>
-; CHECK-NEXT:    [[TMP95:%.*]] = sub <2 x i32> [[TMP121]], [[TMP100]]
+; CHECK-NEXT:    [[TMP106:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX22]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT:    [[TMP108:%.*]] = zext <2 x i8> [[TMP106]] to <2 x i32>
+; CHECK-NEXT:    [[TMP94:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX25]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT:    [[TMP109:%.*]] = zext <2 x i8> [[TMP94]] to <2 x i32>
+; CHECK-NEXT:    [[TMP111:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX27]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT:    [[TMP115:%.*]] = zext <2 x i8> [[TMP111]] to <2 x i32>
+; CHECK-NEXT:    [[TMP95:%.*]] = sub <2 x i32> [[TMP109]], [[TMP115]]
 ; CHECK-NEXT:    [[TMP96:%.*]] = shl <2 x i32> [[TMP95]], <i32 16, i32 16>
 ; CHECK-NEXT:    [[TMP97:%.*]] = insertelement <2 x i32> [[TMP102]], i32 [[CONV33]], i32 1
-; CHECK-NEXT:    [[TMP132:%.*]] = sub <2 x i32> [[TMP97]], [[TMP112]]
-; CHECK-NEXT:    [[TMP105:%.*]] = add <2 x i32> [[TMP96]], [[TMP132]]
+; CHECK-NEXT:    [[TMP117:%.*]] = sub <2 x i32> [[TMP97]], [[TMP108]]
+; CHECK-NEXT:    [[TMP105:%.*]] = add <2 x i32> [[TMP96]], [[TMP117]]
 ; CHECK-NEXT:    [[TMP86:%.*]] = insertelement <2 x i32> [[TMP102]], i32 [[CONV1]], i32 0
-; CHECK-NEXT:    [[TMP133:%.*]] = sub <2 x i32> [[TMP86]], [[TMP108]]
-; CHECK-NEXT:    [[TMP92:%.*]] = add <2 x i32> [[TMP88]], [[TMP133]]
+; CHECK-NEXT:    [[TMP119:%.*]] = sub <2 x i32> [[TMP86]], [[TMP85]]
+; CHECK-NEXT:    [[TMP92:%.*]] = add <2 x i32> [[TMP88]], [[TMP119]]
 ; CHECK-NEXT:    [[TMP93:%.*]] = shufflevector <2 x i32> [[TMP105]], <2 x i32> [[TMP92]], <2 x i32> <i32 0, i32 2>
 ; CHECK-NEXT:    [[TMP101:%.*]] = sub <2 x i32> [[TMP92]], [[TMP105]]
 ; CHECK-NEXT:    [[TMP99:%.*]] = extractelement <2 x i32> [[TMP101]], i32 1
@@ -185,22 +182,22 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[TMP104:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8_1]], align 1
 ; CHECK-NEXT:    [[TMP113:%.*]] = zext <2 x i8> [[TMP104]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP114:%.*]] = load <2 x i8>, ptr [[ADD_PTR644]], align 1
-; CHECK-NEXT:    [[TMP115:%.*]] = zext <2 x i8> [[TMP114]] to <2 x i32>
+; CHECK-NEXT:    [[TMP112:%.*]] = zext <2 x i8> [[TMP114]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP116:%.*]] = load <2 x i8>, ptr [[ARRAYIDX3_1]], align 1
-; CHECK-NEXT:    [[TMP117:%.*]] = zext <2 x i8> [[TMP116]] to <2 x i32>
+; CHECK-NEXT:    [[TMP120:%.*]] = zext <2 x i8> [[TMP116]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP118:%.*]] = load <2 x i8>, ptr [[ARRAYIDX5_1]], align 1
-; CHECK-NEXT:    [[TMP119:%.*]] = zext <2 x i8> [[TMP118]] to <2 x i32>
-; CHECK-NEXT:    [[TMP124:%.*]] = sub <2 x i32> [[TMP117]], [[TMP119]]
+; CHECK-NEXT:    [[TMP128:%.*]] = zext <2 x i8> [[TMP118]] to <2 x i32>
+; CHECK-NEXT:    [[TMP124:%.*]] = sub <2 x i32> [[TMP120]], [[TMP128]]
 ; CHECK-NEXT:    [[TMP125:%.*]] = shl <2 x i32> [[TMP124]], <i32 16, i32 16>
 ; CHECK-NEXT:    [[TMP122:%.*]] = shufflevector <2 x i32> [[TMP113]], <2 x i32> poison, <2 x i32> <i32 poison, i32 0>
 ; CHECK-NEXT:    [[TMP123:%.*]] = insertelement <2 x i32> [[TMP122]], i32 [[CONV_1]], i32 0
-; CHECK-NEXT:    [[TMP134:%.*]] = sub <2 x i32> [[TMP123]], [[TMP115]]
-; CHECK-NEXT:    [[TMP156:%.*]] = add <2 x i32> [[TMP125]], [[TMP134]]
+; CHECK-NEXT:    [[TMP121:%.*]] = sub <2 x i32> [[TMP123]], [[TMP112]]
+; CHECK-NEXT:    [[TMP156:%.*]] = add <2 x i32> [[TMP125]], [[TMP121]]
 ; CHECK-NEXT:    [[TMP126:%.*]] = shufflevector <2 x i8> [[TMP7]], <2 x i8> poison, <2 x i32> <i32 1, i32 poison>
 ; CHECK-NEXT:    [[TMP127:%.*]] = insertelement <2 x i8> [[TMP126]], i8 [[TMP14]], i32 1
-; CHECK-NEXT:    [[TMP128:%.*]] = zext <2 x i8> [[TMP127]] to <2 x i32>
+; CHECK-NEXT:    [[TMP131:%.*]] = zext <2 x i8> [[TMP127]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP129:%.*]] = insertelement <2 x i32> [[TMP113]], i32 [[TMP9]], i32 0
-; CHECK-NEXT:    [[TMP130:%.*]] = sub <2 x i32> [[TMP129]], [[TMP128]]
+; CHECK-NEXT:    [[TMP130:%.*]] = sub <2 x i32> [[TMP129]], [[TMP131]]
 ; CHECK-NEXT:    [[TMP135:%.*]] = insertelement <2 x i32> [[TMP130]], i32 [[TMP16]], i32 1
 ; CHECK-NEXT:    [[TMP136:%.*]] = shl <2 x i32> [[TMP135]], <i32 16, i32 16>
 ; CHECK-NEXT:    [[TMP110:%.*]] = shufflevector <2 x i32> [[TMP130]], <2 x i32> poison, <2 x i32> <i32 1, i32 poison>
@@ -214,23 +211,23 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[SUB47_1:%.*]] = sub i32 [[TMP138]], [[TMP171]]
 ; CHECK-NEXT:    [[TMP140:%.*]] = shufflevector <2 x i32> [[TMP156]], <2 x i32> [[TMP105]], <2 x i32> <i32 1, i32 2>
 ; CHECK-NEXT:    [[TMP153:%.*]] = shufflevector <2 x i32> [[TMP156]], <2 x i32> [[TMP92]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[TMP157:%.*]] = add <2 x i32> [[TMP140]], [[TMP153]]
+; CHECK-NEXT:    [[TMP145:%.*]] = add <2 x i32> [[TMP140]], [[TMP153]]
 ; CHECK-NEXT:    [[TMP143:%.*]] = shufflevector <2 x i32> [[TMP105]], <2 x i32> [[TMP155]], <2 x i32> <i32 3, i32 1>
 ; CHECK-NEXT:    [[TMP144:%.*]] = shufflevector <2 x i32> [[TMP92]], <2 x i32> [[TMP155]], <2 x i32> <i32 2, i32 1>
-; CHECK-NEXT:    [[TMP145:%.*]] = add <2 x i32> [[TMP143]], [[TMP144]]
-; CHECK-NEXT:    [[TMP98:%.*]] = extractelement <2 x i32> [[TMP157]], i32 1
-; CHECK-NEXT:    [[TMP146:%.*]] = extractelement <2 x i32> [[TMP145]], i32 1
+; CHECK-NEXT:    [[TMP154:%.*]] = add <2 x i32> [[TMP143]], [[TMP144]]
+; CHECK-NEXT:    [[TMP98:%.*]] = extractelement <2 x i32> [[TMP145]], i32 1
+; CHECK-NEXT:    [[TMP146:%.*]] = extractelement <2 x i32> [[TMP154]], i32 1
 ; CHECK-NEXT:    [[SHR_I54_1:%.*]] = lshr i32 [[TMP146]], 15
 ; CHECK-NEXT:    [[AND_I55_1:%.*]] = and i32 [[SHR_I54_1]], 65537
 ; CHECK-NEXT:    [[MUL_I56_1:%.*]] = mul i32 [[AND_I55_1]], 65535
-; CHECK-NEXT:    [[TMP164:%.*]] = add <2 x i32> [[TMP145]], [[TMP157]]
-; CHECK-NEXT:    [[TMP141:%.*]] = sub <2 x i32> [[TMP157]], [[TMP145]]
+; CHECK-NEXT:    [[TMP164:%.*]] = add <2 x i32> [[TMP154]], [[TMP145]]
+; CHECK-NEXT:    [[TMP141:%.*]] = sub <2 x i32> [[TMP145]], [[TMP154]]
 ; CHECK-NEXT:    [[TMP165:%.*]] = insertelement <2 x i32> [[TMP101]], i32 [[SUB47_1]], i32 0
 ; CHECK-NEXT:    [[TMP151:%.*]] = shufflevector <2 x i32> [[TMP101]], <2 x i32> poison, <2 x i32> <i32 poison, i32 0>
 ; CHECK-NEXT:    [[TMP152:%.*]] = insertelement <2 x i32> [[TMP151]], i32 [[SUB45_1]], i32 0
 ; CHECK-NEXT:    [[TMP180:%.*]] = add <2 x i32> [[TMP165]], [[TMP152]]
-; CHECK-NEXT:    [[TMP154:%.*]] = sub <2 x i32> [[TMP152]], [[TMP165]]
-; CHECK-NEXT:    [[TMP166:%.*]] = extractelement <2 x i32> [[TMP145]], i32 0
+; CHECK-NEXT:    [[TMP157:%.*]] = sub <2 x i32> [[TMP152]], [[TMP165]]
+; CHECK-NEXT:    [[TMP166:%.*]] = extractelement <2 x i32> [[TMP154]], i32 0
 ; CHECK-NEXT:    [[SHR_I54:%.*]] = lshr i32 [[TMP166]], 15
 ; CHECK-NEXT:    [[AND_I55:%.*]] = and i32 [[SHR_I54]], 65537
 ; CHECK-NEXT:    [[MUL_I56:%.*]] = mul i32 [[AND_I55]], 65535
@@ -297,17 +294,17 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[XOR_I58_1:%.*]] = xor i32 [[ADD_I57_1]], [[SUB47_1]]
 ; CHECK-NEXT:    [[ADD_I62_1:%.*]] = add i32 [[MUL_I61_1]], [[SUB106_1]]
 ; CHECK-NEXT:    [[XOR_I63_1:%.*]] = xor i32 [[ADD_I62_1]], [[TMP99]]
-; CHECK-NEXT:    [[TMP190:%.*]] = extractelement <2 x i32> [[TMP202]], i32 0
-; CHECK-NEXT:    [[ADD110_1:%.*]] = add i32 [[TMP190]], [[XOR_I_1]]
+; CHECK-NEXT:    [[TMP187:%.*]] = extractelement <2 x i32> [[TMP202]], i32 0
+; CHECK-NEXT:    [[ADD110_1:%.*]] = add i32 [[TMP187]], [[XOR_I_1]]
 ; CHECK-NEXT:    [[ADD112_1:%.*]] = add i32 [[ADD110_1]], [[XOR_I58_1]]
 ; CHECK-NEXT:    [[ADD113_1:%.*]] = add i32 [[ADD112_1]], [[XOR_I63_1]]
-; CHECK-NEXT:    [[TMP191:%.*]] = extractelement <2 x i32> [[TMP141]], i32 0
-; CHECK-NEXT:    [[TMP205:%.*]] = extractelement <2 x i32> [[TMP141]], i32 1
-; CHECK-NEXT:    [[ADD78_2:%.*]] = add i32 [[TMP191]], [[TMP205]]
+; CHECK-NEXT:    [[TMP188:%.*]] = extractelement <2 x i32> [[TMP141]], i32 0
+; CHECK-NEXT:    [[TMP190:%.*]] = extractelement <2 x i32> [[TMP141]], i32 1
+; CHECK-NEXT:    [[ADD78_2:%.*]] = add i32 [[TMP188]], [[TMP190]]
 ; CHECK-NEXT:    [[TMP196:%.*]] = insertelement <2 x i32> [[TMP141]], i32 [[SUB51_2]], i32 0
 ; CHECK-NEXT:    [[TMP194:%.*]] = shufflevector <2 x i32> [[TMP141]], <2 x i32> poison, <2 x i32> <i32 poison, i32 0>
 ; CHECK-NEXT:    [[TMP195:%.*]] = insertelement <2 x i32> [[TMP194]], i32 [[SUB51_3]], i32 0
-; CHECK-NEXT:    [[TMP206:%.*]] = sub <2 x i32> [[TMP196]], [[TMP195]]
+; CHECK-NEXT:    [[TMP208:%.*]] = sub <2 x i32> [[TMP196]], [[TMP195]]
 ; CHECK-NEXT:    [[TMP201:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_2]], i32 0
 ; CHECK-NEXT:    [[TMP198:%.*]] = shufflevector <2 x i32> [[TMP201]], <2 x i32> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP199:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_6]], i32 0
@@ -315,8 +312,8 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[TMP225:%.*]] = add <2 x i32> [[TMP198]], [[TMP200]]
 ; CHECK-NEXT:    [[TMP226:%.*]] = sub <2 x i32> [[TMP198]], [[TMP200]]
 ; CHECK-NEXT:    [[TMP227:%.*]] = shufflevector <2 x i32> [[TMP225]], <2 x i32> [[TMP226]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP204:%.*]] = extractelement <2 x i32> [[TMP206]], i32 0
-; CHECK-NEXT:    [[TMP212:%.*]] = extractelement <2 x i32> [[TMP206]], i32 1
+; CHECK-NEXT:    [[TMP204:%.*]] = extractelement <2 x i32> [[TMP208]], i32 0
+; CHECK-NEXT:    [[TMP212:%.*]] = extractelement <2 x i32> [[TMP208]], i32 1
 ; CHECK-NEXT:    [[ADD105_2:%.*]] = add i32 [[TMP204]], [[TMP212]]
 ; CHECK-NEXT:    [[SUB106_2:%.*]] = sub i32 [[TMP212]], [[TMP204]]
 ; CHECK-NEXT:    [[ADD_I52_2:%.*]] = add i32 [[MUL_I51_2]], [[ADD105_2]]
@@ -329,13 +326,13 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[ADD_I62_2:%.*]] = add i32 [[MUL_I61_2]], [[SUB106_2]]
 ; CHECK-NEXT:    [[XOR_I63_2:%.*]] = xor i32 [[ADD_I62_2]], [[TMP98]]
 ; CHECK-NEXT:    [[ADD108_2:%.*]] = add i32 [[XOR_I53_2]], [[ADD113_1]]
-; CHECK-NEXT:    [[TMP208:%.*]] = extractelement <2 x i32> [[TMP213]], i32 0
-; CHECK-NEXT:    [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[TMP208]]
-; CHECK-NEXT:    [[TMP209:%.*]] = extractelement <2 x i32> [[TMP213]], i32 1
-; CHECK-NEXT:    [[ADD112_2:%.*]] = add i32 [[ADD110_2]], [[TMP209]]
+; CHECK-NEXT:    [[TMP205:%.*]] = extractelement <2 x i32> [[TMP213]], i32 0
+; CHECK-NEXT:    [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[TMP205]]
+; CHECK-NEXT:    [[TMP206:%.*]] = extractelement <2 x i32> [[TMP213]], i32 1
+; CHECK-NEXT:    [[ADD112_2:%.*]] = add i32 [[ADD110_2]], [[TMP206]]
 ; CHECK-NEXT:    [[ADD113_2:%.*]] = add i32 [[ADD112_2]], [[XOR_I63_2]]
-; CHECK-NEXT:    [[SUB59_1:%.*]] = extractelement <2 x i32> [[TMP154]], i32 0
-; CHECK-NEXT:    [[SUB59:%.*]] = extractelement <2 x i32> [[TMP154]], i32 1
+; CHECK-NEXT:    [[SUB59_1:%.*]] = extractelement <2 x i32> [[TMP157]], i32 0
+; CHECK-NEXT:    [[SUB59:%.*]] = extractelement <2 x i32> [[TMP157]], i32 1
 ; CHECK-NEXT:    [[ADD94_3:%.*]] = add i32 [[SUB59_1]], [[SUB59]]
 ; CHECK-NEXT:    [[SUB86_3:%.*]] = sub i32 [[SUB59]], [[SUB59_1]]
 ; CHECK-NEXT:    [[TMP223:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_3]], i32 0
@@ -360,10 +357,10 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[ADD_I62_3:%.*]] = add i32 [[MUL_I61_3]], [[SUB106_3]]
 ; CHECK-NEXT:    [[XOR_I63_3:%.*]] = xor i32 [[ADD_I62_3]], [[CONV33]]
 ; CHECK-NEXT:    [[ADD108_3:%.*]] = add i32 [[XOR_I53_3]], [[ADD113_2]]
-; CHECK-NEXT:    [[TMP228:%.*]] = extractelement <2 x i32> [[TMP234]], i32 0
-; CHECK-NEXT:    [[ADD110_3:%.*]] = add i32 [[ADD108_3]], [[TMP228]]
-; CHECK-NEXT:    [[TMP229:%.*]] = extractelement <2 x i32> [[TMP234]], i32 1
-; CHECK-NEXT:    [[ADD112_3:%.*]] = add i32 [[ADD110_3]], [[TMP229]]
+; CHECK-NEXT:    [[TMP221:%.*]] = extractelement <2 x i32> [[TMP234]], i32 0
+; CHECK-NEXT:    [[ADD110_3:%.*]] = add i32 [[ADD108_3]], [[TMP221]]
+; CHECK-NEXT:    [[TMP222:%.*]] = extractelement <2 x i32> [[TMP234]], i32 1
+; CHECK-NEXT:    [[ADD112_3:%.*]] = add i32 [[ADD110_3]], [[TMP222]]
 ; CHECK-NEXT:    [[ADD113_3:%.*]] = add i32 [[ADD112_3]], [[XOR_I63_3]]
 ; CHECK-NEXT:    ret i32 [[ADD113_3]]
 ;
@@ -423,82 +420,81 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; THR15-NEXT:    [[TMP24:%.*]] = load <2 x i8>, ptr [[ARRAYIDX3_2]], align 1
 ; THR15-NEXT:    [[TMP28:%.*]] = zext <2 x i8> [[TMP24]] to <2 x i32>
 ; THR15-NEXT:    [[TMP30:%.*]] = load <2 x i8>, ptr [[ARRAYIDX5_2]], align 1
-; THR15-NEXT:    [[TMP47:%.*]] = zext <2 x i8> [[TMP30]] to <2 x i32>
-; THR15-NEXT:    [[TMP13:%.*]] = sub <2 x i32> [[TMP28]], [[TMP47]]
+; THR15-NEXT:    [[TMP44:%.*]] = zext <2 x i8> [[TMP30]] to <2 x i32>
+; THR15-NEXT:    [[TMP13:%.*]] = sub <2 x i32> [[TMP28]], [[TMP44]]
 ; THR15-NEXT:    [[TMP14:%.*]] = shl <2 x i32> [[TMP13]], <i32 16, i32 16>
 ; THR15-NEXT:    [[TMP15:%.*]] = add <2 x i32> [[TMP14]], [[TMP23]]
 ; THR15-NEXT:    [[ARRAYIDX20_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 2
 ; THR15-NEXT:    [[ARRAYIDX22_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 2
 ; THR15-NEXT:    [[ARRAYIDX25_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 6
 ; THR15-NEXT:    [[ARRAYIDX27_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 6
-; THR15-NEXT:    [[TMP49:%.*]] = load <2 x i8>, ptr [[ARRAYIDX20_2]], align 1
-; THR15-NEXT:    [[TMP59:%.*]] = zext <2 x i8> [[TMP49]] to <2 x i32>
+; THR15-NEXT:    [[TMP51:%.*]] = load <2 x i8>, ptr [[ARRAYIDX20_2]], align 1
+; THR15-NEXT:    [[TMP32:%.*]] = zext <2 x i8> [[TMP51]] to <2 x i32>
 ; THR15-NEXT:    [[TMP33:%.*]] = load <2 x i8>, ptr [[ARRAYIDX22_2]], align 1
-; THR15-NEXT:    [[TMP78:%.*]] = zext <2 x i8> [[TMP33]] to <2 x i32>
-; THR15-NEXT:    [[TMP35:%.*]] = sub <2 x i32> [[TMP59]], [[TMP78]]
+; THR15-NEXT:    [[TMP34:%.*]] = zext <2 x i8> [[TMP33]] to <2 x i32>
+; THR15-NEXT:    [[TMP35:%.*]] = sub <2 x i32> [[TMP32]], [[TMP34]]
 ; THR15-NEXT:    [[TMP36:%.*]] = load <2 x i8>, ptr [[ARRAYIDX25_2]], align 1
-; THR15-NEXT:    [[TMP80:%.*]] = zext <2 x i8> [[TMP36]] to <2 x i32>
+; THR15-NEXT:    [[TMP52:%.*]] = zext <2 x i8> [[TMP36]] to <2 x i32>
 ; THR15-NEXT:    [[TMP38:%.*]] = load <2 x i8>, ptr [[ARRAYIDX27_2]], align 1
 ; THR15-NEXT:    [[TMP39:%.*]] = zext <2 x i8> [[TMP38]] to <2 x i32>
-; THR15-NEXT:    [[TMP25:%.*]] = sub <2 x i32> [[TMP80]], [[TMP39]]
+; THR15-NEXT:    [[TMP25:%.*]] = sub <2 x i32> [[TMP52]], [[TMP39]]
 ; THR15-NEXT:    [[TMP26:%.*]] = shl <2 x i32> [[TMP25]], <i32 16, i32 16>
 ; THR15-NEXT:    [[TMP27:%.*]] = add <2 x i32> [[TMP26]], [[TMP35]]
-; THR15-NEXT:    [[TMP83:%.*]] = extractelement <2 x i32> [[TMP15]], i32 0
+; THR15-NEXT:    [[TMP68:%.*]] = extractelement <2 x i32> [[TMP15]], i32 0
 ; THR15-NEXT:    [[TMP29:%.*]] = extractelement <2 x i32> [[TMP15]], i32 1
-; THR15-NEXT:    [[ADD44_2:%.*]] = add i32 [[TMP29]], [[TMP83]]
-; THR15-NEXT:    [[SUB45_2:%.*]] = sub i32 [[TMP83]], [[TMP29]]
-; THR15-NEXT:    [[TMP87:%.*]] = extractelement <2 x i32> [[TMP27]], i32 0
+; THR15-NEXT:    [[ADD44_2:%.*]] = add i32 [[TMP29]], [[TMP68]]
+; THR15-NEXT:    [[SUB45_2:%.*]] = sub i32 [[TMP68]], [[TMP29]]
+; THR15-NEXT:    [[TMP45:%.*]] = extractelement <2 x i32> [[TMP27]], i32 0
 ; THR15-NEXT:    [[TMP31:%.*]] = extractelement <2 x i32> [[TMP27]], i32 1
-; THR15-NEXT:    [[ADD46_2:%.*]] = add i32 [[TMP31]], [[TMP87]]
-; THR15-NEXT:    [[SUB47_2:%.*]] = sub i32 [[TMP87]], [[TMP31]]
+; THR15-NEXT:    [[ADD46_2:%.*]] = add i32 [[TMP31]], [[TMP45]]
+; THR15-NEXT:    [[SUB47_2:%.*]] = sub i32 [[TMP45]], [[TMP31]]
 ; THR15-NEXT:    [[ADD48_2:%.*]] = add i32 [[ADD46_2]], [[ADD44_2]]
-; THR15-NEXT:    [[SUB51_2:%.*]] = sub i32 [[ADD44_2]], [[ADD46_2]]
 ; THR15-NEXT:    [[ADD55_2:%.*]] = add i32 [[SUB47_2]], [[SUB45_2]]
-; THR15-NEXT:    [[SUB59_2:%.*]] = sub i32 [[SUB45_2]], [[SUB47_2]]
 ; THR15-NEXT:    [[ARRAYIDX3_3:%.*]] = getelementptr i8, ptr null, i64 4
 ; THR15-NEXT:    [[ARRAYIDX5_3:%.*]] = getelementptr i8, ptr null, i64 4
-; THR15-NEXT:    [[TMP32:%.*]] = load <2 x i8>, ptr null, align 1
-; THR15-NEXT:    [[TMP48:%.*]] = zext <2 x i8> [[TMP32]] to <2 x i32>
-; THR15-NEXT:    [[TMP34:%.*]] = load <2 x i8>, ptr null, align 1
-; THR15-NEXT:    [[TMP50:%.*]] = zext <2 x i8> [[TMP34]] to <2 x i32>
+; THR15-NEXT:    [[ARRAYIDX8_3:%.*]] = getelementptr i8, ptr null, i64 1
+; THR15-NEXT:    [[ARRAYIDX10_3:%.*]] = getelementptr i8, ptr null, i64 1
+; THR15-NEXT:    [[ARRAYIDX15_3:%.*]] = getelementptr i8, ptr null, i64 5
+; THR15-NEXT:    [[TMP47:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; THR15-NEXT:    [[TMP48:%.*]] = zext <2 x i8> [[TMP47]] to <2 x i32>
+; THR15-NEXT:    [[TMP49:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; THR15-NEXT:    [[TMP50:%.*]] = zext <2 x i8> [[TMP49]] to <2 x i32>
 ; THR15-NEXT:    [[TMP93:%.*]] = sub <2 x i32> [[TMP48]], [[TMP50]]
 ; THR15-NEXT:    [[TMP37:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX3_3]], i64 -4, <2 x i1> <i1 true, i1 true>, i32 2)
 ; THR15-NEXT:    [[TMP53:%.*]] = zext <2 x i8> [[TMP37]] to <2 x i32>
-; THR15-NEXT:    [[TMP54:%.*]] = load <2 x i8>, ptr [[ARRAYIDX5_3]], align 1
+; THR15-NEXT:    [[TMP54:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
 ; THR15-NEXT:    [[TMP55:%.*]] = zext <2 x i8> [[TMP54]] to <2 x i32>
 ; THR15-NEXT:    [[TMP41:%.*]] = sub <2 x i32> [[TMP53]], [[TMP55]]
 ; THR15-NEXT:    [[TMP42:%.*]] = shl <2 x i32> [[TMP41]], <i32 16, i32 16>
 ; THR15-NEXT:    [[TMP43:%.*]] = add <2 x i32> [[TMP42]], [[TMP93]]
-; THR15-NEXT:    [[ARRAYIDX20_3:%.*]] = getelementptr i8, ptr null, i64 2
-; THR15-NEXT:    [[ARRAYIDX22_3:%.*]] = getelementptr i8, ptr null, i64 2
-; THR15-NEXT:    [[TMP44:%.*]] = load i8, ptr null, align 1
-; THR15-NEXT:    [[ARRAYIDX27_3:%.*]] = getelementptr i8, ptr null, i64 6
-; THR15-NEXT:    [[TMP45:%.*]] = load i8, ptr null, align 1
-; THR15-NEXT:    [[TMP61:%.*]] = load <2 x i8>, ptr [[ARRAYIDX20_3]], align 1
-; THR15-NEXT:    [[TMP98:%.*]] = zext <2 x i8> [[TMP61]] to <2 x i32>
-; THR15-NEXT:    [[TMP99:%.*]] = load <2 x i8>, ptr [[ARRAYIDX22_3]], align 1
-; THR15-NEXT:    [[TMP101:%.*]] = zext <2 x i8> [[TMP99]] to <2 x i32>
-; THR15-NEXT:    [[TMP65:%.*]] = sub <2 x i32> [[TMP98]], [[TMP101]]
-; THR15-NEXT:    [[TMP51:%.*]] = insertelement <2 x i8> poison, i8 [[TMP44]], i32 0
-; THR15-NEXT:    [[TMP52:%.*]] = insertelement <2 x i8> [[TMP51]], i8 [[TMP45]], i32 1
-; THR15-NEXT:    [[TMP68:%.*]] = zext <2 x i8> [[TMP52]] to <2 x i32>
-; THR15-NEXT:    [[TMP102:%.*]] = load <2 x i8>, ptr [[ARRAYIDX27_3]], align 1
-; THR15-NEXT:    [[TMP70:%.*]] = zext <2 x i8> [[TMP102]] to <2 x i32>
-; THR15-NEXT:    [[TMP56:%.*]] = sub <2 x i32> [[TMP68]], [[TMP70]]
+; THR15-NEXT:    [[TMP59:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX8_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; THR15-NEXT:    [[TMP60:%.*]] = zext <2 x i8> [[TMP59]] to <2 x i32>
+; THR15-NEXT:    [[TMP61:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX10_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; THR15-NEXT:    [[TMP62:%.*]] = zext <2 x i8> [[TMP61]] to <2 x i32>
+; THR15-NEXT:    [[TMP65:%.*]] = sub <2 x i32> [[TMP60]], [[TMP62]]
+; THR15-NEXT:    [[TMP70:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> zeroinitializer, i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; THR15-NEXT:    [[TMP87:%.*]] = zext <2 x i8> [[TMP70]] to <2 x i32>
+; THR15-NEXT:    [[TMP94:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX15_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; THR15-NEXT:    [[TMP96:%.*]] = zext <2 x i8> [[TMP94]] to <2 x i32>
+; THR15-NEXT:    [[TMP56:%.*]] = sub <2 x i32> [[TMP87]], [[TMP96]]
 ; THR15-NEXT:    [[TMP57:%.*]] = shl <2 x i32> [[TMP56]], <i32 16, i32 16>
 ; THR15-NEXT:    [[TMP58:%.*]] = add <2 x i32> [[TMP57]], [[TMP65]]
-; THR15-NEXT:    [[TMP104:%.*]] = extractelement <2 x i32> [[TMP43]], i32 0
-; THR15-NEXT:    [[TMP60:%.*]] = extractelement <2 x i32> [[TMP43]], i32 1
-; THR15-NEXT:    [[ADD44_3:%.*]] = add i32 [[TMP60]], [[TMP104]]
-; THR15-NEXT:    [[SUB45_3:%.*]] = sub i32 [[TMP104]], [[TMP60]]
-; THR15-NEXT:    [[TMP76:%.*]] = extractelement <2 x i32> [[TMP58]], i32 0
-; THR15-NEXT:    [[TMP62:%.*]] = extractelement <2 x i32> [[TMP58]], i32 1
-; THR15-NEXT:    [[ADD46_3:%.*]] = add i32 [[TMP62]], [[TMP76]]
-; THR15-NEXT:    [[SUB47_3:%.*]] = sub i32 [[TMP76]], [[TMP62]]
-; THR15-NEXT:    [[ADD48_3:%.*]] = add i32 [[ADD46_3]], [[ADD44_3]]
-; THR15-NEXT:    [[SUB51_3:%.*]] = sub i32 [[ADD44_3]], [[ADD46_3]]
-; THR15-NEXT:    [[ADD55_3:%.*]] = add i32 [[SUB47_3]], [[SUB45_3]]
-; THR15-NEXT:    [[SUB59_3:%.*]] = sub i32 [[SUB45_3]], [[SUB47_3]]
+; THR15-NEXT:    [[TMP98:%.*]] = add <2 x i32> [[TMP58]], [[TMP43]]
+; THR15-NEXT:    [[TMP101:%.*]] = sub <2 x i32> [[TMP43]], [[TMP58]]
+; THR15-NEXT:    [[TMP102:%.*]] = extractelement <2 x i32> [[TMP98]], i32 0
+; THR15-NEXT:    [[TMP104:%.*]] = extractelement <2 x i32> [[TMP98]], i32 1
+; THR15-NEXT:    [[ADD48_3:%.*]] = add i32 [[TMP104]], [[TMP102]]
+; THR15-NEXT:    [[TMP108:%.*]] = insertelement <2 x i32> [[TMP98]], i32 [[ADD44_2]], i32 1
+; THR15-NEXT:    [[TMP76:%.*]] = shufflevector <2 x i32> [[TMP98]], <2 x i32> poison, <2 x i32> <i32 1, i32 poison>
+; THR15-NEXT:    [[TMP109:%.*]] = insertelement <2 x i32> [[TMP76]], i32 [[ADD46_2]], i32 1
+; THR15-NEXT:    [[TMP78:%.*]] = sub <2 x i32> [[TMP108]], [[TMP109]]
+; THR15-NEXT:    [[TMP110:%.*]] = extractelement <2 x i32> [[TMP101]], i32 0
+; THR15-NEXT:    [[TMP80:%.*]] = extractelement <2 x i32> [[TMP101]], i32 1
+; THR15-NEXT:    [[ADD55_3:%.*]] = add i32 [[TMP80]], [[TMP110]]
+; THR15-NEXT:    [[TMP81:%.*]] = insertelement <2 x i32> [[TMP101]], i32 [[SUB45_2]], i32 1
+; THR15-NEXT:    [[TMP111:%.*]] = shufflevector <2 x i32> [[TMP101]], <2 x i32> poison, <2 x i32> <i32 1, i32 poison>
+; THR15-NEXT:    [[TMP83:%.*]] = insertelement <2 x i32> [[TMP111]], i32 [[SUB47_2]], i32 1
+; THR15-NEXT:    [[TMP112:%.*]] = sub <2 x i32> [[TMP81]], [[TMP83]]
 ; THR15-NEXT:    [[ADD94:%.*]] = add i32 [[ADD48_3]], [[ADD48_2]]
 ; THR15-NEXT:    [[SUB102:%.*]] = sub i32 [[ADD48_2]], [[ADD48_3]]
 ; THR15-NEXT:    [[TMP63:%.*]] = extractelement <2 x i32> [[TMP48]], i32 0
@@ -518,39 +514,43 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; THR15-NEXT:    [[SHR_I49_2:%.*]] = lshr i32 [[TMP64]], 15
 ; THR15-NEXT:    [[AND_I50_2:%.*]] = and i32 [[SHR_I49_2]], 65537
 ; THR15-NEXT:    [[MUL_I51_2:%.*]] = mul i32 [[AND_I50_2]], 65535
-; THR15-NEXT:    [[ADD94_2:%.*]] = add i32 [[SUB51_3]], [[SUB51_2]]
-; THR15-NEXT:    [[SUB102_2:%.*]] = sub i32 [[SUB51_2]], [[SUB51_3]]
+; THR15-NEXT:    [[TMP115:%.*]] = extractelement <2 x i32> [[TMP78]], i32 0
+; THR15-NEXT:    [[TMP116:%.*]] = extractelement <2 x i32> [[TMP78]], i32 1
+; THR15-NEXT:    [[ADD94_2:%.*]] = add i32 [[TMP115]], [[TMP116]]
+; THR15-NEXT:    [[SUB102_2:%.*]] = sub i32 [[TMP116]], [[TMP115]]
 ; THR15-NEXT:    [[SHR_I49_3:%.*]] = lshr i32 [[CONV_1]], 15
 ; THR15-NEXT:    [[AND_I50_3:%.*]] = and i32 [[SHR_I49_3]], 65537
 ; THR15-NEXT:    [[MUL_I51_3:%.*]] = mul i32 [[AND_I50_3]], 65535
-; THR15-NEXT:    [[ADD94_3:%.*]] = add i32 [[SUB59_3]], [[SUB59_2]]
-; THR15-NEXT:    [[SUB102_3:%.*]] = sub i32 [[SUB59_2]], [[SUB59_3]]
+; THR15-NEXT:    [[TMP117:%.*]] = extractelement <2 x i32> [[TMP112]], i32 0
+; THR15-NEXT:    [[TMP131:%.*]] = extractelement <2 x i32> [[TMP112]], i32 1
+; THR15-NEXT:    [[ADD94_3:%.*]] = add i32 [[TMP117]], [[TMP131]]
+; THR15-NEXT:    [[SUB102_3:%.*]] = sub i32 [[TMP131]], [[TMP117]]
 ; THR15-NEXT:    [[SHR_I49_4:%.*]] = lshr i32 [[CONV]], 15
 ; THR15-NEXT:    [[AND_I50_4:%.*]] = and i32 [[SHR_I49_4]], 65537
 ; THR15-NEXT:    [[MUL_I51_4:%.*]] = mul i32 [[AND_I50_4]], 65535
-; THR15-NEXT:    [[TMP81:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8]], align 1
-; THR15-NEXT:    [[TMP74:%.*]] = zext <2 x i8> [[TMP81]] to <2 x i32>
+; THR15-NEXT:    [[TMP132:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8]], align 1
+; THR15-NEXT:    [[TMP74:%.*]] = zext <2 x i8> [[TMP132]] to <2 x i32>
 ; THR15-NEXT:    [[TMP67:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[PIX2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; THR15-NEXT:    [[TMP107:%.*]] = zext <2 x i8> [[TMP67]] to <2 x i32>
+; THR15-NEXT:    [[TMP133:%.*]] = zext <2 x i8> [[TMP67]] to <2 x i32>
 ; THR15-NEXT:    [[TMP69:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; THR15-NEXT:    [[TMP108:%.*]] = zext <2 x i8> [[TMP69]] to <2 x i32>
+; THR15-NEXT:    [[TMP147:%.*]] = zext <2 x i8> [[TMP69]] to <2 x i32>
 ; THR15-NEXT:    [[TMP71:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; THR15-NEXT:    [[TMP109:%.*]] = zext <2 x i8> [[TMP71]] to <2 x i32>
-; THR15-NEXT:    [[TMP72:%.*]] = sub <2 x i32> [[TMP108]], [[TMP109]]
+; THR15-NEXT:    [[TMP99:%.*]] = zext <2 x i8> [[TMP71]] to <2 x i32>
+; THR15-NEXT:    [[TMP72:%.*]] = sub <2 x i32> [[TMP147]], [[TMP99]]
 ; THR15-NEXT:    [[TMP73:%.*]] = shl <2 x i32> [[TMP72]], <i32 16, i32 16>
 ; THR15-NEXT:    [[TMP75:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX22]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; THR15-NEXT:    [[TMP111:%.*]] = zext <2 x i8> [[TMP75]] to <2 x i32>
+; THR15-NEXT:    [[TMP148:%.*]] = zext <2 x i8> [[TMP75]] to <2 x i32>
 ; THR15-NEXT:    [[TMP82:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX25]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; THR15-NEXT:    [[TMP94:%.*]] = zext <2 x i8> [[TMP82]] to <2 x i32>
+; THR15-NEXT:    [[TMP149:%.*]] = zext <2 x i8> [[TMP82]] to <2 x i32>
 ; THR15-NEXT:    [[TMP79:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX27]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; THR15-NEXT:    [[TMP96:%.*]] = zext <2 x i8> [[TMP79]] to <2 x i32>
-; THR15-NEXT:    [[TMP84:%.*]] = sub <2 x i32> [[TMP94]], [[TMP96]]
+; THR15-NEXT:    [[TMP107:%.*]] = zext <2 x i8> [[TMP79]] to <2 x i32>
+; THR15-NEXT:    [[TMP84:%.*]] = sub <2 x i32> [[TMP149]], [[TMP107]]
 ; THR15-NEXT:    [[TMP85:%.*]] = shl <2 x i32> [[TMP84]], <i32 16, i32 16>
 ; THR15-NEXT:    [[TMP86:%.*]] = insertelement <2 x i32> [[TMP74]], i32 [[CONV33]], i32 1
-; THR15-NEXT:    [[TMP100:%.*]] = sub <2 x i32> [[TMP86]], [[TMP111]]
+; THR15-NEXT:    [[TMP100:%.*]] = sub <2 x i32> [[TMP86]], [[TMP148]]
 ; THR15-NEXT:    [[TMP88:%.*]] = add <2 x i32> [[TMP85]], [[TMP100]]
 ; THR15-NEXT:    [[TMP92:%.*]] = insertelement <2 x i32> [[TMP74]], i32 [[CONV]], i32 0
-; THR15-NEXT:    [[TMP120:%.*]] = sub <2 x i32> [[TMP92]], [[TMP107]]
+; THR15-NEXT:    [[TMP120:%.*]] = sub <2 x i32> [[TMP92]], [[TMP133]]
 ; THR15-NEXT:    [[TMP95:%.*]] = add <2 x i32> [[TMP73]], [[TMP120]]
 ; THR15-NEXT:    [[TMP97:%.*]] = shufflevector <2 x i32> [[TMP88]], <2 x i32> [[TMP95]], <2 x i32> <i32 0, i32 2>
 ; THR15-NEXT:    [[TMP77:%.*]] = add <2 x i32> [[TMP88]], [[TMP95]]
@@ -559,50 +559,50 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; THR15-NEXT:    [[TMP90:%.*]] = extractelement <2 x i32> [[TMP77]], i32 1
 ; THR15-NEXT:    [[ADD48:%.*]] = add i32 [[TMP90]], [[TMP89]]
 ; THR15-NEXT:    [[SUB51:%.*]] = sub i32 [[TMP89]], [[TMP90]]
-; THR15-NEXT:    [[TMP110:%.*]] = extractelement <2 x i32> [[TMP91]], i32 0
+; THR15-NEXT:    [[TMP151:%.*]] = extractelement <2 x i32> [[TMP91]], i32 0
 ; THR15-NEXT:    [[SUB47:%.*]] = extractelement <2 x i32> [[TMP91]], i32 1
-; THR15-NEXT:    [[ADD55:%.*]] = add i32 [[SUB47]], [[TMP110]]
+; THR15-NEXT:    [[ADD55:%.*]] = add i32 [[SUB47]], [[TMP151]]
 ; THR15-NEXT:    [[SHR_I59:%.*]] = lshr i32 [[TMP90]], 15
 ; THR15-NEXT:    [[AND_I60:%.*]] = and i32 [[SHR_I59]], 65537
 ; THR15-NEXT:    [[MUL_I61:%.*]] = mul i32 [[AND_I60]], 65535
 ; THR15-NEXT:    [[SHR_I59_1:%.*]] = lshr i32 [[SUB47]], 15
 ; THR15-NEXT:    [[AND_I60_1:%.*]] = and i32 [[SHR_I59_1]], 65537
 ; THR15-NEXT:    [[MUL_I61_1:%.*]] = mul i32 [[AND_I60_1]], 65535
-; THR15-NEXT:    [[TMP112:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8_1]], align 1
-; THR15-NEXT:    [[TMP130:%.*]] = zext <2 x i8> [[TMP112]] to <2 x i32>
-; THR15-NEXT:    [[TMP129:%.*]] = load <2 x i8>, ptr [[ADD_PTR644]], align 1
-; THR15-NEXT:    [[TMP115:%.*]] = zext <2 x i8> [[TMP129]] to <2 x i32>
-; THR15-NEXT:    [[TMP116:%.*]] = load <2 x i8>, ptr [[ARRAYIDX3_1]], align 1
-; THR15-NEXT:    [[TMP117:%.*]] = zext <2 x i8> [[TMP116]] to <2 x i32>
-; THR15-NEXT:    [[TMP131:%.*]] = load <2 x i8>, ptr [[ARRAYIDX5_1]], align 1
-; THR15-NEXT:    [[TMP132:%.*]] = zext <2 x i8> [[TMP131]] to <2 x i32>
-; THR15-NEXT:    [[TMP113:%.*]] = sub <2 x i32> [[TMP117]], [[TMP132]]
+; THR15-NEXT:    [[TMP159:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8_1]], align 1
+; THR15-NEXT:    [[TMP130:%.*]] = zext <2 x i8> [[TMP159]] to <2 x i32>
+; THR15-NEXT:    [[TMP161:%.*]] = load <2 x i8>, ptr [[ADD_PTR644]], align 1
+; THR15-NEXT:    [[TMP175:%.*]] = zext <2 x i8> [[TMP161]] to <2 x i32>
+; THR15-NEXT:    [[TMP179:%.*]] = load <2 x i8>, ptr [[ARRAYIDX3_1]], align 1
+; THR15-NEXT:    [[TMP128:%.*]] = zext <2 x i8> [[TMP179]] to <2 x i32>
+; THR15-NEXT:    [[TMP129:%.*]] = load <2 x i8>, ptr [[ARRAYIDX5_1]], align 1
+; THR15-NEXT:    [[TMP180:%.*]] = zext <2 x i8> [[TMP129]] to <2 x i32>
+; THR15-NEXT:    [[TMP113:%.*]] = sub <2 x i32> [[TMP128]], [[TMP180]]
 ; THR15-NEXT:    [[TMP114:%.*]] = shl <2 x i32> [[TMP113]], <i32 16, i32 16>
 ; THR15-NEXT:    [[TMP103:%.*]] = shufflevector <2 x i32> [[TMP130]], <2 x i32> poison, <2 x i32> <i32 poison, i32 0>
 ; THR15-NEXT:    [[TMP126:%.*]] = insertelement <2 x i32> [[TMP103]], i32 [[CONV_1]], i32 0
-; THR15-NEXT:    [[TMP134:%.*]] = sub <2 x i32> [[TMP126]], [[TMP115]]
+; THR15-NEXT:    [[TMP134:%.*]] = sub <2 x i32> [[TMP126]], [[TMP175]]
 ; THR15-NEXT:    [[TMP121:%.*]] = add <2 x i32> [[TMP114]], [[TMP134]]
 ; THR15-NEXT:    [[TMP145:%.*]] = shufflevector <2 x i8> [[TMP7]], <2 x i8> poison, <2 x i32> <i32 1, i32 poison>
 ; THR15-NEXT:    [[TMP127:%.*]] = insertelement <2 x i8> [[TMP145]], i8 [[TMP3]], i32 1
-; THR15-NEXT:    [[TMP128:%.*]] = zext <2 x i8> [[TMP127]] to <2 x i32>
+; THR15-NEXT:    [[TMP139:%.*]] = zext <2 x i8> [[TMP127]] to <2 x i32>
 ; THR15-NEXT:    [[TMP146:%.*]] = insertelement <2 x i32> [[TMP130]], i32 [[TMP9]], i32 0
-; THR15-NEXT:    [[TMP106:%.*]] = sub <2 x i32> [[TMP146]], [[TMP128]]
+; THR15-NEXT:    [[TMP106:%.*]] = sub <2 x i32> [[TMP146]], [[TMP139]]
 ; THR15-NEXT:    [[TMP118:%.*]] = extractelement <2 x i32> [[TMP106]], i32 0
 ; THR15-NEXT:    [[SHL30_1:%.*]] = shl i32 [[TMP118]], 16
 ; THR15-NEXT:    [[TMP119:%.*]] = extractelement <2 x i32> [[TMP106]], i32 1
 ; THR15-NEXT:    [[ADD31_1:%.*]] = add i32 [[SHL30_1]], [[TMP119]]
-; THR15-NEXT:    [[TMP133:%.*]] = extractelement <2 x i32> [[TMP121]], i32 0
+; THR15-NEXT:    [[TMP181:%.*]] = extractelement <2 x i32> [[TMP121]], i32 0
 ; THR15-NEXT:    [[TMP125:%.*]] = extractelement <2 x i32> [[TMP121]], i32 1
-; THR15-NEXT:    [[SUB45_1:%.*]] = sub i32 [[TMP133]], [[TMP125]]
+; THR15-NEXT:    [[SUB45_1:%.*]] = sub i32 [[TMP181]], [[TMP125]]
 ; THR15-NEXT:    [[TMP135:%.*]] = shufflevector <2 x i32> [[TMP121]], <2 x i32> poison, <2 x i32> <i32 1, i32 poison>
 ; THR15-NEXT:    [[TMP136:%.*]] = insertelement <2 x i32> [[TMP135]], i32 [[ADD43_1]], i32 1
 ; THR15-NEXT:    [[TMP137:%.*]] = insertelement <2 x i32> [[TMP121]], i32 [[ADD31_1]], i32 1
 ; THR15-NEXT:    [[TMP138:%.*]] = add <2 x i32> [[TMP136]], [[TMP137]]
 ; THR15-NEXT:    [[SUB47_1:%.*]] = sub i32 [[ADD31_1]], [[ADD43_1]]
-; THR15-NEXT:    [[TMP139:%.*]] = extractelement <2 x i32> [[TMP138]], i32 0
+; THR15-NEXT:    [[TMP150:%.*]] = extractelement <2 x i32> [[TMP138]], i32 0
 ; THR15-NEXT:    [[TMP140:%.*]] = extractelement <2 x i32> [[TMP138]], i32 1
-; THR15-NEXT:    [[ADD48_1:%.*]] = add i32 [[TMP140]], [[TMP139]]
-; THR15-NEXT:    [[SUB51_1:%.*]] = sub i32 [[TMP139]], [[TMP140]]
+; THR15-NEXT:    [[ADD48_1:%.*]] = add i32 [[TMP140]], [[TMP150]]
+; THR15-NEXT:    [[SUB51_1:%.*]] = sub i32 [[TMP150]], [[TMP140]]
 ; THR15-NEXT:    [[ADD55_1:%.*]] = add i32 [[SUB47_1]], [[SUB45_1]]
 ; THR15-NEXT:    [[TMP141:%.*]] = shufflevector <2 x i32> [[TMP91]], <2 x i32> poison, <2 x i32> <i32 poison, i32 0>
 ; THR15-NEXT:    [[TMP142:%.*]] = insertelement <2 x i32> [[TMP141]], i32 [[SUB45_1]], i32 0
@@ -673,15 +673,15 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; THR15-NEXT:    [[ADD_I62_2:%.*]] = add i32 [[MUL_I61_2]], [[SUB106_2]]
 ; THR15-NEXT:    [[XOR_I63_2:%.*]] = xor i32 [[ADD_I62_2]], [[TMP89]]
 ; THR15-NEXT:    [[ADD108_2:%.*]] = add i32 [[XOR_I53_2]], [[ADD113_1]]
-; THR15-NEXT:    [[TMP161:%.*]] = extractelement <2 x i32> [[TMP160]], i32 0
-; THR15-NEXT:    [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[TMP161]]
+; THR15-NEXT:    [[TMP182:%.*]] = extractelement <2 x i32> [[TMP160]], i32 0
+; THR15-NEXT:    [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[TMP182]]
 ; THR15-NEXT:    [[TMP162:%.*]] = extractelement <2 x i32> [[TMP160]], i32 1
 ; THR15-NEXT:    [[ADD112_2:%.*]] = add i32 [[ADD110_2]], [[TMP162]]
 ; THR15-NEXT:    [[ADD113_2:%.*]] = add i32 [[ADD112_2]], [[XOR_I63_2]]
-; THR15-NEXT:    [[TMP159:%.*]] = extractelement <2 x i32> [[TMP144]], i32 0
+; THR15-NEXT:    [[TMP183:%.*]] = extractelement <2 x i32> [[TMP144]], i32 0
 ; THR15-NEXT:    [[TMP178:%.*]] = extractelement <2 x i32> [[TMP144]], i32 1
-; THR15-NEXT:    [[ADD78_3:%.*]] = add i32 [[TMP159]], [[TMP178]]
-; THR15-NEXT:    [[SUB86_3:%.*]] = sub i32 [[TMP178]], [[TMP159]]
+; THR15-NEXT:    [[ADD78_3:%.*]] = add i32 [[TMP183]], [[TMP178]]
+; THR15-NEXT:    [[SUB86_3:%.*]] = sub i32 [[TMP178]], [[TMP183]]
 ; THR15-NEXT:    [[TMP163:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_3]], i32 0
 ; THR15-NEXT:    [[TMP164:%.*]] = shufflevector <2 x i32> [[TMP163]], <2 x i32> poison, <2 x i32> zeroinitializer
 ; THR15-NEXT:    [[TMP165:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_3]], i32 0
@@ -704,8 +704,8 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; THR15-NEXT:    [[ADD_I62_3:%.*]] = add i32 [[MUL_I61_3]], [[SUB106_3]]
 ; THR15-NEXT:    [[XOR_I63_3:%.*]] = xor i32 [[ADD_I62_3]], [[CONV33]]
 ; THR15-NEXT:    [[ADD108_3:%.*]] = add i32 [[XOR_I53_3]], [[ADD113_2]]
-; THR15-NEXT:    [[TMP175:%.*]] = extractelement <2 x i32> [[TMP174]], i32 0
-; THR15-NEXT:    [[ADD110_3:%.*]] = add i32 [[ADD108_3]], [[TMP175]]
+; THR15-NEXT:    [[TMP184:%.*]] = extractelement <2 x i32> [[TMP174]], i32 0
+; THR15-NEXT:    [[ADD110_3:%.*]] = add i32 [[ADD108_3]], [[TMP184]]
 ; THR15-NEXT:    [[TMP176:%.*]] = extractelement <2 x i32> [[TMP174]], i32 1
 ; THR15-NEXT:    [[ADD112_3:%.*]] = add i32 [[ADD110_3]], [[TMP176]]
 ; THR15-NEXT:    [[ADD113_3:%.*]] = add i32 [[ADD112_3]], [[XOR_I63_3]]
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/remarks-insert-into-small-vector.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/remarks-insert-into-small-vector.ll
index bb806be15c71ca..09612444afd205 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/remarks-insert-into-small-vector.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/remarks-insert-into-small-vector.ll
@@ -8,20 +8,17 @@
 ; YAML-NEXT:  Function:        test
 ; YAML-NEXT:  Args:
 ; YAML-NEXT:  - String:          'Stores SLP vectorized with cost '
-; YAML-NEXT:  - Cost:            '2'
+; YAML-NEXT:  - Cost:            '0'
 ; YAML-NEXT:  - String:          ' and with tree size '
-; YAML-NEXT:  - TreeSize:        '7'
+; YAML-NEXT:  - TreeSize:        '9'
 
 define void @test() {
 ; CHECK-LABEL: define void @test(
 ; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr null, align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr null, align 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr null, align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> <float poison, float 0.000000e+00>, float [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x float> [[TMP4]], float [[TMP2]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> zeroinitializer, i32 4, <2 x i1> <i1 true, i1 true>, <2 x float> poison)
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> <float poison, float 0.000000e+00>, float [[TMP2]], i32 0
 ; CHECK-NEXT:    [[TMP6:%.*]] = fcmp ogt <2 x float> [[TMP3]], [[TMP5]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i1> [[TMP6]], <2 x i1> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
 ; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/reversed-strided-node-with-external-ptr.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/reversed-strided-node-with-external-ptr.ll
index 3fa42047162e45..9c1da08c64b7b7 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/reversed-strided-node-with-external-ptr.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/reversed-strided-node-with-external-ptr.ll
@@ -10,10 +10,10 @@ define void @test(ptr %a, i64 %0) {
 ; CHECK-NEXT:    br label %[[BB:.*]]
 ; CHECK:       [[BB]]:
 ; CHECK-NEXT:    [[TMP3:%.*]] = or disjoint i64 [[TMP0]], 1
-; CHECK-NEXT:    [[ARRAYIDX17_I28_1:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP3]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i64> poison, i64 [[TMP3]], i32 0
 ; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i64> [[TMP4]], i64 0, i32 1
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr double, <2 x ptr> [[TMP2]], <2 x i64> [[TMP5]]
+; CHECK-NEXT:    [[ARRAYIDX17_I28_1:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP3]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> [[TMP6]], i32 8, <2 x i1> <i1 true, i1 true>, <2 x double> poison)
 ; CHECK-NEXT:    [[TMP8:%.*]] = load <2 x double>, ptr [[A]], align 8
 ; CHECK-NEXT:    [[TMP9:%.*]] = load <2 x double>, ptr [[A]], align 8
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/scatter-vectorize-reversed.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/scatter-vectorize-reversed.ll
index 2daa3b58e5c3ac..98333c7b420cf0 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/scatter-vectorize-reversed.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/scatter-vectorize-reversed.ll
@@ -5,12 +5,12 @@ define <4 x i32> @test(<2 x i64> %v, ptr %p) {
 ; CHECK-LABEL: define <4 x i32> @test(
 ; CHECK-SAME: <2 x i64> [[V:%.*]], ptr [[P:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i64> [[V]], <2 x i64> poison, <2 x i32> <i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x ptr> poison, ptr [[P]], i32 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x ptr> [[TMP0]], <2 x ptr> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i16, <2 x ptr> [[TMP1]], <2 x i64> [[V]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i16, <2 x ptr> [[TMP1]], <2 x i64> [[TMP4]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = call <2 x i16> @llvm.masked.gather.v2i16.v2p0(<2 x ptr> [[TMP2]], i32 2, <2 x i1> <i1 true, i1 true>, <2 x i16> poison)
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[TMP7:%.*]] = zext <2 x i16> [[TMP4]] to <2 x i32>
+; CHECK-NEXT:    [[TMP7:%.*]] = zext <2 x i16> [[TMP3]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> zeroinitializer, <4 x i32> [[TMP6]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; CHECK-NEXT:    ret <4 x i32> [[TMP5]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll b/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll
index 26c4d55436d22b..59b0352a825929 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll
@@ -24,7 +24,7 @@ define i32 @test(ptr noalias %p, ptr noalias %addr) {
   ; YAML-NEXT:   - String:          'Vectorized horizontal reduction with cost '
   ; YAML-NEXT:   - Cost:            '-1'
   ; YAML-NEXT:   - String:          ' and with tree size '
-  ; YAML-NEXT:   - TreeSize:        '7'
+  ; YAML-NEXT:   - TreeSize:        '8'
 entry:
   %off0.1 = getelementptr inbounds i32, ptr %addr, i32 1
   %idx0 = load i32, ptr %off0.1, align 8

>From 651c220ab3e50fbb43568b8ae7c0c05dfd34a451 Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev at outlook.com>
Date: Thu, 26 Sep 2024 18:27:26 +0000
Subject: [PATCH 2/2] Fix formatting

Created using spr 1.3.5
---
 llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 62c77704d92eb5..875e5156915dbd 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -6968,8 +6968,7 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
   // Try to vectorize postponed load entries, previously marked as gathered.
   for (unsigned Idx : LoadEntriesToVectorize) {
     const TreeEntry &E = *VectorizableTree[Idx];
-    SmallVector<Value *> GatheredScalars(E.Scalars.begin(),
-                                         E.Scalars.end());
+    SmallVector<Value *> GatheredScalars(E.Scalars.begin(), E.Scalars.end());
     // Avoid reordering, if possible.
     if (!E.ReorderIndices.empty()) {
       // Build a mask out of the reorder indices and reorder scalars per this



More information about the llvm-commits mailing list