[llvm] a65a5fe - [SLP]Improve masked loads vectorization, attempting gathered loads

via llvm-commits llvm-commits at lists.llvm.org
Tue Oct 8 13:43:15 PDT 2024


Author: Alexey Bataev
Date: 2024-10-08T16:43:10-04:00
New Revision: a65a5feb1a20581c85ee817dae8826f65fef62af

URL: https://github.com/llvm/llvm-project/commit/a65a5feb1a20581c85ee817dae8826f65fef62af
DIFF: https://github.com/llvm/llvm-project/commit/a65a5feb1a20581c85ee817dae8826f65fef62af.diff

LOG: [SLP]Improve masked loads vectorization, attempting gathered loads

If the vector of loads can be vectorized as a masked gather and there are
several other masked gather nodes, the compiler can check whether it is
possible to combine such nodes into a bigger consecutive/strided load
node, which provides better performance.
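As a rough illustration (a hypothetical sketch with made-up value names, not
taken from the tests below): suppose two gather nodes hold pointers
%ptrs.lo = {%base, %base+2} and %ptrs.hi = {%base+4, %base+6}. Previously the
vectorizer would keep two independent masked-gather nodes such as

  %g.lo = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> %ptrs.lo, i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
  %g.hi = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> %ptrs.hi, i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)

With this change, once the gathered pointers are seen to share the common base
%base with a constant 2-byte stride, the loads can instead be emitted as a
single wider strided load:

  %ld = call <4 x i8> @llvm.experimental.vp.strided.load.v4i8.p0.i64(ptr align 1 %base, i64 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 4)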

Reviewers: RKSimon

Reviewed By: RKSimon

Pull Request: https://github.com/llvm/llvm-project/pull/110151

Added: 
    

Modified: 
    llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
    llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
    llvm/test/Transforms/SLPVectorizer/RISCV/remarks-insert-into-small-vector.ll
    llvm/test/Transforms/SLPVectorizer/RISCV/reversed-strided-node-with-external-ptr.ll
    llvm/test/Transforms/SLPVectorizer/RISCV/scatter-vectorize-reversed.ll
    llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 7ced7a6d8eadbc..318a61208fe393 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1371,6 +1371,8 @@ class BoUpSLP {
     MustGather.clear();
     NonScheduledFirst.clear();
     EntryToLastInstruction.clear();
+    LoadEntriesToVectorize.clear();
+    IsGraphTransformMode = false;
     GatheredLoadsEntriesFirst.reset();
     ExternalUses.clear();
     ExternalUsesAsOriginalScalar.clear();
@@ -3613,6 +3615,14 @@ class BoUpSLP {
       DenseMap<Value *, SmallPtrSet<const TreeEntry *, 4>>;
   ValueToGatherNodesMap ValueToGatherNodes;
 
+  /// A list of the load entries (node indices), which can be vectorized using
+  /// strided or masked gather approach, but attempted to be represented as
+  /// contiguous loads.
+  SetVector<unsigned> LoadEntriesToVectorize;
+
+  /// true if graph nodes transforming mode is on.
+  bool IsGraphTransformMode = false;
+
   /// The index of the first gathered load entry in the VectorizeTree.
   std::optional<unsigned> GatheredLoadsEntriesFirst;
 
@@ -4618,17 +4628,15 @@ static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
   if (getUnderlyingObject(Ptr1) != getUnderlyingObject(Ptr2))
     return false;
   auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
-  if (!GEP1)
-    return false;
   auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
-  if (!GEP2)
-    return false;
-  return GEP1->getNumOperands() == 2 && GEP2->getNumOperands() == 2 &&
-         ((isConstant(GEP1->getOperand(1)) &&
-           isConstant(GEP2->getOperand(1))) ||
+  return (!GEP1 || GEP1->getNumOperands() == 2) &&
+         (!GEP2 || GEP2->getNumOperands() == 2) &&
+         (((!GEP1 || isConstant(GEP1->getOperand(1))) &&
+           (!GEP2 || isConstant(GEP2->getOperand(1)))) ||
           !CompareOpcodes ||
-          getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)
-              .getOpcode());
+          (GEP1 && GEP2 &&
+           getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)
+               .getOpcode()));
 }
 
 /// Calculates minimal alignment as a common alignment.
@@ -5118,9 +5126,9 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
       L && Sz > 2 && static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {
                        return L->isLoopInvariant(V);
                      })) <= Sz / 2;
-  if (ProfitableGatherPointers || all_of(PointerOps, [IsSorted](Value *P) {
+  if (ProfitableGatherPointers || all_of(PointerOps, [](Value *P) {
         auto *GEP = dyn_cast<GetElementPtrInst>(P);
-        return (IsSorted && !GEP && doesNotNeedToBeScheduled(P)) ||
+        return (!GEP && doesNotNeedToBeScheduled(P)) ||
                (GEP && GEP->getNumOperands() == 2 &&
                 isa<Constant, Instruction>(GEP->getOperand(1)));
       })) {
@@ -6667,6 +6675,12 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
     ArrayRef<SmallVector<std::pair<LoadInst *, int>>> GatheredLoads) {
   GatheredLoadsEntriesFirst = VectorizableTree.size();
 
+  SmallVector<SmallPtrSet<const Value *, 4>> LoadSetsToVectorize(
+      LoadEntriesToVectorize.size());
+  for (auto [Idx, Set] : zip(LoadEntriesToVectorize, LoadSetsToVectorize))
+    Set.insert(VectorizableTree[Idx]->Scalars.begin(),
+               VectorizableTree[Idx]->Scalars.end());
+
   // Sort loads by distance.
   auto LoadSorter = [](const std::pair<LoadInst *, int> &L1,
                        const std::pair<LoadInst *, int> &L2) {
@@ -6924,8 +6938,42 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
                   }
                 }
               }
+              // Cannot represent the loads as consecutive vectorizable nodes -
+              // just exit.
+              unsigned ConsecutiveNodesSize = 0;
+              if (!LoadEntriesToVectorize.empty() &&
+                  any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
+                         [&, Slice = Slice](const auto &P) {
+                           const auto *It = find_if(Slice, [&](Value *V) {
+                             return std::get<1>(P).contains(V);
+                           });
+                           if (It == Slice.end())
+                             return false;
+                           ArrayRef<Value *> VL =
+                               VectorizableTree[std::get<0>(P)]->Scalars;
+                           ConsecutiveNodesSize += VL.size();
+                           unsigned Start = std::distance(Slice.begin(), It);
+                           unsigned Sz = Slice.size() - Start;
+                           return Sz < VL.size() ||
+                                  Slice.slice(std::distance(Slice.begin(), It),
+                                              VL.size()) != VL;
+                         }))
+                continue;
               // Try to build long masked gather loads.
               UserMaxVF = bit_ceil(UserMaxVF);
+              if (any_of(seq<unsigned>(Slice.size() / UserMaxVF),
+                         [&, Slice = Slice](unsigned Idx) {
+                           OrdersType Order;
+                           SmallVector<Value *> PointerOps;
+                           return canVectorizeLoads(
+                                      Slice.slice(Idx * UserMaxVF, UserMaxVF),
+                                      Slice[Idx * UserMaxVF], Order,
+                                      PointerOps) ==
+                                  LoadsState::ScatterVectorize;
+                         }))
+                UserMaxVF = MaxVF;
+              if (Slice.size() != ConsecutiveNodesSize)
+                MaxVF = std::min<unsigned>(MaxVF, UserMaxVF);
             }
             for (unsigned VF = MaxVF; VF >= 2; VF /= 2) {
               bool IsVectorized = true;
@@ -6934,6 +6982,16 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
                     Slice.slice(I, std::min(VF, E - I));
                 if (getTreeEntry(SubSlice.front()))
                   continue;
+                // Check if the subslice is to be-vectorized entry, which is not
+                // equal to entry.
+                if (any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
+                           [&](const auto &P) {
+                             return !SubSlice.equals(
+                                        VectorizableTree[std::get<0>(P)]
+                                            ->Scalars) &&
+                                    set_is_subset(SubSlice, std::get<1>(P));
+                           }))
+                  continue;
                 unsigned Sz = VectorizableTree.size();
                 buildTree_rec(SubSlice, 0, EdgeInfo());
                 if (Sz == VectorizableTree.size()) {
@@ -6968,6 +7026,20 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
     // Final attempt to vectorize non-vectorized loads.
     (void)ProcessGatheredLoads(FinalGatheredLoads, /*Final=*/true);
   }
+  // Try to vectorize postponed load entries, previously marked as gathered.
+  for (unsigned Idx : LoadEntriesToVectorize) {
+    const TreeEntry &E = *VectorizableTree[Idx];
+    SmallVector<Value *> GatheredScalars(E.Scalars.begin(), E.Scalars.end());
+    // Avoid reordering, if possible.
+    if (!E.ReorderIndices.empty()) {
+      // Build a mask out of the reorder indices and reorder scalars per this
+      // mask.
+      SmallVector<int> ReorderMask;
+      inversePermutation(E.ReorderIndices, ReorderMask);
+      reorderScalars(GatheredScalars, ReorderMask);
+    }
+    buildTree_rec(GatheredScalars, 0, EdgeInfo());
+  }
   // If no new entries created, consider it as no gathered loads entries must be
   // handled.
   if (static_cast<unsigned>(*GatheredLoadsEntriesFirst) ==
@@ -7280,6 +7352,11 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
     case LoadsState::Vectorize:
       return TreeEntry::Vectorize;
     case LoadsState::ScatterVectorize:
+      if (!IsGraphTransformMode && !VectorizableTree.empty()) {
+        // Delay slow vectorized nodes for better vectorization attempts.
+        LoadEntriesToVectorize.insert(VectorizableTree.size());
+        return TreeEntry::NeedToGather;
+      }
       return TreeEntry::ScatterVectorize;
     case LoadsState::StridedVectorize:
       return TreeEntry::StridedVectorize;
@@ -9117,6 +9194,17 @@ getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
 void BoUpSLP::transformNodes() {
   constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
   BaseGraphSize = VectorizableTree.size();
+  // Turn graph transforming mode on and off, when done.
+  class GraphTransformModeRAAI {
+    bool &SavedIsGraphTransformMode;
+
+  public:
+    GraphTransformModeRAAI(bool &IsGraphTransformMode)
+        : SavedIsGraphTransformMode(IsGraphTransformMode) {
+      IsGraphTransformMode = true;
+    }
+    ~GraphTransformModeRAAI() { SavedIsGraphTransformMode = false; }
+  } TransformContext(IsGraphTransformMode);
   // Operands are profitable if they are:
   // 1. At least one constant
   // or
@@ -9149,7 +9237,7 @@ void BoUpSLP::transformNodes() {
       unsigned MinVF = getMinVF(2 * Sz);
       // Do not try partial vectorization for small nodes (<= 2), nodes with the
       // same opcode and same parent block or all constants.
-      if (VL.size() <= 2 ||
+      if (VL.size() <= 2 || LoadEntriesToVectorize.contains(Idx) ||
           !(!E.getOpcode() || E.getOpcode() == Instruction::Load ||
             E.isAltShuffle() || !allSameBlock(VL)) ||
           allConstant(VL) || isSplat(VL))
@@ -9248,6 +9336,8 @@ void BoUpSLP::transformNodes() {
             continue;
           }
           unsigned PrevSize = VectorizableTree.size();
+          [[maybe_unused]] unsigned PrevEntriesSize =
+              LoadEntriesToVectorize.size();
           buildTree_rec(Slice, 0, EdgeInfo(&E, UINT_MAX));
           if (PrevSize + 1 == VectorizableTree.size() &&
               VectorizableTree[PrevSize]->isGather() &&
@@ -9255,6 +9345,8 @@ void BoUpSLP::transformNodes() {
                   Instruction::ExtractElement &&
               !isSplat(Slice)) {
             VectorizableTree.pop_back();
+            assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
+                   "LoadEntriesToVectorize expected to remain the same");
             continue;
           }
           AddCombinedNode(PrevSize, Cnt);
@@ -9340,17 +9432,19 @@ void BoUpSLP::transformNodes() {
     }
   }
 
-  // Single load node - exit.
-  if (VectorizableTree.size() <= 1 &&
-      VectorizableTree.front()->getOpcode() == Instruction::Load)
-    return;
-  // Small graph with small VF - exit.
-  constexpr unsigned SmallTree = 3;
-  constexpr unsigned SmallVF = 2;
-  if ((VectorizableTree.size() <= SmallTree &&
-       VectorizableTree.front()->Scalars.size() == SmallVF) ||
-      (VectorizableTree.size() <= 2 && UserIgnoreList))
-    return;
+  if (LoadEntriesToVectorize.empty()) {
+    // Single load node - exit.
+    if (VectorizableTree.size() <= 1 &&
+        VectorizableTree.front()->getOpcode() == Instruction::Load)
+      return;
+    // Small graph with small VF - exit.
+    constexpr unsigned SmallTree = 3;
+    constexpr unsigned SmallVF = 2;
+    if ((VectorizableTree.size() <= SmallTree &&
+         VectorizableTree.front()->Scalars.size() == SmallVF) ||
+        (VectorizableTree.size() <= 2 && UserIgnoreList))
+      return;
+  }
 
   // A list of loads to be gathered during the vectorization process. We can
   // try to vectorize them at the end, if profitable.

diff  --git a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
index b38c636ccaf5da..443f17a9c09e7a 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
@@ -58,83 +58,80 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[TMP21:%.*]] = zext <2 x i8> [[TMP19]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP84:%.*]] = zext i8 [[TMP29]] to i32
 ; CHECK-NEXT:    [[TMP22:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ADD_PTR64_1]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT:    [[TMP23:%.*]] = zext <2 x i8> [[TMP22]] to <2 x i32>
-; CHECK-NEXT:    [[TMP30:%.*]] = sub <2 x i32> [[TMP21]], [[TMP23]]
-; CHECK-NEXT:    [[TMP42:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX3_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT:    [[TMP26:%.*]] = zext <2 x i8> [[TMP42]] to <2 x i32>
+; CHECK-NEXT:    [[TMP31:%.*]] = zext <2 x i8> [[TMP22]] to <2 x i32>
+; CHECK-NEXT:    [[TMP23:%.*]] = sub <2 x i32> [[TMP21]], [[TMP31]]
+; CHECK-NEXT:    [[TMP49:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX3_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT:    [[TMP26:%.*]] = zext <2 x i8> [[TMP49]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP27:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT:    [[TMP49:%.*]] = zext <2 x i8> [[TMP27]] to <2 x i32>
-; CHECK-NEXT:    [[TMP24:%.*]] = sub <2 x i32> [[TMP26]], [[TMP49]]
+; CHECK-NEXT:    [[TMP50:%.*]] = zext <2 x i8> [[TMP27]] to <2 x i32>
+; CHECK-NEXT:    [[TMP24:%.*]] = sub <2 x i32> [[TMP26]], [[TMP50]]
 ; CHECK-NEXT:    [[TMP25:%.*]] = shl <2 x i32> [[TMP24]], <i32 16, i32 16>
-; CHECK-NEXT:    [[TMP31:%.*]] = add <2 x i32> [[TMP25]], [[TMP30]]
+; CHECK-NEXT:    [[TMP30:%.*]] = add <2 x i32> [[TMP25]], [[TMP23]]
 ; CHECK-NEXT:    [[TMP32:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX8_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
 ; CHECK-NEXT:    [[TMP33:%.*]] = load i8, ptr [[ARRAYIDX8_2]], align 1
-; CHECK-NEXT:    [[TMP50:%.*]] = zext <2 x i8> [[TMP32]] to <2 x i32>
+; CHECK-NEXT:    [[TMP51:%.*]] = zext <2 x i8> [[TMP32]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP83:%.*]] = zext i8 [[TMP33]] to i32
-; CHECK-NEXT:    [[TMP35:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX10_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT:    [[TMP51:%.*]] = zext <2 x i8> [[TMP35]] to <2 x i32>
-; CHECK-NEXT:    [[TMP52:%.*]] = sub <2 x i32> [[TMP50]], [[TMP51]]
+; CHECK-NEXT:    [[TMP56:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX10_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT:    [[TMP57:%.*]] = zext <2 x i8> [[TMP56]] to <2 x i32>
+; CHECK-NEXT:    [[TMP35:%.*]] = sub <2 x i32> [[TMP51]], [[TMP57]]
 ; CHECK-NEXT:    [[TMP38:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX13_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
 ; CHECK-NEXT:    [[TMP39:%.*]] = zext <2 x i8> [[TMP38]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP40:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX15_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT:    [[TMP56:%.*]] = zext <2 x i8> [[TMP40]] to <2 x i32>
-; CHECK-NEXT:    [[TMP36:%.*]] = sub <2 x i32> [[TMP39]], [[TMP56]]
+; CHECK-NEXT:    [[TMP61:%.*]] = zext <2 x i8> [[TMP40]] to <2 x i32>
+; CHECK-NEXT:    [[TMP36:%.*]] = sub <2 x i32> [[TMP39]], [[TMP61]]
 ; CHECK-NEXT:    [[TMP37:%.*]] = shl <2 x i32> [[TMP36]], <i32 16, i32 16>
-; CHECK-NEXT:    [[TMP59:%.*]] = add <2 x i32> [[TMP37]], [[TMP52]]
-; CHECK-NEXT:    [[TMP63:%.*]] = add <2 x i32> [[TMP59]], [[TMP31]]
-; CHECK-NEXT:    [[TMP72:%.*]] = sub <2 x i32> [[TMP31]], [[TMP59]]
-; CHECK-NEXT:    [[TMP73:%.*]] = extractelement <2 x i32> [[TMP63]], i32 0
-; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <2 x i32> [[TMP63]], i32 1
+; CHECK-NEXT:    [[TMP42:%.*]] = add <2 x i32> [[TMP37]], [[TMP35]]
+; CHECK-NEXT:    [[TMP43:%.*]] = add <2 x i32> [[TMP42]], [[TMP30]]
+; CHECK-NEXT:    [[TMP44:%.*]] = sub <2 x i32> [[TMP30]], [[TMP42]]
+; CHECK-NEXT:    [[TMP73:%.*]] = extractelement <2 x i32> [[TMP43]], i32 0
+; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <2 x i32> [[TMP43]], i32 1
 ; CHECK-NEXT:    [[ADD48_2:%.*]] = add i32 [[TMP34]], [[TMP73]]
 ; CHECK-NEXT:    [[SUB51_2:%.*]] = sub i32 [[TMP73]], [[TMP34]]
-; CHECK-NEXT:    [[TMP47:%.*]] = extractelement <2 x i32> [[TMP72]], i32 0
-; CHECK-NEXT:    [[TMP48:%.*]] = extractelement <2 x i32> [[TMP72]], i32 1
+; CHECK-NEXT:    [[TMP47:%.*]] = extractelement <2 x i32> [[TMP44]], i32 0
+; CHECK-NEXT:    [[TMP48:%.*]] = extractelement <2 x i32> [[TMP44]], i32 1
 ; CHECK-NEXT:    [[ADD55_2:%.*]] = add i32 [[TMP48]], [[TMP47]]
-; CHECK-NEXT:    [[SUB59_2:%.*]] = sub i32 [[TMP47]], [[TMP48]]
+; CHECK-NEXT:    [[TMP68:%.*]] = sub i32 [[TMP47]], [[TMP48]]
 ; CHECK-NEXT:    [[ARRAYIDX3_3:%.*]] = getelementptr i8, ptr null, i64 4
 ; CHECK-NEXT:    [[ARRAYIDX5_3:%.*]] = getelementptr i8, ptr null, i64 4
 ; CHECK-NEXT:    [[ARRAYIDX8_3:%.*]] = getelementptr i8, ptr null, i64 1
 ; CHECK-NEXT:    [[ARRAYIDX10_3:%.*]] = getelementptr i8, ptr null, i64 1
-; CHECK-NEXT:    [[TMP44:%.*]] = load i8, ptr null, align 1
 ; CHECK-NEXT:    [[ARRAYIDX15_3:%.*]] = getelementptr i8, ptr null, i64 5
-; CHECK-NEXT:    [[TMP43:%.*]] = load i8, ptr null, align 1
 ; CHECK-NEXT:    [[TMP53:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT:    [[TMP76:%.*]] = load i8, ptr null, align 1
-; CHECK-NEXT:    [[TMP55:%.*]] = zext <2 x i8> [[TMP53]] to <2 x i32>
-; CHECK-NEXT:    [[TMP77:%.*]] = zext i8 [[TMP76]] to i32
+; CHECK-NEXT:    [[TMP52:%.*]] = load i8, ptr null, align 1
+; CHECK-NEXT:    [[TMP62:%.*]] = zext <2 x i8> [[TMP53]] to <2 x i32>
+; CHECK-NEXT:    [[TMP77:%.*]] = zext i8 [[TMP52]] to i32
 ; CHECK-NEXT:    [[TMP54:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT:    [[TMP57:%.*]] = zext <2 x i8> [[TMP54]] to <2 x i32>
-; CHECK-NEXT:    [[TMP58:%.*]] = sub <2 x i32> [[TMP55]], [[TMP57]]
+; CHECK-NEXT:    [[TMP55:%.*]] = zext <2 x i8> [[TMP54]] to <2 x i32>
+; CHECK-NEXT:    [[TMP59:%.*]] = sub <2 x i32> [[TMP62]], [[TMP55]]
 ; CHECK-NEXT:    [[TMP41:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX3_3]], i64 -4, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT:    [[TMP60:%.*]] = zext <2 x i8> [[TMP41]] to <2 x i32>
-; CHECK-NEXT:    [[TMP61:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT:    [[TMP62:%.*]] = zext <2 x i8> [[TMP61]] to <2 x i32>
-; CHECK-NEXT:    [[TMP45:%.*]] = sub <2 x i32> [[TMP60]], [[TMP62]]
+; CHECK-NEXT:    [[TMP58:%.*]] = zext <2 x i8> [[TMP41]] to <2 x i32>
+; CHECK-NEXT:    [[TMP63:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT:    [[TMP76:%.*]] = zext <2 x i8> [[TMP63]] to <2 x i32>
+; CHECK-NEXT:    [[TMP45:%.*]] = sub <2 x i32> [[TMP58]], [[TMP76]]
 ; CHECK-NEXT:    [[TMP46:%.*]] = shl <2 x i32> [[TMP45]], <i32 16, i32 16>
-; CHECK-NEXT:    [[TMP82:%.*]] = add <2 x i32> [[TMP46]], [[TMP58]]
-; CHECK-NEXT:    [[TMP85:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX8_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT:    [[TMP94:%.*]] = zext <2 x i8> [[TMP85]] to <2 x i32>
-; CHECK-NEXT:    [[TMP68:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX10_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT:    [[TMP103:%.*]] = zext <2 x i8> [[TMP68]] to <2 x i32>
-; CHECK-NEXT:    [[TMP106:%.*]] = sub <2 x i32> [[TMP94]], [[TMP103]]
-; CHECK-NEXT:    [[TMP64:%.*]] = insertelement <2 x i8> poison, i8 [[TMP44]], i32 0
-; CHECK-NEXT:    [[TMP65:%.*]] = insertelement <2 x i8> [[TMP64]], i8 [[TMP43]], i32 1
-; CHECK-NEXT:    [[TMP108:%.*]] = zext <2 x i8> [[TMP65]] to <2 x i32>
-; CHECK-NEXT:    [[TMP74:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX15_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT:    [[TMP75:%.*]] = zext <2 x i8> [[TMP74]] to <2 x i32>
-; CHECK-NEXT:    [[TMP69:%.*]] = sub <2 x i32> [[TMP108]], [[TMP75]]
+; CHECK-NEXT:    [[TMP60:%.*]] = add <2 x i32> [[TMP46]], [[TMP59]]
+; CHECK-NEXT:    [[TMP64:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX8_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT:    [[TMP79:%.*]] = zext <2 x i8> [[TMP64]] to <2 x i32>
+; CHECK-NEXT:    [[TMP82:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX10_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT:    [[TMP91:%.*]] = zext <2 x i8> [[TMP82]] to <2 x i32>
+; CHECK-NEXT:    [[TMP65:%.*]] = sub <2 x i32> [[TMP79]], [[TMP91]]
+; CHECK-NEXT:    [[TMP75:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> zeroinitializer, i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT:    [[TMP98:%.*]] = zext <2 x i8> [[TMP75]] to <2 x i32>
+; CHECK-NEXT:    [[TMP100:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX15_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT:    [[TMP103:%.*]] = zext <2 x i8> [[TMP100]] to <2 x i32>
+; CHECK-NEXT:    [[TMP69:%.*]] = sub <2 x i32> [[TMP98]], [[TMP103]]
 ; CHECK-NEXT:    [[TMP70:%.*]] = shl <2 x i32> [[TMP69]], <i32 16, i32 16>
-; CHECK-NEXT:    [[TMP109:%.*]] = add <2 x i32> [[TMP70]], [[TMP106]]
-; CHECK-NEXT:    [[TMP79:%.*]] = add <2 x i32> [[TMP109]], [[TMP82]]
-; CHECK-NEXT:    [[TMP115:%.*]] = sub <2 x i32> [[TMP82]], [[TMP109]]
-; CHECK-NEXT:    [[TMP78:%.*]] = extractelement <2 x i32> [[TMP79]], i32 0
-; CHECK-NEXT:    [[TMP71:%.*]] = extractelement <2 x i32> [[TMP79]], i32 1
+; CHECK-NEXT:    [[TMP72:%.*]] = add <2 x i32> [[TMP70]], [[TMP65]]
+; CHECK-NEXT:    [[TMP90:%.*]] = add <2 x i32> [[TMP72]], [[TMP60]]
+; CHECK-NEXT:    [[TMP74:%.*]] = sub <2 x i32> [[TMP60]], [[TMP72]]
+; CHECK-NEXT:    [[TMP78:%.*]] = extractelement <2 x i32> [[TMP90]], i32 0
+; CHECK-NEXT:    [[TMP71:%.*]] = extractelement <2 x i32> [[TMP90]], i32 1
 ; CHECK-NEXT:    [[ADD48_3:%.*]] = add i32 [[TMP71]], [[TMP78]]
 ; CHECK-NEXT:    [[SUB51_3:%.*]] = sub i32 [[TMP78]], [[TMP71]]
-; CHECK-NEXT:    [[TMP80:%.*]] = extractelement <2 x i32> [[TMP115]], i32 0
-; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <2 x i32> [[TMP115]], i32 1
+; CHECK-NEXT:    [[TMP80:%.*]] = extractelement <2 x i32> [[TMP74]], i32 0
+; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <2 x i32> [[TMP74]], i32 1
 ; CHECK-NEXT:    [[ADD55_3:%.*]] = add i32 [[TMP81]], [[TMP80]]
-; CHECK-NEXT:    [[SUB59_3:%.*]] = sub i32 [[TMP80]], [[TMP81]]
+; CHECK-NEXT:    [[TMP107:%.*]] = sub i32 [[TMP80]], [[TMP81]]
 ; CHECK-NEXT:    [[ADD94:%.*]] = add i32 [[ADD48_3]], [[ADD48_2]]
 ; CHECK-NEXT:    [[SUB102:%.*]] = sub i32 [[ADD48_2]], [[ADD48_3]]
 ; CHECK-NEXT:    [[SHR_I49_3:%.*]] = lshr i32 [[TMP77]], 15
@@ -151,39 +148,39 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[SHR_I49_1:%.*]] = lshr i32 [[TMP84]], 15
 ; CHECK-NEXT:    [[AND_I50_1:%.*]] = and i32 [[SHR_I49_1]], 65537
 ; CHECK-NEXT:    [[ADD94_2:%.*]] = mul i32 [[AND_I50_1]], 65535
-; CHECK-NEXT:    [[ADD94_6:%.*]] = add i32 [[SUB51_3]], [[SUB51_2]]
+; CHECK-NEXT:    [[ADD94_4:%.*]] = add i32 [[SUB51_3]], [[SUB51_2]]
 ; CHECK-NEXT:    [[SHR_I49_2:%.*]] = lshr i32 [[CONV_1]], 15
 ; CHECK-NEXT:    [[AND_I50_2:%.*]] = and i32 [[SHR_I49_2]], 65537
 ; CHECK-NEXT:    [[MUL_I51_2:%.*]] = mul i32 [[AND_I50_2]], 65535
-; CHECK-NEXT:    [[ADD94_5:%.*]] = add i32 [[SUB59_3]], [[SUB59_2]]
-; CHECK-NEXT:    [[SUB102_3:%.*]] = sub i32 [[SUB59_2]], [[SUB59_3]]
+; CHECK-NEXT:    [[ADD94_3:%.*]] = add i32 [[TMP107]], [[TMP68]]
+; CHECK-NEXT:    [[SUB102_3:%.*]] = sub i32 [[TMP68]], [[TMP107]]
 ; CHECK-NEXT:    [[SHR_I49_4:%.*]] = lshr i32 [[CONV1]], 15
 ; CHECK-NEXT:    [[AND_I50_4:%.*]] = and i32 [[SHR_I49_4]], 65537
 ; CHECK-NEXT:    [[MUL_I51_4:%.*]] = mul i32 [[AND_I50_4]], 65535
 ; CHECK-NEXT:    [[TMP66:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8]], align 1
 ; CHECK-NEXT:    [[TMP102:%.*]] = zext <2 x i8> [[TMP66]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP67:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[PIX2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT:    [[TMP112:%.*]] = zext <2 x i8> [[TMP67]] to <2 x i32>
-; CHECK-NEXT:    [[TMP89:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[TMP1]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT:    [[TMP90:%.*]] = zext <2 x i8> [[TMP89]] to <2 x i32>
-; CHECK-NEXT:    [[TMP91:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT:    [[TMP117:%.*]] = zext <2 x i8> [[TMP91]] to <2 x i32>
-; CHECK-NEXT:    [[TMP87:%.*]] = sub <2 x i32> [[TMP90]], [[TMP117]]
+; CHECK-NEXT:    [[TMP85:%.*]] = zext <2 x i8> [[TMP67]] to <2 x i32>
+; CHECK-NEXT:    [[TMP106:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[TMP1]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT:    [[TMP108:%.*]] = zext <2 x i8> [[TMP106]] to <2 x i32>
+; CHECK-NEXT:    [[TMP109:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT:    [[TMP89:%.*]] = zext <2 x i8> [[TMP109]] to <2 x i32>
+; CHECK-NEXT:    [[TMP87:%.*]] = sub <2 x i32> [[TMP108]], [[TMP89]]
 ; CHECK-NEXT:    [[TMP88:%.*]] = shl <2 x i32> [[TMP87]], <i32 16, i32 16>
-; CHECK-NEXT:    [[TMP119:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX22]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT:    [[TMP121:%.*]] = zext <2 x i8> [[TMP119]] to <2 x i32>
-; CHECK-NEXT:    [[TMP128:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX25]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT:    [[TMP131:%.*]] = zext <2 x i8> [[TMP128]] to <2 x i32>
-; CHECK-NEXT:    [[TMP132:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX27]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT:    [[TMP100:%.*]] = zext <2 x i8> [[TMP132]] to <2 x i32>
-; CHECK-NEXT:    [[TMP95:%.*]] = sub <2 x i32> [[TMP131]], [[TMP100]]
+; CHECK-NEXT:    [[TMP112:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX22]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT:    [[TMP120:%.*]] = zext <2 x i8> [[TMP112]] to <2 x i32>
+; CHECK-NEXT:    [[TMP94:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX25]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT:    [[TMP128:%.*]] = zext <2 x i8> [[TMP94]] to <2 x i32>
+; CHECK-NEXT:    [[TMP131:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX27]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT:    [[TMP132:%.*]] = zext <2 x i8> [[TMP131]] to <2 x i32>
+; CHECK-NEXT:    [[TMP95:%.*]] = sub <2 x i32> [[TMP128]], [[TMP132]]
 ; CHECK-NEXT:    [[TMP96:%.*]] = shl <2 x i32> [[TMP95]], <i32 16, i32 16>
 ; CHECK-NEXT:    [[TMP97:%.*]] = insertelement <2 x i32> [[TMP102]], i32 [[CONV33]], i32 1
-; CHECK-NEXT:    [[TMP133:%.*]] = sub <2 x i32> [[TMP97]], [[TMP121]]
-; CHECK-NEXT:    [[TMP105:%.*]] = add <2 x i32> [[TMP96]], [[TMP133]]
+; CHECK-NEXT:    [[TMP117:%.*]] = sub <2 x i32> [[TMP97]], [[TMP120]]
+; CHECK-NEXT:    [[TMP105:%.*]] = add <2 x i32> [[TMP96]], [[TMP117]]
 ; CHECK-NEXT:    [[TMP86:%.*]] = insertelement <2 x i32> [[TMP102]], i32 [[CONV1]], i32 0
-; CHECK-NEXT:    [[TMP107:%.*]] = sub <2 x i32> [[TMP86]], [[TMP112]]
-; CHECK-NEXT:    [[TMP92:%.*]] = add <2 x i32> [[TMP88]], [[TMP107]]
+; CHECK-NEXT:    [[TMP119:%.*]] = sub <2 x i32> [[TMP86]], [[TMP85]]
+; CHECK-NEXT:    [[TMP92:%.*]] = add <2 x i32> [[TMP88]], [[TMP119]]
 ; CHECK-NEXT:    [[TMP93:%.*]] = shufflevector <2 x i32> [[TMP105]], <2 x i32> [[TMP92]], <2 x i32> <i32 0, i32 2>
 ; CHECK-NEXT:    [[TMP101:%.*]] = sub <2 x i32> [[TMP92]], [[TMP105]]
 ; CHECK-NEXT:    [[TMP111:%.*]] = extractelement <2 x i32> [[TMP101]], i32 0
@@ -195,22 +192,22 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[TMP104:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8_1]], align 1
 ; CHECK-NEXT:    [[TMP113:%.*]] = zext <2 x i8> [[TMP104]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP114:%.*]] = load <2 x i8>, ptr [[ADD_PTR644]], align 1
-; CHECK-NEXT:    [[TMP134:%.*]] = zext <2 x i8> [[TMP114]] to <2 x i32>
+; CHECK-NEXT:    [[TMP133:%.*]] = zext <2 x i8> [[TMP114]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP116:%.*]] = load <2 x i8>, ptr [[ARRAYIDX3_1]], align 1
-; CHECK-NEXT:    [[TMP145:%.*]] = zext <2 x i8> [[TMP116]] to <2 x i32>
+; CHECK-NEXT:    [[TMP115:%.*]] = zext <2 x i8> [[TMP116]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP118:%.*]] = load <2 x i8>, ptr [[ARRAYIDX5_1]], align 1
-; CHECK-NEXT:    [[TMP120:%.*]] = zext <2 x i8> [[TMP118]] to <2 x i32>
-; CHECK-NEXT:    [[TMP124:%.*]] = sub <2 x i32> [[TMP145]], [[TMP120]]
+; CHECK-NEXT:    [[TMP134:%.*]] = zext <2 x i8> [[TMP118]] to <2 x i32>
+; CHECK-NEXT:    [[TMP124:%.*]] = sub <2 x i32> [[TMP115]], [[TMP134]]
 ; CHECK-NEXT:    [[TMP125:%.*]] = shl <2 x i32> [[TMP124]], <i32 16, i32 16>
 ; CHECK-NEXT:    [[TMP122:%.*]] = shufflevector <2 x i32> [[TMP113]], <2 x i32> poison, <2 x i32> <i32 poison, i32 0>
 ; CHECK-NEXT:    [[TMP123:%.*]] = insertelement <2 x i32> [[TMP122]], i32 [[CONV_1]], i32 0
-; CHECK-NEXT:    [[TMP157:%.*]] = sub <2 x i32> [[TMP123]], [[TMP134]]
-; CHECK-NEXT:    [[TMP156:%.*]] = add <2 x i32> [[TMP125]], [[TMP157]]
+; CHECK-NEXT:    [[TMP121:%.*]] = sub <2 x i32> [[TMP123]], [[TMP133]]
+; CHECK-NEXT:    [[TMP156:%.*]] = add <2 x i32> [[TMP125]], [[TMP121]]
 ; CHECK-NEXT:    [[TMP126:%.*]] = shufflevector <2 x i8> [[TMP7]], <2 x i8> poison, <2 x i32> <i32 1, i32 poison>
 ; CHECK-NEXT:    [[TMP127:%.*]] = insertelement <2 x i8> [[TMP126]], i8 [[TMP14]], i32 1
-; CHECK-NEXT:    [[TMP158:%.*]] = zext <2 x i8> [[TMP127]] to <2 x i32>
+; CHECK-NEXT:    [[TMP191:%.*]] = zext <2 x i8> [[TMP127]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP129:%.*]] = insertelement <2 x i32> [[TMP113]], i32 [[TMP9]], i32 0
-; CHECK-NEXT:    [[TMP130:%.*]] = sub <2 x i32> [[TMP129]], [[TMP158]]
+; CHECK-NEXT:    [[TMP130:%.*]] = sub <2 x i32> [[TMP129]], [[TMP191]]
 ; CHECK-NEXT:    [[TMP135:%.*]] = insertelement <2 x i32> [[TMP130]], i32 [[TMP16]], i32 1
 ; CHECK-NEXT:    [[TMP136:%.*]] = shl <2 x i32> [[TMP135]], <i32 16, i32 16>
 ; CHECK-NEXT:    [[TMP110:%.*]] = shufflevector <2 x i32> [[TMP130]], <2 x i32> poison, <2 x i32> <i32 1, i32 poison>
@@ -224,34 +221,34 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[SUB47_1:%.*]] = sub i32 [[TMP138]], [[TMP171]]
 ; CHECK-NEXT:    [[TMP140:%.*]] = shufflevector <2 x i32> [[TMP156]], <2 x i32> [[TMP105]], <2 x i32> <i32 1, i32 2>
 ; CHECK-NEXT:    [[TMP153:%.*]] = shufflevector <2 x i32> [[TMP156]], <2 x i32> [[TMP92]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[TMP163:%.*]] = add <2 x i32> [[TMP140]], [[TMP153]]
-; CHECK-NEXT:    [[TMP143:%.*]] = shufflevector <2 x i32> [[TMP105]], <2 x i32> [[TMP155]], <2 x i32> <i32 3, i32 1>
-; CHECK-NEXT:    [[TMP144:%.*]] = shufflevector <2 x i32> [[TMP92]], <2 x i32> [[TMP155]], <2 x i32> <i32 2, i32 1>
-; CHECK-NEXT:    [[TMP165:%.*]] = add <2 x i32> [[TMP143]], [[TMP144]]
-; CHECK-NEXT:    [[TMP98:%.*]] = extractelement <2 x i32> [[TMP163]], i32 1
-; CHECK-NEXT:    [[TMP146:%.*]] = extractelement <2 x i32> [[TMP165]], i32 1
-; CHECK-NEXT:    [[ADD48:%.*]] = add i32 [[TMP146]], [[TMP98]]
-; CHECK-NEXT:    [[SHR_I54_1:%.*]] = lshr i32 [[TMP146]], 15
-; CHECK-NEXT:    [[AND_I55_1:%.*]] = and i32 [[SHR_I54_1]], 65537
-; CHECK-NEXT:    [[MUL_I56_1:%.*]] = mul i32 [[AND_I55_1]], 65535
-; CHECK-NEXT:    [[TMP167:%.*]] = extractelement <2 x i32> [[TMP163]], i32 0
-; CHECK-NEXT:    [[TMP166:%.*]] = extractelement <2 x i32> [[TMP165]], i32 0
-; CHECK-NEXT:    [[ADD48_1:%.*]] = add i32 [[TMP166]], [[TMP167]]
-; CHECK-NEXT:    [[TMP141:%.*]] = sub <2 x i32> [[TMP163]], [[TMP165]]
+; CHECK-NEXT:    [[TMP192:%.*]] = add <2 x i32> [[TMP140]], [[TMP153]]
+; CHECK-NEXT:    [[TMP141:%.*]] = shufflevector <2 x i32> [[TMP105]], <2 x i32> [[TMP155]], <2 x i32> <i32 3, i32 1>
+; CHECK-NEXT:    [[TMP193:%.*]] = shufflevector <2 x i32> [[TMP92]], <2 x i32> [[TMP155]], <2 x i32> <i32 2, i32 1>
+; CHECK-NEXT:    [[TMP143:%.*]] = add <2 x i32> [[TMP141]], [[TMP193]]
+; CHECK-NEXT:    [[TMP144:%.*]] = extractelement <2 x i32> [[TMP192]], i32 1
+; CHECK-NEXT:    [[TMP145:%.*]] = extractelement <2 x i32> [[TMP143]], i32 1
+; CHECK-NEXT:    [[ADD48:%.*]] = add i32 [[TMP145]], [[TMP144]]
+; CHECK-NEXT:    [[SHR_I59:%.*]] = lshr i32 [[TMP145]], 15
+; CHECK-NEXT:    [[AND_I60:%.*]] = and i32 [[SHR_I59]], 65537
+; CHECK-NEXT:    [[MUL_I61:%.*]] = mul i32 [[AND_I60]], 65535
+; CHECK-NEXT:    [[TMP146:%.*]] = extractelement <2 x i32> [[TMP192]], i32 0
+; CHECK-NEXT:    [[TMP147:%.*]] = extractelement <2 x i32> [[TMP143]], i32 0
+; CHECK-NEXT:    [[ADD48_1:%.*]] = add i32 [[TMP147]], [[TMP146]]
+; CHECK-NEXT:    [[TMP148:%.*]] = sub <2 x i32> [[TMP192]], [[TMP143]]
 ; CHECK-NEXT:    [[ADD55_1:%.*]] = add i32 [[SUB47_1]], [[SUB45_1]]
-; CHECK-NEXT:    [[TMP151:%.*]] = shufflevector <2 x i32> [[TMP101]], <2 x i32> poison, <2 x i32> <i32 poison, i32 0>
-; CHECK-NEXT:    [[TMP152:%.*]] = insertelement <2 x i32> [[TMP151]], i32 [[SUB45_1]], i32 0
-; CHECK-NEXT:    [[TMP154:%.*]] = insertelement <2 x i32> [[TMP101]], i32 [[SUB47_1]], i32 0
-; CHECK-NEXT:    [[TMP168:%.*]] = sub <2 x i32> [[TMP152]], [[TMP154]]
-; CHECK-NEXT:    [[SHR_I54:%.*]] = lshr i32 [[TMP166]], 15
+; CHECK-NEXT:    [[TMP149:%.*]] = shufflevector <2 x i32> [[TMP101]], <2 x i32> poison, <2 x i32> <i32 poison, i32 0>
+; CHECK-NEXT:    [[TMP150:%.*]] = insertelement <2 x i32> [[TMP149]], i32 [[SUB45_1]], i32 0
+; CHECK-NEXT:    [[TMP151:%.*]] = insertelement <2 x i32> [[TMP101]], i32 [[SUB47_1]], i32 0
+; CHECK-NEXT:    [[TMP152:%.*]] = sub <2 x i32> [[TMP150]], [[TMP151]]
+; CHECK-NEXT:    [[SHR_I54:%.*]] = lshr i32 [[TMP147]], 15
 ; CHECK-NEXT:    [[AND_I55:%.*]] = and i32 [[SHR_I54]], 65537
 ; CHECK-NEXT:    [[MUL_I56:%.*]] = mul i32 [[AND_I55]], 65535
-; CHECK-NEXT:    [[SHR_I54_2:%.*]] = lshr i32 [[SUB47_1]], 15
-; CHECK-NEXT:    [[AND_I55_2:%.*]] = and i32 [[SHR_I54_2]], 65537
-; CHECK-NEXT:    [[MUL_I56_2:%.*]] = mul i32 [[AND_I55_2]], 65535
-; CHECK-NEXT:    [[TMP147:%.*]] = lshr <2 x i32> [[TMP113]], <i32 15, i32 15>
-; CHECK-NEXT:    [[TMP148:%.*]] = and <2 x i32> [[TMP147]], <i32 65537, i32 65537>
-; CHECK-NEXT:    [[TMP149:%.*]] = mul <2 x i32> [[TMP148]], <i32 65535, i32 65535>
+; CHECK-NEXT:    [[SHR_I54_1:%.*]] = lshr i32 [[SUB47_1]], 15
+; CHECK-NEXT:    [[AND_I55_1:%.*]] = and i32 [[SHR_I54_1]], 65537
+; CHECK-NEXT:    [[MUL_I56_1:%.*]] = mul i32 [[AND_I55_1]], 65535
+; CHECK-NEXT:    [[TMP194:%.*]] = lshr <2 x i32> [[TMP113]], <i32 15, i32 15>
+; CHECK-NEXT:    [[TMP154:%.*]] = and <2 x i32> [[TMP194]], <i32 65537, i32 65537>
+; CHECK-NEXT:    [[TMP195:%.*]] = mul <2 x i32> [[TMP154]], <i32 65535, i32 65535>
 ; CHECK-NEXT:    [[ADD78:%.*]] = add i32 [[ADD48_1]], [[ADD48]]
 ; CHECK-NEXT:    [[SUB86:%.*]] = sub i32 [[ADD48]], [[ADD48_1]]
 ; CHECK-NEXT:    [[ADD103:%.*]] = add i32 [[ADD94]], [[ADD78]]
@@ -263,9 +260,9 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[ADD_I52:%.*]] = add i32 [[MUL_I_1]], [[ADD105]]
 ; CHECK-NEXT:    [[XOR_I53:%.*]] = xor i32 [[ADD_I52]], [[TMP34]]
 ; CHECK-NEXT:    [[ADD_I57:%.*]] = add i32 [[MUL_I56]], [[SUB104]]
-; CHECK-NEXT:    [[XOR_I58:%.*]] = xor i32 [[ADD_I57]], [[TMP166]]
-; CHECK-NEXT:    [[ADD_I62:%.*]] = add i32 [[MUL_I56_1]], [[SUB106]]
-; CHECK-NEXT:    [[XOR_I63:%.*]] = xor i32 [[ADD_I62]], [[TMP146]]
+; CHECK-NEXT:    [[XOR_I58:%.*]] = xor i32 [[ADD_I57]], [[TMP147]]
+; CHECK-NEXT:    [[ADD_I62:%.*]] = add i32 [[MUL_I61]], [[SUB106]]
+; CHECK-NEXT:    [[XOR_I63:%.*]] = xor i32 [[ADD_I62]], [[TMP145]]
 ; CHECK-NEXT:    [[ADD110:%.*]] = add i32 [[XOR_I53]], [[XOR_I]]
 ; CHECK-NEXT:    [[ADD112:%.*]] = add i32 [[ADD110]], [[XOR_I58]]
 ; CHECK-NEXT:    [[ADD113:%.*]] = add i32 [[ADD112]], [[XOR_I63]]
@@ -275,11 +272,11 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[SUB104_1:%.*]] = sub i32 [[ADD78_1]], [[ADD94_1]]
 ; CHECK-NEXT:    [[ADD105_1:%.*]] = add i32 [[SUB102_1]], [[SUB86_1]]
 ; CHECK-NEXT:    [[SUB106_1:%.*]] = sub i32 [[SUB86_1]], [[SUB102_1]]
-; CHECK-NEXT:    [[TMP203:%.*]] = add i32 [[MUL_I_2]], [[ADD103_1]]
-; CHECK-NEXT:    [[XOR_I_1:%.*]] = xor i32 [[TMP203]], [[TMP83]]
+; CHECK-NEXT:    [[ADD_I_1:%.*]] = add i32 [[MUL_I_2]], [[ADD103_1]]
+; CHECK-NEXT:    [[XOR_I_1:%.*]] = xor i32 [[ADD_I_1]], [[TMP83]]
 ; CHECK-NEXT:    [[ADD_I52_1:%.*]] = add i32 [[ADD94_2]], [[ADD105_1]]
 ; CHECK-NEXT:    [[XOR_I53_1:%.*]] = xor i32 [[ADD_I52_1]], [[TMP84]]
-; CHECK-NEXT:    [[ADD_I57_1:%.*]] = add i32 [[MUL_I56_2]], [[SUB104_1]]
+; CHECK-NEXT:    [[ADD_I57_1:%.*]] = add i32 [[MUL_I56_1]], [[SUB104_1]]
 ; CHECK-NEXT:    [[XOR_I58_1:%.*]] = xor i32 [[ADD_I57_1]], [[SUB47_1]]
 ; CHECK-NEXT:    [[ADD_I62_1:%.*]] = add i32 [[MUL_I61_1]], [[SUB106_1]]
 ; CHECK-NEXT:    [[XOR_I63_1:%.*]] = xor i32 [[ADD_I62_1]], [[TMP99]]
@@ -287,69 +284,69 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[ADD110_1:%.*]] = add i32 [[ADD108_1]], [[XOR_I_1]]
 ; CHECK-NEXT:    [[ADD112_1:%.*]] = add i32 [[ADD110_1]], [[XOR_I58_1]]
 ; CHECK-NEXT:    [[ADD113_1:%.*]] = add i32 [[ADD112_1]], [[XOR_I63_1]]
-; CHECK-NEXT:    [[TMP169:%.*]] = extractelement <2 x i32> [[TMP141]], i32 0
-; CHECK-NEXT:    [[TMP160:%.*]] = extractelement <2 x i32> [[TMP141]], i32 1
-; CHECK-NEXT:    [[ADD78_2:%.*]] = add i32 [[TMP169]], [[TMP160]]
-; CHECK-NEXT:    [[TMP196:%.*]] = insertelement <2 x i32> [[TMP141]], i32 [[SUB51_2]], i32 0
-; CHECK-NEXT:    [[TMP194:%.*]] = shufflevector <2 x i32> [[TMP141]], <2 x i32> poison, <2 x i32> <i32 poison, i32 0>
-; CHECK-NEXT:    [[TMP195:%.*]] = insertelement <2 x i32> [[TMP194]], i32 [[SUB51_3]], i32 0
-; CHECK-NEXT:    [[TMP164:%.*]] = sub <2 x i32> [[TMP196]], [[TMP195]]
-; CHECK-NEXT:    [[TMP201:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_2]], i32 0
-; CHECK-NEXT:    [[TMP198:%.*]] = shufflevector <2 x i32> [[TMP201]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP199:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_6]], i32 0
-; CHECK-NEXT:    [[TMP200:%.*]] = shufflevector <2 x i32> [[TMP199]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP225:%.*]] = add <2 x i32> [[TMP198]], [[TMP200]]
-; CHECK-NEXT:    [[TMP226:%.*]] = sub <2 x i32> [[TMP198]], [[TMP200]]
-; CHECK-NEXT:    [[TMP227:%.*]] = shufflevector <2 x i32> [[TMP225]], <2 x i32> [[TMP226]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP204:%.*]] = extractelement <2 x i32> [[TMP164]], i32 0
-; CHECK-NEXT:    [[TMP212:%.*]] = extractelement <2 x i32> [[TMP164]], i32 1
-; CHECK-NEXT:    [[ADD105_2:%.*]] = add i32 [[TMP204]], [[TMP212]]
-; CHECK-NEXT:    [[SUB106_2:%.*]] = sub i32 [[TMP212]], [[TMP204]]
+; CHECK-NEXT:    [[TMP196:%.*]] = extractelement <2 x i32> [[TMP148]], i32 0
+; CHECK-NEXT:    [[TMP157:%.*]] = extractelement <2 x i32> [[TMP148]], i32 1
+; CHECK-NEXT:    [[ADD78_2:%.*]] = add i32 [[TMP196]], [[TMP157]]
+; CHECK-NEXT:    [[TMP158:%.*]] = insertelement <2 x i32> [[TMP148]], i32 [[SUB51_2]], i32 0
+; CHECK-NEXT:    [[TMP159:%.*]] = shufflevector <2 x i32> [[TMP148]], <2 x i32> poison, <2 x i32> <i32 poison, i32 0>
+; CHECK-NEXT:    [[TMP160:%.*]] = insertelement <2 x i32> [[TMP159]], i32 [[SUB51_3]], i32 0
+; CHECK-NEXT:    [[TMP161:%.*]] = sub <2 x i32> [[TMP158]], [[TMP160]]
+; CHECK-NEXT:    [[TMP162:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_2]], i32 0
+; CHECK-NEXT:    [[TMP163:%.*]] = shufflevector <2 x i32> [[TMP162]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP164:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_4]], i32 0
+; CHECK-NEXT:    [[TMP165:%.*]] = shufflevector <2 x i32> [[TMP164]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP166:%.*]] = add <2 x i32> [[TMP163]], [[TMP165]]
+; CHECK-NEXT:    [[TMP167:%.*]] = sub <2 x i32> [[TMP163]], [[TMP165]]
+; CHECK-NEXT:    [[TMP168:%.*]] = shufflevector <2 x i32> [[TMP166]], <2 x i32> [[TMP167]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP169:%.*]] = extractelement <2 x i32> [[TMP161]], i32 0
+; CHECK-NEXT:    [[TMP170:%.*]] = extractelement <2 x i32> [[TMP161]], i32 1
+; CHECK-NEXT:    [[ADD105_2:%.*]] = add i32 [[TMP169]], [[TMP170]]
+; CHECK-NEXT:    [[SUB106_2:%.*]] = sub i32 [[TMP170]], [[TMP169]]
 ; CHECK-NEXT:    [[ADD_I52_2:%.*]] = add i32 [[MUL_I51_2]], [[ADD105_2]]
 ; CHECK-NEXT:    [[XOR_I53_2:%.*]] = xor i32 [[ADD_I52_2]], [[CONV_1]]
-; CHECK-NEXT:    [[TMP207:%.*]] = add <2 x i32> [[TMP149]], [[TMP227]]
-; CHECK-NEXT:    [[TMP213:%.*]] = xor <2 x i32> [[TMP207]], [[TMP113]]
-; CHECK-NEXT:    [[SHR_I59_2:%.*]] = lshr i32 [[TMP98]], 15
+; CHECK-NEXT:    [[TMP197:%.*]] = add <2 x i32> [[TMP195]], [[TMP168]]
+; CHECK-NEXT:    [[TMP172:%.*]] = xor <2 x i32> [[TMP197]], [[TMP113]]
+; CHECK-NEXT:    [[SHR_I59_2:%.*]] = lshr i32 [[TMP144]], 15
 ; CHECK-NEXT:    [[AND_I60_2:%.*]] = and i32 [[SHR_I59_2]], 65537
 ; CHECK-NEXT:    [[MUL_I61_2:%.*]] = mul i32 [[AND_I60_2]], 65535
 ; CHECK-NEXT:    [[ADD_I62_2:%.*]] = add i32 [[MUL_I61_2]], [[SUB106_2]]
-; CHECK-NEXT:    [[XOR_I63_2:%.*]] = xor i32 [[ADD_I62_2]], [[TMP98]]
+; CHECK-NEXT:    [[XOR_I63_2:%.*]] = xor i32 [[ADD_I62_2]], [[TMP144]]
 ; CHECK-NEXT:    [[ADD108_2:%.*]] = add i32 [[XOR_I53_2]], [[ADD113_1]]
-; CHECK-NEXT:    [[TMP176:%.*]] = extractelement <2 x i32> [[TMP213]], i32 0
-; CHECK-NEXT:    [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[TMP176]]
-; CHECK-NEXT:    [[TMP177:%.*]] = extractelement <2 x i32> [[TMP213]], i32 1
-; CHECK-NEXT:    [[ADD112_2:%.*]] = add i32 [[ADD110_2]], [[TMP177]]
+; CHECK-NEXT:    [[TMP173:%.*]] = extractelement <2 x i32> [[TMP172]], i32 0
+; CHECK-NEXT:    [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[TMP173]]
+; CHECK-NEXT:    [[TMP174:%.*]] = extractelement <2 x i32> [[TMP172]], i32 1
+; CHECK-NEXT:    [[ADD112_2:%.*]] = add i32 [[ADD110_2]], [[TMP174]]
 ; CHECK-NEXT:    [[ADD113_2:%.*]] = add i32 [[ADD112_2]], [[XOR_I63_2]]
-; CHECK-NEXT:    [[SUB59_1:%.*]] = extractelement <2 x i32> [[TMP168]], i32 0
-; CHECK-NEXT:    [[SUB59:%.*]] = extractelement <2 x i32> [[TMP168]], i32 1
-; CHECK-NEXT:    [[ADD94_3:%.*]] = add i32 [[SUB59_1]], [[SUB59]]
-; CHECK-NEXT:    [[SUB86_3:%.*]] = sub i32 [[SUB59]], [[SUB59_1]]
-; CHECK-NEXT:    [[TMP223:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_3]], i32 0
-; CHECK-NEXT:    [[TMP224:%.*]] = shufflevector <2 x i32> [[TMP223]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP241:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_5]], i32 0
-; CHECK-NEXT:    [[TMP242:%.*]] = shufflevector <2 x i32> [[TMP241]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP261:%.*]] = add <2 x i32> [[TMP224]], [[TMP242]]
-; CHECK-NEXT:    [[TMP262:%.*]] = sub <2 x i32> [[TMP224]], [[TMP242]]
-; CHECK-NEXT:    [[TMP220:%.*]] = shufflevector <2 x i32> [[TMP261]], <2 x i32> [[TMP262]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP175:%.*]] = extractelement <2 x i32> [[TMP152]], i32 0
+; CHECK-NEXT:    [[TMP176:%.*]] = extractelement <2 x i32> [[TMP152]], i32 1
+; CHECK-NEXT:    [[ADD78_3:%.*]] = add i32 [[TMP175]], [[TMP176]]
+; CHECK-NEXT:    [[SUB86_3:%.*]] = sub i32 [[TMP176]], [[TMP175]]
+; CHECK-NEXT:    [[TMP177:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_3]], i32 0
+; CHECK-NEXT:    [[TMP178:%.*]] = shufflevector <2 x i32> [[TMP177]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP179:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_3]], i32 0
+; CHECK-NEXT:    [[TMP180:%.*]] = shufflevector <2 x i32> [[TMP179]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP181:%.*]] = add <2 x i32> [[TMP178]], [[TMP180]]
+; CHECK-NEXT:    [[TMP182:%.*]] = sub <2 x i32> [[TMP178]], [[TMP180]]
+; CHECK-NEXT:    [[TMP183:%.*]] = shufflevector <2 x i32> [[TMP181]], <2 x i32> [[TMP182]], <2 x i32> <i32 0, i32 3>
 ; CHECK-NEXT:    [[ADD105_3:%.*]] = add i32 [[SUB102_3]], [[SUB86_3]]
 ; CHECK-NEXT:    [[SUB106_3:%.*]] = sub i32 [[SUB86_3]], [[SUB102_3]]
 ; CHECK-NEXT:    [[ADD_I52_3:%.*]] = add i32 [[MUL_I51_4]], [[ADD105_3]]
 ; CHECK-NEXT:    [[XOR_I53_3:%.*]] = xor i32 [[ADD_I52_3]], [[CONV1]]
-; CHECK-NEXT:    [[TMP230:%.*]] = lshr <2 x i32> [[TMP102]], <i32 15, i32 15>
-; CHECK-NEXT:    [[TMP231:%.*]] = and <2 x i32> [[TMP230]], <i32 65537, i32 65537>
-; CHECK-NEXT:    [[TMP232:%.*]] = mul <2 x i32> [[TMP231]], <i32 65535, i32 65535>
-; CHECK-NEXT:    [[TMP150:%.*]] = add <2 x i32> [[TMP232]], [[TMP220]]
-; CHECK-NEXT:    [[TMP234:%.*]] = xor <2 x i32> [[TMP150]], [[TMP102]]
+; CHECK-NEXT:    [[TMP184:%.*]] = lshr <2 x i32> [[TMP102]], <i32 15, i32 15>
+; CHECK-NEXT:    [[TMP185:%.*]] = and <2 x i32> [[TMP184]], <i32 65537, i32 65537>
+; CHECK-NEXT:    [[TMP186:%.*]] = mul <2 x i32> [[TMP185]], <i32 65535, i32 65535>
+; CHECK-NEXT:    [[TMP187:%.*]] = add <2 x i32> [[TMP186]], [[TMP183]]
+; CHECK-NEXT:    [[TMP188:%.*]] = xor <2 x i32> [[TMP187]], [[TMP102]]
 ; CHECK-NEXT:    [[SHR_I59_3:%.*]] = lshr i32 [[CONV33]], 15
 ; CHECK-NEXT:    [[AND_I60_3:%.*]] = and i32 [[SHR_I59_3]], 65537
 ; CHECK-NEXT:    [[MUL_I61_3:%.*]] = mul i32 [[AND_I60_3]], 65535
 ; CHECK-NEXT:    [[ADD_I62_3:%.*]] = add i32 [[MUL_I61_3]], [[SUB106_3]]
 ; CHECK-NEXT:    [[XOR_I63_3:%.*]] = xor i32 [[ADD_I62_3]], [[CONV33]]
 ; CHECK-NEXT:    [[ADD108_3:%.*]] = add i32 [[XOR_I53_3]], [[ADD113_2]]
-; CHECK-NEXT:    [[TMP192:%.*]] = extractelement <2 x i32> [[TMP234]], i32 0
-; CHECK-NEXT:    [[ADD110_3:%.*]] = add i32 [[ADD108_3]], [[TMP192]]
-; CHECK-NEXT:    [[TMP193:%.*]] = extractelement <2 x i32> [[TMP234]], i32 1
-; CHECK-NEXT:    [[ADD112_3:%.*]] = add i32 [[ADD110_3]], [[TMP193]]
+; CHECK-NEXT:    [[TMP189:%.*]] = extractelement <2 x i32> [[TMP188]], i32 0
+; CHECK-NEXT:    [[ADD110_3:%.*]] = add i32 [[ADD108_3]], [[TMP189]]
+; CHECK-NEXT:    [[TMP190:%.*]] = extractelement <2 x i32> [[TMP188]], i32 1
+; CHECK-NEXT:    [[ADD112_3:%.*]] = add i32 [[ADD110_3]], [[TMP190]]
 ; CHECK-NEXT:    [[ADD113_3:%.*]] = add i32 [[ADD112_3]], [[XOR_I63_3]]
 ; CHECK-NEXT:    ret i32 [[ADD113_3]]
 ;
@@ -361,9 +358,9 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; THR15-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr i8, ptr [[PIX1]], i64 4
 ; THR15-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr i8, ptr [[PIX2]], i64 4
 ; THR15-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr i8, ptr [[PIX1]], i64 1
-; THR15-NEXT:    [[ARRAYIDX22:%.*]] = getelementptr i8, ptr [[PIX2]], i64 1
-; THR15-NEXT:    [[ARRAYIDX25:%.*]] = getelementptr i8, ptr [[PIX1]], i64 5
-; THR15-NEXT:    [[ARRAYIDX27:%.*]] = getelementptr i8, ptr [[PIX2]], i64 5
+; THR15-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr i8, ptr [[PIX2]], i64 1
+; THR15-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr i8, ptr [[PIX1]], i64 5
+; THR15-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr i8, ptr [[PIX2]], i64 5
 ; THR15-NEXT:    [[ARRAYIDX32:%.*]] = getelementptr i8, ptr [[PIX1]], i64 3
 ; THR15-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX32]], align 1
 ; THR15-NEXT:    [[CONV33:%.*]] = zext i8 [[TMP1]] to i32
@@ -374,8 +371,8 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; THR15-NEXT:    [[ARRAYIDX3_1:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 4
 ; THR15-NEXT:    [[ARRAYIDX5_1:%.*]] = getelementptr i8, ptr [[ADD_PTR644]], i64 4
 ; THR15-NEXT:    [[ARRAYIDX8_1:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 1
-; THR15-NEXT:    [[ARRAYIDX32_1:%.*]] = getelementptr i8, ptr [[ADD_PTR644]], i64 2
-; THR15-NEXT:    [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX32_1]], align 1
+; THR15-NEXT:    [[ARRAYIDX22_1:%.*]] = getelementptr i8, ptr [[ADD_PTR644]], i64 2
+; THR15-NEXT:    [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX22_1]], align 1
 ; THR15-NEXT:    [[ARRAYIDX25_1:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 6
 ; THR15-NEXT:    [[TMP4:%.*]] = load i8, ptr [[ARRAYIDX25_1]], align 1
 ; THR15-NEXT:    [[ARRAYIDX27_1:%.*]] = getelementptr i8, ptr [[ADD_PTR644]], i64 6
@@ -384,17 +381,17 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; THR15-NEXT:    [[TMP7:%.*]] = insertelement <2 x i8> [[TMP6]], i8 [[TMP5]], i32 1
 ; THR15-NEXT:    [[TMP8:%.*]] = extractelement <2 x i8> [[TMP7]], i32 0
 ; THR15-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP8]] to i32
-; THR15-NEXT:    [[ARRAYIDX32_2:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 3
+; THR15-NEXT:    [[ARRAYIDX32_1:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 3
 ; THR15-NEXT:    [[ARRAYIDX34_1:%.*]] = getelementptr i8, ptr [[ADD_PTR644]], i64 3
-; THR15-NEXT:    [[TMP10:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX32_2]], i64 4, <2 x i1> <i1 true, i1 true>, i32 2)
+; THR15-NEXT:    [[TMP10:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX32_1]], i64 4, <2 x i1> <i1 true, i1 true>, i32 2)
 ; THR15-NEXT:    [[TMP11:%.*]] = zext <2 x i8> [[TMP10]] to <2 x i16>
 ; THR15-NEXT:    [[TMP12:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX34_1]], i64 4, <2 x i1> <i1 true, i1 true>, i32 2)
-; THR15-NEXT:    [[TMP20:%.*]] = zext <2 x i8> [[TMP12]] to <2 x i16>
-; THR15-NEXT:    [[TMP40:%.*]] = sub <2 x i16> [[TMP11]], [[TMP20]]
-; THR15-NEXT:    [[TMP46:%.*]] = extractelement <2 x i16> [[TMP40]], i32 1
-; THR15-NEXT:    [[TMP16:%.*]] = sext i16 [[TMP46]] to i32
+; THR15-NEXT:    [[TMP13:%.*]] = zext <2 x i8> [[TMP12]] to <2 x i16>
+; THR15-NEXT:    [[TMP14:%.*]] = sub <2 x i16> [[TMP11]], [[TMP13]]
+; THR15-NEXT:    [[TMP15:%.*]] = extractelement <2 x i16> [[TMP14]], i32 1
+; THR15-NEXT:    [[TMP16:%.*]] = sext i16 [[TMP15]] to i32
 ; THR15-NEXT:    [[SHL42_1:%.*]] = shl i32 [[TMP16]], 16
-; THR15-NEXT:    [[TMP17:%.*]] = extractelement <2 x i16> [[TMP40]], i32 0
+; THR15-NEXT:    [[TMP17:%.*]] = extractelement <2 x i16> [[TMP14]], i32 0
 ; THR15-NEXT:    [[TMP18:%.*]] = sext i16 [[TMP17]] to i32
 ; THR15-NEXT:    [[ADD43_1:%.*]] = add i32 [[SHL42_1]], [[TMP18]]
 ; THR15-NEXT:    [[ADD_PTR_1:%.*]] = getelementptr i8, ptr [[ADD_PTR]], i64 [[IDX_EXT]]
@@ -402,94 +399,93 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; THR15-NEXT:    [[ARRAYIDX3_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 4
 ; THR15-NEXT:    [[ARRAYIDX5_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 4
 ; THR15-NEXT:    [[TMP19:%.*]] = load <2 x i8>, ptr [[ADD_PTR_1]], align 1
-; THR15-NEXT:    [[TMP66:%.*]] = zext <2 x i8> [[TMP19]] to <2 x i32>
+; THR15-NEXT:    [[TMP20:%.*]] = zext <2 x i8> [[TMP19]] to <2 x i32>
 ; THR15-NEXT:    [[TMP21:%.*]] = load <2 x i8>, ptr [[ADD_PTR64_1]], align 1
 ; THR15-NEXT:    [[TMP22:%.*]] = zext <2 x i8> [[TMP21]] to <2 x i32>
-; THR15-NEXT:    [[TMP23:%.*]] = sub <2 x i32> [[TMP66]], [[TMP22]]
+; THR15-NEXT:    [[TMP23:%.*]] = sub <2 x i32> [[TMP20]], [[TMP22]]
 ; THR15-NEXT:    [[TMP24:%.*]] = load <2 x i8>, ptr [[ARRAYIDX3_2]], align 1
-; THR15-NEXT:    [[TMP28:%.*]] = zext <2 x i8> [[TMP24]] to <2 x i32>
-; THR15-NEXT:    [[TMP29:%.*]] = load <2 x i8>, ptr [[ARRAYIDX5_2]], align 1
-; THR15-NEXT:    [[TMP30:%.*]] = zext <2 x i8> [[TMP29]] to <2 x i32>
-; THR15-NEXT:    [[TMP13:%.*]] = sub <2 x i32> [[TMP28]], [[TMP30]]
-; THR15-NEXT:    [[TMP14:%.*]] = shl <2 x i32> [[TMP13]], <i32 16, i32 16>
-; THR15-NEXT:    [[TMP15:%.*]] = add <2 x i32> [[TMP14]], [[TMP23]]
+; THR15-NEXT:    [[TMP25:%.*]] = zext <2 x i8> [[TMP24]] to <2 x i32>
+; THR15-NEXT:    [[TMP26:%.*]] = load <2 x i8>, ptr [[ARRAYIDX5_2]], align 1
+; THR15-NEXT:    [[TMP27:%.*]] = zext <2 x i8> [[TMP26]] to <2 x i32>
+; THR15-NEXT:    [[TMP28:%.*]] = sub <2 x i32> [[TMP25]], [[TMP27]]
+; THR15-NEXT:    [[TMP29:%.*]] = shl <2 x i32> [[TMP28]], <i32 16, i32 16>
+; THR15-NEXT:    [[TMP30:%.*]] = add <2 x i32> [[TMP29]], [[TMP23]]
 ; THR15-NEXT:    [[ARRAYIDX20_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 2
 ; THR15-NEXT:    [[ARRAYIDX22_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 2
 ; THR15-NEXT:    [[ARRAYIDX25_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 6
 ; THR15-NEXT:    [[ARRAYIDX27_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 6
 ; THR15-NEXT:    [[TMP31:%.*]] = load <2 x i8>, ptr [[ARRAYIDX20_2]], align 1
-; THR15-NEXT:    [[TMP47:%.*]] = zext <2 x i8> [[TMP31]] to <2 x i32>
+; THR15-NEXT:    [[TMP32:%.*]] = zext <2 x i8> [[TMP31]] to <2 x i32>
 ; THR15-NEXT:    [[TMP33:%.*]] = load <2 x i8>, ptr [[ARRAYIDX22_2]], align 1
-; THR15-NEXT:    [[TMP50:%.*]] = zext <2 x i8> [[TMP33]] to <2 x i32>
-; THR15-NEXT:    [[TMP35:%.*]] = sub <2 x i32> [[TMP47]], [[TMP50]]
+; THR15-NEXT:    [[TMP34:%.*]] = zext <2 x i8> [[TMP33]] to <2 x i32>
+; THR15-NEXT:    [[TMP35:%.*]] = sub <2 x i32> [[TMP32]], [[TMP34]]
 ; THR15-NEXT:    [[TMP36:%.*]] = load <2 x i8>, ptr [[ARRAYIDX25_2]], align 1
-; THR15-NEXT:    [[TMP53:%.*]] = zext <2 x i8> [[TMP36]] to <2 x i32>
+; THR15-NEXT:    [[TMP37:%.*]] = zext <2 x i8> [[TMP36]] to <2 x i32>
 ; THR15-NEXT:    [[TMP38:%.*]] = load <2 x i8>, ptr [[ARRAYIDX27_2]], align 1
 ; THR15-NEXT:    [[TMP39:%.*]] = zext <2 x i8> [[TMP38]] to <2 x i32>
-; THR15-NEXT:    [[TMP25:%.*]] = sub <2 x i32> [[TMP53]], [[TMP39]]
-; THR15-NEXT:    [[TMP26:%.*]] = shl <2 x i32> [[TMP25]], <i32 16, i32 16>
-; THR15-NEXT:    [[TMP27:%.*]] = add <2 x i32> [[TMP26]], [[TMP35]]
-; THR15-NEXT:    [[TMP68:%.*]] = extractelement <2 x i32> [[TMP15]], i32 0
-; THR15-NEXT:    [[TMP59:%.*]] = extractelement <2 x i32> [[TMP15]], i32 1
-; THR15-NEXT:    [[ADD44_2:%.*]] = add i32 [[TMP59]], [[TMP68]]
-; THR15-NEXT:    [[SUB45_2:%.*]] = sub i32 [[TMP68]], [[TMP59]]
-; THR15-NEXT:    [[TMP76:%.*]] = extractelement <2 x i32> [[TMP27]], i32 0
-; THR15-NEXT:    [[TMP60:%.*]] = extractelement <2 x i32> [[TMP27]], i32 1
-; THR15-NEXT:    [[ADD46_2:%.*]] = add i32 [[TMP60]], [[TMP76]]
-; THR15-NEXT:    [[SUB47_2:%.*]] = sub i32 [[TMP76]], [[TMP60]]
+; THR15-NEXT:    [[TMP40:%.*]] = sub <2 x i32> [[TMP37]], [[TMP39]]
+; THR15-NEXT:    [[TMP41:%.*]] = shl <2 x i32> [[TMP40]], <i32 16, i32 16>
+; THR15-NEXT:    [[TMP42:%.*]] = add <2 x i32> [[TMP41]], [[TMP35]]
+; THR15-NEXT:    [[TMP43:%.*]] = extractelement <2 x i32> [[TMP30]], i32 0
+; THR15-NEXT:    [[TMP44:%.*]] = extractelement <2 x i32> [[TMP30]], i32 1
+; THR15-NEXT:    [[ADD44_2:%.*]] = add i32 [[TMP44]], [[TMP43]]
+; THR15-NEXT:    [[SUB45_2:%.*]] = sub i32 [[TMP43]], [[TMP44]]
+; THR15-NEXT:    [[TMP45:%.*]] = extractelement <2 x i32> [[TMP42]], i32 0
+; THR15-NEXT:    [[TMP46:%.*]] = extractelement <2 x i32> [[TMP42]], i32 1
+; THR15-NEXT:    [[ADD46_2:%.*]] = add i32 [[TMP46]], [[TMP45]]
+; THR15-NEXT:    [[SUB47_2:%.*]] = sub i32 [[TMP45]], [[TMP46]]
 ; THR15-NEXT:    [[ADD48_2:%.*]] = add i32 [[ADD46_2]], [[ADD44_2]]
-; THR15-NEXT:    [[SUB51_2:%.*]] = sub i32 [[ADD44_2]], [[ADD46_2]]
 ; THR15-NEXT:    [[ADD55_2:%.*]] = add i32 [[SUB47_2]], [[SUB45_2]]
-; THR15-NEXT:    [[SUB59_2:%.*]] = sub i32 [[SUB45_2]], [[SUB47_2]]
 ; THR15-NEXT:    [[ARRAYIDX3_3:%.*]] = getelementptr i8, ptr null, i64 4
 ; THR15-NEXT:    [[ARRAYIDX5_3:%.*]] = getelementptr i8, ptr null, i64 4
-; THR15-NEXT:    [[TMP32:%.*]] = load <2 x i8>, ptr null, align 1
+; THR15-NEXT:    [[ARRAYIDX8_3:%.*]] = getelementptr i8, ptr null, i64 1
+; THR15-NEXT:    [[ARRAYIDX10_3:%.*]] = getelementptr i8, ptr null, i64 1
+; THR15-NEXT:    [[ARRAYIDX15_3:%.*]] = getelementptr i8, ptr null, i64 5
+; THR15-NEXT:    [[TMP47:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
 ; THR15-NEXT:    [[TMP48:%.*]] = load i8, ptr null, align 1
-; THR15-NEXT:    [[TMP49:%.*]] = zext <2 x i8> [[TMP32]] to <2 x i32>
-; THR15-NEXT:    [[TMP63:%.*]] = zext i8 [[TMP48]] to i32
-; THR15-NEXT:    [[TMP34:%.*]] = load <2 x i8>, ptr null, align 1
-; THR15-NEXT:    [[TMP61:%.*]] = zext <2 x i8> [[TMP34]] to <2 x i32>
-; THR15-NEXT:    [[TMP93:%.*]] = sub <2 x i32> [[TMP49]], [[TMP61]]
-; THR15-NEXT:    [[TMP37:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX3_3]], i64 -4, <2 x i1> <i1 true, i1 true>, i32 2)
-; THR15-NEXT:    [[TMP54:%.*]] = zext <2 x i8> [[TMP37]] to <2 x i32>
-; THR15-NEXT:    [[TMP55:%.*]] = load <2 x i8>, ptr [[ARRAYIDX5_3]], align 1
-; THR15-NEXT:    [[TMP80:%.*]] = zext <2 x i8> [[TMP55]] to <2 x i32>
-; THR15-NEXT:    [[TMP41:%.*]] = sub <2 x i32> [[TMP54]], [[TMP80]]
-; THR15-NEXT:    [[TMP42:%.*]] = shl <2 x i32> [[TMP41]], <i32 16, i32 16>
-; THR15-NEXT:    [[TMP43:%.*]] = add <2 x i32> [[TMP42]], [[TMP93]]
-; THR15-NEXT:    [[ARRAYIDX20_3:%.*]] = getelementptr i8, ptr null, i64 2
-; THR15-NEXT:    [[ARRAYIDX22_3:%.*]] = getelementptr i8, ptr null, i64 2
-; THR15-NEXT:    [[TMP44:%.*]] = load i8, ptr null, align 1
-; THR15-NEXT:    [[ARRAYIDX27_3:%.*]] = getelementptr i8, ptr null, i64 6
-; THR15-NEXT:    [[TMP45:%.*]] = load i8, ptr null, align 1
-; THR15-NEXT:    [[TMP62:%.*]] = load <2 x i8>, ptr [[ARRAYIDX20_3]], align 1
-; THR15-NEXT:    [[TMP83:%.*]] = zext <2 x i8> [[TMP62]] to <2 x i32>
-; THR15-NEXT:    [[TMP87:%.*]] = load <2 x i8>, ptr [[ARRAYIDX22_3]], align 1
-; THR15-NEXT:    [[TMP98:%.*]] = zext <2 x i8> [[TMP87]] to <2 x i32>
-; THR15-NEXT:    [[TMP65:%.*]] = sub <2 x i32> [[TMP83]], [[TMP98]]
-; THR15-NEXT:    [[TMP51:%.*]] = insertelement <2 x i8> poison, i8 [[TMP44]], i32 0
-; THR15-NEXT:    [[TMP52:%.*]] = insertelement <2 x i8> [[TMP51]], i8 [[TMP45]], i32 1
-; THR15-NEXT:    [[TMP99:%.*]] = zext <2 x i8> [[TMP52]] to <2 x i32>
-; THR15-NEXT:    [[TMP70:%.*]] = load <2 x i8>, ptr [[ARRAYIDX27_3]], align 1
-; THR15-NEXT:    [[TMP101:%.*]] = zext <2 x i8> [[TMP70]] to <2 x i32>
-; THR15-NEXT:    [[TMP56:%.*]] = sub <2 x i32> [[TMP99]], [[TMP101]]
-; THR15-NEXT:    [[TMP57:%.*]] = shl <2 x i32> [[TMP56]], <i32 16, i32 16>
-; THR15-NEXT:    [[TMP58:%.*]] = add <2 x i32> [[TMP57]], [[TMP65]]
-; THR15-NEXT:    [[TMP104:%.*]] = extractelement <2 x i32> [[TMP43]], i32 0
-; THR15-NEXT:    [[TMP102:%.*]] = extractelement <2 x i32> [[TMP43]], i32 1
-; THR15-NEXT:    [[ADD44_3:%.*]] = add i32 [[TMP102]], [[TMP104]]
-; THR15-NEXT:    [[SUB45_3:%.*]] = sub i32 [[TMP104]], [[TMP102]]
-; THR15-NEXT:    [[TMP107:%.*]] = extractelement <2 x i32> [[TMP58]], i32 0
-; THR15-NEXT:    [[TMP78:%.*]] = extractelement <2 x i32> [[TMP58]], i32 1
-; THR15-NEXT:    [[ADD46_3:%.*]] = add i32 [[TMP78]], [[TMP107]]
-; THR15-NEXT:    [[SUB47_3:%.*]] = sub i32 [[TMP107]], [[TMP78]]
-; THR15-NEXT:    [[ADD48_3:%.*]] = add i32 [[ADD46_3]], [[ADD44_3]]
-; THR15-NEXT:    [[SUB51_3:%.*]] = sub i32 [[ADD44_3]], [[ADD46_3]]
-; THR15-NEXT:    [[ADD55_3:%.*]] = add i32 [[SUB47_3]], [[SUB45_3]]
-; THR15-NEXT:    [[SUB59_3:%.*]] = sub i32 [[SUB45_3]], [[SUB47_3]]
+; THR15-NEXT:    [[TMP49:%.*]] = zext <2 x i8> [[TMP47]] to <2 x i32>
+; THR15-NEXT:    [[CONV_3:%.*]] = zext i8 [[TMP48]] to i32
+; THR15-NEXT:    [[TMP50:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; THR15-NEXT:    [[TMP51:%.*]] = zext <2 x i8> [[TMP50]] to <2 x i32>
+; THR15-NEXT:    [[TMP52:%.*]] = sub <2 x i32> [[TMP49]], [[TMP51]]
+; THR15-NEXT:    [[TMP53:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX3_3]], i64 -4, <2 x i1> <i1 true, i1 true>, i32 2)
+; THR15-NEXT:    [[TMP54:%.*]] = zext <2 x i8> [[TMP53]] to <2 x i32>
+; THR15-NEXT:    [[TMP55:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; THR15-NEXT:    [[TMP56:%.*]] = zext <2 x i8> [[TMP55]] to <2 x i32>
+; THR15-NEXT:    [[TMP57:%.*]] = sub <2 x i32> [[TMP54]], [[TMP56]]
+; THR15-NEXT:    [[TMP58:%.*]] = shl <2 x i32> [[TMP57]], <i32 16, i32 16>
+; THR15-NEXT:    [[TMP59:%.*]] = add <2 x i32> [[TMP58]], [[TMP52]]
+; THR15-NEXT:    [[TMP60:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX8_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; THR15-NEXT:    [[TMP61:%.*]] = zext <2 x i8> [[TMP60]] to <2 x i32>
+; THR15-NEXT:    [[TMP62:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX10_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; THR15-NEXT:    [[TMP63:%.*]] = zext <2 x i8> [[TMP62]] to <2 x i32>
+; THR15-NEXT:    [[TMP64:%.*]] = sub <2 x i32> [[TMP61]], [[TMP63]]
+; THR15-NEXT:    [[TMP65:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> zeroinitializer, i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; THR15-NEXT:    [[TMP66:%.*]] = zext <2 x i8> [[TMP65]] to <2 x i32>
+; THR15-NEXT:    [[TMP67:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX15_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; THR15-NEXT:    [[TMP68:%.*]] = zext <2 x i8> [[TMP67]] to <2 x i32>
+; THR15-NEXT:    [[TMP69:%.*]] = sub <2 x i32> [[TMP66]], [[TMP68]]
+; THR15-NEXT:    [[TMP70:%.*]] = shl <2 x i32> [[TMP69]], <i32 16, i32 16>
+; THR15-NEXT:    [[TMP71:%.*]] = add <2 x i32> [[TMP70]], [[TMP64]]
+; THR15-NEXT:    [[TMP72:%.*]] = add <2 x i32> [[TMP71]], [[TMP59]]
+; THR15-NEXT:    [[TMP73:%.*]] = sub <2 x i32> [[TMP59]], [[TMP71]]
+; THR15-NEXT:    [[TMP74:%.*]] = extractelement <2 x i32> [[TMP72]], i32 0
+; THR15-NEXT:    [[TMP75:%.*]] = extractelement <2 x i32> [[TMP72]], i32 1
+; THR15-NEXT:    [[ADD48_3:%.*]] = add i32 [[TMP75]], [[TMP74]]
+; THR15-NEXT:    [[TMP76:%.*]] = insertelement <2 x i32> [[TMP72]], i32 [[ADD44_2]], i32 1
+; THR15-NEXT:    [[TMP77:%.*]] = shufflevector <2 x i32> [[TMP72]], <2 x i32> poison, <2 x i32> <i32 1, i32 poison>
+; THR15-NEXT:    [[TMP78:%.*]] = insertelement <2 x i32> [[TMP77]], i32 [[ADD46_2]], i32 1
+; THR15-NEXT:    [[TMP79:%.*]] = sub <2 x i32> [[TMP76]], [[TMP78]]
+; THR15-NEXT:    [[TMP80:%.*]] = extractelement <2 x i32> [[TMP73]], i32 0
+; THR15-NEXT:    [[TMP81:%.*]] = extractelement <2 x i32> [[TMP73]], i32 1
+; THR15-NEXT:    [[ADD55_3:%.*]] = add i32 [[TMP81]], [[TMP80]]
+; THR15-NEXT:    [[TMP82:%.*]] = insertelement <2 x i32> [[TMP73]], i32 [[SUB45_2]], i32 1
+; THR15-NEXT:    [[TMP83:%.*]] = shufflevector <2 x i32> [[TMP73]], <2 x i32> poison, <2 x i32> <i32 1, i32 poison>
+; THR15-NEXT:    [[TMP84:%.*]] = insertelement <2 x i32> [[TMP83]], i32 [[SUB47_2]], i32 1
+; THR15-NEXT:    [[TMP85:%.*]] = sub <2 x i32> [[TMP82]], [[TMP84]]
 ; THR15-NEXT:    [[ADD94:%.*]] = add i32 [[ADD48_3]], [[ADD48_2]]
 ; THR15-NEXT:    [[SUB102:%.*]] = sub i32 [[ADD48_2]], [[ADD48_3]]
-; THR15-NEXT:    [[SHR_I:%.*]] = lshr i32 [[TMP63]], 15
+; THR15-NEXT:    [[SHR_I:%.*]] = lshr i32 [[CONV_3]], 15
 ; THR15-NEXT:    [[AND_I:%.*]] = and i32 [[SHR_I]], 65537
 ; THR15-NEXT:    [[MUL_I:%.*]] = mul i32 [[AND_I]], 65535
 ; THR15-NEXT:    [[SHR_I49:%.*]] = lshr i32 [[ADD46_2]], 15
@@ -497,113 +493,117 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; THR15-NEXT:    [[MUL_I51:%.*]] = mul i32 [[AND_I50]], 65535
 ; THR15-NEXT:    [[ADD94_1:%.*]] = add i32 [[ADD55_3]], [[ADD55_2]]
 ; THR15-NEXT:    [[SUB102_1:%.*]] = sub i32 [[ADD55_2]], [[ADD55_3]]
-; THR15-NEXT:    [[TMP105:%.*]] = extractelement <2 x i32> [[TMP66]], i32 1
-; THR15-NEXT:    [[SHR_I_1:%.*]] = lshr i32 [[TMP105]], 15
+; THR15-NEXT:    [[TMP86:%.*]] = extractelement <2 x i32> [[TMP20]], i32 1
+; THR15-NEXT:    [[SHR_I_1:%.*]] = lshr i32 [[TMP86]], 15
 ; THR15-NEXT:    [[AND_I_1:%.*]] = and i32 [[SHR_I_1]], 65537
 ; THR15-NEXT:    [[MUL_I_1:%.*]] = mul i32 [[AND_I_1]], 65535
-; THR15-NEXT:    [[TMP64:%.*]] = extractelement <2 x i32> [[TMP66]], i32 0
-; THR15-NEXT:    [[SHR_I49_2:%.*]] = lshr i32 [[TMP64]], 15
+; THR15-NEXT:    [[TMP87:%.*]] = extractelement <2 x i32> [[TMP20]], i32 0
+; THR15-NEXT:    [[SHR_I49_1:%.*]] = lshr i32 [[TMP87]], 15
+; THR15-NEXT:    [[AND_I50_1:%.*]] = and i32 [[SHR_I49_1]], 65537
+; THR15-NEXT:    [[MUL_I51_1:%.*]] = mul i32 [[AND_I50_1]], 65535
+; THR15-NEXT:    [[TMP88:%.*]] = extractelement <2 x i32> [[TMP79]], i32 0
+; THR15-NEXT:    [[TMP89:%.*]] = extractelement <2 x i32> [[TMP79]], i32 1
+; THR15-NEXT:    [[ADD94_2:%.*]] = add i32 [[TMP88]], [[TMP89]]
+; THR15-NEXT:    [[SUB102_2:%.*]] = sub i32 [[TMP89]], [[TMP88]]
+; THR15-NEXT:    [[SHR_I49_2:%.*]] = lshr i32 [[CONV_1]], 15
 ; THR15-NEXT:    [[AND_I50_2:%.*]] = and i32 [[SHR_I49_2]], 65537
 ; THR15-NEXT:    [[MUL_I51_2:%.*]] = mul i32 [[AND_I50_2]], 65535
-; THR15-NEXT:    [[ADD94_2:%.*]] = add i32 [[SUB51_3]], [[SUB51_2]]
-; THR15-NEXT:    [[SUB102_2:%.*]] = sub i32 [[SUB51_2]], [[SUB51_3]]
-; THR15-NEXT:    [[SHR_I49_3:%.*]] = lshr i32 [[CONV_1]], 15
+; THR15-NEXT:    [[TMP90:%.*]] = extractelement <2 x i32> [[TMP85]], i32 0
+; THR15-NEXT:    [[TMP91:%.*]] = extractelement <2 x i32> [[TMP85]], i32 1
+; THR15-NEXT:    [[ADD94_3:%.*]] = add i32 [[TMP90]], [[TMP91]]
+; THR15-NEXT:    [[SUB102_3:%.*]] = sub i32 [[TMP91]], [[TMP90]]
+; THR15-NEXT:    [[SHR_I49_3:%.*]] = lshr i32 [[CONV]], 15
 ; THR15-NEXT:    [[AND_I50_3:%.*]] = and i32 [[SHR_I49_3]], 65537
 ; THR15-NEXT:    [[MUL_I51_3:%.*]] = mul i32 [[AND_I50_3]], 65535
-; THR15-NEXT:    [[ADD94_3:%.*]] = add i32 [[SUB59_3]], [[SUB59_2]]
-; THR15-NEXT:    [[SUB102_3:%.*]] = sub i32 [[SUB59_2]], [[SUB59_3]]
-; THR15-NEXT:    [[SHR_I49_4:%.*]] = lshr i32 [[CONV]], 15
-; THR15-NEXT:    [[AND_I50_4:%.*]] = and i32 [[SHR_I49_4]], 65537
-; THR15-NEXT:    [[MUL_I51_4:%.*]] = mul i32 [[AND_I50_4]], 65535
-; THR15-NEXT:    [[TMP81:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8]], align 1
-; THR15-NEXT:    [[TMP74:%.*]] = zext <2 x i8> [[TMP81]] to <2 x i32>
-; THR15-NEXT:    [[TMP67:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[PIX2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; THR15-NEXT:    [[TMP108:%.*]] = zext <2 x i8> [[TMP67]] to <2 x i32>
-; THR15-NEXT:    [[TMP69:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; THR15-NEXT:    [[TMP109:%.*]] = zext <2 x i8> [[TMP69]] to <2 x i32>
-; THR15-NEXT:    [[TMP71:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; THR15-NEXT:    [[TMP111:%.*]] = zext <2 x i8> [[TMP71]] to <2 x i32>
-; THR15-NEXT:    [[TMP72:%.*]] = sub <2 x i32> [[TMP109]], [[TMP111]]
-; THR15-NEXT:    [[TMP73:%.*]] = shl <2 x i32> [[TMP72]], <i32 16, i32 16>
-; THR15-NEXT:    [[TMP75:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX22]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; THR15-NEXT:    [[TMP125:%.*]] = zext <2 x i8> [[TMP75]] to <2 x i32>
-; THR15-NEXT:    [[TMP82:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX25]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; THR15-NEXT:    [[TMP94:%.*]] = zext <2 x i8> [[TMP82]] to <2 x i32>
-; THR15-NEXT:    [[TMP79:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX27]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; THR15-NEXT:    [[TMP96:%.*]] = zext <2 x i8> [[TMP79]] to <2 x i32>
-; THR15-NEXT:    [[TMP84:%.*]] = sub <2 x i32> [[TMP94]], [[TMP96]]
-; THR15-NEXT:    [[TMP85:%.*]] = shl <2 x i32> [[TMP84]], <i32 16, i32 16>
-; THR15-NEXT:    [[TMP86:%.*]] = insertelement <2 x i32> [[TMP74]], i32 [[CONV33]], i32 1
-; THR15-NEXT:    [[TMP100:%.*]] = sub <2 x i32> [[TMP86]], [[TMP125]]
-; THR15-NEXT:    [[TMP88:%.*]] = add <2 x i32> [[TMP85]], [[TMP100]]
-; THR15-NEXT:    [[TMP92:%.*]] = insertelement <2 x i32> [[TMP74]], i32 [[CONV]], i32 0
-; THR15-NEXT:    [[TMP120:%.*]] = sub <2 x i32> [[TMP92]], [[TMP108]]
-; THR15-NEXT:    [[TMP95:%.*]] = add <2 x i32> [[TMP73]], [[TMP120]]
-; THR15-NEXT:    [[TMP97:%.*]] = shufflevector <2 x i32> [[TMP88]], <2 x i32> [[TMP95]], <2 x i32> <i32 0, i32 2>
-; THR15-NEXT:    [[TMP77:%.*]] = add <2 x i32> [[TMP88]], [[TMP95]]
-; THR15-NEXT:    [[TMP91:%.*]] = sub <2 x i32> [[TMP95]], [[TMP88]]
-; THR15-NEXT:    [[TMP89:%.*]] = extractelement <2 x i32> [[TMP77]], i32 0
-; THR15-NEXT:    [[TMP90:%.*]] = extractelement <2 x i32> [[TMP77]], i32 1
-; THR15-NEXT:    [[ADD48:%.*]] = add i32 [[TMP90]], [[TMP89]]
-; THR15-NEXT:    [[SUB51:%.*]] = sub i32 [[TMP89]], [[TMP90]]
-; THR15-NEXT:    [[TMP110:%.*]] = extractelement <2 x i32> [[TMP91]], i32 0
-; THR15-NEXT:    [[SUB47:%.*]] = extractelement <2 x i32> [[TMP91]], i32 1
-; THR15-NEXT:    [[ADD55:%.*]] = add i32 [[SUB47]], [[TMP110]]
-; THR15-NEXT:    [[SHR_I59:%.*]] = lshr i32 [[TMP90]], 15
+; THR15-NEXT:    [[TMP92:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8]], align 1
+; THR15-NEXT:    [[TMP93:%.*]] = zext <2 x i8> [[TMP92]] to <2 x i32>
+; THR15-NEXT:    [[TMP94:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[PIX2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; THR15-NEXT:    [[TMP95:%.*]] = zext <2 x i8> [[TMP94]] to <2 x i32>
+; THR15-NEXT:    [[TMP96:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; THR15-NEXT:    [[TMP97:%.*]] = zext <2 x i8> [[TMP96]] to <2 x i32>
+; THR15-NEXT:    [[TMP98:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; THR15-NEXT:    [[TMP99:%.*]] = zext <2 x i8> [[TMP98]] to <2 x i32>
+; THR15-NEXT:    [[TMP100:%.*]] = sub <2 x i32> [[TMP97]], [[TMP99]]
+; THR15-NEXT:    [[TMP101:%.*]] = shl <2 x i32> [[TMP100]], <i32 16, i32 16>
+; THR15-NEXT:    [[TMP102:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX10]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; THR15-NEXT:    [[TMP103:%.*]] = zext <2 x i8> [[TMP102]] to <2 x i32>
+; THR15-NEXT:    [[TMP104:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX13]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; THR15-NEXT:    [[TMP105:%.*]] = zext <2 x i8> [[TMP104]] to <2 x i32>
+; THR15-NEXT:    [[TMP106:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX15]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; THR15-NEXT:    [[TMP107:%.*]] = zext <2 x i8> [[TMP106]] to <2 x i32>
+; THR15-NEXT:    [[TMP108:%.*]] = sub <2 x i32> [[TMP105]], [[TMP107]]
+; THR15-NEXT:    [[TMP109:%.*]] = shl <2 x i32> [[TMP108]], <i32 16, i32 16>
+; THR15-NEXT:    [[TMP110:%.*]] = insertelement <2 x i32> [[TMP93]], i32 [[CONV33]], i32 1
+; THR15-NEXT:    [[TMP111:%.*]] = sub <2 x i32> [[TMP110]], [[TMP103]]
+; THR15-NEXT:    [[TMP112:%.*]] = add <2 x i32> [[TMP109]], [[TMP111]]
+; THR15-NEXT:    [[TMP113:%.*]] = insertelement <2 x i32> [[TMP93]], i32 [[CONV]], i32 0
+; THR15-NEXT:    [[TMP114:%.*]] = sub <2 x i32> [[TMP113]], [[TMP95]]
+; THR15-NEXT:    [[TMP115:%.*]] = add <2 x i32> [[TMP101]], [[TMP114]]
+; THR15-NEXT:    [[TMP116:%.*]] = shufflevector <2 x i32> [[TMP112]], <2 x i32> [[TMP115]], <2 x i32> <i32 0, i32 2>
+; THR15-NEXT:    [[TMP117:%.*]] = add <2 x i32> [[TMP112]], [[TMP115]]
+; THR15-NEXT:    [[TMP118:%.*]] = sub <2 x i32> [[TMP115]], [[TMP112]]
+; THR15-NEXT:    [[TMP119:%.*]] = extractelement <2 x i32> [[TMP117]], i32 0
+; THR15-NEXT:    [[TMP120:%.*]] = extractelement <2 x i32> [[TMP117]], i32 1
+; THR15-NEXT:    [[ADD48:%.*]] = add i32 [[TMP120]], [[TMP119]]
+; THR15-NEXT:    [[SUB51:%.*]] = sub i32 [[TMP119]], [[TMP120]]
+; THR15-NEXT:    [[TMP121:%.*]] = extractelement <2 x i32> [[TMP118]], i32 0
+; THR15-NEXT:    [[TMP122:%.*]] = extractelement <2 x i32> [[TMP118]], i32 1
+; THR15-NEXT:    [[ADD55:%.*]] = add i32 [[TMP122]], [[TMP121]]
+; THR15-NEXT:    [[SHR_I59:%.*]] = lshr i32 [[TMP120]], 15
 ; THR15-NEXT:    [[AND_I60:%.*]] = and i32 [[SHR_I59]], 65537
 ; THR15-NEXT:    [[MUL_I61:%.*]] = mul i32 [[AND_I60]], 65535
-; THR15-NEXT:    [[SHR_I59_1:%.*]] = lshr i32 [[SUB47]], 15
+; THR15-NEXT:    [[SHR_I59_1:%.*]] = lshr i32 [[TMP122]], 15
 ; THR15-NEXT:    [[AND_I60_1:%.*]] = and i32 [[SHR_I59_1]], 65537
 ; THR15-NEXT:    [[MUL_I61_1:%.*]] = mul i32 [[AND_I60_1]], 65535
-; THR15-NEXT:    [[TMP112:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8_1]], align 1
-; THR15-NEXT:    [[TMP130:%.*]] = zext <2 x i8> [[TMP112]] to <2 x i32>
-; THR15-NEXT:    [[TMP129:%.*]] = load <2 x i8>, ptr [[ADD_PTR644]], align 1
-; THR15-NEXT:    [[TMP115:%.*]] = zext <2 x i8> [[TMP129]] to <2 x i32>
-; THR15-NEXT:    [[TMP116:%.*]] = load <2 x i8>, ptr [[ARRAYIDX3_1]], align 1
-; THR15-NEXT:    [[TMP117:%.*]] = zext <2 x i8> [[TMP116]] to <2 x i32>
-; THR15-NEXT:    [[TMP131:%.*]] = load <2 x i8>, ptr [[ARRAYIDX5_1]], align 1
-; THR15-NEXT:    [[TMP119:%.*]] = zext <2 x i8> [[TMP131]] to <2 x i32>
-; THR15-NEXT:    [[TMP113:%.*]] = sub <2 x i32> [[TMP117]], [[TMP119]]
-; THR15-NEXT:    [[TMP114:%.*]] = shl <2 x i32> [[TMP113]], <i32 16, i32 16>
-; THR15-NEXT:    [[TMP103:%.*]] = shufflevector <2 x i32> [[TMP130]], <2 x i32> poison, <2 x i32> <i32 poison, i32 0>
-; THR15-NEXT:    [[TMP126:%.*]] = insertelement <2 x i32> [[TMP103]], i32 [[CONV_1]], i32 0
-; THR15-NEXT:    [[TMP134:%.*]] = sub <2 x i32> [[TMP126]], [[TMP115]]
-; THR15-NEXT:    [[TMP121:%.*]] = add <2 x i32> [[TMP114]], [[TMP134]]
-; THR15-NEXT:    [[TMP145:%.*]] = shufflevector <2 x i8> [[TMP7]], <2 x i8> poison, <2 x i32> <i32 1, i32 poison>
-; THR15-NEXT:    [[TMP127:%.*]] = insertelement <2 x i8> [[TMP145]], i8 [[TMP3]], i32 1
+; THR15-NEXT:    [[TMP123:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8_1]], align 1
+; THR15-NEXT:    [[TMP124:%.*]] = zext <2 x i8> [[TMP123]] to <2 x i32>
+; THR15-NEXT:    [[TMP125:%.*]] = load <2 x i8>, ptr [[ADD_PTR644]], align 1
+; THR15-NEXT:    [[TMP126:%.*]] = zext <2 x i8> [[TMP125]] to <2 x i32>
+; THR15-NEXT:    [[TMP127:%.*]] = load <2 x i8>, ptr [[ARRAYIDX3_1]], align 1
 ; THR15-NEXT:    [[TMP128:%.*]] = zext <2 x i8> [[TMP127]] to <2 x i32>
-; THR15-NEXT:    [[TMP146:%.*]] = insertelement <2 x i32> [[TMP130]], i32 [[TMP9]], i32 0
-; THR15-NEXT:    [[TMP106:%.*]] = sub <2 x i32> [[TMP146]], [[TMP128]]
-; THR15-NEXT:    [[TMP118:%.*]] = extractelement <2 x i32> [[TMP106]], i32 0
-; THR15-NEXT:    [[SHL30_1:%.*]] = shl i32 [[TMP118]], 16
-; THR15-NEXT:    [[TMP132:%.*]] = extractelement <2 x i32> [[TMP106]], i32 1
-; THR15-NEXT:    [[ADD31_1:%.*]] = add i32 [[SHL30_1]], [[TMP132]]
-; THR15-NEXT:    [[TMP133:%.*]] = extractelement <2 x i32> [[TMP121]], i32 0
-; THR15-NEXT:    [[TMP147:%.*]] = extractelement <2 x i32> [[TMP121]], i32 1
-; THR15-NEXT:    [[SUB45_1:%.*]] = sub i32 [[TMP133]], [[TMP147]]
-; THR15-NEXT:    [[TMP135:%.*]] = shufflevector <2 x i32> [[TMP121]], <2 x i32> poison, <2 x i32> <i32 1, i32 poison>
-; THR15-NEXT:    [[TMP136:%.*]] = insertelement <2 x i32> [[TMP135]], i32 [[ADD43_1]], i32 1
-; THR15-NEXT:    [[TMP137:%.*]] = insertelement <2 x i32> [[TMP121]], i32 [[ADD31_1]], i32 1
-; THR15-NEXT:    [[TMP138:%.*]] = add <2 x i32> [[TMP136]], [[TMP137]]
+; THR15-NEXT:    [[TMP129:%.*]] = load <2 x i8>, ptr [[ARRAYIDX5_1]], align 1
+; THR15-NEXT:    [[TMP130:%.*]] = zext <2 x i8> [[TMP129]] to <2 x i32>
+; THR15-NEXT:    [[TMP131:%.*]] = sub <2 x i32> [[TMP128]], [[TMP130]]
+; THR15-NEXT:    [[TMP132:%.*]] = shl <2 x i32> [[TMP131]], <i32 16, i32 16>
+; THR15-NEXT:    [[TMP133:%.*]] = shufflevector <2 x i32> [[TMP124]], <2 x i32> poison, <2 x i32> <i32 poison, i32 0>
+; THR15-NEXT:    [[TMP134:%.*]] = insertelement <2 x i32> [[TMP133]], i32 [[CONV_1]], i32 0
+; THR15-NEXT:    [[TMP135:%.*]] = sub <2 x i32> [[TMP134]], [[TMP126]]
+; THR15-NEXT:    [[TMP136:%.*]] = add <2 x i32> [[TMP132]], [[TMP135]]
+; THR15-NEXT:    [[TMP137:%.*]] = shufflevector <2 x i8> [[TMP7]], <2 x i8> poison, <2 x i32> <i32 1, i32 poison>
+; THR15-NEXT:    [[TMP138:%.*]] = insertelement <2 x i8> [[TMP137]], i8 [[TMP3]], i32 1
+; THR15-NEXT:    [[TMP139:%.*]] = zext <2 x i8> [[TMP138]] to <2 x i32>
+; THR15-NEXT:    [[TMP140:%.*]] = insertelement <2 x i32> [[TMP124]], i32 [[TMP9]], i32 0
+; THR15-NEXT:    [[TMP141:%.*]] = sub <2 x i32> [[TMP140]], [[TMP139]]
+; THR15-NEXT:    [[TMP142:%.*]] = extractelement <2 x i32> [[TMP141]], i32 0
+; THR15-NEXT:    [[SHL30_1:%.*]] = shl i32 [[TMP142]], 16
+; THR15-NEXT:    [[TMP143:%.*]] = extractelement <2 x i32> [[TMP141]], i32 1
+; THR15-NEXT:    [[ADD31_1:%.*]] = add i32 [[SHL30_1]], [[TMP143]]
+; THR15-NEXT:    [[TMP144:%.*]] = extractelement <2 x i32> [[TMP136]], i32 0
+; THR15-NEXT:    [[TMP145:%.*]] = extractelement <2 x i32> [[TMP136]], i32 1
+; THR15-NEXT:    [[SUB45_1:%.*]] = sub i32 [[TMP144]], [[TMP145]]
+; THR15-NEXT:    [[TMP146:%.*]] = shufflevector <2 x i32> [[TMP136]], <2 x i32> poison, <2 x i32> <i32 1, i32 poison>
+; THR15-NEXT:    [[TMP147:%.*]] = insertelement <2 x i32> [[TMP146]], i32 [[ADD43_1]], i32 1
+; THR15-NEXT:    [[TMP148:%.*]] = insertelement <2 x i32> [[TMP136]], i32 [[ADD31_1]], i32 1
+; THR15-NEXT:    [[TMP149:%.*]] = add <2 x i32> [[TMP147]], [[TMP148]]
 ; THR15-NEXT:    [[SUB47_1:%.*]] = sub i32 [[ADD31_1]], [[ADD43_1]]
-; THR15-NEXT:    [[TMP139:%.*]] = extractelement <2 x i32> [[TMP138]], i32 0
-; THR15-NEXT:    [[TMP140:%.*]] = extractelement <2 x i32> [[TMP138]], i32 1
-; THR15-NEXT:    [[ADD48_1:%.*]] = add i32 [[TMP140]], [[TMP139]]
-; THR15-NEXT:    [[SUB51_1:%.*]] = sub i32 [[TMP139]], [[TMP140]]
+; THR15-NEXT:    [[TMP150:%.*]] = extractelement <2 x i32> [[TMP149]], i32 0
+; THR15-NEXT:    [[TMP151:%.*]] = extractelement <2 x i32> [[TMP149]], i32 1
+; THR15-NEXT:    [[ADD48_1:%.*]] = add i32 [[TMP151]], [[TMP150]]
+; THR15-NEXT:    [[SUB51_1:%.*]] = sub i32 [[TMP150]], [[TMP151]]
 ; THR15-NEXT:    [[ADD55_1:%.*]] = add i32 [[SUB47_1]], [[SUB45_1]]
-; THR15-NEXT:    [[TMP141:%.*]] = shufflevector <2 x i32> [[TMP91]], <2 x i32> poison, <2 x i32> <i32 poison, i32 0>
-; THR15-NEXT:    [[TMP142:%.*]] = insertelement <2 x i32> [[TMP141]], i32 [[SUB45_1]], i32 0
-; THR15-NEXT:    [[TMP143:%.*]] = insertelement <2 x i32> [[TMP91]], i32 [[SUB47_1]], i32 0
-; THR15-NEXT:    [[TMP144:%.*]] = sub <2 x i32> [[TMP142]], [[TMP143]]
-; THR15-NEXT:    [[SHR_I54_1:%.*]] = lshr i32 [[TMP140]], 15
+; THR15-NEXT:    [[TMP152:%.*]] = shufflevector <2 x i32> [[TMP118]], <2 x i32> poison, <2 x i32> <i32 poison, i32 0>
+; THR15-NEXT:    [[TMP153:%.*]] = insertelement <2 x i32> [[TMP152]], i32 [[SUB45_1]], i32 0
+; THR15-NEXT:    [[TMP154:%.*]] = insertelement <2 x i32> [[TMP118]], i32 [[SUB47_1]], i32 0
+; THR15-NEXT:    [[TMP155:%.*]] = sub <2 x i32> [[TMP153]], [[TMP154]]
+; THR15-NEXT:    [[SHR_I54:%.*]] = lshr i32 [[TMP151]], 15
+; THR15-NEXT:    [[AND_I55:%.*]] = and i32 [[SHR_I54]], 65537
+; THR15-NEXT:    [[MUL_I56:%.*]] = mul i32 [[AND_I55]], 65535
+; THR15-NEXT:    [[SHR_I54_1:%.*]] = lshr i32 [[SUB47_1]], 15
 ; THR15-NEXT:    [[AND_I55_1:%.*]] = and i32 [[SHR_I54_1]], 65537
 ; THR15-NEXT:    [[MUL_I56_1:%.*]] = mul i32 [[AND_I55_1]], 65535
-; THR15-NEXT:    [[SHR_I54_2:%.*]] = lshr i32 [[SUB47_1]], 15
-; THR15-NEXT:    [[AND_I55_2:%.*]] = and i32 [[SHR_I54_2]], 65537
-; THR15-NEXT:    [[MUL_I56_2:%.*]] = mul i32 [[AND_I55_2]], 65535
-; THR15-NEXT:    [[TMP122:%.*]] = lshr <2 x i32> [[TMP130]], <i32 15, i32 15>
-; THR15-NEXT:    [[TMP123:%.*]] = and <2 x i32> [[TMP122]], <i32 65537, i32 65537>
-; THR15-NEXT:    [[TMP124:%.*]] = mul <2 x i32> [[TMP123]], <i32 65535, i32 65535>
+; THR15-NEXT:    [[TMP156:%.*]] = lshr <2 x i32> [[TMP124]], <i32 15, i32 15>
+; THR15-NEXT:    [[TMP157:%.*]] = and <2 x i32> [[TMP156]], <i32 65537, i32 65537>
+; THR15-NEXT:    [[TMP158:%.*]] = mul <2 x i32> [[TMP157]], <i32 65535, i32 65535>
 ; THR15-NEXT:    [[ADD78:%.*]] = add i32 [[ADD48_1]], [[ADD48]]
 ; THR15-NEXT:    [[SUB86:%.*]] = sub i32 [[ADD48]], [[ADD48_1]]
 ; THR15-NEXT:    [[ADD103:%.*]] = add i32 [[ADD94]], [[ADD78]]
@@ -611,13 +611,13 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; THR15-NEXT:    [[ADD105:%.*]] = add i32 [[SUB102]], [[SUB86]]
 ; THR15-NEXT:    [[SUB106:%.*]] = sub i32 [[SUB86]], [[SUB102]]
 ; THR15-NEXT:    [[ADD_I:%.*]] = add i32 [[MUL_I]], [[ADD103]]
-; THR15-NEXT:    [[XOR_I:%.*]] = xor i32 [[ADD_I]], [[TMP63]]
+; THR15-NEXT:    [[XOR_I:%.*]] = xor i32 [[ADD_I]], [[CONV_3]]
 ; THR15-NEXT:    [[ADD_I52:%.*]] = add i32 [[MUL_I51]], [[ADD105]]
 ; THR15-NEXT:    [[XOR_I53:%.*]] = xor i32 [[ADD_I52]], [[ADD46_2]]
-; THR15-NEXT:    [[ADD_I57:%.*]] = add i32 [[MUL_I56_1]], [[SUB104]]
-; THR15-NEXT:    [[XOR_I58:%.*]] = xor i32 [[ADD_I57]], [[TMP140]]
+; THR15-NEXT:    [[ADD_I57:%.*]] = add i32 [[MUL_I56]], [[SUB104]]
+; THR15-NEXT:    [[XOR_I58:%.*]] = xor i32 [[ADD_I57]], [[TMP151]]
 ; THR15-NEXT:    [[ADD_I62:%.*]] = add i32 [[MUL_I61]], [[SUB106]]
-; THR15-NEXT:    [[XOR_I63:%.*]] = xor i32 [[ADD_I62]], [[TMP90]]
+; THR15-NEXT:    [[XOR_I63:%.*]] = xor i32 [[ADD_I62]], [[TMP120]]
 ; THR15-NEXT:    [[ADD110:%.*]] = add i32 [[XOR_I53]], [[XOR_I]]
 ; THR15-NEXT:    [[ADD112:%.*]] = add i32 [[ADD110]], [[XOR_I58]]
 ; THR15-NEXT:    [[ADD113:%.*]] = add i32 [[ADD112]], [[XOR_I63]]
@@ -628,73 +628,73 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; THR15-NEXT:    [[ADD105_1:%.*]] = add i32 [[SUB102_1]], [[SUB86_1]]
 ; THR15-NEXT:    [[SUB106_1:%.*]] = sub i32 [[SUB86_1]], [[SUB102_1]]
 ; THR15-NEXT:    [[ADD_I_1:%.*]] = add i32 [[MUL_I_1]], [[ADD103_1]]
-; THR15-NEXT:    [[XOR_I_1:%.*]] = xor i32 [[ADD_I_1]], [[TMP105]]
-; THR15-NEXT:    [[ADD_I52_1:%.*]] = add i32 [[MUL_I51_2]], [[ADD105_1]]
-; THR15-NEXT:    [[XOR_I53_1:%.*]] = xor i32 [[ADD_I52_1]], [[TMP64]]
-; THR15-NEXT:    [[ADD_I57_1:%.*]] = add i32 [[MUL_I56_2]], [[SUB104_1]]
+; THR15-NEXT:    [[XOR_I_1:%.*]] = xor i32 [[ADD_I_1]], [[TMP86]]
+; THR15-NEXT:    [[ADD_I52_1:%.*]] = add i32 [[MUL_I51_1]], [[ADD105_1]]
+; THR15-NEXT:    [[XOR_I53_1:%.*]] = xor i32 [[ADD_I52_1]], [[TMP87]]
+; THR15-NEXT:    [[ADD_I57_1:%.*]] = add i32 [[MUL_I56_1]], [[SUB104_1]]
 ; THR15-NEXT:    [[XOR_I58_1:%.*]] = xor i32 [[ADD_I57_1]], [[SUB47_1]]
 ; THR15-NEXT:    [[ADD_I62_1:%.*]] = add i32 [[MUL_I61_1]], [[SUB106_1]]
-; THR15-NEXT:    [[XOR_I63_1:%.*]] = xor i32 [[ADD_I62_1]], [[SUB47]]
+; THR15-NEXT:    [[XOR_I63_1:%.*]] = xor i32 [[ADD_I62_1]], [[TMP122]]
 ; THR15-NEXT:    [[ADD108_1:%.*]] = add i32 [[XOR_I53_1]], [[ADD113]]
 ; THR15-NEXT:    [[ADD110_1:%.*]] = add i32 [[ADD108_1]], [[XOR_I_1]]
 ; THR15-NEXT:    [[ADD112_1:%.*]] = add i32 [[ADD110_1]], [[XOR_I58_1]]
 ; THR15-NEXT:    [[ADD113_1:%.*]] = add i32 [[ADD112_1]], [[XOR_I63_1]]
 ; THR15-NEXT:    [[ADD78_2:%.*]] = add i32 [[SUB51_1]], [[SUB51]]
 ; THR15-NEXT:    [[SUB86_2:%.*]] = sub i32 [[SUB51]], [[SUB51_1]]
-; THR15-NEXT:    [[TMP152:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_2]], i32 0
-; THR15-NEXT:    [[TMP153:%.*]] = shufflevector <2 x i32> [[TMP152]], <2 x i32> poison, <2 x i32> zeroinitializer
-; THR15-NEXT:    [[TMP154:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_2]], i32 0
-; THR15-NEXT:    [[TMP155:%.*]] = shufflevector <2 x i32> [[TMP154]], <2 x i32> poison, <2 x i32> zeroinitializer
-; THR15-NEXT:    [[TMP156:%.*]] = add <2 x i32> [[TMP153]], [[TMP155]]
-; THR15-NEXT:    [[TMP157:%.*]] = sub <2 x i32> [[TMP153]], [[TMP155]]
-; THR15-NEXT:    [[TMP158:%.*]] = shufflevector <2 x i32> [[TMP156]], <2 x i32> [[TMP157]], <2 x i32> <i32 0, i32 3>
+; THR15-NEXT:    [[TMP159:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_2]], i32 0
+; THR15-NEXT:    [[TMP160:%.*]] = shufflevector <2 x i32> [[TMP159]], <2 x i32> poison, <2 x i32> zeroinitializer
+; THR15-NEXT:    [[TMP161:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_2]], i32 0
+; THR15-NEXT:    [[TMP162:%.*]] = shufflevector <2 x i32> [[TMP161]], <2 x i32> poison, <2 x i32> zeroinitializer
+; THR15-NEXT:    [[TMP163:%.*]] = add <2 x i32> [[TMP160]], [[TMP162]]
+; THR15-NEXT:    [[TMP164:%.*]] = sub <2 x i32> [[TMP160]], [[TMP162]]
+; THR15-NEXT:    [[TMP165:%.*]] = shufflevector <2 x i32> [[TMP163]], <2 x i32> [[TMP164]], <2 x i32> <i32 0, i32 3>
 ; THR15-NEXT:    [[ADD105_2:%.*]] = add i32 [[SUB102_2]], [[SUB86_2]]
 ; THR15-NEXT:    [[SUB106_2:%.*]] = sub i32 [[SUB86_2]], [[SUB102_2]]
-; THR15-NEXT:    [[ADD_I52_2:%.*]] = add i32 [[MUL_I51_3]], [[ADD105_2]]
+; THR15-NEXT:    [[ADD_I52_2:%.*]] = add i32 [[MUL_I51_2]], [[ADD105_2]]
 ; THR15-NEXT:    [[XOR_I53_2:%.*]] = xor i32 [[ADD_I52_2]], [[CONV_1]]
-; THR15-NEXT:    [[TMP177:%.*]] = add <2 x i32> [[TMP124]], [[TMP158]]
-; THR15-NEXT:    [[TMP160:%.*]] = xor <2 x i32> [[TMP177]], [[TMP130]]
-; THR15-NEXT:    [[SHR_I59_2:%.*]] = lshr i32 [[TMP89]], 15
+; THR15-NEXT:    [[TMP166:%.*]] = add <2 x i32> [[TMP158]], [[TMP165]]
+; THR15-NEXT:    [[TMP167:%.*]] = xor <2 x i32> [[TMP166]], [[TMP124]]
+; THR15-NEXT:    [[SHR_I59_2:%.*]] = lshr i32 [[TMP119]], 15
 ; THR15-NEXT:    [[AND_I60_2:%.*]] = and i32 [[SHR_I59_2]], 65537
 ; THR15-NEXT:    [[MUL_I61_2:%.*]] = mul i32 [[AND_I60_2]], 65535
 ; THR15-NEXT:    [[ADD_I62_2:%.*]] = add i32 [[MUL_I61_2]], [[SUB106_2]]
-; THR15-NEXT:    [[XOR_I63_2:%.*]] = xor i32 [[ADD_I62_2]], [[TMP89]]
+; THR15-NEXT:    [[XOR_I63_2:%.*]] = xor i32 [[ADD_I62_2]], [[TMP119]]
 ; THR15-NEXT:    [[ADD108_2:%.*]] = add i32 [[XOR_I53_2]], [[ADD113_1]]
-; THR15-NEXT:    [[TMP161:%.*]] = extractelement <2 x i32> [[TMP160]], i32 0
-; THR15-NEXT:    [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[TMP161]]
-; THR15-NEXT:    [[TMP162:%.*]] = extractelement <2 x i32> [[TMP160]], i32 1
-; THR15-NEXT:    [[ADD112_2:%.*]] = add i32 [[ADD110_2]], [[TMP162]]
+; THR15-NEXT:    [[TMP168:%.*]] = extractelement <2 x i32> [[TMP167]], i32 0
+; THR15-NEXT:    [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[TMP168]]
+; THR15-NEXT:    [[TMP169:%.*]] = extractelement <2 x i32> [[TMP167]], i32 1
+; THR15-NEXT:    [[ADD112_2:%.*]] = add i32 [[ADD110_2]], [[TMP169]]
 ; THR15-NEXT:    [[ADD113_2:%.*]] = add i32 [[ADD112_2]], [[XOR_I63_2]]
-; THR15-NEXT:    [[TMP159:%.*]] = extractelement <2 x i32> [[TMP144]], i32 0
-; THR15-NEXT:    [[TMP176:%.*]] = extractelement <2 x i32> [[TMP144]], i32 1
-; THR15-NEXT:    [[ADD78_3:%.*]] = add i32 [[TMP159]], [[TMP176]]
-; THR15-NEXT:    [[SUB86_3:%.*]] = sub i32 [[TMP176]], [[TMP159]]
-; THR15-NEXT:    [[TMP163:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_3]], i32 0
-; THR15-NEXT:    [[TMP164:%.*]] = shufflevector <2 x i32> [[TMP163]], <2 x i32> poison, <2 x i32> zeroinitializer
-; THR15-NEXT:    [[TMP165:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_3]], i32 0
-; THR15-NEXT:    [[TMP166:%.*]] = shufflevector <2 x i32> [[TMP165]], <2 x i32> poison, <2 x i32> zeroinitializer
-; THR15-NEXT:    [[TMP167:%.*]] = add <2 x i32> [[TMP164]], [[TMP166]]
-; THR15-NEXT:    [[TMP168:%.*]] = sub <2 x i32> [[TMP164]], [[TMP166]]
-; THR15-NEXT:    [[TMP169:%.*]] = shufflevector <2 x i32> [[TMP167]], <2 x i32> [[TMP168]], <2 x i32> <i32 0, i32 3>
+; THR15-NEXT:    [[TMP170:%.*]] = extractelement <2 x i32> [[TMP155]], i32 0
+; THR15-NEXT:    [[TMP171:%.*]] = extractelement <2 x i32> [[TMP155]], i32 1
+; THR15-NEXT:    [[ADD78_3:%.*]] = add i32 [[TMP170]], [[TMP171]]
+; THR15-NEXT:    [[SUB86_3:%.*]] = sub i32 [[TMP171]], [[TMP170]]
+; THR15-NEXT:    [[TMP172:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_3]], i32 0
+; THR15-NEXT:    [[TMP173:%.*]] = shufflevector <2 x i32> [[TMP172]], <2 x i32> poison, <2 x i32> zeroinitializer
+; THR15-NEXT:    [[TMP174:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_3]], i32 0
+; THR15-NEXT:    [[TMP175:%.*]] = shufflevector <2 x i32> [[TMP174]], <2 x i32> poison, <2 x i32> zeroinitializer
+; THR15-NEXT:    [[TMP176:%.*]] = add <2 x i32> [[TMP173]], [[TMP175]]
+; THR15-NEXT:    [[TMP177:%.*]] = sub <2 x i32> [[TMP173]], [[TMP175]]
+; THR15-NEXT:    [[TMP178:%.*]] = shufflevector <2 x i32> [[TMP176]], <2 x i32> [[TMP177]], <2 x i32> <i32 0, i32 3>
 ; THR15-NEXT:    [[ADD105_3:%.*]] = add i32 [[SUB102_3]], [[SUB86_3]]
 ; THR15-NEXT:    [[SUB106_3:%.*]] = sub i32 [[SUB86_3]], [[SUB102_3]]
-; THR15-NEXT:    [[ADD_I52_3:%.*]] = add i32 [[MUL_I51_4]], [[ADD105_3]]
+; THR15-NEXT:    [[ADD_I52_3:%.*]] = add i32 [[MUL_I51_3]], [[ADD105_3]]
 ; THR15-NEXT:    [[XOR_I53_3:%.*]] = xor i32 [[ADD_I52_3]], [[CONV]]
-; THR15-NEXT:    [[TMP170:%.*]] = lshr <2 x i32> [[TMP74]], <i32 15, i32 15>
-; THR15-NEXT:    [[TMP171:%.*]] = and <2 x i32> [[TMP170]], <i32 65537, i32 65537>
-; THR15-NEXT:    [[TMP172:%.*]] = mul <2 x i32> [[TMP171]], <i32 65535, i32 65535>
-; THR15-NEXT:    [[TMP173:%.*]] = add <2 x i32> [[TMP172]], [[TMP169]]
-; THR15-NEXT:    [[TMP174:%.*]] = xor <2 x i32> [[TMP173]], [[TMP74]]
+; THR15-NEXT:    [[TMP179:%.*]] = lshr <2 x i32> [[TMP93]], <i32 15, i32 15>
+; THR15-NEXT:    [[TMP180:%.*]] = and <2 x i32> [[TMP179]], <i32 65537, i32 65537>
+; THR15-NEXT:    [[TMP181:%.*]] = mul <2 x i32> [[TMP180]], <i32 65535, i32 65535>
+; THR15-NEXT:    [[TMP182:%.*]] = add <2 x i32> [[TMP181]], [[TMP178]]
+; THR15-NEXT:    [[TMP183:%.*]] = xor <2 x i32> [[TMP182]], [[TMP93]]
 ; THR15-NEXT:    [[SHR_I59_3:%.*]] = lshr i32 [[CONV33]], 15
 ; THR15-NEXT:    [[AND_I60_3:%.*]] = and i32 [[SHR_I59_3]], 65537
 ; THR15-NEXT:    [[MUL_I61_3:%.*]] = mul i32 [[AND_I60_3]], 65535
 ; THR15-NEXT:    [[ADD_I62_3:%.*]] = add i32 [[MUL_I61_3]], [[SUB106_3]]
 ; THR15-NEXT:    [[XOR_I63_3:%.*]] = xor i32 [[ADD_I62_3]], [[CONV33]]
 ; THR15-NEXT:    [[ADD108_3:%.*]] = add i32 [[XOR_I53_3]], [[ADD113_2]]
-; THR15-NEXT:    [[TMP175:%.*]] = extractelement <2 x i32> [[TMP174]], i32 0
-; THR15-NEXT:    [[ADD110_3:%.*]] = add i32 [[ADD108_3]], [[TMP175]]
-; THR15-NEXT:    [[TMP178:%.*]] = extractelement <2 x i32> [[TMP174]], i32 1
-; THR15-NEXT:    [[ADD112_3:%.*]] = add i32 [[ADD110_3]], [[TMP178]]
+; THR15-NEXT:    [[TMP184:%.*]] = extractelement <2 x i32> [[TMP183]], i32 0
+; THR15-NEXT:    [[ADD110_3:%.*]] = add i32 [[ADD108_3]], [[TMP184]]
+; THR15-NEXT:    [[TMP185:%.*]] = extractelement <2 x i32> [[TMP183]], i32 1
+; THR15-NEXT:    [[ADD112_3:%.*]] = add i32 [[ADD110_3]], [[TMP185]]
 ; THR15-NEXT:    [[ADD113_3:%.*]] = add i32 [[ADD112_3]], [[XOR_I63_3]]
 ; THR15-NEXT:    ret i32 [[ADD113_3]]
 ;

diff  --git a/llvm/test/Transforms/SLPVectorizer/RISCV/remarks-insert-into-small-vector.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/remarks-insert-into-small-vector.ll
index bb806be15c71ca..09612444afd205 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/remarks-insert-into-small-vector.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/remarks-insert-into-small-vector.ll
@@ -8,20 +8,17 @@
 ; YAML-NEXT:  Function:        test
 ; YAML-NEXT:  Args:
 ; YAML-NEXT:  - String:          'Stores SLP vectorized with cost '
-; YAML-NEXT:  - Cost:            '2'
+; YAML-NEXT:  - Cost:            '0'
 ; YAML-NEXT:  - String:          ' and with tree size '
-; YAML-NEXT:  - TreeSize:        '7'
+; YAML-NEXT:  - TreeSize:        '9'
 
 define void @test() {
 ; CHECK-LABEL: define void @test(
 ; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr null, align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr null, align 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr null, align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> <float poison, float 0.000000e+00>, float [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x float> [[TMP4]], float [[TMP2]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> zeroinitializer, i32 4, <2 x i1> <i1 true, i1 true>, <2 x float> poison)
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> <float poison, float 0.000000e+00>, float [[TMP2]], i32 0
 ; CHECK-NEXT:    [[TMP6:%.*]] = fcmp ogt <2 x float> [[TMP3]], [[TMP5]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i1> [[TMP6]], <2 x i1> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
 ; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 poison>

diff  --git a/llvm/test/Transforms/SLPVectorizer/RISCV/reversed-strided-node-with-external-ptr.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/reversed-strided-node-with-external-ptr.ll
index 3fa42047162e45..9c1da08c64b7b7 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/reversed-strided-node-with-external-ptr.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/reversed-strided-node-with-external-ptr.ll
@@ -10,10 +10,10 @@ define void @test(ptr %a, i64 %0) {
 ; CHECK-NEXT:    br label %[[BB:.*]]
 ; CHECK:       [[BB]]:
 ; CHECK-NEXT:    [[TMP3:%.*]] = or disjoint i64 [[TMP0]], 1
-; CHECK-NEXT:    [[ARRAYIDX17_I28_1:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP3]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i64> poison, i64 [[TMP3]], i32 0
 ; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i64> [[TMP4]], i64 0, i32 1
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr double, <2 x ptr> [[TMP2]], <2 x i64> [[TMP5]]
+; CHECK-NEXT:    [[ARRAYIDX17_I28_1:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP3]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> [[TMP6]], i32 8, <2 x i1> <i1 true, i1 true>, <2 x double> poison)
 ; CHECK-NEXT:    [[TMP8:%.*]] = load <2 x double>, ptr [[A]], align 8
 ; CHECK-NEXT:    [[TMP9:%.*]] = load <2 x double>, ptr [[A]], align 8

diff  --git a/llvm/test/Transforms/SLPVectorizer/RISCV/scatter-vectorize-reversed.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/scatter-vectorize-reversed.ll
index 2daa3b58e5c3ac..98333c7b420cf0 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/scatter-vectorize-reversed.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/scatter-vectorize-reversed.ll
@@ -5,12 +5,12 @@ define <4 x i32> @test(<2 x i64> %v, ptr %p) {
 ; CHECK-LABEL: define <4 x i32> @test(
 ; CHECK-SAME: <2 x i64> [[V:%.*]], ptr [[P:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i64> [[V]], <2 x i64> poison, <2 x i32> <i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x ptr> poison, ptr [[P]], i32 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x ptr> [[TMP0]], <2 x ptr> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i16, <2 x ptr> [[TMP1]], <2 x i64> [[V]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i16, <2 x ptr> [[TMP1]], <2 x i64> [[TMP4]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = call <2 x i16> @llvm.masked.gather.v2i16.v2p0(<2 x ptr> [[TMP2]], i32 2, <2 x i1> <i1 true, i1 true>, <2 x i16> poison)
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[TMP7:%.*]] = zext <2 x i16> [[TMP4]] to <2 x i32>
+; CHECK-NEXT:    [[TMP7:%.*]] = zext <2 x i16> [[TMP3]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> zeroinitializer, <4 x i32> [[TMP6]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; CHECK-NEXT:    ret <4 x i32> [[TMP5]]

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll b/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll
index 26c4d55436d22b..59b0352a825929 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll
@@ -24,7 +24,7 @@ define i32 @test(ptr noalias %p, ptr noalias %addr) {
   ; YAML-NEXT:   - String:          'Vectorized horizontal reduction with cost '
   ; YAML-NEXT:   - Cost:            '-1'
   ; YAML-NEXT:   - String:          ' and with tree size '
-  ; YAML-NEXT:   - TreeSize:        '7'
+  ; YAML-NEXT:   - TreeSize:        '8'
 entry:
   %off0.1 = getelementptr inbounds i32, ptr %addr, i32 1
   %idx0 = load i32, ptr %off0.1, align 8



