[llvm] [SLP]Initial support for interleaved loads (PR #112042)

Alexey Bataev via llvm-commits llvm-commits at lists.llvm.org
Fri Oct 11 12:55:27 PDT 2024


https://github.com/alexey-bataev created https://github.com/llvm/llvm-project/pull/112042

Adds initial support for interleaved loads, which allows
emission of segmented loads for RISCV RVV.

Vectorizes extra code for RISCV
CFP2006/447.dealII, CFP2006/453.povray,
CFP2017rate/510.parest_r, CFP2017rate/511.povray_r,
CFP2017rate/526.blender_r, CFP2017rate/538.imagick_r, CINT2006/403.gcc,
CINT2006/473.astar, CINT2017rate/502.gcc_r, CINT2017rate/525.x264_r


>From 78074c68393d253155a0a486cc8a79d8f530b85f Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev at outlook.com>
Date: Fri, 11 Oct 2024 19:55:13 +0000
Subject: [PATCH] =?UTF-8?q?[=F0=9D=98=80=F0=9D=97=BD=F0=9D=97=BF]=20initia?=
 =?UTF-8?q?l=20version?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Created using spr 1.3.5
---
 .../llvm/Analysis/TargetTransformInfo.h       |  15 +
 .../llvm/Analysis/TargetTransformInfoImpl.h   |   5 +
 llvm/lib/Analysis/TargetTransformInfo.cpp     |   7 +
 .../Target/RISCV/RISCVTargetTransformInfo.h   |   6 +
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 119 ++++-
 .../SLPVectorizer/RISCV/complex-loads.ll      | 473 ++++++++----------
 .../SLPVectorizer/RISCV/segmented-loads.ll    |   5 +-
 7 files changed, 361 insertions(+), 269 deletions(-)

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 64dc9aacd5c57b..0459941fe05cdc 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -803,6 +803,12 @@ class TargetTransformInfo {
   /// Return true if the target supports strided load.
   bool isLegalStridedLoadStore(Type *DataType, Align Alignment) const;
 
+  /// Return true if the target supports interleaved access for the given vector
+  /// type \p VTy, interleave factor \p Factor, alignment \p Alignment and
+  /// address space \p AddrSpace.
+  bool isLegalInterleavedAccessType(VectorType *VTy, unsigned Factor,
+                                    Align Alignment, unsigned AddrSpace) const;
+
   // Return true if the target supports masked vector histograms.
   bool isLegalMaskedVectorHistogram(Type *AddrType, Type *DataType) const;
 
@@ -1934,6 +1940,10 @@ class TargetTransformInfo::Concept {
   virtual bool isLegalMaskedCompressStore(Type *DataType, Align Alignment) = 0;
   virtual bool isLegalMaskedExpandLoad(Type *DataType, Align Alignment) = 0;
   virtual bool isLegalStridedLoadStore(Type *DataType, Align Alignment) = 0;
+  virtual bool isLegalInterleavedAccessType(VectorType *VTy, unsigned Factor,
+                                            Align Alignment,
+                                            unsigned AddrSpace) = 0;
+
   virtual bool isLegalMaskedVectorHistogram(Type *AddrType, Type *DataType) = 0;
   virtual bool isLegalAltInstr(VectorType *VecTy, unsigned Opcode0,
                                unsigned Opcode1,
@@ -2456,6 +2466,11 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
   bool isLegalStridedLoadStore(Type *DataType, Align Alignment) override {
     return Impl.isLegalStridedLoadStore(DataType, Alignment);
   }
+  bool isLegalInterleavedAccessType(VectorType *VTy, unsigned Factor,
+                                    Align Alignment,
+                                    unsigned AddrSpace) override {
+    return Impl.isLegalInterleavedAccessType(VTy, Factor, Alignment, AddrSpace);
+  }
   bool isLegalMaskedVectorHistogram(Type *AddrType, Type *DataType) override {
     return Impl.isLegalMaskedVectorHistogram(AddrType, DataType);
   }
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 1c4fcb57755ecf..dbdfb4d8cdfa32 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -332,6 +332,11 @@ class TargetTransformInfoImplBase {
     return false;
   }
 
+  bool isLegalInterleavedAccessType(VectorType *VTy, unsigned Factor,
+                                    Align Alignment, unsigned AddrSpace) {
+    return false;
+  }
+
   bool isLegalMaskedVectorHistogram(Type *AddrType, Type *DataType) const {
     return false;
   }
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 8ab8a53b753112..1bad3314677038 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -521,6 +521,13 @@ bool TargetTransformInfo::isLegalStridedLoadStore(Type *DataType,
   return TTIImpl->isLegalStridedLoadStore(DataType, Alignment);
 }
 
+bool TargetTransformInfo::isLegalInterleavedAccessType(
+    VectorType *VTy, unsigned Factor, Align Alignment,
+    unsigned AddrSpace) const {
+  return TTIImpl->isLegalInterleavedAccessType(VTy, Factor, Alignment,
+                                               AddrSpace);
+}
+
 bool TargetTransformInfo::isLegalMaskedVectorHistogram(Type *AddrType,
                                                        Type *DataType) const {
   return TTIImpl->isLegalMaskedVectorHistogram(AddrType, DataType);
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index 3f50bd86b9b3b6..13d28e4db49cd9 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -295,6 +295,12 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {
     return TLI->isLegalStridedLoadStore(DataTypeVT, Alignment);
   }
 
+  bool isLegalInterleavedAccessType(VectorType *VTy, unsigned Factor,
+                                    Align Alignment, unsigned AddrSpace) {
+    return TLI->isLegalInterleavedAccessType(VTy, Factor, Alignment, AddrSpace,
+                                             DL);
+  }
+
   bool isLegalMaskedCompressStore(Type *DataTy, Align Alignment);
 
   bool isVScaleKnownToBeAPowerOfTwo() const {
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 5c164075e83259..1e8939988037d6 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2922,7 +2922,7 @@ class BoUpSLP {
 
   /// This is the recursive part of buildTree.
   void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth,
-                     const EdgeInfo &EI);
+                     const EdgeInfo &EI, unsigned InterleaveFactor = 0);
 
   /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
   /// be vectorized to use the original vector (or aggregate "bitcast" to a
@@ -3226,7 +3226,15 @@ class BoUpSLP {
     Instruction *MainOp = nullptr;
     Instruction *AltOp = nullptr;
 
+    /// Interleaving factor for interleaved-load Vectorize nodes.
+    unsigned InterleaveFactor = 0;
+
   public:
+    /// Returns interleave factor for interleave nodes.
+    unsigned getInterleaveFactor() const { return InterleaveFactor; }
+    /// Sets the interleave factor for interleave nodes.
+    void setInterleave(unsigned Factor) { InterleaveFactor = Factor; }
+
     /// Set this bundle's \p OpIdx'th operand to \p OpVL.
     void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
       if (Operands.size() < OpIdx + 1)
@@ -3390,7 +3398,12 @@ class BoUpSLP {
       dbgs() << "State: ";
       switch (State) {
       case Vectorize:
-        dbgs() << "Vectorize\n";
+        if (InterleaveFactor > 0) {
+          dbgs() << "Vectorize with interleave factor " << InterleaveFactor
+                 << "\n";
+        } else {
+          dbgs() << "Vectorize\n";
+        }
         break;
       case ScatterVectorize:
         dbgs() << "ScatterVectorize\n";
@@ -3460,11 +3473,15 @@ class BoUpSLP {
                           const InstructionsState &S,
                           const EdgeInfo &UserTreeIdx,
                           ArrayRef<int> ReuseShuffleIndices = {},
-                          ArrayRef<unsigned> ReorderIndices = {}) {
+                          ArrayRef<unsigned> ReorderIndices = {},
+                          unsigned InterleaveFactor = 0) {
     TreeEntry::EntryState EntryState =
         Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
-    return newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
-                        ReuseShuffleIndices, ReorderIndices);
+    TreeEntry *E = newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
+                                ReuseShuffleIndices, ReorderIndices);
+    if (E && InterleaveFactor > 0)
+      E->setInterleave(InterleaveFactor);
+    return E;
   }
 
   TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
@@ -6932,11 +6949,16 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
             // distance between scalar loads in these nodes.
             unsigned MaxVF = Slice.size();
             unsigned UserMaxVF = 0;
+            unsigned InterleaveFactor = 0;
             if (MaxVF == 2) {
               UserMaxVF = MaxVF;
             } else {
+              // Distance between segments of the interleaved loads, if found.
+              std::optional<unsigned> InterleavedLoadsDistance = 0;
+              unsigned Order = 0;
               std::optional<unsigned> CommonVF = 0;
               DenseMap<const TreeEntry *, unsigned> EntryToPosition;
+              SmallPtrSet<const TreeEntry *, 8> DeinterleavedNodes;
               for (auto [Idx, V] : enumerate(Slice)) {
                 for (const TreeEntry *E : ValueToGatherNodes.at(V)) {
                   UserMaxVF = std::max<unsigned>(UserMaxVF, E->Scalars.size());
@@ -6951,12 +6973,60 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
                     if (*CommonVF != E->Scalars.size())
                       CommonVF.reset();
                   }
+                  // Check if the load is part of the interleaved load.
+                  if (Pos != Idx && InterleavedLoadsDistance) {
+                    if (!DeinterleavedNodes.contains(E) &&
+                        any_of(E->Scalars, [&, Slice = Slice](Value *V) {
+                          if (isa<Constant>(V))
+                            return false;
+                          if (getTreeEntry(V))
+                            return true;
+                          const auto &Nodes = ValueToGatherNodes.at(V);
+                          return (Nodes.size() != 1 || !Nodes.contains(E)) &&
+                                 !is_contained(Slice, V);
+                        })) {
+                      InterleavedLoadsDistance.reset();
+                      continue;
+                    }
+                    DeinterleavedNodes.insert(E);
+                    if (*InterleavedLoadsDistance == 0) {
+                      InterleavedLoadsDistance = Idx - Pos;
+                      continue;
+                    }
+                    if ((Idx - Pos) % *InterleavedLoadsDistance != 0 ||
+                        (Idx - Pos) / *InterleavedLoadsDistance < Order)
+                      InterleavedLoadsDistance.reset();
+                    Order = (Idx - Pos) / InterleavedLoadsDistance.value_or(1);
+                  }
+                }
+              }
+              DeinterleavedNodes.clear();
+              // Check if the large load represents an interleaved load operation.
+              if (InterleavedLoadsDistance.value_or(0) > 1 &&
+                  CommonVF.value_or(0) != 0) {
+                InterleaveFactor = bit_ceil(*InterleavedLoadsDistance);
+                unsigned VF = *CommonVF;
+                OrdersType Order;
+                SmallVector<Value *> PointerOps;
+                // Segmented load detected - vectorize at maximum vector factor.
+                if (TTI->isLegalInterleavedAccessType(
+                        getWidenedType(Slice.front()->getType(), VF),
+                        InterleaveFactor,
+                        cast<LoadInst>(Slice.front())->getAlign(),
+                        cast<LoadInst>(Slice.front())
+                            ->getPointerAddressSpace()) &&
+                    canVectorizeLoads(Slice, Slice.front(), Order,
+                                      PointerOps) == LoadsState::Vectorize) {
+                  UserMaxVF = InterleaveFactor * VF;
+                } else {
+                  InterleaveFactor = 0;
                 }
               }
               // Cannot represent the loads as consecutive vectorizable nodes -
               // just exit.
               unsigned ConsecutiveNodesSize = 0;
               if (!LoadEntriesToVectorize.empty() &&
+                  InterleaveFactor == 0 &&
                   any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
                          [&, Slice = Slice](const auto &P) {
                            const auto *It = find_if(Slice, [&](Value *V) {
@@ -6976,7 +7046,8 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
                 continue;
               // Try to build long masked gather loads.
               UserMaxVF = bit_ceil(UserMaxVF);
-              if (any_of(seq<unsigned>(Slice.size() / UserMaxVF),
+              if (InterleaveFactor == 0 &&
+                  any_of(seq<unsigned>(Slice.size() / UserMaxVF),
                          [&, Slice = Slice](unsigned Idx) {
                            OrdersType Order;
                            SmallVector<Value *> PointerOps;
@@ -7008,9 +7079,15 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
                            }))
                   continue;
                 unsigned Sz = VectorizableTree.size();
-                buildTree_rec(SubSlice, 0, EdgeInfo());
+                buildTree_rec(SubSlice, 0, EdgeInfo(), InterleaveFactor);
                 if (Sz == VectorizableTree.size()) {
                   IsVectorized = false;
+                  // Try non-interleaved vectorization with smaller vector
+                  // factor.
+                  if (InterleaveFactor > 0) {
+                    VF = 2 * (MaxVF / InterleaveFactor);
+                    InterleaveFactor = 0;
+                  }
                   continue;
                 }
               }
@@ -7374,6 +7451,11 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
       }
       return TreeEntry::ScatterVectorize;
     case LoadsState::StridedVectorize:
+      if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
+        // Delay slow vectorized nodes for better vectorization attempts.
+        LoadEntriesToVectorize.insert(VectorizableTree.size());
+        return TreeEntry::NeedToGather;
+      }
       return TreeEntry::StridedVectorize;
     case LoadsState::Gather:
 #ifndef NDEBUG
@@ -7707,7 +7789,8 @@ class PHIHandler {
 } // namespace
 
 void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
-                            const EdgeInfo &UserTreeIdx) {
+                            const EdgeInfo &UserTreeIdx,
+                            unsigned InterleaveFactor) {
   assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
 
   SmallVector<int> ReuseShuffleIndices;
@@ -8185,7 +8268,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
       switch (State) {
       case TreeEntry::Vectorize:
         TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
-                          ReuseShuffleIndices, CurrentOrder);
+                          ReuseShuffleIndices, CurrentOrder, InterleaveFactor);
         if (CurrentOrder.empty())
           LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n");
         else
@@ -9895,6 +9978,12 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
             Idx = EMask[Idx];
         }
         CommonVF = E->Scalars.size();
+      } else if (std::optional<unsigned> Factor = E->getInterleaveFactor();
+                 Factor && E->Scalars.size() != Mask.size() &&
+                 ShuffleVectorInst::isDeInterleaveMaskOfFactor(CommonMask,
+                                                               *Factor)) {
+        // Deinterleaved nodes are free.
+        std::iota(CommonMask.begin(), CommonMask.end(), 0);
       }
       ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
       V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
@@ -10968,10 +11057,15 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
     auto *LI0 = cast<LoadInst>(VL0);
     auto GetVectorCost = [&](InstructionCost CommonCost) {
       InstructionCost VecLdCost;
-      if (E->State == TreeEntry::Vectorize) {
+      if (E->State == TreeEntry::Vectorize && !E->getInterleaveFactor()) {
         VecLdCost = TTI->getMemoryOpCost(
             Instruction::Load, VecTy, LI0->getAlign(),
             LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
+      } else if (std::optional<unsigned> Factor = E->getInterleaveFactor();
+                 E->State == TreeEntry::Vectorize && Factor.value_or(0) > 0) {
+        VecLdCost = TTI->getInterleavedMemoryOpCost(
+            Instruction::Load, VecTy, *Factor, std::nullopt, LI0->getAlign(),
+            LI0->getPointerAddressSpace(), CostKind);
       } else if (E->State == TreeEntry::StridedVectorize) {
         Align CommonAlignment =
             computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
@@ -11397,6 +11491,11 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
       }))
     return false;
 
+  if (VectorizableTree.back()->isGather() &&
+      VectorizableTree.back()->isAltShuffle() &&
+      VectorizableTree.back()->getVectorFactor() > 2)
+    return false;
+
   assert(VectorizableTree.empty()
              ? ExternalUses.empty()
              : true && "We shouldn't have any external users");
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
index 443f17a9c09e7a..8c6b92b65ae050 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
@@ -11,9 +11,6 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[PIX1]], i64 4
 ; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr i8, ptr [[PIX2]], i64 4
 ; CHECK-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr i8, ptr [[PIX1]], i64 1
-; CHECK-NEXT:    [[ARRAYIDX22:%.*]] = getelementptr i8, ptr [[PIX2]], i64 1
-; CHECK-NEXT:    [[ARRAYIDX25:%.*]] = getelementptr i8, ptr [[PIX1]], i64 5
-; CHECK-NEXT:    [[ARRAYIDX27:%.*]] = getelementptr i8, ptr [[PIX2]], i64 5
 ; CHECK-NEXT:    [[ARRAYIDX32:%.*]] = getelementptr i8, ptr [[PIX1]], i64 3
 ; CHECK-NEXT:    [[TMP10:%.*]] = load i8, ptr [[ARRAYIDX32]], align 1
 ; CHECK-NEXT:    [[CONV33:%.*]] = zext i8 [[TMP10]] to i32
@@ -24,59 +21,42 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[ARRAYIDX3_1:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 4
 ; CHECK-NEXT:    [[ARRAYIDX5_1:%.*]] = getelementptr i8, ptr [[ADD_PTR644]], i64 4
 ; CHECK-NEXT:    [[ARRAYIDX8_1:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 1
-; CHECK-NEXT:    [[ARRAYIDX32_1:%.*]] = getelementptr i8, ptr [[ADD_PTR644]], i64 2
-; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr [[ARRAYIDX32_1]], align 1
-; CHECK-NEXT:    [[ARRAYIDX25_1:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 6
-; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr [[ARRAYIDX25_1]], align 1
-; CHECK-NEXT:    [[ARRAYIDX27_1:%.*]] = getelementptr i8, ptr [[ADD_PTR644]], i64 6
+; CHECK-NEXT:    [[ARRAYIDX27_1:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 3
 ; CHECK-NEXT:    [[TMP5:%.*]] = load i8, ptr [[ARRAYIDX27_1]], align 1
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i8> poison, i8 [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x i8> [[TMP6]], i8 [[TMP5]], i32 1
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i8> [[TMP7]], i32 0
-; CHECK-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP8]] to i32
-; CHECK-NEXT:    [[ARRAYIDX32_2:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 3
-; CHECK-NEXT:    [[ARRAYIDX34_1:%.*]] = getelementptr i8, ptr [[ADD_PTR644]], i64 3
-; CHECK-NEXT:    [[TMP18:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX32_2]], i64 4, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT:    [[TMP20:%.*]] = zext <2 x i8> [[TMP18]] to <2 x i16>
-; CHECK-NEXT:    [[TMP12:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX34_1]], i64 4, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT:    [[TMP13:%.*]] = zext <2 x i8> [[TMP12]] to <2 x i16>
-; CHECK-NEXT:    [[TMP28:%.*]] = sub <2 x i16> [[TMP20]], [[TMP13]]
-; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <2 x i16> [[TMP28]], i32 1
-; CHECK-NEXT:    [[TMP16:%.*]] = sext i16 [[TMP15]] to i32
-; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <2 x i16> [[TMP28]], i32 0
-; CHECK-NEXT:    [[CONV33_1:%.*]] = sext i16 [[TMP17]] to i32
+; CHECK-NEXT:    [[CONV33_1:%.*]] = zext i8 [[TMP5]] to i32
 ; CHECK-NEXT:    [[ADD_PTR_1:%.*]] = getelementptr i8, ptr [[ADD_PTR]], i64 [[IDX_EXT]]
 ; CHECK-NEXT:    [[ADD_PTR64_1:%.*]] = getelementptr i8, ptr [[ADD_PTR64]], i64 [[IDX_EXT63]]
 ; CHECK-NEXT:    [[ARRAYIDX3_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 4
 ; CHECK-NEXT:    [[ARRAYIDX5_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 4
 ; CHECK-NEXT:    [[ARRAYIDX8_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 1
-; CHECK-NEXT:    [[ARRAYIDX10_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 1
-; CHECK-NEXT:    [[ARRAYIDX13_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 5
-; CHECK-NEXT:    [[ARRAYIDX15_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 5
-; CHECK-NEXT:    [[TMP19:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ADD_PTR_1]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i8>, ptr [[ADD_PTR_1]], align 1
+; CHECK-NEXT:    [[TMP33:%.*]] = load i8, ptr [[ARRAYIDX8_2]], align 1
 ; CHECK-NEXT:    [[TMP29:%.*]] = load i8, ptr [[ADD_PTR_1]], align 1
+; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
 ; CHECK-NEXT:    [[TMP21:%.*]] = zext <2 x i8> [[TMP19]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP84:%.*]] = zext i8 [[TMP29]] to i32
-; CHECK-NEXT:    [[TMP22:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ADD_PTR64_1]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_1]], align 1
+; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
 ; CHECK-NEXT:    [[TMP31:%.*]] = zext <2 x i8> [[TMP22]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP23:%.*]] = sub <2 x i32> [[TMP21]], [[TMP31]]
-; CHECK-NEXT:    [[TMP49:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX3_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT:    [[TMP13:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_2]], align 1
+; CHECK-NEXT:    [[TMP49:%.*]] = shufflevector <4 x i8> [[TMP13]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
 ; CHECK-NEXT:    [[TMP26:%.*]] = zext <2 x i8> [[TMP49]] to <2 x i32>
-; CHECK-NEXT:    [[TMP27:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT:    [[TMP16:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_2]], align 1
+; CHECK-NEXT:    [[TMP27:%.*]] = shufflevector <4 x i8> [[TMP16]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
 ; CHECK-NEXT:    [[TMP50:%.*]] = zext <2 x i8> [[TMP27]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP24:%.*]] = sub <2 x i32> [[TMP26]], [[TMP50]]
 ; CHECK-NEXT:    [[TMP25:%.*]] = shl <2 x i32> [[TMP24]], <i32 16, i32 16>
 ; CHECK-NEXT:    [[TMP30:%.*]] = add <2 x i32> [[TMP25]], [[TMP23]]
-; CHECK-NEXT:    [[TMP32:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX8_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT:    [[TMP33:%.*]] = load i8, ptr [[ARRAYIDX8_2]], align 1
+; CHECK-NEXT:    [[TMP32:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
 ; CHECK-NEXT:    [[TMP51:%.*]] = zext <2 x i8> [[TMP32]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP83:%.*]] = zext i8 [[TMP33]] to i32
-; CHECK-NEXT:    [[TMP56:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX10_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT:    [[TMP56:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
 ; CHECK-NEXT:    [[TMP57:%.*]] = zext <2 x i8> [[TMP56]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP35:%.*]] = sub <2 x i32> [[TMP51]], [[TMP57]]
-; CHECK-NEXT:    [[TMP38:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX13_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT:    [[TMP38:%.*]] = shufflevector <4 x i8> [[TMP13]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
 ; CHECK-NEXT:    [[TMP39:%.*]] = zext <2 x i8> [[TMP38]] to <2 x i32>
-; CHECK-NEXT:    [[TMP40:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX15_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT:    [[TMP40:%.*]] = shufflevector <4 x i8> [[TMP16]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
 ; CHECK-NEXT:    [[TMP61:%.*]] = zext <2 x i8> [[TMP40]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP36:%.*]] = sub <2 x i32> [[TMP39]], [[TMP61]]
 ; CHECK-NEXT:    [[TMP37:%.*]] = shl <2 x i32> [[TMP36]], <i32 16, i32 16>
@@ -86,44 +66,40 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[TMP73:%.*]] = extractelement <2 x i32> [[TMP43]], i32 0
 ; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <2 x i32> [[TMP43]], i32 1
 ; CHECK-NEXT:    [[ADD48_2:%.*]] = add i32 [[TMP34]], [[TMP73]]
-; CHECK-NEXT:    [[SUB51_2:%.*]] = sub i32 [[TMP73]], [[TMP34]]
 ; CHECK-NEXT:    [[TMP47:%.*]] = extractelement <2 x i32> [[TMP44]], i32 0
 ; CHECK-NEXT:    [[TMP48:%.*]] = extractelement <2 x i32> [[TMP44]], i32 1
 ; CHECK-NEXT:    [[ADD55_2:%.*]] = add i32 [[TMP48]], [[TMP47]]
-; CHECK-NEXT:    [[TMP68:%.*]] = sub i32 [[TMP47]], [[TMP48]]
-; CHECK-NEXT:    [[ARRAYIDX3_3:%.*]] = getelementptr i8, ptr null, i64 4
 ; CHECK-NEXT:    [[ARRAYIDX5_3:%.*]] = getelementptr i8, ptr null, i64 4
-; CHECK-NEXT:    [[ARRAYIDX8_3:%.*]] = getelementptr i8, ptr null, i64 1
-; CHECK-NEXT:    [[ARRAYIDX10_3:%.*]] = getelementptr i8, ptr null, i64 1
-; CHECK-NEXT:    [[ARRAYIDX15_3:%.*]] = getelementptr i8, ptr null, i64 5
-; CHECK-NEXT:    [[TMP53:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT:    [[TMP53:%.*]] = load <2 x i8>, ptr null, align 1
 ; CHECK-NEXT:    [[TMP52:%.*]] = load i8, ptr null, align 1
 ; CHECK-NEXT:    [[TMP62:%.*]] = zext <2 x i8> [[TMP53]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP77:%.*]] = zext i8 [[TMP52]] to i32
-; CHECK-NEXT:    [[TMP54:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT:    [[TMP54:%.*]] = load <2 x i8>, ptr null, align 1
 ; CHECK-NEXT:    [[TMP55:%.*]] = zext <2 x i8> [[TMP54]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP59:%.*]] = sub <2 x i32> [[TMP62]], [[TMP55]]
-; CHECK-NEXT:    [[TMP41:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX3_3]], i64 -4, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT:    [[TMP41:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 4, <2 x i1> <i1 true, i1 true>, i32 2)
 ; CHECK-NEXT:    [[TMP58:%.*]] = zext <2 x i8> [[TMP41]] to <2 x i32>
-; CHECK-NEXT:    [[TMP63:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT:    [[TMP60:%.*]] = shufflevector <2 x i32> [[TMP58]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[TMP63:%.*]] = load <2 x i8>, ptr [[ARRAYIDX5_3]], align 1
 ; CHECK-NEXT:    [[TMP76:%.*]] = zext <2 x i8> [[TMP63]] to <2 x i32>
-; CHECK-NEXT:    [[TMP45:%.*]] = sub <2 x i32> [[TMP58]], [[TMP76]]
+; CHECK-NEXT:    [[TMP45:%.*]] = sub <2 x i32> [[TMP60]], [[TMP76]]
 ; CHECK-NEXT:    [[TMP46:%.*]] = shl <2 x i32> [[TMP45]], <i32 16, i32 16>
-; CHECK-NEXT:    [[TMP60:%.*]] = add <2 x i32> [[TMP46]], [[TMP59]]
-; CHECK-NEXT:    [[TMP64:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX8_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT:    [[TMP90:%.*]] = add <2 x i32> [[TMP46]], [[TMP59]]
+; CHECK-NEXT:    [[ARRAYIDX20_3:%.*]] = getelementptr i8, ptr null, i64 2
+; CHECK-NEXT:    [[ARRAYIDX22_3:%.*]] = getelementptr i8, ptr null, i64 2
+; CHECK-NEXT:    [[ARRAYIDX27_3:%.*]] = getelementptr i8, ptr null, i64 6
+; CHECK-NEXT:    [[TMP64:%.*]] = load <2 x i8>, ptr [[ARRAYIDX20_3]], align 1
 ; CHECK-NEXT:    [[TMP79:%.*]] = zext <2 x i8> [[TMP64]] to <2 x i32>
-; CHECK-NEXT:    [[TMP82:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX10_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT:    [[TMP82:%.*]] = load <2 x i8>, ptr [[ARRAYIDX22_3]], align 1
 ; CHECK-NEXT:    [[TMP91:%.*]] = zext <2 x i8> [[TMP82]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP65:%.*]] = sub <2 x i32> [[TMP79]], [[TMP91]]
 ; CHECK-NEXT:    [[TMP75:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> zeroinitializer, i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
 ; CHECK-NEXT:    [[TMP98:%.*]] = zext <2 x i8> [[TMP75]] to <2 x i32>
-; CHECK-NEXT:    [[TMP100:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX15_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT:    [[TMP100:%.*]] = load <2 x i8>, ptr [[ARRAYIDX27_3]], align 1
 ; CHECK-NEXT:    [[TMP103:%.*]] = zext <2 x i8> [[TMP100]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP69:%.*]] = sub <2 x i32> [[TMP98]], [[TMP103]]
 ; CHECK-NEXT:    [[TMP70:%.*]] = shl <2 x i32> [[TMP69]], <i32 16, i32 16>
-; CHECK-NEXT:    [[TMP72:%.*]] = add <2 x i32> [[TMP70]], [[TMP65]]
-; CHECK-NEXT:    [[TMP90:%.*]] = add <2 x i32> [[TMP72]], [[TMP60]]
-; CHECK-NEXT:    [[TMP74:%.*]] = sub <2 x i32> [[TMP60]], [[TMP72]]
+; CHECK-NEXT:    [[TMP74:%.*]] = add <2 x i32> [[TMP70]], [[TMP65]]
 ; CHECK-NEXT:    [[TMP78:%.*]] = extractelement <2 x i32> [[TMP90]], i32 0
 ; CHECK-NEXT:    [[TMP71:%.*]] = extractelement <2 x i32> [[TMP90]], i32 1
 ; CHECK-NEXT:    [[ADD48_3:%.*]] = add i32 [[TMP71]], [[TMP78]]
@@ -132,46 +108,64 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <2 x i32> [[TMP74]], i32 1
 ; CHECK-NEXT:    [[ADD55_3:%.*]] = add i32 [[TMP81]], [[TMP80]]
 ; CHECK-NEXT:    [[TMP107:%.*]] = sub i32 [[TMP80]], [[TMP81]]
-; CHECK-NEXT:    [[ADD94:%.*]] = add i32 [[ADD48_3]], [[ADD48_2]]
-; CHECK-NEXT:    [[SUB102:%.*]] = sub i32 [[ADD48_2]], [[ADD48_3]]
+; CHECK-NEXT:    [[ADD48_4:%.*]] = add i32 [[ADD55_3]], [[ADD48_3]]
+; CHECK-NEXT:    [[TMP113:%.*]] = shufflevector <2 x i32> [[TMP43]], <2 x i32> poison, <2 x i32> <i32 poison, i32 0>
+; CHECK-NEXT:    [[TMP122:%.*]] = insertelement <2 x i32> [[TMP113]], i32 [[ADD48_3]], i32 0
+; CHECK-NEXT:    [[TMP72:%.*]] = insertelement <2 x i32> [[TMP43]], i32 [[ADD55_3]], i32 0
+; CHECK-NEXT:    [[TMP123:%.*]] = sub <2 x i32> [[TMP122]], [[TMP72]]
+; CHECK-NEXT:    [[ADD55_4:%.*]] = add i32 [[TMP107]], [[SUB51_3]]
+; CHECK-NEXT:    [[TMP126:%.*]] = shufflevector <2 x i32> [[TMP44]], <2 x i32> poison, <2 x i32> <i32 poison, i32 0>
+; CHECK-NEXT:    [[TMP129:%.*]] = insertelement <2 x i32> [[TMP126]], i32 [[SUB51_3]], i32 0
+; CHECK-NEXT:    [[TMP130:%.*]] = insertelement <2 x i32> [[TMP44]], i32 [[TMP107]], i32 0
+; CHECK-NEXT:    [[TMP143:%.*]] = sub <2 x i32> [[TMP129]], [[TMP130]]
+; CHECK-NEXT:    [[ADD94:%.*]] = add i32 [[ADD48_4]], [[ADD48_2]]
+; CHECK-NEXT:    [[SUB102:%.*]] = sub i32 [[ADD48_2]], [[ADD48_4]]
 ; CHECK-NEXT:    [[SHR_I49_3:%.*]] = lshr i32 [[TMP77]], 15
 ; CHECK-NEXT:    [[AND_I50_3:%.*]] = and i32 [[SHR_I49_3]], 65537
 ; CHECK-NEXT:    [[MUL_I51_3:%.*]] = mul i32 [[AND_I50_3]], 65535
 ; CHECK-NEXT:    [[SHR_I_1:%.*]] = lshr i32 [[TMP34]], 15
 ; CHECK-NEXT:    [[AND_I_1:%.*]] = and i32 [[SHR_I_1]], 65537
 ; CHECK-NEXT:    [[MUL_I_1:%.*]] = mul i32 [[AND_I_1]], 65535
-; CHECK-NEXT:    [[ADD94_1:%.*]] = add i32 [[ADD55_3]], [[ADD55_2]]
-; CHECK-NEXT:    [[SUB102_1:%.*]] = sub i32 [[ADD55_2]], [[ADD55_3]]
+; CHECK-NEXT:    [[ADD94_5:%.*]] = add i32 [[ADD55_4]], [[ADD55_2]]
+; CHECK-NEXT:    [[SUB102_1:%.*]] = sub i32 [[ADD55_2]], [[ADD55_4]]
 ; CHECK-NEXT:    [[SHR_I_2:%.*]] = lshr i32 [[TMP83]], 15
 ; CHECK-NEXT:    [[AND_I_2:%.*]] = and i32 [[SHR_I_2]], 65537
 ; CHECK-NEXT:    [[MUL_I_2:%.*]] = mul i32 [[AND_I_2]], 65535
 ; CHECK-NEXT:    [[SHR_I49_1:%.*]] = lshr i32 [[TMP84]], 15
 ; CHECK-NEXT:    [[AND_I50_1:%.*]] = and i32 [[SHR_I49_1]], 65537
 ; CHECK-NEXT:    [[ADD94_2:%.*]] = mul i32 [[AND_I50_1]], 65535
-; CHECK-NEXT:    [[ADD94_4:%.*]] = add i32 [[SUB51_3]], [[SUB51_2]]
+; CHECK-NEXT:    [[TMP144:%.*]] = extractelement <2 x i32> [[TMP123]], i32 0
+; CHECK-NEXT:    [[TMP145:%.*]] = extractelement <2 x i32> [[TMP123]], i32 1
+; CHECK-NEXT:    [[ADD94_4:%.*]] = add i32 [[TMP144]], [[TMP145]]
+; CHECK-NEXT:    [[TMP169:%.*]] = sub i32 [[TMP145]], [[TMP144]]
 ; CHECK-NEXT:    [[SHR_I49_2:%.*]] = lshr i32 [[CONV_1]], 15
 ; CHECK-NEXT:    [[AND_I50_2:%.*]] = and i32 [[SHR_I49_2]], 65537
 ; CHECK-NEXT:    [[MUL_I51_2:%.*]] = mul i32 [[AND_I50_2]], 65535
-; CHECK-NEXT:    [[ADD94_3:%.*]] = add i32 [[TMP107]], [[TMP68]]
-; CHECK-NEXT:    [[SUB102_3:%.*]] = sub i32 [[TMP68]], [[TMP107]]
+; CHECK-NEXT:    [[TMP146:%.*]] = extractelement <2 x i32> [[TMP143]], i32 0
+; CHECK-NEXT:    [[TMP147:%.*]] = extractelement <2 x i32> [[TMP143]], i32 1
+; CHECK-NEXT:    [[ADD94_3:%.*]] = add i32 [[TMP146]], [[TMP147]]
+; CHECK-NEXT:    [[SUB102_3:%.*]] = sub i32 [[TMP147]], [[TMP146]]
 ; CHECK-NEXT:    [[SHR_I49_4:%.*]] = lshr i32 [[CONV1]], 15
 ; CHECK-NEXT:    [[AND_I50_4:%.*]] = and i32 [[SHR_I49_4]], 65537
 ; CHECK-NEXT:    [[MUL_I51_4:%.*]] = mul i32 [[AND_I50_4]], 65535
 ; CHECK-NEXT:    [[TMP66:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8]], align 1
 ; CHECK-NEXT:    [[TMP102:%.*]] = zext <2 x i8> [[TMP66]] to <2 x i32>
-; CHECK-NEXT:    [[TMP67:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[PIX2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT:    [[TMP148:%.*]] = load <4 x i8>, ptr [[PIX2]], align 1
+; CHECK-NEXT:    [[TMP67:%.*]] = shufflevector <4 x i8> [[TMP148]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
 ; CHECK-NEXT:    [[TMP85:%.*]] = zext <2 x i8> [[TMP67]] to <2 x i32>
-; CHECK-NEXT:    [[TMP106:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[TMP1]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT:    [[TMP149:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
+; CHECK-NEXT:    [[TMP106:%.*]] = shufflevector <4 x i8> [[TMP149]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
 ; CHECK-NEXT:    [[TMP108:%.*]] = zext <2 x i8> [[TMP106]] to <2 x i32>
-; CHECK-NEXT:    [[TMP109:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT:    [[TMP150:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1
+; CHECK-NEXT:    [[TMP109:%.*]] = shufflevector <4 x i8> [[TMP150]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
 ; CHECK-NEXT:    [[TMP89:%.*]] = zext <2 x i8> [[TMP109]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP87:%.*]] = sub <2 x i32> [[TMP108]], [[TMP89]]
 ; CHECK-NEXT:    [[TMP88:%.*]] = shl <2 x i32> [[TMP87]], <i32 16, i32 16>
-; CHECK-NEXT:    [[TMP112:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX22]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT:    [[TMP112:%.*]] = shufflevector <4 x i8> [[TMP148]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
 ; CHECK-NEXT:    [[TMP120:%.*]] = zext <2 x i8> [[TMP112]] to <2 x i32>
-; CHECK-NEXT:    [[TMP94:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX25]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT:    [[TMP94:%.*]] = shufflevector <4 x i8> [[TMP149]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
 ; CHECK-NEXT:    [[TMP128:%.*]] = zext <2 x i8> [[TMP94]] to <2 x i32>
-; CHECK-NEXT:    [[TMP131:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX27]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT:    [[TMP131:%.*]] = shufflevector <4 x i8> [[TMP150]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
 ; CHECK-NEXT:    [[TMP132:%.*]] = zext <2 x i8> [[TMP131]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP95:%.*]] = sub <2 x i32> [[TMP128]], [[TMP132]]
 ; CHECK-NEXT:    [[TMP96:%.*]] = shl <2 x i32> [[TMP95]], <i32 16, i32 16>
@@ -182,75 +176,70 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[TMP119:%.*]] = sub <2 x i32> [[TMP86]], [[TMP85]]
 ; CHECK-NEXT:    [[TMP92:%.*]] = add <2 x i32> [[TMP88]], [[TMP119]]
 ; CHECK-NEXT:    [[TMP93:%.*]] = shufflevector <2 x i32> [[TMP105]], <2 x i32> [[TMP92]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[TMP101:%.*]] = sub <2 x i32> [[TMP92]], [[TMP105]]
+; CHECK-NEXT:    [[TMP101:%.*]] = add <2 x i32> [[TMP105]], [[TMP92]]
+; CHECK-NEXT:    [[TMP151:%.*]] = sub <2 x i32> [[TMP92]], [[TMP105]]
 ; CHECK-NEXT:    [[TMP111:%.*]] = extractelement <2 x i32> [[TMP101]], i32 0
 ; CHECK-NEXT:    [[TMP99:%.*]] = extractelement <2 x i32> [[TMP101]], i32 1
 ; CHECK-NEXT:    [[ADD55:%.*]] = add i32 [[TMP99]], [[TMP111]]
+; CHECK-NEXT:    [[SUB51:%.*]] = sub i32 [[TMP111]], [[TMP99]]
+; CHECK-NEXT:    [[TMP153:%.*]] = extractelement <2 x i32> [[TMP151]], i32 0
+; CHECK-NEXT:    [[TMP157:%.*]] = extractelement <2 x i32> [[TMP151]], i32 1
+; CHECK-NEXT:    [[ADD78_1:%.*]] = add i32 [[TMP157]], [[TMP153]]
+; CHECK-NEXT:    [[SUB59:%.*]] = sub i32 [[TMP153]], [[TMP157]]
 ; CHECK-NEXT:    [[SHR_I59_1:%.*]] = lshr i32 [[TMP99]], 15
 ; CHECK-NEXT:    [[AND_I60_1:%.*]] = and i32 [[SHR_I59_1]], 65537
 ; CHECK-NEXT:    [[MUL_I61_1:%.*]] = mul i32 [[AND_I60_1]], 65535
+; CHECK-NEXT:    [[SHR_I59_4:%.*]] = lshr i32 [[TMP157]], 15
+; CHECK-NEXT:    [[AND_I60_4:%.*]] = and i32 [[SHR_I59_4]], 65537
+; CHECK-NEXT:    [[MUL_I61_4:%.*]] = mul i32 [[AND_I60_4]], 65535
 ; CHECK-NEXT:    [[TMP104:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8_1]], align 1
-; CHECK-NEXT:    [[TMP113:%.*]] = zext <2 x i8> [[TMP104]] to <2 x i32>
-; CHECK-NEXT:    [[TMP114:%.*]] = load <2 x i8>, ptr [[ADD_PTR644]], align 1
+; CHECK-NEXT:    [[TMP110:%.*]] = zext <2 x i8> [[TMP104]] to <2 x i32>
+; CHECK-NEXT:    [[TMP158:%.*]] = load <4 x i8>, ptr [[ADD_PTR644]], align 1
+; CHECK-NEXT:    [[TMP114:%.*]] = shufflevector <4 x i8> [[TMP158]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
 ; CHECK-NEXT:    [[TMP133:%.*]] = zext <2 x i8> [[TMP114]] to <2 x i32>
-; CHECK-NEXT:    [[TMP116:%.*]] = load <2 x i8>, ptr [[ARRAYIDX3_1]], align 1
+; CHECK-NEXT:    [[TMP121:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1
+; CHECK-NEXT:    [[TMP116:%.*]] = shufflevector <4 x i8> [[TMP121]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
 ; CHECK-NEXT:    [[TMP115:%.*]] = zext <2 x i8> [[TMP116]] to <2 x i32>
-; CHECK-NEXT:    [[TMP118:%.*]] = load <2 x i8>, ptr [[ARRAYIDX5_1]], align 1
+; CHECK-NEXT:    [[TMP159:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1
+; CHECK-NEXT:    [[TMP118:%.*]] = shufflevector <4 x i8> [[TMP159]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
 ; CHECK-NEXT:    [[TMP134:%.*]] = zext <2 x i8> [[TMP118]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP124:%.*]] = sub <2 x i32> [[TMP115]], [[TMP134]]
 ; CHECK-NEXT:    [[TMP125:%.*]] = shl <2 x i32> [[TMP124]], <i32 16, i32 16>
-; CHECK-NEXT:    [[TMP122:%.*]] = shufflevector <2 x i32> [[TMP113]], <2 x i32> poison, <2 x i32> <i32 poison, i32 0>
-; CHECK-NEXT:    [[TMP123:%.*]] = insertelement <2 x i32> [[TMP122]], i32 [[CONV_1]], i32 0
-; CHECK-NEXT:    [[TMP121:%.*]] = sub <2 x i32> [[TMP123]], [[TMP133]]
-; CHECK-NEXT:    [[TMP156:%.*]] = add <2 x i32> [[TMP125]], [[TMP121]]
-; CHECK-NEXT:    [[TMP126:%.*]] = shufflevector <2 x i8> [[TMP7]], <2 x i8> poison, <2 x i32> <i32 1, i32 poison>
-; CHECK-NEXT:    [[TMP127:%.*]] = insertelement <2 x i8> [[TMP126]], i8 [[TMP14]], i32 1
+; CHECK-NEXT:    [[TMP127:%.*]] = shufflevector <4 x i8> [[TMP158]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
 ; CHECK-NEXT:    [[TMP191:%.*]] = zext <2 x i8> [[TMP127]] to <2 x i32>
-; CHECK-NEXT:    [[TMP129:%.*]] = insertelement <2 x i32> [[TMP113]], i32 [[TMP9]], i32 0
-; CHECK-NEXT:    [[TMP130:%.*]] = sub <2 x i32> [[TMP129]], [[TMP191]]
-; CHECK-NEXT:    [[TMP135:%.*]] = insertelement <2 x i32> [[TMP130]], i32 [[TMP16]], i32 1
+; CHECK-NEXT:    [[TMP160:%.*]] = shufflevector <4 x i8> [[TMP121]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP161:%.*]] = zext <2 x i8> [[TMP160]] to <2 x i32>
+; CHECK-NEXT:    [[TMP171:%.*]] = shufflevector <4 x i8> [[TMP159]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP172:%.*]] = zext <2 x i8> [[TMP171]] to <2 x i32>
+; CHECK-NEXT:    [[TMP135:%.*]] = sub <2 x i32> [[TMP161]], [[TMP172]]
 ; CHECK-NEXT:    [[TMP136:%.*]] = shl <2 x i32> [[TMP135]], <i32 16, i32 16>
-; CHECK-NEXT:    [[TMP110:%.*]] = shufflevector <2 x i32> [[TMP130]], <2 x i32> poison, <2 x i32> <i32 1, i32 poison>
 ; CHECK-NEXT:    [[TMP137:%.*]] = insertelement <2 x i32> [[TMP110]], i32 [[CONV33_1]], i32 1
-; CHECK-NEXT:    [[TMP155:%.*]] = add <2 x i32> [[TMP136]], [[TMP137]]
+; CHECK-NEXT:    [[TMP173:%.*]] = sub <2 x i32> [[TMP137]], [[TMP191]]
+; CHECK-NEXT:    [[TMP174:%.*]] = add <2 x i32> [[TMP136]], [[TMP173]]
+; CHECK-NEXT:    [[TMP140:%.*]] = insertelement <2 x i32> [[TMP110]], i32 [[CONV_1]], i32 0
+; CHECK-NEXT:    [[TMP141:%.*]] = sub <2 x i32> [[TMP140]], [[TMP133]]
+; CHECK-NEXT:    [[TMP192:%.*]] = add <2 x i32> [[TMP125]], [[TMP141]]
+; CHECK-NEXT:    [[TMP156:%.*]] = add <2 x i32> [[TMP174]], [[TMP192]]
+; CHECK-NEXT:    [[TMP155:%.*]] = sub <2 x i32> [[TMP192]], [[TMP174]]
 ; CHECK-NEXT:    [[TMP139:%.*]] = extractelement <2 x i32> [[TMP156]], i32 0
 ; CHECK-NEXT:    [[TMP142:%.*]] = extractelement <2 x i32> [[TMP156]], i32 1
+; CHECK-NEXT:    [[ADD48_1:%.*]] = add i32 [[TMP142]], [[TMP139]]
 ; CHECK-NEXT:    [[SUB45_1:%.*]] = sub i32 [[TMP139]], [[TMP142]]
 ; CHECK-NEXT:    [[TMP138:%.*]] = extractelement <2 x i32> [[TMP155]], i32 0
-; CHECK-NEXT:    [[TMP171:%.*]] = extractelement <2 x i32> [[TMP155]], i32 1
-; CHECK-NEXT:    [[SUB47_1:%.*]] = sub i32 [[TMP138]], [[TMP171]]
-; CHECK-NEXT:    [[TMP140:%.*]] = shufflevector <2 x i32> [[TMP156]], <2 x i32> [[TMP105]], <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT:    [[TMP153:%.*]] = shufflevector <2 x i32> [[TMP156]], <2 x i32> [[TMP92]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[TMP192:%.*]] = add <2 x i32> [[TMP140]], [[TMP153]]
-; CHECK-NEXT:    [[TMP141:%.*]] = shufflevector <2 x i32> [[TMP105]], <2 x i32> [[TMP155]], <2 x i32> <i32 3, i32 1>
-; CHECK-NEXT:    [[TMP193:%.*]] = shufflevector <2 x i32> [[TMP92]], <2 x i32> [[TMP155]], <2 x i32> <i32 2, i32 1>
-; CHECK-NEXT:    [[TMP143:%.*]] = add <2 x i32> [[TMP141]], [[TMP193]]
-; CHECK-NEXT:    [[TMP144:%.*]] = extractelement <2 x i32> [[TMP192]], i32 1
-; CHECK-NEXT:    [[TMP145:%.*]] = extractelement <2 x i32> [[TMP143]], i32 1
-; CHECK-NEXT:    [[ADD48:%.*]] = add i32 [[TMP145]], [[TMP144]]
-; CHECK-NEXT:    [[SHR_I59:%.*]] = lshr i32 [[TMP145]], 15
-; CHECK-NEXT:    [[AND_I60:%.*]] = and i32 [[SHR_I59]], 65537
-; CHECK-NEXT:    [[MUL_I61:%.*]] = mul i32 [[AND_I60]], 65535
-; CHECK-NEXT:    [[TMP146:%.*]] = extractelement <2 x i32> [[TMP192]], i32 0
-; CHECK-NEXT:    [[TMP147:%.*]] = extractelement <2 x i32> [[TMP143]], i32 0
-; CHECK-NEXT:    [[ADD48_1:%.*]] = add i32 [[TMP147]], [[TMP146]]
-; CHECK-NEXT:    [[TMP148:%.*]] = sub <2 x i32> [[TMP192]], [[TMP143]]
-; CHECK-NEXT:    [[ADD55_1:%.*]] = add i32 [[SUB47_1]], [[SUB45_1]]
-; CHECK-NEXT:    [[TMP149:%.*]] = shufflevector <2 x i32> [[TMP101]], <2 x i32> poison, <2 x i32> <i32 poison, i32 0>
-; CHECK-NEXT:    [[TMP150:%.*]] = insertelement <2 x i32> [[TMP149]], i32 [[SUB45_1]], i32 0
-; CHECK-NEXT:    [[TMP151:%.*]] = insertelement <2 x i32> [[TMP101]], i32 [[SUB47_1]], i32 0
-; CHECK-NEXT:    [[TMP152:%.*]] = sub <2 x i32> [[TMP150]], [[TMP151]]
-; CHECK-NEXT:    [[SHR_I54:%.*]] = lshr i32 [[TMP147]], 15
+; CHECK-NEXT:    [[SUB47_1:%.*]] = extractelement <2 x i32> [[TMP155]], i32 1
+; CHECK-NEXT:    [[ADD94_1:%.*]] = add i32 [[SUB47_1]], [[TMP138]]
+; CHECK-NEXT:    [[SUB59_1:%.*]] = sub i32 [[TMP138]], [[SUB47_1]]
+; CHECK-NEXT:    [[SHR_I54:%.*]] = lshr i32 [[TMP142]], 15
 ; CHECK-NEXT:    [[AND_I55:%.*]] = and i32 [[SHR_I54]], 65537
 ; CHECK-NEXT:    [[MUL_I56:%.*]] = mul i32 [[AND_I55]], 65535
 ; CHECK-NEXT:    [[SHR_I54_1:%.*]] = lshr i32 [[SUB47_1]], 15
 ; CHECK-NEXT:    [[AND_I55_1:%.*]] = and i32 [[SHR_I54_1]], 65537
 ; CHECK-NEXT:    [[MUL_I56_1:%.*]] = mul i32 [[AND_I55_1]], 65535
-; CHECK-NEXT:    [[TMP194:%.*]] = lshr <2 x i32> [[TMP113]], <i32 15, i32 15>
+; CHECK-NEXT:    [[TMP194:%.*]] = lshr <2 x i32> [[TMP110]], <i32 15, i32 15>
 ; CHECK-NEXT:    [[TMP154:%.*]] = and <2 x i32> [[TMP194]], <i32 65537, i32 65537>
 ; CHECK-NEXT:    [[TMP195:%.*]] = mul <2 x i32> [[TMP154]], <i32 65535, i32 65535>
-; CHECK-NEXT:    [[ADD78:%.*]] = add i32 [[ADD48_1]], [[ADD48]]
-; CHECK-NEXT:    [[SUB86:%.*]] = sub i32 [[ADD48]], [[ADD48_1]]
+; CHECK-NEXT:    [[ADD78:%.*]] = add i32 [[ADD48_1]], [[ADD55]]
+; CHECK-NEXT:    [[SUB86:%.*]] = sub i32 [[ADD55]], [[ADD48_1]]
 ; CHECK-NEXT:    [[ADD103:%.*]] = add i32 [[ADD94]], [[ADD78]]
 ; CHECK-NEXT:    [[SUB104:%.*]] = sub i32 [[ADD78]], [[ADD94]]
 ; CHECK-NEXT:    [[ADD105:%.*]] = add i32 [[SUB102]], [[SUB86]]
@@ -260,37 +249,32 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[ADD_I52:%.*]] = add i32 [[MUL_I_1]], [[ADD105]]
 ; CHECK-NEXT:    [[XOR_I53:%.*]] = xor i32 [[ADD_I52]], [[TMP34]]
 ; CHECK-NEXT:    [[ADD_I57:%.*]] = add i32 [[MUL_I56]], [[SUB104]]
-; CHECK-NEXT:    [[XOR_I58:%.*]] = xor i32 [[ADD_I57]], [[TMP147]]
-; CHECK-NEXT:    [[ADD_I62:%.*]] = add i32 [[MUL_I61]], [[SUB106]]
-; CHECK-NEXT:    [[XOR_I63:%.*]] = xor i32 [[ADD_I62]], [[TMP145]]
+; CHECK-NEXT:    [[XOR_I58:%.*]] = xor i32 [[ADD_I57]], [[TMP142]]
+; CHECK-NEXT:    [[ADD_I62:%.*]] = add i32 [[MUL_I61_1]], [[SUB106]]
+; CHECK-NEXT:    [[XOR_I63:%.*]] = xor i32 [[ADD_I62]], [[TMP99]]
 ; CHECK-NEXT:    [[ADD110:%.*]] = add i32 [[XOR_I53]], [[XOR_I]]
 ; CHECK-NEXT:    [[ADD112:%.*]] = add i32 [[ADD110]], [[XOR_I58]]
 ; CHECK-NEXT:    [[ADD113:%.*]] = add i32 [[ADD112]], [[XOR_I63]]
-; CHECK-NEXT:    [[ADD78_1:%.*]] = add i32 [[ADD55_1]], [[ADD55]]
-; CHECK-NEXT:    [[SUB86_1:%.*]] = sub i32 [[ADD55]], [[ADD55_1]]
 ; CHECK-NEXT:    [[ADD103_1:%.*]] = add i32 [[ADD94_1]], [[ADD78_1]]
 ; CHECK-NEXT:    [[SUB104_1:%.*]] = sub i32 [[ADD78_1]], [[ADD94_1]]
-; CHECK-NEXT:    [[ADD105_1:%.*]] = add i32 [[SUB102_1]], [[SUB86_1]]
-; CHECK-NEXT:    [[SUB106_1:%.*]] = sub i32 [[SUB86_1]], [[SUB102_1]]
-; CHECK-NEXT:    [[ADD_I_1:%.*]] = add i32 [[MUL_I_2]], [[ADD103_1]]
+; CHECK-NEXT:    [[ADD103_2:%.*]] = add i32 [[ADD94_5]], [[ADD103_1]]
+; CHECK-NEXT:    [[SUB104_2:%.*]] = sub i32 [[ADD103_1]], [[ADD94_5]]
+; CHECK-NEXT:    [[ADD105_1:%.*]] = add i32 [[SUB102_1]], [[SUB104_1]]
+; CHECK-NEXT:    [[SUB106_1:%.*]] = sub i32 [[SUB104_1]], [[SUB102_1]]
+; CHECK-NEXT:    [[ADD_I_1:%.*]] = add i32 [[MUL_I_2]], [[ADD103_2]]
 ; CHECK-NEXT:    [[XOR_I_1:%.*]] = xor i32 [[ADD_I_1]], [[TMP83]]
 ; CHECK-NEXT:    [[ADD_I52_1:%.*]] = add i32 [[ADD94_2]], [[ADD105_1]]
 ; CHECK-NEXT:    [[XOR_I53_1:%.*]] = xor i32 [[ADD_I52_1]], [[TMP84]]
-; CHECK-NEXT:    [[ADD_I57_1:%.*]] = add i32 [[MUL_I56_1]], [[SUB104_1]]
+; CHECK-NEXT:    [[ADD_I57_1:%.*]] = add i32 [[MUL_I56_1]], [[SUB104_2]]
 ; CHECK-NEXT:    [[XOR_I58_1:%.*]] = xor i32 [[ADD_I57_1]], [[SUB47_1]]
-; CHECK-NEXT:    [[ADD_I62_1:%.*]] = add i32 [[MUL_I61_1]], [[SUB106_1]]
-; CHECK-NEXT:    [[XOR_I63_1:%.*]] = xor i32 [[ADD_I62_1]], [[TMP99]]
+; CHECK-NEXT:    [[ADD_I62_1:%.*]] = add i32 [[MUL_I61_4]], [[SUB106_1]]
+; CHECK-NEXT:    [[XOR_I63_1:%.*]] = xor i32 [[ADD_I62_1]], [[TMP157]]
 ; CHECK-NEXT:    [[ADD108_1:%.*]] = add i32 [[XOR_I53_1]], [[ADD113]]
 ; CHECK-NEXT:    [[ADD110_1:%.*]] = add i32 [[ADD108_1]], [[XOR_I_1]]
 ; CHECK-NEXT:    [[ADD112_1:%.*]] = add i32 [[ADD110_1]], [[XOR_I58_1]]
 ; CHECK-NEXT:    [[ADD113_1:%.*]] = add i32 [[ADD112_1]], [[XOR_I63_1]]
-; CHECK-NEXT:    [[TMP196:%.*]] = extractelement <2 x i32> [[TMP148]], i32 0
-; CHECK-NEXT:    [[TMP157:%.*]] = extractelement <2 x i32> [[TMP148]], i32 1
-; CHECK-NEXT:    [[ADD78_2:%.*]] = add i32 [[TMP196]], [[TMP157]]
-; CHECK-NEXT:    [[TMP158:%.*]] = insertelement <2 x i32> [[TMP148]], i32 [[SUB51_2]], i32 0
-; CHECK-NEXT:    [[TMP159:%.*]] = shufflevector <2 x i32> [[TMP148]], <2 x i32> poison, <2 x i32> <i32 poison, i32 0>
-; CHECK-NEXT:    [[TMP160:%.*]] = insertelement <2 x i32> [[TMP159]], i32 [[SUB51_3]], i32 0
-; CHECK-NEXT:    [[TMP161:%.*]] = sub <2 x i32> [[TMP158]], [[TMP160]]
+; CHECK-NEXT:    [[ADD78_2:%.*]] = add i32 [[SUB45_1]], [[SUB51]]
+; CHECK-NEXT:    [[TMP170:%.*]] = sub i32 [[SUB51]], [[SUB45_1]]
 ; CHECK-NEXT:    [[TMP162:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_2]], i32 0
 ; CHECK-NEXT:    [[TMP163:%.*]] = shufflevector <2 x i32> [[TMP162]], <2 x i32> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP164:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_4]], i32 0
@@ -298,29 +282,25 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[TMP166:%.*]] = add <2 x i32> [[TMP163]], [[TMP165]]
 ; CHECK-NEXT:    [[TMP167:%.*]] = sub <2 x i32> [[TMP163]], [[TMP165]]
 ; CHECK-NEXT:    [[TMP168:%.*]] = shufflevector <2 x i32> [[TMP166]], <2 x i32> [[TMP167]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP169:%.*]] = extractelement <2 x i32> [[TMP161]], i32 0
-; CHECK-NEXT:    [[TMP170:%.*]] = extractelement <2 x i32> [[TMP161]], i32 1
 ; CHECK-NEXT:    [[ADD105_2:%.*]] = add i32 [[TMP169]], [[TMP170]]
 ; CHECK-NEXT:    [[SUB106_2:%.*]] = sub i32 [[TMP170]], [[TMP169]]
 ; CHECK-NEXT:    [[ADD_I52_2:%.*]] = add i32 [[MUL_I51_2]], [[ADD105_2]]
 ; CHECK-NEXT:    [[XOR_I53_2:%.*]] = xor i32 [[ADD_I52_2]], [[CONV_1]]
 ; CHECK-NEXT:    [[TMP197:%.*]] = add <2 x i32> [[TMP195]], [[TMP168]]
-; CHECK-NEXT:    [[TMP172:%.*]] = xor <2 x i32> [[TMP197]], [[TMP113]]
-; CHECK-NEXT:    [[SHR_I59_2:%.*]] = lshr i32 [[TMP144]], 15
+; CHECK-NEXT:    [[TMP152:%.*]] = xor <2 x i32> [[TMP197]], [[TMP110]]
+; CHECK-NEXT:    [[SHR_I59_2:%.*]] = lshr i32 [[TMP111]], 15
 ; CHECK-NEXT:    [[AND_I60_2:%.*]] = and i32 [[SHR_I59_2]], 65537
 ; CHECK-NEXT:    [[MUL_I61_2:%.*]] = mul i32 [[AND_I60_2]], 65535
 ; CHECK-NEXT:    [[ADD_I62_2:%.*]] = add i32 [[MUL_I61_2]], [[SUB106_2]]
-; CHECK-NEXT:    [[XOR_I63_2:%.*]] = xor i32 [[ADD_I62_2]], [[TMP144]]
+; CHECK-NEXT:    [[XOR_I63_2:%.*]] = xor i32 [[ADD_I62_2]], [[TMP111]]
 ; CHECK-NEXT:    [[ADD108_2:%.*]] = add i32 [[XOR_I53_2]], [[ADD113_1]]
-; CHECK-NEXT:    [[TMP173:%.*]] = extractelement <2 x i32> [[TMP172]], i32 0
-; CHECK-NEXT:    [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[TMP173]]
-; CHECK-NEXT:    [[TMP174:%.*]] = extractelement <2 x i32> [[TMP172]], i32 1
-; CHECK-NEXT:    [[ADD112_2:%.*]] = add i32 [[ADD110_2]], [[TMP174]]
-; CHECK-NEXT:    [[ADD113_2:%.*]] = add i32 [[ADD112_2]], [[XOR_I63_2]]
 ; CHECK-NEXT:    [[TMP175:%.*]] = extractelement <2 x i32> [[TMP152]], i32 0
+; CHECK-NEXT:    [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[TMP175]]
 ; CHECK-NEXT:    [[TMP176:%.*]] = extractelement <2 x i32> [[TMP152]], i32 1
-; CHECK-NEXT:    [[ADD78_3:%.*]] = add i32 [[TMP175]], [[TMP176]]
-; CHECK-NEXT:    [[SUB86_3:%.*]] = sub i32 [[TMP176]], [[TMP175]]
+; CHECK-NEXT:    [[ADD112_2:%.*]] = add i32 [[ADD110_2]], [[TMP176]]
+; CHECK-NEXT:    [[ADD113_2:%.*]] = add i32 [[ADD112_2]], [[XOR_I63_2]]
+; CHECK-NEXT:    [[ADD78_3:%.*]] = add i32 [[SUB59_1]], [[SUB59]]
+; CHECK-NEXT:    [[SUB86_3:%.*]] = sub i32 [[SUB59]], [[SUB59_1]]
 ; CHECK-NEXT:    [[TMP177:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_3]], i32 0
 ; CHECK-NEXT:    [[TMP178:%.*]] = shufflevector <2 x i32> [[TMP177]], <2 x i32> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP179:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_3]], i32 0
@@ -358,9 +338,6 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; THR15-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr i8, ptr [[PIX1]], i64 4
 ; THR15-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr i8, ptr [[PIX2]], i64 4
 ; THR15-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr i8, ptr [[PIX1]], i64 1
-; THR15-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr i8, ptr [[PIX2]], i64 1
-; THR15-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr i8, ptr [[PIX1]], i64 5
-; THR15-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr i8, ptr [[PIX2]], i64 5
 ; THR15-NEXT:    [[ARRAYIDX32:%.*]] = getelementptr i8, ptr [[PIX1]], i64 3
 ; THR15-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX32]], align 1
 ; THR15-NEXT:    [[CONV33:%.*]] = zext i8 [[TMP1]] to i32
@@ -371,133 +348,116 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; THR15-NEXT:    [[ARRAYIDX3_1:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 4
 ; THR15-NEXT:    [[ARRAYIDX5_1:%.*]] = getelementptr i8, ptr [[ADD_PTR644]], i64 4
 ; THR15-NEXT:    [[ARRAYIDX8_1:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 1
-; THR15-NEXT:    [[ARRAYIDX22_1:%.*]] = getelementptr i8, ptr [[ADD_PTR644]], i64 2
-; THR15-NEXT:    [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX22_1]], align 1
-; THR15-NEXT:    [[ARRAYIDX25_1:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 6
-; THR15-NEXT:    [[TMP4:%.*]] = load i8, ptr [[ARRAYIDX25_1]], align 1
-; THR15-NEXT:    [[ARRAYIDX27_1:%.*]] = getelementptr i8, ptr [[ADD_PTR644]], i64 6
+; THR15-NEXT:    [[ARRAYIDX27_1:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 3
 ; THR15-NEXT:    [[TMP5:%.*]] = load i8, ptr [[ARRAYIDX27_1]], align 1
-; THR15-NEXT:    [[TMP6:%.*]] = insertelement <2 x i8> poison, i8 [[TMP4]], i32 0
-; THR15-NEXT:    [[TMP7:%.*]] = insertelement <2 x i8> [[TMP6]], i8 [[TMP5]], i32 1
-; THR15-NEXT:    [[TMP8:%.*]] = extractelement <2 x i8> [[TMP7]], i32 0
-; THR15-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP8]] to i32
-; THR15-NEXT:    [[ARRAYIDX32_1:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 3
-; THR15-NEXT:    [[ARRAYIDX34_1:%.*]] = getelementptr i8, ptr [[ADD_PTR644]], i64 3
-; THR15-NEXT:    [[TMP10:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX32_1]], i64 4, <2 x i1> <i1 true, i1 true>, i32 2)
-; THR15-NEXT:    [[TMP11:%.*]] = zext <2 x i8> [[TMP10]] to <2 x i16>
-; THR15-NEXT:    [[TMP12:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX34_1]], i64 4, <2 x i1> <i1 true, i1 true>, i32 2)
-; THR15-NEXT:    [[TMP13:%.*]] = zext <2 x i8> [[TMP12]] to <2 x i16>
-; THR15-NEXT:    [[TMP14:%.*]] = sub <2 x i16> [[TMP11]], [[TMP13]]
-; THR15-NEXT:    [[TMP15:%.*]] = extractelement <2 x i16> [[TMP14]], i32 1
-; THR15-NEXT:    [[TMP16:%.*]] = sext i16 [[TMP15]] to i32
-; THR15-NEXT:    [[SHL42_1:%.*]] = shl i32 [[TMP16]], 16
-; THR15-NEXT:    [[TMP17:%.*]] = extractelement <2 x i16> [[TMP14]], i32 0
-; THR15-NEXT:    [[TMP18:%.*]] = sext i16 [[TMP17]] to i32
-; THR15-NEXT:    [[ADD43_1:%.*]] = add i32 [[SHL42_1]], [[TMP18]]
+; THR15-NEXT:    [[CONV33_1:%.*]] = zext i8 [[TMP5]] to i32
 ; THR15-NEXT:    [[ADD_PTR_1:%.*]] = getelementptr i8, ptr [[ADD_PTR]], i64 [[IDX_EXT]]
 ; THR15-NEXT:    [[ADD_PTR64_1:%.*]] = getelementptr i8, ptr [[ADD_PTR64]], i64 [[IDX_EXT63]]
 ; THR15-NEXT:    [[ARRAYIDX3_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 4
 ; THR15-NEXT:    [[ARRAYIDX5_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 4
-; THR15-NEXT:    [[TMP19:%.*]] = load <2 x i8>, ptr [[ADD_PTR_1]], align 1
+; THR15-NEXT:    [[ARRAYIDX8_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 1
+; THR15-NEXT:    [[TMP4:%.*]] = load <4 x i8>, ptr [[ADD_PTR_1]], align 1
+; THR15-NEXT:    [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX8_2]], align 1
+; THR15-NEXT:    [[TMP6:%.*]] = load i8, ptr [[ADD_PTR_1]], align 1
+; THR15-NEXT:    [[TMP19:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
 ; THR15-NEXT:    [[TMP20:%.*]] = zext <2 x i8> [[TMP19]] to <2 x i32>
-; THR15-NEXT:    [[TMP21:%.*]] = load <2 x i8>, ptr [[ADD_PTR64_1]], align 1
+; THR15-NEXT:    [[TMP87:%.*]] = zext i8 [[TMP6]] to i32
+; THR15-NEXT:    [[TMP9:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_1]], align 1
+; THR15-NEXT:    [[TMP21:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
 ; THR15-NEXT:    [[TMP22:%.*]] = zext <2 x i8> [[TMP21]] to <2 x i32>
 ; THR15-NEXT:    [[TMP23:%.*]] = sub <2 x i32> [[TMP20]], [[TMP22]]
-; THR15-NEXT:    [[TMP24:%.*]] = load <2 x i8>, ptr [[ARRAYIDX3_2]], align 1
+; THR15-NEXT:    [[TMP13:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_2]], align 1
+; THR15-NEXT:    [[TMP24:%.*]] = shufflevector <4 x i8> [[TMP13]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
 ; THR15-NEXT:    [[TMP25:%.*]] = zext <2 x i8> [[TMP24]] to <2 x i32>
-; THR15-NEXT:    [[TMP26:%.*]] = load <2 x i8>, ptr [[ARRAYIDX5_2]], align 1
+; THR15-NEXT:    [[TMP16:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_2]], align 1
+; THR15-NEXT:    [[TMP26:%.*]] = shufflevector <4 x i8> [[TMP16]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
 ; THR15-NEXT:    [[TMP27:%.*]] = zext <2 x i8> [[TMP26]] to <2 x i32>
 ; THR15-NEXT:    [[TMP28:%.*]] = sub <2 x i32> [[TMP25]], [[TMP27]]
 ; THR15-NEXT:    [[TMP29:%.*]] = shl <2 x i32> [[TMP28]], <i32 16, i32 16>
-; THR15-NEXT:    [[TMP30:%.*]] = add <2 x i32> [[TMP29]], [[TMP23]]
-; THR15-NEXT:    [[ARRAYIDX20_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 2
-; THR15-NEXT:    [[ARRAYIDX22_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 2
-; THR15-NEXT:    [[ARRAYIDX25_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 6
-; THR15-NEXT:    [[ARRAYIDX27_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 6
-; THR15-NEXT:    [[TMP31:%.*]] = load <2 x i8>, ptr [[ARRAYIDX20_2]], align 1
+; THR15-NEXT:    [[TMP59:%.*]] = add <2 x i32> [[TMP29]], [[TMP23]]
+; THR15-NEXT:    [[TMP31:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
 ; THR15-NEXT:    [[TMP32:%.*]] = zext <2 x i8> [[TMP31]] to <2 x i32>
-; THR15-NEXT:    [[TMP33:%.*]] = load <2 x i8>, ptr [[ARRAYIDX22_2]], align 1
+; THR15-NEXT:    [[TMP86:%.*]] = zext i8 [[TMP7]] to i32
+; THR15-NEXT:    [[TMP33:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
 ; THR15-NEXT:    [[TMP34:%.*]] = zext <2 x i8> [[TMP33]] to <2 x i32>
 ; THR15-NEXT:    [[TMP35:%.*]] = sub <2 x i32> [[TMP32]], [[TMP34]]
-; THR15-NEXT:    [[TMP36:%.*]] = load <2 x i8>, ptr [[ARRAYIDX25_2]], align 1
+; THR15-NEXT:    [[TMP36:%.*]] = shufflevector <4 x i8> [[TMP13]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
 ; THR15-NEXT:    [[TMP37:%.*]] = zext <2 x i8> [[TMP36]] to <2 x i32>
-; THR15-NEXT:    [[TMP38:%.*]] = load <2 x i8>, ptr [[ARRAYIDX27_2]], align 1
+; THR15-NEXT:    [[TMP38:%.*]] = shufflevector <4 x i8> [[TMP16]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
 ; THR15-NEXT:    [[TMP39:%.*]] = zext <2 x i8> [[TMP38]] to <2 x i32>
 ; THR15-NEXT:    [[TMP40:%.*]] = sub <2 x i32> [[TMP37]], [[TMP39]]
 ; THR15-NEXT:    [[TMP41:%.*]] = shl <2 x i32> [[TMP40]], <i32 16, i32 16>
-; THR15-NEXT:    [[TMP42:%.*]] = add <2 x i32> [[TMP41]], [[TMP35]]
+; THR15-NEXT:    [[TMP76:%.*]] = add <2 x i32> [[TMP41]], [[TMP35]]
+; THR15-NEXT:    [[TMP30:%.*]] = add <2 x i32> [[TMP76]], [[TMP59]]
+; THR15-NEXT:    [[TMP42:%.*]] = sub <2 x i32> [[TMP59]], [[TMP76]]
 ; THR15-NEXT:    [[TMP43:%.*]] = extractelement <2 x i32> [[TMP30]], i32 0
 ; THR15-NEXT:    [[TMP44:%.*]] = extractelement <2 x i32> [[TMP30]], i32 1
 ; THR15-NEXT:    [[ADD44_2:%.*]] = add i32 [[TMP44]], [[TMP43]]
-; THR15-NEXT:    [[SUB45_2:%.*]] = sub i32 [[TMP43]], [[TMP44]]
 ; THR15-NEXT:    [[TMP45:%.*]] = extractelement <2 x i32> [[TMP42]], i32 0
 ; THR15-NEXT:    [[TMP46:%.*]] = extractelement <2 x i32> [[TMP42]], i32 1
 ; THR15-NEXT:    [[ADD46_2:%.*]] = add i32 [[TMP46]], [[TMP45]]
-; THR15-NEXT:    [[SUB47_2:%.*]] = sub i32 [[TMP45]], [[TMP46]]
-; THR15-NEXT:    [[ADD48_2:%.*]] = add i32 [[ADD46_2]], [[ADD44_2]]
-; THR15-NEXT:    [[ADD55_2:%.*]] = add i32 [[SUB47_2]], [[SUB45_2]]
-; THR15-NEXT:    [[ARRAYIDX3_3:%.*]] = getelementptr i8, ptr null, i64 4
 ; THR15-NEXT:    [[ARRAYIDX5_3:%.*]] = getelementptr i8, ptr null, i64 4
-; THR15-NEXT:    [[ARRAYIDX8_3:%.*]] = getelementptr i8, ptr null, i64 1
-; THR15-NEXT:    [[ARRAYIDX10_3:%.*]] = getelementptr i8, ptr null, i64 1
-; THR15-NEXT:    [[ARRAYIDX15_3:%.*]] = getelementptr i8, ptr null, i64 5
-; THR15-NEXT:    [[TMP47:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; THR15-NEXT:    [[TMP47:%.*]] = load <2 x i8>, ptr null, align 1
 ; THR15-NEXT:    [[TMP48:%.*]] = load i8, ptr null, align 1
 ; THR15-NEXT:    [[TMP49:%.*]] = zext <2 x i8> [[TMP47]] to <2 x i32>
 ; THR15-NEXT:    [[CONV_3:%.*]] = zext i8 [[TMP48]] to i32
-; THR15-NEXT:    [[TMP50:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; THR15-NEXT:    [[TMP50:%.*]] = load <2 x i8>, ptr null, align 1
 ; THR15-NEXT:    [[TMP51:%.*]] = zext <2 x i8> [[TMP50]] to <2 x i32>
 ; THR15-NEXT:    [[TMP52:%.*]] = sub <2 x i32> [[TMP49]], [[TMP51]]
-; THR15-NEXT:    [[TMP53:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX3_3]], i64 -4, <2 x i1> <i1 true, i1 true>, i32 2)
+; THR15-NEXT:    [[TMP53:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 4, <2 x i1> <i1 true, i1 true>, i32 2)
 ; THR15-NEXT:    [[TMP54:%.*]] = zext <2 x i8> [[TMP53]] to <2 x i32>
-; THR15-NEXT:    [[TMP55:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; THR15-NEXT:    [[TMP77:%.*]] = shufflevector <2 x i32> [[TMP54]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
+; THR15-NEXT:    [[TMP55:%.*]] = load <2 x i8>, ptr [[ARRAYIDX5_3]], align 1
 ; THR15-NEXT:    [[TMP56:%.*]] = zext <2 x i8> [[TMP55]] to <2 x i32>
-; THR15-NEXT:    [[TMP57:%.*]] = sub <2 x i32> [[TMP54]], [[TMP56]]
+; THR15-NEXT:    [[TMP57:%.*]] = sub <2 x i32> [[TMP77]], [[TMP56]]
 ; THR15-NEXT:    [[TMP58:%.*]] = shl <2 x i32> [[TMP57]], <i32 16, i32 16>
-; THR15-NEXT:    [[TMP59:%.*]] = add <2 x i32> [[TMP58]], [[TMP52]]
-; THR15-NEXT:    [[TMP60:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX8_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; THR15-NEXT:    [[TMP72:%.*]] = add <2 x i32> [[TMP58]], [[TMP52]]
+; THR15-NEXT:    [[ARRAYIDX20_3:%.*]] = getelementptr i8, ptr null, i64 2
+; THR15-NEXT:    [[ARRAYIDX22_3:%.*]] = getelementptr i8, ptr null, i64 2
+; THR15-NEXT:    [[ARRAYIDX27_3:%.*]] = getelementptr i8, ptr null, i64 6
+; THR15-NEXT:    [[TMP60:%.*]] = load <2 x i8>, ptr [[ARRAYIDX20_3]], align 1
 ; THR15-NEXT:    [[TMP61:%.*]] = zext <2 x i8> [[TMP60]] to <2 x i32>
-; THR15-NEXT:    [[TMP62:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX10_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; THR15-NEXT:    [[TMP62:%.*]] = load <2 x i8>, ptr [[ARRAYIDX22_3]], align 1
 ; THR15-NEXT:    [[TMP63:%.*]] = zext <2 x i8> [[TMP62]] to <2 x i32>
 ; THR15-NEXT:    [[TMP64:%.*]] = sub <2 x i32> [[TMP61]], [[TMP63]]
 ; THR15-NEXT:    [[TMP65:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> zeroinitializer, i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
 ; THR15-NEXT:    [[TMP66:%.*]] = zext <2 x i8> [[TMP65]] to <2 x i32>
-; THR15-NEXT:    [[TMP67:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX15_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; THR15-NEXT:    [[TMP67:%.*]] = load <2 x i8>, ptr [[ARRAYIDX27_3]], align 1
 ; THR15-NEXT:    [[TMP68:%.*]] = zext <2 x i8> [[TMP67]] to <2 x i32>
 ; THR15-NEXT:    [[TMP69:%.*]] = sub <2 x i32> [[TMP66]], [[TMP68]]
 ; THR15-NEXT:    [[TMP70:%.*]] = shl <2 x i32> [[TMP69]], <i32 16, i32 16>
-; THR15-NEXT:    [[TMP71:%.*]] = add <2 x i32> [[TMP70]], [[TMP64]]
-; THR15-NEXT:    [[TMP72:%.*]] = add <2 x i32> [[TMP71]], [[TMP59]]
-; THR15-NEXT:    [[TMP73:%.*]] = sub <2 x i32> [[TMP59]], [[TMP71]]
+; THR15-NEXT:    [[TMP73:%.*]] = add <2 x i32> [[TMP70]], [[TMP64]]
 ; THR15-NEXT:    [[TMP74:%.*]] = extractelement <2 x i32> [[TMP72]], i32 0
 ; THR15-NEXT:    [[TMP75:%.*]] = extractelement <2 x i32> [[TMP72]], i32 1
 ; THR15-NEXT:    [[ADD48_3:%.*]] = add i32 [[TMP75]], [[TMP74]]
-; THR15-NEXT:    [[TMP76:%.*]] = insertelement <2 x i32> [[TMP72]], i32 [[ADD44_2]], i32 1
-; THR15-NEXT:    [[TMP77:%.*]] = shufflevector <2 x i32> [[TMP72]], <2 x i32> poison, <2 x i32> <i32 1, i32 poison>
-; THR15-NEXT:    [[TMP78:%.*]] = insertelement <2 x i32> [[TMP77]], i32 [[ADD46_2]], i32 1
-; THR15-NEXT:    [[TMP79:%.*]] = sub <2 x i32> [[TMP76]], [[TMP78]]
+; THR15-NEXT:    [[SUB45_3:%.*]] = sub i32 [[TMP74]], [[TMP75]]
 ; THR15-NEXT:    [[TMP80:%.*]] = extractelement <2 x i32> [[TMP73]], i32 0
 ; THR15-NEXT:    [[TMP81:%.*]] = extractelement <2 x i32> [[TMP73]], i32 1
 ; THR15-NEXT:    [[ADD55_3:%.*]] = add i32 [[TMP81]], [[TMP80]]
-; THR15-NEXT:    [[TMP82:%.*]] = insertelement <2 x i32> [[TMP73]], i32 [[SUB45_2]], i32 1
-; THR15-NEXT:    [[TMP83:%.*]] = shufflevector <2 x i32> [[TMP73]], <2 x i32> poison, <2 x i32> <i32 1, i32 poison>
-; THR15-NEXT:    [[TMP84:%.*]] = insertelement <2 x i32> [[TMP83]], i32 [[SUB47_2]], i32 1
+; THR15-NEXT:    [[SUB47_3:%.*]] = sub i32 [[TMP80]], [[TMP81]]
+; THR15-NEXT:    [[ADD48_4:%.*]] = add i32 [[ADD55_3]], [[ADD48_3]]
+; THR15-NEXT:    [[TMP78:%.*]] = shufflevector <2 x i32> [[TMP30]], <2 x i32> poison, <2 x i32> <i32 poison, i32 0>
+; THR15-NEXT:    [[TMP71:%.*]] = insertelement <2 x i32> [[TMP78]], i32 [[ADD48_3]], i32 0
+; THR15-NEXT:    [[TMP83:%.*]] = insertelement <2 x i32> [[TMP30]], i32 [[ADD55_3]], i32 0
+; THR15-NEXT:    [[TMP79:%.*]] = sub <2 x i32> [[TMP71]], [[TMP83]]
+; THR15-NEXT:    [[ADD55_4:%.*]] = add i32 [[SUB47_3]], [[SUB45_3]]
+; THR15-NEXT:    [[TMP137:%.*]] = shufflevector <2 x i32> [[TMP42]], <2 x i32> poison, <2 x i32> <i32 poison, i32 0>
+; THR15-NEXT:    [[TMP82:%.*]] = insertelement <2 x i32> [[TMP137]], i32 [[SUB45_3]], i32 0
+; THR15-NEXT:    [[TMP84:%.*]] = insertelement <2 x i32> [[TMP42]], i32 [[SUB47_3]], i32 0
 ; THR15-NEXT:    [[TMP85:%.*]] = sub <2 x i32> [[TMP82]], [[TMP84]]
-; THR15-NEXT:    [[ADD94:%.*]] = add i32 [[ADD48_3]], [[ADD48_2]]
-; THR15-NEXT:    [[SUB102:%.*]] = sub i32 [[ADD48_2]], [[ADD48_3]]
+; THR15-NEXT:    [[ADD94:%.*]] = add i32 [[ADD48_4]], [[ADD44_2]]
+; THR15-NEXT:    [[SUB102:%.*]] = sub i32 [[ADD44_2]], [[ADD48_4]]
 ; THR15-NEXT:    [[SHR_I:%.*]] = lshr i32 [[CONV_3]], 15
 ; THR15-NEXT:    [[AND_I:%.*]] = and i32 [[SHR_I]], 65537
 ; THR15-NEXT:    [[MUL_I:%.*]] = mul i32 [[AND_I]], 65535
-; THR15-NEXT:    [[SHR_I49:%.*]] = lshr i32 [[ADD46_2]], 15
+; THR15-NEXT:    [[SHR_I49:%.*]] = lshr i32 [[TMP44]], 15
 ; THR15-NEXT:    [[AND_I50:%.*]] = and i32 [[SHR_I49]], 65537
 ; THR15-NEXT:    [[MUL_I51:%.*]] = mul i32 [[AND_I50]], 65535
-; THR15-NEXT:    [[ADD94_1:%.*]] = add i32 [[ADD55_3]], [[ADD55_2]]
-; THR15-NEXT:    [[SUB102_1:%.*]] = sub i32 [[ADD55_2]], [[ADD55_3]]
-; THR15-NEXT:    [[TMP86:%.*]] = extractelement <2 x i32> [[TMP20]], i32 1
+; THR15-NEXT:    [[ADD94_1:%.*]] = add i32 [[ADD55_4]], [[ADD46_2]]
+; THR15-NEXT:    [[SUB102_1:%.*]] = sub i32 [[ADD46_2]], [[ADD55_4]]
 ; THR15-NEXT:    [[SHR_I_1:%.*]] = lshr i32 [[TMP86]], 15
 ; THR15-NEXT:    [[AND_I_1:%.*]] = and i32 [[SHR_I_1]], 65537
 ; THR15-NEXT:    [[MUL_I_1:%.*]] = mul i32 [[AND_I_1]], 65535
-; THR15-NEXT:    [[TMP87:%.*]] = extractelement <2 x i32> [[TMP20]], i32 0
 ; THR15-NEXT:    [[SHR_I49_1:%.*]] = lshr i32 [[TMP87]], 15
 ; THR15-NEXT:    [[AND_I50_1:%.*]] = and i32 [[SHR_I49_1]], 65537
 ; THR15-NEXT:    [[MUL_I51_1:%.*]] = mul i32 [[AND_I50_1]], 65535
@@ -517,19 +477,22 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; THR15-NEXT:    [[MUL_I51_3:%.*]] = mul i32 [[AND_I50_3]], 65535
 ; THR15-NEXT:    [[TMP92:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8]], align 1
 ; THR15-NEXT:    [[TMP93:%.*]] = zext <2 x i8> [[TMP92]] to <2 x i32>
-; THR15-NEXT:    [[TMP94:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[PIX2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; THR15-NEXT:    [[TMP143:%.*]] = load <4 x i8>, ptr [[PIX2]], align 1
+; THR15-NEXT:    [[TMP94:%.*]] = shufflevector <4 x i8> [[TMP143]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
 ; THR15-NEXT:    [[TMP95:%.*]] = zext <2 x i8> [[TMP94]] to <2 x i32>
-; THR15-NEXT:    [[TMP96:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; THR15-NEXT:    [[TMP146:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3]], align 1
+; THR15-NEXT:    [[TMP96:%.*]] = shufflevector <4 x i8> [[TMP146]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
 ; THR15-NEXT:    [[TMP97:%.*]] = zext <2 x i8> [[TMP96]] to <2 x i32>
-; THR15-NEXT:    [[TMP98:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; THR15-NEXT:    [[TMP147:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1
+; THR15-NEXT:    [[TMP98:%.*]] = shufflevector <4 x i8> [[TMP147]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
 ; THR15-NEXT:    [[TMP99:%.*]] = zext <2 x i8> [[TMP98]] to <2 x i32>
 ; THR15-NEXT:    [[TMP100:%.*]] = sub <2 x i32> [[TMP97]], [[TMP99]]
 ; THR15-NEXT:    [[TMP101:%.*]] = shl <2 x i32> [[TMP100]], <i32 16, i32 16>
-; THR15-NEXT:    [[TMP102:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX10]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; THR15-NEXT:    [[TMP102:%.*]] = shufflevector <4 x i8> [[TMP143]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
 ; THR15-NEXT:    [[TMP103:%.*]] = zext <2 x i8> [[TMP102]] to <2 x i32>
-; THR15-NEXT:    [[TMP104:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX13]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; THR15-NEXT:    [[TMP104:%.*]] = shufflevector <4 x i8> [[TMP146]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
 ; THR15-NEXT:    [[TMP105:%.*]] = zext <2 x i8> [[TMP104]] to <2 x i32>
-; THR15-NEXT:    [[TMP106:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX15]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; THR15-NEXT:    [[TMP106:%.*]] = shufflevector <4 x i8> [[TMP147]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
 ; THR15-NEXT:    [[TMP107:%.*]] = zext <2 x i8> [[TMP106]] to <2 x i32>
 ; THR15-NEXT:    [[TMP108:%.*]] = sub <2 x i32> [[TMP105]], [[TMP107]]
 ; THR15-NEXT:    [[TMP109:%.*]] = shl <2 x i32> [[TMP108]], <i32 16, i32 16>
@@ -549,6 +512,7 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; THR15-NEXT:    [[TMP121:%.*]] = extractelement <2 x i32> [[TMP118]], i32 0
 ; THR15-NEXT:    [[TMP122:%.*]] = extractelement <2 x i32> [[TMP118]], i32 1
 ; THR15-NEXT:    [[ADD55:%.*]] = add i32 [[TMP122]], [[TMP121]]
+; THR15-NEXT:    [[SUB59:%.*]] = sub i32 [[TMP121]], [[TMP122]]
 ; THR15-NEXT:    [[SHR_I59:%.*]] = lshr i32 [[TMP120]], 15
 ; THR15-NEXT:    [[AND_I60:%.*]] = and i32 [[SHR_I59]], 65537
 ; THR15-NEXT:    [[MUL_I61:%.*]] = mul i32 [[AND_I60]], 65535
@@ -557,55 +521,52 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; THR15-NEXT:    [[MUL_I61_1:%.*]] = mul i32 [[AND_I60_1]], 65535
 ; THR15-NEXT:    [[TMP123:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8_1]], align 1
 ; THR15-NEXT:    [[TMP124:%.*]] = zext <2 x i8> [[TMP123]] to <2 x i32>
-; THR15-NEXT:    [[TMP125:%.*]] = load <2 x i8>, ptr [[ADD_PTR644]], align 1
+; THR15-NEXT:    [[TMP148:%.*]] = load <4 x i8>, ptr [[ADD_PTR644]], align 1
+; THR15-NEXT:    [[TMP125:%.*]] = shufflevector <4 x i8> [[TMP148]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
 ; THR15-NEXT:    [[TMP126:%.*]] = zext <2 x i8> [[TMP125]] to <2 x i32>
-; THR15-NEXT:    [[TMP127:%.*]] = load <2 x i8>, ptr [[ARRAYIDX3_1]], align 1
+; THR15-NEXT:    [[TMP152:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1
+; THR15-NEXT:    [[TMP127:%.*]] = shufflevector <4 x i8> [[TMP152]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
 ; THR15-NEXT:    [[TMP128:%.*]] = zext <2 x i8> [[TMP127]] to <2 x i32>
-; THR15-NEXT:    [[TMP129:%.*]] = load <2 x i8>, ptr [[ARRAYIDX5_1]], align 1
+; THR15-NEXT:    [[TMP153:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1
+; THR15-NEXT:    [[TMP129:%.*]] = shufflevector <4 x i8> [[TMP153]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
 ; THR15-NEXT:    [[TMP130:%.*]] = zext <2 x i8> [[TMP129]] to <2 x i32>
 ; THR15-NEXT:    [[TMP131:%.*]] = sub <2 x i32> [[TMP128]], [[TMP130]]
 ; THR15-NEXT:    [[TMP132:%.*]] = shl <2 x i32> [[TMP131]], <i32 16, i32 16>
-; THR15-NEXT:    [[TMP133:%.*]] = shufflevector <2 x i32> [[TMP124]], <2 x i32> poison, <2 x i32> <i32 poison, i32 0>
-; THR15-NEXT:    [[TMP134:%.*]] = insertelement <2 x i32> [[TMP133]], i32 [[CONV_1]], i32 0
-; THR15-NEXT:    [[TMP135:%.*]] = sub <2 x i32> [[TMP134]], [[TMP126]]
-; THR15-NEXT:    [[TMP136:%.*]] = add <2 x i32> [[TMP132]], [[TMP135]]
-; THR15-NEXT:    [[TMP137:%.*]] = shufflevector <2 x i8> [[TMP7]], <2 x i8> poison, <2 x i32> <i32 1, i32 poison>
-; THR15-NEXT:    [[TMP138:%.*]] = insertelement <2 x i8> [[TMP137]], i8 [[TMP3]], i32 1
+; THR15-NEXT:    [[TMP138:%.*]] = shufflevector <4 x i8> [[TMP148]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
 ; THR15-NEXT:    [[TMP139:%.*]] = zext <2 x i8> [[TMP138]] to <2 x i32>
-; THR15-NEXT:    [[TMP140:%.*]] = insertelement <2 x i32> [[TMP124]], i32 [[TMP9]], i32 0
+; THR15-NEXT:    [[TMP154:%.*]] = shufflevector <4 x i8> [[TMP152]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
+; THR15-NEXT:    [[TMP155:%.*]] = zext <2 x i8> [[TMP154]] to <2 x i32>
+; THR15-NEXT:    [[TMP133:%.*]] = shufflevector <4 x i8> [[TMP153]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
+; THR15-NEXT:    [[TMP134:%.*]] = zext <2 x i8> [[TMP133]] to <2 x i32>
+; THR15-NEXT:    [[TMP135:%.*]] = sub <2 x i32> [[TMP155]], [[TMP134]]
+; THR15-NEXT:    [[TMP170:%.*]] = shl <2 x i32> [[TMP135]], <i32 16, i32 16>
+; THR15-NEXT:    [[TMP140:%.*]] = insertelement <2 x i32> [[TMP124]], i32 [[CONV33_1]], i32 1
 ; THR15-NEXT:    [[TMP141:%.*]] = sub <2 x i32> [[TMP140]], [[TMP139]]
-; THR15-NEXT:    [[TMP142:%.*]] = extractelement <2 x i32> [[TMP141]], i32 0
-; THR15-NEXT:    [[SHL30_1:%.*]] = shl i32 [[TMP142]], 16
-; THR15-NEXT:    [[TMP143:%.*]] = extractelement <2 x i32> [[TMP141]], i32 1
-; THR15-NEXT:    [[ADD31_1:%.*]] = add i32 [[SHL30_1]], [[TMP143]]
+; THR15-NEXT:    [[TMP171:%.*]] = add <2 x i32> [[TMP170]], [[TMP141]]
+; THR15-NEXT:    [[TMP186:%.*]] = insertelement <2 x i32> [[TMP124]], i32 [[CONV_1]], i32 0
+; THR15-NEXT:    [[TMP187:%.*]] = sub <2 x i32> [[TMP186]], [[TMP126]]
+; THR15-NEXT:    [[TMP142:%.*]] = add <2 x i32> [[TMP132]], [[TMP187]]
+; THR15-NEXT:    [[TMP136:%.*]] = add <2 x i32> [[TMP171]], [[TMP142]]
+; THR15-NEXT:    [[TMP149:%.*]] = sub <2 x i32> [[TMP142]], [[TMP171]]
 ; THR15-NEXT:    [[TMP144:%.*]] = extractelement <2 x i32> [[TMP136]], i32 0
 ; THR15-NEXT:    [[TMP145:%.*]] = extractelement <2 x i32> [[TMP136]], i32 1
+; THR15-NEXT:    [[ADD48_2:%.*]] = add i32 [[TMP145]], [[TMP144]]
 ; THR15-NEXT:    [[SUB45_1:%.*]] = sub i32 [[TMP144]], [[TMP145]]
-; THR15-NEXT:    [[TMP146:%.*]] = shufflevector <2 x i32> [[TMP136]], <2 x i32> poison, <2 x i32> <i32 1, i32 poison>
-; THR15-NEXT:    [[TMP147:%.*]] = insertelement <2 x i32> [[TMP146]], i32 [[ADD43_1]], i32 1
-; THR15-NEXT:    [[TMP148:%.*]] = insertelement <2 x i32> [[TMP136]], i32 [[ADD31_1]], i32 1
-; THR15-NEXT:    [[TMP149:%.*]] = add <2 x i32> [[TMP147]], [[TMP148]]
-; THR15-NEXT:    [[SUB47_1:%.*]] = sub i32 [[ADD31_1]], [[ADD43_1]]
 ; THR15-NEXT:    [[TMP150:%.*]] = extractelement <2 x i32> [[TMP149]], i32 0
 ; THR15-NEXT:    [[TMP151:%.*]] = extractelement <2 x i32> [[TMP149]], i32 1
 ; THR15-NEXT:    [[ADD48_1:%.*]] = add i32 [[TMP151]], [[TMP150]]
 ; THR15-NEXT:    [[SUB51_1:%.*]] = sub i32 [[TMP150]], [[TMP151]]
-; THR15-NEXT:    [[ADD55_1:%.*]] = add i32 [[SUB47_1]], [[SUB45_1]]
-; THR15-NEXT:    [[TMP152:%.*]] = shufflevector <2 x i32> [[TMP118]], <2 x i32> poison, <2 x i32> <i32 poison, i32 0>
-; THR15-NEXT:    [[TMP153:%.*]] = insertelement <2 x i32> [[TMP152]], i32 [[SUB45_1]], i32 0
-; THR15-NEXT:    [[TMP154:%.*]] = insertelement <2 x i32> [[TMP118]], i32 [[SUB47_1]], i32 0
-; THR15-NEXT:    [[TMP155:%.*]] = sub <2 x i32> [[TMP153]], [[TMP154]]
-; THR15-NEXT:    [[SHR_I54:%.*]] = lshr i32 [[TMP151]], 15
+; THR15-NEXT:    [[SHR_I54:%.*]] = lshr i32 [[TMP145]], 15
 ; THR15-NEXT:    [[AND_I55:%.*]] = and i32 [[SHR_I54]], 65537
 ; THR15-NEXT:    [[MUL_I56:%.*]] = mul i32 [[AND_I55]], 65535
-; THR15-NEXT:    [[SHR_I54_1:%.*]] = lshr i32 [[SUB47_1]], 15
+; THR15-NEXT:    [[SHR_I54_1:%.*]] = lshr i32 [[TMP151]], 15
 ; THR15-NEXT:    [[AND_I55_1:%.*]] = and i32 [[SHR_I54_1]], 65537
 ; THR15-NEXT:    [[MUL_I56_1:%.*]] = mul i32 [[AND_I55_1]], 65535
 ; THR15-NEXT:    [[TMP156:%.*]] = lshr <2 x i32> [[TMP124]], <i32 15, i32 15>
 ; THR15-NEXT:    [[TMP157:%.*]] = and <2 x i32> [[TMP156]], <i32 65537, i32 65537>
 ; THR15-NEXT:    [[TMP158:%.*]] = mul <2 x i32> [[TMP157]], <i32 65535, i32 65535>
-; THR15-NEXT:    [[ADD78:%.*]] = add i32 [[ADD48_1]], [[ADD48]]
-; THR15-NEXT:    [[SUB86:%.*]] = sub i32 [[ADD48]], [[ADD48_1]]
+; THR15-NEXT:    [[ADD78:%.*]] = add i32 [[ADD48_2]], [[ADD48]]
+; THR15-NEXT:    [[SUB86:%.*]] = sub i32 [[ADD48]], [[ADD48_2]]
 ; THR15-NEXT:    [[ADD103:%.*]] = add i32 [[ADD94]], [[ADD78]]
 ; THR15-NEXT:    [[SUB104:%.*]] = sub i32 [[ADD78]], [[ADD94]]
 ; THR15-NEXT:    [[ADD105:%.*]] = add i32 [[SUB102]], [[SUB86]]
@@ -613,16 +574,16 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; THR15-NEXT:    [[ADD_I:%.*]] = add i32 [[MUL_I]], [[ADD103]]
 ; THR15-NEXT:    [[XOR_I:%.*]] = xor i32 [[ADD_I]], [[CONV_3]]
 ; THR15-NEXT:    [[ADD_I52:%.*]] = add i32 [[MUL_I51]], [[ADD105]]
-; THR15-NEXT:    [[XOR_I53:%.*]] = xor i32 [[ADD_I52]], [[ADD46_2]]
+; THR15-NEXT:    [[XOR_I53:%.*]] = xor i32 [[ADD_I52]], [[TMP44]]
 ; THR15-NEXT:    [[ADD_I57:%.*]] = add i32 [[MUL_I56]], [[SUB104]]
-; THR15-NEXT:    [[XOR_I58:%.*]] = xor i32 [[ADD_I57]], [[TMP151]]
+; THR15-NEXT:    [[XOR_I58:%.*]] = xor i32 [[ADD_I57]], [[TMP145]]
 ; THR15-NEXT:    [[ADD_I62:%.*]] = add i32 [[MUL_I61]], [[SUB106]]
 ; THR15-NEXT:    [[XOR_I63:%.*]] = xor i32 [[ADD_I62]], [[TMP120]]
 ; THR15-NEXT:    [[ADD110:%.*]] = add i32 [[XOR_I53]], [[XOR_I]]
 ; THR15-NEXT:    [[ADD112:%.*]] = add i32 [[ADD110]], [[XOR_I58]]
 ; THR15-NEXT:    [[ADD113:%.*]] = add i32 [[ADD112]], [[XOR_I63]]
-; THR15-NEXT:    [[ADD78_1:%.*]] = add i32 [[ADD55_1]], [[ADD55]]
-; THR15-NEXT:    [[SUB86_1:%.*]] = sub i32 [[ADD55]], [[ADD55_1]]
+; THR15-NEXT:    [[ADD78_1:%.*]] = add i32 [[ADD48_1]], [[ADD55]]
+; THR15-NEXT:    [[SUB86_1:%.*]] = sub i32 [[ADD55]], [[ADD48_1]]
 ; THR15-NEXT:    [[ADD103_1:%.*]] = add i32 [[ADD94_1]], [[ADD78_1]]
 ; THR15-NEXT:    [[SUB104_1:%.*]] = sub i32 [[ADD78_1]], [[ADD94_1]]
 ; THR15-NEXT:    [[ADD105_1:%.*]] = add i32 [[SUB102_1]], [[SUB86_1]]
@@ -632,15 +593,15 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; THR15-NEXT:    [[ADD_I52_1:%.*]] = add i32 [[MUL_I51_1]], [[ADD105_1]]
 ; THR15-NEXT:    [[XOR_I53_1:%.*]] = xor i32 [[ADD_I52_1]], [[TMP87]]
 ; THR15-NEXT:    [[ADD_I57_1:%.*]] = add i32 [[MUL_I56_1]], [[SUB104_1]]
-; THR15-NEXT:    [[XOR_I58_1:%.*]] = xor i32 [[ADD_I57_1]], [[SUB47_1]]
+; THR15-NEXT:    [[XOR_I58_1:%.*]] = xor i32 [[ADD_I57_1]], [[TMP151]]
 ; THR15-NEXT:    [[ADD_I62_1:%.*]] = add i32 [[MUL_I61_1]], [[SUB106_1]]
 ; THR15-NEXT:    [[XOR_I63_1:%.*]] = xor i32 [[ADD_I62_1]], [[TMP122]]
 ; THR15-NEXT:    [[ADD108_1:%.*]] = add i32 [[XOR_I53_1]], [[ADD113]]
 ; THR15-NEXT:    [[ADD110_1:%.*]] = add i32 [[ADD108_1]], [[XOR_I_1]]
 ; THR15-NEXT:    [[ADD112_1:%.*]] = add i32 [[ADD110_1]], [[XOR_I58_1]]
 ; THR15-NEXT:    [[ADD113_1:%.*]] = add i32 [[ADD112_1]], [[XOR_I63_1]]
-; THR15-NEXT:    [[ADD78_2:%.*]] = add i32 [[SUB51_1]], [[SUB51]]
-; THR15-NEXT:    [[SUB86_2:%.*]] = sub i32 [[SUB51]], [[SUB51_1]]
+; THR15-NEXT:    [[ADD78_2:%.*]] = add i32 [[SUB45_1]], [[SUB51]]
+; THR15-NEXT:    [[SUB86_2:%.*]] = sub i32 [[SUB51]], [[SUB45_1]]
 ; THR15-NEXT:    [[TMP159:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_2]], i32 0
 ; THR15-NEXT:    [[TMP160:%.*]] = shufflevector <2 x i32> [[TMP159]], <2 x i32> poison, <2 x i32> zeroinitializer
 ; THR15-NEXT:    [[TMP161:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_2]], i32 0
@@ -665,10 +626,8 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; THR15-NEXT:    [[TMP169:%.*]] = extractelement <2 x i32> [[TMP167]], i32 1
 ; THR15-NEXT:    [[ADD112_2:%.*]] = add i32 [[ADD110_2]], [[TMP169]]
 ; THR15-NEXT:    [[ADD113_2:%.*]] = add i32 [[ADD112_2]], [[XOR_I63_2]]
-; THR15-NEXT:    [[TMP170:%.*]] = extractelement <2 x i32> [[TMP155]], i32 0
-; THR15-NEXT:    [[TMP171:%.*]] = extractelement <2 x i32> [[TMP155]], i32 1
-; THR15-NEXT:    [[ADD78_3:%.*]] = add i32 [[TMP170]], [[TMP171]]
-; THR15-NEXT:    [[SUB86_3:%.*]] = sub i32 [[TMP171]], [[TMP170]]
+; THR15-NEXT:    [[ADD78_3:%.*]] = add i32 [[SUB51_1]], [[SUB59]]
+; THR15-NEXT:    [[SUB86_3:%.*]] = sub i32 [[SUB59]], [[SUB51_1]]
 ; THR15-NEXT:    [[TMP172:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_3]], i32 0
 ; THR15-NEXT:    [[TMP173:%.*]] = shufflevector <2 x i32> [[TMP172]], <2 x i32> poison, <2 x i32> zeroinitializer
 ; THR15-NEXT:    [[TMP174:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_3]], i32 0
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/segmented-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/segmented-loads.ll
index 54eb564768318b..ce26bd3b89392d 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/segmented-loads.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/segmented-loads.ll
@@ -6,8 +6,9 @@
 
 define void @test() {
 ; CHECK-LABEL: @test(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x double> @llvm.experimental.vp.strided.load.v4f64.p0.i64(ptr align 8 @src, i64 16, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 4)
-; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x double> @llvm.experimental.vp.strided.load.v4f64.p0.i64(ptr align 8 getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 1), i64 16, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 4)
+; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x double>, ptr @src, align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x double> [[TMP4]], <8 x double> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x double> [[TMP4]], <8 x double> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
 ; CHECK-NEXT:    [[TMP3:%.*]] = fsub fast <4 x double> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT:    store <4 x double> [[TMP3]], ptr @dst, align 8
 ; CHECK-NEXT:    ret void



More information about the llvm-commits mailing list