[llvm] 6a3a5ca - Revert "[SLP]Add support for strided loads."

Alexey Bataev via llvm-commits llvm-commits at lists.llvm.org
Mon Feb 12 08:47:40 PST 2024


Author: Alexey Bataev
Date: 2024-02-12T08:47:28-08:00
New Revision: 6a3a5cad2e1249f1b685546ebe71b2ead9a27541

URL: https://github.com/llvm/llvm-project/commit/6a3a5cad2e1249f1b685546ebe71b2ead9a27541
DIFF: https://github.com/llvm/llvm-project/commit/6a3a5cad2e1249f1b685546ebe71b2ead9a27541.diff

LOG: Revert "[SLP]Add support for strided loads."

This reverts commit 0940f9083e68bda78bcbb323c2968a4294092e21 to fix
issues reported in https://github.com/llvm/llvm-project/pull/80310.

Added: 
    

Modified: 
    llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
    llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
    llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-vectorized.ll
    llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-with-external-use-ptr.ll
    llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads.ll
    llvm/test/Transforms/SLPVectorizer/X86/gep-nodes-with-non-gep-inst.ll
    llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index c94fb71ab220ba..c0b7298f78005d 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -175,15 +175,6 @@ static cl::opt<int> RootLookAheadMaxDepth(
     "slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden,
     cl::desc("The maximum look-ahead depth for searching best rooting option"));
 
-static cl::opt<unsigned> MinProfitableStridedLoads(
-    "slp-min-strided-loads", cl::init(2), cl::Hidden,
-    cl::desc("The minimum number of loads, which should be considered strided, "
-             "if the stride is > 1 or is runtime value"));
-
-static cl::opt<unsigned> MaxProfitableLoadStride(
-    "slp-max-stride", cl::init(8), cl::Hidden,
-    cl::desc("The maximum stride, considered to be profitable."));
-
 static cl::opt<bool>
     ViewSLPTree("view-slp-tree", cl::Hidden,
                 cl::desc("Display the SLP trees with Graphviz"));
@@ -2584,7 +2575,7 @@ class BoUpSLP {
     enum EntryState {
       Vectorize,
       ScatterVectorize,
-      StridedVectorize,
+      PossibleStridedVectorize,
       NeedToGather
     };
     EntryState State;
@@ -2762,8 +2753,8 @@ class BoUpSLP {
       case ScatterVectorize:
         dbgs() << "ScatterVectorize\n";
         break;
-      case StridedVectorize:
-        dbgs() << "StridedVectorize\n";
+      case PossibleStridedVectorize:
+        dbgs() << "PossibleStridedVectorize\n";
         break;
       case NeedToGather:
         dbgs() << "NeedToGather\n";
@@ -3689,7 +3680,7 @@ template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
     if (Entry->State == TreeEntry::NeedToGather)
       return "color=red";
     if (Entry->State == TreeEntry::ScatterVectorize ||
-        Entry->State == TreeEntry::StridedVectorize)
+        Entry->State == TreeEntry::PossibleStridedVectorize)
       return "color=blue";
     return "";
   }
@@ -3851,7 +3842,12 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
 
 namespace {
 /// Tracks the state we can represent the loads in the given sequence.
-enum class LoadsState { Gather, Vectorize, ScatterVectorize, StridedVectorize };
+enum class LoadsState {
+  Gather,
+  Vectorize,
+  ScatterVectorize,
+  PossibleStridedVectorize
+};
 } // anonymous namespace
 
 static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
@@ -3882,14 +3878,6 @@ static Align computeCommonAlignment(ArrayRef<Value *> VL) {
   return CommonAlignment;
 }
 
-/// Check if \p Order represents reverse order.
-static bool isReverseOrder(ArrayRef<unsigned> Order) {
-  unsigned Sz = Order.size();
-  return !Order.empty() && all_of(enumerate(Order), [&](const auto &Pair) {
-    return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
-  });
-}
-
 /// Checks if the given array of loads can be represented as a vectorized,
 /// scatter or just simple gather.
 static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
@@ -3912,8 +3900,7 @@ static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
   // Make sure all loads in the bundle are simple - we can't vectorize
   // atomic or volatile loads.
   PointerOps.clear();
-  const unsigned Sz = VL.size();
-  PointerOps.resize(Sz);
+  PointerOps.resize(VL.size());
   auto *POIter = PointerOps.begin();
   for (Value *V : VL) {
     auto *L = cast<LoadInst>(V);
@@ -3924,12 +3911,12 @@ static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
   }
 
   Order.clear();
-  auto *VecTy = FixedVectorType::get(ScalarTy, Sz);
   // Check the order of pointer operands or that all pointers are the same.
   bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, DL, SE, Order);
   if (IsSorted || all_of(PointerOps, [&](Value *P) {
         return arePointersCompatible(P, PointerOps.front(), TLI);
       })) {
+    bool IsPossibleStrided = false;
     if (IsSorted) {
       Value *Ptr0;
       Value *PtrN;
@@ -3943,71 +3930,30 @@ static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
       std::optional<int> Diff =
           getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, DL, SE);
       // Check that the sorted loads are consecutive.
-      if (static_cast<unsigned>(*Diff) == Sz - 1)
+      if (static_cast<unsigned>(*Diff) == VL.size() - 1)
         return LoadsState::Vectorize;
       // Simple check if not a strided access - clear order.
-      bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
-      // Try to generate strided load node if:
-      // 1. Target with strided load support is detected.
-      // 2. The number of loads is greater than MinProfitableStridedLoads,
-      // or the potential stride <= MaxProfitableLoadStride and the
-      // potential stride is power-of-2 (to avoid perf regressions for the very
-      // small number of loads) and max distance > number of loads, or potential
-      // stride is -1.
-      // 3. The loads are ordered, or number of unordered loads <=
-      // MaxProfitableUnorderedLoads, or loads are in reversed order.
-      // (this check is to avoid extra costs for very expensive shuffles).
-      if (IsPossibleStrided && (((Sz > MinProfitableStridedLoads ||
-                                  (static_cast<unsigned>(std::abs(*Diff)) <=
-                                       MaxProfitableLoadStride * Sz &&
-                                   isPowerOf2_32(std::abs(*Diff)))) &&
-                                 static_cast<unsigned>(std::abs(*Diff)) > Sz) ||
-                                *Diff == -(static_cast<int>(Sz) - 1))) {
-        int Stride = *Diff / static_cast<int>(Sz - 1);
-        if (*Diff == Stride * static_cast<int>(Sz - 1)) {
-          Align Alignment =
-              cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()])
-                  ->getAlign();
-          if (TTI.isLegalStridedLoadStore(VecTy, Alignment)) {
-            // Iterate through all pointers and check if all distances are
-            // unique multiple of Dist.
-            SmallSet<int, 4> Dists;
-            for (Value *Ptr : PointerOps) {
-              int Dist = 0;
-              if (Ptr == PtrN)
-                Dist = *Diff;
-              else if (Ptr != Ptr0)
-                Dist = *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, DL, SE);
-              // If the strides are not the same or repeated, we can't
-              // vectorize.
-              if (((Dist / Stride) * Stride) != Dist ||
-                  !Dists.insert(Dist).second)
-                break;
-            }
-            if (Dists.size() == Sz)
-              return LoadsState::StridedVectorize;
-          }
-        }
-      }
+      IsPossibleStrided = *Diff % (VL.size() - 1) == 0;
     }
     // TODO: need to improve analysis of the pointers, if not all of them are
     // GEPs or have > 2 operands, we end up with a gather node, which just
     // increases the cost.
     Loop *L = LI.getLoopFor(cast<LoadInst>(VL0)->getParent());
     bool ProfitableGatherPointers =
-        static_cast<unsigned>(count_if(
-            PointerOps,
-            [L](Value *V) { return L && L->isLoopInvariant(V); })) <= Sz / 2 &&
-        Sz > 2;
+        static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {
+          return L && L->isLoopInvariant(V);
+        })) <= VL.size() / 2 && VL.size() > 2;
     if (ProfitableGatherPointers || all_of(PointerOps, [IsSorted](Value *P) {
           auto *GEP = dyn_cast<GetElementPtrInst>(P);
           return (IsSorted && !GEP && doesNotNeedToBeScheduled(P)) ||
                  (GEP && GEP->getNumOperands() == 2);
         })) {
       Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
+      auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
       if (TTI.isLegalMaskedGather(VecTy, CommonAlignment) &&
           !TTI.forceScalarizeMaskedGather(VecTy, CommonAlignment))
-        return LoadsState::ScatterVectorize;
+        return IsPossibleStrided ? LoadsState::PossibleStridedVectorize
+                                 : LoadsState::ScatterVectorize;
     }
   }
 
@@ -4214,7 +4160,7 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
     return std::move(ResOrder);
   }
   if ((TE.State == TreeEntry::Vectorize ||
-       TE.State == TreeEntry::StridedVectorize) &&
+       TE.State == TreeEntry::PossibleStridedVectorize) &&
       (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
        (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))) &&
       !TE.isAltShuffle())
@@ -4472,7 +4418,7 @@ void BoUpSLP::reorderTopToBottom() {
       }
       VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
       if (!(TE->State == TreeEntry::Vectorize ||
-            TE->State == TreeEntry::StridedVectorize) ||
+            TE->State == TreeEntry::PossibleStridedVectorize) ||
           !TE->ReuseShuffleIndices.empty())
         GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
       if (TE->State == TreeEntry::Vectorize &&
@@ -4496,6 +4442,9 @@ void BoUpSLP::reorderTopToBottom() {
     MapVector<OrdersType, unsigned,
               DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
         OrdersUses;
+    // Last chance orders - scatter vectorize. Try to use their orders if no
+    // other orders or the order is counted already.
+    SmallVector<OrdersType> StridedVectorizeOrders;
     SmallPtrSet<const TreeEntry *, 4> VisitedOps;
     for (const TreeEntry *OpTE : OrderedEntries) {
       // No need to reorder this nodes, still need to extend and to use shuffle,
@@ -4542,6 +4491,11 @@ void BoUpSLP::reorderTopToBottom() {
         if (Order.empty())
           continue;
       }
+      // Postpone scatter orders.
+      if (OpTE->State == TreeEntry::PossibleStridedVectorize) {
+        StridedVectorizeOrders.push_back(Order);
+        continue;
+      }
       // Stores actually store the mask, not the order, need to invert.
       if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
           OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
@@ -4558,6 +4512,22 @@ void BoUpSLP::reorderTopToBottom() {
         ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
       }
     }
+    // Set order of the user node.
+    if (OrdersUses.empty()) {
+      if (StridedVectorizeOrders.empty())
+        continue;
+      // Add (potentially!) strided vectorize orders.
+      for (OrdersType &Order : StridedVectorizeOrders)
+        ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
+    } else {
+      // Account (potentially!) strided vectorize orders only if it was used
+      // already.
+      for (OrdersType &Order : StridedVectorizeOrders) {
+        auto *It = OrdersUses.find(Order);
+        if (It != OrdersUses.end())
+          ++It->second;
+      }
+    }
     // Choose the most used order.
     ArrayRef<unsigned> BestOrder = OrdersUses.front().first;
     unsigned Cnt = OrdersUses.front().second;
@@ -4599,7 +4569,7 @@ void BoUpSLP::reorderTopToBottom() {
         continue;
       }
       if ((TE->State == TreeEntry::Vectorize ||
-           TE->State == TreeEntry::StridedVectorize) &&
+           TE->State == TreeEntry::PossibleStridedVectorize) &&
           isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst,
               InsertElementInst>(TE->getMainOp()) &&
           !TE->isAltShuffle()) {
@@ -4640,6 +4610,10 @@ bool BoUpSLP::canReorderOperands(
         }))
       continue;
     if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) {
+      // FIXME: Do not reorder (possible!) strided vectorized nodes, they
+      // require reordering of the operands, which is not implemented yet.
+      if (TE->State == TreeEntry::PossibleStridedVectorize)
+        return false;
       // Do not reorder if operand node is used by many user nodes.
       if (any_of(TE->UserTreeIndices,
                  [UserTE](const EdgeInfo &EI) { return EI.UserTE != UserTE; }))
@@ -4690,13 +4664,13 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
   SmallVector<TreeEntry *> NonVectorized;
   for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
     if (TE->State != TreeEntry::Vectorize &&
-        TE->State != TreeEntry::StridedVectorize)
+        TE->State != TreeEntry::PossibleStridedVectorize)
       NonVectorized.push_back(TE.get());
     if (std::optional<OrdersType> CurrentOrder =
             getReorderingData(*TE, /*TopToBottom=*/false)) {
       OrderedEntries.insert(TE.get());
       if (!(TE->State == TreeEntry::Vectorize ||
-            TE->State == TreeEntry::StridedVectorize) ||
+            TE->State == TreeEntry::PossibleStridedVectorize) ||
           !TE->ReuseShuffleIndices.empty())
         GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
     }
@@ -4714,7 +4688,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
     SmallVector<TreeEntry *> Filtered;
     for (TreeEntry *TE : OrderedEntries) {
       if (!(TE->State == TreeEntry::Vectorize ||
-            TE->State == TreeEntry::StridedVectorize ||
+            TE->State == TreeEntry::PossibleStridedVectorize ||
             (TE->State == TreeEntry::NeedToGather &&
              GathersToOrders.count(TE))) ||
           TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
@@ -4759,6 +4733,9 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
       MapVector<OrdersType, unsigned,
                 DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
           OrdersUses;
+      // Last chance orders - scatter vectorize. Try to use their orders if no
+      // other orders or the order is counted already.
+      SmallVector<std::pair<OrdersType, unsigned>> StridedVectorizeOrders;
       // Do the analysis for each tree entry only once, otherwise the order of
       // the same node my be considered several times, though might be not
       // profitable.
@@ -4780,6 +4757,11 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
             Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
               return P.second == OpTE;
             });
+        // Postpone scatter orders.
+        if (OpTE->State == TreeEntry::PossibleStridedVectorize) {
+          StridedVectorizeOrders.emplace_back(Order, NumOps);
+          continue;
+        }
         // Stores actually store the mask, not the order, need to invert.
         if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
             OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
@@ -4837,6 +4819,30 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
             ++Res.first->second;
         }
       }
+      // If no orders - skip current nodes and jump to the next one, if any.
+      if (OrdersUses.empty()) {
+        if (StridedVectorizeOrders.empty() ||
+            (Data.first->ReorderIndices.empty() &&
+             Data.first->ReuseShuffleIndices.empty() &&
+             !(IgnoreReorder &&
+               Data.first == VectorizableTree.front().get()))) {
+          for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
+            OrderedEntries.remove(Op.second);
+          continue;
+        }
+        // Add (potentially!) strided vectorize orders.
+        for (std::pair<OrdersType, unsigned> &Pair : StridedVectorizeOrders)
+          OrdersUses.insert(std::make_pair(Pair.first, 0)).first->second +=
+              Pair.second;
+      } else {
+        // Account (potentially!) strided vectorize orders only if it was used
+        // already.
+        for (std::pair<OrdersType, unsigned> &Pair : StridedVectorizeOrders) {
+          auto *It = OrdersUses.find(Pair.first);
+          if (It != OrdersUses.end())
+            It->second += Pair.second;
+        }
+      }
       // Choose the best order.
       ArrayRef<unsigned> BestOrder = OrdersUses.front().first;
       unsigned Cnt = OrdersUses.front().second;
@@ -4872,7 +4878,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
         }
         // Gathers are processed separately.
         if (TE->State != TreeEntry::Vectorize &&
-            TE->State != TreeEntry::StridedVectorize &&
+            TE->State != TreeEntry::PossibleStridedVectorize &&
             (TE->State != TreeEntry::ScatterVectorize ||
              TE->ReorderIndices.empty()))
           continue;
@@ -4904,7 +4910,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
         Data.first->reorderOperands(Mask);
       if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
           Data.first->isAltShuffle() ||
-          Data.first->State == TreeEntry::StridedVectorize) {
+          Data.first->State == TreeEntry::PossibleStridedVectorize) {
         reorderScalars(Data.first->Scalars, Mask);
         reorderOrder(Data.first->ReorderIndices, MaskOrder,
                      /*BottomOrder=*/true);
@@ -4967,6 +4973,7 @@ void BoUpSLP::buildExternalUses(
           // instructions. If that is the case, the one in FoundLane will
           // be used.
           if (UseEntry->State == TreeEntry::ScatterVectorize ||
+              UseEntry->State == TreeEntry::PossibleStridedVectorize ||
               !doesInTreeUserNeedToExtract(
                   Scalar, cast<Instruction>(UseEntry->Scalars.front()), TLI)) {
             LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
@@ -5324,8 +5331,8 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
       return TreeEntry::Vectorize;
     case LoadsState::ScatterVectorize:
       return TreeEntry::ScatterVectorize;
-    case LoadsState::StridedVectorize:
-      return TreeEntry::StridedVectorize;
+    case LoadsState::PossibleStridedVectorize:
+      return TreeEntry::PossibleStridedVectorize;
     case LoadsState::Gather:
 #ifndef NDEBUG
       Type *ScalarTy = VL0->getType();
@@ -5746,7 +5753,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
   BasicBlock *BB = nullptr;
   bool IsScatterVectorizeUserTE =
       UserTreeIdx.UserTE &&
-      UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
+      (UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize ||
+       UserTreeIdx.UserTE->State == TreeEntry::PossibleStridedVectorize);
   bool AreAllSameInsts =
       (S.getOpcode() && allSameBlock(VL)) ||
       (S.OpValue->getType()->isPointerTy() && IsScatterVectorizeUserTE &&
@@ -5843,7 +5851,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
   // Special processing for sorted pointers for ScatterVectorize node with
   // constant indeces only.
   if (AreAllSameInsts && UserTreeIdx.UserTE &&
-      UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize &&
+      (UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize ||
+       UserTreeIdx.UserTE->State == TreeEntry::PossibleStridedVectorize) &&
       !(S.getOpcode() && allSameBlock(VL))) {
     assert(S.OpValue->getType()->isPointerTy() &&
            count_if(VL, [](Value *V) { return isa<GetElementPtrInst>(V); }) >=
@@ -6040,17 +6049,18 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
         }
         TE->setOperandsInOrder();
         break;
-      case TreeEntry::StridedVectorize:
+      case TreeEntry::PossibleStridedVectorize:
         // Vectorizing non-consecutive loads with `llvm.masked.gather`.
         if (CurrentOrder.empty()) {
-          TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
+          TE = newTreeEntry(VL, TreeEntry::PossibleStridedVectorize, Bundle, S,
                             UserTreeIdx, ReuseShuffleIndicies);
         } else {
-          TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
+          TE = newTreeEntry(VL, TreeEntry::PossibleStridedVectorize, Bundle, S,
                             UserTreeIdx, ReuseShuffleIndicies, CurrentOrder);
         }
         TE->setOperandsInOrder();
-        LLVM_DEBUG(dbgs() << "SLP: added a vector of strided loads.\n");
+        buildTree_rec(PointerOps, Depth + 1, {TE, 0});
+        LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n");
         break;
       case TreeEntry::ScatterVectorize:
         // Vectorizing non-consecutive loads with `llvm.masked.gather`.
@@ -7081,7 +7091,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
         !isSplat(Gathers)) {
       InstructionCost BaseCost = R.getGatherCost(Gathers, !Root);
       SetVector<Value *> VectorizedLoads;
-      SmallVector<std::pair<unsigned, LoadsState>> VectorizedStarts;
+      SmallVector<unsigned> VectorizedStarts;
       SmallVector<unsigned> ScatterVectorized;
       unsigned StartIdx = 0;
       unsigned VF = VL.size() / 2;
@@ -7105,16 +7115,12 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
             switch (LS) {
             case LoadsState::Vectorize:
             case LoadsState::ScatterVectorize:
-            case LoadsState::StridedVectorize:
+            case LoadsState::PossibleStridedVectorize:
               // Mark the vectorized loads so that we don't vectorize them
               // again.
               // TODO: better handling of loads with reorders.
-              if (((LS == LoadsState::Vectorize ||
-                    LS == LoadsState::StridedVectorize) &&
-                   CurrentOrder.empty()) ||
-                  (LS == LoadsState::StridedVectorize &&
-                   isReverseOrder(CurrentOrder)))
-                VectorizedStarts.emplace_back(Cnt, LS);
+              if (LS == LoadsState::Vectorize && CurrentOrder.empty())
+                VectorizedStarts.push_back(Cnt);
               else
                 ScatterVectorized.push_back(Cnt);
               VectorizedLoads.insert(Slice.begin(), Slice.end());
@@ -7158,20 +7164,16 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
                                   CostKind, TTI::OperandValueInfo(), LI);
         }
         auto *LoadTy = FixedVectorType::get(VL.front()->getType(), VF);
-        for (const std::pair<unsigned, LoadsState> &P : VectorizedStarts) {
-          auto *LI = cast<LoadInst>(VL[P.first]);
+        for (unsigned P : VectorizedStarts) {
+          auto *LI = cast<LoadInst>(VL[P]);
           Align Alignment = LI->getAlign();
           GatherCost +=
-              P.second == LoadsState::Vectorize
-                  ? TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment,
-                                        LI->getPointerAddressSpace(), CostKind,
-                                        TTI::OperandValueInfo(), LI)
-                  : TTI.getStridedMemoryOpCost(
-                        Instruction::Load, LoadTy, LI->getPointerOperand(),
-                        /*VariableMask=*/false, Alignment, CostKind, LI);
+              TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment,
+                                  LI->getPointerAddressSpace(), CostKind,
+                                  TTI::OperandValueInfo(), LI);
           // Estimate GEP cost.
           SmallVector<Value *> PointerOps(VF);
-          for (auto [I, V] : enumerate(VL.slice(P.first, VF)))
+          for (auto [I, V] : enumerate(VL.slice(P, VF)))
             PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
           auto [ScalarGEPCost, VectorGEPCost] =
               getGEPCosts(TTI, PointerOps, LI->getPointerOperand(),
@@ -7911,9 +7913,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
   }
   InstructionCost CommonCost = 0;
   SmallVector<int> Mask;
-  bool IsReverseOrder = isReverseOrder(E->ReorderIndices);
   if (!E->ReorderIndices.empty() &&
-      (E->State != TreeEntry::StridedVectorize || !IsReverseOrder)) {
+      E->State != TreeEntry::PossibleStridedVectorize) {
     SmallVector<int> NewMask;
     if (E->getOpcode() == Instruction::Store) {
       // For stores the order is actually a mask.
@@ -7931,7 +7932,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
         TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, FinalVecTy, Mask);
   assert((E->State == TreeEntry::Vectorize ||
           E->State == TreeEntry::ScatterVectorize ||
-          E->State == TreeEntry::StridedVectorize) &&
+          E->State == TreeEntry::PossibleStridedVectorize) &&
          "Unhandled state");
   assert(E->getOpcode() &&
          ((allSameType(VL) && allSameBlock(VL)) ||
@@ -7951,8 +7952,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
   }
   auto GetCastContextHint = [&](Value *V) {
     if (const TreeEntry *OpTE = getTreeEntry(V)) {
-      if (OpTE->State == TreeEntry::ScatterVectorize ||
-          OpTE->State == TreeEntry::StridedVectorize)
+      if (OpTE->State == TreeEntry::ScatterVectorize)
         return TTI::CastContextHint::GatherScatter;
       if (OpTE->State == TreeEntry::Vectorize &&
           OpTE->getOpcode() == Instruction::Load && !OpTE->isAltShuffle()) {
@@ -8028,9 +8028,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
  // Calculate cost difference from vectorizing set of GEPs.
   // Negative value means vectorizing is profitable.
   auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
-    assert((E->State == TreeEntry::Vectorize ||
-            E->State == TreeEntry::StridedVectorize) &&
-           "Entry state expected to be Vectorize or StridedVectorize here.");
+    assert(E->State == TreeEntry::Vectorize &&
+           "Entry state expected to be Vectorize here.");
     InstructionCost ScalarCost = 0;
     InstructionCost VecCost = 0;
     std::tie(ScalarCost, VecCost) = getGEPCosts(
@@ -8383,14 +8382,10 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
         VecLdCost = TTI->getMemoryOpCost(
             Instruction::Load, VecTy, LI0->getAlign(),
             LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
-      } else if (E->State == TreeEntry::StridedVectorize) {
-        Align CommonAlignment =
-            computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
-        VecLdCost = TTI->getStridedMemoryOpCost(
-            Instruction::Load, VecTy, LI0->getPointerOperand(),
-            /*VariableMask=*/false, CommonAlignment, CostKind);
       } else {
-        assert(E->State == TreeEntry::ScatterVectorize && "Unknown EntryState");
+        assert((E->State == TreeEntry::ScatterVectorize ||
+                E->State == TreeEntry::PossibleStridedVectorize) &&
+               "Unknown EntryState");
         Align CommonAlignment =
             computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
         VecLdCost = TTI->getGatherScatterOpCost(
@@ -8403,7 +8398,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
     InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost);
     // If this node generates masked gather load then it is not a terminal node.
     // Hence address operand cost is estimated separately.
-    if (E->State == TreeEntry::ScatterVectorize)
+    if (E->State == TreeEntry::ScatterVectorize ||
+        E->State == TreeEntry::PossibleStridedVectorize)
       return Cost;
 
     // Estimate cost of GEPs since this tree node is a terminator.
@@ -8612,7 +8608,7 @@ bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
   if (VectorizableTree[0]->State == TreeEntry::NeedToGather ||
       (VectorizableTree[1]->State == TreeEntry::NeedToGather &&
        VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
-       VectorizableTree[0]->State != TreeEntry::StridedVectorize))
+       VectorizableTree[0]->State != TreeEntry::PossibleStridedVectorize))
     return false;
 
   return true;
@@ -10583,6 +10579,11 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
 Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
                                  bool PostponedPHIs) {
   ValueList &VL = E->getOperand(NodeIdx);
+  if (E->State == TreeEntry::PossibleStridedVectorize &&
+      !E->ReorderIndices.empty()) {
+    SmallVector<int> Mask(E->ReorderIndices.begin(), E->ReorderIndices.end());
+    reorderScalars(VL, Mask);
+  }
   const unsigned VF = VL.size();
   InstructionsState S = getSameOpcode(VL, *TLI);
   // Special processing for GEPs bundle, which may include non-gep values.
@@ -11156,7 +11157,6 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
     return Vec;
   }
 
-  bool IsReverseOrder = isReverseOrder(E->ReorderIndices);
   auto FinalShuffle = [&](Value *V, const TreeEntry *E, VectorType *VecTy,
                           bool IsSigned) {
     if (V->getType() != VecTy)
@@ -11167,7 +11167,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
           ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
                    E->ReorderIndices.size());
       ShuffleBuilder.add(V, Mask);
-    } else if (E->State == TreeEntry::StridedVectorize && IsReverseOrder) {
+    } else if (E->State == TreeEntry::PossibleStridedVectorize) {
       ShuffleBuilder.addOrdered(V, std::nullopt);
     } else {
       ShuffleBuilder.addOrdered(V, E->ReorderIndices);
@@ -11177,7 +11177,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
 
   assert((E->State == TreeEntry::Vectorize ||
           E->State == TreeEntry::ScatterVectorize ||
-          E->State == TreeEntry::StridedVectorize) &&
+          E->State == TreeEntry::PossibleStridedVectorize) &&
          "Unhandled state");
   unsigned ShuffleOrOp =
       E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
@@ -11642,29 +11642,10 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
       Value *PO = LI->getPointerOperand();
       if (E->State == TreeEntry::Vectorize) {
         NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
-      } else if (E->State == TreeEntry::StridedVectorize) {
-        Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
-        Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
-        PO = IsReverseOrder ? PtrN : Ptr0;
-        std::optional<int> Diff = getPointersDiff(
-            VL0->getType(), Ptr0, VL0->getType(), PtrN, *DL, *SE);
-        Type *StrideTy = DL->getIndexType(PO->getType());
-        int Stride = *Diff / (static_cast<int>(E->Scalars.size()) - 1);
-        Value *StrideVal =
-            ConstantInt::get(StrideTy, (IsReverseOrder ? -1 : 1) * Stride *
-                                           DL->getTypeAllocSize(ScalarTy));
-        Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
-        auto *Inst = Builder.CreateIntrinsic(
-            Intrinsic::experimental_vp_strided_load,
-            {VecTy, PO->getType(), StrideTy},
-            {PO, StrideVal, Builder.getAllOnesMask(VecTy->getElementCount()),
-             Builder.getInt32(E->Scalars.size())});
-        Inst->addParamAttr(
-            /*ArgNo=*/0,
-            Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
-        NewLI = Inst;
       } else {
-        assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
+        assert((E->State == TreeEntry::ScatterVectorize ||
+                E->State == TreeEntry::PossibleStridedVectorize) &&
+               "Unhandled state");
         Value *VecPtr = vectorizeOperand(E, 0, PostponedPHIs);
         if (E->VectorizedValue) {
           LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
@@ -12088,11 +12069,8 @@ Value *BoUpSLP::vectorizeTree(
                      [&](llvm::User *U) {
                        TreeEntry *UseEntry = getTreeEntry(U);
                        return UseEntry &&
-                              (UseEntry->State == TreeEntry::Vectorize ||
-                               UseEntry->State ==
-                                   TreeEntry::StridedVectorize) &&
-                              (E->State == TreeEntry::Vectorize ||
-                               E->State == TreeEntry::StridedVectorize) &&
+                              UseEntry->State == TreeEntry::Vectorize &&
+                              E->State == TreeEntry::Vectorize &&
                               doesInTreeUserNeedToExtract(
                                   Scalar,
                                   cast<Instruction>(UseEntry->Scalars.front()),

diff  --git a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
index e167b6a47af592..dc5fb917886347 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
-; RUN: opt -S -mtriple riscv64-unknown-linux-gnu < %s --passes=slp-vectorizer -mattr=+v -slp-threshold=-40 | FileCheck %s
+; RUN: opt -S -mtriple riscv64-unknown-linux-gnu < %s --passes=slp-vectorizer -mattr=+v -slp-threshold=-80 | FileCheck %s
 
 define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.ptr, ptr %add.ptr64) {
 ; CHECK-LABEL: define i32 @test(
@@ -67,303 +67,305 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[TMP48:%.*]] = extractelement <2 x i32> [[TMP47]], i32 0
 ; CHECK-NEXT:    [[TMP49:%.*]] = extractelement <2 x i32> [[TMP47]], i32 1
 ; CHECK-NEXT:    [[SUB59_2:%.*]] = sub i32 [[TMP48]], [[TMP49]]
+; CHECK-NEXT:    [[ARRAYIDX3_3:%.*]] = getelementptr i8, ptr null, i64 4
 ; CHECK-NEXT:    [[TMP50:%.*]] = load i8, ptr null, align 1
 ; CHECK-NEXT:    [[ARRAYIDX20_3:%.*]] = getelementptr i8, ptr null, i64 2
 ; CHECK-NEXT:    [[ARRAYIDX22_3:%.*]] = getelementptr i8, ptr null, i64 2
 ; CHECK-NEXT:    [[TMP51:%.*]] = load i8, ptr null, align 1
-; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <2 x ptr> <ptr poison, ptr null>, ptr [[ARRAYIDX20_3]], i32 0
+; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <2 x ptr> <ptr null, ptr poison>, ptr [[ARRAYIDX20_3]], i32 1
 ; CHECK-NEXT:    [[TMP53:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP52]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
 ; CHECK-NEXT:    [[TMP54:%.*]] = zext <2 x i8> [[TMP53]] to <2 x i32>
-; CHECK-NEXT:    [[TMP55:%.*]] = insertelement <2 x ptr> <ptr poison, ptr null>, ptr [[ARRAYIDX22_3]], i32 0
+; CHECK-NEXT:    [[TMP55:%.*]] = insertelement <2 x ptr> <ptr null, ptr poison>, ptr [[ARRAYIDX22_3]], i32 1
 ; CHECK-NEXT:    [[TMP56:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP55]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
 ; CHECK-NEXT:    [[TMP57:%.*]] = zext <2 x i8> [[TMP56]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP58:%.*]] = sub <2 x i32> [[TMP54]], [[TMP57]]
-; CHECK-NEXT:    [[TMP59:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 4, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT:    [[TMP60:%.*]] = zext <2 x i8> [[TMP59]] to <2 x i32>
-; CHECK-NEXT:    [[TMP61:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> getelementptr (i8, <2 x ptr> zeroinitializer, <2 x i64> <i64 6, i64 4>), i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT:    [[TMP62:%.*]] = zext <2 x i8> [[TMP61]] to <2 x i32>
-; CHECK-NEXT:    [[TMP63:%.*]] = sub <2 x i32> [[TMP60]], [[TMP62]]
-; CHECK-NEXT:    [[TMP64:%.*]] = shl <2 x i32> [[TMP63]], <i32 16, i32 16>
-; CHECK-NEXT:    [[TMP65:%.*]] = add <2 x i32> [[TMP64]], [[TMP58]]
-; CHECK-NEXT:    [[TMP66:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> getelementptr (i8, <2 x ptr> zeroinitializer, <2 x i64> <i64 3, i64 1>), i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT:    [[TMP67:%.*]] = zext <2 x i8> [[TMP66]] to <2 x i32>
-; CHECK-NEXT:    [[TMP68:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> getelementptr (i8, <2 x ptr> zeroinitializer, <2 x i64> <i64 3, i64 1>), i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT:    [[TMP69:%.*]] = zext <2 x i8> [[TMP68]] to <2 x i32>
-; CHECK-NEXT:    [[TMP70:%.*]] = sub <2 x i32> [[TMP67]], [[TMP69]]
-; CHECK-NEXT:    [[TMP71:%.*]] = insertelement <2 x i8> poison, i8 [[TMP51]], i32 0
-; CHECK-NEXT:    [[TMP72:%.*]] = insertelement <2 x i8> [[TMP71]], i8 [[TMP50]], i32 1
-; CHECK-NEXT:    [[TMP73:%.*]] = zext <2 x i8> [[TMP72]] to <2 x i32>
-; CHECK-NEXT:    [[TMP74:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> getelementptr (i8, <2 x ptr> zeroinitializer, <2 x i64> <i64 7, i64 5>), i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT:    [[TMP75:%.*]] = zext <2 x i8> [[TMP74]] to <2 x i32>
-; CHECK-NEXT:    [[TMP76:%.*]] = sub <2 x i32> [[TMP73]], [[TMP75]]
-; CHECK-NEXT:    [[TMP77:%.*]] = shl <2 x i32> [[TMP76]], <i32 16, i32 16>
-; CHECK-NEXT:    [[TMP78:%.*]] = add <2 x i32> [[TMP77]], [[TMP70]]
-; CHECK-NEXT:    [[TMP79:%.*]] = sub <2 x i32> [[TMP65]], [[TMP78]]
-; CHECK-NEXT:    [[TMP80:%.*]] = shufflevector <2 x i32> [[TMP78]], <2 x i32> [[TMP46]], <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT:    [[TMP81:%.*]] = shufflevector <2 x i32> [[TMP65]], <2 x i32> [[TMP30]], <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT:    [[TMP82:%.*]] = add <2 x i32> [[TMP80]], [[TMP81]]
-; CHECK-NEXT:    [[TMP83:%.*]] = shufflevector <2 x i32> [[TMP78]], <2 x i32> [[TMP46]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP84:%.*]] = shufflevector <2 x i32> [[TMP65]], <2 x i32> [[TMP30]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP85:%.*]] = add <2 x i32> [[TMP83]], [[TMP84]]
-; CHECK-NEXT:    [[TMP86:%.*]] = add <2 x i32> [[TMP85]], [[TMP82]]
-; CHECK-NEXT:    [[TMP87:%.*]] = sub <2 x i32> [[TMP82]], [[TMP85]]
-; CHECK-NEXT:    [[TMP88:%.*]] = extractelement <2 x i32> [[TMP79]], i32 0
-; CHECK-NEXT:    [[TMP89:%.*]] = extractelement <2 x i32> [[TMP79]], i32 1
-; CHECK-NEXT:    [[SUB59_3:%.*]] = sub i32 [[TMP89]], [[TMP88]]
-; CHECK-NEXT:    [[TMP90:%.*]] = extractelement <2 x i32> [[TMP86]], i32 0
-; CHECK-NEXT:    [[TMP91:%.*]] = extractelement <2 x i32> [[TMP86]], i32 1
-; CHECK-NEXT:    [[ADD94:%.*]] = add i32 [[TMP90]], [[TMP91]]
-; CHECK-NEXT:    [[SUB102:%.*]] = sub i32 [[TMP91]], [[TMP90]]
-; CHECK-NEXT:    [[TMP92:%.*]] = extractelement <2 x i32> [[TMP54]], i32 1
-; CHECK-NEXT:    [[SHR_I:%.*]] = lshr i32 [[TMP92]], 15
+; CHECK-NEXT:    [[TMP59:%.*]] = insertelement <2 x ptr> <ptr poison, ptr null>, ptr [[ARRAYIDX3_3]], i32 0
+; CHECK-NEXT:    [[TMP60:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP59]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT:    [[TMP61:%.*]] = zext <2 x i8> [[TMP60]] to <2 x i32>
+; CHECK-NEXT:    [[TMP62:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> getelementptr (i8, <2 x ptr> zeroinitializer, <2 x i64> <i64 4, i64 6>), i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT:    [[TMP63:%.*]] = zext <2 x i8> [[TMP62]] to <2 x i32>
+; CHECK-NEXT:    [[TMP64:%.*]] = sub <2 x i32> [[TMP61]], [[TMP63]]
+; CHECK-NEXT:    [[TMP65:%.*]] = shl <2 x i32> [[TMP64]], <i32 16, i32 16>
+; CHECK-NEXT:    [[TMP66:%.*]] = add <2 x i32> [[TMP65]], [[TMP58]]
+; CHECK-NEXT:    [[TMP67:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> getelementptr (i8, <2 x ptr> zeroinitializer, <2 x i64> <i64 1, i64 3>), i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT:    [[TMP68:%.*]] = zext <2 x i8> [[TMP67]] to <2 x i32>
+; CHECK-NEXT:    [[TMP69:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> getelementptr (i8, <2 x ptr> zeroinitializer, <2 x i64> <i64 1, i64 3>), i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT:    [[TMP70:%.*]] = zext <2 x i8> [[TMP69]] to <2 x i32>
+; CHECK-NEXT:    [[TMP71:%.*]] = sub <2 x i32> [[TMP68]], [[TMP70]]
+; CHECK-NEXT:    [[TMP72:%.*]] = insertelement <2 x i8> poison, i8 [[TMP50]], i32 0
+; CHECK-NEXT:    [[TMP73:%.*]] = insertelement <2 x i8> [[TMP72]], i8 [[TMP51]], i32 1
+; CHECK-NEXT:    [[TMP74:%.*]] = zext <2 x i8> [[TMP73]] to <2 x i32>
+; CHECK-NEXT:    [[TMP75:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> getelementptr (i8, <2 x ptr> zeroinitializer, <2 x i64> <i64 5, i64 7>), i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT:    [[TMP76:%.*]] = zext <2 x i8> [[TMP75]] to <2 x i32>
+; CHECK-NEXT:    [[TMP77:%.*]] = sub <2 x i32> [[TMP74]], [[TMP76]]
+; CHECK-NEXT:    [[TMP78:%.*]] = shl <2 x i32> [[TMP77]], <i32 16, i32 16>
+; CHECK-NEXT:    [[TMP79:%.*]] = add <2 x i32> [[TMP78]], [[TMP71]]
+; CHECK-NEXT:    [[TMP80:%.*]] = sub <2 x i32> [[TMP66]], [[TMP79]]
+; CHECK-NEXT:    [[TMP81:%.*]] = shufflevector <2 x i32> [[TMP79]], <2 x i32> [[TMP46]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP82:%.*]] = shufflevector <2 x i32> [[TMP66]], <2 x i32> [[TMP30]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP83:%.*]] = add <2 x i32> [[TMP81]], [[TMP82]]
+; CHECK-NEXT:    [[TMP84:%.*]] = shufflevector <2 x i32> [[TMP79]], <2 x i32> [[TMP46]], <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP85:%.*]] = shufflevector <2 x i32> [[TMP66]], <2 x i32> [[TMP30]], <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP86:%.*]] = add <2 x i32> [[TMP84]], [[TMP85]]
+; CHECK-NEXT:    [[TMP87:%.*]] = add <2 x i32> [[TMP86]], [[TMP83]]
+; CHECK-NEXT:    [[TMP88:%.*]] = sub <2 x i32> [[TMP83]], [[TMP86]]
+; CHECK-NEXT:    [[TMP89:%.*]] = extractelement <2 x i32> [[TMP80]], i32 0
+; CHECK-NEXT:    [[TMP90:%.*]] = extractelement <2 x i32> [[TMP80]], i32 1
+; CHECK-NEXT:    [[SUB59_3:%.*]] = sub i32 [[TMP89]], [[TMP90]]
+; CHECK-NEXT:    [[TMP91:%.*]] = extractelement <2 x i32> [[TMP87]], i32 0
+; CHECK-NEXT:    [[TMP92:%.*]] = extractelement <2 x i32> [[TMP87]], i32 1
+; CHECK-NEXT:    [[ADD94:%.*]] = add i32 [[TMP91]], [[TMP92]]
+; CHECK-NEXT:    [[SUB102:%.*]] = sub i32 [[TMP92]], [[TMP91]]
+; CHECK-NEXT:    [[TMP93:%.*]] = extractelement <2 x i32> [[TMP54]], i32 0
+; CHECK-NEXT:    [[SHR_I:%.*]] = lshr i32 [[TMP93]], 15
 ; CHECK-NEXT:    [[AND_I:%.*]] = and i32 [[SHR_I]], 65537
 ; CHECK-NEXT:    [[MUL_I:%.*]] = mul i32 [[AND_I]], 65535
-; CHECK-NEXT:    [[TMP93:%.*]] = extractelement <2 x i32> [[TMP85]], i32 1
-; CHECK-NEXT:    [[SHR_I49:%.*]] = lshr i32 [[TMP93]], 15
+; CHECK-NEXT:    [[TMP94:%.*]] = extractelement <2 x i32> [[TMP86]], i32 1
+; CHECK-NEXT:    [[SHR_I49:%.*]] = lshr i32 [[TMP94]], 15
 ; CHECK-NEXT:    [[AND_I50:%.*]] = and i32 [[SHR_I49]], 65537
 ; CHECK-NEXT:    [[MUL_I51:%.*]] = mul i32 [[AND_I50]], 65535
-; CHECK-NEXT:    [[TMP94:%.*]] = extractelement <2 x i32> [[TMP87]], i32 0
-; CHECK-NEXT:    [[TMP95:%.*]] = extractelement <2 x i32> [[TMP87]], i32 1
-; CHECK-NEXT:    [[ADD94_2:%.*]] = add i32 [[TMP94]], [[TMP95]]
-; CHECK-NEXT:    [[TMP96:%.*]] = load <2 x i8>, ptr [[ARRAYIDX20]], align 1
-; CHECK-NEXT:    [[TMP97:%.*]] = zext <2 x i8> [[TMP96]] to <2 x i32>
-; CHECK-NEXT:    [[TMP98:%.*]] = insertelement <2 x i32> poison, i32 [[SUB59_2]], i32 0
-; CHECK-NEXT:    [[TMP99:%.*]] = shufflevector <2 x i32> [[TMP98]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP100:%.*]] = insertelement <2 x i32> poison, i32 [[SUB59_3]], i32 0
-; CHECK-NEXT:    [[TMP101:%.*]] = shufflevector <2 x i32> [[TMP100]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP102:%.*]] = add <2 x i32> [[TMP99]], [[TMP101]]
-; CHECK-NEXT:    [[TMP103:%.*]] = sub <2 x i32> [[TMP99]], [[TMP101]]
-; CHECK-NEXT:    [[TMP104:%.*]] = shufflevector <2 x i32> [[TMP102]], <2 x i32> [[TMP103]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP105:%.*]] = load <2 x i8>, ptr [[PIX1]], align 1
-; CHECK-NEXT:    [[TMP106:%.*]] = zext <2 x i8> [[TMP105]] to <2 x i32>
-; CHECK-NEXT:    [[TMP107:%.*]] = shufflevector <2 x i32> [[TMP106]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[TMP108:%.*]] = insertelement <2 x ptr> [[TMP4]], ptr [[ARRAYIDX22]], i32 1
-; CHECK-NEXT:    [[TMP109:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP108]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT:    [[TMP110:%.*]] = zext <2 x i8> [[TMP109]] to <2 x i32>
-; CHECK-NEXT:    [[TMP111:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP2]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT:    [[TMP112:%.*]] = zext <2 x i8> [[TMP111]] to <2 x i32>
-; CHECK-NEXT:    [[TMP113:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP5]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT:    [[TMP114:%.*]] = zext <2 x i8> [[TMP113]] to <2 x i32>
-; CHECK-NEXT:    [[TMP115:%.*]] = sub <2 x i32> [[TMP112]], [[TMP114]]
-; CHECK-NEXT:    [[TMP116:%.*]] = shl <2 x i32> [[TMP115]], <i32 16, i32 16>
-; CHECK-NEXT:    [[TMP117:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP6]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT:    [[TMP118:%.*]] = zext <2 x i8> [[TMP117]] to <2 x i32>
-; CHECK-NEXT:    [[TMP119:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP7]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT:    [[TMP120:%.*]] = zext <2 x i8> [[TMP119]] to <2 x i32>
-; CHECK-NEXT:    [[TMP121:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP8]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT:    [[TMP122:%.*]] = zext <2 x i8> [[TMP121]] to <2 x i32>
-; CHECK-NEXT:    [[TMP123:%.*]] = sub <2 x i32> [[TMP120]], [[TMP122]]
-; CHECK-NEXT:    [[TMP124:%.*]] = shl <2 x i32> [[TMP123]], <i32 16, i32 16>
-; CHECK-NEXT:    [[TMP125:%.*]] = shufflevector <2 x i32> [[TMP106]], <2 x i32> [[TMP97]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[TMP126:%.*]] = sub <2 x i32> [[TMP125]], [[TMP110]]
-; CHECK-NEXT:    [[TMP127:%.*]] = add <2 x i32> [[TMP116]], [[TMP126]]
-; CHECK-NEXT:    [[TMP128:%.*]] = shufflevector <2 x i32> [[TMP107]], <2 x i32> [[TMP97]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP129:%.*]] = sub <2 x i32> [[TMP128]], [[TMP118]]
-; CHECK-NEXT:    [[TMP130:%.*]] = add <2 x i32> [[TMP124]], [[TMP129]]
-; CHECK-NEXT:    [[TMP131:%.*]] = extractelement <2 x i32> [[TMP127]], i32 1
-; CHECK-NEXT:    [[TMP132:%.*]] = extractelement <2 x i32> [[TMP130]], i32 1
-; CHECK-NEXT:    [[ADD46:%.*]] = add i32 [[TMP132]], [[TMP131]]
-; CHECK-NEXT:    [[TMP133:%.*]] = sub <2 x i32> [[TMP127]], [[TMP130]]
-; CHECK-NEXT:    [[TMP134:%.*]] = extractelement <2 x i32> [[TMP127]], i32 0
-; CHECK-NEXT:    [[TMP135:%.*]] = extractelement <2 x i32> [[TMP130]], i32 0
-; CHECK-NEXT:    [[ADD44:%.*]] = add i32 [[TMP135]], [[TMP134]]
-; CHECK-NEXT:    [[TMP136:%.*]] = lshr <2 x i32> [[TMP107]], <i32 15, i32 15>
-; CHECK-NEXT:    [[TMP137:%.*]] = and <2 x i32> [[TMP136]], <i32 65537, i32 65537>
-; CHECK-NEXT:    [[TMP138:%.*]] = mul <2 x i32> [[TMP137]], <i32 65535, i32 65535>
-; CHECK-NEXT:    [[TMP139:%.*]] = extractelement <2 x i32> [[TMP133]], i32 0
-; CHECK-NEXT:    [[TMP140:%.*]] = extractelement <2 x i32> [[TMP133]], i32 1
-; CHECK-NEXT:    [[SUB59:%.*]] = sub i32 [[TMP139]], [[TMP140]]
-; CHECK-NEXT:    [[TMP141:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8_1]], align 1
-; CHECK-NEXT:    [[TMP142:%.*]] = zext <2 x i8> [[TMP141]] to <2 x i32>
+; CHECK-NEXT:    [[TMP95:%.*]] = extractelement <2 x i32> [[TMP88]], i32 0
+; CHECK-NEXT:    [[TMP96:%.*]] = extractelement <2 x i32> [[TMP88]], i32 1
+; CHECK-NEXT:    [[ADD94_2:%.*]] = add i32 [[TMP95]], [[TMP96]]
+; CHECK-NEXT:    [[TMP97:%.*]] = load <2 x i8>, ptr [[ARRAYIDX20]], align 1
+; CHECK-NEXT:    [[TMP98:%.*]] = zext <2 x i8> [[TMP97]] to <2 x i32>
+; CHECK-NEXT:    [[TMP99:%.*]] = insertelement <2 x i32> poison, i32 [[SUB59_2]], i32 0
+; CHECK-NEXT:    [[TMP100:%.*]] = shufflevector <2 x i32> [[TMP99]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP101:%.*]] = insertelement <2 x i32> poison, i32 [[SUB59_3]], i32 0
+; CHECK-NEXT:    [[TMP102:%.*]] = shufflevector <2 x i32> [[TMP101]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP103:%.*]] = add <2 x i32> [[TMP100]], [[TMP102]]
+; CHECK-NEXT:    [[TMP104:%.*]] = sub <2 x i32> [[TMP100]], [[TMP102]]
+; CHECK-NEXT:    [[TMP105:%.*]] = shufflevector <2 x i32> [[TMP103]], <2 x i32> [[TMP104]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP106:%.*]] = load <2 x i8>, ptr [[PIX1]], align 1
+; CHECK-NEXT:    [[TMP107:%.*]] = zext <2 x i8> [[TMP106]] to <2 x i32>
+; CHECK-NEXT:    [[TMP108:%.*]] = shufflevector <2 x i32> [[TMP107]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[TMP109:%.*]] = insertelement <2 x ptr> [[TMP4]], ptr [[ARRAYIDX22]], i32 1
+; CHECK-NEXT:    [[TMP110:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP109]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT:    [[TMP111:%.*]] = zext <2 x i8> [[TMP110]] to <2 x i32>
+; CHECK-NEXT:    [[TMP112:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP2]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT:    [[TMP113:%.*]] = zext <2 x i8> [[TMP112]] to <2 x i32>
+; CHECK-NEXT:    [[TMP114:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP5]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT:    [[TMP115:%.*]] = zext <2 x i8> [[TMP114]] to <2 x i32>
+; CHECK-NEXT:    [[TMP116:%.*]] = sub <2 x i32> [[TMP113]], [[TMP115]]
+; CHECK-NEXT:    [[TMP117:%.*]] = shl <2 x i32> [[TMP116]], <i32 16, i32 16>
+; CHECK-NEXT:    [[TMP118:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP6]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT:    [[TMP119:%.*]] = zext <2 x i8> [[TMP118]] to <2 x i32>
+; CHECK-NEXT:    [[TMP120:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP7]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT:    [[TMP121:%.*]] = zext <2 x i8> [[TMP120]] to <2 x i32>
+; CHECK-NEXT:    [[TMP122:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP8]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT:    [[TMP123:%.*]] = zext <2 x i8> [[TMP122]] to <2 x i32>
+; CHECK-NEXT:    [[TMP124:%.*]] = sub <2 x i32> [[TMP121]], [[TMP123]]
+; CHECK-NEXT:    [[TMP125:%.*]] = shl <2 x i32> [[TMP124]], <i32 16, i32 16>
+; CHECK-NEXT:    [[TMP126:%.*]] = shufflevector <2 x i32> [[TMP107]], <2 x i32> [[TMP98]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP127:%.*]] = sub <2 x i32> [[TMP126]], [[TMP111]]
+; CHECK-NEXT:    [[TMP128:%.*]] = add <2 x i32> [[TMP117]], [[TMP127]]
+; CHECK-NEXT:    [[TMP129:%.*]] = shufflevector <2 x i32> [[TMP108]], <2 x i32> [[TMP98]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP130:%.*]] = sub <2 x i32> [[TMP129]], [[TMP119]]
+; CHECK-NEXT:    [[TMP131:%.*]] = add <2 x i32> [[TMP125]], [[TMP130]]
+; CHECK-NEXT:    [[TMP132:%.*]] = extractelement <2 x i32> [[TMP128]], i32 1
+; CHECK-NEXT:    [[TMP133:%.*]] = extractelement <2 x i32> [[TMP131]], i32 1
+; CHECK-NEXT:    [[ADD46:%.*]] = add i32 [[TMP133]], [[TMP132]]
+; CHECK-NEXT:    [[TMP134:%.*]] = sub <2 x i32> [[TMP128]], [[TMP131]]
+; CHECK-NEXT:    [[TMP135:%.*]] = extractelement <2 x i32> [[TMP128]], i32 0
+; CHECK-NEXT:    [[TMP136:%.*]] = extractelement <2 x i32> [[TMP131]], i32 0
+; CHECK-NEXT:    [[ADD44:%.*]] = add i32 [[TMP136]], [[TMP135]]
+; CHECK-NEXT:    [[TMP137:%.*]] = lshr <2 x i32> [[TMP108]], <i32 15, i32 15>
+; CHECK-NEXT:    [[TMP138:%.*]] = and <2 x i32> [[TMP137]], <i32 65537, i32 65537>
+; CHECK-NEXT:    [[TMP139:%.*]] = mul <2 x i32> [[TMP138]], <i32 65535, i32 65535>
+; CHECK-NEXT:    [[TMP140:%.*]] = extractelement <2 x i32> [[TMP134]], i32 0
+; CHECK-NEXT:    [[TMP141:%.*]] = extractelement <2 x i32> [[TMP134]], i32 1
+; CHECK-NEXT:    [[SUB59:%.*]] = sub i32 [[TMP140]], [[TMP141]]
+; CHECK-NEXT:    [[TMP142:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8_1]], align 1
+; CHECK-NEXT:    [[TMP143:%.*]] = zext <2 x i8> [[TMP142]] to <2 x i32>
 ; CHECK-NEXT:    [[ADD_PTR644:%.*]] = getelementptr i8, ptr [[PIX2]], i64 [[IDX_EXT63]]
 ; CHECK-NEXT:    [[ARRAYIDX22_1:%.*]] = getelementptr i8, ptr [[ADD_PTR644]], i64 2
-; CHECK-NEXT:    [[TMP143:%.*]] = insertelement <2 x ptr> poison, ptr [[ADD_PTR644]], i32 0
-; CHECK-NEXT:    [[TMP144:%.*]] = insertelement <2 x ptr> [[TMP143]], ptr [[ARRAYIDX22_1]], i32 1
-; CHECK-NEXT:    [[TMP145:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP144]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT:    [[TMP146:%.*]] = zext <2 x i8> [[TMP145]] to <2 x i32>
-; CHECK-NEXT:    [[TMP147:%.*]] = insertelement <2 x ptr> poison, ptr [[ADD_PTR3]], i32 0
-; CHECK-NEXT:    [[TMP148:%.*]] = shufflevector <2 x ptr> [[TMP147]], <2 x ptr> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP149:%.*]] = getelementptr i8, <2 x ptr> [[TMP148]], <2 x i64> <i64 4, i64 6>
-; CHECK-NEXT:    [[TMP150:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP149]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT:    [[TMP151:%.*]] = zext <2 x i8> [[TMP150]] to <2 x i32>
-; CHECK-NEXT:    [[TMP152:%.*]] = shufflevector <2 x ptr> [[TMP144]], <2 x ptr> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP153:%.*]] = getelementptr i8, <2 x ptr> [[TMP152]], <2 x i64> <i64 4, i64 6>
-; CHECK-NEXT:    [[TMP154:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP153]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT:    [[TMP155:%.*]] = zext <2 x i8> [[TMP154]] to <2 x i32>
-; CHECK-NEXT:    [[TMP156:%.*]] = sub <2 x i32> [[TMP151]], [[TMP155]]
-; CHECK-NEXT:    [[TMP157:%.*]] = shl <2 x i32> [[TMP156]], <i32 16, i32 16>
-; CHECK-NEXT:    [[TMP158:%.*]] = getelementptr i8, <2 x ptr> [[TMP152]], <2 x i64> <i64 1, i64 3>
-; CHECK-NEXT:    [[TMP159:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP158]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT:    [[TMP160:%.*]] = zext <2 x i8> [[TMP159]] to <2 x i32>
-; CHECK-NEXT:    [[TMP161:%.*]] = getelementptr i8, <2 x ptr> [[TMP148]], <2 x i64> <i64 5, i64 7>
-; CHECK-NEXT:    [[TMP162:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP161]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT:    [[TMP163:%.*]] = zext <2 x i8> [[TMP162]] to <2 x i32>
-; CHECK-NEXT:    [[TMP164:%.*]] = getelementptr i8, <2 x ptr> [[TMP152]], <2 x i64> <i64 5, i64 7>
-; CHECK-NEXT:    [[TMP165:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP164]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT:    [[TMP166:%.*]] = zext <2 x i8> [[TMP165]] to <2 x i32>
-; CHECK-NEXT:    [[TMP167:%.*]] = sub <2 x i32> [[TMP163]], [[TMP166]]
-; CHECK-NEXT:    [[TMP168:%.*]] = shl <2 x i32> [[TMP167]], <i32 16, i32 16>
-; CHECK-NEXT:    [[TMP169:%.*]] = insertelement <2 x i32> [[TMP142]], i32 [[CONV33_1]], i32 1
-; CHECK-NEXT:    [[TMP170:%.*]] = sub <2 x i32> [[TMP169]], [[TMP160]]
-; CHECK-NEXT:    [[TMP171:%.*]] = add <2 x i32> [[TMP168]], [[TMP170]]
-; CHECK-NEXT:    [[TMP172:%.*]] = insertelement <2 x i32> [[TMP142]], i32 [[CONV_1]], i32 0
-; CHECK-NEXT:    [[TMP173:%.*]] = sub <2 x i32> [[TMP172]], [[TMP146]]
-; CHECK-NEXT:    [[TMP174:%.*]] = add <2 x i32> [[TMP157]], [[TMP173]]
-; CHECK-NEXT:    [[TMP175:%.*]] = add <2 x i32> [[TMP171]], [[TMP174]]
-; CHECK-NEXT:    [[TMP176:%.*]] = sub <2 x i32> [[TMP174]], [[TMP171]]
-; CHECK-NEXT:    [[TMP177:%.*]] = extractelement <2 x i32> [[TMP175]], i32 0
-; CHECK-NEXT:    [[TMP178:%.*]] = extractelement <2 x i32> [[TMP175]], i32 1
-; CHECK-NEXT:    [[SUB51_1:%.*]] = sub i32 [[TMP177]], [[TMP178]]
-; CHECK-NEXT:    [[TMP179:%.*]] = shufflevector <2 x i32> [[TMP176]], <2 x i32> [[TMP133]], <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT:    [[TMP180:%.*]] = shufflevector <2 x i32> [[TMP176]], <2 x i32> [[TMP133]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[TMP181:%.*]] = add <2 x i32> [[TMP179]], [[TMP180]]
-; CHECK-NEXT:    [[TMP182:%.*]] = extractelement <2 x i32> [[TMP176]], i32 0
-; CHECK-NEXT:    [[TMP183:%.*]] = extractelement <2 x i32> [[TMP176]], i32 1
-; CHECK-NEXT:    [[SUB59_1:%.*]] = sub i32 [[TMP182]], [[TMP183]]
-; CHECK-NEXT:    [[SHR_I54:%.*]] = lshr i32 [[TMP178]], 15
+; CHECK-NEXT:    [[TMP144:%.*]] = insertelement <2 x ptr> poison, ptr [[ADD_PTR644]], i32 0
+; CHECK-NEXT:    [[TMP145:%.*]] = insertelement <2 x ptr> [[TMP144]], ptr [[ARRAYIDX22_1]], i32 1
+; CHECK-NEXT:    [[TMP146:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP145]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT:    [[TMP147:%.*]] = zext <2 x i8> [[TMP146]] to <2 x i32>
+; CHECK-NEXT:    [[TMP148:%.*]] = insertelement <2 x ptr> poison, ptr [[ADD_PTR3]], i32 0
+; CHECK-NEXT:    [[TMP149:%.*]] = shufflevector <2 x ptr> [[TMP148]], <2 x ptr> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP150:%.*]] = getelementptr i8, <2 x ptr> [[TMP149]], <2 x i64> <i64 4, i64 6>
+; CHECK-NEXT:    [[TMP151:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP150]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT:    [[TMP152:%.*]] = zext <2 x i8> [[TMP151]] to <2 x i32>
+; CHECK-NEXT:    [[TMP153:%.*]] = shufflevector <2 x ptr> [[TMP145]], <2 x ptr> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP154:%.*]] = getelementptr i8, <2 x ptr> [[TMP153]], <2 x i64> <i64 4, i64 6>
+; CHECK-NEXT:    [[TMP155:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP154]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT:    [[TMP156:%.*]] = zext <2 x i8> [[TMP155]] to <2 x i32>
+; CHECK-NEXT:    [[TMP157:%.*]] = sub <2 x i32> [[TMP152]], [[TMP156]]
+; CHECK-NEXT:    [[TMP158:%.*]] = shl <2 x i32> [[TMP157]], <i32 16, i32 16>
+; CHECK-NEXT:    [[TMP159:%.*]] = getelementptr i8, <2 x ptr> [[TMP153]], <2 x i64> <i64 1, i64 3>
+; CHECK-NEXT:    [[TMP160:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP159]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT:    [[TMP161:%.*]] = zext <2 x i8> [[TMP160]] to <2 x i32>
+; CHECK-NEXT:    [[TMP162:%.*]] = getelementptr i8, <2 x ptr> [[TMP149]], <2 x i64> <i64 5, i64 7>
+; CHECK-NEXT:    [[TMP163:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP162]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT:    [[TMP164:%.*]] = zext <2 x i8> [[TMP163]] to <2 x i32>
+; CHECK-NEXT:    [[TMP165:%.*]] = getelementptr i8, <2 x ptr> [[TMP153]], <2 x i64> <i64 5, i64 7>
+; CHECK-NEXT:    [[TMP166:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP165]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT:    [[TMP167:%.*]] = zext <2 x i8> [[TMP166]] to <2 x i32>
+; CHECK-NEXT:    [[TMP168:%.*]] = sub <2 x i32> [[TMP164]], [[TMP167]]
+; CHECK-NEXT:    [[TMP169:%.*]] = shl <2 x i32> [[TMP168]], <i32 16, i32 16>
+; CHECK-NEXT:    [[TMP170:%.*]] = insertelement <2 x i32> [[TMP143]], i32 [[CONV33_1]], i32 1
+; CHECK-NEXT:    [[TMP171:%.*]] = sub <2 x i32> [[TMP170]], [[TMP161]]
+; CHECK-NEXT:    [[TMP172:%.*]] = add <2 x i32> [[TMP169]], [[TMP171]]
+; CHECK-NEXT:    [[TMP173:%.*]] = insertelement <2 x i32> [[TMP143]], i32 [[CONV_1]], i32 0
+; CHECK-NEXT:    [[TMP174:%.*]] = sub <2 x i32> [[TMP173]], [[TMP147]]
+; CHECK-NEXT:    [[TMP175:%.*]] = add <2 x i32> [[TMP158]], [[TMP174]]
+; CHECK-NEXT:    [[TMP176:%.*]] = add <2 x i32> [[TMP172]], [[TMP175]]
+; CHECK-NEXT:    [[TMP177:%.*]] = sub <2 x i32> [[TMP175]], [[TMP172]]
+; CHECK-NEXT:    [[TMP178:%.*]] = extractelement <2 x i32> [[TMP176]], i32 0
+; CHECK-NEXT:    [[TMP179:%.*]] = extractelement <2 x i32> [[TMP176]], i32 1
+; CHECK-NEXT:    [[SUB51_1:%.*]] = sub i32 [[TMP178]], [[TMP179]]
+; CHECK-NEXT:    [[TMP180:%.*]] = shufflevector <2 x i32> [[TMP177]], <2 x i32> [[TMP134]], <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP181:%.*]] = shufflevector <2 x i32> [[TMP177]], <2 x i32> [[TMP134]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP182:%.*]] = add <2 x i32> [[TMP180]], [[TMP181]]
+; CHECK-NEXT:    [[TMP183:%.*]] = extractelement <2 x i32> [[TMP177]], i32 0
+; CHECK-NEXT:    [[TMP184:%.*]] = extractelement <2 x i32> [[TMP177]], i32 1
+; CHECK-NEXT:    [[SUB59_1:%.*]] = sub i32 [[TMP183]], [[TMP184]]
+; CHECK-NEXT:    [[SHR_I54:%.*]] = lshr i32 [[TMP179]], 15
 ; CHECK-NEXT:    [[AND_I55:%.*]] = and i32 [[SHR_I54]], 65537
 ; CHECK-NEXT:    [[MUL_I56:%.*]] = mul i32 [[AND_I55]], 65535
-; CHECK-NEXT:    [[TMP184:%.*]] = lshr <2 x i32> [[TMP142]], <i32 15, i32 15>
-; CHECK-NEXT:    [[TMP185:%.*]] = and <2 x i32> [[TMP184]], <i32 65537, i32 65537>
-; CHECK-NEXT:    [[TMP186:%.*]] = mul <2 x i32> [[TMP185]], <i32 65535, i32 65535>
-; CHECK-NEXT:    [[TMP187:%.*]] = insertelement <2 x i32> poison, i32 [[SUB59_1]], i32 0
-; CHECK-NEXT:    [[TMP188:%.*]] = shufflevector <2 x i32> [[TMP187]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP189:%.*]] = extractelement <2 x i32> [[TMP181]], i32 0
-; CHECK-NEXT:    [[TMP190:%.*]] = extractelement <2 x i32> [[TMP181]], i32 1
-; CHECK-NEXT:    [[ADD78_1:%.*]] = add i32 [[TMP189]], [[TMP190]]
-; CHECK-NEXT:    [[TMP191:%.*]] = shufflevector <2 x i32> [[TMP33]], <2 x i32> [[TMP176]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP192:%.*]] = lshr <2 x i32> [[TMP191]], <i32 15, i32 15>
-; CHECK-NEXT:    [[TMP193:%.*]] = and <2 x i32> [[TMP192]], <i32 65537, i32 65537>
-; CHECK-NEXT:    [[TMP194:%.*]] = mul <2 x i32> [[TMP193]], <i32 65535, i32 65535>
-; CHECK-NEXT:    [[TMP195:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_1]], i32 0
-; CHECK-NEXT:    [[TMP196:%.*]] = shufflevector <2 x i32> [[TMP195]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP197:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_2]], i32 0
-; CHECK-NEXT:    [[TMP198:%.*]] = shufflevector <2 x i32> [[TMP197]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP199:%.*]] = insertelement <2 x i32> poison, i32 [[ADD44]], i32 0
-; CHECK-NEXT:    [[TMP200:%.*]] = shufflevector <2 x i32> [[TMP199]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP201:%.*]] = insertelement <2 x i32> <i32 15, i32 poison>, i32 [[ADD46]], i32 1
-; CHECK-NEXT:    [[TMP202:%.*]] = lshr <2 x i32> [[TMP200]], [[TMP201]]
-; CHECK-NEXT:    [[TMP203:%.*]] = sub <2 x i32> [[TMP200]], [[TMP201]]
-; CHECK-NEXT:    [[TMP204:%.*]] = shufflevector <2 x i32> [[TMP202]], <2 x i32> [[TMP203]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP205:%.*]] = extractelement <2 x i32> [[TMP204]], i32 1
-; CHECK-NEXT:    [[ADD78_2:%.*]] = add i32 [[SUB51_1]], [[TMP205]]
-; CHECK-NEXT:    [[TMP206:%.*]] = insertelement <2 x i32> <i32 65537, i32 poison>, i32 [[SUB51_1]], i32 1
-; CHECK-NEXT:    [[TMP207:%.*]] = and <2 x i32> [[TMP204]], [[TMP206]]
-; CHECK-NEXT:    [[TMP208:%.*]] = sub <2 x i32> [[TMP204]], [[TMP206]]
-; CHECK-NEXT:    [[TMP209:%.*]] = shufflevector <2 x i32> [[TMP207]], <2 x i32> [[TMP208]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP210:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_2]], i32 0
-; CHECK-NEXT:    [[TMP211:%.*]] = shufflevector <2 x i32> [[TMP210]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP212:%.*]] = add <2 x i32> [[TMP211]], [[TMP198]]
-; CHECK-NEXT:    [[TMP213:%.*]] = sub <2 x i32> [[TMP211]], [[TMP198]]
-; CHECK-NEXT:    [[TMP214:%.*]] = shufflevector <2 x i32> [[TMP212]], <2 x i32> [[TMP213]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP215:%.*]] = insertelement <2 x i32> [[TMP133]], i32 [[CONV_1]], i32 0
-; CHECK-NEXT:    [[TMP216:%.*]] = lshr <2 x i32> [[TMP215]], <i32 15, i32 15>
-; CHECK-NEXT:    [[TMP217:%.*]] = and <2 x i32> [[TMP216]], <i32 65537, i32 65537>
-; CHECK-NEXT:    [[TMP218:%.*]] = mul <2 x i32> [[TMP217]], <i32 65535, i32 65535>
-; CHECK-NEXT:    [[TMP219:%.*]] = shufflevector <2 x i32> [[TMP87]], <2 x i32> poison, <2 x i32> <i32 1, i32 poison>
-; CHECK-NEXT:    [[TMP220:%.*]] = shufflevector <2 x i32> [[TMP219]], <2 x i32> [[TMP181]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP221:%.*]] = shufflevector <2 x i32> [[TMP87]], <2 x i32> [[TMP181]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[TMP222:%.*]] = sub <2 x i32> [[TMP220]], [[TMP221]]
-; CHECK-NEXT:    [[TMP223:%.*]] = shufflevector <2 x i32> [[TMP47]], <2 x i32> poison, <2 x i32> <i32 1, i32 poison>
-; CHECK-NEXT:    [[TMP224:%.*]] = insertelement <2 x i32> [[TMP223]], i32 [[ADD46]], i32 1
-; CHECK-NEXT:    [[TMP225:%.*]] = insertelement <2 x i32> [[TMP47]], i32 [[ADD44]], i32 1
-; CHECK-NEXT:    [[TMP226:%.*]] = add <2 x i32> [[TMP224]], [[TMP225]]
-; CHECK-NEXT:    [[TMP227:%.*]] = shufflevector <2 x i32> [[TMP79]], <2 x i32> [[TMP175]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP228:%.*]] = shufflevector <2 x i32> [[TMP79]], <2 x i32> [[TMP175]], <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT:    [[TMP229:%.*]] = add <2 x i32> [[TMP227]], [[TMP228]]
-; CHECK-NEXT:    [[TMP230:%.*]] = extractelement <2 x i32> [[TMP226]], i32 0
-; CHECK-NEXT:    [[TMP231:%.*]] = extractelement <2 x i32> [[TMP229]], i32 0
-; CHECK-NEXT:    [[ADD94_1:%.*]] = add i32 [[TMP231]], [[TMP230]]
-; CHECK-NEXT:    [[TMP232:%.*]] = insertelement <2 x i32> [[TMP14]], i32 [[ADD46]], i32 1
-; CHECK-NEXT:    [[TMP233:%.*]] = lshr <2 x i32> [[TMP232]], <i32 15, i32 15>
-; CHECK-NEXT:    [[TMP234:%.*]] = and <2 x i32> [[TMP233]], <i32 65537, i32 65537>
-; CHECK-NEXT:    [[TMP235:%.*]] = mul <2 x i32> [[TMP234]], <i32 65535, i32 65535>
-; CHECK-NEXT:    [[TMP236:%.*]] = extractelement <2 x i32> [[TMP226]], i32 1
-; CHECK-NEXT:    [[TMP237:%.*]] = extractelement <2 x i32> [[TMP229]], i32 1
-; CHECK-NEXT:    [[ADD78:%.*]] = add i32 [[TMP237]], [[TMP236]]
-; CHECK-NEXT:    [[TMP238:%.*]] = sub <2 x i32> [[TMP226]], [[TMP229]]
+; CHECK-NEXT:    [[TMP185:%.*]] = lshr <2 x i32> [[TMP143]], <i32 15, i32 15>
+; CHECK-NEXT:    [[TMP186:%.*]] = and <2 x i32> [[TMP185]], <i32 65537, i32 65537>
+; CHECK-NEXT:    [[TMP187:%.*]] = mul <2 x i32> [[TMP186]], <i32 65535, i32 65535>
+; CHECK-NEXT:    [[TMP188:%.*]] = insertelement <2 x i32> poison, i32 [[SUB59_1]], i32 0
+; CHECK-NEXT:    [[TMP189:%.*]] = shufflevector <2 x i32> [[TMP188]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP190:%.*]] = extractelement <2 x i32> [[TMP182]], i32 0
+; CHECK-NEXT:    [[TMP191:%.*]] = extractelement <2 x i32> [[TMP182]], i32 1
+; CHECK-NEXT:    [[ADD78_1:%.*]] = add i32 [[TMP190]], [[TMP191]]
+; CHECK-NEXT:    [[TMP192:%.*]] = shufflevector <2 x i32> [[TMP33]], <2 x i32> [[TMP177]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP193:%.*]] = lshr <2 x i32> [[TMP192]], <i32 15, i32 15>
+; CHECK-NEXT:    [[TMP194:%.*]] = and <2 x i32> [[TMP193]], <i32 65537, i32 65537>
+; CHECK-NEXT:    [[TMP195:%.*]] = mul <2 x i32> [[TMP194]], <i32 65535, i32 65535>
+; CHECK-NEXT:    [[TMP196:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_1]], i32 0
+; CHECK-NEXT:    [[TMP197:%.*]] = shufflevector <2 x i32> [[TMP196]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP198:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_2]], i32 0
+; CHECK-NEXT:    [[TMP199:%.*]] = shufflevector <2 x i32> [[TMP198]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP200:%.*]] = insertelement <2 x i32> poison, i32 [[ADD44]], i32 0
+; CHECK-NEXT:    [[TMP201:%.*]] = shufflevector <2 x i32> [[TMP200]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP202:%.*]] = insertelement <2 x i32> <i32 15, i32 poison>, i32 [[ADD46]], i32 1
+; CHECK-NEXT:    [[TMP203:%.*]] = lshr <2 x i32> [[TMP201]], [[TMP202]]
+; CHECK-NEXT:    [[TMP204:%.*]] = sub <2 x i32> [[TMP201]], [[TMP202]]
+; CHECK-NEXT:    [[TMP205:%.*]] = shufflevector <2 x i32> [[TMP203]], <2 x i32> [[TMP204]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP206:%.*]] = extractelement <2 x i32> [[TMP205]], i32 1
+; CHECK-NEXT:    [[ADD78_2:%.*]] = add i32 [[SUB51_1]], [[TMP206]]
+; CHECK-NEXT:    [[TMP207:%.*]] = insertelement <2 x i32> <i32 65537, i32 poison>, i32 [[SUB51_1]], i32 1
+; CHECK-NEXT:    [[TMP208:%.*]] = and <2 x i32> [[TMP205]], [[TMP207]]
+; CHECK-NEXT:    [[TMP209:%.*]] = sub <2 x i32> [[TMP205]], [[TMP207]]
+; CHECK-NEXT:    [[TMP210:%.*]] = shufflevector <2 x i32> [[TMP208]], <2 x i32> [[TMP209]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP211:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_2]], i32 0
+; CHECK-NEXT:    [[TMP212:%.*]] = shufflevector <2 x i32> [[TMP211]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP213:%.*]] = add <2 x i32> [[TMP212]], [[TMP199]]
+; CHECK-NEXT:    [[TMP214:%.*]] = sub <2 x i32> [[TMP212]], [[TMP199]]
+; CHECK-NEXT:    [[TMP215:%.*]] = shufflevector <2 x i32> [[TMP213]], <2 x i32> [[TMP214]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP216:%.*]] = insertelement <2 x i32> [[TMP134]], i32 [[CONV_1]], i32 0
+; CHECK-NEXT:    [[TMP217:%.*]] = lshr <2 x i32> [[TMP216]], <i32 15, i32 15>
+; CHECK-NEXT:    [[TMP218:%.*]] = and <2 x i32> [[TMP217]], <i32 65537, i32 65537>
+; CHECK-NEXT:    [[TMP219:%.*]] = mul <2 x i32> [[TMP218]], <i32 65535, i32 65535>
+; CHECK-NEXT:    [[TMP220:%.*]] = shufflevector <2 x i32> [[TMP88]], <2 x i32> poison, <2 x i32> <i32 1, i32 poison>
+; CHECK-NEXT:    [[TMP221:%.*]] = shufflevector <2 x i32> [[TMP220]], <2 x i32> [[TMP182]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP222:%.*]] = shufflevector <2 x i32> [[TMP88]], <2 x i32> [[TMP182]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP223:%.*]] = sub <2 x i32> [[TMP221]], [[TMP222]]
+; CHECK-NEXT:    [[TMP224:%.*]] = shufflevector <2 x i32> [[TMP47]], <2 x i32> poison, <2 x i32> <i32 1, i32 poison>
+; CHECK-NEXT:    [[TMP225:%.*]] = insertelement <2 x i32> [[TMP224]], i32 [[ADD46]], i32 1
+; CHECK-NEXT:    [[TMP226:%.*]] = insertelement <2 x i32> [[TMP47]], i32 [[ADD44]], i32 1
+; CHECK-NEXT:    [[TMP227:%.*]] = add <2 x i32> [[TMP225]], [[TMP226]]
+; CHECK-NEXT:    [[TMP228:%.*]] = shufflevector <2 x i32> [[TMP80]], <2 x i32> [[TMP176]], <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP229:%.*]] = shufflevector <2 x i32> [[TMP80]], <2 x i32> [[TMP176]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP230:%.*]] = add <2 x i32> [[TMP228]], [[TMP229]]
+; CHECK-NEXT:    [[TMP231:%.*]] = extractelement <2 x i32> [[TMP227]], i32 0
+; CHECK-NEXT:    [[TMP232:%.*]] = extractelement <2 x i32> [[TMP230]], i32 0
+; CHECK-NEXT:    [[ADD94_1:%.*]] = add i32 [[TMP232]], [[TMP231]]
+; CHECK-NEXT:    [[TMP233:%.*]] = insertelement <2 x i32> [[TMP14]], i32 [[ADD46]], i32 1
+; CHECK-NEXT:    [[TMP234:%.*]] = lshr <2 x i32> [[TMP233]], <i32 15, i32 15>
+; CHECK-NEXT:    [[TMP235:%.*]] = and <2 x i32> [[TMP234]], <i32 65537, i32 65537>
+; CHECK-NEXT:    [[TMP236:%.*]] = mul <2 x i32> [[TMP235]], <i32 65535, i32 65535>
+; CHECK-NEXT:    [[TMP237:%.*]] = extractelement <2 x i32> [[TMP227]], i32 1
+; CHECK-NEXT:    [[TMP238:%.*]] = extractelement <2 x i32> [[TMP230]], i32 1
+; CHECK-NEXT:    [[ADD78:%.*]] = add i32 [[TMP238]], [[TMP237]]
+; CHECK-NEXT:    [[TMP239:%.*]] = sub <2 x i32> [[TMP227]], [[TMP230]]
 ; CHECK-NEXT:    [[ADD103:%.*]] = add i32 [[ADD94]], [[ADD78]]
 ; CHECK-NEXT:    [[SUB104:%.*]] = sub i32 [[ADD78]], [[ADD94]]
-; CHECK-NEXT:    [[TMP239:%.*]] = extractelement <2 x i32> [[TMP238]], i32 1
-; CHECK-NEXT:    [[ADD105:%.*]] = add i32 [[SUB102]], [[TMP239]]
+; CHECK-NEXT:    [[TMP240:%.*]] = extractelement <2 x i32> [[TMP239]], i32 1
+; CHECK-NEXT:    [[ADD105:%.*]] = add i32 [[SUB102]], [[TMP240]]
 ; CHECK-NEXT:    [[ADD_I:%.*]] = add i32 [[MUL_I]], [[ADD103]]
-; CHECK-NEXT:    [[XOR_I:%.*]] = xor i32 [[ADD_I]], [[TMP92]]
+; CHECK-NEXT:    [[XOR_I:%.*]] = xor i32 [[ADD_I]], [[TMP93]]
 ; CHECK-NEXT:    [[ADD_I52:%.*]] = add i32 [[MUL_I51]], [[ADD105]]
-; CHECK-NEXT:    [[XOR_I53:%.*]] = xor i32 [[ADD_I52]], [[TMP93]]
+; CHECK-NEXT:    [[XOR_I53:%.*]] = xor i32 [[ADD_I52]], [[TMP94]]
 ; CHECK-NEXT:    [[ADD_I57:%.*]] = add i32 [[MUL_I56]], [[SUB104]]
-; CHECK-NEXT:    [[XOR_I58:%.*]] = xor i32 [[ADD_I57]], [[TMP178]]
+; CHECK-NEXT:    [[XOR_I58:%.*]] = xor i32 [[ADD_I57]], [[TMP179]]
 ; CHECK-NEXT:    [[ADD110:%.*]] = add i32 [[XOR_I53]], [[XOR_I]]
 ; CHECK-NEXT:    [[ADD112:%.*]] = add i32 [[ADD110]], [[XOR_I58]]
-; CHECK-NEXT:    [[TMP240:%.*]] = shufflevector <2 x i32> [[TMP222]], <2 x i32> poison, <2 x i32> <i32 1, i32 poison>
-; CHECK-NEXT:    [[TMP241:%.*]] = insertelement <2 x i32> [[TMP240]], i32 [[SUB102]], i32 1
-; CHECK-NEXT:    [[TMP242:%.*]] = add <2 x i32> [[TMP238]], [[TMP241]]
-; CHECK-NEXT:    [[TMP243:%.*]] = sub <2 x i32> [[TMP238]], [[TMP241]]
-; CHECK-NEXT:    [[TMP244:%.*]] = shufflevector <2 x i32> [[TMP242]], <2 x i32> [[TMP243]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP245:%.*]] = add <2 x i32> [[TMP235]], [[TMP244]]
-; CHECK-NEXT:    [[TMP246:%.*]] = xor <2 x i32> [[TMP245]], [[TMP232]]
-; CHECK-NEXT:    [[TMP247:%.*]] = extractelement <2 x i32> [[TMP246]], i32 1
-; CHECK-NEXT:    [[ADD113:%.*]] = add i32 [[ADD112]], [[TMP247]]
-; CHECK-NEXT:    [[TMP248:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_1]], i32 0
-; CHECK-NEXT:    [[TMP249:%.*]] = shufflevector <2 x i32> [[TMP248]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP250:%.*]] = add <2 x i32> [[TMP196]], [[TMP249]]
-; CHECK-NEXT:    [[TMP251:%.*]] = sub <2 x i32> [[TMP196]], [[TMP249]]
-; CHECK-NEXT:    [[TMP252:%.*]] = shufflevector <2 x i32> [[TMP250]], <2 x i32> [[TMP251]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP253:%.*]] = add <2 x i32> [[TMP194]], [[TMP252]]
-; CHECK-NEXT:    [[TMP254:%.*]] = xor <2 x i32> [[TMP253]], [[TMP191]]
-; CHECK-NEXT:    [[TMP255:%.*]] = extractelement <2 x i32> [[TMP246]], i32 0
-; CHECK-NEXT:    [[ADD108_1:%.*]] = add i32 [[TMP255]], [[ADD113]]
-; CHECK-NEXT:    [[TMP256:%.*]] = extractelement <2 x i32> [[TMP254]], i32 0
-; CHECK-NEXT:    [[ADD110_1:%.*]] = add i32 [[ADD108_1]], [[TMP256]]
-; CHECK-NEXT:    [[TMP257:%.*]] = extractelement <2 x i32> [[TMP254]], i32 1
-; CHECK-NEXT:    [[ADD112_1:%.*]] = add i32 [[ADD110_1]], [[TMP257]]
-; CHECK-NEXT:    [[TMP258:%.*]] = shufflevector <2 x i32> [[TMP209]], <2 x i32> poison, <2 x i32> <i32 1, i32 poison>
-; CHECK-NEXT:    [[TMP259:%.*]] = shufflevector <2 x i32> [[TMP258]], <2 x i32> [[TMP238]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[TMP260:%.*]] = add <2 x i32> [[TMP222]], [[TMP259]]
-; CHECK-NEXT:    [[TMP261:%.*]] = sub <2 x i32> [[TMP222]], [[TMP259]]
-; CHECK-NEXT:    [[TMP262:%.*]] = shufflevector <2 x i32> [[TMP260]], <2 x i32> [[TMP261]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP263:%.*]] = add <2 x i32> [[TMP218]], [[TMP262]]
-; CHECK-NEXT:    [[TMP264:%.*]] = xor <2 x i32> [[TMP263]], [[TMP215]]
-; CHECK-NEXT:    [[TMP265:%.*]] = extractelement <2 x i32> [[TMP264]], i32 1
-; CHECK-NEXT:    [[ADD113_1:%.*]] = add i32 [[ADD112_1]], [[TMP265]]
-; CHECK-NEXT:    [[TMP266:%.*]] = shufflevector <2 x i32> <i32 65535, i32 poison>, <2 x i32> [[TMP222]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[TMP267:%.*]] = mul <2 x i32> [[TMP209]], [[TMP266]]
-; CHECK-NEXT:    [[TMP268:%.*]] = sub <2 x i32> [[TMP209]], [[TMP266]]
-; CHECK-NEXT:    [[TMP269:%.*]] = shufflevector <2 x i32> [[TMP267]], <2 x i32> [[TMP268]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP270:%.*]] = add <2 x i32> [[TMP186]], [[TMP214]]
-; CHECK-NEXT:    [[TMP271:%.*]] = xor <2 x i32> [[TMP270]], [[TMP142]]
-; CHECK-NEXT:    [[TMP272:%.*]] = extractelement <2 x i32> [[TMP269]], i32 0
-; CHECK-NEXT:    [[TMP273:%.*]] = extractelement <2 x i32> [[TMP269]], i32 1
-; CHECK-NEXT:    [[ADD_I62_2:%.*]] = add i32 [[TMP272]], [[TMP273]]
+; CHECK-NEXT:    [[TMP241:%.*]] = shufflevector <2 x i32> [[TMP223]], <2 x i32> poison, <2 x i32> <i32 1, i32 poison>
+; CHECK-NEXT:    [[TMP242:%.*]] = insertelement <2 x i32> [[TMP241]], i32 [[SUB102]], i32 1
+; CHECK-NEXT:    [[TMP243:%.*]] = add <2 x i32> [[TMP239]], [[TMP242]]
+; CHECK-NEXT:    [[TMP244:%.*]] = sub <2 x i32> [[TMP239]], [[TMP242]]
+; CHECK-NEXT:    [[TMP245:%.*]] = shufflevector <2 x i32> [[TMP243]], <2 x i32> [[TMP244]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP246:%.*]] = add <2 x i32> [[TMP236]], [[TMP245]]
+; CHECK-NEXT:    [[TMP247:%.*]] = xor <2 x i32> [[TMP246]], [[TMP233]]
+; CHECK-NEXT:    [[TMP248:%.*]] = extractelement <2 x i32> [[TMP247]], i32 1
+; CHECK-NEXT:    [[ADD113:%.*]] = add i32 [[ADD112]], [[TMP248]]
+; CHECK-NEXT:    [[TMP249:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_1]], i32 0
+; CHECK-NEXT:    [[TMP250:%.*]] = shufflevector <2 x i32> [[TMP249]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP251:%.*]] = add <2 x i32> [[TMP197]], [[TMP250]]
+; CHECK-NEXT:    [[TMP252:%.*]] = sub <2 x i32> [[TMP197]], [[TMP250]]
+; CHECK-NEXT:    [[TMP253:%.*]] = shufflevector <2 x i32> [[TMP251]], <2 x i32> [[TMP252]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP254:%.*]] = add <2 x i32> [[TMP195]], [[TMP253]]
+; CHECK-NEXT:    [[TMP255:%.*]] = xor <2 x i32> [[TMP254]], [[TMP192]]
+; CHECK-NEXT:    [[TMP256:%.*]] = extractelement <2 x i32> [[TMP247]], i32 0
+; CHECK-NEXT:    [[ADD108_1:%.*]] = add i32 [[TMP256]], [[ADD113]]
+; CHECK-NEXT:    [[TMP257:%.*]] = extractelement <2 x i32> [[TMP255]], i32 0
+; CHECK-NEXT:    [[ADD110_1:%.*]] = add i32 [[ADD108_1]], [[TMP257]]
+; CHECK-NEXT:    [[TMP258:%.*]] = extractelement <2 x i32> [[TMP255]], i32 1
+; CHECK-NEXT:    [[ADD112_1:%.*]] = add i32 [[ADD110_1]], [[TMP258]]
+; CHECK-NEXT:    [[TMP259:%.*]] = shufflevector <2 x i32> [[TMP210]], <2 x i32> poison, <2 x i32> <i32 1, i32 poison>
+; CHECK-NEXT:    [[TMP260:%.*]] = shufflevector <2 x i32> [[TMP259]], <2 x i32> [[TMP239]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP261:%.*]] = add <2 x i32> [[TMP223]], [[TMP260]]
+; CHECK-NEXT:    [[TMP262:%.*]] = sub <2 x i32> [[TMP223]], [[TMP260]]
+; CHECK-NEXT:    [[TMP263:%.*]] = shufflevector <2 x i32> [[TMP261]], <2 x i32> [[TMP262]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP264:%.*]] = add <2 x i32> [[TMP219]], [[TMP263]]
+; CHECK-NEXT:    [[TMP265:%.*]] = xor <2 x i32> [[TMP264]], [[TMP216]]
+; CHECK-NEXT:    [[TMP266:%.*]] = extractelement <2 x i32> [[TMP265]], i32 1
+; CHECK-NEXT:    [[ADD113_1:%.*]] = add i32 [[ADD112_1]], [[TMP266]]
+; CHECK-NEXT:    [[TMP267:%.*]] = shufflevector <2 x i32> <i32 65535, i32 poison>, <2 x i32> [[TMP223]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP268:%.*]] = mul <2 x i32> [[TMP210]], [[TMP267]]
+; CHECK-NEXT:    [[TMP269:%.*]] = sub <2 x i32> [[TMP210]], [[TMP267]]
+; CHECK-NEXT:    [[TMP270:%.*]] = shufflevector <2 x i32> [[TMP268]], <2 x i32> [[TMP269]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP271:%.*]] = add <2 x i32> [[TMP187]], [[TMP215]]
+; CHECK-NEXT:    [[TMP272:%.*]] = xor <2 x i32> [[TMP271]], [[TMP143]]
+; CHECK-NEXT:    [[TMP273:%.*]] = extractelement <2 x i32> [[TMP270]], i32 0
+; CHECK-NEXT:    [[TMP274:%.*]] = extractelement <2 x i32> [[TMP270]], i32 1
+; CHECK-NEXT:    [[ADD_I62_2:%.*]] = add i32 [[TMP273]], [[TMP274]]
 ; CHECK-NEXT:    [[XOR_I63_2:%.*]] = xor i32 [[ADD_I62_2]], [[ADD44]]
-; CHECK-NEXT:    [[TMP274:%.*]] = extractelement <2 x i32> [[TMP264]], i32 0
-; CHECK-NEXT:    [[ADD108_2:%.*]] = add i32 [[TMP274]], [[ADD113_1]]
-; CHECK-NEXT:    [[TMP275:%.*]] = extractelement <2 x i32> [[TMP271]], i32 0
-; CHECK-NEXT:    [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[TMP275]]
-; CHECK-NEXT:    [[TMP276:%.*]] = extractelement <2 x i32> [[TMP271]], i32 1
-; CHECK-NEXT:    [[ADD112_2:%.*]] = add i32 [[ADD110_2]], [[TMP276]]
+; CHECK-NEXT:    [[TMP275:%.*]] = extractelement <2 x i32> [[TMP265]], i32 0
+; CHECK-NEXT:    [[ADD108_2:%.*]] = add i32 [[TMP275]], [[ADD113_1]]
+; CHECK-NEXT:    [[TMP276:%.*]] = extractelement <2 x i32> [[TMP272]], i32 0
+; CHECK-NEXT:    [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[TMP276]]
+; CHECK-NEXT:    [[TMP277:%.*]] = extractelement <2 x i32> [[TMP272]], i32 1
+; CHECK-NEXT:    [[ADD112_2:%.*]] = add i32 [[ADD110_2]], [[TMP277]]
 ; CHECK-NEXT:    [[ADD113_2:%.*]] = add i32 [[ADD112_2]], [[XOR_I63_2]]
-; CHECK-NEXT:    [[TMP277:%.*]] = insertelement <2 x i32> poison, i32 [[SUB59]], i32 0
-; CHECK-NEXT:    [[TMP278:%.*]] = shufflevector <2 x i32> [[TMP277]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP279:%.*]] = add <2 x i32> [[TMP278]], [[TMP188]]
-; CHECK-NEXT:    [[TMP280:%.*]] = sub <2 x i32> [[TMP278]], [[TMP188]]
-; CHECK-NEXT:    [[TMP281:%.*]] = shufflevector <2 x i32> [[TMP279]], <2 x i32> [[TMP280]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP282:%.*]] = add <2 x i32> [[TMP104]], [[TMP281]]
-; CHECK-NEXT:    [[TMP283:%.*]] = sub <2 x i32> [[TMP281]], [[TMP104]]
-; CHECK-NEXT:    [[TMP284:%.*]] = add <2 x i32> [[TMP138]], [[TMP282]]
-; CHECK-NEXT:    [[TMP285:%.*]] = xor <2 x i32> [[TMP284]], [[TMP107]]
-; CHECK-NEXT:    [[TMP286:%.*]] = lshr <2 x i32> [[TMP97]], <i32 15, i32 15>
-; CHECK-NEXT:    [[TMP287:%.*]] = and <2 x i32> [[TMP286]], <i32 65537, i32 65537>
-; CHECK-NEXT:    [[TMP288:%.*]] = mul <2 x i32> [[TMP287]], <i32 65535, i32 65535>
-; CHECK-NEXT:    [[TMP289:%.*]] = add <2 x i32> [[TMP288]], [[TMP283]]
-; CHECK-NEXT:    [[TMP290:%.*]] = xor <2 x i32> [[TMP289]], [[TMP97]]
-; CHECK-NEXT:    [[TMP291:%.*]] = extractelement <2 x i32> [[TMP285]], i32 1
-; CHECK-NEXT:    [[ADD108_3:%.*]] = add i32 [[TMP291]], [[ADD113_2]]
-; CHECK-NEXT:    [[TMP292:%.*]] = extractelement <2 x i32> [[TMP285]], i32 0
-; CHECK-NEXT:    [[ADD110_3:%.*]] = add i32 [[ADD108_3]], [[TMP292]]
-; CHECK-NEXT:    [[TMP293:%.*]] = extractelement <2 x i32> [[TMP290]], i32 0
-; CHECK-NEXT:    [[ADD112_3:%.*]] = add i32 [[ADD110_3]], [[TMP293]]
-; CHECK-NEXT:    [[TMP294:%.*]] = extractelement <2 x i32> [[TMP290]], i32 1
-; CHECK-NEXT:    [[ADD113_3:%.*]] = add i32 [[ADD112_3]], [[TMP294]]
+; CHECK-NEXT:    [[TMP278:%.*]] = insertelement <2 x i32> poison, i32 [[SUB59]], i32 0
+; CHECK-NEXT:    [[TMP279:%.*]] = shufflevector <2 x i32> [[TMP278]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP280:%.*]] = add <2 x i32> [[TMP279]], [[TMP189]]
+; CHECK-NEXT:    [[TMP281:%.*]] = sub <2 x i32> [[TMP279]], [[TMP189]]
+; CHECK-NEXT:    [[TMP282:%.*]] = shufflevector <2 x i32> [[TMP280]], <2 x i32> [[TMP281]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP283:%.*]] = add <2 x i32> [[TMP105]], [[TMP282]]
+; CHECK-NEXT:    [[TMP284:%.*]] = sub <2 x i32> [[TMP282]], [[TMP105]]
+; CHECK-NEXT:    [[TMP285:%.*]] = add <2 x i32> [[TMP139]], [[TMP283]]
+; CHECK-NEXT:    [[TMP286:%.*]] = xor <2 x i32> [[TMP285]], [[TMP108]]
+; CHECK-NEXT:    [[TMP287:%.*]] = lshr <2 x i32> [[TMP98]], <i32 15, i32 15>
+; CHECK-NEXT:    [[TMP288:%.*]] = and <2 x i32> [[TMP287]], <i32 65537, i32 65537>
+; CHECK-NEXT:    [[TMP289:%.*]] = mul <2 x i32> [[TMP288]], <i32 65535, i32 65535>
+; CHECK-NEXT:    [[TMP290:%.*]] = add <2 x i32> [[TMP289]], [[TMP284]]
+; CHECK-NEXT:    [[TMP291:%.*]] = xor <2 x i32> [[TMP290]], [[TMP98]]
+; CHECK-NEXT:    [[TMP292:%.*]] = extractelement <2 x i32> [[TMP286]], i32 1
+; CHECK-NEXT:    [[ADD108_3:%.*]] = add i32 [[TMP292]], [[ADD113_2]]
+; CHECK-NEXT:    [[TMP293:%.*]] = extractelement <2 x i32> [[TMP286]], i32 0
+; CHECK-NEXT:    [[ADD110_3:%.*]] = add i32 [[ADD108_3]], [[TMP293]]
+; CHECK-NEXT:    [[TMP294:%.*]] = extractelement <2 x i32> [[TMP291]], i32 0
+; CHECK-NEXT:    [[ADD112_3:%.*]] = add i32 [[ADD110_3]], [[TMP294]]
+; CHECK-NEXT:    [[TMP295:%.*]] = extractelement <2 x i32> [[TMP291]], i32 1
+; CHECK-NEXT:    [[ADD113_3:%.*]] = add i32 [[ADD112_3]], [[TMP295]]
 ; CHECK-NEXT:    ret i32 [[ADD113_3]]
 ;
 entry:

diff  --git a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-vectorized.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-vectorized.ll
index 4b0b41970bbb4d..a4cc311d12a217 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-vectorized.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-vectorized.ll
@@ -5,12 +5,61 @@ define void @test(ptr %p, ptr noalias %s) {
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [48 x float], ptr [[P:%.*]], i64 0, i64 0
+; CHECK-NEXT:    [[I:%.*]] = load float, ptr [[ARRAYIDX]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 30
+; CHECK-NEXT:    [[I1:%.*]] = load float, ptr [[ARRAYIDX1]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = fsub fast float [[I1]], [[I]]
 ; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[S:%.*]], i64 0
-; CHECK-NEXT:    [[TMP0:%.*]] = call <8 x float> @llvm.experimental.vp.strided.load.v8f32.p0.i64(ptr align 4 [[ARRAYIDX]], i64 16, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
-; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.experimental.vp.strided.load.v8f32.p0.i64(ptr align 4 [[ARRAYIDX1]], i64 -16, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
-; CHECK-NEXT:    [[TMP2:%.*]] = fsub fast <8 x float> [[TMP1]], [[TMP0]]
-; CHECK-NEXT:    store <8 x float> [[TMP2]], ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    store float [[ADD]], ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 4
+; CHECK-NEXT:    [[I2:%.*]] = load float, ptr [[ARRAYIDX4]], align 4
+; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 26
+; CHECK-NEXT:    [[I3:%.*]] = load float, ptr [[ARRAYIDX6]], align 4
+; CHECK-NEXT:    [[ADD7:%.*]] = fsub fast float [[I3]], [[I2]]
+; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds float, ptr [[S]], i64 1
+; CHECK-NEXT:    store float [[ADD7]], ptr [[ARRAYIDX9]], align 4
+; CHECK-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 8
+; CHECK-NEXT:    [[I4:%.*]] = load float, ptr [[ARRAYIDX11]], align 4
+; CHECK-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 22
+; CHECK-NEXT:    [[I5:%.*]] = load float, ptr [[ARRAYIDX13]], align 4
+; CHECK-NEXT:    [[ADD14:%.*]] = fsub fast float [[I5]], [[I4]]
+; CHECK-NEXT:    [[ARRAYIDX16:%.*]] = getelementptr inbounds float, ptr [[S]], i64 2
+; CHECK-NEXT:    store float [[ADD14]], ptr [[ARRAYIDX16]], align 4
+; CHECK-NEXT:    [[ARRAYIDX18:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 12
+; CHECK-NEXT:    [[I6:%.*]] = load float, ptr [[ARRAYIDX18]], align 4
+; CHECK-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 18
+; CHECK-NEXT:    [[I7:%.*]] = load float, ptr [[ARRAYIDX20]], align 4
+; CHECK-NEXT:    [[ADD21:%.*]] = fsub fast float [[I7]], [[I6]]
+; CHECK-NEXT:    [[ARRAYIDX23:%.*]] = getelementptr inbounds float, ptr [[S]], i64 3
+; CHECK-NEXT:    store float [[ADD21]], ptr [[ARRAYIDX23]], align 4
+; CHECK-NEXT:    [[ARRAYIDX25:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 16
+; CHECK-NEXT:    [[I8:%.*]] = load float, ptr [[ARRAYIDX25]], align 4
+; CHECK-NEXT:    [[ARRAYIDX27:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 14
+; CHECK-NEXT:    [[I9:%.*]] = load float, ptr [[ARRAYIDX27]], align 4
+; CHECK-NEXT:    [[ADD28:%.*]] = fsub fast float [[I9]], [[I8]]
+; CHECK-NEXT:    [[ARRAYIDX30:%.*]] = getelementptr inbounds float, ptr [[S]], i64 4
+; CHECK-NEXT:    store float [[ADD28]], ptr [[ARRAYIDX30]], align 4
+; CHECK-NEXT:    [[ARRAYIDX32:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 20
+; CHECK-NEXT:    [[I10:%.*]] = load float, ptr [[ARRAYIDX32]], align 4
+; CHECK-NEXT:    [[ARRAYIDX34:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 10
+; CHECK-NEXT:    [[I11:%.*]] = load float, ptr [[ARRAYIDX34]], align 4
+; CHECK-NEXT:    [[ADD35:%.*]] = fsub fast float [[I11]], [[I10]]
+; CHECK-NEXT:    [[ARRAYIDX37:%.*]] = getelementptr inbounds float, ptr [[S]], i64 5
+; CHECK-NEXT:    store float [[ADD35]], ptr [[ARRAYIDX37]], align 4
+; CHECK-NEXT:    [[ARRAYIDX39:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 24
+; CHECK-NEXT:    [[I12:%.*]] = load float, ptr [[ARRAYIDX39]], align 4
+; CHECK-NEXT:    [[ARRAYIDX41:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 6
+; CHECK-NEXT:    [[I13:%.*]] = load float, ptr [[ARRAYIDX41]], align 4
+; CHECK-NEXT:    [[ADD42:%.*]] = fsub fast float [[I13]], [[I12]]
+; CHECK-NEXT:    [[ARRAYIDX44:%.*]] = getelementptr inbounds float, ptr [[S]], i64 6
+; CHECK-NEXT:    store float [[ADD42]], ptr [[ARRAYIDX44]], align 4
+; CHECK-NEXT:    [[ARRAYIDX46:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 28
+; CHECK-NEXT:    [[I14:%.*]] = load float, ptr [[ARRAYIDX46]], align 4
+; CHECK-NEXT:    [[ARRAYIDX48:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 2
+; CHECK-NEXT:    [[I15:%.*]] = load float, ptr [[ARRAYIDX48]], align 4
+; CHECK-NEXT:    [[ADD49:%.*]] = fsub fast float [[I15]], [[I14]]
+; CHECK-NEXT:    [[ARRAYIDX51:%.*]] = getelementptr inbounds float, ptr [[S]], i64 7
+; CHECK-NEXT:    store float [[ADD49]], ptr [[ARRAYIDX51]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -213,40 +262,67 @@ define void @test2(ptr %p, ptr noalias %s, i32 %stride) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[STR:%.*]] = zext i32 [[STRIDE:%.*]] to i64
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [48 x float], ptr [[P:%.*]], i64 0, i64 2
+; CHECK-NEXT:    [[I:%.*]] = load float, ptr [[ARRAYIDX]], align 4
 ; CHECK-NEXT:    [[ST6:%.*]] = mul i64 [[STR]], 7
 ; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[ST6]]
 ; CHECK-NEXT:    [[I1:%.*]] = load float, ptr [[ARRAYIDX1]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = fsub fast float [[I1]], [[I]]
 ; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[S:%.*]], i64 0
+; CHECK-NEXT:    store float [[ADD]], ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 6
+; CHECK-NEXT:    [[I2:%.*]] = load float, ptr [[ARRAYIDX4]], align 4
 ; CHECK-NEXT:    [[ST5:%.*]] = mul i64 [[STR]], 6
 ; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[ST5]]
 ; CHECK-NEXT:    [[I3:%.*]] = load float, ptr [[ARRAYIDX6]], align 4
+; CHECK-NEXT:    [[ADD7:%.*]] = fsub fast float [[I3]], [[I2]]
+; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds float, ptr [[S]], i64 1
+; CHECK-NEXT:    store float [[ADD7]], ptr [[ARRAYIDX9]], align 4
+; CHECK-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 10
+; CHECK-NEXT:    [[I4:%.*]] = load float, ptr [[ARRAYIDX11]], align 4
 ; CHECK-NEXT:    [[ST4:%.*]] = mul i64 [[STR]], 5
 ; CHECK-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[ST4]]
 ; CHECK-NEXT:    [[I5:%.*]] = load float, ptr [[ARRAYIDX13]], align 4
+; CHECK-NEXT:    [[ADD14:%.*]] = fsub fast float [[I5]], [[I4]]
+; CHECK-NEXT:    [[ARRAYIDX16:%.*]] = getelementptr inbounds float, ptr [[S]], i64 2
+; CHECK-NEXT:    store float [[ADD14]], ptr [[ARRAYIDX16]], align 4
+; CHECK-NEXT:    [[ARRAYIDX18:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 14
+; CHECK-NEXT:    [[I6:%.*]] = load float, ptr [[ARRAYIDX18]], align 4
 ; CHECK-NEXT:    [[ST3:%.*]] = mul i64 [[STR]], 4
 ; CHECK-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[ST3]]
 ; CHECK-NEXT:    [[I7:%.*]] = load float, ptr [[ARRAYIDX20]], align 4
+; CHECK-NEXT:    [[ADD21:%.*]] = fsub fast float [[I7]], [[I6]]
+; CHECK-NEXT:    [[ARRAYIDX23:%.*]] = getelementptr inbounds float, ptr [[S]], i64 3
+; CHECK-NEXT:    store float [[ADD21]], ptr [[ARRAYIDX23]], align 4
+; CHECK-NEXT:    [[ARRAYIDX25:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 18
 ; CHECK-NEXT:    [[ST2:%.*]] = mul i64 [[STR]], 3
+; CHECK-NEXT:    [[I8:%.*]] = load float, ptr [[ARRAYIDX25]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX27:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[ST2]]
 ; CHECK-NEXT:    [[I9:%.*]] = load float, ptr [[ARRAYIDX27]], align 4
+; CHECK-NEXT:    [[ADD28:%.*]] = fsub fast float [[I9]], [[I8]]
+; CHECK-NEXT:    [[ARRAYIDX30:%.*]] = getelementptr inbounds float, ptr [[S]], i64 4
+; CHECK-NEXT:    store float [[ADD28]], ptr [[ARRAYIDX30]], align 4
+; CHECK-NEXT:    [[ARRAYIDX32:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 22
+; CHECK-NEXT:    [[I10:%.*]] = load float, ptr [[ARRAYIDX32]], align 4
 ; CHECK-NEXT:    [[ST1:%.*]] = mul i64 [[STR]], 2
 ; CHECK-NEXT:    [[ARRAYIDX34:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[ST1]]
 ; CHECK-NEXT:    [[I11:%.*]] = load float, ptr [[ARRAYIDX34]], align 4
+; CHECK-NEXT:    [[ADD35:%.*]] = fsub fast float [[I11]], [[I10]]
+; CHECK-NEXT:    [[ARRAYIDX37:%.*]] = getelementptr inbounds float, ptr [[S]], i64 5
+; CHECK-NEXT:    store float [[ADD35]], ptr [[ARRAYIDX37]], align 4
+; CHECK-NEXT:    [[ARRAYIDX39:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 26
+; CHECK-NEXT:    [[I12:%.*]] = load float, ptr [[ARRAYIDX39]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX41:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[STR]]
 ; CHECK-NEXT:    [[I13:%.*]] = load float, ptr [[ARRAYIDX41]], align 4
+; CHECK-NEXT:    [[ADD42:%.*]] = fsub fast float [[I13]], [[I12]]
+; CHECK-NEXT:    [[ARRAYIDX44:%.*]] = getelementptr inbounds float, ptr [[S]], i64 6
+; CHECK-NEXT:    store float [[ADD42]], ptr [[ARRAYIDX44]], align 4
+; CHECK-NEXT:    [[ARRAYIDX46:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 30
+; CHECK-NEXT:    [[I14:%.*]] = load float, ptr [[ARRAYIDX46]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX48:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 0
 ; CHECK-NEXT:    [[I15:%.*]] = load float, ptr [[ARRAYIDX48]], align 4
-; CHECK-NEXT:    [[TMP0:%.*]] = call <8 x float> @llvm.experimental.vp.strided.load.v8f32.p0.i64(ptr align 4 [[ARRAYIDX]], i64 16, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x float> poison, float [[I1]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x float> [[TMP1]], float [[I3]], i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x float> [[TMP2]], float [[I5]], i32 2
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x float> [[TMP3]], float [[I7]], i32 3
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x float> [[TMP4]], float [[I9]], i32 4
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x float> [[TMP5]], float [[I11]], i32 5
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x float> [[TMP6]], float [[I13]], i32 6
-; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <8 x float> [[TMP7]], float [[I15]], i32 7
-; CHECK-NEXT:    [[TMP9:%.*]] = fsub fast <8 x float> [[TMP8]], [[TMP0]]
-; CHECK-NEXT:    store <8 x float> [[TMP9]], ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[ADD49:%.*]] = fsub fast float [[I15]], [[I14]]
+; CHECK-NEXT:    [[ARRAYIDX51:%.*]] = getelementptr inbounds float, ptr [[S]], i64 7
+; CHECK-NEXT:    store float [[ADD49]], ptr [[ARRAYIDX51]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -321,12 +397,27 @@ define void @test3(ptr %p, ptr noalias %s) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [48 x float], ptr [[P:%.*]], i64 0, i64 0
 ; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[S:%.*]], i64 0
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 4
+; CHECK-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 8
+; CHECK-NEXT:    [[ARRAYIDX18:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 12
+; CHECK-NEXT:    [[ARRAYIDX25:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 16
+; CHECK-NEXT:    [[ARRAYIDX32:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 20
+; CHECK-NEXT:    [[ARRAYIDX39:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 24
+; CHECK-NEXT:    [[ARRAYIDX46:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 28
 ; CHECK-NEXT:    [[ARRAYIDX48:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 23
-; CHECK-NEXT:    [[TMP0:%.*]] = call <8 x float> @llvm.experimental.vp.strided.load.v8f32.p0.i64(ptr align 4 [[ARRAYIDX]], i64 16, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
-; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[ARRAYIDX48]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[TMP3:%.*]] = fsub fast <8 x float> [[TMP2]], [[TMP0]]
-; CHECK-NEXT:    store <8 x float> [[TMP3]], ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <8 x ptr> poison, ptr [[ARRAYIDX]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x ptr> [[TMP0]], ptr [[ARRAYIDX4]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x ptr> [[TMP1]], ptr [[ARRAYIDX11]], i32 2
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x ptr> [[TMP2]], ptr [[ARRAYIDX18]], i32 3
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x ptr> [[TMP3]], ptr [[ARRAYIDX25]], i32 4
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x ptr> [[TMP4]], ptr [[ARRAYIDX32]], i32 5
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x ptr> [[TMP5]], ptr [[ARRAYIDX39]], i32 6
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x ptr> [[TMP6]], ptr [[ARRAYIDX46]], i32 7
+; CHECK-NEXT:    [[TMP8:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> [[TMP7]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> poison)
+; CHECK-NEXT:    [[TMP9:%.*]] = load <8 x float>, ptr [[ARRAYIDX48]], align 4
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <8 x float> [[TMP9]], <8 x float> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP11:%.*]] = fsub fast <8 x float> [[TMP10]], [[TMP8]]
+; CHECK-NEXT:    store <8 x float> [[TMP11]], ptr [[ARRAYIDX2]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:

diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-with-external-use-ptr.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-with-external-use-ptr.ll
index ec152c707eec6b..5aba9ea115a4b9 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-with-external-use-ptr.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-with-external-use-ptr.ll
@@ -8,7 +8,7 @@ define i16 @test() {
 ; CHECK-SAME: () #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[PPREV_058_I:%.*]] = getelementptr [[S:%.*]], ptr null, i64 -1
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x ptr> <ptr null, ptr poison>, ptr [[PPREV_058_I]], i32 1
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x ptr> <ptr poison, ptr null>, ptr [[PPREV_058_I]], i32 0
 ; CHECK-NEXT:    br label [[WHILE_BODY_I:%.*]]
 ; CHECK:       while.body.i:
 ; CHECK-NEXT:    [[TMP1:%.*]] = phi i16 [ 0, [[WHILE_BODY_I]] ], [ 0, [[ENTRY:%.*]] ]
@@ -17,7 +17,7 @@ define i16 @test() {
 ; CHECK-NEXT:    [[TMP4:%.*]] = call <2 x i16> @llvm.masked.gather.v2i16.v2p0(<2 x ptr> [[TMP3]], i32 2, <2 x i1> <i1 true, i1 true>, <2 x i16> poison)
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i16> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i16> [[TMP4]], i32 1
-; CHECK-NEXT:    [[CMP_I178:%.*]] = icmp ult i16 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[CMP_I178:%.*]] = icmp ult i16 [[TMP6]], [[TMP5]]
 ; CHECK-NEXT:    br label [[WHILE_BODY_I]]
 ;
 entry:

diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads.ll
index 8ab57cc73e646f..8f2c72bb4c6856 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads.ll
@@ -5,11 +5,14 @@ define i32 @sum_of_abs(ptr noalias %a, ptr noalias %b) {
 ; CHECK-LABEL: define i32 @sum_of_abs
 ; CHECK-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = call <8 x i8> @llvm.experimental.vp.strided.load.v8i8.p0.i64(ptr align 1 [[A]], i64 64, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
-; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i8> @llvm.abs.v8i8(<8 x i8> [[TMP0]], i1 false)
-; CHECK-NEXT:    [[TMP2:%.*]] = sext <8 x i8> [[TMP1]] to <8 x i32>
-; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP2]])
-; CHECK-NEXT:    ret i32 [[TMP3]]
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <8 x ptr> poison, ptr [[A]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x ptr> [[TMP0]], <8 x ptr> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, <8 x ptr> [[TMP1]], <8 x i64> <i64 0, i64 64, i64 128, i64 192, i64 256, i64 320, i64 384, i64 448>
+; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> [[TMP2]], i32 1, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i8> poison)
+; CHECK-NEXT:    [[TMP4:%.*]] = call <8 x i8> @llvm.abs.v8i8(<8 x i8> [[TMP3]], i1 false)
+; CHECK-NEXT:    [[TMP5:%.*]] = sext <8 x i8> [[TMP4]] to <8 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP5]])
+; CHECK-NEXT:    ret i32 [[TMP6]]
 ;
 entry:
   %0 = load i8, ptr %a, align 1

diff --git a/llvm/test/Transforms/SLPVectorizer/X86/gep-nodes-with-non-gep-inst.ll b/llvm/test/Transforms/SLPVectorizer/X86/gep-nodes-with-non-gep-inst.ll
index 9e43cefef2801d..96d4c307f1c67f 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/gep-nodes-with-non-gep-inst.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/gep-nodes-with-non-gep-inst.ll
@@ -30,7 +30,7 @@ define void @test() {
 ; CHECK-SLP-THRESHOLD:       bb:
 ; CHECK-SLP-THRESHOLD-NEXT:    [[TMP0:%.*]] = insertelement <4 x ptr> poison, ptr [[COND_IN_V]], i32 0
 ; CHECK-SLP-THRESHOLD-NEXT:    [[TMP1:%.*]] = shufflevector <4 x ptr> [[TMP0]], <4 x ptr> poison, <4 x i32> zeroinitializer
-; CHECK-SLP-THRESHOLD-NEXT:    [[TMP2:%.*]] = getelementptr i64, <4 x ptr> [[TMP1]], <4 x i64> <i64 12, i64 8, i64 4, i64 0>
+; CHECK-SLP-THRESHOLD-NEXT:    [[TMP2:%.*]] = getelementptr i64, <4 x ptr> [[TMP1]], <4 x i64> <i64 0, i64 4, i64 8, i64 12>
 ; CHECK-SLP-THRESHOLD-NEXT:    [[TMP3:%.*]] = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> [[TMP2]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i64> poison)
 ; CHECK-SLP-THRESHOLD-NEXT:    [[TMP4:%.*]] = icmp eq <4 x i64> [[TMP3]], zeroinitializer
 ; CHECK-SLP-THRESHOLD-NEXT:    ret void

diff --git a/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll b/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll
index 3bc6e64606e399..1add732d32e85c 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll
@@ -7,7 +7,7 @@ define i32 @test(ptr noalias %p, ptr noalias %addr) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <8 x ptr> poison, ptr [[ADDR:%.*]], i32 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x ptr> [[TMP0]], <8 x ptr> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i32, <8 x ptr> [[TMP1]], <8 x i32> <i32 15, i32 13, i32 11, i32 9, i32 7, i32 5, i32 3, i32 1>
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i32, <8 x ptr> [[TMP1]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
 ; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP2]], i32 8, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> poison)
 ; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x ptr> poison, ptr [[P:%.*]], i32 0
 ; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <8 x ptr> [[TMP4]], <8 x ptr> poison, <8 x i32> zeroinitializer


        


More information about the llvm-commits mailing list