[llvm] [SLPVectorizer] Widen constant strided loads. (PR #162324)

Mikhail Gudim via llvm-commits llvm-commits at lists.llvm.org
Fri Oct 17 10:57:54 PDT 2025


https://github.com/mgudim updated https://github.com/llvm/llvm-project/pull/162324

From 6e690f52bd80e9a861044d18d77abe30330b9ce5 Mon Sep 17 00:00:00 2001
From: Mikhail Gudim <mgudim at ventanamicro.com>
Date: Thu, 16 Oct 2025 11:19:13 -0700
Subject: [PATCH 01/11] [SLPVectorizer] Refactor isStridedLoad, NFC.

Move the checks that all strides are the same from `isStridedLoad` to a
new function `analyzeConstantStrideCandidate`. This reduces the diff for
the follow-up PRs, which will modify the logic in
`analyzeConstantStrideCandidate` to cover widening of the strided
load. All the checks that remain in `isStridedLoad` will be reused.
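
For intuition, here is a minimal self-contained sketch of the distance
check that moves into `analyzeConstantStrideCandidate`. Names and the
free-standing form are illustrative only (std containers instead of the
LLVM-internal API); `Dists` stands in for the values `getPointersDiff`
computes in the real code:

#include <cstdint>
#include <set>
#include <vector>

// The distances of all pointers from the lowest one must be distinct
// multiples of a common stride, Stride = Diff / (NumLoads - 1).
bool allDistsUniqueMultiplesOfStride(const std::vector<int64_t> &Dists,
                                     int64_t Diff) {
  const int64_t Sz = static_cast<int64_t>(Dists.size());
  if (Sz < 2 || Diff % (Sz - 1) != 0)
    return false;
  const int64_t Stride = Diff / (Sz - 1);
  if (Stride == 0)
    return false;
  std::set<int64_t> Seen;
  for (int64_t Dist : Dists)
    if (Dist % Stride != 0 || !Seen.insert(Dist).second)
      return false; // not a multiple of Stride, or a repeated distance
  return true;
}

int main() {
  // Distances 0, 3, 6, 9 from the base pointer: a stride-3 pattern.
  return allDistsUniqueMultiplesOfStride({0, 3, 6, 9}, 9) ? 0 : 1;
}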
---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 104 +++++++++++++-----
 1 file changed, 77 insertions(+), 27 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index b62c8f1631ff7..e5415f5d61b50 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2242,8 +2242,48 @@ class BoUpSLP {
   ///       may not be necessary.
   bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const;
   bool isStridedLoad(ArrayRef<Value *> PointerOps, Type *ScalarTy,
-                     Align Alignment, const int64_t Diff, Value *Ptr0,
-                     Value *PtrN, StridedPtrInfo &SPtrInfo) const;
+                     Align Alignment, int64_t Diff, size_t Sz) const;
+
+  /// Return true if an array of scalar loads can be replaced with a strided
+  ///  load (with constant stride).
+  ///
+  ///  TODO:
+  ///  It is possible that the load gets "widened". Suppose that originally each
+  ///  load loads `k` bytes and `PointerOps` can be arranged as follows (`%s` is
+  ///  constant):
+  ///
+  ///  %b + 0 * %s + 0
+  ///  %b + 0 * %s + 1
+  ///  %b + 0 * %s + 2
+  ///  ...
+  ///  %b + 0 * %s + (w - 1)
+  ///
+  ///  %b + 1 * %s + 0
+  ///  %b + 1 * %s + 1
+  ///  %b + 1 * %s + 2
+  ///  ...
+  ///  %b + 1 * %s + (w - 1)
+  ///  ...
+  ///
+  ///  %b + (n - 1) * %s + 0
+  ///  %b + (n - 1) * %s + 1
+  ///  %b + (n - 1) * %s + 2
+  ///  ...
+  ///  %b + (n - 1) * %s + (w - 1)
+  ///
+  /// In this case we will generate a strided load of type `<n x (k * w)>`.
+  ///
+  /// \param PointerOps list of pointer arguments of loads.
+  /// \param ElemTy original scalar type of loads.
+  /// \param Alignment alignment of the first load.
+  /// \param SortedIndices is the order of PointerOps as returned by
+  /// `sortPtrAccesses`.
+  /// \param Diff Pointer difference between the lowest and the highest pointer
+  /// in `PointerOps` as returned by `getPointersDiff`.
+  /// \param Ptr0 first pointer in `PointerOps`.
+  /// \param PtrN last pointer in `PointerOps`.
+  /// \param SPtrInfo If the function returns `true`, it also sets all the
+  /// fields of `SPtrInfo` necessary to generate the strided load later.
+  bool analyzeConstantStrideCandidate(
+      const ArrayRef<Value *> PointerOps, Type *ElemTy, Align Alignment,
+      const SmallVectorImpl<unsigned> &SortedIndices, const int64_t Diff,
+      Value *Ptr0, Value *PtrN, StridedPtrInfo &SPtrInfo) const;
 
   /// Return true if an array of scalar loads can be replaced with a strided
   /// load (with run-time stride).
@@ -6849,9 +6889,7 @@ isMaskedLoadCompress(ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
 /// current graph (for masked gathers extra extractelement instructions
 /// might be required).
 bool BoUpSLP::isStridedLoad(ArrayRef<Value *> PointerOps, Type *ScalarTy,
-                            Align Alignment, const int64_t Diff, Value *Ptr0,
-                            Value *PtrN, StridedPtrInfo &SPtrInfo) const {
-  const size_t Sz = PointerOps.size();
+                            Align Alignment, int64_t Diff, size_t Sz) const {
   if (Diff % (Sz - 1) != 0)
     return false;
 
@@ -6875,27 +6913,39 @@ bool BoUpSLP::isStridedLoad(ArrayRef<Value *> PointerOps, Type *ScalarTy,
       return false;
     if (!TTI->isLegalStridedLoadStore(VecTy, Alignment))
       return false;
+  }
+  return true;
+}
 
-    // Iterate through all pointers and check if all distances are
-    // unique multiple of Dist.
-    SmallSet<int64_t, 4> Dists;
-    for (Value *Ptr : PointerOps) {
-      int64_t Dist = 0;
-      if (Ptr == PtrN)
-        Dist = Diff;
-      else if (Ptr != Ptr0)
-        Dist = *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, *DL, *SE);
-      // If the strides are not the same or repeated, we can't
-      // vectorize.
-      if (((Dist / Stride) * Stride) != Dist || !Dists.insert(Dist).second)
-        break;
-    }
-    if (Dists.size() == Sz) {
-      Type *StrideTy = DL->getIndexType(Ptr0->getType());
-      SPtrInfo.StrideVal = ConstantInt::get(StrideTy, Stride);
-      SPtrInfo.Ty = getWidenedType(ScalarTy, Sz);
-      return true;
-    }
+bool BoUpSLP::analyzeConstantStrideCandidate(
+    const ArrayRef<Value *> PointerOps, Type *ScalarTy, Align CommonAlignment,
+    const SmallVectorImpl<unsigned> &SortedIndices, const int64_t Diff,
+    Value *Ptr0, Value *PtrN, StridedPtrInfo &SPtrInfo) const {
+  const size_t Sz = PointerOps.size();
+  if (!isStridedLoad(PointerOps, ScalarTy, CommonAlignment, Diff, Sz))
+    return false;
+
+  int64_t Stride = Diff / static_cast<int64_t>(Sz - 1);
+
+  // Iterate through all pointers and check if all distances are
+  // unique multiple of Dist.
+  SmallSet<int64_t, 4> Dists;
+  for (Value *Ptr : PointerOps) {
+    int64_t Dist = 0;
+    if (Ptr == PtrN)
+      Dist = Diff;
+    else if (Ptr != Ptr0)
+      Dist = *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, *DL, *SE);
+    // If the strides are not the same or repeated, we can't
+    // vectorize.
+    if (((Dist / Stride) * Stride) != Dist || !Dists.insert(Dist).second)
+      break;
+  }
+  if (Dists.size() == Sz) {
+    Type *StrideTy = DL->getIndexType(Ptr0->getType());
+    SPtrInfo.StrideVal = ConstantInt::get(StrideTy, Stride);
+    SPtrInfo.Ty = getWidenedType(ScalarTy, Sz);
+    return true;
   }
   return false;
 }
@@ -6995,8 +7045,8 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
     Align Alignment =
         cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()])
             ->getAlign();
-    if (isStridedLoad(PointerOps, ScalarTy, Alignment, *Diff, Ptr0, PtrN,
-                      SPtrInfo))
+    if (analyzeConstantStrideCandidate(PointerOps, ScalarTy, Alignment, Order,
+                                       *Diff, Ptr0, PtrN, SPtrInfo))
       return LoadsState::StridedVectorize;
   }
   if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||

From 13e19eb3d2e29b7c5f8363bb49407abc4952b85e Mon Sep 17 00:00:00 2001
From: Mikhail Gudim <mgudim at ventanamicro.com>
Date: Thu, 16 Oct 2025 13:21:32 -0700
Subject: [PATCH 02/11] add `const`

---
 llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index e5415f5d61b50..6caec9e316c2c 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2242,7 +2242,7 @@ class BoUpSLP {
   ///       may not be necessary.
   bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const;
   bool isStridedLoad(ArrayRef<Value *> PointerOps, Type *ScalarTy,
-                     Align Alignment, int64_t Diff, size_t Sz) const;
+                     Align Alignment, const int64_t Diff, size_t Sz) const;
 
   /// Return true if an array of scalar loads can be replaced with a strided
   ///  load (with constant stride).
@@ -6889,7 +6889,7 @@ isMaskedLoadCompress(ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
 /// current graph (for masked gathers extra extractelement instructions
 /// might be required).
 bool BoUpSLP::isStridedLoad(ArrayRef<Value *> PointerOps, Type *ScalarTy,
-                            Align Alignment, int64_t Diff, size_t Sz) const {
+                            Align Alignment, const int64_t Diff, size_t Sz) const {
   if (Diff % (Sz - 1) != 0)
     return false;
 

From ae05217ba6dacb2b25e8d2d3a925d356a6c68cb4 Mon Sep 17 00:00:00 2001
From: Mikhail Gudim <mgudim at ventanamicro.com>
Date: Fri, 17 Oct 2025 08:47:30 -0700
Subject: [PATCH 03/11] fix a bug.

---
 llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 6caec9e316c2c..5ec71ba3b1176 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -6889,7 +6889,8 @@ isMaskedLoadCompress(ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
 /// current graph (for masked gathers extra extractelement instructions
 /// might be required).
 bool BoUpSLP::isStridedLoad(ArrayRef<Value *> PointerOps, Type *ScalarTy,
-                            Align Alignment, const int64_t Diff, size_t Sz) const {
+                            Align Alignment, const int64_t Diff,
+                            size_t Sz) const {
   if (Diff % (Sz - 1) != 0)
     return false;
 
@@ -6913,8 +6914,9 @@ bool BoUpSLP::isStridedLoad(ArrayRef<Value *> PointerOps, Type *ScalarTy,
       return false;
     if (!TTI->isLegalStridedLoadStore(VecTy, Alignment))
       return false;
+    return true;
   }
-  return true;
+  return false;
 }
 
 bool BoUpSLP::analyzeConstantStrideCandidate(

From 01a7e4f66259edd0ddd028c726c6eb8223a68feb Mon Sep 17 00:00:00 2001
From: Mikhail Gudim <mgudim at ventanamicro.com>
Date: Fri, 17 Oct 2025 08:51:04 -0700
Subject: [PATCH 04/11] CommonAlignment => Alignment, size_t => const size_t

---
 llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 5ec71ba3b1176..9cd52da1ce1cd 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2242,7 +2242,8 @@ class BoUpSLP {
   ///       may not be necessary.
   bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const;
   bool isStridedLoad(ArrayRef<Value *> PointerOps, Type *ScalarTy,
-                     Align Alignment, const int64_t Diff, size_t Sz) const;
+                     Align Alignment, const int64_t Diff,
+                     const size_t Sz) const;
 
   /// Return true if an array of scalar loads can be replaced with a strided
   ///  load (with constant stride).
@@ -6890,7 +6891,7 @@ isMaskedLoadCompress(ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
 /// might be required).
 bool BoUpSLP::isStridedLoad(ArrayRef<Value *> PointerOps, Type *ScalarTy,
                             Align Alignment, const int64_t Diff,
-                            size_t Sz) const {
+                            const size_t Sz) const {
   if (Diff % (Sz - 1) != 0)
     return false;
 
@@ -6920,11 +6921,11 @@ bool BoUpSLP::isStridedLoad(ArrayRef<Value *> PointerOps, Type *ScalarTy,
 }
 
 bool BoUpSLP::analyzeConstantStrideCandidate(
-    const ArrayRef<Value *> PointerOps, Type *ScalarTy, Align CommonAlignment,
+    const ArrayRef<Value *> PointerOps, Type *ScalarTy, Align Alignment,
     const SmallVectorImpl<unsigned> &SortedIndices, const int64_t Diff,
     Value *Ptr0, Value *PtrN, StridedPtrInfo &SPtrInfo) const {
   const size_t Sz = PointerOps.size();
-  if (!isStridedLoad(PointerOps, ScalarTy, CommonAlignment, Diff, Sz))
+  if (!isStridedLoad(PointerOps, ScalarTy, Alignment, Diff, Sz))
     return false;
 
   int64_t Stride = Diff / static_cast<int64_t>(Sz - 1);

From 60078b01e263e3400c86d0c4375205d283183692 Mon Sep 17 00:00:00 2001
From: Mikhail Gudim <mgudim at ventanamicro.com>
Date: Tue, 23 Sep 2025 15:41:57 -0700
Subject: [PATCH 05/11] [SLPVectorizer] Widen constant strided loads.

Given a set of pointers, check if they can be rearranged as follows (%s is a constant):
%b + 0 * %s + 0
%b + 0 * %s + 1
%b + 0 * %s + 2
...
%b + 0 * %s + (w - 1)

%b + 1 * %s + 0
%b + 1 * %s + 1
%b + 1 * %s + 2
...
%b + 1 * %s + (w - 1)
...

If the pointers can be rearranged in the above pattern, it means that the
memory can be accessed with a strided load of width `w` and stride `%s`.
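
The following self-contained sketch models this grouping analysis. It is
illustrative only: hypothetical names, offsets given directly in element
units rather than going through the real `getPointersDiff` plumbing, and
widening restricted to contiguous (stride-1) groups as in the patch:

#include <cstdint>
#include <cstdio>
#include <vector>

struct Widening {
  bool Viable;
  unsigned GroupSize;    // w: scalar loads fused into one wide element
  int64_t StrideBetween; // %s: distance between consecutive group starts
};

// Offsets must already be sorted, as after sortPtrAccesses.
Widening analyzeWidening(const std::vector<int64_t> &Offsets) {
  const unsigned Sz = Offsets.size();
  if (Sz < 2)
    return {false, 0, 0};
  // Find where the first contiguous group ends.
  unsigned GroupSize = 1;
  while (GroupSize < Sz && Offsets[GroupSize] - Offsets[GroupSize - 1] == 1)
    ++GroupSize;
  // GroupSize == Sz means one contiguous run: no widening needed, the
  // plain vector-load path handles it. Groups must also tile Sz evenly.
  if (GroupSize == Sz || Sz % GroupSize != 0)
    return {false, 0, 0};
  int64_t StrideBetween = Offsets[GroupSize] - Offsets[0];
  // Every group must start StrideBetween after the previous one and be
  // contiguous internally.
  for (unsigned G = 0; G < Sz; G += GroupSize) {
    if (Offsets[G] != Offsets[0] + (G / GroupSize) * StrideBetween)
      return {false, 0, 0};
    for (unsigned I = G + 1; I < G + GroupSize; ++I)
      if (Offsets[I] - Offsets[I - 1] != 1)
        return {false, 0, 0};
  }
  return {true, GroupSize, StrideBetween};
}

int main() {
  // 16 i8 loads at offsets 0..3, 100..103, 200..203, 300..303 become one
  // <4 x i32> strided load with stride 100, as in the RISC-V test below.
  std::vector<int64_t> Offsets;
  for (int64_t B = 0; B < 400; B += 100)
    for (int64_t I = 0; I < 4; ++I)
      Offsets.push_back(B + I);
  Widening W = analyzeWidening(Offsets);
  std::printf("viable=%d w=%u stride=%lld\n", W.Viable, W.GroupSize,
              (long long)W.StrideBetween);
}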
---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 121 ++++++++++++++----
 .../RISCV/basic-strided-loads.ll              |  18 +--
 2 files changed, 96 insertions(+), 43 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 9cd52da1ce1cd..3328fdc78246f 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2248,7 +2248,6 @@ class BoUpSLP {
   /// Return true if an array of scalar loads can be replaced with a strided
   ///  load (with constant stride).
   ///
-  ///  TODO:
   ///  It is possible that the load gets "widened". Suppose that originally each
   ///  load loads `k` bytes and `PointerOps` can be arranged as follows (`%s` is
  ///  constant):
  ///
  ///  %b + 0 * %s + 0
  ///  %b + 0 * %s + 1
  ///  %b + 0 * %s + 2
@@ -6921,36 +6920,92 @@ bool BoUpSLP::isStridedLoad(ArrayRef<Value *> PointerOps, Type *ScalarTy,
 }
 
 bool BoUpSLP::analyzeConstantStrideCandidate(
-    const ArrayRef<Value *> PointerOps, Type *ScalarTy, Align Alignment,
-    const SmallVectorImpl<unsigned> &SortedIndices, const int64_t Diff,
-    Value *Ptr0, Value *PtrN, StridedPtrInfo &SPtrInfo) const {
-  const size_t Sz = PointerOps.size();
-  if (!isStridedLoad(PointerOps, ScalarTy, Alignment, Diff, Sz))
-    return false;
+    ArrayRef<Value *> PointerOps, Type *ElemTy, Align Alignment,
+    const SmallVectorImpl<unsigned> &SortedIndices, int64_t Diff, Value *Ptr0,
+    Value *PtrN, StridedPtrInfo &SPtrInfo) const {
+  const unsigned Sz = PointerOps.size();
+  SmallVector<int64_t> SortedOffsetsFromBase;
+  SortedOffsetsFromBase.resize(Sz);
+  for (unsigned I : seq<unsigned>(Sz)) {
+    Value *Ptr =
+        SortedIndices.empty() ? PointerOps[I] : PointerOps[SortedIndices[I]];
+    SortedOffsetsFromBase[I] =
+        *getPointersDiff(ElemTy, Ptr0, ElemTy, Ptr, *DL, *SE);
+  }
+  assert(SortedOffsetsFromBase.size() > 1 &&
+         "Trying to generate strided load for less than 2 loads");
+  // Find where the first group ends.
+  int64_t StrideWithinGroup =
+      SortedOffsetsFromBase[1] - SortedOffsetsFromBase[0];
+  unsigned GroupSize = 1;
+  for (; GroupSize != SortedOffsetsFromBase.size(); ++GroupSize) {
+    if (SortedOffsetsFromBase[GroupSize] -
+            SortedOffsetsFromBase[GroupSize - 1] !=
+        StrideWithinGroup)
+      break;
+  }
+  unsigned VecSz = Sz;
+  Type *ScalarTy = ElemTy;
+  int64_t StrideIntVal = StrideWithinGroup;
+  FixedVectorType *StridedLoadTy = getWidenedType(ScalarTy, VecSz);
 
-  int64_t Stride = Diff / static_cast<int64_t>(Sz - 1);
+  bool NeedsWidening = Sz != GroupSize;
+  if (NeedsWidening) {
+    if (Sz % GroupSize != 0)
+      return false;
+    VecSz = Sz / GroupSize;
 
-  // Iterate through all pointers and check if all distances are
-  // unique multiple of Dist.
-  SmallSet<int64_t, 4> Dists;
-  for (Value *Ptr : PointerOps) {
-    int64_t Dist = 0;
-    if (Ptr == PtrN)
-      Dist = Diff;
-    else if (Ptr != Ptr0)
-      Dist = *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, *DL, *SE);
-    // If the strides are not the same or repeated, we can't
-    // vectorize.
-    if (((Dist / Stride) * Stride) != Dist || !Dists.insert(Dist).second)
-      break;
+    if (StrideWithinGroup != 1)
+      return false;
+    ScalarTy = Type::getIntNTy(SE->getContext(),
+                               DL->getTypeSizeInBits(ElemTy).getFixedValue() *
+                                   GroupSize);
+    StridedLoadTy = getWidenedType(ScalarTy, VecSz);
   }
-  if (Dists.size() == Sz) {
-    Type *StrideTy = DL->getIndexType(Ptr0->getType());
-    SPtrInfo.StrideVal = ConstantInt::get(StrideTy, Stride);
-    SPtrInfo.Ty = getWidenedType(ScalarTy, Sz);
-    return true;
+
+  if (!isStridedLoad(PointerOps, ScalarTy, Alignment, Diff, VecSz))
+    return false;
+
+  if (NeedsWidening) {
+    unsigned PrevGroupStartIdx = 0;
+    unsigned CurrentGroupStartIdx = GroupSize;
+    int64_t StrideBetweenGroups =
+        SortedOffsetsFromBase[GroupSize] - SortedOffsetsFromBase[0];
+    StrideIntVal = StrideBetweenGroups;
+    while (CurrentGroupStartIdx != Sz) {
+      if (SortedOffsetsFromBase[CurrentGroupStartIdx] -
+              SortedOffsetsFromBase[PrevGroupStartIdx] !=
+          StrideBetweenGroups)
+        break;
+      PrevGroupStartIdx = CurrentGroupStartIdx;
+      CurrentGroupStartIdx += GroupSize;
+    }
+    if (CurrentGroupStartIdx != Sz)
+      return false;
+
+    auto CheckGroup = [&](unsigned StartIdx, unsigned GroupSize0,
+                          int64_t StrideWithinGroup) -> bool {
+      unsigned GroupEndIdx = StartIdx + 1;
+      for (; GroupEndIdx != Sz; ++GroupEndIdx) {
+        if (SortedOffsetsFromBase[GroupEndIdx] -
+                SortedOffsetsFromBase[GroupEndIdx - 1] !=
+            StrideWithinGroup)
+          break;
+      }
+      return GroupEndIdx - StartIdx == GroupSize0;
+    };
+    for (unsigned I = 0; I < Sz; I += GroupSize) {
+      if (!CheckGroup(I, GroupSize, StrideWithinGroup))
+        return false;
+    }
   }
-  return false;
+
+  Type *StrideTy = DL->getIndexType(Ptr0->getType());
+  SPtrInfo.StrideVal = ConstantInt::get(StrideTy, StrideIntVal);
+  SPtrInfo.Ty = StridedLoadTy;
+  return true;
 }
 
 bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
@@ -14969,11 +15024,19 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
         }
         break;
       case TreeEntry::StridedVectorize: {
+        const StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap.at(E);
+        FixedVectorType *StridedLoadTy = SPtrInfo.Ty;
+        assert(StridedLoadTy && "Missing StridedPointerInfo for tree entry.");
         Align CommonAlignment =
             computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
         VecLdCost = TTI->getStridedMemoryOpCost(
-            Instruction::Load, VecTy, LI0->getPointerOperand(),
+            Instruction::Load, StridedLoadTy, LI0->getPointerOperand(),
             /*VariableMask=*/false, CommonAlignment, CostKind);
+        if (StridedLoadTy != VecTy)
+          VecLdCost +=
+              TTI->getCastInstrCost(Instruction::BitCast, VecTy, StridedLoadTy,
+                                    getCastContextHint(*E), CostKind);
+
         break;
       }
       case TreeEntry::CompressVectorize: {
@@ -19740,6 +19803,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
                      ? NewLI
                      : ::propagateMetadata(NewLI, E->Scalars);
 
+      if (StridedLoadTy != VecTy)
+        V = Builder.CreateBitOrPointerCast(V, VecTy);
       V = FinalShuffle(V, E);
       E->VectorizedValue = V;
       ++NumVectorInstructions;
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/basic-strided-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/basic-strided-loads.ll
index 02e05b2e4138a..968e61fb5f9e0 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/basic-strided-loads.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/basic-strided-loads.ll
@@ -621,22 +621,10 @@ define void @constant_stride_widen_no_reordering(ptr %pl, i64 %stride, ptr %ps)
 ; CHECK-LABEL: define void @constant_stride_widen_no_reordering(
 ; CHECK-SAME: ptr [[PL:%.*]], i64 [[STRIDE:%.*]], ptr [[PS:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[GEP_L0:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 0
-; CHECK-NEXT:    [[GEP_L4:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 100
-; CHECK-NEXT:    [[GEP_L8:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 200
-; CHECK-NEXT:    [[GEP_L12:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 300
 ; CHECK-NEXT:    [[GEP_S0:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 0
-; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, ptr [[GEP_L0]], align 1
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i8>, ptr [[GEP_L4]], align 1
-; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i8>, ptr [[GEP_L8]], align 1
-; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i8>, ptr [[GEP_L12]], align 1
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <16 x i8> [[TMP7]], <16 x i8> [[TMP11]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <16 x i8> [[TMP9]], <16 x i8> [[TMP10]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; CHECK-NEXT:    store <16 x i8> [[TMP8]], ptr [[GEP_S0]], align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(ptr align 16 [[GEP_L0]], i64 100, <4 x i1> splat (i1 true), i32 4)
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8>
+; CHECK-NEXT:    store <16 x i8> [[TMP8]], ptr [[GEP_S0]], align 16
 ; CHECK-NEXT:    ret void
 ;
   %gep_l0 = getelementptr inbounds i8, ptr %pl, i64 0

From 9e2b0d1633ca58bf95297311e0d436ec918c3308 Mon Sep 17 00:00:00 2001
From: Mikhail Gudim <mgudim at ventanamicro.com>
Date: Wed, 24 Sep 2025 10:40:27 -0700
Subject: [PATCH 06/11] updated failing tests.

---
 .../RISCV/gather-insert-point-restore.ll      |  15 +--
 .../X86/entries-shuffled-diff-sizes.ll        |  16 +--
 .../X86/extractelements-vector-ops-shuffle.ll |   7 +-
 .../Transforms/SLPVectorizer/X86/pr47623.ll   |  14 +--
 .../SLPVectorizer/X86/pr47629-inseltpoison.ll | 112 +++++++++---------
 .../Transforms/SLPVectorizer/X86/pr47629.ll   | 112 +++++++++---------
 .../X86/redux-feed-buildvector.ll             |  73 +++++++++---
 .../X86/reorder-possible-strided-node.ll      |  36 ++----
 .../X86/split-node-num-operands.ll            |  22 ++--
 .../X86/split-vector-operand-with-reuses.ll   |  28 +++--
 10 files changed, 226 insertions(+), 209 deletions(-)

diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/gather-insert-point-restore.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/gather-insert-point-restore.ll
index 82c940353ba5a..60d3b291b5dd4 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/gather-insert-point-restore.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/gather-insert-point-restore.ll
@@ -5,18 +5,19 @@ define i16 @test(ptr %i) {
 ; CHECK-LABEL: define i16 @test(
 ; CHECK-SAME: ptr [[I:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x ptr> poison, ptr [[I]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x ptr> [[TMP0]], <2 x ptr> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, <2 x ptr> [[TMP1]], <2 x i64> <i64 132860, i64 137774>
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x ptr> [[TMP2]], <2 x ptr> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+; CHECK-NEXT:    [[GEP_US154:%.*]] = getelementptr i8, ptr [[I]], i64 132860
 ; CHECK-NEXT:    [[GEP_US154_2:%.*]] = getelementptr i8, ptr [[I]], i64 142688
 ; CHECK-NEXT:    br label %[[FOR_COND5_US:.*]]
 ; CHECK:       [[FOR_COND5_US]]:
+; CHECK-NEXT:    [[TMP0:%.*]] = call <2 x i16> @llvm.experimental.vp.strided.load.v2i16.p0.i64(ptr align 2 [[GEP_US154]], i64 4914, <2 x i1> splat (i1 true), i32 2)
 ; CHECK-NEXT:    [[TMP4:%.*]] = call <4 x i16> @llvm.experimental.vp.strided.load.v4i16.p0.i64(ptr align 2 [[GEP_US154_2]], i64 4914, <4 x i1> splat (i1 true), i32 4)
-; CHECK-NEXT:    [[TMP5:%.*]] = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> [[TMP3]], i32 2, <4 x i1> splat (i1 true), <4 x i16> poison)
+; CHECK-NEXT:    [[TMP2:%.*]] = call <2 x i16> @llvm.experimental.vp.strided.load.v2i16.p0.i64(ptr align 2 [[GEP_US154]], i64 4914, <2 x i1> splat (i1 true), i32 2)
 ; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i16> [[TMP5]], <4 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x i16> [[TMP0]], <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i16> [[TMP0]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
 ; CHECK-NEXT:    [[TMP8:%.*]] = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> [[TMP7]])
 ; CHECK-NEXT:    [[TMP9:%.*]] = call i16 @llvm.umax.i16(i16 [[TMP8]], i16 0)
 ; CHECK-NEXT:    ret i16 [[TMP9]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/entries-shuffled-diff-sizes.ll b/llvm/test/Transforms/SLPVectorizer/X86/entries-shuffled-diff-sizes.ll
index b99a1c2d83394..dbcaafa9e5a8b 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/entries-shuffled-diff-sizes.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/entries-shuffled-diff-sizes.ll
@@ -15,13 +15,15 @@ define void @test() {
 ; CHECK-NEXT:    [[TMP2:%.*]] = fmul reassoc ninf nsz arcp contract afn float [[GEPLOAD1612]], [[TMP1]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = fmul reassoc ninf nsz arcp contract afn <16 x float> [[TMP4]], [[TMP0]]
 ; CHECK-NEXT:    store <16 x float> [[TMP6]], ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 2928), align 16
-; CHECK-NEXT:    [[TMP7:%.*]] = load <16 x float>, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1272), align 16
-; CHECK-NEXT:    [[TMP11:%.*]] = load <2 x float>, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1620), align 4
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <16 x float> [[TMP9]], <16 x float> [[TMP7]], <16 x i32> <i32 poison, i32 0, i32 20, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <16 x float> [[TMP19]], <16 x float> [[TMP10]], <16 x i32> <i32 1, i32 1, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 19, i32 19, i32 19, i32 19, i32 18>
-; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <16 x float> [[TMP7]], <16 x float> [[TMP14]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 18>
+; CHECK-NEXT:    [[TMP7:%.*]] = call <88 x float> @llvm.masked.load.v88f32.p0(ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1272), i32 16, <88 x i1> <i1 true, i1 true, i1 true, i1 true, i1 false, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>, <88 x float> poison)
+; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <88 x float> [[TMP7]], <88 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 87>
+; CHECK-NEXT:    [[TMP9:%.*]] = call <2 x float> @llvm.experimental.vp.strided.load.v2f32.p0.i64(ptr align 8 getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1288), i64 336, <2 x i1> splat (i1 true), i32 2)
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <88 x float> [[TMP7]], <88 x float> poison, <4 x i32> <i32 poison, i32 87, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <4 x float> [[TMP10]], <4 x float> [[TMP11]], <4 x i32> <i32 0, i32 5, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <4 x float> [[TMP16]], <4 x float> poison, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 3, i32 3, i32 3, i32 3, i32 2>
 ; CHECK-NEXT:    [[TMP18:%.*]] = fmul reassoc ninf nsz arcp contract afn <16 x float> [[TMP14]], [[TMP17]]
 ; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <16 x float> [[TMP18]], <16 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 15, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
 ; CHECK-NEXT:    store <16 x float> [[TMP15]], ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 2992), align 16
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extractelements-vector-ops-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/extractelements-vector-ops-shuffle.ll
index 7bbc694dc5181..12cb86287ce6f 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/extractelements-vector-ops-shuffle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/extractelements-vector-ops-shuffle.ll
@@ -4,12 +4,11 @@
 define double @test() {
 ; CHECK-LABEL: define double @test() {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP3:%.*]] = load double, ptr getelementptr inbounds ([13 x double], ptr null, i64 0, i64 6), align 16
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr getelementptr inbounds ([13 x double], ptr null, i64 0, i64 5), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load double, ptr getelementptr inbounds ([13 x double], ptr null, i64 0, i64 9), align 8
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr getelementptr inbounds ([13 x double], ptr null, i64 0, i64 8), align 16
-; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <4 x double> <double 0.000000e+00, double 0.000000e+00, double poison, double poison>, double [[TMP3]], i32 2
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x double> [[TMP17]], double [[TMP2]], i32 3
+; CHECK-NEXT:    [[TMP2:%.*]] = call <2 x double> @llvm.experimental.vp.strided.load.v2f64.p0.i64(ptr align 8 getelementptr inbounds ([13 x double], ptr null, i64 0, i64 6), i64 24, <2 x i1> splat (i1 true), i32 2)
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> <double 0.000000e+00, double 0.000000e+00, double poison, double poison>, <4 x double> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; CHECK-NEXT:    [[TMP5:%.*]] = fmul <4 x double> zeroinitializer, [[TMP4]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = call reassoc nsz double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP5]])
 ; CHECK-NEXT:    [[TMP7:%.*]] = fmul double [[TMP6]], 0.000000e+00
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll
index 1b11c3dcc081c..a577469a9aef0 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll
@@ -11,16 +11,10 @@
 
 define void @foo() {
 ; SSE-LABEL: @foo(
-; SSE-NEXT:    [[TMP1:%.*]] = load i32, ptr @b, align 16
-; SSE-NEXT:    store i32 [[TMP1]], ptr @a, align 16
-; SSE-NEXT:    [[TMP2:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @b, i64 0, i64 2), align 8
-; SSE-NEXT:    store i32 [[TMP2]], ptr getelementptr inbounds ([8 x i32], ptr @a, i64 0, i64 1), align 4
-; SSE-NEXT:    store i32 [[TMP1]], ptr getelementptr inbounds ([8 x i32], ptr @a, i64 0, i64 2), align 8
-; SSE-NEXT:    store i32 [[TMP2]], ptr getelementptr inbounds ([8 x i32], ptr @a, i64 0, i64 3), align 4
-; SSE-NEXT:    store i32 [[TMP1]], ptr getelementptr inbounds ([8 x i32], ptr @a, i64 0, i64 4), align 16
-; SSE-NEXT:    store i32 [[TMP2]], ptr getelementptr inbounds ([8 x i32], ptr @a, i64 0, i64 5), align 4
-; SSE-NEXT:    store i32 [[TMP1]], ptr getelementptr inbounds ([8 x i32], ptr @a, i64 0, i64 6), align 8
-; SSE-NEXT:    store i32 [[TMP2]], ptr getelementptr inbounds ([8 x i32], ptr @a, i64 0, i64 7), align 4
+; SSE-NEXT:    [[TMP1:%.*]] = load <3 x i32>, ptr @b, align 16
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <3 x i32> [[TMP1]], <3 x i32> poison, <2 x i32> <i32 0, i32 2>
+; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+; SSE-NEXT:    store <8 x i32> [[TMP3]], ptr @a, align 16
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @foo(
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll
index fde76f8b0e8b9..ca129e7ab97f8 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll
@@ -153,36 +153,46 @@ define void @gather_load_2(ptr noalias nocapture %0, ptr noalias nocapture reado
 ; AVX-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0]] {
 ; AVX-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1
 ; AVX-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[SHORT_TBAA0]]
+; AVX-NEXT:    [[TMP11:%.*]] = add nsw i32 [[TMP4]], 1
+; AVX-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 1
+; AVX-NEXT:    store i32 [[TMP11]], ptr [[TMP0]], align 4, !tbaa [[SHORT_TBAA0]]
 ; AVX-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 10
-; AVX-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[SHORT_TBAA0]]
+; AVX-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[SHORT_TBAA0]]
+; AVX-NEXT:    [[TMP15:%.*]] = add nsw i32 [[TMP8]], 2
+; AVX-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2
+; AVX-NEXT:    store i32 [[TMP15]], ptr [[TMP6]], align 4, !tbaa [[SHORT_TBAA0]]
 ; AVX-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 3
-; AVX-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[SHORT_TBAA0]]
+; AVX-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[SHORT_TBAA0]]
+; AVX-NEXT:    [[TMP13:%.*]] = add nsw i32 [[TMP12]], 3
+; AVX-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 3
+; AVX-NEXT:    store i32 [[TMP13]], ptr [[TMP10]], align 4, !tbaa [[SHORT_TBAA0]]
 ; AVX-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 5
-; AVX-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[SHORT_TBAA0]]
-; AVX-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0
-; AVX-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i32 1
-; AVX-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i32 2
-; AVX-NEXT:    [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i32 3
-; AVX-NEXT:    [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], <i32 1, i32 2, i32 3, i32 4>
-; AVX-NEXT:    store <4 x i32> [[TMP15]], ptr [[TMP0]], align 4, !tbaa [[SHORT_TBAA0]]
+; AVX-NEXT:    [[TMP16:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[SHORT_TBAA0]]
+; AVX-NEXT:    [[TMP17:%.*]] = add nsw i32 [[TMP16]], 4
+; AVX-NEXT:    store i32 [[TMP17]], ptr [[TMP14]], align 4, !tbaa [[SHORT_TBAA0]]
 ; AVX-NEXT:    ret void
 ;
 ; AVX2-LABEL: define void @gather_load_2(
 ; AVX2-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0]] {
 ; AVX2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1
 ; AVX2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[SHORT_TBAA0]]
+; AVX2-NEXT:    [[TMP11:%.*]] = add nsw i32 [[TMP4]], 1
+; AVX2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 1
+; AVX2-NEXT:    store i32 [[TMP11]], ptr [[TMP0]], align 4, !tbaa [[SHORT_TBAA0]]
 ; AVX2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 10
-; AVX2-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[SHORT_TBAA0]]
+; AVX2-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[SHORT_TBAA0]]
+; AVX2-NEXT:    [[TMP15:%.*]] = add nsw i32 [[TMP8]], 2
+; AVX2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2
+; AVX2-NEXT:    store i32 [[TMP15]], ptr [[TMP6]], align 4, !tbaa [[SHORT_TBAA0]]
 ; AVX2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 3
-; AVX2-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[SHORT_TBAA0]]
+; AVX2-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[SHORT_TBAA0]]
+; AVX2-NEXT:    [[TMP13:%.*]] = add nsw i32 [[TMP12]], 3
+; AVX2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 3
+; AVX2-NEXT:    store i32 [[TMP13]], ptr [[TMP10]], align 4, !tbaa [[SHORT_TBAA0]]
 ; AVX2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 5
-; AVX2-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[SHORT_TBAA0]]
-; AVX2-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0
-; AVX2-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i32 1
-; AVX2-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i32 2
-; AVX2-NEXT:    [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i32 3
-; AVX2-NEXT:    [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], <i32 1, i32 2, i32 3, i32 4>
-; AVX2-NEXT:    store <4 x i32> [[TMP15]], ptr [[TMP0]], align 4, !tbaa [[SHORT_TBAA0]]
+; AVX2-NEXT:    [[TMP16:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[SHORT_TBAA0]]
+; AVX2-NEXT:    [[TMP17:%.*]] = add nsw i32 [[TMP16]], 4
+; AVX2-NEXT:    store i32 [[TMP17]], ptr [[TMP14]], align 4, !tbaa [[SHORT_TBAA0]]
 ; AVX2-NEXT:    ret void
 ;
 ; AVX512F-LABEL: define void @gather_load_2(
@@ -569,11 +579,9 @@ define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture rea
 ; AVX-LABEL: define void @gather_load_div(
 ; AVX-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0]] {
 ; AVX-NEXT:    [[TMP3:%.*]] = load float, ptr [[TMP1]], align 4, !tbaa [[SHORT_TBAA0]]
-; AVX-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 10
-; AVX-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 13
 ; AVX-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 3
-; AVX-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 44
-; AVX-NEXT:    [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4, !tbaa [[SHORT_TBAA0]]
+; AVX-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 11
+; AVX-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 14
 ; AVX-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 17
 ; AVX-NEXT:    [[TMP10:%.*]] = load float, ptr [[TMP9]], align 4, !tbaa [[SHORT_TBAA0]]
 ; AVX-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 33
@@ -583,30 +591,25 @@ define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture rea
 ; AVX-NEXT:    [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 30
 ; AVX-NEXT:    [[TMP16:%.*]] = load float, ptr [[TMP15]], align 4, !tbaa [[SHORT_TBAA0]]
 ; AVX-NEXT:    [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 5
-; AVX-NEXT:    [[TMP18:%.*]] = load float, ptr [[TMP17]], align 4, !tbaa [[SHORT_TBAA0]]
 ; AVX-NEXT:    [[TMP19:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 27
 ; AVX-NEXT:    [[TMP20:%.*]] = load float, ptr [[TMP19]], align 4, !tbaa [[SHORT_TBAA0]]
 ; AVX-NEXT:    [[TMP21:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 20
-; AVX-NEXT:    [[TMP22:%.*]] = load float, ptr [[TMP21]], align 4, !tbaa [[SHORT_TBAA0]]
+; AVX-NEXT:    [[TMP28:%.*]] = load float, ptr [[TMP21]], align 4, !tbaa [[SHORT_TBAA0]]
 ; AVX-NEXT:    [[TMP23:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 23
 ; AVX-NEXT:    [[TMP24:%.*]] = load float, ptr [[TMP23]], align 4, !tbaa [[SHORT_TBAA0]]
-; AVX-NEXT:    [[TMP25:%.*]] = load <2 x float>, ptr [[TMP6]], align 4, !tbaa [[SHORT_TBAA0]]
-; AVX-NEXT:    [[TMP26:%.*]] = load <2 x float>, ptr [[TMP4]], align 4, !tbaa [[SHORT_TBAA0]]
-; AVX-NEXT:    [[TMP27:%.*]] = load <2 x float>, ptr [[TMP5]], align 4, !tbaa [[SHORT_TBAA0]]
-; AVX-NEXT:    [[TMP28:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i32 0
-; AVX-NEXT:    [[TMP29:%.*]] = shufflevector <2 x float> [[TMP26]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP30:%.*]] = shufflevector <8 x float> [[TMP28]], <8 x float> [[TMP29]], <8 x i32> <i32 0, i32 8, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP31:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP32:%.*]] = shufflevector <8 x float> [[TMP30]], <8 x float> [[TMP31]], <8 x i32> <i32 0, i32 1, i32 8, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[TMP22:%.*]] = load <2 x float>, ptr [[TMP6]], align 4, !tbaa [[SHORT_TBAA0]]
+; AVX-NEXT:    [[TMP29:%.*]] = call <2 x float> @llvm.experimental.vp.strided.load.v2f32.p0.i64(ptr align 4 [[TMP5]], i64 8, <2 x i1> splat (i1 true), i32 2), !tbaa [[SHORT_TBAA0]]
+; AVX-NEXT:    [[TMP27:%.*]] = call <2 x float> @llvm.experimental.vp.strided.load.v2f32.p0.i64(ptr align 4 [[TMP7]], i64 120, <2 x i1> splat (i1 true), i32 2), !tbaa [[SHORT_TBAA0]]
+; AVX-NEXT:    [[TMP25:%.*]] = call <2 x float> @llvm.experimental.vp.strided.load.v2f32.p0.i64(ptr align 4 [[TMP17]], i64 20, <2 x i1> splat (i1 true), i32 2), !tbaa [[SHORT_TBAA0]]
+; AVX-NEXT:    [[TMP26:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> [[TMP22]], <8 x i32> <i32 poison, i32 1, i32 2, i32 poison, i32 poison, i32 poison, i32 0, i32 poison>
+; AVX-NEXT:    [[TMP30:%.*]] = insertelement <8 x float> [[TMP26]], float [[TMP3]], i32 0
 ; AVX-NEXT:    [[TMP33:%.*]] = shufflevector <2 x float> [[TMP27]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP34:%.*]] = shufflevector <8 x float> [[TMP32]], <8 x float> [[TMP33]], <8 x i32> <i32 0, i32 1, i32 2, i32 9, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[TMP34:%.*]] = shufflevector <8 x float> [[TMP30]], <8 x float> [[TMP33]], <8 x i32> <i32 0, i32 1, i32 2, i32 8, i32 4, i32 5, i32 6, i32 7>
 ; AVX-NEXT:    [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP10]], i32 4
 ; AVX-NEXT:    [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP14]], i32 5
-; AVX-NEXT:    [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP18]], i32 6
-; AVX-NEXT:    [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP22]], i32 7
-; AVX-NEXT:    [[TMP39:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> [[TMP27]], <8 x i32> <i32 1, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP40:%.*]] = shufflevector <8 x float> [[TMP39]], <8 x float> [[TMP29]], <8 x i32> <i32 0, i32 1, i32 9, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP8]], i32 3
+; AVX-NEXT:    [[TMP38:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP28]], i32 7
+; AVX-NEXT:    [[TMP37:%.*]] = shufflevector <2 x float> [[TMP22]], <2 x float> [[TMP29]], <8 x i32> <i32 1, i32 3, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[TMP41:%.*]] = shufflevector <8 x float> [[TMP37]], <8 x float> [[TMP33]], <8 x i32> <i32 0, i32 1, i32 2, i32 9, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX-NEXT:    [[TMP42:%.*]] = insertelement <8 x float> [[TMP41]], float [[TMP12]], i32 4
 ; AVX-NEXT:    [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP16]], i32 5
 ; AVX-NEXT:    [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP20]], i32 6
@@ -618,11 +621,9 @@ define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture rea
 ; AVX2-LABEL: define void @gather_load_div(
 ; AVX2-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0]] {
 ; AVX2-NEXT:    [[TMP3:%.*]] = load float, ptr [[TMP1]], align 4, !tbaa [[SHORT_TBAA0]]
-; AVX2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 10
-; AVX2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 13
 ; AVX2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 3
-; AVX2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 44
-; AVX2-NEXT:    [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4, !tbaa [[SHORT_TBAA0]]
+; AVX2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 11
+; AVX2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 14
 ; AVX2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 17
 ; AVX2-NEXT:    [[TMP10:%.*]] = load float, ptr [[TMP9]], align 4, !tbaa [[SHORT_TBAA0]]
 ; AVX2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 33
@@ -632,30 +633,25 @@ define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture rea
 ; AVX2-NEXT:    [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 30
 ; AVX2-NEXT:    [[TMP16:%.*]] = load float, ptr [[TMP15]], align 4, !tbaa [[SHORT_TBAA0]]
 ; AVX2-NEXT:    [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 5
-; AVX2-NEXT:    [[TMP18:%.*]] = load float, ptr [[TMP17]], align 4, !tbaa [[SHORT_TBAA0]]
 ; AVX2-NEXT:    [[TMP19:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 27
 ; AVX2-NEXT:    [[TMP20:%.*]] = load float, ptr [[TMP19]], align 4, !tbaa [[SHORT_TBAA0]]
 ; AVX2-NEXT:    [[TMP21:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 20
-; AVX2-NEXT:    [[TMP22:%.*]] = load float, ptr [[TMP21]], align 4, !tbaa [[SHORT_TBAA0]]
+; AVX2-NEXT:    [[TMP28:%.*]] = load float, ptr [[TMP21]], align 4, !tbaa [[SHORT_TBAA0]]
 ; AVX2-NEXT:    [[TMP23:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 23
 ; AVX2-NEXT:    [[TMP24:%.*]] = load float, ptr [[TMP23]], align 4, !tbaa [[SHORT_TBAA0]]
-; AVX2-NEXT:    [[TMP25:%.*]] = load <2 x float>, ptr [[TMP6]], align 4, !tbaa [[SHORT_TBAA0]]
-; AVX2-NEXT:    [[TMP26:%.*]] = load <2 x float>, ptr [[TMP4]], align 4, !tbaa [[SHORT_TBAA0]]
-; AVX2-NEXT:    [[TMP27:%.*]] = load <2 x float>, ptr [[TMP5]], align 4, !tbaa [[SHORT_TBAA0]]
-; AVX2-NEXT:    [[TMP28:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i32 0
-; AVX2-NEXT:    [[TMP29:%.*]] = shufflevector <2 x float> [[TMP26]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:    [[TMP30:%.*]] = shufflevector <8 x float> [[TMP28]], <8 x float> [[TMP29]], <8 x i32> <i32 0, i32 8, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:    [[TMP31:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:    [[TMP32:%.*]] = shufflevector <8 x float> [[TMP30]], <8 x float> [[TMP31]], <8 x i32> <i32 0, i32 1, i32 8, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[TMP22:%.*]] = load <2 x float>, ptr [[TMP6]], align 4, !tbaa [[SHORT_TBAA0]]
+; AVX2-NEXT:    [[TMP29:%.*]] = call <2 x float> @llvm.experimental.vp.strided.load.v2f32.p0.i64(ptr align 4 [[TMP5]], i64 8, <2 x i1> splat (i1 true), i32 2), !tbaa [[SHORT_TBAA0]]
+; AVX2-NEXT:    [[TMP27:%.*]] = call <2 x float> @llvm.experimental.vp.strided.load.v2f32.p0.i64(ptr align 4 [[TMP7]], i64 120, <2 x i1> splat (i1 true), i32 2), !tbaa [[SHORT_TBAA0]]
+; AVX2-NEXT:    [[TMP25:%.*]] = call <2 x float> @llvm.experimental.vp.strided.load.v2f32.p0.i64(ptr align 4 [[TMP17]], i64 20, <2 x i1> splat (i1 true), i32 2), !tbaa [[SHORT_TBAA0]]
+; AVX2-NEXT:    [[TMP26:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> [[TMP22]], <8 x i32> <i32 poison, i32 1, i32 2, i32 poison, i32 poison, i32 poison, i32 0, i32 poison>
+; AVX2-NEXT:    [[TMP30:%.*]] = insertelement <8 x float> [[TMP26]], float [[TMP3]], i32 0
 ; AVX2-NEXT:    [[TMP33:%.*]] = shufflevector <2 x float> [[TMP27]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:    [[TMP34:%.*]] = shufflevector <8 x float> [[TMP32]], <8 x float> [[TMP33]], <8 x i32> <i32 0, i32 1, i32 2, i32 9, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[TMP34:%.*]] = shufflevector <8 x float> [[TMP30]], <8 x float> [[TMP33]], <8 x i32> <i32 0, i32 1, i32 2, i32 8, i32 4, i32 5, i32 6, i32 7>
 ; AVX2-NEXT:    [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP10]], i32 4
 ; AVX2-NEXT:    [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP14]], i32 5
-; AVX2-NEXT:    [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP18]], i32 6
-; AVX2-NEXT:    [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP22]], i32 7
-; AVX2-NEXT:    [[TMP39:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> [[TMP27]], <8 x i32> <i32 1, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:    [[TMP40:%.*]] = shufflevector <8 x float> [[TMP39]], <8 x float> [[TMP29]], <8 x i32> <i32 0, i32 1, i32 9, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:    [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP8]], i32 3
+; AVX2-NEXT:    [[TMP38:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP28]], i32 7
+; AVX2-NEXT:    [[TMP37:%.*]] = shufflevector <2 x float> [[TMP22]], <2 x float> [[TMP29]], <8 x i32> <i32 1, i32 3, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[TMP41:%.*]] = shufflevector <8 x float> [[TMP37]], <8 x float> [[TMP33]], <8 x i32> <i32 0, i32 1, i32 2, i32 9, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX2-NEXT:    [[TMP42:%.*]] = insertelement <8 x float> [[TMP41]], float [[TMP12]], i32 4
 ; AVX2-NEXT:    [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP16]], i32 5
 ; AVX2-NEXT:    [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP20]], i32 6
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll
index cf380f04a6939..f651fa53c53c4 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll
@@ -153,36 +153,46 @@ define void @gather_load_2(ptr noalias nocapture %0, ptr noalias nocapture reado
 ; AVX-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0]] {
 ; AVX-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1
 ; AVX-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[SHORT_TBAA0]]
+; AVX-NEXT:    [[TMP11:%.*]] = add nsw i32 [[TMP4]], 1
+; AVX-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 1
+; AVX-NEXT:    store i32 [[TMP11]], ptr [[TMP0]], align 4, !tbaa [[SHORT_TBAA0]]
 ; AVX-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 10
-; AVX-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[SHORT_TBAA0]]
+; AVX-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[SHORT_TBAA0]]
+; AVX-NEXT:    [[TMP15:%.*]] = add nsw i32 [[TMP8]], 2
+; AVX-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2
+; AVX-NEXT:    store i32 [[TMP15]], ptr [[TMP6]], align 4, !tbaa [[SHORT_TBAA0]]
 ; AVX-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 3
-; AVX-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[SHORT_TBAA0]]
+; AVX-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[SHORT_TBAA0]]
+; AVX-NEXT:    [[TMP13:%.*]] = add nsw i32 [[TMP12]], 3
+; AVX-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 3
+; AVX-NEXT:    store i32 [[TMP13]], ptr [[TMP10]], align 4, !tbaa [[SHORT_TBAA0]]
 ; AVX-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 5
-; AVX-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[SHORT_TBAA0]]
-; AVX-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0
-; AVX-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i32 1
-; AVX-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i32 2
-; AVX-NEXT:    [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i32 3
-; AVX-NEXT:    [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], <i32 1, i32 2, i32 3, i32 4>
-; AVX-NEXT:    store <4 x i32> [[TMP15]], ptr [[TMP0]], align 4, !tbaa [[SHORT_TBAA0]]
+; AVX-NEXT:    [[TMP16:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[SHORT_TBAA0]]
+; AVX-NEXT:    [[TMP17:%.*]] = add nsw i32 [[TMP16]], 4
+; AVX-NEXT:    store i32 [[TMP17]], ptr [[TMP14]], align 4, !tbaa [[SHORT_TBAA0]]
 ; AVX-NEXT:    ret void
 ;
 ; AVX2-LABEL: define void @gather_load_2(
 ; AVX2-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0]] {
 ; AVX2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1
 ; AVX2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[SHORT_TBAA0]]
+; AVX2-NEXT:    [[TMP11:%.*]] = add nsw i32 [[TMP4]], 1
+; AVX2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 1
+; AVX2-NEXT:    store i32 [[TMP11]], ptr [[TMP0]], align 4, !tbaa [[SHORT_TBAA0]]
 ; AVX2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 10
-; AVX2-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[SHORT_TBAA0]]
+; AVX2-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[SHORT_TBAA0]]
+; AVX2-NEXT:    [[TMP15:%.*]] = add nsw i32 [[TMP8]], 2
+; AVX2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2
+; AVX2-NEXT:    store i32 [[TMP15]], ptr [[TMP6]], align 4, !tbaa [[SHORT_TBAA0]]
 ; AVX2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 3
-; AVX2-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[SHORT_TBAA0]]
+; AVX2-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[SHORT_TBAA0]]
+; AVX2-NEXT:    [[TMP13:%.*]] = add nsw i32 [[TMP12]], 3
+; AVX2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 3
+; AVX2-NEXT:    store i32 [[TMP13]], ptr [[TMP10]], align 4, !tbaa [[SHORT_TBAA0]]
 ; AVX2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 5
-; AVX2-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[SHORT_TBAA0]]
-; AVX2-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0
-; AVX2-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i32 1
-; AVX2-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i32 2
-; AVX2-NEXT:    [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i32 3
-; AVX2-NEXT:    [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], <i32 1, i32 2, i32 3, i32 4>
-; AVX2-NEXT:    store <4 x i32> [[TMP15]], ptr [[TMP0]], align 4, !tbaa [[SHORT_TBAA0]]
+; AVX2-NEXT:    [[TMP16:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[SHORT_TBAA0]]
+; AVX2-NEXT:    [[TMP17:%.*]] = add nsw i32 [[TMP16]], 4
+; AVX2-NEXT:    store i32 [[TMP17]], ptr [[TMP14]], align 4, !tbaa [[SHORT_TBAA0]]
 ; AVX2-NEXT:    ret void
 ;
 ; AVX512F-LABEL: define void @gather_load_2(
@@ -569,11 +579,9 @@ define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture rea
 ; AVX-LABEL: define void @gather_load_div(
 ; AVX-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0]] {
 ; AVX-NEXT:    [[TMP3:%.*]] = load float, ptr [[TMP1]], align 4, !tbaa [[SHORT_TBAA0]]
-; AVX-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 10
-; AVX-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 13
 ; AVX-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 3
-; AVX-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 44
-; AVX-NEXT:    [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4, !tbaa [[SHORT_TBAA0]]
+; AVX-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 11
+; AVX-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 14
 ; AVX-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 17
 ; AVX-NEXT:    [[TMP10:%.*]] = load float, ptr [[TMP9]], align 4, !tbaa [[SHORT_TBAA0]]
 ; AVX-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 33
@@ -583,30 +591,25 @@ define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture rea
 ; AVX-NEXT:    [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 30
 ; AVX-NEXT:    [[TMP16:%.*]] = load float, ptr [[TMP15]], align 4, !tbaa [[SHORT_TBAA0]]
 ; AVX-NEXT:    [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 5
-; AVX-NEXT:    [[TMP18:%.*]] = load float, ptr [[TMP17]], align 4, !tbaa [[SHORT_TBAA0]]
 ; AVX-NEXT:    [[TMP19:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 27
 ; AVX-NEXT:    [[TMP20:%.*]] = load float, ptr [[TMP19]], align 4, !tbaa [[SHORT_TBAA0]]
 ; AVX-NEXT:    [[TMP21:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 20
-; AVX-NEXT:    [[TMP22:%.*]] = load float, ptr [[TMP21]], align 4, !tbaa [[SHORT_TBAA0]]
+; AVX-NEXT:    [[TMP28:%.*]] = load float, ptr [[TMP21]], align 4, !tbaa [[SHORT_TBAA0]]
 ; AVX-NEXT:    [[TMP23:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 23
 ; AVX-NEXT:    [[TMP24:%.*]] = load float, ptr [[TMP23]], align 4, !tbaa [[SHORT_TBAA0]]
-; AVX-NEXT:    [[TMP25:%.*]] = load <2 x float>, ptr [[TMP6]], align 4, !tbaa [[SHORT_TBAA0]]
-; AVX-NEXT:    [[TMP26:%.*]] = load <2 x float>, ptr [[TMP4]], align 4, !tbaa [[SHORT_TBAA0]]
-; AVX-NEXT:    [[TMP27:%.*]] = load <2 x float>, ptr [[TMP5]], align 4, !tbaa [[SHORT_TBAA0]]
-; AVX-NEXT:    [[TMP28:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i32 0
-; AVX-NEXT:    [[TMP29:%.*]] = shufflevector <2 x float> [[TMP26]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP30:%.*]] = shufflevector <8 x float> [[TMP28]], <8 x float> [[TMP29]], <8 x i32> <i32 0, i32 8, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP31:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP32:%.*]] = shufflevector <8 x float> [[TMP30]], <8 x float> [[TMP31]], <8 x i32> <i32 0, i32 1, i32 8, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[TMP22:%.*]] = load <2 x float>, ptr [[TMP6]], align 4, !tbaa [[SHORT_TBAA0]]
+; AVX-NEXT:    [[TMP29:%.*]] = call <2 x float> @llvm.experimental.vp.strided.load.v2f32.p0.i64(ptr align 4 [[TMP5]], i64 8, <2 x i1> splat (i1 true), i32 2), !tbaa [[SHORT_TBAA0]]
+; AVX-NEXT:    [[TMP27:%.*]] = call <2 x float> @llvm.experimental.vp.strided.load.v2f32.p0.i64(ptr align 4 [[TMP7]], i64 120, <2 x i1> splat (i1 true), i32 2), !tbaa [[SHORT_TBAA0]]
+; AVX-NEXT:    [[TMP25:%.*]] = call <2 x float> @llvm.experimental.vp.strided.load.v2f32.p0.i64(ptr align 4 [[TMP17]], i64 20, <2 x i1> splat (i1 true), i32 2), !tbaa [[SHORT_TBAA0]]
+; AVX-NEXT:    [[TMP26:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> [[TMP22]], <8 x i32> <i32 poison, i32 1, i32 2, i32 poison, i32 poison, i32 poison, i32 0, i32 poison>
+; AVX-NEXT:    [[TMP30:%.*]] = insertelement <8 x float> [[TMP26]], float [[TMP3]], i32 0
 ; AVX-NEXT:    [[TMP33:%.*]] = shufflevector <2 x float> [[TMP27]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP34:%.*]] = shufflevector <8 x float> [[TMP32]], <8 x float> [[TMP33]], <8 x i32> <i32 0, i32 1, i32 2, i32 9, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[TMP34:%.*]] = shufflevector <8 x float> [[TMP30]], <8 x float> [[TMP33]], <8 x i32> <i32 0, i32 1, i32 2, i32 8, i32 4, i32 5, i32 6, i32 7>
 ; AVX-NEXT:    [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP10]], i32 4
 ; AVX-NEXT:    [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP14]], i32 5
-; AVX-NEXT:    [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP18]], i32 6
-; AVX-NEXT:    [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP22]], i32 7
-; AVX-NEXT:    [[TMP39:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> [[TMP27]], <8 x i32> <i32 1, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP40:%.*]] = shufflevector <8 x float> [[TMP39]], <8 x float> [[TMP29]], <8 x i32> <i32 0, i32 1, i32 9, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP8]], i32 3
+; AVX-NEXT:    [[TMP38:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP28]], i32 7
+; AVX-NEXT:    [[TMP37:%.*]] = shufflevector <2 x float> [[TMP22]], <2 x float> [[TMP29]], <8 x i32> <i32 1, i32 3, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[TMP41:%.*]] = shufflevector <8 x float> [[TMP37]], <8 x float> [[TMP33]], <8 x i32> <i32 0, i32 1, i32 2, i32 9, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX-NEXT:    [[TMP42:%.*]] = insertelement <8 x float> [[TMP41]], float [[TMP12]], i32 4
 ; AVX-NEXT:    [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP16]], i32 5
 ; AVX-NEXT:    [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP20]], i32 6
@@ -618,11 +621,9 @@ define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture rea
 ; AVX2-LABEL: define void @gather_load_div(
 ; AVX2-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0]] {
 ; AVX2-NEXT:    [[TMP3:%.*]] = load float, ptr [[TMP1]], align 4, !tbaa [[SHORT_TBAA0]]
-; AVX2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 10
-; AVX2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 13
 ; AVX2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 3
-; AVX2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 44
-; AVX2-NEXT:    [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4, !tbaa [[SHORT_TBAA0]]
+; AVX2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 11
+; AVX2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 14
 ; AVX2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 17
 ; AVX2-NEXT:    [[TMP10:%.*]] = load float, ptr [[TMP9]], align 4, !tbaa [[SHORT_TBAA0]]
 ; AVX2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 33
@@ -632,30 +633,25 @@ define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture rea
 ; AVX2-NEXT:    [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 30
 ; AVX2-NEXT:    [[TMP16:%.*]] = load float, ptr [[TMP15]], align 4, !tbaa [[SHORT_TBAA0]]
 ; AVX2-NEXT:    [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 5
-; AVX2-NEXT:    [[TMP18:%.*]] = load float, ptr [[TMP17]], align 4, !tbaa [[SHORT_TBAA0]]
 ; AVX2-NEXT:    [[TMP19:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 27
 ; AVX2-NEXT:    [[TMP20:%.*]] = load float, ptr [[TMP19]], align 4, !tbaa [[SHORT_TBAA0]]
 ; AVX2-NEXT:    [[TMP21:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 20
-; AVX2-NEXT:    [[TMP22:%.*]] = load float, ptr [[TMP21]], align 4, !tbaa [[SHORT_TBAA0]]
+; AVX2-NEXT:    [[TMP28:%.*]] = load float, ptr [[TMP21]], align 4, !tbaa [[SHORT_TBAA0]]
 ; AVX2-NEXT:    [[TMP23:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 23
 ; AVX2-NEXT:    [[TMP24:%.*]] = load float, ptr [[TMP23]], align 4, !tbaa [[SHORT_TBAA0]]
-; AVX2-NEXT:    [[TMP25:%.*]] = load <2 x float>, ptr [[TMP6]], align 4, !tbaa [[SHORT_TBAA0]]
-; AVX2-NEXT:    [[TMP26:%.*]] = load <2 x float>, ptr [[TMP4]], align 4, !tbaa [[SHORT_TBAA0]]
-; AVX2-NEXT:    [[TMP27:%.*]] = load <2 x float>, ptr [[TMP5]], align 4, !tbaa [[SHORT_TBAA0]]
-; AVX2-NEXT:    [[TMP28:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i32 0
-; AVX2-NEXT:    [[TMP29:%.*]] = shufflevector <2 x float> [[TMP26]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:    [[TMP30:%.*]] = shufflevector <8 x float> [[TMP28]], <8 x float> [[TMP29]], <8 x i32> <i32 0, i32 8, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:    [[TMP31:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:    [[TMP32:%.*]] = shufflevector <8 x float> [[TMP30]], <8 x float> [[TMP31]], <8 x i32> <i32 0, i32 1, i32 8, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[TMP22:%.*]] = load <2 x float>, ptr [[TMP6]], align 4, !tbaa [[SHORT_TBAA0]]
+; AVX2-NEXT:    [[TMP29:%.*]] = call <2 x float> @llvm.experimental.vp.strided.load.v2f32.p0.i64(ptr align 4 [[TMP5]], i64 8, <2 x i1> splat (i1 true), i32 2), !tbaa [[SHORT_TBAA0]]
+; AVX2-NEXT:    [[TMP27:%.*]] = call <2 x float> @llvm.experimental.vp.strided.load.v2f32.p0.i64(ptr align 4 [[TMP7]], i64 120, <2 x i1> splat (i1 true), i32 2), !tbaa [[SHORT_TBAA0]]
+; AVX2-NEXT:    [[TMP25:%.*]] = call <2 x float> @llvm.experimental.vp.strided.load.v2f32.p0.i64(ptr align 4 [[TMP17]], i64 20, <2 x i1> splat (i1 true), i32 2), !tbaa [[SHORT_TBAA0]]
+; AVX2-NEXT:    [[TMP26:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> [[TMP22]], <8 x i32> <i32 poison, i32 1, i32 2, i32 poison, i32 poison, i32 poison, i32 0, i32 poison>
+; AVX2-NEXT:    [[TMP30:%.*]] = insertelement <8 x float> [[TMP26]], float [[TMP3]], i32 0
 ; AVX2-NEXT:    [[TMP33:%.*]] = shufflevector <2 x float> [[TMP27]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:    [[TMP34:%.*]] = shufflevector <8 x float> [[TMP32]], <8 x float> [[TMP33]], <8 x i32> <i32 0, i32 1, i32 2, i32 9, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[TMP34:%.*]] = shufflevector <8 x float> [[TMP30]], <8 x float> [[TMP33]], <8 x i32> <i32 0, i32 1, i32 2, i32 8, i32 4, i32 5, i32 6, i32 7>
 ; AVX2-NEXT:    [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP10]], i32 4
 ; AVX2-NEXT:    [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP14]], i32 5
-; AVX2-NEXT:    [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP18]], i32 6
-; AVX2-NEXT:    [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP22]], i32 7
-; AVX2-NEXT:    [[TMP39:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> [[TMP27]], <8 x i32> <i32 1, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:    [[TMP40:%.*]] = shufflevector <8 x float> [[TMP39]], <8 x float> [[TMP29]], <8 x i32> <i32 0, i32 1, i32 9, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:    [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP8]], i32 3
+; AVX2-NEXT:    [[TMP38:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP28]], i32 7
+; AVX2-NEXT:    [[TMP37:%.*]] = shufflevector <2 x float> [[TMP22]], <2 x float> [[TMP29]], <8 x i32> <i32 1, i32 3, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[TMP41:%.*]] = shufflevector <8 x float> [[TMP37]], <8 x float> [[TMP33]], <8 x i32> <i32 0, i32 1, i32 2, i32 9, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX2-NEXT:    [[TMP42:%.*]] = insertelement <8 x float> [[TMP41]], float [[TMP12]], i32 4
 ; AVX2-NEXT:    [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP16]], i32 5
 ; AVX2-NEXT:    [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP20]], i32 6
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll b/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll
index f921278cdecf3..5bf3783034190 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll
@@ -11,23 +11,62 @@ define void @test(ptr nocapture readonly %arg, ptr nocapture readonly %arg1, ptr
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[GEP1_0:%.*]] = getelementptr inbounds double, ptr [[ARG:%.*]], i64 1
-; CHECK-NEXT:    [[GEP2_0:%.*]] = getelementptr inbounds double, ptr [[ARG1:%.*]], i64 16
-; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x double>, ptr [[GEP2_0]], align 8
-; CHECK-NEXT:    [[GEP2_4:%.*]] = getelementptr inbounds double, ptr [[ARG1]], i64 20
-; CHECK-NEXT:    [[TMP1:%.*]] = call <15 x double> @llvm.masked.load.v15f64.p0(ptr [[GEP1_0]], i32 8, <15 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true>, <15 x double> poison)
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <15 x double> [[TMP1]], <15 x double> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x double>, ptr [[ARG1]], align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = fmul fast <8 x double> [[TMP3]], [[TMP2]]
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <8 x double> [[TMP2]], <8 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast <4 x double> [[TMP0]], [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = call fast double @llvm.vector.reduce.fadd.v8f64(double 0.000000e+00, <8 x double> [[TMP4]])
-; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x double>, ptr [[GEP2_4]], align 8
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <15 x double> [[TMP1]], <15 x double> poison, <4 x i32> <i32 8, i32 10, i32 12, i32 14>
-; CHECK-NEXT:    [[TMP10:%.*]] = fmul fast <4 x double> [[TMP8]], [[TMP9]]
-; CHECK-NEXT:    [[RDX_OP:%.*]] = fadd fast <4 x double> [[TMP6]], [[TMP10]]
-; CHECK-NEXT:    [[TMP11:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[RDX_OP]])
-; CHECK-NEXT:    [[I142:%.*]] = insertelement <2 x double> poison, double [[TMP7]], i64 0
-; CHECK-NEXT:    [[I143:%.*]] = insertelement <2 x double> [[I142]], double [[TMP11]], i64 1
+; CHECK-NEXT:    [[LD1_0:%.*]] = load double, ptr [[GEP1_0]], align 8
+; CHECK-NEXT:    [[GEP1_1:%.*]] = getelementptr inbounds double, ptr [[ARG]], i64 3
+; CHECK-NEXT:    [[LD1_1:%.*]] = load double, ptr [[GEP1_1]], align 8
+; CHECK-NEXT:    [[GEP1_2:%.*]] = getelementptr inbounds double, ptr [[ARG]], i64 5
+; CHECK-NEXT:    [[LD1_2:%.*]] = load double, ptr [[GEP1_2]], align 8
+; CHECK-NEXT:    [[GEP1_3:%.*]] = getelementptr inbounds double, ptr [[ARG]], i64 7
+; CHECK-NEXT:    [[LD1_3:%.*]] = load double, ptr [[GEP1_3]], align 8
+; CHECK-NEXT:    [[GEP1_4:%.*]] = getelementptr inbounds double, ptr [[ARG]], i64 9
+; CHECK-NEXT:    [[LD1_4:%.*]] = load double, ptr [[GEP1_4]], align 8
+; CHECK-NEXT:    [[GEP1_5:%.*]] = getelementptr inbounds double, ptr [[ARG]], i64 11
+; CHECK-NEXT:    [[LD1_5:%.*]] = load double, ptr [[GEP1_5]], align 8
+; CHECK-NEXT:    [[GEP1_6:%.*]] = getelementptr inbounds double, ptr [[ARG]], i64 13
+; CHECK-NEXT:    [[LD1_6:%.*]] = load double, ptr [[GEP1_6]], align 8
+; CHECK-NEXT:    [[GEP1_7:%.*]] = getelementptr inbounds double, ptr [[ARG]], i64 15
+; CHECK-NEXT:    [[LD1_7:%.*]] = load double, ptr [[GEP1_7]], align 8
+; CHECK-NEXT:    [[TMP0:%.*]] = call <24 x double> @llvm.masked.load.v24f64.p0(ptr [[ARG1:%.*]], i32 8, <24 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <24 x double> poison)
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <24 x double> [[TMP0]], <24 x double> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <24 x double> [[TMP0]], <24 x double> poison, <2 x i32> <i32 0, i32 16>
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> poison, double [[LD1_0]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = fmul fast <2 x double> [[TMP2]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <24 x double> [[TMP0]], <24 x double> poison, <2 x i32> <i32 1, i32 17>
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x double> poison, double [[LD1_1]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast <2 x double> [[TMP6]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = fadd fast <2 x double> [[TMP5]], [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <24 x double> [[TMP0]], <24 x double> poison, <2 x i32> <i32 2, i32 18>
+; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <2 x double> poison, double [[LD1_2]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <2 x double> [[TMP12]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP14:%.*]] = fmul fast <2 x double> [[TMP11]], [[TMP13]]
+; CHECK-NEXT:    [[TMP15:%.*]] = fadd fast <2 x double> [[TMP10]], [[TMP14]]
+; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <24 x double> [[TMP0]], <24 x double> poison, <2 x i32> <i32 3, i32 19>
+; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <2 x double> poison, double [[LD1_3]], i32 0
+; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <2 x double> [[TMP17]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP19:%.*]] = fmul fast <2 x double> [[TMP16]], [[TMP18]]
+; CHECK-NEXT:    [[TMP20:%.*]] = fadd fast <2 x double> [[TMP15]], [[TMP19]]
+; CHECK-NEXT:    [[TMP21:%.*]] = shufflevector <24 x double> [[TMP0]], <24 x double> poison, <2 x i32> <i32 4, i32 20>
+; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <2 x double> poison, double [[LD1_4]], i32 0
+; CHECK-NEXT:    [[TMP23:%.*]] = shufflevector <2 x double> [[TMP22]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP24:%.*]] = fmul fast <2 x double> [[TMP21]], [[TMP23]]
+; CHECK-NEXT:    [[TMP25:%.*]] = fadd fast <2 x double> [[TMP20]], [[TMP24]]
+; CHECK-NEXT:    [[TMP26:%.*]] = shufflevector <24 x double> [[TMP0]], <24 x double> poison, <2 x i32> <i32 5, i32 21>
+; CHECK-NEXT:    [[TMP27:%.*]] = insertelement <2 x double> poison, double [[LD1_5]], i32 0
+; CHECK-NEXT:    [[TMP28:%.*]] = shufflevector <2 x double> [[TMP27]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP29:%.*]] = fmul fast <2 x double> [[TMP26]], [[TMP28]]
+; CHECK-NEXT:    [[TMP30:%.*]] = fadd fast <2 x double> [[TMP25]], [[TMP29]]
+; CHECK-NEXT:    [[TMP31:%.*]] = shufflevector <24 x double> [[TMP0]], <24 x double> poison, <2 x i32> <i32 6, i32 22>
+; CHECK-NEXT:    [[TMP32:%.*]] = insertelement <2 x double> poison, double [[LD1_6]], i32 0
+; CHECK-NEXT:    [[TMP33:%.*]] = shufflevector <2 x double> [[TMP32]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP34:%.*]] = fmul fast <2 x double> [[TMP31]], [[TMP33]]
+; CHECK-NEXT:    [[TMP35:%.*]] = fadd fast <2 x double> [[TMP30]], [[TMP34]]
+; CHECK-NEXT:    [[TMP36:%.*]] = shufflevector <24 x double> [[TMP0]], <24 x double> poison, <2 x i32> <i32 7, i32 23>
+; CHECK-NEXT:    [[TMP37:%.*]] = insertelement <2 x double> poison, double [[LD1_7]], i32 0
+; CHECK-NEXT:    [[TMP38:%.*]] = shufflevector <2 x double> [[TMP37]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP39:%.*]] = fmul fast <2 x double> [[TMP36]], [[TMP38]]
+; CHECK-NEXT:    [[I143:%.*]] = fadd fast <2 x double> [[TMP35]], [[TMP39]]
 ; CHECK-NEXT:    [[P:%.*]] = getelementptr inbounds double, ptr [[ARG2:%.*]], <2 x i64> <i64 0, i64 16>
 ; CHECK-NEXT:    call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> [[I143]], <2 x ptr> [[P]], i32 8, <2 x i1> splat (i1 true))
 ; CHECK-NEXT:    ret void
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder-possible-strided-node.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder-possible-strided-node.ll
index 19ce11c457f63..bd24093218874 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reorder-possible-strided-node.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder-possible-strided-node.ll
@@ -5,18 +5,15 @@ define void @test() {
 ; CHECK-LABEL: define void @test(
 ; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr i32, ptr null, i64 1
-; CHECK-NEXT:    [[TMP9:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr i32, ptr null, i64 33
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX13]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX22:%.*]] = getelementptr i32, ptr null, i64 60
+; CHECK-NEXT:    [[TMP0:%.*]] = call <2 x i32> @llvm.experimental.vp.strided.load.v2i32.p0.i64(ptr align 4 [[ARRAYIDX13]], i64 -128, <2 x i1> splat (i1 true), i32 2)
 ; CHECK-NEXT:    [[TMP8:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr null, i32 4, <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>, <8 x i32> poison)
 ; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> poison, <2 x i32> <i32 0, i32 7>
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX22]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 2
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP9]], i32 3
 ; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> [[TMP12]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> [[TMP0]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    [[TMP3:%.*]] = mul <4 x i32> [[TMP1]], [[TMP7]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = ashr <4 x i32> [[TMP3]], zeroinitializer
 ; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@@ -66,18 +63,15 @@ define void @test1() {
 ; CHECK-LABEL: define void @test1(
 ; CHECK-SAME: ) #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr i32, ptr null, i64 1
-; CHECK-NEXT:    [[TMP9:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr i32, ptr null, i64 33
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX13]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX22:%.*]] = getelementptr i32, ptr null, i64 60
+; CHECK-NEXT:    [[TMP0:%.*]] = call <2 x i32> @llvm.experimental.vp.strided.load.v2i32.p0.i64(ptr align 4 [[ARRAYIDX13]], i64 -128, <2 x i1> splat (i1 true), i32 2)
 ; CHECK-NEXT:    [[TMP8:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr null, i32 4, <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>, <8 x i32> poison)
 ; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> poison, <2 x i32> <i32 0, i32 7>
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX22]], align 4
-; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 2
-; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP9]], i32 3
 ; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP12]], <4 x i32> [[TMP14]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> [[TMP0]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    [[TMP3:%.*]] = mul <4 x i32> [[TMP1]], [[TMP7]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = sext <4 x i32> [[TMP3]] to <4 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = lshr <4 x i64> [[TMP4]], zeroinitializer
@@ -129,18 +123,15 @@ define void @test_div() {
 ; CHECK-LABEL: define void @test_div(
 ; CHECK-SAME: ) #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr i32, ptr null, i64 1
-; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr i32, ptr null, i64 33
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX13]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX22:%.*]] = getelementptr i32, ptr null, i64 60
+; CHECK-NEXT:    [[TMP0:%.*]] = call <2 x i32> @llvm.experimental.vp.strided.load.v2i32.p0.i64(ptr align 4 [[ARRAYIDX13]], i64 -128, <2 x i1> splat (i1 true), i32 2)
 ; CHECK-NEXT:    [[TMP8:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr null, i32 4, <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>, <8 x i32> poison)
 ; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> poison, <2 x i32> <i32 0, i32 7>
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX22]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 2
-; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP4]], i32 3
 ; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP0]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    [[TMP9:%.*]] = mul <4 x i32> [[TMP1]], [[TMP7]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = udiv <4 x i32> [[TMP9]], <i32 2, i32 1, i32 2, i32 1>
 ; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@@ -190,18 +181,15 @@ define void @test_rem() {
 ; CHECK-LABEL: define void @test_rem(
 ; CHECK-SAME: ) #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr i32, ptr null, i64 1
-; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr i32, ptr null, i64 33
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX13]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX22:%.*]] = getelementptr i32, ptr null, i64 60
+; CHECK-NEXT:    [[TMP0:%.*]] = call <2 x i32> @llvm.experimental.vp.strided.load.v2i32.p0.i64(ptr align 4 [[ARRAYIDX13]], i64 -128, <2 x i1> splat (i1 true), i32 2)
 ; CHECK-NEXT:    [[TMP8:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr null, i32 4, <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>, <8 x i32> poison)
 ; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> poison, <2 x i32> <i32 0, i32 7>
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX22]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 2
-; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP4]], i32 3
 ; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP0]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    [[TMP9:%.*]] = mul <4 x i32> [[TMP1]], [[TMP7]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = urem <4 x i32> [[TMP9]], <i32 1, i32 1, i32 2, i32 1>
 ; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/split-node-num-operands.ll b/llvm/test/Transforms/SLPVectorizer/X86/split-node-num-operands.ll
index 5aa4dba2b8a1b..4deddc138727a 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/split-node-num-operands.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/split-node-num-operands.ll
@@ -13,31 +13,31 @@ define i64 @Foo(ptr align 8 dereferenceable(344) %0, i64 %1) {
 ; CHECK-NEXT:    [[TMP9:%.*]] = load i64, ptr [[TMP0]], align 8
 ; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <2 x i64> poison, i64 [[TMP6]], i32 0
 ; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <2 x i64> [[TMP10]], i64 [[TMP9]], i32 1
-; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <2 x i64> poison, i64 [[TMP7]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <2 x i64> [[TMP12]], i64 [[TMP8]], i32 1
+; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <2 x i64> poison, i64 [[TMP8]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <2 x i64> [[TMP12]], i64 [[TMP7]], i32 1
 ; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <2 x i64> poison, i64 0, i32 0
 ; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <2 x i64> <i64 0, i64 poison>, i64 [[TMP1]], i32 1
 ; CHECK-NEXT:    br label %[[BB16:.*]]
 ; CHECK:       [[BB16]]:
-; CHECK-NEXT:    [[TMP17:%.*]] = phi <2 x i64> [ [[TMP11]], [[TMP2:%.*]] ], [ zeroinitializer, %[[TMP25:.*]] ]
-; CHECK-NEXT:    [[TMP18:%.*]] = phi <2 x i64> [ [[TMP13]], [[TMP2]] ], [ [[TMP29:%.*]], %[[TMP25]] ]
+; CHECK-NEXT:    [[TMP17:%.*]] = phi <2 x i64> [ [[TMP11]], [[TMP2:%.*]] ], [ zeroinitializer, %[[_LOOPEXIT206:.*]] ]
+; CHECK-NEXT:    [[TMP18:%.*]] = phi <2 x i64> [ [[TMP13]], [[TMP2]] ], [ [[TMP29:%.*]], %[[_LOOPEXIT206]] ]
 ; CHECK-NEXT:    switch i32 0, label %[[BB19:.*]] [
-; CHECK-NEXT:      i32 0, label %[[TMP25]]
+; CHECK-NEXT:      i32 0, label %[[_LOOPEXIT206]]
 ; CHECK-NEXT:    ]
 ; CHECK:       [[BB19]]:
-; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <2 x i64> [[TMP18]], <2 x i64> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <2 x i64> [[TMP18]], <2 x i64> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i64> [[TMP20]], i64 0, i32 1
 ; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i64> [[TMP21]], i64 0, i32 2
 ; CHECK-NEXT:    [[TMP23:%.*]] = shufflevector <4 x i64> [[TMP22]], <4 x i64> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
-; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <2 x i64> [[TMP14]], <2 x i64> [[TMP18]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    br label %[[TMP25]]
-; CHECK:       [[TMP25]]:
+; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <2 x i64> [[TMP14]], <2 x i64> [[TMP18]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    br label %[[_LOOPEXIT206]]
+; CHECK:       [[_LOOPEXIT206]]:
 ; CHECK-NEXT:    [[TMP26:%.*]] = phi <2 x i64> [ [[TMP17]], %[[BB19]] ], [ zeroinitializer, %[[BB16]] ]
 ; CHECK-NEXT:    [[TMP27:%.*]] = phi <4 x i64> [ [[TMP23]], %[[BB19]] ], [ zeroinitializer, %[[BB16]] ]
 ; CHECK-NEXT:    [[TMP28:%.*]] = phi <2 x i64> [ [[TMP24]], %[[BB19]] ], [ [[TMP15]], %[[BB16]] ]
-; CHECK-NEXT:    [[TMP29]] = shufflevector <2 x i64> [[TMP18]], <2 x i64> <i64 0, i64 poison>, <2 x i32> <i32 2, i32 1>
+; CHECK-NEXT:    [[TMP29]] = shufflevector <2 x i64> [[TMP18]], <2 x i64> <i64 poison, i64 0>, <2 x i32> <i32 0, i32 3>
 ; CHECK-NEXT:    br i1 false, label %[[DOTLOOPEXIT206:.*]], label %[[BB16]]
-; CHECK:       [[_LOOPEXIT206:.*:]]
+; CHECK:       [[_LOOPEXIT207:.*:]]
 ; CHECK-NEXT:    switch i32 0, label %[[BB32:.*]] [
 ; CHECK-NEXT:      i32 0, [[DOTCONT174:label %.*]]
 ; CHECK-NEXT:      i32 1, label %[[BB30:.*]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/split-vector-operand-with-reuses.ll b/llvm/test/Transforms/SLPVectorizer/X86/split-vector-operand-with-reuses.ll
index 972a58cecc822..ed0782fb5b84d 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/split-vector-operand-with-reuses.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/split-vector-operand-with-reuses.ll
@@ -6,31 +6,33 @@ define void @test(ptr %p) {
 ; CHECK-SAME: ptr [[P:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    [[ARRAYIDX7_US_I_841:%.*]] = getelementptr i8, ptr [[P]], i64 36
+; CHECK-NEXT:    [[ARRAYIDX7_US_I_1051:%.*]] = getelementptr i8, ptr [[P]], i64 44
 ; CHECK-NEXT:    [[ARRAYIDX7_US_I_1261:%.*]] = getelementptr i8, ptr [[P]], i64 52
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[P]], align 4
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX7_US_I_1261]], align 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
+; CHECK-NEXT:    [[TMP4:%.*]] = call <2 x i32> @llvm.experimental.vp.strided.load.v2i32.p0.i64(ptr align 4 [[ARRAYIDX7_US_I_1051]], i64 -44, <2 x i1> splat (i1 true), i32 2)
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
 ; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, <16 x i32> [[TMP19]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX7_US_I_841]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <12 x i32> [[TMP6]], <12 x i32> [[TMP20]], <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <16 x i32> <i32 poison, i32 poison, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, <16 x i32> [[TMP9]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 18, i32 poison, i32 poison, i32 poison, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <16 x i32> [[TMP10]], i32 [[TMP0]], i32 6
-; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <16 x i32> [[TMP11]], <16 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 6, i32 6, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, <16 x i32> [[TMP9]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:    [[TMP13:%.*]] = add <16 x i32> [[TMP3]], [[TMP12]]
 ; CHECK-NEXT:    [[TMP14:%.*]] = srem <16 x i32> [[TMP13]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
-; CHECK-NEXT:    [[TMP15:%.*]] = or <12 x i32> [[TMP7]], zeroinitializer
-; CHECK-NEXT:    [[TMP16:%.*]] = srem <12 x i32> [[TMP15]], <i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP11:%.*]] = load <4 x i32>, ptr [[ARRAYIDX7_US_I_841]], align 4
+; CHECK-NEXT:    [[TMP16:%.*]] = or <4 x i32> [[TMP11]], zeroinitializer
+; CHECK-NEXT:    [[TMP20:%.*]] = srem <4 x i32> [[TMP16]], splat (i32 1)
+; CHECK-NEXT:    [[TMP21:%.*]] = or <8 x i32> [[TMP10]], zeroinitializer
+; CHECK-NEXT:    [[TMP15:%.*]] = srem <8 x i32> [[TMP21]], <i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
 ; CHECK-NEXT:    br label %[[FOR_COND1_PREHEADER_US_I:.*]]
 ; CHECK:       [[FOR_COND1_PREHEADER_US_I]]:
 ; CHECK-NEXT:    [[A_PROMOTED253537_US_I:%.*]] = phi i32 [ [[OP_RDX8:%.*]], %[[FOR_COND1_PREHEADER_US_I]] ], [ 0, %[[ENTRY]] ]
 ; CHECK-NEXT:    [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP14]])
-; CHECK-NEXT:    [[TMP18:%.*]] = call i32 @llvm.vector.reduce.add.v12i32(<12 x i32> [[TMP16]])
+; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <8 x i32> [[TMP15]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[RDX_OP:%.*]] = add <4 x i32> [[TMP22]], [[TMP20]]
+; CHECK-NEXT:    [[TMP23:%.*]] = shufflevector <4 x i32> [[RDX_OP]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <8 x i32> [[TMP15]], <8 x i32> [[TMP23]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP18:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP24]])
 ; CHECK-NEXT:    [[OP_RDX:%.*]] = add i32 [[TMP18]], [[TMP17]]
 ; CHECK-NEXT:    [[OP_RDX8]] = add i32 [[OP_RDX]], 0
 ; CHECK-NEXT:    br label %[[FOR_COND1_PREHEADER_US_I]]

>From 95036d93f5bd798b2f8b5a6bb88326ef0f835bf9 Mon Sep 17 00:00:00 2001
From: Mikhail Gudim <mgudim at ventanamicro.com>
Date: Thu, 2 Oct 2025 10:39:33 -0700
Subject: [PATCH 07/11] update test.

---
 .../Transforms/SLPVectorizer/RISCV/basic-strided-loads.ll | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/basic-strided-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/basic-strided-loads.ll
index 968e61fb5f9e0..12725e7d46273 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/basic-strided-loads.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/basic-strided-loads.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 
-; RUN: opt -mtriple=riscv64 -mattr=+m,+v -passes=slp-vectorizer -S < %s | FileCheck %s
+; RUN: opt -mtriple=riscv64 -mattr=+m,+v,+unaligned-vector-mem -passes=slp-vectorizer -S < %s | FileCheck %s
 
 define void @const_stride_1_no_reordering(ptr %pl, ptr %ps) {
 ; CHECK-LABEL: define void @const_stride_1_no_reordering(
@@ -622,9 +622,9 @@ define void @constant_stride_widen_no_reordering(ptr %pl, i64 %stride, ptr %ps)
 ; CHECK-SAME: ptr [[PL:%.*]], i64 [[STRIDE:%.*]], ptr [[PS:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[GEP_L0:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 0
 ; CHECK-NEXT:    [[GEP_S0:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 0
-; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(ptr align 16 [[GEP_L0]], i64 100, <4 x i1> splat (i1 true), i32 4)
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8>
-; CHECK-NEXT:    store <16 x i8> [[TMP8]], ptr [[GEP_S0]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(ptr align 1 [[GEP_L0]], i64 100, <4 x i1> splat (i1 true), i32 4)
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8>
+; CHECK-NEXT:    store <16 x i8> [[TMP11]], ptr [[GEP_S0]], align 1
 ; CHECK-NEXT:    ret void
 ;
   %gep_l0 = getelementptr inbounds i8, ptr %pl, i64 0

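A note on the intrinsic this test exercises: `@llvm.experimental.vp.strided.load.v4i32.p0.i64(ptr align 1 %gep_l0, i64 100, <4 x i1> splat (i1 true), i32 4)` loads four i32 lanes from byte offsets 0, 100, 200 and 300 past the base pointer. The check lines change from `align 16` to `align 1`, presumably because the widened <4 x i32> load only inherits the 1-byte alignment of the underlying i8 accesses, which is why the RUN line now needs `+unaligned-vector-mem` for the strided load to stay legal. A minimal C++ sketch of the scalar-equivalent semantics for an all-true mask (`stridedLoadI32` is an illustrative name, not an LLVM API):

#include <cstdint>
#include <cstring>
#include <vector>

// Lane I of the strided load reads sizeof(int32_t) bytes from
// Base + I * StrideBytes. The byte stride need not be a multiple of the
// element size or alignment, hence memcpy rather than a typed load.
std::vector<int32_t> stridedLoadI32(const char *Base, int64_t StrideBytes,
                                    unsigned EVL) {
  std::vector<int32_t> Lanes(EVL);
  for (unsigned I = 0; I < EVL; ++I)
    std::memcpy(&Lanes[I], Base + I * StrideBytes, sizeof(int32_t));
  return Lanes;
}
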
>From c2cf69267c5364294bab68b5de57327d50b4238f Mon Sep 17 00:00:00 2001
From: Mikhail Gudim <mgudim at ventanamicro.com>
Date: Wed, 8 Oct 2025 13:47:40 -0700
Subject: [PATCH 08/11] added / improved comments, added `const`s

---
 llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 3328fdc78246f..24b52b8fd313c 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2248,9 +2248,10 @@ class BoUpSLP {
   /// Return true if an array of scalar loads can be replaced with a strided
   ///  load (with constant stride).
   ///
-  ///  It is possible that the load gets "widened". Suppose that originally each
-  ///  load loads `k` bytes and `PointerOps` can be arranged as follows (`%s` is
-  ///  constant): %b + 0 * %s + 0 %b + 0 * %s + 1 %b + 0 * %s + 2
+  ///  It is possible that the load gets "widened". Suppose that originally each load loads `k` bytes and `PointerOps` can be arranged as follows (`%s` is constant):
+  ///  %b + 0 * %s + 0
+  ///  %b + 0 * %s + 1
+  ///  %b + 0 * %s + 2
   ///  ...
   ///  %b + 0 * %s + (w - 1)
   ///
@@ -2272,10 +2273,8 @@ class BoUpSLP {
   /// \param PointerOps list of pointer arguments of loads.
   /// \param ElemTy original scalar type of loads.
   /// \param Alignment alignment of the first load.
-  /// \param SortedIndices is the order of PointerOps as returned by
-  /// `sortPtrAccesses`
-  /// \param Diff Pointer difference between the lowest and the highes pointer
-  /// in `PointerOps` as returned by `getPointersDiff`.
+  /// \param SortedIndices is the order of PointerOps as returned by `sortPtrAccesses`
+  /// \param Diff Pointer difference between the lowest and the highest pointer in `PointerOps` as returned by `getPointersDiff`.
   /// \param Ptr0 first pointer in `PointerOps`.
   /// \param PtrN last pointer in `PointerOps`.
   /// \param SPtrInfo If the function returns `true`, it also sets all the fields
@@ -6921,7 +6920,7 @@ bool BoUpSLP::isStridedLoad(ArrayRef<Value *> PointerOps, Type *ScalarTy,
 
 bool BoUpSLP::analyzeConstantStrideCandidate(
     ArrayRef<Value *> PointerOps, Type *ElemTy, Align Alignment,
-    const SmallVectorImpl<unsigned> &SortedIndices, int64_t Diff, Value *Ptr0,
+    const SmallVectorImpl<unsigned> &SortedIndices, const int64_t Diff, Value *Ptr0,
     Value *PtrN, StridedPtrInfo &SPtrInfo) const {
   const unsigned Sz = PointerOps.size();
   SmallVector<int64_t> SortedOffsetsFromBase;

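To make the widening pattern in the updated doc comment concrete: the sorted offsets split into `n` groups of `w` elements each, with one constant stride inside every group and another constant stride between group leaders. Below is a self-contained sketch of that shape check, independent of the actual SLPVectorizer implementation (`matchGroups` and `GroupShape` are made-up names for illustration):

#include <cstdint>
#include <optional>
#include <vector>

struct GroupShape {
  unsigned GroupSize;          // w: loads per group
  int64_t StrideWithinGroup;   // byte distance between neighbors in a group
  int64_t StrideBetweenGroups; // byte distance between group leaders
};

// Check that sorted offsets-from-base form equally strided groups of
// equally spaced elements. A single group (no widening needed) is left
// to the plain constant-stride path, so it is rejected here.
std::optional<GroupShape> matchGroups(const std::vector<int64_t> &Off) {
  if (Off.size() < 2)
    return std::nullopt;
  const int64_t Within = Off[1] - Off[0];
  unsigned GroupSize = 1;
  while (GroupSize < Off.size() &&
         Off[GroupSize] - Off[GroupSize - 1] == Within)
    ++GroupSize;
  if (GroupSize == Off.size() || Off.size() % GroupSize != 0)
    return std::nullopt;
  const int64_t Between = Off[GroupSize] - Off[0];
  for (unsigned Start = GroupSize; Start < Off.size(); Start += GroupSize) {
    if (Off[Start] - Off[Start - GroupSize] != Between)
      return std::nullopt;
    for (unsigned I = Start + 1; I < Start + GroupSize; ++I)
      if (Off[I] - Off[I - 1] != Within)
        return std::nullopt;
  }
  return GroupShape{GroupSize, Within, Between};
}
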
>From a323be91f99f1d3e83aa3203078713fcd25e503a Mon Sep 17 00:00:00 2001
From: Mikhail Gudim <mgudim at ventanamicro.com>
Date: Fri, 10 Oct 2025 16:13:10 -0700
Subject: [PATCH 09/11] more comments, simplifications, more `const`

---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 41 ++++++++++++-------
 1 file changed, 27 insertions(+), 14 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 24b52b8fd313c..a95c3234db36f 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -6923,8 +6923,8 @@ bool BoUpSLP::analyzeConstantStrideCandidate(
     const SmallVectorImpl<unsigned> &SortedIndices, const int64_t Diff, Value *Ptr0,
     Value *PtrN, StridedPtrInfo &SPtrInfo) const {
   const unsigned Sz = PointerOps.size();
-  SmallVector<int64_t> SortedOffsetsFromBase;
-  SortedOffsetsFromBase.resize(Sz);
+  SmallVector<int64_t> SortedOffsetsFromBase(Sz);
+  // Go through `PointerOps` in sorted order and record offsets from `Ptr0`.
   for (unsigned I : seq<unsigned>(Sz)) {
     Value *Ptr =
         SortedIndices.empty() ? PointerOps[I] : PointerOps[SortedIndices[I]];
@@ -6933,10 +6933,24 @@ bool BoUpSLP::analyzeConstantStrideCandidate(
   }
   assert(SortedOffsetsFromBase.size() > 1 &&
          "Trying to generate strided load for less than 2 loads");
-  //
-  // Find where the first group ends.
+  // The code below checks that `SortedOffsetsFromBase` looks as follows:
+  // ```
+  // [
+  //   (e_{0, 0}, e_{0, 1}, ..., e_{0, GroupSize - 1}), // first group
+  //   (e_{1, 0}, e_{1, 1}, ..., e_{1, GroupSize - 1}), // second group
+  //   ...
+  //   (e_{NumGroups - 1, 0}, e_{NumGroups - 1, 1}, ..., e_{NumGroups - 1,
+  //   GroupSize - 1}), // last group
+  // ]
+  // ```
+  // The distances between consecutive elements within each group should all
+  // be the same `StrideWithinGroup`. The distances between the first elements
+  // of consecutive groups should all be the same `StrideBetweenGroups`.
+
   int64_t StrideWithinGroup =
       SortedOffsetsFromBase[1] - SortedOffsetsFromBase[0];
+  // Determine size of the first group. Later we will check that all other
+  // groups have the same size.
   unsigned GroupSize = 1;
   for (; GroupSize != SortedOffsetsFromBase.size(); ++GroupSize) {
     if (SortedOffsetsFromBase[GroupSize] -
@@ -6949,6 +6963,8 @@ bool BoUpSLP::analyzeConstantStrideCandidate(
   int64_t StrideIntVal = StrideWithinGroup;
   FixedVectorType *StridedLoadTy = getWidenedType(ScalarTy, VecSz);
 
+  // Quick detour: at this point we can say what the type of strided load would
+  // be if all the checks pass. Check if this type is legal for the target.
   bool NeedsWidening = Sz != GroupSize;
   if (NeedsWidening) {
     if (Sz % GroupSize != 0)
@@ -6968,24 +6984,21 @@ bool BoUpSLP::analyzeConstantStrideCandidate(
     return false;
 
   if (NeedsWidening) {
-    unsigned PrevGroupStartIdx = 0;
+    // Continue with checking the "shape" of `SortedOffsetsFromBase`.
+    // Check that the strides between groups are all the same.
     unsigned CurrentGroupStartIdx = GroupSize;
     int64_t StrideBetweenGroups =
         SortedOffsetsFromBase[GroupSize] - SortedOffsetsFromBase[0];
     StrideIntVal = StrideBetweenGroups;
-    while (CurrentGroupStartIdx != Sz) {
+    for (; CurrentGroupStartIdx < Sz; CurrentGroupStartIdx += GroupSize) {
       if (SortedOffsetsFromBase[CurrentGroupStartIdx] -
-              SortedOffsetsFromBase[PrevGroupStartIdx] !=
+              SortedOffsetsFromBase[CurrentGroupStartIdx - GroupSize] !=
           StrideBetweenGroups)
-        break;
-      PrevGroupStartIdx = CurrentGroupStartIdx;
-      CurrentGroupStartIdx += GroupSize;
+        return false;
     }
-    if (CurrentGroupStartIdx != Sz)
-      return false;
 
-    auto CheckGroup = [&](unsigned StartIdx, unsigned GroupSize0,
-                          int64_t StrideWithinGroup) -> bool {
+    auto CheckGroup = [&](const unsigned StartIdx, const unsigned GroupSize0,
+                          const int64_t StrideWithinGroup) -> bool {
       unsigned GroupEndIdx = StartIdx + 1;
       for (; GroupEndIdx != Sz; ++GroupEndIdx) {
         if (SortedOffsetsFromBase[GroupEndIdx] -

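Worked example for the checks tightened in this patch, reusing the illustrative `matchGroups` sketch inserted after PATCH 08: eight 4-byte loads at byte offsets {0, 4, 100, 104, 200, 204, 300, 304} form four groups of two, so the analysis reports GroupSize = 2, StrideWithinGroup = 4 and StrideBetweenGroups = 100, and `StrideIntVal` becomes the 100-byte stride between groups:

#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  const std::vector<int64_t> Off = {0, 4, 100, 104, 200, 204, 300, 304};
  const auto Shape = matchGroups(Off); // sketch defined after PATCH 08
  assert(Shape && Shape->GroupSize == 2);
  assert(Shape->StrideWithinGroup == 4);
  assert(Shape->StrideBetweenGroups == 100);
  // Eight i32 loads widen to a strided load of four 8-byte elements with a
  // 100-byte stride, i.e. StrideIntVal = StrideBetweenGroups in the patch.
  return 0;
}
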
>From eab17486450be0018f214afa50bd892089faef1d Mon Sep 17 00:00:00 2001
From: Mikhail Gudim <mgudim at ventanamicro.com>
Date: Tue, 14 Oct 2025 11:03:00 -0700
Subject: [PATCH 10/11] format

---
 llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index a95c3234db36f..0afb965074144 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2248,10 +2248,9 @@ class BoUpSLP {
   /// Return true if an array of scalar loads can be replaced with a strided
   ///  load (with constant stride).
   ///
-  ///  It is possible that the load gets "widened". Suppose that originally each load loads `k` bytes and `PointerOps` can be arranged as follows (`%s` is constant):
-  ///  %b + 0 * %s + 0
-  ///  %b + 0 * %s + 1
-  ///  %b + 0 * %s + 2
+  ///  It is possible that the load gets "widened". Suppose that originally each
+  ///  load loads `k` bytes and `PointerOps` can be arranged as follows (`%s` is
+  ///  constant): %b + 0 * %s + 0 %b + 0 * %s + 1 %b + 0 * %s + 2
   ///  ...
   ///  %b + 0 * %s + (w - 1)
   ///
@@ -2273,8 +2272,10 @@ class BoUpSLP {
   /// \param PointerOps list of pointer arguments of loads.
   /// \param ElemTy original scalar type of loads.
   /// \param Alignment alignment of the first load.
-  /// \param SortedIndices is the order of PointerOps as returned by `sortPtrAccesses`
-  /// \param Diff Pointer difference between the lowest and the highest pointer in `PointerOps` as returned by `getPointersDiff`.
+  /// \param SortedIndices is the order of PointerOps as returned by
+  /// `sortPtrAccesses`
+  /// \param Diff Pointer difference between the lowest and the highest pointer
+  /// in `PointerOps` as returned by `getPointersDiff`.
   /// \param Ptr0 first pointer in `PointerOps`.
   /// \param PtrN last pointer in `PointerOps`.
   /// \param SPtrInfo If the function returns `true`, it also sets all the fields

>From fff92b641ddba5239c4bfab39c9c4a68be97a851 Mon Sep 17 00:00:00 2001
From: Mikhail Gudim <mgudim at ventanamicro.com>
Date: Fri, 17 Oct 2025 10:56:34 -0700
Subject: [PATCH 11/11] format

---
 llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 0afb965074144..8e96096f2a655 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -6921,8 +6921,8 @@ bool BoUpSLP::isStridedLoad(ArrayRef<Value *> PointerOps, Type *ScalarTy,
 
 bool BoUpSLP::analyzeConstantStrideCandidate(
     ArrayRef<Value *> PointerOps, Type *ElemTy, Align Alignment,
-    const SmallVectorImpl<unsigned> &SortedIndices, const int64_t Diff, Value *Ptr0,
-    Value *PtrN, StridedPtrInfo &SPtrInfo) const {
+    const SmallVectorImpl<unsigned> &SortedIndices, const int64_t Diff,
+    Value *Ptr0, Value *PtrN, StridedPtrInfo &SPtrInfo) const {
   const unsigned Sz = PointerOps.size();
   SmallVector<int64_t> SortedOffsetsFromBase(Sz);
   // Go through `PointerOps` in sorted order and record offsets from `Ptr0`.


