[llvm] [SLPVectorizer] Widen strided loads. (PR #153074)

Fri Aug 15 06:59:43 PDT 2025

https://github.com/mgudim updated https://github.com/llvm/llvm-project/pull/153074

>From 47fdb36e6bc5c722c5b4f14c05825c3d504dc628 Mon Sep 17 00:00:00 2001
From: Mikhail Gudim <mgudim at ventanamicro.com>
Date: Fri, 8 Aug 2025 07:52:29 -0700
Subject: [PATCH 01/12] [SLPVectorizer] Widen strided loads.

Currently SLPVectorizer can generate strided loads only for this
pattern:

```
load %base + 0 * %stride
load %base + 1 * %stride
...
load %base + n * %stride
```
In this PR we extend it to this pattern:

```
; load w consecutive elements starting at %base
load %base + 0 * %stride + 0
load %base + 0 * %stride + 1
load %base + 0 * %stride + 2
...
load %base + 0 * %stride + (w - 1)

; load w consecutive elements starting at %base + 1 * %stride
load %base + 1 * %stride + 0
load %base + 1 * %stride + 1
load %base + 1 * %stride + 2
...
load %base + 1 * %stride + (w - 1)
...
; load w consecutive elements starting at %base + n * %stride
load %base + n * %stride + 0
load %base + n * %stride + 1
load %base + n * %stride + 2
...
load %base + n * %stride + (w - 1)
```
This works for both run-time and constant strides.
---
 .../Target/RISCV/RISCVTargetTransformInfo.cpp |   9 +
 .../Target/RISCV/RISCVTargetTransformInfo.h   |   2 +-
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 538 +++++++++++++-----
 .../SLPVectorizer/RISCV/x264-satd-8x4.ll      | 483 ++++++++++++++++
 4 files changed, 873 insertions(+), 159 deletions(-)
 create mode 100644 llvm/test/Transforms/SLPVectorizer/RISCV/x264-satd-8x4.ll

diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index af78b3cc2c7ff..4d43cb7ec0300 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -37,6 +37,11 @@ static cl::opt<unsigned> SLPMaxVF(
         "exclusively by SLP vectorizer."),
     cl::Hidden);
 
+static cl::opt<bool> SLPPreferAltOpcVectorization(
+    "riscv-v-slp-prefer-alt-opc-vectorization",
+    cl::desc("Controls preferAlternateOpcodeVectorization"), cl::init(false),
+    cl::Hidden);
+
 static cl::opt<unsigned>
     RVVMinTripCount("riscv-v-min-trip-count",
                     cl::desc("Set the lower bound of a trip count to decide on "
@@ -3018,3 +3023,7 @@ RISCVTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
   }
   return Options;
 }
+
+bool RISCVTTIImpl::preferAlternateOpcodeVectorization() const {
+  return SLPPreferAltOpcVectorization;
+}
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index 6a1f4b3e3bedf..254908f97186c 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -132,7 +132,7 @@ class RISCVTTIImpl final : public BasicTTIImplBase<RISCVTTIImpl> {
 
   unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const override;
 
-  bool preferAlternateOpcodeVectorization() const override { return false; }
+  bool preferAlternateOpcodeVectorization() const override;
 
   bool preferEpilogueVectorization() const override {
     // Epilogue vectorization is usually unprofitable - tail folding or
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index a1123063d878b..fbdb77a813866 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1918,6 +1918,21 @@ class BoUpSLP {
   class ShuffleCostEstimator;
   class ShuffleInstructionBuilder;
 
+  // If we decide to generate strided load / store, this struct contains all the
+  // necessary info. Its fields are calculated by analyzeRtStrideCandidate and
+  // analyzeConstantStrideCandidate. Note that Stride can be given either as a
+  // SCEV or as a Value if it already exists.
+  // To get the stride in bytes, StrideVal (or value obtained from StrideSCEV)
+  // has to be multiplied by the size of an element of the FixedVectorType.
+  struct StridedPtrInfo {
+    Value *StrideVal = nullptr;
+    const SCEV *StrideSCEV = nullptr;
+    // Represents the amount which needs to be added to the base pointer of
+    // strided load.
+    FixedVectorType *Ty = nullptr;
+  };
+  DenseMap<TreeEntry *, StridedPtrInfo> TreeEntryToStridedPtrInfoMap;
+
 public:
   /// Tracks the state we can represent the loads in the given sequence.
   enum class LoadsState {
@@ -2078,6 +2093,7 @@ class BoUpSLP {
     UserIgnoreList = nullptr;
     PostponedGathers.clear();
     ValueToGatherNodes.clear();
+    TreeEntryToStridedPtrInfoMap.clear();
   }
 
   unsigned getTreeSize() const { return VectorizableTree.size(); }
@@ -2214,6 +2230,17 @@ class BoUpSLP {
   ///       may not be necessary.
   bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const;
 
+  bool analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps, Type *ElemTy,
+                                Align CommonAlignment,
+                                SmallVectorImpl<unsigned> &SortedIndices,
+                                StridedPtrInfo *SPtrInfo) const;
+
+  bool analyzeConstantStrideCandidate(ArrayRef<Value *> PointerOps,
+                                      Type *ElemTy, Align CommonAlignment,
+                                      SmallVectorImpl<unsigned> &SortedIndices,
+                                      StridedPtrInfo *SPtrInfo, int64_t Diff,
+                                      Value *Ptr0, Value *PtrN) const;
+
   /// Checks if the given array of loads can be represented as a vectorized,
   /// scatter or just simple gather.
   /// \param VL list of loads.
@@ -2227,6 +2254,7 @@ class BoUpSLP {
   LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
                                SmallVectorImpl<unsigned> &Order,
                                SmallVectorImpl<Value *> &PointerOps,
+                               StridedPtrInfo *SPtrInfo = nullptr,
                                unsigned *BestVF = nullptr,
                                bool TryRecursiveCheck = true) const;
 
@@ -4471,11 +4499,10 @@ class BoUpSLP {
 
   /// Checks if the specified list of the instructions/values can be vectorized
   /// and fills required data before actual scheduling of the instructions.
-  TreeEntry::EntryState
-  getScalarsVectorizationState(const InstructionsState &S, ArrayRef<Value *> VL,
-                               bool IsScatterVectorizeUserTE,
-                               OrdersType &CurrentOrder,
-                               SmallVectorImpl<Value *> &PointerOps);
+  TreeEntry::EntryState getScalarsVectorizationState(
+      const InstructionsState &S, ArrayRef<Value *> VL,
+      bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
+      SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo *SPtrInfo);
 
   /// Maps a specific scalar to its tree entry(ies).
   SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarToTreeEntries;
@@ -6319,18 +6346,12 @@ static bool isReverseOrder(ArrayRef<unsigned> Order) {
   });
 }
 
-/// Checks if the provided list of pointers \p Pointers represents the strided
-/// pointers for type ElemTy. If they are not, std::nullopt is returned.
-/// Otherwise, if \p Inst is not specified, just initialized optional value is
-/// returned to show that the pointers represent strided pointers. If \p Inst
-/// specified, the runtime stride is materialized before the given \p Inst.
-/// \returns std::nullopt if the pointers are not pointers with the runtime
-/// stride, nullptr or actual stride value, otherwise.
-static std::optional<Value *>
-calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
-                  const DataLayout &DL, ScalarEvolution &SE,
-                  SmallVectorImpl<unsigned> &SortedIndices,
-                  Instruction *Inst = nullptr) {
+/// Returns a SCEV expression for the stride if PointerOps is a set of strided
+/// pointers, or nullptr otherwise.
+static const SCEV *calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
+                                     const DataLayout &DL, ScalarEvolution &SE,
+                                     SmallVectorImpl<unsigned> &SortedIndices,
+                                     SmallVectorImpl<int64_t> &Coeffs) {
   SmallVector<const SCEV *> SCEVs;
   const SCEV *PtrSCEVLowest = nullptr;
   const SCEV *PtrSCEVHighest = nullptr;
@@ -6339,7 +6360,7 @@ calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
   for (Value *Ptr : PointerOps) {
     const SCEV *PtrSCEV = SE.getSCEV(Ptr);
     if (!PtrSCEV)
-      return std::nullopt;
+      return nullptr;
     SCEVs.push_back(PtrSCEV);
     if (!PtrSCEVLowest && !PtrSCEVHighest) {
       PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
@@ -6347,14 +6368,14 @@ calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
     }
     const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
     if (isa<SCEVCouldNotCompute>(Diff))
-      return std::nullopt;
+      return nullptr;
     if (Diff->isNonConstantNegative()) {
       PtrSCEVLowest = PtrSCEV;
       continue;
     }
     const SCEV *Diff1 = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEV);
     if (isa<SCEVCouldNotCompute>(Diff1))
-      return std::nullopt;
+      return nullptr;
     if (Diff1->isNonConstantNegative()) {
       PtrSCEVHighest = PtrSCEV;
       continue;
@@ -6363,7 +6384,7 @@ calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
   // Dist = PtrSCEVHighest - PtrSCEVLowest;
   const SCEV *Dist = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEVLowest);
   if (isa<SCEVCouldNotCompute>(Dist))
-    return std::nullopt;
+    return nullptr;
   int Size = DL.getTypeStoreSize(ElemTy);
   auto TryGetStride = [&](const SCEV *Dist,
                           const SCEV *Multiplier) -> const SCEV * {
@@ -6384,10 +6405,10 @@ calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
     const SCEV *Sz = SE.getConstant(Dist->getType(), Size * (SCEVs.size() - 1));
     Stride = TryGetStride(Dist, Sz);
     if (!Stride)
-      return std::nullopt;
+      return nullptr;
   }
   if (!Stride || isa<SCEVConstant>(Stride))
-    return std::nullopt;
+    return nullptr;
   // Iterate through all pointers and check if all distances are
   // unique multiple of Stride.
   using DistOrdPair = std::pair<int64_t, int>;
@@ -6401,42 +6422,184 @@ calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
       const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
       const SCEV *Coeff = TryGetStride(Diff, Stride);
       if (!Coeff)
-        return std::nullopt;
+        return nullptr;
       const auto *SC = dyn_cast<SCEVConstant>(Coeff);
       if (!SC || isa<SCEVCouldNotCompute>(SC))
-        return std::nullopt;
+        return nullptr;
+      Coeffs.push_back((int64_t)SC->getAPInt().getLimitedValue());
       if (!SE.getMinusSCEV(PtrSCEV, SE.getAddExpr(PtrSCEVLowest,
                                                   SE.getMulExpr(Stride, SC)))
                ->isZero())
-        return std::nullopt;
+        return nullptr;
       Dist = SC->getAPInt().getZExtValue();
-    }
+    } else
+      Coeffs.push_back(0);
     // If the strides are not the same or repeated, we can't vectorize.
     if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size())
-      return std::nullopt;
+      return nullptr;
     auto Res = Offsets.emplace(Dist, Cnt);
     if (!Res.second)
-      return std::nullopt;
+      return nullptr;
     // Consecutive order if the inserted element is the last one.
     IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
     ++Cnt;
   }
   if (Offsets.size() != SCEVs.size())
-    return std::nullopt;
+    return nullptr;
   SortedIndices.clear();
-  if (!IsConsecutive) {
-    // Fill SortedIndices array only if it is non-consecutive.
-    SortedIndices.resize(PointerOps.size());
-    Cnt = 0;
-    for (const std::pair<int64_t, int> &Pair : Offsets) {
-      SortedIndices[Cnt] = Pair.second;
-      ++Cnt;
+  SortedIndices.resize(PointerOps.size());
+  Cnt = 0;
+  for (const std::pair<int64_t, int> &Pair : Offsets) {
+    SortedIndices[Cnt] = Pair.second;
+    ++Cnt;
+  }
+  return Stride;
+}
+
+// Suppose we are given pointers of the form: %b + x * %s + y * %c
+// where %c is constant. Check if the pointers can be rearranged as follows:
+//  %b + 0 * %s + 0
+//  %b + 0 * %s + 1
+//  %b + 0 * %s + 2
+//  ...
+//  %b + 0 * %s + (w - 1)
+//
+//  %b + 1 * %s + 0
+//  %b + 1 * %s + 1
+//  %b + 1 * %s + 2
+//  ...
+//  %b + 1 * %s + (w - 1)
+//  ...
+//
+//  If the pointers can be rearranged in the above pattern, it means that the
+//  memory can be accessed with strided loads of width `w` and stride `%s`.
+bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
+                                       Type *ElemTy, Align CommonAlignment,
+                                       SmallVectorImpl<unsigned> &SortedIndices,
+                                       StridedPtrInfo *SPtrInfo) const {
+  // Group the pointers by constant offset.
+  DenseMap<int64_t, std::pair<SmallVector<Value *>, SmallVector<unsigned>>>
+      OffsetToPointerOpIdxMap;
+  for (auto [Idx, Ptr] : enumerate(PointerOps)) {
+    const SCEV *PtrSCEV = SE->getSCEV(Ptr);
+    if (!PtrSCEV)
+      return false;
+
+    const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(PtrSCEV);
+    int64_t Offset = 0;
+    if (Add) {
+      for (int i = 0; i < (int)Add->getNumOperands(); ++i) {
+        const SCEVConstant *SC = dyn_cast<SCEVConstant>(Add->getOperand(i));
+        if (!SC)
+          continue;
+        Offset = (int64_t)(SC->getAPInt().getLimitedValue());
+        break;
+      }
     }
+    OffsetToPointerOpIdxMap[Offset].first.push_back(Ptr);
+    OffsetToPointerOpIdxMap[Offset].second.push_back(Idx);
   }
-  if (!Inst)
-    return nullptr;
-  SCEVExpander Expander(SE, DL, "strided-load-vec");
-  return Expander.expandCodeFor(Stride, Stride->getType(), Inst);
+  int NumOffsets = OffsetToPointerOpIdxMap.size();
+
+  const unsigned Sz = PointerOps.size();
+  unsigned VecSz = Sz;
+  Type *ScalarTy = ElemTy;
+  if (NumOffsets > 1) {
+    if (Sz % NumOffsets != 0)
+      return false;
+    VecSz = Sz / NumOffsets;
+    ScalarTy = Type::getIntNTy(SE->getContext(),
+                               DL->getTypeSizeInBits(ElemTy).getFixedValue() *
+                                   NumOffsets);
+  }
+  FixedVectorType *StridedLoadTy = getWidenedType(ScalarTy, VecSz);
+  if (!TTI->isTypeLegal(StridedLoadTy) ||
+      !TTI->isLegalStridedLoadStore(StridedLoadTy, CommonAlignment))
+    return false;
+
+  SmallVector<int> SortedOffsetsV;
+  for (auto [K, V] : OffsetToPointerOpIdxMap) {
+    SortedOffsetsV.push_back(K);
+  }
+  llvm::sort(SortedOffsetsV);
+  if (NumOffsets > 1) {
+    int CommonDiff = SortedOffsetsV[1] - SortedOffsetsV[0];
+    if ((CommonDiff) != 1)
+      return false;
+    for (int i = 1; i < (int)SortedOffsetsV.size() - 1; ++i) {
+      if (SortedOffsetsV[i + 1] - SortedOffsetsV[i] != CommonDiff)
+        return false;
+    }
+  }
+
+  int64_t LowestOffset = SortedOffsetsV[0];
+  SmallVector<Value *> &PointerOps0 =
+      OffsetToPointerOpIdxMap[LowestOffset].first;
+  SmallVector<unsigned> &IndicesInAllPointerOps0 =
+      OffsetToPointerOpIdxMap[LowestOffset].second;
+
+  SmallVector<int64_t> Coeffs0;
+  SmallVector<unsigned> SortedIndicesForOffset0;
+  const SCEV *Stride0 = calculateRtStride(PointerOps0, ElemTy, *DL, *SE,
+                                          SortedIndicesForOffset0, Coeffs0);
+  if (!Stride0)
+    return false;
+  unsigned NumCoeffs0 = Coeffs0.size();
+  if (NumCoeffs0 * NumOffsets != Sz)
+    return false;
+  llvm::sort(Coeffs0);
+
+  SmallVector<unsigned> SortedIndicesDraft;
+  SortedIndicesDraft.resize(Sz);
+  auto updateSortedIndices =
+      [&](SmallVectorImpl<unsigned> &SortedIndicesForOffset,
+          SmallVectorImpl<unsigned> &IndicesInAllPointerOps,
+          int64_t OffsetNum) {
+        unsigned Num = 0;
+        for (unsigned Idx : SortedIndicesForOffset) {
+          SortedIndicesDraft[Num * NumOffsets + OffsetNum] =
+              IndicesInAllPointerOps[Idx];
+          ++Num;
+        }
+      };
+
+  updateSortedIndices(SortedIndicesForOffset0, IndicesInAllPointerOps0, 0);
+
+  SmallVector<int64_t> Coeffs;
+  SmallVector<unsigned> SortedIndicesForOffset;
+  for (int i = 1; i < NumOffsets; ++i) {
+    Coeffs.clear();
+    SortedIndicesForOffset.clear();
+
+    int64_t Offset = SortedOffsetsV[i];
+    SmallVector<Value *> &PointerOpsForOffset =
+        OffsetToPointerOpIdxMap[Offset].first;
+    SmallVector<unsigned> &IndicesInAllPointerOps =
+        OffsetToPointerOpIdxMap[Offset].second;
+    const SCEV *StrideWithinGroup = calculateRtStride(
+        PointerOpsForOffset, ElemTy, *DL, *SE, SortedIndicesForOffset, Coeffs);
+
+    if ((!StrideWithinGroup) || StrideWithinGroup != Stride0) {
+      return false;
+    }
+    if (Coeffs.size() != NumCoeffs0)
+      return false;
+    llvm::sort(Coeffs);
+    for (unsigned i = 0; i < NumCoeffs0; ++i) {
+      if (Coeffs[i] != Coeffs0[i])
+        return false;
+    }
+
+    updateSortedIndices(SortedIndicesForOffset, IndicesInAllPointerOps, i);
+  }
+
+  SortedIndices.clear();
+  SortedIndices = SortedIndicesDraft;
+  if (SPtrInfo) {
+    SPtrInfo->StrideSCEV = Stride0;
+    SPtrInfo->Ty = StridedLoadTy;
+  }
+  return true;
 }
 
 static std::pair<InstructionCost, InstructionCost>
@@ -6766,77 +6929,133 @@ isMaskedLoadCompress(ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
                               CompressMask, LoadVecTy);
 }
 
-/// Checks if strided loads can be generated out of \p VL loads with pointers \p
-/// PointerOps:
-/// 1. Target with strided load support is detected.
-/// 2. The number of loads is greater than MinProfitableStridedLoads, or the
-/// potential stride <= MaxProfitableLoadStride and the potential stride is
-/// power-of-2 (to avoid perf regressions for the very small number of loads)
-/// and max distance > number of loads, or potential stride is -1.
-/// 3. The loads are ordered, or number of unordered loads <=
-/// MaxProfitableUnorderedLoads, or loads are in reversed order. (this check is
-/// to avoid extra costs for very expensive shuffles).
-/// 4. Any pointer operand is an instruction with the users outside of the
-/// current graph (for masked gathers extra extractelement instructions
-/// might be required).
-static bool isStridedLoad(ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
-                          ArrayRef<unsigned> Order,
-                          const TargetTransformInfo &TTI, const DataLayout &DL,
-                          ScalarEvolution &SE,
-                          const bool IsAnyPointerUsedOutGraph,
-                          const int64_t Diff) {
-  const size_t Sz = VL.size();
-  const uint64_t AbsoluteDiff = std::abs(Diff);
-  Type *ScalarTy = VL.front()->getType();
-  auto *VecTy = getWidenedType(ScalarTy, Sz);
-  if (IsAnyPointerUsedOutGraph ||
-      (AbsoluteDiff > Sz &&
-       (Sz > MinProfitableStridedLoads ||
-        (AbsoluteDiff <= MaxProfitableLoadStride * Sz &&
-         AbsoluteDiff % Sz == 0 && has_single_bit(AbsoluteDiff / Sz)))) ||
-      Diff == -(static_cast<int64_t>(Sz) - 1)) {
-    int64_t Stride = Diff / static_cast<int64_t>(Sz - 1);
-    if (Diff != Stride * static_cast<int64_t>(Sz - 1))
+// Same as analyzeRtStrideCandidate, but for constant strides.
+bool BoUpSLP::analyzeConstantStrideCandidate(
+    ArrayRef<Value *> PointerOps, Type *ElemTy, Align CommonAlignment,
+    SmallVectorImpl<unsigned> &SortedIndices, StridedPtrInfo *SPtrInfo,
+    int64_t Diff, Value *Ptr0, Value *PtrN) const {
+  const unsigned Sz = PointerOps.size();
+  SmallVector<int64_t> SortedOffsetsFromBase;
+  SortedOffsetsFromBase.resize(Sz);
+  for (unsigned i = 0; i < Sz; ++i) {
+    Value *Ptr =
+        SortedIndices.empty() ? PointerOps[i] : PointerOps[SortedIndices[i]];
+    SortedOffsetsFromBase[i] =
+        *getPointersDiff(ElemTy, Ptr0, ElemTy, Ptr, *DL, *SE);
+  }
+
+  // Find where the first group ends.
+  assert(SortedOffsetsFromBase.size() > 1);
+  int64_t StrideWithinGroup =
+      SortedOffsetsFromBase[1] - SortedOffsetsFromBase[0];
+  unsigned GroupSize = 1;
+  for (; GroupSize != SortedOffsetsFromBase.size(); ++GroupSize) {
+    if (SortedOffsetsFromBase[GroupSize] -
+            SortedOffsetsFromBase[GroupSize - 1] !=
+        StrideWithinGroup)
+      break;
+  }
+  unsigned VecSz = Sz;
+  Type *ScalarTy = ElemTy;
+  int64_t StrideIntVal = StrideWithinGroup;
+  FixedVectorType *StridedLoadTy = getWidenedType(ScalarTy, VecSz);
+
+  if (Sz != GroupSize) {
+    if (Sz % GroupSize != 0)
       return false;
-    Align Alignment =
-        cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()])
-            ->getAlign();
-    if (!TTI.isLegalStridedLoadStore(VecTy, Alignment))
+    VecSz = Sz / GroupSize;
+
+    if (StrideWithinGroup != 1)
       return false;
-    Value *Ptr0;
-    Value *PtrN;
-    if (Order.empty()) {
-      Ptr0 = PointerOps.front();
-      PtrN = PointerOps.back();
-    } else {
-      Ptr0 = PointerOps[Order.front()];
-      PtrN = PointerOps[Order.back()];
-    }
-    // Iterate through all pointers and check if all distances are
-    // unique multiple of Dist.
-    SmallSet<int64_t, 4> Dists;
-    for (Value *Ptr : PointerOps) {
-      int64_t Dist = 0;
-      if (Ptr == PtrN)
-        Dist = Diff;
-      else if (Ptr != Ptr0)
-        Dist = *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, DL, SE);
-      // If the strides are not the same or repeated, we can't
-      // vectorize.
-      if (((Dist / Stride) * Stride) != Dist || !Dists.insert(Dist).second)
+    unsigned VecSz = Sz / GroupSize;
+    ScalarTy = Type::getIntNTy(SE->getContext(),
+                               DL->getTypeSizeInBits(ElemTy).getFixedValue() *
+                                   GroupSize);
+    StridedLoadTy = getWidenedType(ScalarTy, VecSz);
+    if (!TTI->isTypeLegal(StridedLoadTy) ||
+        !TTI->isLegalStridedLoadStore(StridedLoadTy, CommonAlignment))
+      return false;
+
+    unsigned PrevGroupStartIdx = 0;
+    unsigned CurrentGroupStartIdx = GroupSize;
+    int64_t StrideBetweenGroups =
+        SortedOffsetsFromBase[GroupSize] - SortedOffsetsFromBase[0];
+    StrideIntVal = StrideBetweenGroups;
+    while (CurrentGroupStartIdx != Sz) {
+      if (SortedOffsetsFromBase[CurrentGroupStartIdx] -
+              SortedOffsetsFromBase[PrevGroupStartIdx] !=
+          StrideBetweenGroups)
         break;
+      PrevGroupStartIdx = CurrentGroupStartIdx;
+      CurrentGroupStartIdx += GroupSize;
+    }
+    if (CurrentGroupStartIdx != Sz)
+      return false;
+
+    auto checkGroup = [&](unsigned StartIdx, unsigned GroupSize0,
+                          int64_t StrideWithinGroup) -> bool {
+      unsigned GroupEndIdx = StartIdx + 1;
+      for (; GroupEndIdx != Sz; ++GroupEndIdx) {
+        if (SortedOffsetsFromBase[GroupEndIdx] -
+                SortedOffsetsFromBase[GroupEndIdx - 1] !=
+            StrideWithinGroup)
+          break;
+      }
+      return (GroupEndIdx - StartIdx == GroupSize0);
+    };
+    for (unsigned i = 0; i < Sz; i += GroupSize) {
+      if (!checkGroup(i, GroupSize, StrideWithinGroup))
+        return false;
     }
-    if (Dists.size() == Sz)
-      return true;
+  }
+
+  // Try to generate strided load node if:
+  // 1. Target with strided load support is detected.
+  // 2. The number of loads is greater than MinProfitableStridedLoads,
+  // or the potential stride <= MaxProfitableLoadStride and the
+  // potential stride is power-of-2 (to avoid perf regressions for the very
+  // small number of loads) and max distance > number of loads, or potential
+  // stride is -1.
+  // 3. The loads are ordered, or number of unordered loads <=
+  // MaxProfitableUnorderedLoads, or loads are in reversed order.
+  // (this check is to avoid extra costs for very expensive shuffles).
+  // 4. Any pointer operand is an instruction with the users outside of the
+  // current graph (for masked gathers extra extractelement instructions
+  // might be required).
+
+  if (!TTI->isTypeLegal(StridedLoadTy) ||
+      !TTI->isLegalStridedLoadStore(StridedLoadTy, CommonAlignment))
+    return false;
+
+  // Simple check if not a strided access - clear order.
+  bool IsPossibleStrided = Diff % (VecSz - 1) == 0;
+  auto IsAnyPointerUsedOutGraph =
+      IsPossibleStrided && any_of(PointerOps, [&](Value *V) {
+        return isa<Instruction>(V) && any_of(V->users(), [&](User *U) {
+                 return !isVectorized(U) && !MustGather.contains(U);
+               });
+      });
+  const unsigned AbsoluteDiff = std::abs(Diff);
+  if (IsAnyPointerUsedOutGraph ||
+      ((VecSz > MinProfitableStridedLoads ||
+        (AbsoluteDiff <= MaxProfitableLoadStride * VecSz &&
+         has_single_bit(AbsoluteDiff))) &&
+       AbsoluteDiff > VecSz) ||
+      Diff == -(static_cast<int>(VecSz) - 1)) {
+    if (SPtrInfo) {
+      Type *StrideTy = DL->getIndexType(Ptr0->getType());
+      SPtrInfo->StrideVal = ConstantInt::get(StrideTy, StrideIntVal);
+      SPtrInfo->Ty = StridedLoadTy;
+    }
+    return true;
   }
   return false;
 }
 
-BoUpSLP::LoadsState
-BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
-                           SmallVectorImpl<unsigned> &Order,
-                           SmallVectorImpl<Value *> &PointerOps,
-                           unsigned *BestVF, bool TryRecursiveCheck) const {
+BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
+    ArrayRef<Value *> VL, const Value *VL0, SmallVectorImpl<unsigned> &Order,
+    SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo *SPtrInfo,
+    unsigned *BestVF, bool TryRecursiveCheck) const {
   // Check that a vectorized load would load the same memory as a scalar
   // load. For example, we don't want to vectorize loads that are smaller
   // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
@@ -6873,11 +7092,10 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
   auto *VecTy = getWidenedType(ScalarTy, Sz);
   Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
   if (!IsSorted) {
-    if (Sz > MinProfitableStridedLoads && TTI->isTypeLegal(VecTy)) {
-      if (TTI->isLegalStridedLoadStore(VecTy, CommonAlignment) &&
-          calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order))
-        return LoadsState::StridedVectorize;
-    }
+    if (Sz > MinProfitableStridedLoads &&
+        analyzeRtStrideCandidate(PointerOps, ScalarTy, CommonAlignment, Order,
+                                 SPtrInfo))
+      return LoadsState::StridedVectorize;
 
     if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
         TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
@@ -6910,17 +7128,8 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
                              }))
       return LoadsState::CompressVectorize;
     // Simple check if not a strided access - clear order.
-    bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
-    // Try to generate strided load node.
-    auto IsAnyPointerUsedOutGraph =
-        IsPossibleStrided && any_of(PointerOps, [&](Value *V) {
-          return isa<Instruction>(V) && any_of(V->users(), [&](User *U) {
-                   return !isVectorized(U) && !MustGather.contains(U);
-                 });
-        });
-    if (IsPossibleStrided &&
-        isStridedLoad(VL, PointerOps, Order, *TTI, *DL, *SE,
-                      IsAnyPointerUsedOutGraph, *Diff))
+    if (analyzeConstantStrideCandidate(PointerOps, ScalarTy, CommonAlignment,
+                                       Order, SPtrInfo, *Diff, Ptr0, PtrN))
       return LoadsState::StridedVectorize;
   }
   if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
@@ -7004,9 +7213,9 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
         ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
         SmallVector<unsigned> Order;
         SmallVector<Value *> PointerOps;
-        LoadsState LS =
-            canVectorizeLoads(Slice, Slice.front(), Order, PointerOps, BestVF,
-                              /*TryRecursiveCheck=*/false);
+        LoadsState LS = canVectorizeLoads(Slice, Slice.front(), Order,
+                                          PointerOps, SPtrInfo, BestVF,
+                                          /*TryRecursiveCheck=*/false);
         // Check that the sorted loads are consecutive.
         if (LS == LoadsState::Gather) {
           if (BestVF) {
@@ -9184,8 +9393,9 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
           // Try to build vector load.
           ArrayRef<Value *> Values(
               reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
-          LoadsState LS = canVectorizeLoads(Values, Slice.front(), CurrentOrder,
-                                            PointerOps, &BestVF);
+          LoadsState LS =
+              canVectorizeLoads(Values, Slice.front(), CurrentOrder, PointerOps,
+                                /*SPtrInfo =*/nullptr, &BestVF);
           if (LS != LoadsState::Gather ||
               (BestVF > 1 && static_cast<unsigned>(NumElts) == 2 * BestVF)) {
             if (LS == LoadsState::ScatterVectorize) {
@@ -9790,7 +10000,7 @@ getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
 BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
     const InstructionsState &S, ArrayRef<Value *> VL,
     bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
-    SmallVectorImpl<Value *> &PointerOps) {
+    SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo *SPtrInfo) {
   assert(S.getMainOp() &&
          "Expected instructions with same/alternate opcodes only.");
 
@@ -9892,7 +10102,7 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
         });
       });
     };
-    switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps)) {
+    switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps, SPtrInfo)) {
     case LoadsState::Vectorize:
       return TreeEntry::Vectorize;
     case LoadsState::CompressVectorize:
@@ -11351,8 +11561,9 @@ void BoUpSLP::buildTreeRec(ArrayRef<Value *> VLRef, unsigned Depth,
       UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
   OrdersType CurrentOrder;
   SmallVector<Value *> PointerOps;
+  StridedPtrInfo SPtrInfo;
   TreeEntry::EntryState State = getScalarsVectorizationState(
-      S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
+      S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps, &SPtrInfo);
   if (State == TreeEntry::NeedToGather) {
     newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
     return;
@@ -11512,6 +11723,7 @@ void BoUpSLP::buildTreeRec(ArrayRef<Value *> VLRef, unsigned Depth,
         // Vectorizing non-consecutive loads with `llvm.masked.gather`.
         TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
                           UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
+        TreeEntryToStridedPtrInfoMap[TE] = SPtrInfo;
         LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (strided LoadInst).\n";
                    TE->dump());
         break;
@@ -13007,10 +13219,20 @@ void BoUpSLP::transformNodes() {
         InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
             Instruction::Load, VecTy, BaseLI->getPointerOperand(),
             /*VariableMask=*/false, CommonAlignment, CostKind, BaseLI);
-        if (StridedCost < OriginalVecCost)
+        if (StridedCost < OriginalVecCost) {
           // Strided load is more profitable than consecutive load + reverse -
           // transform the node to strided load.
+          // TODO: StrideTy =
+          // DL->getIndexType(E.scalars.front()->getPointerOperand()->getType());
+          StridedPtrInfo SPtrInfo;
+          Type *StrideTy = Type::getIntNTy(
+              SE->getContext(),
+              DL->getTypeStoreSizeInBits(E.Scalars.front()->getType()));
+          SPtrInfo.StrideVal = ConstantInt::get(StrideTy, 1);
+          SPtrInfo.Ty = VecTy;
+          TreeEntryToStridedPtrInfoMap[&E] = SPtrInfo;
           E.State = TreeEntry::StridedVectorize;
+        }
       }
       break;
     }
@@ -19468,6 +19690,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
 
       LoadInst *LI = cast<LoadInst>(VL0);
       Instruction *NewLI;
+      FixedVectorType *StridedLoadTy = nullptr;
       Value *PO = LI->getPointerOperand();
       if (E->State == TreeEntry::Vectorize) {
         NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
@@ -19505,40 +19728,37 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
         Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
         Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
         PO = IsReverseOrder ? PtrN : Ptr0;
-        std::optional<int64_t> Diff = getPointersDiff(
-            VL0->getType(), Ptr0, VL0->getType(), PtrN, *DL, *SE);
         Type *StrideTy = DL->getIndexType(PO->getType());
         Value *StrideVal;
-        if (Diff) {
-          int64_t Stride =
-              *Diff / (static_cast<int64_t>(E->Scalars.size()) - 1);
-          StrideVal =
-              ConstantInt::get(StrideTy, (IsReverseOrder ? -1 : 1) * Stride *
-                                             DL->getTypeAllocSize(ScalarTy));
-        } else {
-          SmallVector<Value *> PointerOps(E->Scalars.size(), nullptr);
-          transform(E->Scalars, PointerOps.begin(), [](Value *V) {
-            return cast<LoadInst>(V)->getPointerOperand();
-          });
-          OrdersType Order;
-          std::optional<Value *> Stride =
-              calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order,
-                                &*Builder.GetInsertPoint());
-          Value *NewStride =
-              Builder.CreateIntCast(*Stride, StrideTy, /*isSigned=*/true);
-          StrideVal = Builder.CreateMul(
-              NewStride,
-              ConstantInt::get(
-                  StrideTy,
-                  (IsReverseOrder ? -1 : 1) *
-                      static_cast<int>(DL->getTypeAllocSize(ScalarTy))));
-        }
+
+        StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap[E];
+        StridedLoadTy = SPtrInfo.Ty;
+        assert(StridedLoadTy);
+        unsigned StridedLoadEC =
+            StridedLoadTy->getElementCount().getKnownMinValue();
+
+        Value *Stride = SPtrInfo.StrideVal;
+        if (!Stride) {
+          const SCEV *StrideSCEV = SPtrInfo.StrideSCEV;
+          assert(StrideSCEV);
+          SCEVExpander Expander(*SE, *DL, "strided-load-vec");
+          Stride = Expander.expandCodeFor(StrideSCEV, StrideSCEV->getType(),
+                                          &*Builder.GetInsertPoint());
+        }
+        Value *NewStride =
+            Builder.CreateIntCast(Stride, StrideTy, /*isSigned=*/true);
+        StrideVal = Builder.CreateMul(
+            NewStride, ConstantInt::get(
+                           StrideTy, (IsReverseOrder ? -1 : 1) *
+                                         static_cast<int>(
+                                             DL->getTypeAllocSize(ScalarTy))));
         Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
         auto *Inst = Builder.CreateIntrinsic(
             Intrinsic::experimental_vp_strided_load,
-            {VecTy, PO->getType(), StrideTy},
-            {PO, StrideVal, Builder.getAllOnesMask(VecTy->getElementCount()),
-             Builder.getInt32(E->Scalars.size())});
+            {StridedLoadTy, PO->getType(), StrideTy},
+            {PO, StrideVal,
+             Builder.getAllOnesMask(ElementCount::getFixed(StridedLoadEC)),
+             Builder.getInt32(StridedLoadEC)});
         Inst->addParamAttr(
             /*ArgNo=*/0,
             Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
@@ -19575,6 +19795,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
                      ? NewLI
                      : ::propagateMetadata(NewLI, E->Scalars);
 
+      if (StridedLoadTy)
+        V = Builder.CreateBitOrPointerCast(V, VecTy);
       V = FinalShuffle(V, E);
       E->VectorizedValue = V;
       ++NumVectorInstructions;
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/x264-satd-8x4.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/x264-satd-8x4.ll
new file mode 100644
index 0000000000000..38cf1214081fa
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/x264-satd-8x4.ll
@@ -0,0 +1,483 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=riscv64 -mattr=+m,+v,+unaligned-vector-mem \
+; RUN: -riscv-v-slp-prefer-alt-opc-vectorization=true \
+; RUN: -passes=slp-vectorizer -S < %s | FileCheck %s
+; Function Attrs: nounwind uwtable vscale_range(8,1024)
+define i32 @x264_pixel_satd_8x4(ptr %pix1, i32  %i_pix1, ptr  %pix2, i32  %i_pix2) {
+; CHECK-LABEL: define i32 @x264_pixel_satd_8x4(
+; CHECK-SAME: ptr [[PIX1:%.*]], i32 [[I_PIX1:%.*]], ptr [[PIX2:%.*]], i32 [[I_PIX2:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[IDX_EXT:%.*]] = sext i32 [[I_PIX1]] to i64
+; CHECK-NEXT:    [[IDX_EXT63:%.*]] = sext i32 [[I_PIX2]] to i64
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw i8, ptr [[PIX1]], i64 4
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw i8, ptr [[PIX2]], i64 4
+; CHECK-NEXT:    [[TMP0:%.*]] = mul i64 [[IDX_EXT]], 1
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(ptr align 1 [[PIX1]], i64 [[TMP0]], <4 x i1> splat (i1 true), i32 4)
+; CHECK-NEXT:    [[TMP19:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8>
+; CHECK-NEXT:    [[TMP20:%.*]] = zext <16 x i8> [[TMP19]] to <16 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[IDX_EXT63]], 1
+; CHECK-NEXT:    [[TMP5:%.*]] = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(ptr align 1 [[PIX2]], i64 [[TMP4]], <4 x i1> splat (i1 true), i32 4)
+; CHECK-NEXT:    [[TMP28:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+; CHECK-NEXT:    [[TMP29:%.*]] = zext <16 x i8> [[TMP28]] to <16 x i32>
+; CHECK-NEXT:    [[TMP30:%.*]] = sub nsw <16 x i32> [[TMP20]], [[TMP29]]
+; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 [[IDX_EXT]], 1
+; CHECK-NEXT:    [[TMP10:%.*]] = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(ptr align 1 [[ARRAYIDX3]], i64 [[TMP9]], <4 x i1> splat (i1 true), i32 4)
+; CHECK-NEXT:    [[TMP38:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
+; CHECK-NEXT:    [[TMP39:%.*]] = zext <16 x i8> [[TMP38]] to <16 x i32>
+; CHECK-NEXT:    [[TMP13:%.*]] = mul i64 [[IDX_EXT63]], 1
+; CHECK-NEXT:    [[TMP14:%.*]] = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(ptr align 1 [[ARRAYIDX5]], i64 [[TMP13]], <4 x i1> splat (i1 true), i32 4)
+; CHECK-NEXT:    [[TMP47:%.*]] = bitcast <4 x i32> [[TMP14]] to <16 x i8>
+; CHECK-NEXT:    [[TMP48:%.*]] = zext <16 x i8> [[TMP47]] to <16 x i32>
+; CHECK-NEXT:    [[TMP49:%.*]] = sub nsw <16 x i32> [[TMP39]], [[TMP48]]
+; CHECK-NEXT:    [[TMP50:%.*]] = shl nsw <16 x i32> [[TMP49]], splat (i32 16)
+; CHECK-NEXT:    [[TMP51:%.*]] = add nsw <16 x i32> [[TMP50]], [[TMP30]]
+; CHECK-NEXT:    [[TMP52:%.*]] = shufflevector <16 x i32> [[TMP51]], <16 x i32> poison, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+; CHECK-NEXT:    [[TMP53:%.*]] = add nsw <16 x i32> [[TMP52]], [[TMP51]]
+; CHECK-NEXT:    [[TMP54:%.*]] = sub nsw <16 x i32> [[TMP52]], [[TMP51]]
+; CHECK-NEXT:    [[TMP55:%.*]] = shufflevector <16 x i32> [[TMP53]], <16 x i32> [[TMP54]], <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
+; CHECK-NEXT:    [[TMP56:%.*]] = shufflevector <16 x i32> [[TMP55]], <16 x i32> poison, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5, i32 10, i32 11, i32 8, i32 9, i32 14, i32 15, i32 12, i32 13>
+; CHECK-NEXT:    [[TMP57:%.*]] = add nsw <16 x i32> [[TMP55]], [[TMP56]]
+; CHECK-NEXT:    [[TMP58:%.*]] = sub nsw <16 x i32> [[TMP55]], [[TMP56]]
+; CHECK-NEXT:    [[TMP59:%.*]] = shufflevector <16 x i32> [[TMP57]], <16 x i32> [[TMP58]], <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 20, i32 21, i32 6, i32 7, i32 24, i32 25, i32 10, i32 11, i32 28, i32 29, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP60:%.*]] = shufflevector <16 x i32> [[TMP59]], <16 x i32> poison, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT:    [[TMP61:%.*]] = sub nsw <16 x i32> [[TMP59]], [[TMP60]]
+; CHECK-NEXT:    [[TMP62:%.*]] = add nsw <16 x i32> [[TMP59]], [[TMP60]]
+; CHECK-NEXT:    [[TMP63:%.*]] = shufflevector <16 x i32> [[TMP61]], <16 x i32> [[TMP62]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 28, i32 29, i32 30, i32 31>
+; CHECK-NEXT:    [[TMP64:%.*]] = shufflevector <16 x i32> [[TMP63]], <16 x i32> poison, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP65:%.*]] = add nsw <16 x i32> [[TMP63]], [[TMP64]]
+; CHECK-NEXT:    [[TMP66:%.*]] = sub nsw <16 x i32> [[TMP63]], [[TMP64]]
+; CHECK-NEXT:    [[TMP67:%.*]] = shufflevector <16 x i32> [[TMP65]], <16 x i32> [[TMP66]], <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP68:%.*]] = lshr <16 x i32> [[TMP67]], splat (i32 15)
+; CHECK-NEXT:    [[TMP69:%.*]] = and <16 x i32> [[TMP68]], splat (i32 65537)
+; CHECK-NEXT:    [[TMP70:%.*]] = mul nuw <16 x i32> [[TMP69]], splat (i32 65535)
+; CHECK-NEXT:    [[TMP71:%.*]] = add <16 x i32> [[TMP70]], [[TMP67]]
+; CHECK-NEXT:    [[TMP72:%.*]] = xor <16 x i32> [[TMP71]], [[TMP70]]
+; CHECK-NEXT:    [[TMP73:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP72]])
+; CHECK-NEXT:    [[CONV118:%.*]] = and i32 [[TMP73]], 65535
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[TMP73]], 16
+; CHECK-NEXT:    [[ADD119:%.*]] = add nuw nsw i32 [[CONV118]], [[SHR]]
+; CHECK-NEXT:    [[SHR120:%.*]] = lshr i32 [[ADD119]], 1
+; CHECK-NEXT:    ret i32 [[SHR120]]
+;
+entry:
+  %idx.ext = sext i32 %i_pix1 to i64
+  %idx.ext63 = sext i32 %i_pix2 to i64
+  %0 = load i8, ptr %pix1, align 1
+  %conv = zext i8 %0 to i32
+  %1 = load i8, ptr %pix2, align 1
+  %conv2 = zext i8 %1 to i32
+  %sub = sub nsw i32 %conv, %conv2
+  %arrayidx3 = getelementptr inbounds nuw i8, ptr %pix1, i64 4
+  %2 = load i8, ptr %arrayidx3, align 1
+  %conv4 = zext i8 %2 to i32
+  %arrayidx5 = getelementptr inbounds nuw i8, ptr %pix2, i64 4
+  %3 = load i8, ptr %arrayidx5, align 1
+  %conv6 = zext i8 %3 to i32
+  %sub7 = sub nsw i32 %conv4, %conv6
+  %shl = shl nsw i32 %sub7, 16
+  %add = add nsw i32 %shl, %sub
+  %arrayidx8 = getelementptr inbounds nuw i8, ptr %pix1, i64 1
+  %4 = load i8, ptr %arrayidx8, align 1
+  %conv9 = zext i8 %4 to i32
+  %arrayidx10 = getelementptr inbounds nuw i8, ptr %pix2, i64 1
+  %5 = load i8, ptr %arrayidx10, align 1
+  %conv11 = zext i8 %5 to i32
+  %sub12 = sub nsw i32 %conv9, %conv11
+  %arrayidx13 = getelementptr inbounds nuw i8, ptr %pix1, i64 5
+  %6 = load i8, ptr %arrayidx13, align 1
+  %conv14 = zext i8 %6 to i32
+  %arrayidx15 = getelementptr inbounds nuw i8, ptr %pix2, i64 5
+  %7 = load i8, ptr %arrayidx15, align 1
+  %conv16 = zext i8 %7 to i32
+  %sub17 = sub nsw i32 %conv14, %conv16
+  %shl18 = shl nsw i32 %sub17, 16
+  %add19 = add nsw i32 %shl18, %sub12
+  %arrayidx20 = getelementptr inbounds nuw i8, ptr %pix1, i64 2
+  %8 = load i8, ptr %arrayidx20, align 1
+  %conv21 = zext i8 %8 to i32
+  %arrayidx22 = getelementptr inbounds nuw i8, ptr %pix2, i64 2
+  %9 = load i8, ptr %arrayidx22, align 1
+  %conv23 = zext i8 %9 to i32
+  %sub24 = sub nsw i32 %conv21, %conv23
+  %arrayidx25 = getelementptr inbounds nuw i8, ptr %pix1, i64 6
+  %10 = load i8, ptr %arrayidx25, align 1
+  %conv26 = zext i8 %10 to i32
+  %arrayidx27 = getelementptr inbounds nuw i8, ptr %pix2, i64 6
+  %11 = load i8, ptr %arrayidx27, align 1
+  %conv28 = zext i8 %11 to i32
+  %sub29 = sub nsw i32 %conv26, %conv28
+  %shl30 = shl nsw i32 %sub29, 16
+  %add31 = add nsw i32 %shl30, %sub24
+  %arrayidx32 = getelementptr inbounds nuw i8, ptr %pix1, i64 3
+  %12 = load i8, ptr %arrayidx32, align 1
+  %conv33 = zext i8 %12 to i32
+  %arrayidx34 = getelementptr inbounds nuw i8, ptr %pix2, i64 3
+  %13 = load i8, ptr %arrayidx34, align 1
+  %conv35 = zext i8 %13 to i32
+  %sub36 = sub nsw i32 %conv33, %conv35
+  %arrayidx37 = getelementptr inbounds nuw i8, ptr %pix1, i64 7
+  %14 = load i8, ptr %arrayidx37, align 1
+  %conv38 = zext i8 %14 to i32
+  %arrayidx39 = getelementptr inbounds nuw i8, ptr %pix2, i64 7
+  %15 = load i8, ptr %arrayidx39, align 1
+  %conv40 = zext i8 %15 to i32
+  %sub41 = sub nsw i32 %conv38, %conv40
+  %shl42 = shl nsw i32 %sub41, 16
+  %add43 = add nsw i32 %shl42, %sub36
+  %add44 = add nsw i32 %add19, %add
+  %sub45 = sub nsw i32 %add, %add19
+  %add46 = add nsw i32 %add43, %add31
+  %sub47 = sub nsw i32 %add31, %add43
+  %add48 = add nsw i32 %add46, %add44
+  %sub51 = sub nsw i32 %add44, %add46
+  %add55 = add nsw i32 %sub47, %sub45
+  %sub59 = sub nsw i32 %sub45, %sub47
+  %add.ptr = getelementptr inbounds i8, ptr %pix1, i64 %idx.ext
+  %add.ptr64 = getelementptr inbounds i8, ptr %pix2, i64 %idx.ext63
+  %16 = load i8, ptr %add.ptr, align 1
+  %conv.1 = zext i8 %16 to i32
+  %17 = load i8, ptr %add.ptr64, align 1
+  %conv2.1 = zext i8 %17 to i32
+  %sub.1 = sub nsw i32 %conv.1, %conv2.1
+  %arrayidx3.1 = getelementptr inbounds nuw i8, ptr %add.ptr, i64 4
+  %18 = load i8, ptr %arrayidx3.1, align 1
+  %conv4.1 = zext i8 %18 to i32
+  %arrayidx5.1 = getelementptr inbounds nuw i8, ptr %add.ptr64, i64 4
+  %19 = load i8, ptr %arrayidx5.1, align 1
+  %conv6.1 = zext i8 %19 to i32
+  %sub7.1 = sub nsw i32 %conv4.1, %conv6.1
+  %shl.1 = shl nsw i32 %sub7.1, 16
+  %add.1 = add nsw i32 %shl.1, %sub.1
+  %arrayidx8.1 = getelementptr inbounds nuw i8, ptr %add.ptr, i64 1
+  %20 = load i8, ptr %arrayidx8.1, align 1
+  %conv9.1 = zext i8 %20 to i32
+  %arrayidx10.1 = getelementptr inbounds nuw i8, ptr %add.ptr64, i64 1
+  %21 = load i8, ptr %arrayidx10.1, align 1
+  %conv11.1 = zext i8 %21 to i32
+  %sub12.1 = sub nsw i32 %conv9.1, %conv11.1
+  %arrayidx13.1 = getelementptr inbounds nuw i8, ptr %add.ptr, i64 5
+  %22 = load i8, ptr %arrayidx13.1, align 1
+  %conv14.1 = zext i8 %22 to i32
+  %arrayidx15.1 = getelementptr inbounds nuw i8, ptr %add.ptr64, i64 5
+  %23 = load i8, ptr %arrayidx15.1, align 1
+  %conv16.1 = zext i8 %23 to i32
+  %sub17.1 = sub nsw i32 %conv14.1, %conv16.1
+  %shl18.1 = shl nsw i32 %sub17.1, 16
+  %add19.1 = add nsw i32 %shl18.1, %sub12.1
+  %arrayidx20.1 = getelementptr inbounds nuw i8, ptr %add.ptr, i64 2
+  %24 = load i8, ptr %arrayidx20.1, align 1
+  %conv21.1 = zext i8 %24 to i32
+  %arrayidx22.1 = getelementptr inbounds nuw i8, ptr %add.ptr64, i64 2
+  %25 = load i8, ptr %arrayidx22.1, align 1
+  %conv23.1 = zext i8 %25 to i32
+  %sub24.1 = sub nsw i32 %conv21.1, %conv23.1
+  %arrayidx25.1 = getelementptr inbounds nuw i8, ptr %add.ptr, i64 6
+  %26 = load i8, ptr %arrayidx25.1, align 1
+  %conv26.1 = zext i8 %26 to i32
+  %arrayidx27.1 = getelementptr inbounds nuw i8, ptr %add.ptr64, i64 6
+  %27 = load i8, ptr %arrayidx27.1, align 1
+  %conv28.1 = zext i8 %27 to i32
+  %sub29.1 = sub nsw i32 %conv26.1, %conv28.1
+  %shl30.1 = shl nsw i32 %sub29.1, 16
+  %add31.1 = add nsw i32 %shl30.1, %sub24.1
+  %arrayidx32.1 = getelementptr inbounds nuw i8, ptr %add.ptr, i64 3
+  %28 = load i8, ptr %arrayidx32.1, align 1
+  %conv33.1 = zext i8 %28 to i32
+  %arrayidx34.1 = getelementptr inbounds nuw i8, ptr %add.ptr64, i64 3
+  %29 = load i8, ptr %arrayidx34.1, align 1
+  %conv35.1 = zext i8 %29 to i32
+  %sub36.1 = sub nsw i32 %conv33.1, %conv35.1
+  %arrayidx37.1 = getelementptr inbounds nuw i8, ptr %add.ptr, i64 7
+  %30 = load i8, ptr %arrayidx37.1, align 1
+  %conv38.1 = zext i8 %30 to i32
+  %arrayidx39.1 = getelementptr inbounds nuw i8, ptr %add.ptr64, i64 7
+  %31 = load i8, ptr %arrayidx39.1, align 1
+  %conv40.1 = zext i8 %31 to i32
+  %sub41.1 = sub nsw i32 %conv38.1, %conv40.1
+  %shl42.1 = shl nsw i32 %sub41.1, 16
+  %add43.1 = add nsw i32 %shl42.1, %sub36.1
+  %add44.1 = add nsw i32 %add19.1, %add.1
+  %sub45.1 = sub nsw i32 %add.1, %add19.1
+  %add46.1 = add nsw i32 %add43.1, %add31.1
+  %sub47.1 = sub nsw i32 %add31.1, %add43.1
+  %add48.1 = add nsw i32 %add46.1, %add44.1
+  %sub51.1 = sub nsw i32 %add44.1, %add46.1
+  %add55.1 = add nsw i32 %sub47.1, %sub45.1
+  %sub59.1 = sub nsw i32 %sub45.1, %sub47.1
+  %add.ptr.1 = getelementptr inbounds i8, ptr %add.ptr, i64 %idx.ext
+  %add.ptr64.1 = getelementptr inbounds i8, ptr %add.ptr64, i64 %idx.ext63
+  %32 = load i8, ptr %add.ptr.1, align 1
+  %conv.2 = zext i8 %32 to i32
+  %33 = load i8, ptr %add.ptr64.1, align 1
+  %conv2.2 = zext i8 %33 to i32
+  %sub.2 = sub nsw i32 %conv.2, %conv2.2
+  %arrayidx3.2 = getelementptr inbounds nuw i8, ptr %add.ptr.1, i64 4
+  %34 = load i8, ptr %arrayidx3.2, align 1
+  %conv4.2 = zext i8 %34 to i32
+  %arrayidx5.2 = getelementptr inbounds nuw i8, ptr %add.ptr64.1, i64 4
+  %35 = load i8, ptr %arrayidx5.2, align 1
+  %conv6.2 = zext i8 %35 to i32
+  %sub7.2 = sub nsw i32 %conv4.2, %conv6.2
+  %shl.2 = shl nsw i32 %sub7.2, 16
+  %add.2 = add nsw i32 %shl.2, %sub.2
+  %arrayidx8.2 = getelementptr inbounds nuw i8, ptr %add.ptr.1, i64 1
+  %36 = load i8, ptr %arrayidx8.2, align 1
+  %conv9.2 = zext i8 %36 to i32
+  %arrayidx10.2 = getelementptr inbounds nuw i8, ptr %add.ptr64.1, i64 1
+  %37 = load i8, ptr %arrayidx10.2, align 1
+  %conv11.2 = zext i8 %37 to i32
+  %sub12.2 = sub nsw i32 %conv9.2, %conv11.2
+  %arrayidx13.2 = getelementptr inbounds nuw i8, ptr %add.ptr.1, i64 5
+  %38 = load i8, ptr %arrayidx13.2, align 1
+  %conv14.2 = zext i8 %38 to i32
+  %arrayidx15.2 = getelementptr inbounds nuw i8, ptr %add.ptr64.1, i64 5
+  %39 = load i8, ptr %arrayidx15.2, align 1
+  %conv16.2 = zext i8 %39 to i32
+  %sub17.2 = sub nsw i32 %conv14.2, %conv16.2
+  %shl18.2 = shl nsw i32 %sub17.2, 16
+  %add19.2 = add nsw i32 %shl18.2, %sub12.2
+  %arrayidx20.2 = getelementptr inbounds nuw i8, ptr %add.ptr.1, i64 2
+  %40 = load i8, ptr %arrayidx20.2, align 1
+  %conv21.2 = zext i8 %40 to i32
+  %arrayidx22.2 = getelementptr inbounds nuw i8, ptr %add.ptr64.1, i64 2
+  %41 = load i8, ptr %arrayidx22.2, align 1
+  %conv23.2 = zext i8 %41 to i32
+  %sub24.2 = sub nsw i32 %conv21.2, %conv23.2
+  %arrayidx25.2 = getelementptr inbounds nuw i8, ptr %add.ptr.1, i64 6
+  %42 = load i8, ptr %arrayidx25.2, align 1
+  %conv26.2 = zext i8 %42 to i32
+  %arrayidx27.2 = getelementptr inbounds nuw i8, ptr %add.ptr64.1, i64 6
+  %43 = load i8, ptr %arrayidx27.2, align 1
+  %conv28.2 = zext i8 %43 to i32
+  %sub29.2 = sub nsw i32 %conv26.2, %conv28.2
+  %shl30.2 = shl nsw i32 %sub29.2, 16
+  %add31.2 = add nsw i32 %shl30.2, %sub24.2
+  %arrayidx32.2 = getelementptr inbounds nuw i8, ptr %add.ptr.1, i64 3
+  %44 = load i8, ptr %arrayidx32.2, align 1
+  %conv33.2 = zext i8 %44 to i32
+  %arrayidx34.2 = getelementptr inbounds nuw i8, ptr %add.ptr64.1, i64 3
+  %45 = load i8, ptr %arrayidx34.2, align 1
+  %conv35.2 = zext i8 %45 to i32
+  %sub36.2 = sub nsw i32 %conv33.2, %conv35.2
+  %arrayidx37.2 = getelementptr inbounds nuw i8, ptr %add.ptr.1, i64 7
+  %46 = load i8, ptr %arrayidx37.2, align 1
+  %conv38.2 = zext i8 %46 to i32
+  %arrayidx39.2 = getelementptr inbounds nuw i8, ptr %add.ptr64.1, i64 7
+  %47 = load i8, ptr %arrayidx39.2, align 1
+  %conv40.2 = zext i8 %47 to i32
+  %sub41.2 = sub nsw i32 %conv38.2, %conv40.2
+  %shl42.2 = shl nsw i32 %sub41.2, 16
+  %add43.2 = add nsw i32 %shl42.2, %sub36.2
+  %add44.2 = add nsw i32 %add19.2, %add.2
+  %sub45.2 = sub nsw i32 %add.2, %add19.2
+  %add46.2 = add nsw i32 %add43.2, %add31.2
+  %sub47.2 = sub nsw i32 %add31.2, %add43.2
+  %add48.2 = add nsw i32 %add46.2, %add44.2
+  %sub51.2 = sub nsw i32 %add44.2, %add46.2
+  %add55.2 = add nsw i32 %sub47.2, %sub45.2
+  %sub59.2 = sub nsw i32 %sub45.2, %sub47.2
+  %add.ptr.2 = getelementptr inbounds i8, ptr %add.ptr.1, i64 %idx.ext
+  %add.ptr64.2 = getelementptr inbounds i8, ptr %add.ptr64.1, i64 %idx.ext63
+  %48 = load i8, ptr %add.ptr.2, align 1
+  %conv.3 = zext i8 %48 to i32
+  %49 = load i8, ptr %add.ptr64.2, align 1
+  %conv2.3 = zext i8 %49 to i32
+  %sub.3 = sub nsw i32 %conv.3, %conv2.3
+  %arrayidx3.3 = getelementptr inbounds nuw i8, ptr %add.ptr.2, i64 4
+  %50 = load i8, ptr %arrayidx3.3, align 1
+  %conv4.3 = zext i8 %50 to i32
+  %arrayidx5.3 = getelementptr inbounds nuw i8, ptr %add.ptr64.2, i64 4
+  %51 = load i8, ptr %arrayidx5.3, align 1
+  %conv6.3 = zext i8 %51 to i32
+  %sub7.3 = sub nsw i32 %conv4.3, %conv6.3
+  %shl.3 = shl nsw i32 %sub7.3, 16
+  %add.3 = add nsw i32 %shl.3, %sub.3
+  %arrayidx8.3 = getelementptr inbounds nuw i8, ptr %add.ptr.2, i64 1
+  %52 = load i8, ptr %arrayidx8.3, align 1
+  %conv9.3 = zext i8 %52 to i32
+  %arrayidx10.3 = getelementptr inbounds nuw i8, ptr %add.ptr64.2, i64 1
+  %53 = load i8, ptr %arrayidx10.3, align 1
+  %conv11.3 = zext i8 %53 to i32
+  %sub12.3 = sub nsw i32 %conv9.3, %conv11.3
+  %arrayidx13.3 = getelementptr inbounds nuw i8, ptr %add.ptr.2, i64 5
+  %54 = load i8, ptr %arrayidx13.3, align 1
+  %conv14.3 = zext i8 %54 to i32
+  %arrayidx15.3 = getelementptr inbounds nuw i8, ptr %add.ptr64.2, i64 5
+  %55 = load i8, ptr %arrayidx15.3, align 1
+  %conv16.3 = zext i8 %55 to i32
+  %sub17.3 = sub nsw i32 %conv14.3, %conv16.3
+  %shl18.3 = shl nsw i32 %sub17.3, 16
+  %add19.3 = add nsw i32 %shl18.3, %sub12.3
+  %arrayidx20.3 = getelementptr inbounds nuw i8, ptr %add.ptr.2, i64 2
+  %56 = load i8, ptr %arrayidx20.3, align 1
+  %conv21.3 = zext i8 %56 to i32
+  %arrayidx22.3 = getelementptr inbounds nuw i8, ptr %add.ptr64.2, i64 2
+  %57 = load i8, ptr %arrayidx22.3, align 1
+  %conv23.3 = zext i8 %57 to i32
+  %sub24.3 = sub nsw i32 %conv21.3, %conv23.3
+  %arrayidx25.3 = getelementptr inbounds nuw i8, ptr %add.ptr.2, i64 6
+  %58 = load i8, ptr %arrayidx25.3, align 1
+  %conv26.3 = zext i8 %58 to i32
+  %arrayidx27.3 = getelementptr inbounds nuw i8, ptr %add.ptr64.2, i64 6
+  %59 = load i8, ptr %arrayidx27.3, align 1
+  %conv28.3 = zext i8 %59 to i32
+  %sub29.3 = sub nsw i32 %conv26.3, %conv28.3
+  %shl30.3 = shl nsw i32 %sub29.3, 16
+  %add31.3 = add nsw i32 %shl30.3, %sub24.3
+  %arrayidx32.3 = getelementptr inbounds nuw i8, ptr %add.ptr.2, i64 3
+  %60 = load i8, ptr %arrayidx32.3, align 1
+  %conv33.3 = zext i8 %60 to i32
+  %arrayidx34.3 = getelementptr inbounds nuw i8, ptr %add.ptr64.2, i64 3
+  %61 = load i8, ptr %arrayidx34.3, align 1
+  %conv35.3 = zext i8 %61 to i32
+  %sub36.3 = sub nsw i32 %conv33.3, %conv35.3
+  %arrayidx37.3 = getelementptr inbounds nuw i8, ptr %add.ptr.2, i64 7
+  %62 = load i8, ptr %arrayidx37.3, align 1
+  %conv38.3 = zext i8 %62 to i32
+  %arrayidx39.3 = getelementptr inbounds nuw i8, ptr %add.ptr64.2, i64 7
+  %63 = load i8, ptr %arrayidx39.3, align 1
+  %conv40.3 = zext i8 %63 to i32
+  %sub41.3 = sub nsw i32 %conv38.3, %conv40.3
+  %shl42.3 = shl nsw i32 %sub41.3, 16
+  %add43.3 = add nsw i32 %shl42.3, %sub36.3
+  %add44.3 = add nsw i32 %add19.3, %add.3
+  %sub45.3 = sub nsw i32 %add.3, %add19.3
+  %add46.3 = add nsw i32 %add43.3, %add31.3
+  %sub47.3 = sub nsw i32 %add31.3, %add43.3
+  %add48.3 = add nsw i32 %add46.3, %add44.3
+  %sub51.3 = sub nsw i32 %add44.3, %add46.3
+  %add55.3 = add nsw i32 %sub47.3, %sub45.3
+  %sub59.3 = sub nsw i32 %sub45.3, %sub47.3
+  %add78 = add nsw i32 %add48.1, %add48
+  %sub86 = sub nsw i32 %add48, %add48.1
+  %add94 = add nsw i32 %add48.3, %add48.2
+  %sub102 = sub nsw i32 %add48.2, %add48.3
+  %add103 = add nsw i32 %add94, %add78
+  %sub104 = sub nsw i32 %add78, %add94
+  %add105 = add nsw i32 %sub102, %sub86
+  %sub106 = sub nsw i32 %sub86, %sub102
+  %shr.i = lshr i32 %add103, 15
+  %and.i = and i32 %shr.i, 65537
+  %mul.i = mul nuw i32 %and.i, 65535
+  %add.i = add i32 %mul.i, %add103
+  %xor.i = xor i32 %add.i, %mul.i
+  %shr.i169 = lshr i32 %add105, 15
+  %and.i170 = and i32 %shr.i169, 65537
+  %mul.i171 = mul nuw i32 %and.i170, 65535
+  %add.i172 = add i32 %mul.i171, %add105
+  %xor.i173 = xor i32 %add.i172, %mul.i171
+  %shr.i174 = lshr i32 %sub104, 15
+  %and.i175 = and i32 %shr.i174, 65537
+  %mul.i176 = mul nuw i32 %and.i175, 65535
+  %add.i177 = add i32 %mul.i176, %sub104
+  %xor.i178 = xor i32 %add.i177, %mul.i176
+  %shr.i179 = lshr i32 %sub106, 15
+  %and.i180 = and i32 %shr.i179, 65537
+  %mul.i181 = mul nuw i32 %and.i180, 65535
+  %add.i182 = add i32 %mul.i181, %sub106
+  %xor.i183 = xor i32 %add.i182, %mul.i181
+  %add110 = add i32 %xor.i173, %xor.i
+  %add112 = add i32 %add110, %xor.i178
+  %add113 = add i32 %add112, %xor.i183
+  %add78.1 = add nsw i32 %add55.1, %add55
+  %sub86.1 = sub nsw i32 %add55, %add55.1
+  %add94.1 = add nsw i32 %add55.3, %add55.2
+  %sub102.1 = sub nsw i32 %add55.2, %add55.3
+  %add103.1 = add nsw i32 %add94.1, %add78.1
+  %sub104.1 = sub nsw i32 %add78.1, %add94.1
+  %add105.1 = add nsw i32 %sub102.1, %sub86.1
+  %sub106.1 = sub nsw i32 %sub86.1, %sub102.1
+  %shr.i.1 = lshr i32 %add103.1, 15
+  %and.i.1 = and i32 %shr.i.1, 65537
+  %mul.i.1 = mul nuw i32 %and.i.1, 65535
+  %add.i.1 = add i32 %mul.i.1, %add103.1
+  %xor.i.1 = xor i32 %add.i.1, %mul.i.1
+  %shr.i169.1 = lshr i32 %add105.1, 15
+  %and.i170.1 = and i32 %shr.i169.1, 65537
+  %mul.i171.1 = mul nuw i32 %and.i170.1, 65535
+  %add.i172.1 = add i32 %mul.i171.1, %add105.1
+  %xor.i173.1 = xor i32 %add.i172.1, %mul.i171.1
+  %shr.i174.1 = lshr i32 %sub104.1, 15
+  %and.i175.1 = and i32 %shr.i174.1, 65537
+  %mul.i176.1 = mul nuw i32 %and.i175.1, 65535
+  %add.i177.1 = add i32 %mul.i176.1, %sub104.1
+  %xor.i178.1 = xor i32 %add.i177.1, %mul.i176.1
+  %shr.i179.1 = lshr i32 %sub106.1, 15
+  %and.i180.1 = and i32 %shr.i179.1, 65537
+  %mul.i181.1 = mul nuw i32 %and.i180.1, 65535
+  %add.i182.1 = add i32 %mul.i181.1, %sub106.1
+  %xor.i183.1 = xor i32 %add.i182.1, %mul.i181.1
+  %add108.1 = add i32 %xor.i173.1, %add113
+  %add110.1 = add i32 %add108.1, %xor.i.1
+  %add112.1 = add i32 %add110.1, %xor.i178.1
+  %add113.1 = add i32 %add112.1, %xor.i183.1
+  %add78.2 = add nsw i32 %sub51.1, %sub51
+  %sub86.2 = sub nsw i32 %sub51, %sub51.1
+  %add94.2 = add nsw i32 %sub51.3, %sub51.2
+  %sub102.2 = sub nsw i32 %sub51.2, %sub51.3
+  %add103.2 = add nsw i32 %add94.2, %add78.2
+  %sub104.2 = sub nsw i32 %add78.2, %add94.2
+  %add105.2 = add nsw i32 %sub102.2, %sub86.2
+  %sub106.2 = sub nsw i32 %sub86.2, %sub102.2
+  %shr.i.2 = lshr i32 %add103.2, 15
+  %and.i.2 = and i32 %shr.i.2, 65537
+  %mul.i.2 = mul nuw i32 %and.i.2, 65535
+  %add.i.2 = add i32 %mul.i.2, %add103.2
+  %xor.i.2 = xor i32 %add.i.2, %mul.i.2
+  %shr.i169.2 = lshr i32 %add105.2, 15
+  %and.i170.2 = and i32 %shr.i169.2, 65537
+  %mul.i171.2 = mul nuw i32 %and.i170.2, 65535
+  %add.i172.2 = add i32 %mul.i171.2, %add105.2
+  %xor.i173.2 = xor i32 %add.i172.2, %mul.i171.2
+  %shr.i174.2 = lshr i32 %sub104.2, 15
+  %and.i175.2 = and i32 %shr.i174.2, 65537
+  %mul.i176.2 = mul nuw i32 %and.i175.2, 65535
+  %add.i177.2 = add i32 %mul.i176.2, %sub104.2
+  %xor.i178.2 = xor i32 %add.i177.2, %mul.i176.2
+  %shr.i179.2 = lshr i32 %sub106.2, 15
+  %and.i180.2 = and i32 %shr.i179.2, 65537
+  %mul.i181.2 = mul nuw i32 %and.i180.2, 65535
+  %add.i182.2 = add i32 %mul.i181.2, %sub106.2
+  %xor.i183.2 = xor i32 %add.i182.2, %mul.i181.2
+  %add108.2 = add i32 %xor.i173.2, %add113.1
+  %add110.2 = add i32 %add108.2, %xor.i.2
+  %add112.2 = add i32 %add110.2, %xor.i178.2
+  %add113.2 = add i32 %add112.2, %xor.i183.2
+  %add78.3 = add nsw i32 %sub59.1, %sub59
+  %sub86.3 = sub nsw i32 %sub59, %sub59.1
+  %add94.3 = add nsw i32 %sub59.3, %sub59.2
+  %sub102.3 = sub nsw i32 %sub59.2, %sub59.3
+  %add103.3 = add nsw i32 %add94.3, %add78.3
+  %sub104.3 = sub nsw i32 %add78.3, %add94.3
+  %add105.3 = add nsw i32 %sub102.3, %sub86.3
+  %sub106.3 = sub nsw i32 %sub86.3, %sub102.3
+  %shr.i.3 = lshr i32 %add103.3, 15
+  %and.i.3 = and i32 %shr.i.3, 65537
+  %mul.i.3 = mul nuw i32 %and.i.3, 65535
+  %add.i.3 = add i32 %mul.i.3, %add103.3
+  %xor.i.3 = xor i32 %add.i.3, %mul.i.3
+  %shr.i169.3 = lshr i32 %add105.3, 15
+  %and.i170.3 = and i32 %shr.i169.3, 65537
+  %mul.i171.3 = mul nuw i32 %and.i170.3, 65535
+  %add.i172.3 = add i32 %mul.i171.3, %add105.3
+  %xor.i173.3 = xor i32 %add.i172.3, %mul.i171.3
+  %shr.i174.3 = lshr i32 %sub104.3, 15
+  %and.i175.3 = and i32 %shr.i174.3, 65537
+  %mul.i176.3 = mul nuw i32 %and.i175.3, 65535
+  %add.i177.3 = add i32 %mul.i176.3, %sub104.3
+  %xor.i178.3 = xor i32 %add.i177.3, %mul.i176.3
+  %shr.i179.3 = lshr i32 %sub106.3, 15
+  %and.i180.3 = and i32 %shr.i179.3, 65537
+  %mul.i181.3 = mul nuw i32 %and.i180.3, 65535
+  %add.i182.3 = add i32 %mul.i181.3, %sub106.3
+  %xor.i183.3 = xor i32 %add.i182.3, %mul.i181.3
+  %add108.3 = add i32 %xor.i173.3, %add113.2
+  %add110.3 = add i32 %add108.3, %xor.i.3
+  %add112.3 = add i32 %add110.3, %xor.i178.3
+  %add113.3 = add i32 %add112.3, %xor.i183.3
+  %conv118 = and i32 %add113.3, 65535
+  %shr = lshr i32 %add113.3, 16
+  %add119 = add nuw nsw i32 %conv118, %shr
+  %shr120 = lshr i32 %add119, 1
+  ret i32 %shr120
+}

>From bfcc44c043d28db5ac6b4d6af5d7aaa157bc8431 Mon Sep 17 00:00:00 2001
From: Mikhail Gudim <mgudim at ventanamicro.com>
Date: Tue, 12 Aug 2025 10:21:30 -0700
Subject: [PATCH 02/12] updated basic-strided-loads.ll

---
 .../RISCV/basic-strided-loads.ll              | 20 +++----------------
 1 file changed, 3 insertions(+), 17 deletions(-)

diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/basic-strided-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/basic-strided-loads.ll
index 645dbc49269f0..0135d3c01d9f6 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/basic-strided-loads.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/basic-strided-loads.ll
@@ -630,25 +630,11 @@ define void @rt_stride_widen_no_reordering(ptr %pl, i64 %stride, ptr %ps) {
 ; CHECK-LABEL: define void @rt_stride_widen_no_reordering(
 ; CHECK-SAME: ptr [[PL:%.*]], i64 [[STRIDE:%.*]], ptr [[PS:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[OFFSET0:%.*]] = mul nsw i64 [[STRIDE]], 0
-; CHECK-NEXT:    [[OFFSET4:%.*]] = mul nsw i64 [[STRIDE]], 1
-; CHECK-NEXT:    [[OFFSET8:%.*]] = mul nsw i64 [[STRIDE]], 2
-; CHECK-NEXT:    [[OFFSET12:%.*]] = mul nsw i64 [[STRIDE]], 3
 ; CHECK-NEXT:    [[GEP_L0:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[OFFSET0]]
-; CHECK-NEXT:    [[GEP_L4:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[OFFSET4]]
-; CHECK-NEXT:    [[GEP_L8:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[OFFSET8]]
-; CHECK-NEXT:    [[GEP_L12:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[OFFSET12]]
 ; CHECK-NEXT:    [[GEP_S0:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 0
-; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, ptr [[GEP_L0]], align 16
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i8>, ptr [[GEP_L4]], align 16
-; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i8>, ptr [[GEP_L8]], align 16
-; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i8>, ptr [[GEP_L12]], align 16
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <16 x i8> [[TMP7]], <16 x i8> [[TMP11]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <16 x i8> [[TMP9]], <16 x i8> [[TMP10]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[STRIDE]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(ptr align 16 [[GEP_L0]], i64 [[TMP1]], <4 x i1> splat (i1 true), i32 4)
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
 ; CHECK-NEXT:    store <16 x i8> [[TMP8]], ptr [[GEP_S0]], align 16
 ; CHECK-NEXT:    ret void
 ;

>From e6d985658460ae14b178271edd65db740cc58929 Mon Sep 17 00:00:00 2001
From: Mikhail Gudim <mgudim at ventanamicro.com>
Date: Tue, 12 Aug 2025 10:41:51 -0700
Subject: [PATCH 03/12] did a "todo".

---
 llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index fbdb77a813866..5ba6c8cd62995 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -13222,12 +13222,10 @@ void BoUpSLP::transformNodes() {
         if (StridedCost < OriginalVecCost) {
           // Strided load is more profitable than consecutive load + reverse -
           // transform the node to strided load.
-          // TODO: StrideTy =
-          // DL->getIndexType(E.scalars.front()->getPointerOperand()->getType());
+          Type *StrideTy = DL->getIndexType(cast<LoadInst>(E.Scalars.front())
+                                                ->getPointerOperand()
+                                                ->getType());
           StridedPtrInfo SPtrInfo;
-          Type *StrideTy = Type::getIntNTy(
-              SE->getContext(),
-              DL->getTypeStoreSizeInBits(E.Scalars.front()->getType()));
           SPtrInfo.StrideVal = ConstantInt::get(StrideTy, 1);
           SPtrInfo.Ty = VecTy;
           TreeEntryToStridedPtrInfoMap[&E] = SPtrInfo;

>From b4776d51e8d4a088ac4f944a33259c2157867425 Mon Sep 17 00:00:00 2001
From: Mikhail Gudim <mgudim at ventanamicro.com>
Date: Tue, 12 Aug 2025 12:22:28 -0700
Subject: [PATCH 04/12] addressed review comments.

---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 112 +++++++++---------
 1 file changed, 53 insertions(+), 59 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 5ba6c8cd62995..1421ab47b071d 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1931,7 +1931,7 @@ class BoUpSLP {
     // strided load.
     FixedVectorType *Ty = nullptr;
   };
-  DenseMap<TreeEntry *, StridedPtrInfo> TreeEntryToStridedPtrInfoMap;
+  SmallDenseMap<TreeEntry *, StridedPtrInfo> TreeEntryToStridedPtrInfoMap;
 
 public:
   /// Tracks the state we can represent the loads in the given sequence.
@@ -2233,12 +2233,12 @@ class BoUpSLP {
   bool analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps, Type *ElemTy,
                                 Align CommonAlignment,
                                 SmallVectorImpl<unsigned> &SortedIndices,
-                                StridedPtrInfo *SPtrInfo) const;
+                                StridedPtrInfo &SPtrInfo) const;
 
   bool analyzeConstantStrideCandidate(ArrayRef<Value *> PointerOps,
                                       Type *ElemTy, Align CommonAlignment,
                                       SmallVectorImpl<unsigned> &SortedIndices,
-                                      StridedPtrInfo *SPtrInfo, int64_t Diff,
+                                      StridedPtrInfo &SPtrInfo, int64_t Diff,
                                       Value *Ptr0, Value *PtrN) const;
 
   /// Checks if the given array of loads can be represented as a vectorized,
@@ -4502,7 +4502,7 @@ class BoUpSLP {
   TreeEntry::EntryState getScalarsVectorizationState(
       const InstructionsState &S, ArrayRef<Value *> VL,
       bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
-      SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo *SPtrInfo);
+      SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo);
 
   /// Maps a specific scalar to its tree entry(ies).
   SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarToTreeEntries;
@@ -6476,7 +6476,7 @@ static const SCEV *calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
 bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
                                        Type *ElemTy, Align CommonAlignment,
                                        SmallVectorImpl<unsigned> &SortedIndices,
-                                       StridedPtrInfo *SPtrInfo) const {
+                                       StridedPtrInfo &SPtrInfo) const {
   // Group the pointers by constant offset.
   DenseMap<int64_t, std::pair<SmallVector<Value *>, SmallVector<unsigned>>>
       OffsetToPointerOpIdxMap;
@@ -6486,15 +6486,15 @@ bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
       return false;
 
     const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(PtrSCEV);
+    if (!Add)
+      return false;
     int64_t Offset = 0;
-    if (Add) {
-      for (int i = 0; i < (int)Add->getNumOperands(); ++i) {
-        const SCEVConstant *SC = dyn_cast<SCEVConstant>(Add->getOperand(i));
-        if (!SC)
-          continue;
-        Offset = (int64_t)(SC->getAPInt().getLimitedValue());
-        break;
-      }
+    for (int I : seq<int>(Add->getNumOperands())) {
+      auto *SC = dyn_cast<SCEVConstant>(Add->getOperand(I));
+      if (!SC)
+        continue;
+      Offset = SC->getAPInt().getSExtValue();
+      break;
     }
     OffsetToPointerOpIdxMap[Offset].first.push_back(Ptr);
     OffsetToPointerOpIdxMap[Offset].second.push_back(Idx);
@@ -6517,17 +6517,17 @@ bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
       !TTI->isLegalStridedLoadStore(StridedLoadTy, CommonAlignment))
     return false;
 
-  SmallVector<int> SortedOffsetsV;
+  SmallVector<int64_t> SortedOffsetsV;
   for (auto [K, V] : OffsetToPointerOpIdxMap) {
     SortedOffsetsV.push_back(K);
   }
-  llvm::sort(SortedOffsetsV);
+  sort(SortedOffsetsV);
   if (NumOffsets > 1) {
-    int CommonDiff = SortedOffsetsV[1] - SortedOffsetsV[0];
-    if ((CommonDiff) != 1)
+    int64_t CommonDiff = SortedOffsetsV[1] - SortedOffsetsV[0];
+    if (CommonDiff != 1)
       return false;
-    for (int i = 1; i < (int)SortedOffsetsV.size() - 1; ++i) {
-      if (SortedOffsetsV[i + 1] - SortedOffsetsV[i] != CommonDiff)
+    for (int I : seq<int>(1, SortedOffsetsV.size() - 1)) {
+      if (SortedOffsetsV[I + 1] - SortedOffsetsV[I] != CommonDiff)
         return false;
     }
   }
@@ -6547,31 +6547,29 @@ bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
   unsigned NumCoeffs0 = Coeffs0.size();
   if (NumCoeffs0 * NumOffsets != Sz)
     return false;
-  llvm::sort(Coeffs0);
+  sort(Coeffs0);
 
   SmallVector<unsigned> SortedIndicesDraft;
   SortedIndicesDraft.resize(Sz);
-  auto updateSortedIndices =
+  auto UpdateSortedIndices =
       [&](SmallVectorImpl<unsigned> &SortedIndicesForOffset,
           SmallVectorImpl<unsigned> &IndicesInAllPointerOps,
           int64_t OffsetNum) {
-        unsigned Num = 0;
-        for (unsigned Idx : SortedIndicesForOffset) {
+        for (const auto [Num, Idx] : enumerate(SortedIndicesForOffset)) {
           SortedIndicesDraft[Num * NumOffsets + OffsetNum] =
               IndicesInAllPointerOps[Idx];
-          ++Num;
         }
       };
 
-  updateSortedIndices(SortedIndicesForOffset0, IndicesInAllPointerOps0, 0);
+  UpdateSortedIndices(SortedIndicesForOffset0, IndicesInAllPointerOps0, 0);
 
   SmallVector<int64_t> Coeffs;
   SmallVector<unsigned> SortedIndicesForOffset;
-  for (int i = 1; i < NumOffsets; ++i) {
+  for (int I : seq<int>(1, NumOffsets)) {
     Coeffs.clear();
     SortedIndicesForOffset.clear();
 
-    int64_t Offset = SortedOffsetsV[i];
+    int64_t Offset = SortedOffsetsV[I];
     SmallVector<Value *> &PointerOpsForOffset =
         OffsetToPointerOpIdxMap[Offset].first;
     SmallVector<unsigned> &IndicesInAllPointerOps =
@@ -6579,26 +6577,23 @@ bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
     const SCEV *StrideWithinGroup = calculateRtStride(
         PointerOpsForOffset, ElemTy, *DL, *SE, SortedIndicesForOffset, Coeffs);
 
-    if ((!StrideWithinGroup) || StrideWithinGroup != Stride0) {
+    if ((!StrideWithinGroup) || StrideWithinGroup != Stride0)
       return false;
-    }
     if (Coeffs.size() != NumCoeffs0)
       return false;
-    llvm::sort(Coeffs);
-    for (unsigned i = 0; i < NumCoeffs0; ++i) {
-      if (Coeffs[i] != Coeffs0[i])
+    sort(Coeffs);
+    for (int J : seq<int>(0, NumCoeffs0)) {
+      if (Coeffs[J] != Coeffs0[J])
         return false;
     }
 
-    updateSortedIndices(SortedIndicesForOffset, IndicesInAllPointerOps, i);
+    UpdateSortedIndices(SortedIndicesForOffset, IndicesInAllPointerOps, I);
   }
 
   SortedIndices.clear();
   SortedIndices = SortedIndicesDraft;
-  if (SPtrInfo) {
-    SPtrInfo->StrideSCEV = Stride0;
-    SPtrInfo->Ty = StridedLoadTy;
-  }
+  SPtrInfo.StrideSCEV = Stride0;
+  SPtrInfo.Ty = StridedLoadTy;
   return true;
 }
 
@@ -6932,20 +6927,21 @@ isMaskedLoadCompress(ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
 // Same as analyzeRtStrideCandidate, but for constant strides.
 bool BoUpSLP::analyzeConstantStrideCandidate(
     ArrayRef<Value *> PointerOps, Type *ElemTy, Align CommonAlignment,
-    SmallVectorImpl<unsigned> &SortedIndices, StridedPtrInfo *SPtrInfo,
+    SmallVectorImpl<unsigned> &SortedIndices, StridedPtrInfo &SPtrInfo,
     int64_t Diff, Value *Ptr0, Value *PtrN) const {
   const unsigned Sz = PointerOps.size();
   SmallVector<int64_t> SortedOffsetsFromBase;
   SortedOffsetsFromBase.resize(Sz);
-  for (unsigned i = 0; i < Sz; ++i) {
+  for (unsigned I : seq<int>(0, Sz)) {
     Value *Ptr =
-        SortedIndices.empty() ? PointerOps[i] : PointerOps[SortedIndices[i]];
-    SortedOffsetsFromBase[i] =
+        SortedIndices.empty() ? PointerOps[I] : PointerOps[SortedIndices[I]];
+    SortedOffsetsFromBase[I] =
         *getPointersDiff(ElemTy, Ptr0, ElemTy, Ptr, *DL, *SE);
   }
 
   // Find where the first group ends.
-  assert(SortedOffsetsFromBase.size() > 1);
+  assert(SortedOffsetsFromBase.size() > 1 &&
+         "Trying to generate strided load for less than 2 loads");
   int64_t StrideWithinGroup =
       SortedOffsetsFromBase[1] - SortedOffsetsFromBase[0];
   unsigned GroupSize = 1;
@@ -6992,7 +6988,7 @@ bool BoUpSLP::analyzeConstantStrideCandidate(
     if (CurrentGroupStartIdx != Sz)
       return false;
 
-    auto checkGroup = [&](unsigned StartIdx, unsigned GroupSize0,
+    auto CheckGroup = [&](unsigned StartIdx, unsigned GroupSize0,
                           int64_t StrideWithinGroup) -> bool {
       unsigned GroupEndIdx = StartIdx + 1;
       for (; GroupEndIdx != Sz; ++GroupEndIdx) {
@@ -7001,10 +6997,10 @@ bool BoUpSLP::analyzeConstantStrideCandidate(
             StrideWithinGroup)
           break;
       }
-      return (GroupEndIdx - StartIdx == GroupSize0);
+      return GroupEndIdx - StartIdx == GroupSize0;
     };
-    for (unsigned i = 0; i < Sz; i += GroupSize) {
-      if (!checkGroup(i, GroupSize, StrideWithinGroup))
+    for (unsigned I = 0; I < Sz; I += GroupSize) {
+      if (!CheckGroup(I, GroupSize, StrideWithinGroup))
         return false;
     }
   }
@@ -7042,11 +7038,9 @@ bool BoUpSLP::analyzeConstantStrideCandidate(
          has_single_bit(AbsoluteDiff))) &&
        AbsoluteDiff > VecSz) ||
       Diff == -(static_cast<int>(VecSz) - 1)) {
-    if (SPtrInfo) {
-      Type *StrideTy = DL->getIndexType(Ptr0->getType());
-      SPtrInfo->StrideVal = ConstantInt::get(StrideTy, StrideIntVal);
-      SPtrInfo->Ty = StridedLoadTy;
-    }
+    Type *StrideTy = DL->getIndexType(Ptr0->getType());
+    SPtrInfo.StrideVal = ConstantInt::get(StrideTy, StrideIntVal);
+    SPtrInfo.Ty = StridedLoadTy;
     return true;
   }
   return false;
@@ -7094,7 +7088,7 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
   if (!IsSorted) {
     if (Sz > MinProfitableStridedLoads &&
         analyzeRtStrideCandidate(PointerOps, ScalarTy, CommonAlignment, Order,
-                                 SPtrInfo))
+                                 *SPtrInfo))
       return LoadsState::StridedVectorize;
 
     if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
@@ -7129,7 +7123,7 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
       return LoadsState::CompressVectorize;
     // Simple check if not a strided access - clear order.
     if (analyzeConstantStrideCandidate(PointerOps, ScalarTy, CommonAlignment,
-                                       Order, SPtrInfo, *Diff, Ptr0, PtrN))
+                                       Order, *SPtrInfo, *Diff, Ptr0, PtrN))
       return LoadsState::StridedVectorize;
   }
   if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
@@ -10000,7 +9994,7 @@ getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
 BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
     const InstructionsState &S, ArrayRef<Value *> VL,
     bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
-    SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo *SPtrInfo) {
+    SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo) {
   assert(S.getMainOp() &&
          "Expected instructions with same/alternate opcodes only.");
 
@@ -10102,7 +10096,7 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
         });
       });
     };
-    switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps, SPtrInfo)) {
+    switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps, &SPtrInfo)) {
     case LoadsState::Vectorize:
       return TreeEntry::Vectorize;
     case LoadsState::CompressVectorize:
@@ -11563,7 +11557,7 @@ void BoUpSLP::buildTreeRec(ArrayRef<Value *> VLRef, unsigned Depth,
   SmallVector<Value *> PointerOps;
   StridedPtrInfo SPtrInfo;
   TreeEntry::EntryState State = getScalarsVectorizationState(
-      S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps, &SPtrInfo);
+      S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps, SPtrInfo);
   if (State == TreeEntry::NeedToGather) {
     newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
     return;
@@ -19729,16 +19723,16 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
         Type *StrideTy = DL->getIndexType(PO->getType());
         Value *StrideVal;
 
-        StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap[E];
+        const StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap.at(E);
         StridedLoadTy = SPtrInfo.Ty;
-        assert(StridedLoadTy);
+        assert(StridedLoadTy && "Missing StridedPoinerInfo for tree entry.");
         unsigned StridedLoadEC =
             StridedLoadTy->getElementCount().getKnownMinValue();
 
         Value *Stride = SPtrInfo.StrideVal;
         if (!Stride) {
           const SCEV *StrideSCEV = SPtrInfo.StrideSCEV;
-          assert(StrideSCEV);
+          assert(StrideSCEV && "Neither StrideVal nor StrideSCEV were set.");
           SCEVExpander Expander(*SE, *DL, "strided-load-vec");
           Stride = Expander.expandCodeFor(StrideSCEV, StrideSCEV->getType(),
                                           &*Builder.GetInsertPoint());

>From 32d3ecd67e26eca6c41d83c48976c43b67aeafc2 Mon Sep 17 00:00:00 2001
From: Mikhail Gudim <mgudim at ventanamicro.com>
Date: Wed, 13 Aug 2025 08:26:58 -0700
Subject: [PATCH 05/12] address review comments.

---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 34 ++++++++++++++-----
 1 file changed, 25 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 1421ab47b071d..24a86bc29349d 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2230,11 +2230,29 @@ class BoUpSLP {
   ///       may not be necessary.
   bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const;
 
+  // Suppose we are given pointers of the form: %b + x * %s + y * %c
+  // where %c is constant. Check if the pointers can be rearranged as follows:
+  //  %b + 0 * %s + 0
+  //  %b + 0 * %s + 1
+  //  %b + 0 * %s + 2
+  //  ...
+  //  %b + 0 * %s + w
+  //
+  //  %b + 1 * %s + 0
+  //  %b + 1 * %s + 1
+  //  %b + 1 * %s + 2
+  //  ...
+  //  %b + 1 * %s + w
+  //  ...
+  //
+  //  If the pointers can be rearanged in the above pattern, it means that the
+  //  memory can be accessed with a strided loads of width `w` and stride `%s`.
   bool analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps, Type *ElemTy,
                                 Align CommonAlignment,
                                 SmallVectorImpl<unsigned> &SortedIndices,
                                 StridedPtrInfo &SPtrInfo) const;
 
+  // Same as analyzeRtStrideCandidate, but for constant strides.
   bool analyzeConstantStrideCandidate(ArrayRef<Value *> PointerOps,
                                       Type *ElemTy, Align CommonAlignment,
                                       SmallVectorImpl<unsigned> &SortedIndices,
@@ -6518,9 +6536,8 @@ bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
     return false;
 
   SmallVector<int64_t> SortedOffsetsV;
-  for (auto [K, V] : OffsetToPointerOpIdxMap) {
+  for (auto [K, V] : OffsetToPointerOpIdxMap)
     SortedOffsetsV.push_back(K);
-  }
   sort(SortedOffsetsV);
   if (NumOffsets > 1) {
     int64_t CommonDiff = SortedOffsetsV[1] - SortedOffsetsV[0];
@@ -6577,17 +6594,16 @@ bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
     const SCEV *StrideWithinGroup = calculateRtStride(
         PointerOpsForOffset, ElemTy, *DL, *SE, SortedIndicesForOffset, Coeffs);
 
-    if ((!StrideWithinGroup) || StrideWithinGroup != Stride0)
+    if (!StrideWithinGroup || StrideWithinGroup != Stride0)
       return false;
     if (Coeffs.size() != NumCoeffs0)
       return false;
     sort(Coeffs);
-    for (int J : seq<int>(0, NumCoeffs0)) {
-      if (Coeffs[J] != Coeffs0[J])
-        return false;
-    }
+    if (Coeffs != Coeffs0)
+      return false
 
-    UpdateSortedIndices(SortedIndicesForOffset, IndicesInAllPointerOps, I);
+          UpdateSortedIndices(SortedIndicesForOffset, IndicesInAllPointerOps,
+                              I);
   }
 
   SortedIndices.clear();
@@ -6932,7 +6948,7 @@ bool BoUpSLP::analyzeConstantStrideCandidate(
   const unsigned Sz = PointerOps.size();
   SmallVector<int64_t> SortedOffsetsFromBase;
   SortedOffsetsFromBase.resize(Sz);
-  for (unsigned I : seq<int>(0, Sz)) {
+  for (unsigned I : seq<int>(Sz)) {
     Value *Ptr =
         SortedIndices.empty() ? PointerOps[I] : PointerOps[SortedIndices[I]];
     SortedOffsetsFromBase[I] =

>From 9cd9f672c05cedf6fb8279a8e7bd22ffb2193ee2 Mon Sep 17 00:00:00 2001
From: Mikhail Gudim <mgudim at ventanamicro.com>
Date: Wed, 13 Aug 2025 08:56:01 -0700
Subject: [PATCH 06/12] added cost estimation

---
 llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 24a86bc29349d..c1a10c8eaf878 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -6600,7 +6600,7 @@ bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
       return false;
     sort(Coeffs);
     if (Coeffs != Coeffs0)
-      return false
+      return false;
 
           UpdateSortedIndices(SortedIndicesForOffset, IndicesInAllPointerOps,
                               I);
@@ -15024,11 +15024,19 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
         }
         break;
       case TreeEntry::StridedVectorize: {
+        const StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap.at(E);
+        FixedVectorType *StridedLoadTy = SPtrInfo.Ty;
+        assert(StridedLoadTy && "Missing StridedPoinerInfo for tree entry.");
         Align CommonAlignment =
             computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
         VecLdCost = TTI->getStridedMemoryOpCost(
-            Instruction::Load, VecTy, LI0->getPointerOperand(),
+            Instruction::Load, StridedLoadTy, LI0->getPointerOperand(),
             /*VariableMask=*/false, CommonAlignment, CostKind);
+        if (StridedLoadTy != VecTy)
+          VecLdCost +=
+              TTI->getCastInstrCost(Instruction::BitCast, StridedLoadTy, VecTy,
+                                    getCastContextHint(*E), CostKind);
+
         break;
       }
       case TreeEntry::CompressVectorize: {

>From a66d57d03aa19b6005d21777402ca3cbbb052d85 Mon Sep 17 00:00:00 2001
From: Mikhail Gudim <mgudim at ventanamicro.com>
Date: Wed, 13 Aug 2025 09:13:25 -0700
Subject: [PATCH 07/12] format

---
 llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index c1a10c8eaf878..2b890da0c9a6d 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -6602,8 +6602,7 @@ bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
     if (Coeffs != Coeffs0)
       return false;
 
-          UpdateSortedIndices(SortedIndicesForOffset, IndicesInAllPointerOps,
-                              I);
+    UpdateSortedIndices(SortedIndicesForOffset, IndicesInAllPointerOps, I);
   }
 
   SortedIndices.clear();

>From 1d44861e44aa713981a2b70e4328cfda6cebe223 Mon Sep 17 00:00:00 2001
From: Mikhail Gudim <mgudim at ventanamicro.com>
Date: Wed, 13 Aug 2025 11:43:09 -0700
Subject: [PATCH 08/12] change references back to pointers because
 canVectorizeLoads can be called with nullptr

---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 24 +++++++++++--------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 2b890da0c9a6d..e932bea681999 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2250,13 +2250,13 @@ class BoUpSLP {
   bool analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps, Type *ElemTy,
                                 Align CommonAlignment,
                                 SmallVectorImpl<unsigned> &SortedIndices,
-                                StridedPtrInfo &SPtrInfo) const;
+                                StridedPtrInfo *SPtrInfo) const;
 
   // Same as analyzeRtStrideCandidate, but for constant strides.
   bool analyzeConstantStrideCandidate(ArrayRef<Value *> PointerOps,
                                       Type *ElemTy, Align CommonAlignment,
                                       SmallVectorImpl<unsigned> &SortedIndices,
-                                      StridedPtrInfo &SPtrInfo, int64_t Diff,
+                                      StridedPtrInfo *SPtrInfo, int64_t Diff,
                                       Value *Ptr0, Value *PtrN) const;
 
   /// Checks if the given array of loads can be represented as a vectorized,
@@ -6494,7 +6494,7 @@ static const SCEV *calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
 bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
                                        Type *ElemTy, Align CommonAlignment,
                                        SmallVectorImpl<unsigned> &SortedIndices,
-                                       StridedPtrInfo &SPtrInfo) const {
+                                       StridedPtrInfo *SPtrInfo) const {
   // Group the pointers by constant offset.
   DenseMap<int64_t, std::pair<SmallVector<Value *>, SmallVector<unsigned>>>
       OffsetToPointerOpIdxMap;
@@ -6607,8 +6607,10 @@ bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
 
   SortedIndices.clear();
   SortedIndices = SortedIndicesDraft;
-  SPtrInfo.StrideSCEV = Stride0;
-  SPtrInfo.Ty = StridedLoadTy;
+  if (SPtrInfo) {
+    SPtrInfo->StrideSCEV = Stride0;
+    SPtrInfo->Ty = StridedLoadTy;
+  }
   return true;
 }
 
@@ -6942,7 +6944,7 @@ isMaskedLoadCompress(ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
 // Same as analyzeRtStrideCandidate, but for constant strides.
 bool BoUpSLP::analyzeConstantStrideCandidate(
     ArrayRef<Value *> PointerOps, Type *ElemTy, Align CommonAlignment,
-    SmallVectorImpl<unsigned> &SortedIndices, StridedPtrInfo &SPtrInfo,
+    SmallVectorImpl<unsigned> &SortedIndices, StridedPtrInfo *SPtrInfo,
     int64_t Diff, Value *Ptr0, Value *PtrN) const {
   const unsigned Sz = PointerOps.size();
   SmallVector<int64_t> SortedOffsetsFromBase;
@@ -7054,8 +7056,10 @@ bool BoUpSLP::analyzeConstantStrideCandidate(
        AbsoluteDiff > VecSz) ||
       Diff == -(static_cast<int>(VecSz) - 1)) {
     Type *StrideTy = DL->getIndexType(Ptr0->getType());
-    SPtrInfo.StrideVal = ConstantInt::get(StrideTy, StrideIntVal);
-    SPtrInfo.Ty = StridedLoadTy;
+    if (SPtrInfo) {
+      SPtrInfo->StrideVal = ConstantInt::get(StrideTy, StrideIntVal);
+      SPtrInfo->Ty = StridedLoadTy;
+    }
     return true;
   }
   return false;
@@ -7103,7 +7107,7 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
   if (!IsSorted) {
     if (Sz > MinProfitableStridedLoads &&
         analyzeRtStrideCandidate(PointerOps, ScalarTy, CommonAlignment, Order,
-                                 *SPtrInfo))
+                                 SPtrInfo))
       return LoadsState::StridedVectorize;
 
     if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
@@ -7138,7 +7142,7 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
       return LoadsState::CompressVectorize;
     // Simple check if not a strided access - clear order.
     if (analyzeConstantStrideCandidate(PointerOps, ScalarTy, CommonAlignment,
-                                       Order, *SPtrInfo, *Diff, Ptr0, PtrN))
+                                       Order, SPtrInfo, *Diff, Ptr0, PtrN))
       return LoadsState::StridedVectorize;
   }
   if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||

>From 59aae0bc9a21f223409bab0bcf2a38a4bd48fa7d Mon Sep 17 00:00:00 2001
From: Mikhail Gudim <mgudim at ventanamicro.com>
Date: Wed, 13 Aug 2025 11:52:28 -0700
Subject: [PATCH 09/12] three slashes for comments, deleted wrong comment.

---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 99 ++++++++++---------
 1 file changed, 50 insertions(+), 49 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index e932bea681999..3513fd7b37c4e 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1918,17 +1918,15 @@ class BoUpSLP {
   class ShuffleCostEstimator;
   class ShuffleInstructionBuilder;
 
-  // If we decide to generate strided load / store, this struct contains all the
-  // necessary info. It's fields are calculated by analyzeRtStrideCandidate and
-  // analyzeConstantStrideCandidate. Note that Stride can be given either as a
-  // SCEV or as a Value if it already exists.
-  // To get the stride in bytes, StrideVal (or value obtained from StrideSCEV)
-  // has to by multiplied by the size of element of FixedVectorType.
+  /// If we decide to generate strided load / store, this struct contains all
+  /// the necessary info. Its fields are calculated by analyzeRtStrideCandidate
+  /// and analyzeConstantStrideCandidate. Note that Stride can be given either
+  /// as a SCEV or as a Value if it already exists. To get the stride in bytes,
+  /// StrideVal (or value obtained from StrideSCEV) has to be multiplied by the
+  /// size of element of FixedVectorType.
   struct StridedPtrInfo {
     Value *StrideVal = nullptr;
     const SCEV *StrideSCEV = nullptr;
-    // Represents the ammount which needs to be added to the base pointer of
-    // strided load.
     FixedVectorType *Ty = nullptr;
   };
   SmallDenseMap<TreeEntry *, StridedPtrInfo> TreeEntryToStridedPtrInfoMap;
@@ -2230,29 +2228,29 @@ class BoUpSLP {
   ///       may not be necessary.
   bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const;
 
-  // Suppose we are given pointers of the form: %b + x * %s + y * %c
-  // where %c is constant. Check if the pointers can be rearranged as follows:
-  //  %b + 0 * %s + 0
-  //  %b + 0 * %s + 1
-  //  %b + 0 * %s + 2
-  //  ...
-  //  %b + 0 * %s + w
-  //
-  //  %b + 1 * %s + 0
-  //  %b + 1 * %s + 1
-  //  %b + 1 * %s + 2
-  //  ...
-  //  %b + 1 * %s + w
-  //  ...
-  //
-  //  If the pointers can be rearanged in the above pattern, it means that the
-  //  memory can be accessed with a strided loads of width `w` and stride `%s`.
+  /// Suppose we are given pointers of the form: %b + x * %s + y * %c
+  /// where %c is constant. Check if the pointers can be rearranged as follows:
+  ///  %b + 0 * %s + 0
+  ///  %b + 0 * %s + 1
+  ///  %b + 0 * %s + 2
+  ///  ...
+  ///  %b + 0 * %s + (w - 1)
+  ///
+  ///  %b + 1 * %s + 0
+  ///  %b + 1 * %s + 1
+  ///  %b + 1 * %s + 2
+  ///  ...
+  ///  %b + 1 * %s + (w - 1)
+  ///  ...
+  ///
+  ///  If the pointers can be rearranged in the above pattern, it means that the
+  ///  memory can be accessed with strided loads of width `w` and stride `%s`.
   bool analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps, Type *ElemTy,
                                 Align CommonAlignment,
                                 SmallVectorImpl<unsigned> &SortedIndices,
                                 StridedPtrInfo *SPtrInfo) const;
 
-  // Same as analyzeRtStrideCandidate, but for constant strides.
+  /// Same as analyzeRtStrideCandidate, but for constant strides.
   bool analyzeConstantStrideCandidate(ArrayRef<Value *> PointerOps,
                                       Type *ElemTy, Align CommonAlignment,
                                       SmallVectorImpl<unsigned> &SortedIndices,
@@ -6474,23 +6472,23 @@ static const SCEV *calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
   return Stride;
 }
 
-// Suppose we are given pointers of the form: %b + x * %s + y * %c
-// where %c is constant. Check if the pointers can be rearranged as follows:
-//  %b + 0 * %s + 0
-//  %b + 0 * %s + 1
-//  %b + 0 * %s + 2
-//  ...
-//  %b + 0 * %s + w
-//
-//  %b + 1 * %s + 0
-//  %b + 1 * %s + 1
-//  %b + 1 * %s + 2
-//  ...
-//  %b + 1 * %s + w
-//  ...
-//
-//  If the pointers can be rearanged in the above pattern, it means that the
-//  memory can be accessed with a strided loads of width `w` and stride `%s`.
+/// Suppose we are given pointers of the form: %b + x * %s + y * %c
+/// where %c is constant. Check if the pointers can be rearranged as follows:
+///  %b + 0 * %s + 0
+///  %b + 0 * %s + 1
+///  %b + 0 * %s + 2
+///  ...
+///  %b + 0 * %s + (w - 1)
+///
+///  %b + 1 * %s + 0
+///  %b + 1 * %s + 1
+///  %b + 1 * %s + 2
+///  ...
+///  %b + 1 * %s + (w - 1)
+///  ...
+///
+///  If the pointers can be rearranged in the above pattern, it means that the
+///  memory can be accessed with strided loads of width `w` and stride `%s`.
 bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
                                        Type *ElemTy, Align CommonAlignment,
                                        SmallVectorImpl<unsigned> &SortedIndices,
@@ -6941,7 +6939,7 @@ isMaskedLoadCompress(ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
                               CompressMask, LoadVecTy);
 }
 
-// Same as analyzeRtStrideCandidate, but for constant strides.
+/// Same as analyzeRtStrideCandidate, but for constant strides.
 bool BoUpSLP::analyzeConstantStrideCandidate(
     ArrayRef<Value *> PointerOps, Type *ElemTy, Align CommonAlignment,
     SmallVectorImpl<unsigned> &SortedIndices, StridedPtrInfo *SPtrInfo,
@@ -15033,12 +15031,15 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
         Align CommonAlignment =
             computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
         VecLdCost = TTI->getStridedMemoryOpCost(
-            Instruction::Load, StridedLoadTy, LI0->getPointerOperand(),
+            Instruction::Load, VecTy, LI0->getPointerOperand(),
             /*VariableMask=*/false, CommonAlignment, CostKind);
-        if (StridedLoadTy != VecTy)
-          VecLdCost +=
-              TTI->getCastInstrCost(Instruction::BitCast, StridedLoadTy, VecTy,
-                                    getCastContextHint(*E), CostKind);
+        //VecLdCost = TTI->getStridedMemoryOpCost(
+        //    Instruction::Load, StridedLoadTy, LI0->getPointerOperand(),
+        //    /*VariableMask=*/false, CommonAlignment, CostKind);
+        //if (StridedLoadTy != VecTy)
+        //  VecLdCost +=
+        //      TTI->getCastInstrCost(Instruction::BitCast, StridedLoadTy, VecTy,
+        //                            getCastContextHint(*E), CostKind);
 
         break;
       }

>From 5ae2c60d8269a2446ed1f603aaa7980380c40fbd Mon Sep 17 00:00:00 2001
From: Mikhail Gudim <mgudim at ventanamicro.com>
Date: Wed, 13 Aug 2025 13:10:11 -0700
Subject: [PATCH 10/12] revert a change made by mistake.

---
 llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 3513fd7b37c4e..687cd9d7d27d5 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -6502,15 +6502,15 @@ bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
       return false;
 
     const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(PtrSCEV);
-    if (!Add)
-      return false;
     int64_t Offset = 0;
-    for (int I : seq<int>(Add->getNumOperands())) {
-      auto *SC = dyn_cast<SCEVConstant>(Add->getOperand(I));
-      if (!SC)
-        continue;
-      Offset = SC->getAPInt().getSExtValue();
-      break;
+    if (Add) {
+      for (int I : seq<int>(Add->getNumOperands())) {
+        const SCEVConstant *SC = dyn_cast<SCEVConstant>(Add->getOperand(I));
+        if (!SC)
+          continue;
+        Offset = SC->getAPInt().getSExtValue();
+        break;
+      }
     }
     OffsetToPointerOpIdxMap[Offset].first.push_back(Ptr);
     OffsetToPointerOpIdxMap[Offset].second.push_back(Idx);

>From 0e48b3f9210598366b04307dd9e358b626d058aa Mon Sep 17 00:00:00 2001
From: Mikhail Gudim <mgudim at ventanamicro.com>
Date: Wed, 13 Aug 2025 13:13:57 -0700
Subject: [PATCH 11/12] Another DenseMap -> SmallDenseMap

---
 llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 687cd9d7d27d5..2371febdb936d 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -6494,7 +6494,7 @@ bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
                                        SmallVectorImpl<unsigned> &SortedIndices,
                                        StridedPtrInfo *SPtrInfo) const {
   // Group the pointers by constant offset.
-  DenseMap<int64_t, std::pair<SmallVector<Value *>, SmallVector<unsigned>>>
+  SmallDenseMap<int64_t, std::pair<SmallVector<Value *>, SmallVector<unsigned>>>
       OffsetToPointerOpIdxMap;
   for (auto [Idx, Ptr] : enumerate(PointerOps)) {
     const SCEV *PtrSCEV = SE->getSCEV(Ptr);

>From cef24ae328846a76dd60e61a328697f7058f24b4 Mon Sep 17 00:00:00 2001
From: Mikhail Gudim <mgudim at ventanamicro.com>
Date: Wed, 13 Aug 2025 13:21:53 -0700
Subject: [PATCH 12/12] put cost estimation back and make the order of type
 operands correct.

---
 llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 2371febdb936d..fb00abe11a522 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -15031,15 +15031,12 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
         Align CommonAlignment =
             computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
         VecLdCost = TTI->getStridedMemoryOpCost(
-            Instruction::Load, VecTy, LI0->getPointerOperand(),
+            Instruction::Load, StridedLoadTy, LI0->getPointerOperand(),
             /*VariableMask=*/false, CommonAlignment, CostKind);
-        //VecLdCost = TTI->getStridedMemoryOpCost(
-        //    Instruction::Load, StridedLoadTy, LI0->getPointerOperand(),
-        //    /*VariableMask=*/false, CommonAlignment, CostKind);
-        //if (StridedLoadTy != VecTy)
-        //  VecLdCost +=
-        //      TTI->getCastInstrCost(Instruction::BitCast, StridedLoadTy, VecTy,
-        //                            getCastContextHint(*E), CostKind);
+        if (StridedLoadTy != VecTy)
+          VecLdCost +=
+              TTI->getCastInstrCost(Instruction::BitCast, VecTy, StridedLoadTy,
+                                    getCastContextHint(*E), CostKind);
 
         break;
       }